Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/acpica/acpi_wakeup.c            |  18
-rw-r--r--  sys/amd64/amd64/apic_vector.S             |   9
-rw-r--r--  sys/amd64/amd64/cpu_switch.S              |   4
-rw-r--r--  sys/amd64/amd64/exec_machdep.c            |   2
-rw-r--r--  sys/amd64/amd64/machdep.c                 |   9
-rw-r--r--  sys/amd64/amd64/pmap.c                    | 372
-rw-r--r--  sys/amd64/amd64/support.S                 |  20
-rw-r--r--  sys/amd64/amd64/trap.c                    |  19
-rw-r--r--  sys/amd64/conf/MINIMALUP                  |   4
-rw-r--r--  sys/amd64/include/param.h                 |   2
-rw-r--r--  sys/amd64/include/pmap.h                  |  14
-rw-r--r--  sys/amd64/include/smp.h                   |   3
-rw-r--r--  sys/amd64/include/vmm.h                   |   2
-rw-r--r--  sys/amd64/include/vmm_dev.h               |   7
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  |  25
-rw-r--r--  sys/amd64/include/vmparam.h               |  22
-rw-r--r--  sys/amd64/pt/pt.c                         | 978
-rw-r--r--  sys/amd64/pt/pt.h                         |  49
-rw-r--r--  sys/amd64/vmm/amd/svm.c                   |  90
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                 |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S         |   6
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c      |  34
-rw-r--r--  sys/amd64/vmm/vmm_ioport.c                |  40
23 files changed, 1337 insertions(+), 394 deletions(-)
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
#include <x86/apicreg.h>
#include <x86/apicvar.h>
-#ifdef SMP
#include <machine/smp.h>
#include <machine/vmparam.h>
-#endif
#include <contrib/dev/acpica/include/acpi.h>
@@ -73,19 +71,13 @@ extern int acpi_resume_beep;
extern int acpi_reset_video;
extern int acpi_susp_bounce;
-#ifdef SMP
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-#else
-static struct susppcb **susppcbs;
-#endif
static void acpi_stop_beep(void *);
-#ifdef SMP
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
-#endif
#define ACPI_WAKEPT_PAGES 7
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
timer_spkr_release();
}
-#ifdef SMP
static int
acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
{
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
outb(CMOS_DATA, mpbiosreason);
}
}
-#endif
int
acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
if (sc->acpi_wakeaddr == 0ul)
return (-1); /* couldn't alloc wake memory */
-#ifdef SMP
suspcpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
if (acpi_resume_beep != 0)
timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
pcb = &susppcbs[0]->sp_pcb;
if (savectx(pcb)) {
fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
device_printf(sc->acpi_dev, "Failed to suspend APs\n");
return (0); /* couldn't sleep */
}
-#endif
hw_ibrs_ibpb_active = 0;
hw_ssb_active = 0;
cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
lapic_xapic_mode();
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
acpi_wakeup_cpus(sc);
-#endif
}
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
resume_cpus(suspcpus);
-#endif
/*
* Re-read cpu_stdext_feature3, which was zeroed-out
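With the #ifdef SMP guards gone, the suspend path always computes the set of
application processors to park. A minimal C sketch of the flow, assembled from
the calls visible in this file (illustration only, not part of the commit):

	suspcpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &suspcpus);	/* everyone but the suspending CPU */
	if (savectx(&susppcbs[0]->sp_pcb)) {
		fpususpend(susppcbs[0]->sp_fpususpend);
		if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0)
			return (0);		/* couldn't park the APs */
	}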
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
.text
SUPERALIGN_TEXT
/* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
jmp doreti
#endif
-#ifdef SMP
/*
* Global address space TLB shootdown.
*/
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
INTR_HANDLER justreturn1
call as_lapic_eoi
jmp doreti
-
-#endif /* SMP */
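The deleted LK macro existed solely so uniprocessor builds could omit the lock
prefix; with SMP mandatory, call sites now spell the prefix out directly. The
removed pattern, for reference:

	#ifdef SMP
	#define	LK	lock ;
	#else
	#define	LK
	#endif

The same simplification appears in support.S further down, where conditional
lock prefixes around cmpxchg become unconditional.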
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
movq %r15,TD_LOCK(%r13) /* Release the old thread */
sw1:
leaq TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
movq $blocked_lock, %rdx
movq TD_LOCK(%r12),%rcx
cmpq %rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
END(resumectx)
/* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
sw1wait:
1:
pause
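The sw1wait loop was previously compiled only for SCHED_ULE kernels with SMP;
it now depends on SCHED_ULE alone. In C terms the loop amounts to the following
sketch (simplified from the assembly; blocked_lock is the scheduler's
placeholder lock held while a thread migrates between run queues):

	while (atomic_load_ptr(&td->td_lock) == &blocked_lock)
		cpu_spinwait();		/* the pause instruction above */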
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index f46462b39fa3..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -82,9 +81,7 @@
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -132,9 +129,7 @@
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
@@ -149,6 +144,10 @@
#include <isa/rtc.h>
#include <x86/init.h>
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
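The new guard turns any uniprocessor kernel configuration into an immediate
build-time failure with a clear diagnostic; accordingly, the MINIMALUP
configuration, which set "nooptions SMP", is deleted later in this commit.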
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2c7777e608b9..8df082f6c5dc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/sysarch.h>
#include <machine/tss.h>
@@ -483,6 +481,8 @@ vm_paddr_t KERNend; /* and the end */
struct kva_layout_s kva_layout = {
.kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
.dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
.dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
.lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
@@ -491,18 +491,36 @@ struct kva_layout_s kva_layout = {
.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
NPDEPG - 1, NPTEPG - 1),
.rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
};
struct kva_layout_s kva_layout_la57 = {
.kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */
+ .kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
.dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
.dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
- .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
- .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+ .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+ .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
.km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
NPDEPG - 1, NPTEPG - 1),
.rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
};
/*
@@ -2005,7 +2023,7 @@ create_pagetables(vm_paddr_t *firstaddr)
*/
p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V | pg_nx;
- } else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) {
+ } else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
/* Connect DMAP pml4 pages to PML5. */
p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
X86_PG_RW | X86_PG_V | pg_nx;
@@ -2475,6 +2493,7 @@ pmap_init(void)
struct pmap_preinit_mapping *ppim;
vm_page_t m, mpte;
pml4_entry_t *pml4e;
+ unsigned long lm_max;
int error, i, ret, skz63;
/* L1TF, reserve page @0 unconditionally */
@@ -2600,10 +2619,15 @@ pmap_init(void)
lm_ents = 8;
TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
- if (lm_ents > LMEPML4I - LMSPML4I + 1)
- lm_ents = LMEPML4I - LMSPML4I + 1;
+ lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+ if (lm_ents > lm_max) {
+ printf(
+ "pmap: shrinking large map from requested %d slots to %ld slots\n",
+ lm_ents, lm_max);
+ lm_ents = lm_max;
+ }
#ifdef KMSAN
- if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+ if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
printf(
"pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2615,12 +2639,20 @@ pmap_init(void)
lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
if (lm_ents != 0) {
large_vmem = vmem_create("large", kva_layout.lm_low,
- (vmem_size_t)kva_layout.lm_high - kva_layout.lm_low,
- PAGE_SIZE, 0, M_WAITOK);
+ (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
if (large_vmem == NULL) {
printf("pmap: cannot create large map\n");
lm_ents = 0;
}
+ if (la57) {
+ for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+ lm_ents, NBPML5); i++) {
+ m = pmap_large_map_getptp_unlocked();
+ kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
+ }
+ }
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
@@ -3031,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
* XXX TODO
*/
-#ifdef SMP
/*
* Interrupt the cpus that are executing in the guest context.
* This will force the vcpu to exit and the cached EPT mappings
@@ -3489,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
}
sched_unpin();
}
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- invlpg(va);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- vm_offset_t addr;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap) {
- if (pmap_pcid_enabled && invpcid_works) {
- bzero(&d, sizeof(d));
- invpcid(&d, INVPCID_CTXGLOB);
- } else {
- invltlb_glob();
- }
- } else if (pmap == PCPU_GET(curpmap)) {
- if (pmap_pcid_enabled) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
- invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
- } else {
- kcr3 = pmap->pm_cr3 | pcid;
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else
- load_cr3(kcr3);
- }
- critical_exit();
- } else {
- invltlb();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
- wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
- struct pmap_pcid *pcidp;
-
- pmap_update_pde_store(pmap, pde, newpde);
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
- pmap_update_pde_invalidate(pmap, va, newpde);
- else {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-#endif /* !SMP */
static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -6093,17 +5962,18 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
if (mpte == NULL) {
/*
* Invalidate the 2MB page mapping and return "failure" if the
- * mapping was never accessed.
+ * mapping was never accessed and not wired.
*/
if ((oldpde & PG_A) == 0) {
- KASSERT((oldpde & PG_W) == 0,
- ("pmap_demote_pde: a wired mapping is missing PG_A"));
- pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
- return (false);
- }
-
- mpte = pmap_remove_pt_page(pmap, va);
- if (mpte == NULL) {
+ if ((oldpde & PG_W) == 0) {
+ pmap_demote_pde_abort(pmap, va, pde, oldpde,
+ lockp);
+ return (false);
+ }
+ mpte = pmap_remove_pt_page(pmap, va);
+ /* Fill the PTP with PTEs that have PG_A cleared. */
+ mpte->valid = 0;
+ } else if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL) {
KASSERT((oldpde & PG_W) == 0,
("pmap_demote_pde: page table page for a wired mapping is missing"));
@@ -6155,7 +6025,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
/*
* If the PTP is not leftover from an earlier promotion or it does not
* have PG_A set in every PTE, then fill it. The new PTEs will all
- * have PG_A set.
+ * have PG_A set, unless this is a wired mapping with PG_A clear.
*/
if (!vm_page_all_valid(mpte))
pmap_fill_ptp(firstpte, newpte);
@@ -7561,6 +7431,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
PG_RW = pmap_rw_bit(pmap);
KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
("pmap_enter_pde: newpde is missing PG_M"));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7689,6 +7562,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
if (pdpg != NULL)
pmap_abort_ptp(pmap, va, pdpg);
+ else {
+ KASSERT(va >= VM_MAXUSER_ADDRESS &&
+ (*pde & (PG_PS | PG_V)) == PG_V,
+ ("pmap_enter_pde: invalid kernel PDE"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_pde: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -10333,17 +10214,9 @@ pmap_activate_sw(struct thread *td)
return;
}
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
- CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
}
void
@@ -10384,11 +10257,7 @@ pmap_activate_boot(pmap_t pmap)
MPASS(pmap != kernel_pmap);
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
PCPU_SET(curpmap, pmap);
if (pti) {
kcr3 = pmap->pm_cr3;
@@ -10752,19 +10621,28 @@ pmap_large_map_getptp(void)
static pdp_entry_t *
pmap_large_map_pdpe(vm_offset_t va)
{
+ pml4_entry_t *pml4;
vm_pindex_t pml4_idx;
vm_paddr_t mphys;
- pml4_idx = pmap_pml4e_index(va);
- KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
- ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
- "%#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
- ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
- "LMSPML4I %#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+ (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+ if (la57) {
+ pml4 = pmap_pml4e(kernel_pmap, va);
+ mphys = *pml4 & PG_FRAME;
+ } else {
+ pml4_idx = pmap_pml4e_index(va);
+
+ KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+ ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+ ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ }
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -12023,9 +11901,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range->sva = kva_layout.kva_max;
}
/*
@@ -12066,12 +11942,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
*/
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
- vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
- pt_entry_t pte)
+ vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+ pd_entry_t pde, pt_entry_t pte)
{
pt_entry_t attrs;
- attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ if (la57) {
+ attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+ attrs |= pml4e & pg_nx;
+ attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+ } else {
+ attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ }
attrs |= pdpe & pg_nx;
attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -12104,13 +11986,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
struct pmap_kernel_map_range range;
struct sbuf sbuf, *sb;
+ pml5_entry_t pml5e;
pml4_entry_t pml4e;
pdp_entry_t *pdp, pdpe;
pd_entry_t *pd, pde;
pt_entry_t *pt, pte;
vm_offset_t sva;
vm_paddr_t pa;
- int error, i, j, k, l;
+ int error, j, k, l;
+ bool first;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
@@ -12119,9 +12003,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range.sva = kva_layout.kva_max;
+ pml5e = 0; /* no UB for la48 */
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -12130,41 +12013,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
* Within the large map, ensure that PDP and PD page addresses are
* valid before descending.
*/
- for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
- switch (i) {
- case PML4PML4I:
+ for (first = true, sva = 0; sva != 0 || first; first = false) {
+ if (sva == kva_layout.rec_pt)
sbuf_printf(sb, "\nRecursive map:\n");
- break;
- case DMPML4I:
+ else if (sva == kva_layout.dmap_low)
sbuf_printf(sb, "\nDirect map:\n");
- break;
#ifdef KASAN
- case KASANPML4I:
+ else if (sva == kva_layout.kasan_shadow_low)
sbuf_printf(sb, "\nKASAN shadow map:\n");
- break;
#endif
#ifdef KMSAN
- case KMSANSHADPML4I:
+ else if (sva == kva_layout.kmsan_shadow_low)
sbuf_printf(sb, "\nKMSAN shadow map:\n");
- break;
- case KMSANORIGPML4I:
+ else if (sva == kva_layout.kmsan_origin_low)
sbuf_printf(sb, "\nKMSAN origin map:\n");
- break;
#endif
- case KPML4BASE:
+ else if (sva == kva_layout.km_low)
sbuf_printf(sb, "\nKernel map:\n");
- break;
- case LMSPML4I:
+ else if (sva == kva_layout.lm_low)
sbuf_printf(sb, "\nLarge map:\n");
- break;
- }
/* Convert to canonical form. */
- if (sva == 1ul << 47)
- sva |= -1ul << 48;
+ if (la57) {
+ if (sva == 1ul << 56) {
+ sva |= -1ul << 57;
+ continue;
+ }
+ } else {
+ if (sva == 1ul << 47) {
+ sva |= -1ul << 48;
+ continue;
+ }
+ }
restart:
- pml4e = kernel_pml4[i];
+ if (la57) {
+ pml5e = *pmap_pml5e(kernel_pmap, sva);
+ if ((pml5e & X86_PG_V) == 0) {
+ sva = rounddown2(sva, NBPML5);
+ sysctl_kmaps_dump(sb, &range, sva);
+ sva += NBPML5;
+ continue;
+ }
+ }
+ pml4e = *pmap_pml4e(kernel_pmap, sva);
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -12185,8 +12077,8 @@ restart:
pa = pdpe & PG_FRAME;
if ((pdpe & PG_PS) != 0) {
sva = rounddown2(sva, NBPDP);
- sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
- 0, 0);
+ sysctl_kmaps_check(sb, &range, sva, pml5e,
+ pml4e, pdpe, 0, 0);
range.pdpes++;
sva += NBPDP;
continue;
@@ -12198,6 +12090,7 @@ restart:
* freed. Validate the next-level address
* before descending.
*/
+ sva += NBPDP;
goto restart;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12214,7 +12107,7 @@ restart:
if ((pde & PG_PS) != 0) {
sva = rounddown2(sva, NBPDR);
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, 0);
+ pml5e, pml4e, pdpe, pde, 0);
range.pdes++;
sva += NBPDR;
continue;
@@ -12226,6 +12119,7 @@ restart:
* may be freed. Validate the
* next-level address before descending.
*/
+ sva += NBPDR;
goto restart;
}
pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12239,7 +12133,7 @@ restart:
continue;
}
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, pte);
+ pml5e, pml4e, pdpe, pde, pte);
range.ptes++;
}
}
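The kva_layout tables near the top of this diff are composed with the
KV4ADDR/KV5ADDR helpers, which build a canonical kernel virtual address from
per-level page-table indices. A sketch of their shape, assumed to match the
definitions in pmap.c (the leading -1 bits provide sign extension; the index
fields land at the standard 4-level/5-level paging shifts):

	#define	KV4ADDR(l4, l3, l2, l1) (		\
		((unsigned long)-1 << 47) |		\
		((unsigned long)(l4) << PML4SHIFT) |	\
		((unsigned long)(l3) << PDPSHIFT) |	\
		((unsigned long)(l2) << PDRSHIFT) |	\
		((unsigned long)(l1) << PAGE_SHIFT))

	#define	KV5ADDR(l5, l4, l3, l2, l1) (		\
		((unsigned long)-1 << 56) |		\
		((unsigned long)(l5) << PML5SHIFT) |	\
		((unsigned long)(l4) << PML4SHIFT) |	\
		((unsigned long)(l3) << PDPSHIFT) |	\
		((unsigned long)(l2) << PDRSHIFT) |	\
		((unsigned long)(l1) << PAGE_SHIFT))

For example, KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0) evaluates to
0xff00000000000000, the recursive page-table slot, which is why
kva_layout_la57.kva_min carries the comment "== rec_pt".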
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
ja fusufault
movl %esi,%eax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
setne %cl
/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
movl %esi,%eax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
clac
setne %cl
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
ja fusufault
movq %rsi,%rax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
setne %cl
/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
movq %rsi,%rax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
clac
setne %cl
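casueword(9) is a compare-and-swap on a word in user space; since SMP is now
unconditional, so is the lock prefix. Ignoring the fault recovery and the SMAP
stac/clac bracketing, the primitive behaves roughly like this sketch:

	int
	casueword32_sketch(volatile uint32_t *uaddr, uint32_t old,
	    uint32_t *oldp, uint32_t new)
	{
		uint32_t seen = old;

		/* lock cmpxchgl: replace *uaddr with new iff it equals old. */
		atomic_fcmpset_32(uaddr, &seen, new);
		*oldp = seen;			/* value actually observed */
		return (seen == old ? 0 : 1);	/* 1 == comparison failed */
	}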
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index eefddad2f142..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
/*
* AMD64 Trap and System call handling
*/
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/stack.h>
#include <machine/trap.h>
#include <machine/tss.h>
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
printf("\n\nFatal trap %d: %s while in %s mode\n", type,
type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
TRAPF_USERMODE(frame) ? "user" : "kernel");
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case a pcpu access traps. */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s%s%s, %s\n",
@@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame)
frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
frame->tf_fs, frame->tf_gs,
rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case a pcpu access traps. */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
panic("double fault");
}
diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP
deleted file mode 100644
index 0dbddbe5b341..000000000000
--- a/sys/amd64/conf/MINIMALUP
+++ /dev/null
@@ -1,4 +0,0 @@
-include MINIMAL
-ident MINIMALUP
-nooptions SMP
-nooptions NUMA
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 1bbb302259d6..5a9c3162e14c 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -150,8 +150,6 @@
(((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#ifdef SMP
#define SC_TABLESIZE 1024 /* Must be power of 2. */
-#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 08e96027a5ed..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -202,9 +202,14 @@
#define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E)
#define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E)
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
#define LMSPML4I (PML4PML4I + 1)
#define LMEPML4I (KASANPML4I - 1)
+#define LMSPML5I (DMPML5I + NDMPML5E)
+#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */
/*
* XXX doesn't really belong here I guess...
@@ -552,6 +557,7 @@ pmap_pml5e_index(vm_offset_t va)
struct kva_layout_s {
vm_offset_t kva_min;
+ vm_offset_t kva_max;
vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */
vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */
vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */
@@ -559,6 +565,12 @@ struct kva_layout_s {
vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */
vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */
vm_offset_t rec_pt;
+ vm_offset_t kasan_shadow_low; /* KASAN_MIN_ADDRESS */
+ vm_offset_t kasan_shadow_high; /* KASAN_MAX_ADDRESS */
+ vm_offset_t kmsan_shadow_low; /* KMSAN_SHAD_MIN_ADDRESS */
+ vm_offset_t kmsan_shadow_high; /* KMSAN_SHAD_MAX_ADDRESS */
+ vm_offset_t kmsan_origin_low; /* KMSAN_ORIG_MIN_ADDRESS */
+ vm_offset_t kmsan_origin_high; /* KMSAN_ORIG_MAX_ADDRESS */
};
extern struct kva_layout_s kva_layout;
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 26eb227211da..bff92570ff82 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -13,8 +13,6 @@
#ifdef _KERNEL
-#ifdef SMP
-
#ifndef LOCORE
#include <x86/x86_smp.h>
@@ -39,7 +37,6 @@ void invlop_handler(void);
int start_all_aps(void);
#endif /* !LOCORE */
-#endif /* SMP */
#endif /* _KERNEL */
#endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index a9c73b75213b..0b3daed4f69e 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -649,6 +649,8 @@ struct vm_inout_str {
int addrsize;
enum vm_reg_name seg_name;
struct seg_desc seg_desc;
+ int cs_d;
+ uint64_t cs_base;
};
enum task_switch_reason {
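The new cs_d field records the code segment's default-operand-size bit and
cs_base its base address at the time of the exit; the instruction emulator
needs both to size operands and form linear addresses when emulating INS/OUTS
from 16- or 32-bit guest code. The svm_get_cs_info() helper added to svm.c at
the end of this diff populates them.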
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 1f86538ce5f3..441330fd57b8 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -29,6 +29,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include <sys/domainset.h>
+
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
@@ -52,7 +54,10 @@ struct vm_munmap {
struct vm_memseg {
int segid;
size_t len;
- char name[VM_MAX_SUFFIXLEN + 1];
+ char name[VM_MAX_SUFFIXLEN + 1];
+ domainset_t *ds_mask;
+ size_t ds_mask_size;
+ int ds_policy;
};
struct vm_register {
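The added ds_policy/ds_mask fields let userspace request NUMA placement for a
memory segment's backing pages. A hypothetical caller-side sketch (the policy
value and the ioctl invocation are illustrative assumptions, not taken from
this diff):

	domainset_t mask;
	struct vm_memseg memseg;

	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);			/* domain 0 */

	memset(&memseg, 0, sizeof(memseg));
	memseg.segid = 0;
	memseg.len = 512UL * 1024 * 1024;
	memseg.ds_policy = DOMAINSET_POLICY_PREFER;	/* assumed policy */
	memseg.ds_mask = &mask;
	memseg.ds_mask_size = sizeof(mask);
	/* followed by ioctl(vmfd, VM_ALLOC_MEMSEG, &memseg) */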
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index d5f0363cfb41..1fb0f97682a7 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -31,6 +31,31 @@
#include <sys/mman.h>
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_MOVSX,
+ VIE_OP_TYPE_MOVZX,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_OR,
+ VIE_OP_TYPE_SUB,
+ VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
+ VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
+ VIE_OP_TYPE_GROUP1,
+ VIE_OP_TYPE_STOS,
+ VIE_OP_TYPE_BITTEST,
+ VIE_OP_TYPE_TWOB_GRP15,
+ VIE_OP_TYPE_ADD,
+ VIE_OP_TYPE_TEST,
+ VIE_OP_TYPE_BEXTR,
+ VIE_OP_TYPE_OUTS,
+ VIE_OP_TYPE_LAST
+};
+
/*
* Callback functions to read and write memory regions.
*/
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 59053665dc40..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -181,9 +181,9 @@
* 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole)
* 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot)
* 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots)
- * 0xff21000000000000 - 0xffff807fffffffff unused
- * 0xffff808000000000 - 0xffff847fffffffff large map (can be tuned up)
- * 0xffff848000000000 - 0xfffff77fffffffff unused (large map extends there)
+ * 0xff21000000000000 - 0xff40ffffffffffff large map
+ * 0xff41000000000000 - 0xffff7fffffffffff unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry)
* 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional
* 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional
* 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused
@@ -200,16 +200,14 @@
#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low
#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high
-#define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0)
-#define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define KASAN_MIN_ADDRESS (kva_layout.kasan_shadow_low)
+#define KASAN_MAX_ADDRESS (kva_layout.kasan_shadow_high)
-#define KMSAN_SHAD_MIN_ADDRESS KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define KMSAN_SHAD_MAX_ADDRESS KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
- 0, 0, 0)
+#define KMSAN_SHAD_MIN_ADDRESS (kva_layout.kmsan_shadow_low)
+#define KMSAN_SHAD_MAX_ADDRESS (kva_layout.kmsan_shadow_high)
-#define KMSAN_ORIG_MIN_ADDRESS KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
- 0, 0, 0)
+#define KMSAN_ORIG_MIN_ADDRESS (kva_layout.kmsan_origin_low)
+#define KMSAN_ORIG_MAX_ADDRESS (kva_layout.kmsan_origin_high)
/*
* Formally kernel mapping starts at KERNBASE, but kernel linker
@@ -249,7 +247,7 @@
*/
#define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit)
#define VIRT_IN_DMAP(va) \
- ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high)
+ ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
#define PMAP_HAS_DMAP 1
#define PHYS_TO_DMAP(x) __extension__ ({ \
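VIRT_IN_DMAP is tightened to dmaplimit because only the first dmaplimit bytes
of the direct-map window are actually backed by page tables. Since the direct
map is linear, the translations reduce to offset arithmetic (sketch):

	/* PHYS_TO_DMAP / DMAP_TO_PHYS, schematically: */
	va = kva_layout.dmap_low + pa;		/* valid iff pa < dmaplimit */
	pa = va - kva_layout.dmap_low;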
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ * 'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF 0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+ uint64_t rtit_ctl;
+ uint64_t rtit_output_base;
+ uint64_t rtit_output_mask_ptrs;
+ uint64_t rtit_status;
+ uint64_t rtit_cr3_match;
+ uint64_t rtit_addr0_a;
+ uint64_t rtit_addr0_b;
+ uint64_t rtit_addr1_a;
+ uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ int id;
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+ size_t xsave_area_size;
+ size_t xstate_hdr_offset;
+ size_t pt_xsave_offset;
+} pt_info __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+ return ((struct xstate_hdr *)(ctx->save_area +
+ pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+ return ((struct pt_ext_area *)(ctx->save_area +
+ pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves(save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors(save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu;
+
+ cpu = &pt_pcpu[curcpu];
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ int nranges_supp, n, error = 0;
+
+ pt_ext = pt_ctx_get_ext_area(ctx);
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+ switch (n) {
+ case 2:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+ pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+ pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+ case 1:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+ pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+ pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+ break;
+ default:
+ error = (EINVAL);
+ break;
+ };
+ } else
+ error = (ENXIO);
+
+ return (error);
+}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+ M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx->save_area == NULL)
+ return (ENOMEM);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ free(pt_ctx->save_area, M_PT);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ if (pt_ctx->save_area != NULL)
+ free(pt_ctx->save_area, M_PT);
+ memset(pt_ctx, 0, sizeof(*pt_ctx));
+ pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = pt_ctx_get_ext_area(pt_ctx);
+ hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ XSTATE_XCOMP_BV_COMPACT;
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+ return (-1);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+ return (-1);
+
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return (-1);
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
+ * and releases all previously allocated ToPA metadata.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+ /* Omit M_WAITOK since this might get invoked in a non-sleepable context. */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+ reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+ return (0);
+ }
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+ pt_cpu_toggle_local(ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ printf("pt: failed to setup interrupt line\n");
+ return (ENXIO);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU supports Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+ if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+ printf("pt: CPU does not support managing PT state using XSAVE\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+ printf("pt: CPU does not support XSAVES/XRSTORS\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+ pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+ pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+ XFEATURE_ENABLED_PT, true, true);
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+	pt_pcpu = NULL;
+	pt_pcpu_ctx = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+	case MOD_LOAD:
+		if (!pt_supported() || pt_init() != 0)
+			return (ENXIO);
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
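+/*
+ * Per-CPU tracing configuration handed down from the HWT framework:
+ * the RTIT_CTL value, an optional CR3 filter, up to two IP filter
+ * ranges, and the MTC, cycle threshold, and PSB frequency encodings.
+ */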
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 6c16daaa47c2..2fe6a5bc3584 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -317,6 +317,33 @@ svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset)
#define MSR_AMD7TH_START 0xC0010000UL
#define MSR_AMD7TH_END 0xC0011FFFUL
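+
+/*
+ * Fetch the guest's CS segment from the VMCB and derive the code
+ * segment base and default operand size (the CS.D bit, APMv2 section
+ * 4.8.1) for the current guest CPU mode.
+ */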
+static void
+svm_get_cs_info(struct vmcb *vmcb, struct vm_guest_paging *paging, int *cs_d,
+ uint64_t *base)
+{
+ struct vmcb_segment seg;
+ int error __diagused;
+
+ error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+ KASSERT(error == 0, ("%s: vmcb_seg error %d", __func__, error));
+
+ switch (paging->cpu_mode) {
+ case CPU_MODE_REAL:
+ *base = seg.base;
+ *cs_d = 0;
+ break;
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ *cs_d = !!(seg.attrib & VMCB_CS_ATTRIB_D);
+ *base = seg.base;
+ break;
+ default:
+ *base = 0;
+ *cs_d = 0;
+ break;
+ }
+}
+
/*
* Get the index and bit position for a MSR in permission bitmap.
* Two bits are used for each MSR: lower bit for read and higher bit for write.
@@ -735,10 +762,29 @@ svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in,
if (in) {
vis->seg_name = VM_REG_GUEST_ES;
- } else {
- /* The segment field has standard encoding */
+ } else if (decode_assist()) {
+ /*
+ * The effective segment number in EXITINFO1[12:10] is populated
+ * only if the processor has the DecodeAssist capability.
+ *
+ * XXX this is not specified explicitly in APMv2 but can be
+ * verified empirically.
+ */
s = (info1 >> 10) & 0x7;
+
+ /* The segment field has standard encoding */
vis->seg_name = vm_segment_name(s);
+ } else {
+ /*
+	 * The segment register needs to be decoded manually by fetching
+	 * the instruction at the guest %rip, which cannot be done here
+	 * while interrupts are disabled. Leave the value unset until
+	 * the generic ins/outs handler runs.
+ */
+ vis->seg_name = VM_REG_LAST;
+ svm_get_cs_info(vcpu->vmcb, &vis->paging, &vis->cs_d,
+ &vis->cs_base);
+ return;
}
error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
@@ -798,16 +844,6 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
info1 = ctrl->exitinfo1;
inout_string = info1 & BIT(2) ? 1 : 0;
- /*
- * The effective segment number in EXITINFO1[12:10] is populated
- * only if the processor has the DecodeAssist capability.
- *
- * XXX this is not specified explicitly in APMv2 but can be verified
- * empirically.
- */
- if (inout_string && !decode_assist())
- return (UNHANDLED);
-
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
vmexit->u.inout.string = inout_string;
@@ -825,6 +861,8 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
vis->addrsize = svm_inout_str_addrsize(info1);
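+		/*
+		 * Defaults; svm_inout_str_seginfo() overrides these when
+		 * the segment has to be decoded by the generic ins/outs
+		 * handler.
+		 */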
+ vis->cs_d = 0;
+ vis->cs_base = 0;
svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis);
}
@@ -866,10 +904,9 @@ static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
struct vm_guest_paging *paging;
- struct vmcb_segment seg;
struct vmcb_ctrl *ctrl;
char *inst_bytes;
- int error __diagused, inst_len;
+ int inst_len;
ctrl = &vmcb->ctrl;
paging = &vmexit->u.inst_emul.paging;
@@ -879,29 +916,8 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
svm_paging_info(vmcb, paging);
- error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
- KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
-
- switch(paging->cpu_mode) {
- case CPU_MODE_REAL:
- vmexit->u.inst_emul.cs_base = seg.base;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- case CPU_MODE_PROTECTED:
- case CPU_MODE_COMPATIBILITY:
- vmexit->u.inst_emul.cs_base = seg.base;
-
- /*
- * Section 4.8.1 of APM2, Default Operand Size or D bit.
- */
- vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
- 1 : 0;
- break;
- default:
- vmexit->u.inst_emul.cs_base = 0;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- }
+ svm_get_cs_info(vmcb, paging, &vmexit->u.inst_emul.cs_d,
+ &vmexit->u.inst_emul.cs_base);
/*
* Copy the instruction bytes into 'vie' if available.
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 957217ab2258..842281ab862e 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -2659,6 +2659,8 @@ vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = inout_str_index(vcpu, in);
vis->count = inout_str_count(vcpu, vis->inout.rep);
vis->addrsize = inout_str_addrsize(inst_info);
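+			/*
+			 * VT-x always reports the segment in the VM-exit
+			 * instruction information, so these fields stay
+			 * unused; zero them for consistency with the SVM
+			 * path.
+			 */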
+ vis->cs_d = 0;
+ vis->cs_base = 0;
inout_str_seginfo(vcpu, inst_info, in, vis);
}
SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit);
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
#include "vmx_assym.h"
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index c53e32889000..c54b6e6d0074 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -65,30 +65,6 @@
#include <x86/psl.h>
#include <x86/specialreg.h>
-/* struct vie_op.op_type */
-enum {
- VIE_OP_TYPE_NONE = 0,
- VIE_OP_TYPE_MOV,
- VIE_OP_TYPE_MOVSX,
- VIE_OP_TYPE_MOVZX,
- VIE_OP_TYPE_AND,
- VIE_OP_TYPE_OR,
- VIE_OP_TYPE_SUB,
- VIE_OP_TYPE_TWO_BYTE,
- VIE_OP_TYPE_PUSH,
- VIE_OP_TYPE_CMP,
- VIE_OP_TYPE_POP,
- VIE_OP_TYPE_MOVS,
- VIE_OP_TYPE_GROUP1,
- VIE_OP_TYPE_STOS,
- VIE_OP_TYPE_BITTEST,
- VIE_OP_TYPE_TWOB_GRP15,
- VIE_OP_TYPE_ADD,
- VIE_OP_TYPE_TEST,
- VIE_OP_TYPE_BEXTR,
- VIE_OP_TYPE_LAST
-};
-
/* struct vie_op.op_flags */
#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
@@ -152,6 +128,16 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x3B,
.op_type = VIE_OP_TYPE_CMP,
},
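+	/* OUTS: 0x6E is OUTSB, 0x6F is OUTSW/OUTSD. */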
+ [0x6E] = {
+ .op_byte = 0x6E,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
+ [0x6F] = {
+ .op_byte = 0x6F,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc1ecab9f209..8aab28f5e68e 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -145,9 +145,49 @@ emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
}
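+/*
+ * Recover the segment register used by an OUTS instruction when the
+ * hardware could not report it (seg_name was left as the VM_REG_LAST
+ * sentinel).  The instruction at the guest %rip is fetched and decoded
+ * here, where doing so is safe.  OUTS defaults to %ds unless a segment
+ * override prefix is present.
+ */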
static int
+decode_segment(struct vcpu *vcpu, enum vm_reg_name *segment)
+{
+ struct vm_guest_paging *paging;
+ struct vie vie;
+ struct vm_exit *vme;
+ int err;
+ int fault;
+
+ vme = vm_exitinfo(vcpu);
+ paging = &vme->u.inout_str.paging;
+
+ vie_init(&vie, NULL, 0);
+ err = vmm_fetch_instruction(vcpu, paging,
+ vme->rip + vme->u.inout_str.cs_base, VIE_INST_SIZE, &vie, &fault);
+ if (err || fault)
+ return (err);
+
+ err = vmm_decode_instruction(vcpu, VIE_INVALID_GLA, paging->cpu_mode,
+ vme->u.inout_str.cs_d, &vie);
+
+	if (err)
+		return (err);
+	if (vie.op.op_type != VIE_OP_TYPE_OUTS)
+		return (EINVAL);
+ if (vie.segment_override)
+ *segment = vie.segment_register;
+ else
+ *segment = VM_REG_GUEST_DS;
+
+ return (0);
+}
+
+static int
emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
{
+ int err;
+
*retu = true;
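+	/*
+	 * seg_name == VM_REG_LAST means the CPU did not report the
+	 * segment at exit time; recover it by decoding the instruction
+	 * before returning to userspace.
+	 */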
+ if (vmexit->u.inout_str.seg_name == VM_REG_LAST) {
+ err = decode_segment(vcpu, &vmexit->u.inout_str.seg_name);
+ if (err)
+ return (err);
+ return (vm_get_seg_desc(vcpu, vmexit->u.inout_str.seg_name,
+ &vmexit->u.inout_str.seg_desc));
+ }
return (0); /* Return to userspace to finish emulation */
}