11 files changed, 1283 insertions, 150 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..5bb877a174f7 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#ifdef SMP
-#define LK	lock ;
-#else
-#define LK
-#endif
-
 	.text
 	SUPERALIGN_TEXT
 	/* End Of Interrupt to APIC */
diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c
index 413b7c74890e..851f2df0e6e1 100644
--- a/sys/amd64/amd64/mem.c
+++ b/sys/amd64/amd64/mem.c
@@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags)
 			 * PAGE_SIZE, the uiomove() call does not
 			 * access past the end of the direct map.
 			 */
-			if (v >= DMAP_MIN_ADDRESS &&
-			    v < DMAP_MIN_ADDRESS + dmaplimit) {
+			if (v >= kva_layout.dmap_low &&
+			    v < kva_layout.dmap_high) {
 				error = uiomove((void *)v, c, uio);
 				break;
 			}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 6d0917e16099..43bf81a991bf 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	 * tables, so care must be taken to read each entry only once.
 	 */
 	pmapsize = 0;
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
+	for (va = kva_layout.km_low; va < kva_end; ) {
 		/*
 		 * We always write a page, even if it is zero. Each
 		 * page written corresponds to 1GB of space
@@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	mdhdr.msgbufsize = mbp->msg_size;
 	mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages));
 	mdhdr.pmapsize = pmapsize;
-	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
-	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
-	mdhdr.dmapend = DMAP_MAX_ADDRESS;
+	mdhdr.kernbase = kva_layout.km_low;
+	mdhdr.dmapbase = kva_layout.dmap_low;
+	mdhdr.dmapend = kva_layout.dmap_high;
 	mdhdr.dumpavailsize = round_page(sizeof(dump_avail));
 
 	dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
@@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 
 	/* Dump kernel page directory pages */
 	bzero(fakepd, sizeof(fakepd));
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
+	for (va = kva_layout.km_low; va < kva_end; va += NBPDP) {
 		ii = pmap_pml4e_index(va);
 		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
 		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 9c985df13ddf..b2bfe633adcc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -415,7 +415,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
 
 static int ndmpdp;
 vm_paddr_t dmaplimit;
-vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
+vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48;
 pt_entry_t pg_nx;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -475,11 +475,36 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
 static pml4_entry_t	*kernel_pml4;
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
+static u_int64_t	DMPML4phys;	/* ... level 4, for la57 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
 vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
 vm_paddr_t		KERNend;	/* and the end */
 
+struct kva_layout_s	kva_layout = {	/* LA48 layout; replaced if la57 */
+	.kva_min =	KV4ADDR(PML4PML4I, 0, 0, 0),	/* == rec_pt */
+	.dmap_low =	KV4ADDR(DMPML4I, 0, 0, 0),
+	.dmap_high =	KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
+	.lm_low =	KV4ADDR(LMSPML4I, 0, 0, 0),
+	.lm_high =	KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+	.km_low =	KV4ADDR(KPML4BASE, 0, 0, 0),
+	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+			    NPDEPG - 1, NPTEPG - 1),	/* base of last page */
+	.rec_pt =	KV4ADDR(PML4PML4I, 0, 0, 0),	/* PML4 self-map */
+};
+
+struct kva_layout_s	kva_layout_la57 = {	/* used when la57 */
+	.kva_min =	KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0),	/* == rec_pt */
+	.dmap_low =	KV5ADDR(DMPML5I, 0, 0, 0, 0),
+	.dmap_high =	KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
+	.lm_low =	KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+	.lm_high =	KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
+	.km_low =	KV4ADDR(KPML4BASE, 0, 0, 0),	/* same as LA48 */
+	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+			    NPDEPG - 1, NPTEPG - 1),	/* base of last page */
+	.rec_pt =	KV5ADDR(PML5PML5I, 0, 0, 0, 0),	/* PML5 self-map */
+};
+
 /*
  * pmap_mapdev support pre initialization (i.e. console)
  */
@@ -549,8 +574,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
 
 static vmem_t *large_vmem;
 static u_int lm_ents;
-#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= LARGEMAP_MIN_ADDRESS && \
-	(va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
+#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= kva_layout.lm_low && \
+	(va) < kva_layout.lm_high)
 
 int pmap_pcid_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -1336,7 +1361,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    bool remove_pt, struct spglist *free, struct rwlock **lockp);
+    bool demote_kpde, struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
@@ -1722,7 +1747,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 {
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
-	pml4_entry_t *p4_p;
+	pml4_entry_t *p4_p, *p4d_p;
 	pml5_entry_t *p5_p;
 	uint64_t DMPDkernphys;
 	vm_paddr_t pax;
@@ -1732,7 +1757,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 	vm_offset_t kasankernbase;
 	int kasankpdpi, kasankpdi, nkasanpte;
 #endif
-	int i, j, ndm1g, nkpdpe, nkdmpde;
+	int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys;
 
 	TSENTER();
 	/* Allocate page table pages for the direct map */
@@ -1740,15 +1765,30 @@ create_pagetables(vm_paddr_t *firstaddr)
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
-	if (ndmpdpphys > NDMPML4E) {
-		/*
-		 * Each NDMPML4E allows 512 GB, so limit to that,
-		 * and then readjust ndmpdp and ndmpdpphys.
-		 */
-		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
-		Maxmem = atop(NDMPML4E * NBPML4);
-		ndmpdpphys = NDMPML4E;
-		ndmpdp = NDMPML4E * NPDEPG;
+	if (la57) {
+		ndmpml4phys = howmany(ndmpdpphys, NPML4EPG);
+		if (ndmpml4phys > NDMPML5E) {	/* clamp to dmap PML5 slots */
+			printf("NDMPML5E limits system to %lu GB\n",
+			    (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024);
+			Maxmem = atop(NDMPML5E * NBPML5);
+			ndmpml4phys = NDMPML5E;
+			ndmpdpphys = ndmpml4phys * NPML4EPG;
+			ndmpdp = ndmpdpphys * NPDEPG;
+		}
+		DMPML4phys = allocpages(firstaddr, ndmpml4phys);
+	} else {
+		if (ndmpdpphys > NDMPML4E) {
+			/*
+			 * Each NDMPML4E allows 512 GB, so limit to
+			 * that, and then readjust ndmpdp and
+			 * ndmpdpphys.
+			 */
+			printf("NDMPML4E limits system to %d GB\n",
+			    NDMPML4E * 512);
+			Maxmem = atop(NDMPML4E * NBPML4);
+			ndmpdpphys = NDMPML4E;
+			ndmpdp = NDMPML4E * NPDEPG;
+		}
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
@@ -1773,7 +1813,13 @@ create_pagetables(vm_paddr_t *firstaddr)
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 
 	/* Allocate pages. */
+	if (la57) {
+		KPML5phys = allocpages(firstaddr, 1);
+		p5_p = (pml5_entry_t *)KPML5phys;
+	}
 	KPML4phys = allocpages(firstaddr, 1);
+	p4_p = (pml4_entry_t *)KPML4phys;
+
 	KPDPphys = allocpages(firstaddr, NKPML4E);
 #ifdef KASAN
 	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
@@ -1893,6 +1939,16 @@ create_pagetables(vm_paddr_t *firstaddr)
 	}
 
 	/*
+	 * Connect the Direct Map slots up to the PML4.
+	 * pml5 entries for DMAP are handled below in global pml5 loop.
+	 */
+	p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I];
+	for (i = 0; i < ndmpdpphys; i++) {
+		p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+		    pg_nx;
+	}
+
+	/*
 	 * Instead of using a 1G page for the memory containing the kernel,
 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
 	 * pages, this will partially overwrite the PDPEs above.)
@@ -1911,11 +1967,6 @@ create_pagetables(vm_paddr_t *firstaddr)
 		}
 	}
 
-	/* And recursively map PML4 to itself in order to get PTmap */
-	p4_p = (pml4_entry_t *)KPML4phys;
-	p4_p[PML4PML4I] = KPML4phys;
-	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
-
 #ifdef KASAN
 	/* Connect the KASAN shadow map slots up to the PML4. */
 	for (i = 0; i < NKASANPML4E; i++) {
@@ -1938,25 +1989,15 @@ create_pagetables(vm_paddr_t *firstaddr)
 	}
 #endif
 
-	/* Connect the Direct Map slots up to the PML4. */
-	for (i = 0; i < ndmpdpphys; i++) {
-		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
-		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
-	}
-
 	/* Connect the KVA slots up to the PML4 */
 	for (i = 0; i < NKPML4E; i++) {
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 	}
 
-	kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-
 	if (la57) {
 		/* XXXKIB bootstrap KPML5phys page is lost */
-		KPML5phys = allocpages(firstaddr, 1);
-		for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG;
-		    i++) {
+		for (i = 0; i < NPML5EPG; i++) {
 			if (i == PML5PML5I) {
 				/*
 				 * Recursively map PML5 to itself in
@@ -1964,6 +2005,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 				 */
 				p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
 				    X86_PG_M | X86_PG_V | pg_nx;
+			} else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) {
+				/* Connect DMAP pml4 pages to PML5. */
+				p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
+				    X86_PG_RW | X86_PG_V | pg_nx;
 			} else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) {
 				p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A |
 				    X86_PG_M | X86_PG_V;
@@ -1971,6 +2016,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 				p5_p[i] = 0;
 			}
 		}
+	} else {
+		/* Recursively map PML4 to itself in order to get PTmap */
+		p4_p[PML4PML4I] = KPML4phys;
+		p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 	TSEXIT();
 }
@@ -2024,7 +2073,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	 */
 	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
 	    (vm_paddr_t)kernphys);
-	virtual_end = VM_MAX_KERNEL_ADDRESS;
+	virtual_end = kva_layout.km_high;
 
 	/*
 	 * Enable PG_G global pages, then switch to the kernel page
@@ -2046,9 +2095,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * Count bootstrap data as being resident in case any of this data is
 	 * later unmapped (using pmap_remove()) and freed.
+	 *
+	 * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after
+	 * kva_layout is fixed.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	if (la57) {
+		kva_layout = kva_layout_la57;
 		vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
 		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
 		PTmap = (vm_offset_t)P5Tmap;
@@ -2059,6 +2112,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 		kernel_pmap->pm_cr3 = KPML5phys;
 		pmap_pt_page_count_adj(kernel_pmap, 1);	/* top-level page */
 	} else {
+		kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 		kernel_pmap->pm_pmltop = kernel_pml4;
 		kernel_pmap->pm_cr3 = KPML4phys;
 	}
@@ -2420,6 +2474,8 @@ pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
+	pml4_entry_t *pml4e;
+	unsigned long lm_max;
 	int error, i, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
@@ -2545,10 +2601,15 @@ pmap_init(void)
 
 	lm_ents = 8;
 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
-	if (lm_ents > LMEPML4I - LMSPML4I + 1)
-		lm_ents = LMEPML4I - LMSPML4I + 1;
+	lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+	if (lm_ents > lm_max) {	/* tunable exceeds layout */
+		printf(
+	    "pmap: shrinking large map from requested %u slots to %lu slots\n",
+		    lm_ents, lm_max);
+		lm_ents = lm_max;
+	}
 #ifdef KMSAN
-	if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+	if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
 		printf(
 	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
 		    lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2559,18 +2620,27 @@ pmap_init(void)
 		printf("pmap: large map %u PML4 slots (%lu GB)\n",
 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 	if (lm_ents != 0) {
-		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
+		large_vmem = vmem_create("large", kva_layout.lm_low,
 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 		if (large_vmem == NULL) {
 			printf("pmap: cannot create large map\n");
 			lm_ents = 0;
 		}
+		if (la57) {
+			for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+			    lm_ents, NBPML5); i++) {
+				m = pmap_large_map_getptp_unlocked();
+				kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+				    X86_PG_RW | X86_PG_A | X86_PG_M |
+				    pg_nx | VM_PAGE_TO_PHYS(m);
+			}
+		}
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
-			/* XXXKIB la57 */
-			kernel_pml4[LMSPML4I + i] = X86_PG_V |
-			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
-			    VM_PAGE_TO_PHYS(m);
+			pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
+			    (u_long)i * NBPML4);
+			*pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M |
+			    pg_nx | VM_PAGE_TO_PHYS(m);
 		}
 	}
 }
@@ -3899,7 +3969,7 @@ pmap_kextract(vm_offset_t va)
 	pd_entry_t pde;
 	vm_paddr_t pa;
 
-	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
+	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) {
 		pa = DMAP_TO_PHYS(va);
 	} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
 		pa = pmap_large_map_kextract(va);
@@ -4040,7 +4110,7 @@ pmap_qremove(vm_offset_t sva, int count)
 		 * enough to one of those pmap_enter() calls for it to
 		 * be caught up in a promotion.
 		 */
-		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
+		KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va));
 		KASSERT((*vtopde(va) & X86_PG_PS) == 0,
 		    ("pmap_qremove on promoted va %#lx", va));
 
@@ -4328,21 +4398,13 @@ void
 pmap_pinit_pml5(vm_page_t pml5pg)
 {
 	pml5_entry_t *pm_pml5;
+	int i;
 
 	pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
-
-	/*
-	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
-	 * entering all existing kernel mappings into level 5 table.
-	 */
-	pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
-	    X86_PG_RW | X86_PG_A | X86_PG_M;
-
-	/*
-	 * Install self-referential address mapping entry.
-	 */
-	pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
-	    X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A;
+	for (i = 0; i < NPML5EPG / 2; i++)	/* below kernel VA: empty */
+		pm_pml5[i] = 0;
+	for (; i < NPML5EPG; i++)		/* share kernel's entries */
+		pm_pml5[i] = kernel_pmap->pm_pmltop[i];
 }
 
 static void
@@ -4899,8 +4961,8 @@ pmap_release(pmap_t pmap)
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
 
 	if (pmap_is_la57(pmap)) {
-		pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
-		pmap->pm_pmltop[PML5PML5I] = 0;
+		for (i = NPML5EPG / 2; i < NPML5EPG; i++)
+			pmap->pm_pmltop[i] = 0;
 	} else {
 		for (i = 0; i < NKPML4E; i++)	/* KVA */
 			pmap->pm_pmltop[KPML4BASE + i] = 0;
@@ -4942,7 +5004,7 @@ pmap_release(pmap_t pmap)
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
-	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+	unsigned long ksize = kva_layout.km_high - kva_layout.km_low;
 
 	return sysctl_handle_long(oidp, &ksize, 0, req);
 }
@@ -4953,7 +5015,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
-	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+	unsigned long kfree = kva_layout.km_high - kernel_vm_end;
 
 	return sysctl_handle_long(oidp, &kfree, 0, req);
 }
@@ -5031,7 +5093,7 @@ pmap_page_array_startup(long pages)
 
 	vm_page_array_size = pages;
 
-	start = VM_MIN_KERNEL_ADDRESS;
+	start = kva_layout.km_low;
 	end = start + pages * sizeof(struct vm_page);
 	for (va = start; va < end; va += NBPDR) {
 		pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -6067,8 +6129,8 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 			 * so the direct map region is the only part of the
 			 * kernel address space that must be handled here.
 			 */
-			KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
-			    va < DMAP_MAX_ADDRESS),
+			KASSERT(!in_kernel || (va >= kva_layout.dmap_low &&
+			    va < kva_layout.dmap_high),
 			    ("pmap_demote_pde: No saved mpte for va %#lx", va));
 
 			/*
@@ -6165,8 +6227,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
  */
 static void
-pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
-    bool remove_pt)
+pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde;
 	vm_paddr_t mptepa;
@@ -6174,12 +6235,8 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if (remove_pt)
-		mpte = pmap_remove_pt_page(pmap, va);
-	else
-		mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va));
-	if (mpte == NULL)
-		panic("pmap_remove_kernel_pde: Missing pt page.");
+	mpte = pmap_remove_pt_page(pmap, va);
+	KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page"));
 
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
@@ -6209,7 +6266,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  * pmap_remove_pde: do the things to unmap a superpage in a process
  */
 static int
-pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
+pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde,
     struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
@@ -6249,9 +6306,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
 			pmap_delayed_invl_page(m);
 		}
 	}
-	if (pmap == kernel_pmap) {
-		pmap_remove_kernel_pde(pmap, pdq, sva, remove_pt);
-	} else {
+	if (pmap != kernel_pmap) {
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			KASSERT(vm_page_any_valid(mpte),
@@ -6262,6 +6317,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
 			mpte->ref_count = 0;
 			pmap_add_delayed_free_list(mpte, free, false);
 		}
+	} else if (demote_kpde) {
+		pmap_remove_kernel_pde(pmap, pdq, sva);
+	} else {
+		mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva));
+		if (vm_page_any_valid(mpte)) {
+			mpte->valid = 0;
+			pmap_zero_page(mpte);
+		}
 	}
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
@@ -7183,7 +7246,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	PG_RW = pmap_rw_bit(pmap);
 
 	va = trunc_page(va);
-	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
+	KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig"));
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
@@ -7512,6 +7575,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 	PG_RW = pmap_rw_bit(pmap);
 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
+	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+	    PMAP_ENTER_NORECLAIM,
+	    ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
@@ -7573,8 +7639,8 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 			 * the mapping is not from kernel_pmap, then
 			 * a reserved PT page could be freed.
 			 */
-			(void)pmap_remove_pde(pmap, pde, va,
-			    pmap != kernel_pmap, &free, lockp);
+			(void)pmap_remove_pde(pmap, pde, va, false, &free,
+			    lockp);
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
@@ -7584,10 +7650,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 				 * before any changes to mappings are
 				 * made.  Abort on failure.
 				 */
-				mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
-				if (pmap_insert_pt_page(pmap, mt, false, false)) {
-					if (pdpg != NULL)
-						pdpg->ref_count--;
+				mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
+				if (pmap_insert_pt_page(pmap, mt, false,
+				    false)) {
 					CTR1(KTR_PMAP,
 			    "pmap_enter_pde: cannot ins kern ptp va %#lx",
 					    va);
@@ -7641,6 +7706,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 			if (pdpg != NULL)
 				pmap_abort_ptp(pmap, va, pdpg);
+			else {
+				KASSERT(va >= VM_MAXUSER_ADDRESS &&
+				    (*pde & (PG_PS | PG_V)) == PG_V,
+				    ("pmap_enter_pde: invalid kernel PDE"));
+				mt = pmap_remove_pt_page(pmap, va);
+				KASSERT(mt != NULL,
+				    ("pmap_enter_pde: missing kernel PTP"));
+			}
 			if (uwptpg != NULL) {
 				mt = pmap_remove_pt_page(pmap, va);
 				KASSERT(mt == uwptpg,
@@ -9550,7 +9623,7 @@ pmap_unmapdev(void *p, vm_size_t size)
 	va = (vm_offset_t)p;
 
 	/* If we gave a direct map region in pmap_mapdev, do nothing */
-	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
+	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high)
 		return;
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
@@ -9649,6 +9722,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
+	if (m->md.pat_mode == ma)
+		return;
 
 	m->md.pat_mode = ma;
 
@@ -9668,6 +9743,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
 {
 	int error;
 
+	if (m->md.pat_mode == ma)
+		return;
+
 	m->md.pat_mode = ma;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
@@ -9724,7 +9802,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
 	int error;
 
 	/* Only supported within the kernel map. */
-	if (va < VM_MIN_KERNEL_ADDRESS)
+	if (va < kva_layout.km_low)
 		return (EINVAL);
 
 	PMAP_LOCK(kernel_pmap);
@@ -9755,7 +9833,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 	 * Only supported on kernel virtual addresses, including the direct
 	 * map but excluding the recursive map.
 	 */
-	if (base < DMAP_MIN_ADDRESS)
+	if (base < kva_layout.dmap_low)
 		return (EINVAL);
 
 	/*
@@ -9778,7 +9856,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 			pte_bits |= X86_PG_RW;
 		}
 		if ((prot & VM_PROT_EXECUTE) == 0 ||
-		    va < VM_MIN_KERNEL_ADDRESS) {
+		    va < kva_layout.km_low) {
 			pde_bits |= pg_nx;
 			pte_bits |= pg_nx;
 		}
@@ -9874,7 +9952,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pdpe, pde_bits, pde_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
@@ -9904,7 +9982,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pde, pde_bits, pde_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pde & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
@@ -9932,7 +10010,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pte, pte_bits, pte_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pte & PG_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
@@ -10699,19 +10777,28 @@ pmap_large_map_getptp(void)
 static pdp_entry_t *
 pmap_large_map_pdpe(vm_offset_t va)
 {
+	pml4_entry_t *pml4;
 	vm_pindex_t pml4_idx;
 	vm_paddr_t mphys;
 
-	pml4_idx = pmap_pml4e_index(va);
-	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
-	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
-	    "%#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
-	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
-	    "LMSPML4I %#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+	    (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+	if (la57) {
+		pml4 = pmap_pml4e(kernel_pmap, va);
+		mphys = *pml4 & PG_FRAME;
+	} else {
+		pml4_idx = pmap_pml4e_index(va);
+
+		KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+		    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+		    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	}
 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
 }
 
@@ -10904,8 +10991,8 @@ pmap_large_unmap(void *svaa, vm_size_t len)
 	struct spglist spgf;
 
 	sva = (vm_offset_t)svaa;
-	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
-	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
+	if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low &&
+	    sva + len <= kva_layout.dmap_high))	/* end is exclusive */
 		return;
 
 	SLIST_INIT(&spgf);
@@ -11151,11 +11238,10 @@ pmap_large_map_wb(void *svap, vm_size_t len)
 	sva = (vm_offset_t)svap;
 	eva = sva + len;
 	pmap_large_map_wb_fence();
-	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
+	if (sva >= kva_layout.dmap_low && eva <= kva_layout.dmap_high) {
 		pmap_large_map_flush_range(sva, len);
 	} else {
-		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
-		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
+		KASSERT(sva >= kva_layout.lm_low && eva <= kva_layout.lm_high,
 		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
 		pmap_large_map_wb_large(sva, eva);
 	}
@@ -11196,8 +11282,8 @@ pmap_pti_init(void)
 	VM_OBJECT_WLOCK(pti_obj);
 	pml4_pg = pmap_pti_alloc_page();
 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
-	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
-	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+	for (va = kva_layout.km_low; va <= kva_layout.km_high &&
+	    va >= kva_layout.km_low && va > NBPML4; va += NBPML4) {
 		pdpe = pmap_pti_pdpe(va);
 		pmap_pti_wire_pte(pdpe);
 	}
@@ -12081,10 +12167,12 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
 		switch (i) {
 		case PML4PML4I:
-			sbuf_printf(sb, "\nRecursive map:\n");
+			if (!la57)
+				sbuf_printf(sb, "\nRecursive map:\n");
 			break;
 		case DMPML4I:
-			sbuf_printf(sb, "\nDirect map:\n");
+			if (!la57)
+				sbuf_printf(sb, "\nDirect map:\n");
 			break;
 #ifdef KASAN
 		case KASANPML4I:
@@ -12103,7 +12191,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 			sbuf_printf(sb, "\nKernel map:\n");
 			break;
 		case LMSPML4I:
-			sbuf_printf(sb, "\nLarge map:\n");
+			if (!la57)
+				sbuf_printf(sb, "\nLarge map:\n");
 			break;
 		}
 
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 09ac0a67dbef..eefddad2f142 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -769,7 +769,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode)
 			return (-1);
 		}
 	}
-	if (eva >= VM_MIN_KERNEL_ADDRESS) {
+	if (eva >= kva_layout.km_low) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 */
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 8db314fa034d..1bbb302259d6 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -146,8 +146,9 @@
 #define	amd64_btop(x)	((unsigned long)(x) >> PAGE_SHIFT)
 #define	amd64_ptob(x)	((unsigned long)(x) << PAGE_SHIFT)
 
-#define	INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \
-    || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS))
+#define	INKERNEL(va)	\
+    (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
+    ((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
 
 #ifdef SMP
 #define SC_TABLESIZE    1024                     /* Must be power of 2. */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 7d3e91bcd9b9..a0ca97f2d5a0 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -169,11 +169,12 @@
  * the recursive page table map.
  */
 #define	NDMPML4E	8
+#define	NDMPML5E	32
 
 /*
- * These values control the layout of virtual memory.  The starting address
- * of the direct map, which is controlled by DMPML4I, must be a multiple of
- * its size.  (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ * These values control the layout of virtual memory.  The starting
+ * address of the direct map is controlled by DMPML4I on LA48 and
+ * DMPML5I on LA57.
  *
  * Note: KPML4I is the index of the (single) level 4 page that maps
  * the KVA that holds KERNBASE, while KPML4BASE is the index of the
@@ -191,6 +192,7 @@
 
 #define	KPML4BASE	(NPML4EPG-NKPML4E) /* KVM at highest addresses */
 #define	DMPML4I		rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
+#define	DMPML5I		(NPML5EPG / 2 + 1)
 
 #define	KPML4I		(NPML4EPG-1)
 #define	KPDPI		(NPDPEPG-2)	/* kernbase at -2GB */
@@ -200,9 +202,14 @@
 #define	KMSANSHADPML4I	(KPML4BASE - NKMSANSHADPML4E)
 #define	KMSANORIGPML4I	(DMPML4I - NKMSANORIGPML4E)
 
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
 #define	LMSPML4I	(PML4PML4I + 1)
 #define	LMEPML4I	(KASANPML4I - 1)
+#define	LMSPML5I	(DMPML5I + NDMPML5E)
+#define	LMEPML5I	(LMSPML5I + 32 - 1)	/* 32 slots for large map */
 
 /*
  * XXX doesn't really belong here I guess...
@@ -548,6 +555,18 @@ pmap_pml5e_index(vm_offset_t va)
 	return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
 }
 
+struct kva_layout_s {	/* runtime KVA layout, LA48 or LA57 */
+	vm_offset_t kva_min;	/* start of KVA; == rec_pt in both layouts */
+	vm_offset_t dmap_low;	/* DMAP_MIN_ADDRESS */
+	vm_offset_t dmap_high;	/* DMAP_MAX_ADDRESS */
+	vm_offset_t lm_low;	/* LARGEMAP_MIN_ADDRESS */
+	vm_offset_t lm_high;	/* LARGEMAP_MAX_ADDRESS */
+	vm_offset_t km_low;	/* VM_MIN_KERNEL_ADDRESS */
+	vm_offset_t km_high;	/* VM_MAX_KERNEL_ADDRESS */
+	vm_offset_t rec_pt;	/* base of the recursive page table map */
+};
+extern struct kva_layout_s kva_layout;
+
 #endif /* !LOCORE */
 
 #endif /* !_MACHINE_PMAP_H_ */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 0cd9bb4fa7a4..ef352e776af6 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -163,6 +163,7 @@
  * Virtual addresses of things.  Derived from the page directory and
  * page table indexes from pmap.h for precision.
  *
+ * LA48:
  * 0x0000000000000000 - 0x00007fffffffffff   user map
  * 0x0000800000000000 - 0xffff7fffffffffff   does not exist (hole)
  * 0xffff800000000000 - 0xffff804020100fff   recursive page table (512GB slot)
@@ -175,18 +176,29 @@
  * 0xfffffc0000000000 - 0xfffffdffffffffff   2TB KMSAN shadow map, optional
  * 0xfffffe0000000000 - 0xffffffffffffffff   2TB kernel map
  *
+ * LA57:
+ * 0x0000000000000000 - 0x00ffffffffffffff   user map
+ * 0x0100000000000000 - 0xfeffffffffffffff   does not exist (hole)
+ * 0xff00000000000000 - 0xff00ffffffffffff   recursive page table (256TB slot)
+ * 0xff01000000000000 - 0xff20ffffffffffff   direct map (32 x 256TB slots)
+ * 0xff21000000000000 - 0xff40ffffffffffff   large map
+ * 0xff41000000000000 - 0xffff7fffffffffff   unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff   unused (start of kernel pml4 entry)
+ * 0xfffff60000000000 - 0xfffff7ffffffffff   2TB KMSAN origin map, optional
+ * 0xfffff78000000000 - 0xfffff7bfffffffff   512GB KASAN shadow map, optional
+ * 0xfffff80000000000 - 0xfffffbffffffffff   4TB unused
+ * 0xfffffc0000000000 - 0xfffffdffffffffff   2TB KMSAN shadow map, optional
+ * 0xfffffe0000000000 - 0xffffffffffffffff   2TB kernel map
+ *
  * Within the kernel map:
  *
  * 0xfffffe0000000000                        vm_page_array
  * 0xffffffff80000000                        KERNBASE
  */
 
-#define	VM_MIN_KERNEL_ADDRESS	KV4ADDR(KPML4BASE, 0, 0, 0)
-#define	VM_MAX_KERNEL_ADDRESS	KV4ADDR(KPML4BASE + NKPML4E - 1, \
-					NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-
-#define	DMAP_MIN_ADDRESS	KV4ADDR(DMPML4I, 0, 0, 0)
-#define	DMAP_MAX_ADDRESS	KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+#define	VM_MIN_KERNEL_ADDRESS_LA48	KV4ADDR(KPML4BASE, 0, 0, 0)
+#define	VM_MIN_KERNEL_ADDRESS		kva_layout.km_low
+#define	VM_MAX_KERNEL_ADDRESS		kva_layout.km_high
 
 #define	KASAN_MIN_ADDRESS	KV4ADDR(KASANPML4I, 0, 0, 0)
 #define	KASAN_MAX_ADDRESS	KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
@@ -199,9 +211,6 @@
 #define	KMSAN_ORIG_MAX_ADDRESS	KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
 					0, 0, 0)
 
-#define	LARGEMAP_MIN_ADDRESS	KV4ADDR(LMSPML4I, 0, 0, 0)
-#define	LARGEMAP_MAX_ADDRESS	KV4ADDR(LMEPML4I + 1, 0, 0, 0)
-
 /*
  * Formally kernel mapping starts at KERNBASE, but kernel linker
  * script leaves first PDE reserved.  For legacy BIOS boot, kernel is
@@ -239,21 +248,21 @@
  * vt fb startup needs to be reworked.
  */
 #define	PHYS_IN_DMAP(pa)	(dmaplimit == 0 || (pa) < dmaplimit)
-#define	VIRT_IN_DMAP(va)	((va) >= DMAP_MIN_ADDRESS &&		\
-    (va) < (DMAP_MIN_ADDRESS + dmaplimit))
+#define	VIRT_IN_DMAP(va)	\
+    ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
 
 #define	PMAP_HAS_DMAP	1
-#define	PHYS_TO_DMAP(x)	({						\
+#define	PHYS_TO_DMAP(x)	__extension__ ({				\
 	KASSERT(PHYS_IN_DMAP(x),					\
 	    ("physical address %#jx not covered by the DMAP",		\
 	    (uintmax_t)x));						\
-	(x) | DMAP_MIN_ADDRESS; })
+	(x) + kva_layout.dmap_low; })
 
-#define	DMAP_TO_PHYS(x)	({						\
+#define	DMAP_TO_PHYS(x)	__extension__ ({				\
 	KASSERT(VIRT_IN_DMAP(x),					\
 	    ("virtual address %#jx not covered by the DMAP",		\
 	    (uintmax_t)x));						\
-	(x) & ~DMAP_MIN_ADDRESS; })
+	(x) - kva_layout.dmap_low; })
 
 /*
  * amd64 maps the page array into KVA so that it can be more easily
@@ -274,7 +283,7 @@
  */
 #ifndef VM_KMEM_SIZE_MAX
 #define	VM_KMEM_SIZE_MAX	((VM_MAX_KERNEL_ADDRESS - \
-    VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5)
+    kva_layout.km_low + 1) * 3 / 5)
 #endif
 
 /* initial pagein size of beginning of executable file */
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ *   'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ *   each traced CPU core or thread. Upon initialization, a ToPA configuration
+ *   is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ *   The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ *   4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ *   relevant PT registers. Every time a traced thread is switched
+ *   out or in, its state will be saved to or loaded from its corresponding
+ *   'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ *   interrupt before continuing. The interrupt handler will then fetch the
+ *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ *   The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ *   and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ *   a 4K ToPA entry is filled. While this is fine when tracing smaller
+ *   functions or infrequent code paths, this will generate too much interrupt
+ *   traffic when tracing hotter functions. A proper solution for this issue
+ *   should estimate the amount of data generated by the current configuration
+ *   and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS						\
+	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
+	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF	0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+	uint64_t rtit_ctl;
+	uint64_t rtit_output_base;
+	uint64_t rtit_output_mask_ptrs;
+	uint64_t rtit_status;
+	uint64_t rtit_cr3_match;
+	uint64_t rtit_addr0_a;
+	uint64_t rtit_addr0_b;
+	uint64_t rtit_addr1_a;
+	uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+	uint64_t *topa_hw; /* ToPA table entries. */
+	size_t size;
+	struct mtx lock; /* Lock for fields below. */
+	vm_offset_t offset;
+	uint64_t wrap_count;
+	int curpage;
+};
+
+struct pt_ctx {
+	int id;
+	struct pt_buffer buf; /* ToPA buffer metadata */
+	struct task task;     /* ToPA buffer notification task */
+	struct hwt_context *hwt_ctx;
+	uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+	PT_DISABLED = 0,
+	PT_STOPPED,
+	PT_ACTIVE
+};
+
+static struct pt_cpu {
+	struct pt_ctx *ctx;	 /* active PT tracing context */
+	enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+	uint32_t l0_eax;
+	uint32_t l0_ebx;
+	uint32_t l0_ecx;
+	uint32_t l1_eax;
+	uint32_t l1_ebx;
+	size_t xsave_area_size;
+	size_t xstate_hdr_offset;
+	size_t pt_xsave_offset;
+} pt_info  __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+	return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+	atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+	return ((struct xstate_hdr *)(ctx->save_area +
+	    pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+	return ((struct pt_ext_area *)(ctx->save_area +
+	    pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+	uint64_t reg;
+	int curpage;
+
+	/* Update buffer offset. */
+	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+	mtx_lock_spin(&buf->lock);
+	/* Check if the output wrapped. */
+	if (buf->curpage > curpage)
+		buf->wrap_count++;
+	buf->curpage = curpage;
+	buf->offset = reg >> 32;
+	mtx_unlock_spin(&buf->lock);
+
+	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+	    buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+    struct hwt_record_entry *rec)
+{
+	rec->record_type = HWT_RECORD_BUFFER;
+	rec->buf_id = id;
+	rec->curpage = buf->curpage;
+	rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+	u_long xcr0, cr0;
+	u_long xss;
+
+	cr0 = rcr0();
+	if (cr0 & CR0_TS)
+		clts();
+	xcr0 = rxcr(XCR0);
+	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+	xss = rdmsr(MSR_IA32_XSS);
+	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+	if (!enable) {
+		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+		    ("%s: PT is disabled", __func__));
+		xsaves(save_area, XFEATURE_ENABLED_PT);
+	} else {
+		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+		    ("%s: PT is enabled", __func__));
+		xrstors(save_area, XFEATURE_ENABLED_PT);
+	}
+	wrmsr(MSR_IA32_XSS, xss);
+	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+		load_xcr(XCR0, xcr0);
+	if (cr0 & CR0_TS)
+		load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+	struct pt_cpu *cpu;
+
+	cpu = &pt_pcpu[curcpu];
+	MPASS(cpu->ctx != NULL);
+
+	dprintf("%s: curcpu %d\n", __func__, curcpu);
+	load_cr4(rcr4() | CR4_XSAVE);
+	wrmsr(MSR_IA32_RTIT_STATUS, 0);
+	pt_cpu_set_state(curcpu, PT_ACTIVE);
+	pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+	struct pt_cpu *cpu;
+	struct pt_ctx *ctx;
+
+	/* Shutdown may occur before PT gets properly configured. */
+	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+		return;
+
+	cpu = &pt_pcpu[curcpu];
+	ctx = cpu->ctx;
+	MPASS(ctx != NULL);
+	dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+	pt_cpu_set_state(curcpu, PT_STOPPED);
+	pt_cpu_toggle_local(cpu->ctx->save_area, false);
+	pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+	struct pt_buffer *buf;
+	size_t topa_size;
+	int i;
+
+	topa_size = TOPA_SIZE_4K;
+	buf = &ctx->buf;
+
+	KASSERT(buf->topa_hw == NULL,
+	    ("%s: ToPA info already exists", __func__));
+	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+	    M_ZERO | M_WAITOK);
+	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+	buf->size = vm->npages * PAGE_SIZE;
+	for (i = 0; i < vm->npages; i++) {
+		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+		/*
+		 * XXX: TOPA_INT should ideally be set according to
+		 * expected amount of incoming trace data. Too few TOPA_INT
+		 * entries will not trigger interrupts often enough when tracing
+		 * smaller functions.
+		 */
+		buf->topa_hw[i] |= TOPA_INT;
+	}
+	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+	return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ * Returns ENXIO without IP filtering support, EINVAL if no range is usable.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+	struct pt_ext_area *pt_ext;
+	int nranges_supp, n;
+
+	if ((pt_info.l0_ebx & CPUPT_IPF) == 0)
+		return (ENXIO);
+	pt_ext = pt_ctx_get_ext_area(ctx);
+	nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >> CPUPT_NADDR_S;
+	/* Never program more ranges than the driver-wide maximum. */
+	if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+		nranges_supp = PT_IP_FILTER_MAX_RANGES;
+	n = cfg->nranges;
+	if (n > nranges_supp) {
+		printf("%s: %d IP filtering ranges requested, CPU "
+		       "supports %d, truncating\n",
+		    __func__, n, nranges_supp);
+		n = nranges_supp;
+	}
+
+	switch (n) {
+	case 2:
+		pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+		pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+		pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+		/* FALLTHROUGH */
+	case 1:
+		pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+		pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+		pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+		break;
+	default:
+		/* A zero or negative range count cannot be programmed. */
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/* Prepares 'pt_ctx' for tracing; returns 0 or ENOMEM. */
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+	dprintf("%s: ctx id %d\n", __func__, ctx_id);
+	KASSERT(pt_ctx->buf.topa_hw == NULL,
+	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+	memset(pt_ctx, 0, sizeof(struct pt_ctx));
+	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+	    M_PT, M_NOWAIT | M_ZERO);
+	if (pt_ctx->save_area == NULL)
+		return (ENOMEM);
+	dprintf("%s: preparing ToPA buffer\n", __func__);
+	if (pt_topa_prepare(pt_ctx, vm) != 0) {
+		dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+		free(pt_ctx->save_area, M_PT);
+		/* Do not leave a dangling pointer for pt_deinit_ctx(). */
+		pt_ctx->save_area = NULL;
+		return (ENOMEM);
+	}
+	/* Nothing fails past this point, so the lock cannot be leaked. */
+	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+	pt_ctx->id = ctx_id;
+	TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+	return (0);
+}
+
+/* Releases everything pt_init_ctx() set up; safe to call repeatedly. */
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+	/* A zeroed context (never set up or torn down) holds no lock. */
+	if (mtx_initialized(&pt_ctx->buf.lock))
+		mtx_destroy(&pt_ctx->buf.lock);
+	free(pt_ctx->buf.topa_hw, M_PT);
+	free(pt_ctx->save_area, M_PT);
+	memset(pt_ctx, 0, sizeof(*pt_ctx));
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+	struct hwt_cpu *hwt_cpu;
+	struct hwt_thread *thr;
+	struct pt_ctx *pt_ctx;
+	struct pt_cpu_config *cfg;
+	struct pt_ext_area *pt_ext;
+	struct xstate_hdr *hdr;
+	int error;
+
+	dprintf("%s\n", __func__);
+
+	cfg = (struct pt_cpu_config *)ctx->config;
+	pt_ctx = NULL;
+
+	/* Clear any flags we don't support yet. */
+	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+			printf("%s: CPU does not support generating MTC "
+			    "packets\n", __func__);
+			return (ENXIO);
+		}
+	}
+
+	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+			printf("%s: CPU does not support CR3 filtering\n",
+			    __func__);
+			return (ENXIO);
+		}
+	}
+
+	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+			printf("%s: CPU does not support TNT\n", __func__);
+			return (ENXIO);
+		}
+	}
+	/* TODO: support for more config bits. */
+
+	if (ctx->mode == HWT_MODE_CPU) {
+		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+			if (hwt_cpu->cpu_id != cpu_id)
+				continue;
+			pt_ctx = &pt_pcpu_ctx[cpu_id];
+			break;
+		}
+	} else {
+		TAILQ_FOREACH(thr, &ctx->threads, next) {
+			if (thr->thread_id != thread_id)
+				continue;
+			KASSERT(thr->private != NULL,
+			    ("%s: hwt thread private"
+			     " not set, thr %p",
+				__func__, thr));
+			pt_ctx = (struct pt_ctx *)thr->private;
+			break;
+		}
+	}
+	if (pt_ctx == NULL)
+		return (ENOENT);
+
+	dprintf("%s: preparing MSRs\n", __func__);
+	pt_ext = pt_ctx_get_ext_area(pt_ctx);
+	hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+	pt_ext->rtit_ctl |= cfg->rtit_ctl;
+	if (cfg->nranges != 0) {
+		dprintf("%s: preparing IPF ranges\n", __func__);
+		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+			return (error);
+	}
+	pt_ctx->hwt_ctx = ctx;
+	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+	hdr->xstate_bv = XFEATURE_ENABLED_PT;
+	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+	    XSTATE_XCOMP_BV_COMPACT;
+	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+	pt_pcpu[cpu_id].ctx = pt_ctx;
+	pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+	return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+	if (ctx->mode == HWT_MODE_CPU)
+		return;
+
+	KASSERT(curcpu == cpu_id,
+	    ("%s: attempting to start PT on another cpu", __func__));
+	pt_cpu_start(NULL);
+	CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+	struct pt_cpu *cpu;
+
+	if (ctx->mode == HWT_MODE_CPU)
+		return;
+
+	KASSERT(curcpu == cpu_id,
+	    ("%s: attempting to disable PT on another cpu", __func__));
+	pt_cpu_stop(NULL);
+	CPU_CLR(cpu_id, &ctx->cpu_map);
+	cpu = &pt_pcpu[cpu_id];
+	cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+	dprintf("%s\n", __func__);
+	if (ctx->mode == HWT_MODE_CPU &&
+	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+		return (-1);
+
+	KASSERT(ctx->mode == HWT_MODE_CPU,
+	    ("%s: should only be used for CPU mode", __func__));
+	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+	return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+	dprintf("%s\n", __func__);
+	if (ctx->mode == HWT_MODE_CPU &&
+	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+		return (-1);
+
+	if (CPU_EMPTY(&ctx->cpu_map)) {
+		dprintf("%s: empty cpu map\n", __func__);
+		return (-1);
+	}
+	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+	return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Initializes the per-CPU tracing contexts used for HWT_MODE_CPU and
+ * returns the first pt_init_ctx() error. No-op for other modes.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+	struct hwt_cpu *hwt_cpu;
+	int error;
+
+	dprintf("%s\n", __func__);
+	if (ctx->mode == HWT_MODE_CPU) {
+		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+			error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+			    hwt_cpu->vm, hwt_cpu->cpu_id);
+			if (error)
+				return (error);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Stops tracing on the context's CPUs and releases the PT contexts of
+ * its threads (thread mode) or of the CPUs in its cpu_map (CPU mode).
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+	struct pt_ctx *pt_ctx;
+	struct hwt_thread *thr;
+	int cpu_id;
+
+	dprintf("%s\n", __func__);
+
+	pt_backend_disable_smp(ctx);
+	if (ctx->mode == HWT_MODE_THREAD) {
+		TAILQ_FOREACH(thr, &ctx->threads, next) {
+			KASSERT(thr->private != NULL,
+			    ("%s: thr->private not set", __func__));
+			pt_ctx = (struct pt_ctx *)thr->private;
+			pt_deinit_ctx(pt_ctx);
+		}
+	} else {
+		CPU_FOREACH(cpu_id) {
+			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+				continue;
+			if (pt_pcpu[cpu_id].ctx != NULL) {
+				KASSERT(pt_pcpu[cpu_id].ctx ==
+					&pt_pcpu_ctx[cpu_id],
+				    ("%s: CPU mode tracing with non-cpu mode PT"
+				     " context active",
+					__func__));
+				pt_pcpu[cpu_id].ctx = NULL;
+			}
+			pt_ctx = &pt_pcpu_ctx[cpu_id];
+			pt_deinit_ctx(pt_ctx);
+			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+    uint64_t *data)
+{
+	struct pt_buffer *buf;
+
+	if (vm->ctx->mode == HWT_MODE_THREAD)
+		buf = &((struct pt_ctx *)vm->thr->private)->buf;
+	else
+		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+	mtx_lock_spin(&buf->lock);
+	*curpage = buf->curpage;
+	*curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+	mtx_unlock_spin(&buf->lock);
+
+	return (0);
+}
+
+/*
+ * HWT thread creation hook: allocates a 'struct pt_ctx' and stores it in
+ * 'thr->private'. Frees the context and returns an errno on failure.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+	struct pt_ctx *pt_ctx;
+	int error;
+
+	/* No M_WAITOK: this may be invoked in a non-sleepable context. */
+	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+	if (pt_ctx == NULL)
+		return (ENOMEM);
+	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+	if (error != 0) {
+		free(pt_ctx, M_PT);
+		return (error);
+	}
+	thr->private = pt_ctx;
+	return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+	struct pt_ctx *ctx;
+
+	ctx = (struct pt_ctx *)thr->private;
+
+	pt_deinit_ctx(ctx);
+	free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+	.hwt_backend_init = pt_backend_init,
+	.hwt_backend_deinit = pt_backend_deinit,
+
+	.hwt_backend_configure = pt_backend_configure,
+
+	.hwt_backend_enable = pt_backend_enable,
+	.hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+	.hwt_backend_enable_smp = pt_backend_enable_smp,
+	.hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+	.hwt_backend_read = pt_backend_read,
+	.hwt_backend_dump = pt_backend_dump,
+
+	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
+	.hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+	.ops = &pt_ops,
+	.name = "pt",
+	.kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+	struct hwt_record_entry record;
+	struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+	/* Prepare buffer record. */
+	mtx_lock_spin(&ctx->buf.lock);
+	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+	mtx_unlock_spin(&ctx->buf.lock);
+	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+/* Writes the TraceToPAPMI flag to the global status reset MSR. */
+static void
+pt_topa_status_clear(void)
+{
+	uint64_t status_reset;
+
+	status_reset = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+	wrmsr(MSR_IA_GLOBAL_STATUS_RESET,
+	    status_reset | GLOBAL_STATUS_FLAG_TRACETOPAPMI);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+	struct pt_buffer *buf;
+	struct pt_ctx *ctx;
+	uint64_t reg;
+
+	SDT_PROBE0(pt, , , topa__intr);
+
+	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+		return (0);
+	}
+	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+		/* ACK spurious or leftover interrupt. */
+		pt_topa_status_clear();
+		return (1);
+	}
+
+	ctx = pt_pcpu[curcpu].ctx;
+	buf = &ctx->buf;
+	KASSERT(buf->topa_hw != NULL,
+	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+	pt_cpu_toggle_local(ctx->save_area, false);
+	pt_update_buffer(buf);
+	pt_topa_status_clear();
+	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+	    TASKQUEUE_FAIL_IF_PENDING);
+
+	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+		pt_cpu_toggle_local(ctx->save_area, true);
+		lapic_reenable_pcint();
+	}
+	return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * allocates per-CPU tracing metadata and installs the ToPA NMI handler.
+ * Returns 0 on success or an errno after undoing any partial setup.
+ */
+static int
+pt_init(void)
+{
+	u_int cp[4];
+	int error;
+
+	dprintf("pt: Enumerating part 1\n");
+	cpuid_count(CPUID_PT_LEAF, 0, cp);
+	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+	dprintf("pt: ebx %x\n", cp[1]);
+	dprintf("pt: ecx %x\n", cp[2]);
+
+	pt_info.l0_eax = cp[0];
+	pt_info.l0_ebx = cp[1];
+	pt_info.l0_ecx = cp[2];
+
+	dprintf("pt: Enumerating part 2\n");
+	cpuid_count(CPUID_PT_LEAF, 1, cp);
+	dprintf("pt: eax %x\n", cp[0]);
+	dprintf("pt: ebx %x\n", cp[1]);
+
+	pt_info.l1_eax = cp[0];
+	pt_info.l1_ebx = cp[1];
+
+	error = hwt_backend_register(&backend);
+	if (error != 0) {
+		printf("pt: unable to register hwt backend, error %d\n", error);
+		return (error);
+	}
+	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+	    M_ZERO | M_WAITOK);
+	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+	    M_ZERO | M_WAITOK);
+
+	nmi_register_handler(pt_topa_intr);
+	if (!lapic_enable_pcint()) {
+		nmi_remove_handler(pt_topa_intr);
+		hwt_backend_unregister(&backend);
+		free(pt_pcpu, M_PT);
+		free(pt_pcpu_ctx, M_PT);
+		pt_pcpu = NULL;
+		pt_pcpu_ctx = NULL;
+		printf("pt: failed to setup interrupt line\n");
+		/* 'error' is still 0 here; report the failure explicitly. */
+		return (ENXIO);
+	}
+	initialized = true;
+	return (0);
+}
+
+/*
+ * Checks whether the CPU supports Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+	u_int cp[4];
+
+	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+		printf("pt: CPU does not support Intel Processor Trace\n");
+		return (false);
+	}
+	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+		printf("pt: XSAVE is not supported\n");
+		return (false);
+	}
+	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+		printf("pt: CPU does not support managing PT state using XSAVE\n");
+		return (false);
+	}
+	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+		printf("pt: XSAVE compaction is not supported\n");
+		return (false);
+	}
+	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+		printf("pt: CPU does not support XSAVES/XRSTORS\n");
+		return (false);
+	}
+
+	/* Require ToPA support. */
+	cpuid_count(CPUID_PT_LEAF, 0, cp);
+	if ((cp[2] & CPUPT_TOPA) == 0) {
+		printf("pt: ToPA is not supported\n");
+		return (false);
+	}
+	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+		printf("pt: multiple ToPA outputs are not supported\n");
+		return (false);
+	}
+
+	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+	    XFEATURE_ENABLED_PT, true, true);
+
+	return (true);
+}
+
+static void
+pt_deinit(void)
+{
+	if (!initialized)
+		return;
+	nmi_remove_handler(pt_topa_intr);
+	lapic_disable_pcint();
+	hwt_backend_unregister(&backend);
+	free(pt_pcpu, M_PT);
+	free(pt_pcpu_ctx, M_PT);
+	pt_pcpu = NULL;
+	initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+	switch (type) {
+	case MOD_LOAD:
+		if (!pt_supported() || pt_init() != 0) {
+			return (ENXIO);
+		}
+		break;
+	case MOD_UNLOAD:
+		pt_deinit();
+		break;
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+	uint64_t rtit_ctl;
+	register_t cr3_filter;
+	int nranges;
+	struct ipf_range {
+		vm_offset_t start;
+		vm_offset_t end;
+	} ip_ranges[PT_IP_FILTER_MAX_RANGES];
+	uint32_t mtc_freq;
+	uint32_t cyc_thresh;
+	uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
 
 #include "vmx_assym.h"
 
-#ifdef SMP
-#define	LK	lock ;
-#else
-#define	LK
-#endif
-
 /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
 #define VENTER  push %rbp ; mov %rsp,%rbp
 #define VLEAVE  pop %rbp