diff options
Diffstat (limited to 'sys/amd64')
-rw-r--r-- | sys/amd64/amd64/apic_vector.S | 6 | ||||
-rw-r--r-- | sys/amd64/amd64/mem.c | 4 | ||||
-rw-r--r-- | sys/amd64/amd64/minidump_machdep.c | 10 | ||||
-rw-r--r-- | sys/amd64/amd64/pmap.c | 305 | ||||
-rw-r--r-- | sys/amd64/amd64/trap.c | 2 | ||||
-rw-r--r-- | sys/amd64/include/param.h | 5 | ||||
-rw-r--r-- | sys/amd64/include/pmap.h | 27 | ||||
-rw-r--r-- | sys/amd64/include/vmparam.h | 41 | ||||
-rw-r--r-- | sys/amd64/pt/pt.c | 978 | ||||
-rw-r--r-- | sys/amd64/pt/pt.h | 49 | ||||
-rw-r--r-- | sys/amd64/vmm/intel/vmx_support.S | 6 |
11 files changed, 1283 insertions, 150 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 6e51ebff298a..5bb877a174f7 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -49,12 +49,6 @@ #include <machine/specialreg.h> #include <x86/apicreg.h> -#ifdef SMP -#define LK lock ; -#else -#define LK -#endif - .text SUPERALIGN_TEXT /* End Of Interrupt to APIC */ diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c index 413b7c74890e..851f2df0e6e1 100644 --- a/sys/amd64/amd64/mem.c +++ b/sys/amd64/amd64/mem.c @@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags) * PAGE_SIZE, the uiomove() call does not * access past the end of the direct map. */ - if (v >= DMAP_MIN_ADDRESS && - v < DMAP_MIN_ADDRESS + dmaplimit) { + if (v >= kva_layout.dmap_low && + v < kva_layout.dmap_high) { error = uiomove((void *)v, c, uio); break; } diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c index 6d0917e16099..43bf81a991bf 100644 --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) * tables, so care must be taken to read each entry only once. */ pmapsize = 0; - for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) { + for (va = kva_layout.km_low; va < kva_end; ) { /* * We always write a page, even if it is zero. 
Each * page written corresponds to 1GB of space @@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; - mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; - mdhdr.dmapbase = DMAP_MIN_ADDRESS; - mdhdr.dmapend = DMAP_MAX_ADDRESS; + mdhdr.kernbase = kva_layout.km_low; + mdhdr.dmapbase = kva_layout.dmap_low; + mdhdr.dmapend = kva_layout.dmap_high; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, @@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) /* Dump kernel page directory pages */ bzero(fakepd, sizeof(fakepd)); - for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) { + for (va = kva_layout.km_low; va < kva_end; va += NBPDP) { ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 9c985df13ddf..b2bfe633adcc 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -415,7 +415,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, static int ndmpdp; vm_paddr_t dmaplimit; -vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; +vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, @@ -475,11 +475,36 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ +static u_int64_t DMPML4phys; /* ... 
level 4, for la57 */ static int ndmpdpphys; /* number of DMPDPphys pages */ vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ vm_paddr_t KERNend; /* and the end */ +struct kva_layout_s kva_layout = { + .kva_min = KV4ADDR(PML4PML4I, 0, 0, 0), + .dmap_low = KV4ADDR(DMPML4I, 0, 0, 0), + .dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0), + .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0), + .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0), + .km_low = KV4ADDR(KPML4BASE, 0, 0, 0), + .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1), + .rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0), +}; + +struct kva_layout_s kva_layout_la57 = { + .kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */ + .dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0), + .dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0), + .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0), + .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0), + .km_low = KV4ADDR(KPML4BASE, 0, 0, 0), + .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1), + .rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0), +}; + /* * pmap_mapdev support pre initialization (i.e. 
console) */ @@ -549,8 +574,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; -#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ - (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) +#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= kva_layout.lm_low && \ + (va) < kva_layout.lm_high) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, @@ -1336,7 +1361,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - bool remove_pt, struct spglist *free, struct rwlock **lockp); + bool demote_kpde, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); @@ -1722,7 +1747,7 @@ create_pagetables(vm_paddr_t *firstaddr) { pd_entry_t *pd_p; pdp_entry_t *pdp_p; - pml4_entry_t *p4_p; + pml4_entry_t *p4_p, *p4d_p; pml5_entry_t *p5_p; uint64_t DMPDkernphys; vm_paddr_t pax; @@ -1732,7 +1757,7 @@ create_pagetables(vm_paddr_t *firstaddr) vm_offset_t kasankernbase; int kasankpdpi, kasankpdi, nkasanpte; #endif - int i, j, ndm1g, nkpdpe, nkdmpde; + int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys; TSENTER(); /* Allocate page table pages for the direct map */ @@ -1740,15 +1765,30 @@ create_pagetables(vm_paddr_t *firstaddr) if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); - if (ndmpdpphys > NDMPML4E) { - /* - * Each NDMPML4E allows 512 GB, so limit to that, - * and then readjust ndmpdp and ndmpdpphys. 
- */ - printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); - Maxmem = atop(NDMPML4E * NBPML4); - ndmpdpphys = NDMPML4E; - ndmpdp = NDMPML4E * NPDEPG; + if (la57) { + ndmpml4phys = howmany(ndmpdpphys, NPML4EPG); + if (ndmpml4phys > NDMPML5E) { + printf("NDMPML5E limits system to %ld GB\n", + (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024); + Maxmem = atop(NDMPML5E * NBPML5); + ndmpml4phys = NDMPML5E; + ndmpdpphys = ndmpml4phys * NPML4EPG; + ndmpdp = ndmpdpphys * NPDEPG; + } + DMPML4phys = allocpages(firstaddr, ndmpml4phys); + } else { + if (ndmpdpphys > NDMPML4E) { + /* + * Each NDMPML4E allows 512 GB, so limit to + * that, and then readjust ndmpdp and + * ndmpdpphys. + */ + printf("NDMPML4E limits system to %d GB\n", + NDMPML4E * 512); + Maxmem = atop(NDMPML4E * NBPML4); + ndmpdpphys = NDMPML4E; + ndmpdp = NDMPML4E * NPDEPG; + } } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; @@ -1773,7 +1813,13 @@ create_pagetables(vm_paddr_t *firstaddr) dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages. */ + if (la57) { + KPML5phys = allocpages(firstaddr, 1); + p5_p = (pml5_entry_t *)KPML5phys; + } KPML4phys = allocpages(firstaddr, 1); + p4_p = (pml4_entry_t *)KPML4phys; + KPDPphys = allocpages(firstaddr, NKPML4E); #ifdef KASAN KASANPDPphys = allocpages(firstaddr, NKASANPML4E); @@ -1893,6 +1939,16 @@ create_pagetables(vm_paddr_t *firstaddr) } /* + * Connect the Direct Map slots up to the PML4. + * pml5 entries for DMAP are handled below in global pml5 loop. + */ + p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I]; + for (i = 0; i < ndmpdpphys; i++) { + p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | + pg_nx; + } + + /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) 
@@ -1911,11 +1967,6 @@ create_pagetables(vm_paddr_t *firstaddr) } } - /* And recursively map PML4 to itself in order to get PTmap */ - p4_p = (pml4_entry_t *)KPML4phys; - p4_p[PML4PML4I] = KPML4phys; - p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; - #ifdef KASAN /* Connect the KASAN shadow map slots up to the PML4. */ for (i = 0; i < NKASANPML4E; i++) { @@ -1938,25 +1989,15 @@ create_pagetables(vm_paddr_t *firstaddr) } #endif - /* Connect the Direct Map slots up to the PML4. */ - for (i = 0; i < ndmpdpphys; i++) { - p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); - p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; - } - /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } - kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - if (la57) { /* XXXKIB bootstrap KPML5phys page is lost */ - KPML5phys = allocpages(firstaddr, 1); - for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG; - i++) { + for (i = 0; i < NPML5EPG; i++) { if (i == PML5PML5I) { /* * Recursively map PML5 to itself in @@ -1964,6 +2005,10 @@ create_pagetables(vm_paddr_t *firstaddr) */ p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A | X86_PG_M | X86_PG_V | pg_nx; + } else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) { + /* Connect DMAP pml4 pages to PML5. 
*/ + p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) | + X86_PG_RW | X86_PG_V | pg_nx; } else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) { p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A | X86_PG_M | X86_PG_V; @@ -1971,6 +2016,10 @@ create_pagetables(vm_paddr_t *firstaddr) p5_p[i] = 0; } } + } else { + /* Recursively map PML4 to itself in order to get PTmap */ + p4_p[PML4PML4I] = KPML4phys; + p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; } TSEXIT(); } @@ -2024,7 +2073,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) */ virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - (vm_paddr_t)kernphys); - virtual_end = VM_MAX_KERNEL_ADDRESS; + virtual_end = kva_layout.km_high; /* * Enable PG_G global pages, then switch to the kernel page @@ -2046,9 +2095,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. + * + * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after + * kva_layout is fixed. 
*/ PMAP_LOCK_INIT(kernel_pmap); if (la57) { + kva_layout = kva_layout_la57; vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; PTmap = (vm_offset_t)P5Tmap; @@ -2059,6 +2112,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) kernel_pmap->pm_cr3 = KPML5phys; pmap_pt_page_count_adj(kernel_pmap, 1); /* top-level page */ } else { + kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; } @@ -2420,6 +2474,8 @@ pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; + pml4_entry_t *pml4e; + unsigned long lm_max; int error, i, ret, skz63; /* L1TF, reserve page @0 unconditionally */ @@ -2545,10 +2601,15 @@ pmap_init(void) lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); - if (lm_ents > LMEPML4I - LMSPML4I + 1) - lm_ents = LMEPML4I - LMSPML4I + 1; + lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4; + if (lm_ents > lm_max) { + printf( + "pmap: shrinking large map from requested %d slots to %ld slots\n", + lm_ents, lm_max); + lm_ents = lm_max; + } #ifdef KMSAN - if (lm_ents > KMSANORIGPML4I - LMSPML4I) { + if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) { printf( "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", lm_ents, KMSANORIGPML4I - LMSPML4I); @@ -2559,18 +2620,27 @@ pmap_init(void) printf("pmap: large map %u PML4 slots (%lu GB)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { - large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, + large_vmem = vmem_create("large", kva_layout.lm_low, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } + if (la57) { + for (i = 0; i < howmany((vm_offset_t)NBPML4 * + lm_ents, NBPML5); i++) { + m = pmap_large_map_getptp_unlocked(); + kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | + pg_nx | 
VM_PAGE_TO_PHYS(m); + } + } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - /* XXXKIB la57 */ - kernel_pml4[LMSPML4I + i] = X86_PG_V | - X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | - VM_PAGE_TO_PHYS(m); + pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low + + (u_long)i * NBPML4); + *pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | + pg_nx | VM_PAGE_TO_PHYS(m); } } } @@ -3899,7 +3969,7 @@ pmap_kextract(vm_offset_t va) pd_entry_t pde; vm_paddr_t pa; - if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); @@ -4040,7 +4110,7 @@ pmap_qremove(vm_offset_t sva, int count) * enough to one of those pmap_enter() calls for it to * be caught up in a promotion. */ - KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); + KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va)); KASSERT((*vtopde(va) & X86_PG_PS) == 0, ("pmap_qremove on promoted va %#lx", va)); @@ -4328,21 +4398,13 @@ void pmap_pinit_pml5(vm_page_t pml5pg) { pml5_entry_t *pm_pml5; + int i; pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); - - /* - * Add pml5 entry at top of KVA pointing to existing pml4 table, - * entering all existing kernel mappings into level 5 table. - */ - pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | - X86_PG_RW | X86_PG_A | X86_PG_M; - - /* - * Install self-referential address mapping entry. 
- */ - pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | - X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A; + for (i = 0; i < NPML5EPG / 2; i++) + pm_pml5[i] = 0; + for (; i < NPML5EPG; i++) + pm_pml5[i] = kernel_pmap->pm_pmltop[i]; } static void @@ -4899,8 +4961,8 @@ pmap_release(pmap_t pmap) m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); if (pmap_is_la57(pmap)) { - pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; - pmap->pm_pmltop[PML5PML5I] = 0; + for (i = NPML5EPG / 2; i < NPML5EPG; i++) + pmap->pm_pmltop[i] = 0; } else { for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pmltop[KPML4BASE + i] = 0; @@ -4942,7 +5004,7 @@ pmap_release(pmap_t pmap) static int kvm_size(SYSCTL_HANDLER_ARGS) { - unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; + unsigned long ksize = kva_layout.km_high - kva_layout.km_low; return sysctl_handle_long(oidp, &ksize, 0, req); } @@ -4953,7 +5015,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, static int kvm_free(SYSCTL_HANDLER_ARGS) { - unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; + unsigned long kfree = kva_layout.km_high - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } @@ -5031,7 +5093,7 @@ pmap_page_array_startup(long pages) vm_page_array_size = pages; - start = VM_MIN_KERNEL_ADDRESS; + start = kva_layout.km_low; end = start + pages * sizeof(struct vm_page); for (va = start; va < end; va += NBPDR) { pfn = first_page + (va - start) / sizeof(struct vm_page); @@ -6067,8 +6129,8 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * so the direct map region is the only part of the * kernel address space that must be handled here. 
*/ - KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && - va < DMAP_MAX_ADDRESS), + KASSERT(!in_kernel || (va >= kva_layout.dmap_low && + va < kva_layout.dmap_high), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* @@ -6165,8 +6227,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void -pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - bool remove_pt) +pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; @@ -6174,12 +6235,8 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if (remove_pt) - mpte = pmap_remove_pt_page(pmap, va); - else - mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)); - if (mpte == NULL) - panic("pmap_remove_kernel_pde: Missing pt page."); + mpte = pmap_remove_pt_page(pmap, va); + KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page")); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; @@ -6209,7 +6266,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * pmap_remove_pde: do the things to unmap a superpage in a process */ static int -pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt, +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; @@ -6249,9 +6306,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt, pmap_delayed_invl_page(m); } } - if (pmap == kernel_pmap) { - pmap_remove_kernel_pde(pmap, pdq, sva, remove_pt); - } else { + if (pmap != kernel_pmap) { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(vm_page_any_valid(mpte), @@ -6262,6 +6317,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, 
vm_offset_t sva, bool remove_pt, mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, false); } + } else if (demote_kpde) { + pmap_remove_kernel_pde(pmap, pdq, sva); + } else { + mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva)); + if (vm_page_any_valid(mpte)) { + mpte->valid = 0; + pmap_zero_page(mpte); + } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } @@ -7183,7 +7246,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); @@ -7512,6 +7575,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); + KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) != + PMAP_ENTER_NORECLAIM, + ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -7573,8 +7639,8 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, * the mapping is not from kernel_pmap, then * a reserved PT page could be freed. */ - (void)pmap_remove_pde(pmap, pde, va, - pmap != kernel_pmap, &free, lockp); + (void)pmap_remove_pde(pmap, pde, va, false, &free, + lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { @@ -7584,10 +7650,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, * before any changes to mappings are * made. Abort on failure. 
*/ - mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - if (pmap_insert_pt_page(pmap, mt, false, false)) { - if (pdpg != NULL) - pdpg->ref_count--; + mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); + if (pmap_insert_pt_page(pmap, mt, false, + false)) { CTR1(KTR_PMAP, "pmap_enter_pde: cannot ins kern ptp va %#lx", va); @@ -7641,6 +7706,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); + else { + KASSERT(va >= VM_MAXUSER_ADDRESS && + (*pde & (PG_PS | PG_V)) == PG_V, + ("pmap_enter_pde: invalid kernel PDE")); + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt != NULL, + ("pmap_enter_pde: missing kernel PTP")); + } if (uwptpg != NULL) { mt = pmap_remove_pt_page(pmap, va); KASSERT(mt == uwptpg, @@ -9550,7 +9623,7 @@ pmap_unmapdev(void *p, vm_size_t size) va = (vm_offset_t)p; /* If we gave a direct map region in pmap_mapdev, do nothing */ - if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) + if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) return; offset = va & PAGE_MASK; size = round_page(offset + size); @@ -9649,6 +9722,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pat_mode == ma) + return; m->md.pat_mode = ma; @@ -9668,6 +9743,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) { int error; + if (m->md.pat_mode == ma) + return; + m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) @@ -9724,7 +9802,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) int error; /* Only supported within the kernel map. 
*/ - if (va < VM_MIN_KERNEL_ADDRESS) + if (va < kva_layout.km_low) return (EINVAL); PMAP_LOCK(kernel_pmap); @@ -9755,7 +9833,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ - if (base < DMAP_MIN_ADDRESS) + if (base < kva_layout.dmap_low) return (EINVAL); /* @@ -9778,7 +9856,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pte_bits |= X86_PG_RW; } if ((prot & VM_PROT_EXECUTE) == 0 || - va < VM_MIN_KERNEL_ADDRESS) { + va < kva_layout.km_low) { pde_bits |= pg_nx; pte_bits |= pg_nx; } @@ -9874,7 +9952,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pdpe, pde_bits, pde_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ @@ -9904,7 +9982,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pde, pde_bits, pde_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ @@ -9932,7 +10010,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pte, pte_bits, pte_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. 
*/ @@ -10699,19 +10777,28 @@ pmap_large_map_getptp(void) static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { + pml4_entry_t *pml4; vm_pindex_t pml4_idx; vm_paddr_t mphys; - pml4_idx = pmap_pml4e_index(va); - KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, - ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " - "%#jx lm_ents %d", - (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, - ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " - "LMSPML4I %#jx lm_ents %d", - (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - mphys = kernel_pml4[pml4_idx] & PG_FRAME; + KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low + + (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va)); + if (la57) { + pml4 = pmap_pml4e(kernel_pmap, va); + mphys = *pml4 & PG_FRAME; + } else { + pml4_idx = pmap_pml4e_index(va); + + KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, + ("pmap_large_map_pdpe: va %#jx out of range idx %#jx " + "LMSPML4I %#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, + ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " + "LMSPML4I %#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + mphys = kernel_pml4[pml4_idx] & PG_FRAME; + } return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } @@ -10904,8 +10991,8 @@ pmap_large_unmap(void *svaa, vm_size_t len) struct spglist spgf; sva = (vm_offset_t)svaa; - if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && - sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) + if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low && + sva + len < kva_layout.dmap_high)) return; SLIST_INIT(&spgf); @@ -11151,11 +11238,10 @@ pmap_large_map_wb(void *svap, vm_size_t len) sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); - if (sva >= DMAP_MIN_ADDRESS && eva <= 
DMAP_MIN_ADDRESS + dmaplimit) { + if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) { pmap_large_map_flush_range(sva, len); } else { - KASSERT(sva >= LARGEMAP_MIN_ADDRESS && - eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, + KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } @@ -11196,8 +11282,8 @@ pmap_pti_init(void) VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); - for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && - va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { + for (va = kva_layout.km_low; va <= kva_layout.km_high && + va >= kva_layout.km_low && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } @@ -12081,10 +12167,12 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: - sbuf_printf(sb, "\nRecursive map:\n"); + if (!la57) + sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: - sbuf_printf(sb, "\nDirect map:\n"); + if (!la57) + sbuf_printf(sb, "\nDirect map:\n"); break; #ifdef KASAN case KASANPML4I: @@ -12103,7 +12191,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: - sbuf_printf(sb, "\nLarge map:\n"); + if (!la57) + sbuf_printf(sb, "\nLarge map:\n"); break; } diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 09ac0a67dbef..eefddad2f142 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -769,7 +769,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode) return (-1); } } - if (eva >= VM_MIN_KERNEL_ADDRESS) { + if (eva >= kva_layout.km_low) { /* * Don't allow user-mode faults in kernel address space. 
*/ diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index 8db314fa034d..1bbb302259d6 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -146,8 +146,9 @@ #define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT) -#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \ - || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS)) +#define INKERNEL(va) \ + (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \ + ((va) >= kva_layout.km_low && (va) < kva_layout.km_high)) #ifdef SMP #define SC_TABLESIZE 1024 /* Must be power of 2. */ diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 7d3e91bcd9b9..a0ca97f2d5a0 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -169,11 +169,12 @@ * the recursive page table map. */ #define NDMPML4E 8 +#define NDMPML5E 32 /* - * These values control the layout of virtual memory. The starting address - * of the direct map, which is controlled by DMPML4I, must be a multiple of - * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * These values control the layout of virtual memory. The starting + * address of the direct map is controlled by DMPML4I on LA48 and + * DMPML5I on LA57. * * Note: KPML4I is the index of the (single) level 4 page that maps * the KVA that holds KERNBASE, while KPML4BASE is the index of the @@ -191,6 +192,7 @@ #define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ #define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ +#define DMPML5I (NPML5EPG / 2 + 1) #define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ @@ -200,9 +202,14 @@ #define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E) #define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E) -/* Large map: index of the first and max last pml4 entry */ +/* + * Large map: index of the first and max last pml4/la48 and pml5/la57 + * entry. 
+ */ #define LMSPML4I (PML4PML4I + 1) #define LMEPML4I (KASANPML4I - 1) +#define LMSPML5I (DMPML5I + NDMPML5E) +#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */ /* * XXX doesn't really belong here I guess... @@ -548,6 +555,18 @@ pmap_pml5e_index(vm_offset_t va) return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1)); } +struct kva_layout_s { + vm_offset_t kva_min; + vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */ + vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */ + vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */ + vm_offset_t lm_high; /* LARGEMAP_MAX_ADDRESS */ + vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */ + vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */ + vm_offset_t rec_pt; +}; +extern struct kva_layout_s kva_layout; + #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 0cd9bb4fa7a4..ef352e776af6 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -163,6 +163,7 @@ * Virtual addresses of things. Derived from the page directory and * page table indexes from pmap.h for precision. 
* + LA48: * 0x0000000000000000 - 0x00007fffffffffff user map * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole) * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot) @@ -175,18 +176,29 @@ * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map * + * LA57: + * 0x0000000000000000 - 0x00ffffffffffffff user map + * 0x0100000000000000 - 0xfeffffffffffffff does not exist (hole) + * 0xff00000000000000 - 0xff00ffffffffffff recursive page table (256TB slot) + * 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 256TB slots) + * 0xff21000000000000 - 0xff40ffffffffffff large map + * 0xff41000000000000 - 0xffff7fffffffffff unused + * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry) + * 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional + * 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional + * 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused + * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional + * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map + * * Within the kernel map: * * 0xfffffe0000000000 vm_page_array * 0xffffffff80000000 KERNBASE */ -#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0) -#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \ - NPDPEPG-1, NPDEPG-1, NPTEPG-1) - -#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0) -#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS_LA48 KV4ADDR(KPML4BASE, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low +#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high #define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0) #define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0) @@ -199,9 +211,6 @@ #define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \ 0, 0, 0) -#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) -#define 
LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) - /* * Formally kernel mapping starts at KERNBASE, but kernel linker * script leaves first PDE reserved. For legacy BIOS boot, kernel is @@ -239,21 +248,21 @@ * vt fb startup needs to be reworked. */ #define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit) -#define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \ - (va) < (DMAP_MIN_ADDRESS + dmaplimit)) +#define VIRT_IN_DMAP(va) \ + ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit) #define PMAP_HAS_DMAP 1 -#define PHYS_TO_DMAP(x) ({ \ +#define PHYS_TO_DMAP(x) __extension__ ({ \ KASSERT(PHYS_IN_DMAP(x), \ ("physical address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) | DMAP_MIN_ADDRESS; }) + (x) + kva_layout.dmap_low; }) -#define DMAP_TO_PHYS(x) ({ \ +#define DMAP_TO_PHYS(x) __extension__ ({ \ KASSERT(VIRT_IN_DMAP(x), \ ("virtual address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) & ~DMAP_MIN_ADDRESS; }) + (x) - kva_layout.dmap_low; }) /* * amd64 maps the page array into KVA so that it can be more easily @@ -274,7 +283,7 @@ */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \ - VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5) + kva_layout.km_low + 1) * 3 / 5) #endif /* initial pagein size of beginning of executable file */ diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c new file mode 100644 index 000000000000..c7b75767680a --- /dev/null +++ b/sys/amd64/pt/pt.c @@ -0,0 +1,978 @@ +/* + * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * hwt(4) Intel Processor Trace (PT) backend + * + * Driver Design Overview + * + * - Since PT is configured on a per-core basis, the driver uses + * 'smp_rendezvous' to start and disable tracing on each target core. + * - PT-specific resources are stored in a 'struct pt_ctx' context structure for + * each traced CPU core or thread. 
Upon initialization, a ToPA configuration + * is generated for each 'pt_ctx' structure using the HWT tracing buffers. + * The HWT tracing buffer is split into 4K ToPA entries. Currently, each + * 4K ToPA entry is configured to trigger an interrupt after it is filled. + * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all + * relevant PT registers. Every time a traced thread is switched + * out or in, its state will be saved to or loaded from its corresponding + * 'pt_ctx' context. + * - When tracing starts, the PT hardware will start writing data into the + * tracing buffer. When a TOPA_INT entry is filled, it will trigger an + * interrupt before continuing. The interrupt handler will then fetch the + * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record. + * The driver is currently configured to use the NMI interrupt line. + * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records + * and uses the offsets to decode data from the tracing buffer. + * + * Future improvements and limitations + * + * - We currently configure the PT hardware to trigger an interrupt whenever + * a 4K ToPA entry is filled. While this is fine when tracing smaller + * functions or infrequent code paths, this will generate too much interrupt + * traffic when tracing hotter functions. A proper solution for this issue + * should estimate the amount of data generated by the current configuration + * and use it to determine interrupt frequency. + * + * - Support for more tracing options and PT features. 
+ * + */ + +#include <sys/systm.h> +#include <sys/hwt.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sdt.h> +#include <sys/smp.h> +#include <sys/taskqueue.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <machine/fpu.h> +#include <machine/smp.h> +#include <machine/specialreg.h> + +#include <x86/apicvar.h> +#include <x86/x86_var.h> + +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_vm.h> +#include <dev/hwt/hwt_backend.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_cpu.h> +#include <dev/hwt/hwt_record.h> +#include <dev/hwt/hwt_thread.h> + +#include <amd64/pt/pt.h> + +#ifdef PT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif +#define PT_SUPPORTED_FLAGS \ + (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \ + RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN) +#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE) +#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT) +#define PT_MAX_IP_RANGES 2 + +#define PT_TOPA_MASK_PTRS 0x7f +#define PT_TOPA_PAGE_MASK 0xffffff80 +#define PT_TOPA_PAGE_SHIFT 7 + +#define CPUID_PT_LEAF 0x14 + +MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); + +SDT_PROVIDER_DEFINE(pt); +SDT_PROBE_DEFINE(pt, , , topa__intr); + +TASKQUEUE_FAST_DEFINE_THREAD(pt); + +static void pt_send_buffer_record(void *arg, int pending __unused); +static int pt_topa_intr(struct trapframe *tf); + +/* + * Intel Processor Trace XSAVE-managed state. + */ +struct pt_ext_area { + uint64_t rtit_ctl; + uint64_t rtit_output_base; + uint64_t rtit_output_mask_ptrs; + uint64_t rtit_status; + uint64_t rtit_cr3_match; + uint64_t rtit_addr0_a; + uint64_t rtit_addr0_b; + uint64_t rtit_addr1_a; + uint64_t rtit_addr1_b; +}; + +struct pt_buffer { + uint64_t *topa_hw; /* ToPA table entries. 
*/ + size_t size; + struct mtx lock; /* Lock for fields below. */ + vm_offset_t offset; + uint64_t wrap_count; + int curpage; +}; + +struct pt_ctx { + int id; + struct pt_buffer buf; /* ToPA buffer metadata */ + struct task task; /* ToPA buffer notification task */ + struct hwt_context *hwt_ctx; + uint8_t *save_area; /* PT XSAVE area */ +}; +/* PT tracing contexts used for CPU mode. */ +static struct pt_ctx *pt_pcpu_ctx; + +enum pt_cpu_state { + PT_DISABLED = 0, + PT_STOPPED, + PT_ACTIVE +}; + +static struct pt_cpu { + struct pt_ctx *ctx; /* active PT tracing context */ + enum pt_cpu_state state; /* used as part of trace stop protocol */ +} *pt_pcpu; + +/* + * PT-related CPUID bits. + */ +static struct pt_cpu_info { + uint32_t l0_eax; + uint32_t l0_ebx; + uint32_t l0_ecx; + uint32_t l1_eax; + uint32_t l1_ebx; + size_t xsave_area_size; + size_t xstate_hdr_offset; + size_t pt_xsave_offset; +} pt_info __read_mostly; + +static bool initialized = false; +static int cpu_mode_ctr = 0; + +static __inline enum pt_cpu_state +pt_cpu_get_state(int cpu_id) +{ + return (atomic_load_int(&pt_pcpu[cpu_id].state)); +} + +static __inline void +pt_cpu_set_state(int cpu_id, enum pt_cpu_state state) +{ + atomic_store_int(&pt_pcpu[cpu_id].state, state); +} + +static __inline struct xstate_hdr * +pt_ctx_get_xstate_hdr(struct pt_ctx *ctx) +{ + return ((struct xstate_hdr *)(ctx->save_area + + pt_info.xstate_hdr_offset)); +} + + +static __inline struct pt_ext_area * +pt_ctx_get_ext_area(struct pt_ctx *ctx) +{ + return ((struct pt_ext_area *)(ctx->save_area + + pt_info.pt_xsave_offset)); +} + +/* + * Updates current trace buffer offset from the + * ToPA MSRs. Records if the trace buffer wrapped. + */ +static __inline void +pt_update_buffer(struct pt_buffer *buf) +{ + uint64_t reg; + int curpage; + + /* Update buffer offset. 
*/ + reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); + curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; + mtx_lock_spin(&buf->lock); + /* Check if the output wrapped. */ + if (buf->curpage > curpage) + buf->wrap_count++; + buf->curpage = curpage; + buf->offset = reg >> 32; + mtx_unlock_spin(&buf->lock); + + dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, + buf->wrap_count, buf->curpage, buf->offset); +} + +static __inline void +pt_fill_buffer_record(int id, struct pt_buffer *buf, + struct hwt_record_entry *rec) +{ + rec->record_type = HWT_RECORD_BUFFER; + rec->buf_id = id; + rec->curpage = buf->curpage; + rec->offset = buf->offset + (buf->wrap_count * buf->size); +} + +/* + * Enables or disables tracing on curcpu + * using the XSAVE/XRSTOR PT extensions. + */ +static void +pt_cpu_toggle_local(uint8_t *save_area, bool enable) +{ + u_long xcr0, cr0; + u_long xss; + + cr0 = rcr0(); + if (cr0 & CR0_TS) + clts(); + xcr0 = rxcr(XCR0); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0 | PT_XSAVE_MASK); + xss = rdmsr(MSR_IA32_XSS); + wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT); + + if (!enable) { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0, + ("%s: PT is disabled", __func__)); + xsaves(save_area, XFEATURE_ENABLED_PT); + } else { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0, + ("%s: PT is enabled", __func__)); + xrstors(save_area, XFEATURE_ENABLED_PT); + } + wrmsr(MSR_IA32_XSS, xss); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0); + if (cr0 & CR0_TS) + load_cr0(cr0); +} + +/* + * Starts PT tracing on 'curcpu'. 
+ */ +static void +pt_cpu_start(void *dummy) +{ + struct pt_cpu *cpu; + + cpu = &pt_pcpu[curcpu]; + MPASS(cpu->ctx != NULL); + + dprintf("%s: curcpu %d\n", __func__, curcpu); + load_cr4(rcr4() | CR4_XSAVE); + wrmsr(MSR_IA32_RTIT_STATUS, 0); + pt_cpu_set_state(curcpu, PT_ACTIVE); + pt_cpu_toggle_local(cpu->ctx->save_area, true); +} + +/* + * Stops PT tracing on 'curcpu'. + * Updates trace buffer offset to ensure + * any data generated between the last interrupt + * and the trace stop gets picked up by userspace. + */ +static void +pt_cpu_stop(void *dummy) +{ + struct pt_cpu *cpu; + struct pt_ctx *ctx; + + /* Shutdown may occur before PT gets properly configured. */ + if (pt_cpu_get_state(curcpu) == PT_DISABLED) + return; + + cpu = &pt_pcpu[curcpu]; + ctx = cpu->ctx; + MPASS(ctx != NULL); + dprintf("%s: curcpu %d\n", __func__, curcpu); + + pt_cpu_set_state(curcpu, PT_STOPPED); + pt_cpu_toggle_local(cpu->ctx->save_area, false); + pt_update_buffer(&ctx->buf); +} + +/* + * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'. + * The HWT trace buffer is split into 4K ToPA table entries and used + * as a circular buffer, meaning that the last ToPA entry points to + * the first ToPA entry. Each entry is configured to raise an + * interrupt after being filled. + */ +static int +pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm) +{ + struct pt_buffer *buf; + size_t topa_size; + int i; + + topa_size = TOPA_SIZE_4K; + buf = &ctx->buf; + + KASSERT(buf->topa_hw == NULL, + ("%s: ToPA info already exists", __func__)); + buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT, + M_ZERO | M_WAITOK); + dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw); + buf->size = vm->npages * PAGE_SIZE; + for (i = 0; i < vm->npages; i++) { + buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size; + /* + * XXX: TOPA_INT should ideally be set according to + * expected amount of incoming trace data. 
Too few TOPA_INT + * entries will not trigger interrupts often enough when tracing + * smaller functions. + */ + buf->topa_hw[i] |= TOPA_INT; + } + buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END; + + return (0); +} + +/* + * Configures IP filtering for trace generation. + * A maximum of 2 ranges can be specified due to + * limitations imposed by the XSAVE/XRSTOR PT extensions. + */ +static int +pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg) +{ + struct pt_ext_area *pt_ext; + int nranges_supp, n, error = 0; + + pt_ext = pt_ctx_get_ext_area(ctx); + if (pt_info.l0_ebx & CPUPT_IPF) { + nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >> + CPUPT_NADDR_S; + + if (nranges_supp > PT_IP_FILTER_MAX_RANGES) + nranges_supp = PT_IP_FILTER_MAX_RANGES; + n = cfg->nranges; + if (n > nranges_supp) { + printf("%s: %d IP filtering ranges requested, CPU " + "supports %d, truncating\n", + __func__, n, nranges_supp); + n = nranges_supp; + } + + switch (n) { + case 2: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1)); + pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start; + pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end; + case 1: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0)); + pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start; + pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end; + break; + default: + error = (EINVAL); + break; + }; + } else + error = (ENXIO); + + return (error); +} + +static int +pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) +{ + + dprintf("%s: ctx id %d\n", __func__, ctx_id); + + KASSERT(pt_ctx->buf.topa_hw == NULL, + ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx)); + + memset(pt_ctx, 0, sizeof(struct pt_ctx)); + mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN); + pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64, + M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx->save_area == NULL) + return (ENOMEM); + dprintf("%s: preparing ToPA buffer\n", __func__); + if (pt_topa_prepare(pt_ctx, vm) 
!= 0) { + dprintf("%s: failed to prepare ToPA buffer\n", __func__); + free(pt_ctx->save_area, M_PT); + return (ENOMEM); + } + + pt_ctx->id = ctx_id; + TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); + + return (0); +} + +static void +pt_deinit_ctx(struct pt_ctx *pt_ctx) +{ + + if (pt_ctx->buf.topa_hw != NULL) + free(pt_ctx->buf.topa_hw, M_PT); + if (pt_ctx->save_area != NULL) + free(pt_ctx->save_area, M_PT); + memset(pt_ctx, 0, sizeof(*pt_ctx)); + pt_ctx->buf.topa_hw = NULL; +} + +/* + * HWT backend configuration method. + * + * Checks and translates the user-defined configuration to a + * set of PT tracing features. Uses the feature set to initialize + * the tracing context for the target CPU or thread. + */ +static int +pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) +{ + struct hwt_cpu *hwt_cpu; + struct hwt_thread *thr; + struct pt_ctx *pt_ctx; + struct pt_cpu_config *cfg; + struct pt_ext_area *pt_ext; + struct xstate_hdr *hdr; + int error; + + dprintf("%s\n", __func__); + + cfg = (struct pt_cpu_config *)ctx->config; + pt_ctx = NULL; + + /* Clear any flags we don't support yet. */ + cfg->rtit_ctl &= PT_SUPPORTED_FLAGS; + if (cfg->rtit_ctl & RTIT_CTL_MTCEN) { + if ((pt_info.l0_ebx & CPUPT_MTC) == 0) { + printf("%s: CPU does not support generating MTC " + "packets\n", __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) { + if ((pt_info.l0_ebx & CPUPT_CR3) == 0) { + printf("%s: CPU does not support CR3 filtering\n", + __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) { + if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) { + printf("%s: CPU does not support TNT\n", __func__); + return (ENXIO); + } + } + /* TODO: support for more config bits. 
*/ + + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + if (hwt_cpu->cpu_id != cpu_id) + continue; + pt_ctx = &pt_pcpu_ctx[cpu_id]; + break; + } + } else { + TAILQ_FOREACH(thr, &ctx->threads, next) { + if (thr->thread_id != thread_id) + continue; + KASSERT(thr->private != NULL, + ("%s: hwt thread private" + " not set, thr %p", + __func__, thr)); + pt_ctx = (struct pt_ctx *)thr->private; + break; + } + } + if (pt_ctx == NULL) + return (ENOENT); + + dprintf("%s: preparing MSRs\n", __func__); + pt_ext = pt_ctx_get_ext_area(pt_ctx); + hdr = pt_ctx_get_xstate_hdr(pt_ctx); + + pt_ext->rtit_ctl |= cfg->rtit_ctl; + if (cfg->nranges != 0) { + dprintf("%s: preparing IPF ranges\n", __func__); + if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0) + return (error); + } + pt_ctx->hwt_ctx = ctx; + pt_ext->rtit_ctl |= RTIT_CTL_TOPA; + pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw); + pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS; + hdr->xstate_bv = XFEATURE_ENABLED_PT; + hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT | + XSTATE_XCOMP_BV_COMPACT; + pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; + pt_pcpu[cpu_id].ctx = pt_ctx; + pt_cpu_set_state(cpu_id, PT_STOPPED); + + return (0); +} + +/* + * hwt backend trace start operation. CPU affine. + */ +static void +pt_backend_enable(struct hwt_context *ctx, int cpu_id) +{ + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to start PT on another cpu", __func__)); + pt_cpu_start(NULL); + CPU_SET(cpu_id, &ctx->cpu_map); +} + +/* + * hwt backend trace stop operation. CPU affine. 
+ */ +static void +pt_backend_disable(struct hwt_context *ctx, int cpu_id) +{ + struct pt_cpu *cpu; + + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to disable PT on another cpu", __func__)); + pt_cpu_stop(NULL); + CPU_CLR(cpu_id, &ctx->cpu_map); + cpu = &pt_pcpu[cpu_id]; + cpu->ctx = NULL; +} + +/* + * hwt backend trace start operation for remote CPUs. + * + * In CPU mode, returns -1 if the previous value of 'cpu_mode_ctr' + * was already non-zero; otherwise starts PT on every CPU in the + * context's CPU map and returns 0. + */ +static int +pt_backend_enable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 1) != 0) + return (-1); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); + + return (0); +} + +/* + * hwt backend trace stop operation for remote CPUs. + * + * In CPU mode, returns -1 if the previous value of 'cpu_mode_ctr' + * was already zero. Also returns -1 if the context's CPU map is + * empty; otherwise stops PT on every CPU in the map and returns 0. + */ +static int +pt_backend_disable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 0) == 0) + return (-1); + + if (CPU_EMPTY(&ctx->cpu_map)) { + dprintf("%s: empty cpu map\n", __func__); + return (-1); + } + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); + + return (0); +} + +/* + * HWT backend initialization method. + * + * Initializes the tracing contexts used for HWT_MODE_CPU, one per + * CPU listed in the hwt context. Nothing is done for thread mode. + * The ToPA interrupt handler is installed at module load by pt_init(). + */ +static int +pt_backend_init(struct hwt_context *ctx) +{ + struct hwt_cpu *hwt_cpu; + int error; + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], + hwt_cpu->vm, hwt_cpu->cpu_id); + if (error) + return (error); + } + } + + return (0); +} + +/* + * HWT backend teardown method. + * + * Stops tracing on all active CPUs and releases all previously + * allocated ToPA metadata. The ToPA interrupt handler is removed + * at module unload by pt_deinit(). 
+ */ +static int +pt_backend_deinit(struct hwt_context *ctx) +{ + struct pt_ctx *pt_ctx; + struct hwt_thread *thr; + int cpu_id; + + dprintf("%s\n", __func__); + + pt_backend_disable_smp(ctx); + if (ctx->mode == HWT_MODE_THREAD) { + TAILQ_FOREACH(thr, &ctx->threads, next) { + KASSERT(thr->private != NULL, + ("%s: thr->private not set", __func__)); + pt_ctx = (struct pt_ctx *)thr->private; + pt_deinit_ctx(pt_ctx); + } + } else { + CPU_FOREACH(cpu_id) { + if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + continue; + if (pt_pcpu[cpu_id].ctx != NULL) { + KASSERT(pt_pcpu[cpu_id].ctx == + &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_pcpu[cpu_id].ctx = NULL; + } + pt_ctx = &pt_pcpu_ctx[cpu_id]; + pt_deinit_ctx(pt_ctx); + memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + } + } + + return (0); +} + +/* + * Fetches current offset into the tracing buffer. + */ +static int +pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, + uint64_t *data) +{ + struct pt_buffer *buf; + + if (vm->ctx->mode == HWT_MODE_THREAD) + buf = &((struct pt_ctx *)vm->thr->private)->buf; + else + buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; + mtx_lock_spin(&buf->lock); + *curpage = buf->curpage; + *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); + mtx_unlock_spin(&buf->lock); + + return (0); +} + +/* + * HWT thread creation hook. + * Allocates and associates a 'struct pt_ctx' for a given hwt thread. + */ +static int +pt_backend_alloc_thread(struct hwt_thread *thr) +{ + struct pt_ctx *pt_ctx; + int error; + + /* Omit M_WAITOK since this might get invoked a non-sleepable context */ + pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx == NULL) + return (ENOMEM); + + error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id); + if (error) + return (error); + + thr->private = pt_ctx; + return (0); +} +/* + * HWT thread teardown hook. 
+ */ +static void +pt_backend_free_thread(struct hwt_thread *thr) +{ + struct pt_ctx *ctx; + + ctx = (struct pt_ctx *)thr->private; + + pt_deinit_ctx(ctx); + free(ctx, M_PT); +} + +static void +pt_backend_dump(int cpu_id) +{ +} + +static struct hwt_backend_ops pt_ops = { + .hwt_backend_init = pt_backend_init, + .hwt_backend_deinit = pt_backend_deinit, + + .hwt_backend_configure = pt_backend_configure, + + .hwt_backend_enable = pt_backend_enable, + .hwt_backend_disable = pt_backend_disable, + +#ifdef SMP + .hwt_backend_enable_smp = pt_backend_enable_smp, + .hwt_backend_disable_smp = pt_backend_disable_smp, +#endif + + .hwt_backend_read = pt_backend_read, + .hwt_backend_dump = pt_backend_dump, + + .hwt_backend_thread_alloc = pt_backend_alloc_thread, + .hwt_backend_thread_free = pt_backend_free_thread, +}; + +static struct hwt_backend backend = { + .ops = &pt_ops, + .name = "pt", + .kva_req = 1, +}; + +/* + * Reads the latest valid trace buffer offset and enqueues + * a HWT_RECORD_BUFFER record. + * Used as a taskqueue routine from the ToPA interrupt handler. + */ +static void +pt_send_buffer_record(void *arg, int pending __unused) +{ + struct hwt_record_entry record; + struct pt_ctx *ctx = (struct pt_ctx *)arg; + + /* Prepare buffer record. */ + mtx_lock_spin(&ctx->buf.lock); + pt_fill_buffer_record(ctx->id, &ctx->buf, &record); + mtx_unlock_spin(&ctx->buf.lock); + hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); +} +static void +pt_topa_status_clear(void) +{ + uint64_t reg; + + reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET); + reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI; + reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI; + wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg); +} + +/* + * ToPA PMI handler. + * + * Invoked every time a ToPA entry marked with TOPA_INT is filled. + * Uses taskqueue to enqueue a buffer record for userspace. + * Re-enables the PC interrupt line as long as tracing is active. 
+ */ +static int +pt_topa_intr(struct trapframe *tf) +{ + struct pt_buffer *buf; + struct pt_ctx *ctx; + uint64_t reg; + + SDT_PROBE0(pt, , , topa__intr); + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { + return (0); + } + reg = rdmsr(MSR_IA_GLOBAL_STATUS); + if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { + /* ACK spurious or leftover interrupt. */ + pt_topa_status_clear(); + return (1); + } + + ctx = pt_pcpu[curcpu].ctx; + buf = &ctx->buf; + KASSERT(buf->topa_hw != NULL, + ("%s: ToPA PMI interrupt with invalid buffer", __func__)); + + pt_cpu_toggle_local(ctx->save_area, false); + pt_update_buffer(buf); + pt_topa_status_clear(); + taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, + TASKQUEUE_FAIL_IF_PENDING); + + if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + pt_cpu_toggle_local(ctx->save_area, true); + lapic_reenable_pcint(); + } + return (1); +} + +/* + * Module initialization. + * + * Saves all PT-related cpuid info, registers itself as a HWT backend, + * and allocates metadata required to keep track of tracing operations + * on each CPU. 
+ */ +static int +pt_init(void) +{ + u_int cp[4]; + int error; + + dprintf("pt: Enumerating part 1\n"); + cpuid_count(CPUID_PT_LEAF, 0, cp); + dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + dprintf("pt: ecx %x\n", cp[2]); + + pt_info.l0_eax = cp[0]; + pt_info.l0_ebx = cp[1]; + pt_info.l0_ecx = cp[2]; + + dprintf("pt: Enumerating part 2\n"); + cpuid_count(CPUID_PT_LEAF, 1, cp); + dprintf("pt: eax %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + + pt_info.l1_eax = cp[0]; + pt_info.l1_ebx = cp[1]; + + error = hwt_backend_register(&backend); + if (error != 0) { + printf("pt: unable to register hwt backend, error %d\n", error); + return (error); + } + pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT, + M_ZERO | M_WAITOK); + pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, + M_ZERO | M_WAITOK); + + nmi_register_handler(pt_topa_intr); + if (!lapic_enable_pcint()) { + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; + printf("pt: failed to setup interrupt line\n"); + return (error); + } + initialized = true; + + return (0); +} + +/* + * Checks whether the CPU support Intel PT and + * initializes XSAVE area info. + * + * The driver relies on XSAVE/XRSTOR PT extensions, + * Table of Physical Addresses (ToPA) support, and + * support for multiple ToPA entries. 
+ */ +static bool +pt_supported(void) +{ + u_int cp[4]; + + if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) { + printf("pt: CPU does not support Intel Processor Trace\n"); + return (false); + } + if ((cpu_feature2 & CPUID2_XSAVE) == 0) { + printf("pt: XSAVE is not supported\n"); + return (false); + } + if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) { + printf("pt: CPU does not support managing PT state using XSAVE\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) { + printf("pt: XSAVE compaction is not supported\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) { + printf("pt: CPU does not support XSAVES/XRSTORS\n"); + return (false); + } + + /* Require ToPA support. */ + cpuid_count(CPUID_PT_LEAF, 0, cp); + if ((cp[2] & CPUPT_TOPA) == 0) { + printf("pt: ToPA is not supported\n"); + return (false); + } + if ((cp[2] & CPUPT_TOPA_MULTI) == 0) { + printf("pt: multiple ToPA outputs are not supported\n"); + return (false); + } + + pt_info.xstate_hdr_offset = xsave_area_hdr_offset(); + pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true); + pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV, + XFEATURE_ENABLED_PT, true, true); + + return (true); +} + +/* + * Module teardown. + * + * Undoes pt_init(): removes the ToPA NMI handler, disables the + * interrupt line, unregisters the HWT backend and frees the per-CPU + * metadata. Does nothing if pt_init() did not complete. + */ +static void +pt_deinit(void) +{ + if (!initialized) + return; + nmi_remove_handler(pt_topa_intr); + lapic_disable_pcint(); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + /* Clear both pointers so neither is left dangling after the free. */ + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; + initialized = false; +} + +/* + * Module event handler: load fails with ENXIO when PT is unsupported + * or pt_init() fails; unload tears the driver down. + */ +static int +pt_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + if (!pt_supported() || pt_init() != 0) { + return (ENXIO); + } + break; + case MOD_UNLOAD: + pt_deinit(); + break; + default: + break; + } + + return (0); +} + +static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL }; + +DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); +MODULE_DEPEND(intel_pt, hwt, 1, 1, 1); +MODULE_VERSION(intel_pt, 1); diff 
--git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h new file mode 100644 index 000000000000..2423afdf22e9 --- /dev/null +++ b/sys/amd64/pt/pt.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_PT_PT_H_ +#define _AMD64_PT_PT_H_ + +#include <sys/types.h> + +#include <x86/include/specialreg.h> + +#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 
3C, 33-29 */ + +struct pt_cpu_config { + uint64_t rtit_ctl; + register_t cr3_filter; + int nranges; + struct ipf_range { + vm_offset_t start; + vm_offset_t end; + } ip_ranges[PT_IP_FILTER_MAX_RANGES]; + uint32_t mtc_freq; + uint32_t cyc_thresh; + uint32_t psb_freq; +}; +#endif /* !_AMD64_PT_PT_H_ */ diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index f393f160b101..130130b64541 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -32,12 +32,6 @@ #include "vmx_assym.h" -#ifdef SMP -#define LK lock ; -#else -#define LK -#endif - /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp |