author     Konstantin Belousov <kib@FreeBSD.org>   2020-08-23 20:19:04 +0000
committer  Konstantin Belousov <kib@FreeBSD.org>   2020-08-23 20:19:04 +0000
commit     9ce875d9b59dde81bf116d24e6b8649075674303 (patch)
tree       8f51fc54f9c18becf6cdf42fe0284e9767d9ecfa
parent     4ba405dcdbf0abf9e142fc0b9c3c866359bf4c57 (diff)
-rw-r--r--   sys/amd64/amd64/elf_machdep.c             | 124
-rw-r--r--   sys/amd64/amd64/genassym.c                |   9
-rw-r--r--   sys/amd64/amd64/locore.S                  |  63
-rw-r--r--   sys/amd64/amd64/mp_machdep.c              |  40
-rw-r--r--   sys/amd64/amd64/mpboot.S                  |  27
-rw-r--r--   sys/amd64/amd64/pmap.c                    | 787
-rw-r--r--   sys/amd64/include/md_var.h                |   2
-rw-r--r--   sys/amd64/include/param.h                 |   6
-rw-r--r--   sys/amd64/include/pmap.h                  |  72
-rw-r--r--   sys/amd64/include/proc.h                  |   2
-rw-r--r--   sys/amd64/include/vmparam.h               |  31
-rw-r--r--   sys/amd64/linux/linux_sysvec.c            |   8
-rw-r--r--   sys/amd64/vmm/amd/svm.c                   |   2
-rw-r--r--   sys/amd64/vmm/intel/vmx.c                 |   2
-rw-r--r--   sys/cddl/dev/dtrace/amd64/dtrace_subr.c   |   3
15 files changed, 951 insertions, 227 deletions
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 1ab28676ce7de..3182d1758b18c 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$"); #include <machine/fpu.h> #include <machine/md_var.h> -struct sysentvec elf64_freebsd_sysvec = { +struct sysentvec elf64_freebsd_sysvec_la48 = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_errsize = 0, @@ -64,9 +64,9 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), .sv_copyout_strings = exec_copyout_strings, @@ -78,14 +78,64 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_stackgap = elf64_stackgap, }; -INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); + +struct sysentvec elf64_freebsd_sysvec_la57 = { + .sv_size = SYS_MAXSYSCALL, + .sv_table = sysent, + .sv_errsize = 0, + .sv_errtbl = NULL, + .sv_transtrap = NULL, + .sv_fixup = __elfN(freebsd_fixup), + .sv_sendsig = sendsig, + .sv_sigcode = sigcode, + .sv_szsigcode = &szsigcode, + .sv_name = "FreeBSD ELF64", + .sv_coredump = __elfN(coredump), + .sv_imgact_try = NULL, + .sv_minsigstksz = MINSIGSTKSZ, + .sv_minuser = VM_MIN_ADDRESS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA57, + .sv_usrstack = USRSTACK_LA57, + .sv_psstrings = PS_STRINGS_LA57, + .sv_stackprot = VM_PROT_ALL, + .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), + .sv_copyout_strings = exec_copyout_strings, + .sv_setregs = exec_setregs, + .sv_fixlimit = NULL, + .sv_maxssiz = NULL, + .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP | + SV_TIMEKEEP, + .sv_set_syscall_retval = cpu_set_syscall_retval, + .sv_fetch_syscall_args = cpu_fetch_syscall_args, + .sv_syscallnames = syscallnames, + .sv_shared_page_base = SHAREDPAGE_LA57, + .sv_shared_page_len = PAGE_SIZE, + .sv_schedtail = NULL, + .sv_thread_detach = NULL, + .sv_trap = NULL, + .sv_stackgap = elf64_stackgap, +}; + +static void +amd64_init_sysvecs(void *arg) +{ + amd64_lower_shared_page(&elf64_freebsd_sysvec_la48); + if (la57) { + exec_sysvec_init(&elf64_freebsd_sysvec_la57); + exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57, + &elf64_freebsd_sysvec_la48); + } else { + exec_sysvec_init(&elf64_freebsd_sysvec_la48); + } +} +SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL); void amd64_lower_shared_page(struct sysentvec *sv) @@ -98,29 +148,57 @@ amd64_lower_shared_page(struct sysentvec *sv) } } -/* - * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter - * uses the value of sv_shared_page_base. 
- */ -SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) amd64_lower_shared_page, - &elf64_freebsd_sysvec); +static boolean_t +freebsd_brand_info_la57_img_compat(struct image_params *imgp, + int32_t *osrel __unused, uint32_t *fctl0) +{ + if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0) + return (TRUE); + if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0) + return (FALSE); + if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0) + return (FALSE); + return (TRUE); +} -static Elf64_Brandinfo freebsd_brand_info = { +static Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, +}; + +static Elf64_Brandinfo freebsd_brand_info_la57 = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/libexec/ld-elf.so.1", + .sysvec = &elf64_freebsd_sysvec_la57, + .interp_newpath = NULL, + .brand_note = &elf64_freebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, + .header_supported = freebsd_brand_info_la57_img_compat, }; +static void +sysinit_register_elf64_brand_entries(void *arg __unused) +{ + /* + * _57 must go first so it can either claim the image or hand + * it to _48. + */ + if (la57) + elf64_insert_brand_entry(&freebsd_brand_info_la57); + elf64_insert_brand_entry(&freebsd_brand_info_la48); +} SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_info); + sysinit_register_elf64_brand_entries, NULL); static Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, @@ -128,15 +206,14 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_oinfo); + (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); static Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, @@ -144,15 +221,14 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &kfreebsd_brand_info); + (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void elf64_dump_thread(struct thread *td, void *dst, size_t *off) diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index ec3707ce41f95..75500555105a4 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -99,11 +99,10 @@ ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); -ASSYM(addr_PTmap, addr_PTmap); -ASSYM(addr_PDmap, addr_PDmap); -ASSYM(addr_PDPmap, addr_PDPmap); -ASSYM(addr_PML4map, addr_PML4map); 
-ASSYM(addr_PML4pml4e, addr_PML4pml4e); +ASSYM(addr_P4Tmap, addr_P4Tmap); +ASSYM(addr_P4Dmap, addr_P4Dmap); +ASSYM(addr_P5Tmap, addr_P5Tmap); +ASSYM(addr_P5Dmap, addr_P5Dmap); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index d070c10693b70..a9a7b5f3972c5 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -36,13 +36,8 @@ /* * Compiled KERNBASE location */ - .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend + .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend .set kernbase,KERNBASE - .set loc_PTmap,addr_PTmap - .set loc_PDmap,addr_PDmap - .set loc_PDPmap,addr_PDPmap - .set loc_PML4map,addr_PML4map - .set loc_PML4pml4e,addr_PML4pml4e .set dmapbase,DMAP_MIN_ADDRESS .set dmapend,DMAP_MAX_ADDRESS @@ -82,6 +77,62 @@ NON_GPROF_ENTRY(btext) 0: hlt jmp 0b +/* la57_trampoline(%rdi pml5) */ +NON_GPROF_ENTRY(la57_trampoline) + movq %rsp,%r11 + movq %rbx,%r10 + leaq la57_trampoline_end(%rip),%rsp + + movq %cr0,%rdx + lgdtq la57_trampoline_gdt_desc(%rip) + + pushq $(2<<3) + leaq l1(%rip),%rax + leaq l2(%rip),%rbx + + pushq %rax + lretq + .code32 + +l1: movl $(3<<3),%eax + movl %eax,%ss + + movl %edx,%eax + andl $~CR0_PG,%eax + movl %eax,%cr0 + + movl %cr4,%eax + orl $CR4_LA57,%eax + movl %eax,%cr4 + + movl %edi,%cr3 + movl %edx,%cr0 + + pushl $(1<<3) + pushl %ebx + lretl + .code64 + +l2: movq %r11,%rsp + movq %r10,%rbx + retq + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt_desc) + .word la57_trampoline_end - la57_trampoline_gdt + .long 0 /* filled by pmap_bootstrap_la57 */ + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt) + .long 0x00000000 /* null desc */ + .long 0x00000000 + .long 0x00000000 /* 64bit code */ + .long 0x00209800 + .long 0x0000ffff /* 32bit code */ + .long 0x00cf9b00 + .long 0x0000ffff /* universal data */ + .long 0x00cf9300 + .dcb.l 16,0 +NON_GPROF_ENTRY(la57_trampoline_end) + .bss ALIGN_DATA /* just to be sure */ .globl bootstack diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index d46362ba9f9c0..844cb49b536cd 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -96,7 +96,7 @@ __FBSDID("$FreeBSD$"); #define GiB(v) (v ## ULL << 30) -#define AP_BOOTPT_SZ (PAGE_SIZE * 3) +#define AP_BOOTPT_SZ (PAGE_SIZE * 4) /* Temporary variables for init_secondary() */ char *doublefault_stack; @@ -104,6 +104,8 @@ char *mce_stack; char *nmi_stack; char *dbg_stack; +extern u_int mptramp_la57; + /* * Local data and functions. 
*/ @@ -240,6 +242,8 @@ cpu_mp_start(void) assign_cpu_ids(); + mptramp_la57 = la57; + /* Start each Application Processor */ init_ops.start_all_aps(); @@ -395,9 +399,9 @@ mp_realloc_pcpu(int cpuid, int domain) int native_start_all_aps(void) { - u_int64_t *pt4, *pt3, *pt2; + u_int64_t *pt5, *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i; + int apic_id, cpu, domain, i, xo; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); @@ -406,18 +410,38 @@ native_start_all_aps(void) bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); /* Locate the page tables, they'll be below the trampoline */ - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + if (la57) { + pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + xo = 1; + } else { + xo = 0; + } + pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { - /* Each slot of the level 4 pages points to the same level 3 page */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); + if (la57) { + pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + PAGE_SIZE); + pt5[i] |= PG_V | PG_RW | PG_U; + } + + /* + * Each slot of the level 4 pages points to the same + * level 3 page. + */ + pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + (xo + 1) * PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; - /* Each slot of the level 3 pages points to the same level 2 page */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); + /* + * Each slot of the level 3 pages points to the same + * level 2 page. + */ + pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + ((xo + 2) * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S index 5545fe9290d14..fb75d2b884400 100644 --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -90,10 +90,16 @@ protmode: mov $bootdata-gdt, %eax mov %ax, %ds - /* Turn on the PAE bit for when paging is enabled */ + /* + * Turn on the PAE bit and optionally the LA57 bit for when paging + * is later enabled. + */ mov %cr4, %eax orl $CR4_PAE, %eax - mov %eax, %cr4 + cmpb $0, mptramp_la57-mptramp_start(%ebx) + je 1f + orl $CR4_LA57, %eax +1: mov %eax, %cr4 /* * Enable EFER.LME so that we get long mode when all the prereqs are @@ -132,9 +138,9 @@ protmode: /* * At this point paging is enabled, and we are in "compatibility" mode. * We do another far jump to reload %cs with the 64 bit selector. - * %cr3 points to a 4-level page table page. + * %cr3 points to a 4- or 5-level page table. * We cannot yet jump all the way to the kernel because we can only - * specify a 32 bit linear address. So, yet another trampoline. + * specify a 32 bit linear address. So, we use yet another trampoline. * * The following instruction is: * ljmp $kernelcode-gdt, $tramp_64-mptramp_start @@ -209,6 +215,11 @@ gdtend: mptramp_pagetables: .long 0 + /* 5-level paging ? */ + .globl mptramp_la57 +mptramp_la57: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -251,8 +262,12 @@ entry_64: * Load a real %cr3 that has all the direct map stuff and switches * off the 1GB replicated mirror. Load a stack pointer and jump * into AP startup code in C. 
- */ + */ + cmpl $0, la57 + jne 2f movq KPML4phys, %rax - movq %rax, %cr3 + jmp 3f +2: movq KPML5phys, %rax +3: movq %rax, %cr3 movq bootSTK, %rsp jmp init_secondary diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d025beff45186..4b17debd480d3 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -398,6 +398,19 @@ static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); +int __read_frequently la57 = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &la57, 0, + "5-level paging for host is enabled"); + +static bool +pmap_is_la57(pmap_t pmap) +{ + if (pmap->pm_type == PT_X86) + return (la57); + return (false); /* XXXKIB handle EPT */ +} + #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ @@ -405,7 +418,10 @@ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ +u_int64_t KPML5phys; /* phys addr of kernel level 5, + if supported */ +static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ @@ -1257,7 +1273,7 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, - struct rwlock **lockp); + struct rwlock **lockp, vm_offset_t va); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, @@ -1271,20 +1287,85 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /* Inline functions */ /********************/ -/* Return a non-clipped PD index for a given VA */ +/* + * Return a non-clipped indexes for a given VA, which are page table + * pages indexes at the corresponding level. 
+ */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } +static __inline vm_pindex_t +pmap_pdpe_pindex(vm_offset_t va) +{ + return (NUPDE + (va >> PDPSHIFT)); +} + +static __inline vm_pindex_t +pmap_pml4e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + (va >> PML4SHIFT)); +} + +static __inline vm_pindex_t +pmap_pml5e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); +} + +static __inline pml4_entry_t * +pmap_pml5e(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_u(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) +{ + pml4_entry_t *pml4e; + + /* XXX MPASS(pmap_is_la57(pmap); */ + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + return (&pml4e[pmap_pml4e_index(va)]); +} /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { + pml5_entry_t *pml5e; + pml4_entry_t *pml4e; + pt_entry_t PG_V; - return (&pmap->pm_pml4[pmap_pml4e_index(va)]); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, va); + PG_V = pmap_valid_bit(pmap); + if ((*pml5e & PG_V) == 0) + return (NULL); + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + } else { + pml4e = pmap->pm_pmltop; + } + return (&pml4e[pmap_pml4e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml4e_u(pmap_t pmap, vm_offset_t va) +{ + MPASS(!pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ @@ -1306,7 +1387,7 @@ pmap_pdpe(pmap_t pmap, vm_offset_t va) PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); - if ((*pml4e & PG_V) == 0) + if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } @@ -1387,21 +1468,37 @@ pmap_resident_count_dec(pmap_t pmap, int count) PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); - return (PTmap + ((va >> PAGE_SHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); + } else { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); + } } static __inline pd_entry_t * vtopde(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); - return (PDmap + ((va >> PDRSHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Dmap + ((va >> PDRSHIFT) & mask)); + } else { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Dmap + ((va >> PDRSHIFT) & mask)); + } } static u_int64_t @@ -1658,6 +1755,8 @@ create_pagetables(vm_paddr_t *firstaddr) p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } + + kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* @@ -1730,7 +1829,7 @@ pmap_bootstrap(vm_paddr_t 
*firstaddr) * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); + kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ @@ -1891,6 +1990,148 @@ pmap_init_pat(void) load_cr4(cr4); } +extern const char la57_trampoline[], la57_trampoline_gdt_desc[], + la57_trampoline_gdt[], la57_trampoline_end[]; + +static void +pmap_bootstrap_la57(void *arg __unused) +{ + char *v_code; + pml5_entry_t *v_pml5; + pml4_entry_t *v_pml4; + pdp_entry_t *v_pdp; + pd_entry_t *v_pd; + pt_entry_t *v_pt; + vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; + void (*la57_tramp)(uint64_t pml5); + struct region_descriptor r_gdt; + + if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) + return; + if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57)) + la57 = 1; + if (!la57) + return; + + r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; + r_gdt.rd_base = (long)__pcpu[0].pc_gdt; + + m_code = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_code->flags & PG_ZERO) == 0) + pmap_zero_page(m_code); + v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); + m_pml5 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml5->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml5); + KPML5phys = VM_PAGE_TO_PHYS(m_pml5); + v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); + m_pml4 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml4->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml4); + v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); + m_pdp = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pdp->flags & PG_ZERO) == 0) + pmap_zero_page(m_pdp); + v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); + m_pd = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pd->flags & PG_ZERO) == 0) + pmap_zero_page(m_pd); + v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); + m_pt = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pt->flags & PG_ZERO) == 0) + pmap_zero_page(m_pt); + v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); + + /* + * Map m_code 1:1, it appears below 4G in KVA due to physical + * address being below 4G. Since kernel KVA is in upper half, + * the pml4e should be zero and free for temporary use. 
+ */ + kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; + + /* + * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. + */ + v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Copy and call the 48->57 trampoline, hope we return there, alive. + */ + bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); + *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = + la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); + la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); + la57_tramp(KPML5phys); + + /* + * gdt was necessary reset, switch back to our gdt. + */ + lgdt(&r_gdt); + wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); + load_ds(_udatasel); + load_es(_udatasel); + load_fs(_ufssel); + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + + /* + * Now unmap the trampoline, and free the pages. + * Clear pml5 entry used for 1:1 trampoline mapping. + */ + pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); + invlpg((vm_offset_t)v_code); + vm_page_free(m_code); + vm_page_free(m_pdp); + vm_page_free(m_pd); + vm_page_free(m_pt); + + /* + * Recursively map PML5 to itself in order to get PTmap and + * PDmap. + */ + v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; + + kernel_pmap->pm_cr3 = KPML5phys; + kernel_pmap->pm_pmltop = v_pml5; +} +SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); + /* * Initialize a vm_page's machine-dependent fields. 
*/ @@ -2190,7 +2431,8 @@ pmap_init(void) } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | + /* XXXKIB la57 */ + kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } @@ -3566,44 +3808,57 @@ pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { + pml5_entry_t *pml5; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* * unmap the page table page */ - if (m->pindex >= NUPDE + NUPDPE) { + if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { + /* PML4 page */ + MPASS(pmap_is_la57(pmap)); + pml5 = pmap_pml5e(pmap, va); + *pml5 = 0; + if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { + pml5 = pmap_pml5e_u(pmap, va); + *pml5 = 0; + } + } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ - pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; - if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { - pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + va <= VM_MAXUSER_ADDRESS) { + pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ - pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ - pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ - vm_page_t pdpg; - pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ - vm_page_t pdppg; - pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); + } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { + /* We just released a PDP, unhold the matching PML4 */ + pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pml4pg, free); } /* @@ -3659,9 +3914,9 @@ pmap_pinit0(pmap_t pmap) int i; PMAP_LOCK_INIT(pmap); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - pmap->pm_pml4u = NULL; - pmap->pm_cr3 = KPML4phys; + pmap->pm_pmltop = kernel_pmap->pm_pmltop; + pmap->pm_pmltopu = NULL; + pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; @@ -3714,18 +3969,59 @@ pmap_pinit_pml4(vm_page_t pml4pg) /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) - pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; +} + +void +pmap_pinit_pml5(vm_page_t pml5pg) +{ + pml5_entry_t *pm_pml5; + + pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + + /* + * Install self-referential address mapping entry. 
+ */ + pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | + X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } static void -pmap_pinit_pml4_pti(vm_page_t pml4pg) +pmap_pinit_pml4_pti(vm_page_t pml4pgu) { - pml4_entry_t *pm_pml4; + pml4_entry_t *pm_pml4u; int i; - pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) - pm_pml4[i] = pti_pml4[i]; + pm_pml4u[i] = pti_pml4[i]; +} + +static void +pmap_pinit_pml5_pti(vm_page_t pml5pgu) +{ + pml5_entry_t *pm_pml5u; + + pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 pti + * table, entering all kernel mappings needed for usermode + * into level 5 table. + */ + pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = + pmap_kextract((vm_offset_t)pti_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } /* @@ -3735,29 +4031,30 @@ pmap_pinit_pml4_pti(vm_page_t pml4pg) int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg, pml4pgu; - vm_paddr_t pml4phys; + vm_page_t pmltop_pg, pmltop_pgu; + vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ - pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); - pml4phys = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); + pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); + CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; - pmap->pm_pml4u = NULL; + pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; - if ((pml4pg->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pml4); + if ((pmltop_pg->flags & PG_ZERO) == 0) + pagezero(pmap->pm_pmltop); /* * Do not install the host kernel mappings in the nested page @@ -3766,15 +4063,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) * Install minimal kernel mappings in PTI case. 
*/ if (pm_type == PT_X86) { - pmap->pm_cr3 = pml4phys; - pmap_pinit_pml4(pml4pg); + pmap->pm_cr3 = pmltop_phys; + if (pmap_is_la57(pmap)) + pmap_pinit_pml5(pmltop_pg); + else + pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { - pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); - pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( - VM_PAGE_TO_PHYS(pml4pgu)); - pmap_pinit_pml4_pti(pml4pgu); - pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); + pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(pmltop_pgu)); + if (pmap_is_la57(pmap)) + pmap_pinit_pml5_pti(pmltop_pgu); + else + pmap_pinit_pml4_pti(pmltop_pgu); + pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, @@ -3799,14 +4102,88 @@ pmap_pinit(pmap_t pmap) return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } +static pml4_entry_t * +pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_pindex_t pml5index; + pml5_entry_t *pml5; + pml4_entry_t *pml4; + vm_page_t pml4pg; + pt_entry_t PG_V; + bool allocated; + + if (!pmap_is_la57(pmap)) + return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); + + PG_V = pmap_valid_bit(pmap); + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + if ((*pml5 & PG_V) == 0) { + if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); + pml4 = &pml4[pmap_pml4e_index(va)]; + if ((*pml4 & PG_V) == 0) { + pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); + if (allocated && !addref) + pml4pg->ref_count--; + else if (!allocated && addref) + pml4pg->ref_count++; + } + return (pml4); +} + +static pdp_entry_t * +pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_page_t pdppg; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pt_entry_t PG_V; + bool allocated; + + PG_V = pmap_valid_bit(pmap); + + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); + if (pml4 == NULL) + return (NULL); + + if ((*pml4 & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); + pdp = &pdp[pmap_pdpe_index(va)]; + if ((*pdp & PG_V) == 0) { + pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); + if (allocated && !addref) + pdppg->ref_count--; + else if (!allocated && addref) + pdppg->ref_count++; + } + return (pdp); +} + /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * - * Note: If a page allocation fails at page table level two or three, - * one or two pages may be held during the wait, only to be released + * Note: If a page allocation fails at page table level two, three, or four, + * up to three pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. * @@ -3823,20 +4200,35 @@ pmap_pinit(pmap_t pmap) * - for the page directory pointer page, * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + * NPML4EPGSHIFT), - * i.e. 
index of pml4e is put after the last index of PDPE. + * i.e. index of pml4e is put after the last index of PDPE, + * - for the PML4 page (if LA57 mode is enabled), + * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> + * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), + * i.e. index of pml5e is put after the last index of PML4E. * * Define an order on the paging entries, where all entries of the * same height are put together, then heights are put from deepest to * root. Then ptexpindex is the sequential number of the * corresponding paging entry in this order. * - * The root page at PML4 does not participate in this indexing scheme, since - * it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). + * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of + * LA57 paging structures even in LA48 paging mode. Moreover, the + * ptepindexes are calculated as if the paging structures were 5-level + * regardless of the actual mode of operation. + * + * The root page at PML4/PML5 does not participate in this indexing scheme, + * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). */ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, + vm_offset_t va __unused) { - vm_page_t m, pdppg, pdpg; + vm_pindex_t pml5index, pml4index; + pml5_entry_t *pml5, *pml5u; + pml4_entry_t *pml4, *pml4u; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t m, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -3872,16 +4264,38 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) * Map the pagetable page into the process address space, if * it isn't already there. 
*/ + if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { + MPASS(pmap_is_la57(pmap)); + + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + KASSERT((*pml5 & PG_V) == 0, + ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); + *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (ptepindex >= (NUPDE + NUPDPE)) { - pml4_entry_t *pml4, *pml4u; - vm_pindex_t pml4index; + if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) + *pml5 |= pg_nx; + pml5u = &pmap->pm_pmltopu[pml5index]; + *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | + PG_A | PG_M; + } + } else if (ptepindex >= NUPDE + NUPDPE) { + pml4index = pmap_pml4e_index(va); /* Wire up a new PDPE page */ - pml4index = ptepindex - (NUPDE + NUPDPE); - pml4 = &pmap->pm_pml4[pml4index]; + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); + if (pml4 == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + KASSERT((*pml4 & PG_V) == 0, + ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { + + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that @@ -3892,85 +4306,48 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; - pml4u = &pmap->pm_pml4u[pml4index]; + pml4u = &pmap->pm_pmltopu[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } - } else if (ptepindex >= NUPDE) { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - /* Wire up a new PDE page */ - pdpindex = ptepindex - NUPDE; - pml4index = pdpindex >> NPML4EPGSHIFT; - - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - /* Have to allocate a new pdp, recurse */ - if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to pdp page */ - pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - pdppg->ref_count++; + pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - - /* Now find the pdp page */ - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; + KASSERT((*pdp & PG_V) == 0, + ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - } else { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - pd_entry_t *pd; - /* Wire up a new PTE page */ - pdpindex = ptepindex >> NPDPEPGSHIFT; - pml4index = pdpindex >> NPML4EPGSHIFT; - - /* First, find the pdp and check that its valid. 
*/ - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { + pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { + if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va), + lockp, va) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; - if ((*pdp & PG_V) == 0) { - /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to the pd page */ - pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); - pdpg->ref_count++; - } + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); + pdpg->ref_count++; } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ - pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; + pd = &pd[pmap_pde_index(va)]; + KASSERT((*pd & PG_V) == 0, + ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } @@ -4003,7 +4380,7 @@ retry: } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. */ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va); if (pdpg == NULL) { if (lockp != NULL) goto retry; @@ -4064,7 +4441,7 @@ retry: * Here if the pte page isn't mapped, or if it has been * deallocated. 
*/ - m = _pmap_allocpte(pmap, ptepindex, lockp); + m = _pmap_allocpte(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) goto retry; } @@ -4088,28 +4465,35 @@ pmap_release(pmap_t pmap) int i; KASSERT(pmap->pm_stats.resident_count == 0, - ("pmap_release: pmap resident count %ld != 0", - pmap->pm_stats.resident_count)); + ("pmap_release: pmap %p resident count %ld != 0", + pmap, pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), - ("pmap_release: pmap has reserved page table page(s)")); + ("pmap_release: pmap %p has reserved page table page(s)", + pmap)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); - for (i = 0; i < NKPML4E; i++) /* KVA */ - pmap->pm_pml4[KPML4BASE + i] = 0; - for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ - pmap->pm_pml4[DMPML4I + i] = 0; - pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ - for (i = 0; i < lm_ents; i++) /* Large Map */ - pmap->pm_pml4[LMSPML4I + i] = 0; + if (pmap_is_la57(pmap)) { + pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; + pmap->pm_pmltop[PML5PML5I] = 0; + } else { + for (i = 0; i < NKPML4E; i++) /* KVA */ + pmap->pm_pmltop[KPML4BASE + i] = 0; + for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ + pmap->pm_pmltop[DMPML4I + i] = 0; + pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ + for (i = 0; i < lm_ents; i++) /* Large Map */ + pmap->pm_pmltop[LMSPML4I + i] = 0; + } vm_page_unwire_noq(m); vm_page_free_zero(m); - if (pmap->pm_pml4u != NULL) { - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); + if (pmap->pm_pmltopu != NULL) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> + pm_pmltopu)); vm_page_unwire_noq(m); vm_page_free(m); } @@ -5448,6 +5832,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; + pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; @@ -5490,7 +5875,18 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap->pm_stats.resident_count == 0) break; - pml4e = pmap_pml4e(pmap, sva); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, sva); + if ((*pml5e & PG_V) == 0) { + va_next = (sva + NBPML5) & ~PML5MASK; + if (va_next < sva) + va_next = eva; + continue; + } + pml4e = pmap_pml5e_to_pml4e(pml5e, sva); + } else { + pml4e = pmap_pml4e(pmap, sva); + } if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) @@ -6110,7 +6506,7 @@ retry: */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), - nosleep ? NULL : &lock); + nosleep ? NULL : &lock, va); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; @@ -6593,7 +6989,8 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. 
*/ - mpte = _pmap_allocpte(pmap, ptepindex, NULL); + mpte = _pmap_allocpte(pmap, ptepindex, NULL, + va); if (mpte == NULL) return (mpte); } @@ -9346,11 +9743,11 @@ pmap_large_map_pdpe(vm_offset_t va) ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, + KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; + mphys = kernel_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } @@ -10425,7 +10822,9 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ - range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); + range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1); } /* @@ -10519,7 +10918,9 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ - range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); + range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap @@ -10549,7 +10950,7 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sva |= -1ul << 48; restart: - pml4e = kernel_pmap->pm_pml4[i]; + pml4e = kernel_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); @@ -10632,6 +11033,7 @@ SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; + pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; @@ -10650,8 +11052,20 @@ DB_SHOW_COMMAND(pte, pmap_print_pte) pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); - pml4 = pmap_pml4e(pmap, va); - db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4); + db_printf("VA 0x%016lx", va); + + if (pmap_is_la57(pmap)) { + pml5 = pmap_pml5e(pmap, va); + db_printf(" pml5e 0x%016lx", *pml5); + if ((*pml5 & PG_V) == 0) { + db_printf("\n"); + return; + } + pml4 = pmap_pml5e_to_pml4e(pml5, va); + } else { + pml4 = pmap_pml4e(pmap, va); + } + db_printf(" pml4e 0x%016lx", *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; @@ -10683,4 +11097,95 @@ DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) db_printf("show phys2dmap addr\n"); } } + +static void +ptpages_show_page(int level, int idx, vm_page_t pg) +{ + db_printf("l %d i %d pg %p phys %#lx ref %x\n", + level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); +} + +static void +ptpages_show_complain(int level, int idx, uint64_t pte) +{ + db_printf("l %d i %d pte %#lx\n", level, idx, pte); +} + +static void +ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) +{ + vm_page_t pg3, pg2, pg1; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + int i4, i3, i2; + + pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); + for (i4 = 0; i4 < num_entries; i4++) { + if ((pml4[i4] & PG_V) == 0) + continue; + pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); + if (pg3 == NULL) { + ptpages_show_complain(3, i4, pml4[i4]); + continue; + } + ptpages_show_page(3, i4, pg3); + pdp = 
(pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); + for (i3 = 0; i3 < NPDPEPG; i3++) { + if ((pdp[i3] & PG_V) == 0) + continue; + pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); + if (pg3 == NULL) { + ptpages_show_complain(2, i3, pdp[i3]); + continue; + } + ptpages_show_page(2, i3, pg2); + pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); + for (i2 = 0; i2 < NPDEPG; i2++) { + if ((pd[i2] & PG_V) == 0) + continue; + pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); + if (pg1 == NULL) { + ptpages_show_complain(1, i2, pd[i2]); + continue; + } + ptpages_show_page(1, i2, pg1); + } + } + } +} + +DB_SHOW_COMMAND(ptpages, pmap_ptpages) +{ + pmap_t pmap; + vm_page_t pg; + pml5_entry_t *pml5; + uint64_t PG_V; + int i5; + + if (have_addr) + pmap = (pmap_t)addr; + else + pmap = PCPU_GET(curpmap); + + PG_V = pmap_valid_bit(pmap); + + if (pmap_is_la57(pmap)) { + pml5 = pmap->pm_pmltop; + for (i5 = 0; i5 < NUPML5E; i5++) { + if ((pml5[i5] & PG_V) == 0) + continue; + pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); + if (pg == NULL) { + ptpages_show_complain(4, i5, pml5[i5]); + continue; + } + ptpages_show_page(4, i5, pg); + ptpages_show_pml4(pg, NPML4EPG, PG_V); + } + } else { + ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( + (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); + } +} #endif diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 8d866fee846aa..9a550d7024feb 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -46,6 +46,8 @@ extern int syscall_ret_l1d_flush_mode; extern vm_paddr_t intel_graphics_stolen_base; extern vm_paddr_t intel_graphics_stolen_size; +extern int la57; + /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its * value is the physical address at which the kernel is loaded. diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index ac3df693e4d45..2bd4d913a7b1a 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -118,6 +118,12 @@ #define PML4SHIFT 39 /* LOG2(NBPML4) */ #define NBPML4 (1UL<<PML4SHIFT)/* bytes/page map lev4 table */ #define PML4MASK (NBPML4-1) +/* Size of the level 5 page-map level-5 table units */ +#define NPML5EPG (PAGE_SIZE/(sizeof (pml5_entry_t))) +#define NPML5EPGSHIFT 9 /* LOG2(NPML5EPG) */ +#define PML5SHIFT 48 /* LOG2(NBPML5) */ +#define NBPML5 (1UL<<PML5SHIFT)/* bytes/page map lev5 table */ +#define PML5MASK (NBPML5-1) #define MAXPAGESIZES 3 /* maximum number of supported page sizes */ diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index e2d7a714511b3..5cdcca66d9735 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -166,14 +166,22 @@ * Pte related macros. This is complicated by having to deal with * the sign extension of the 48th bit. 
*/ -#define KVADDR(l4, l3, l2, l1) ( \ +#define KV4ADDR(l4, l3, l2, l1) ( \ ((unsigned long)-1 << 47) | \ ((unsigned long)(l4) << PML4SHIFT) | \ ((unsigned long)(l3) << PDPSHIFT) | \ ((unsigned long)(l2) << PDRSHIFT) | \ ((unsigned long)(l1) << PAGE_SHIFT)) +#define KV5ADDR(l5, l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 56) | \ + ((unsigned long)(l5) << PML5SHIFT) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) -#define UVADDR(l4, l3, l2, l1) ( \ +#define UVADDR(l5, l4, l3, l2, l1) ( \ + ((unsigned long)(l5) << PML5SHIFT) | \ ((unsigned long)(l4) << PML4SHIFT) | \ ((unsigned long)(l3) << PDPSHIFT) | \ ((unsigned long)(l2) << PDRSHIFT) | \ @@ -187,9 +195,19 @@ */ #define NKPML4E 4 -#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ -#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ -#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ +/* + * We use the same numbering of the page table pages for 5-level and + * 4-level paging structures. + */ +#define NUPML5E (NPML5EPG / 2) /* number of userland PML5 + pages */ +#define NUPML4E (NUPML5E * NPML4EPG) /* number of userland PML4 + pages */ +#define NUPDPE (NUPML4E * NPDPEPG) /* number of userland PDP + pages */ +#define NUPDE (NUPDPE * NPDEPG) /* number of userland PD + entries */ +#define NUP4ML4E (NPML4EPG / 2) /* * NDMPML4E is the maximum number of PML4 entries that will be @@ -216,7 +234,8 @@ * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, * and KPDPI provides bits 30..38.) */ -#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ +#define PML4PML4I (NPML4EPG / 2) /* Index of recursive pml4 mapping */ +#define PML5PML5I (NPML5EPG / 2) /* Index of recursive pml5 mapping */ #define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ #define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ @@ -258,25 +277,34 @@ typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; typedef u_int64_t pml4_entry_t; +typedef u_int64_t pml5_entry_t; /* * Address of current address space page table maps and directories. 
*/ #ifdef _KERNEL -#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) -#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) -#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) -#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) -#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) -#define PTmap ((pt_entry_t *)(addr_PTmap)) -#define PDmap ((pd_entry_t *)(addr_PDmap)) -#define PDPmap ((pd_entry_t *)(addr_PDPmap)) -#define PML4map ((pd_entry_t *)(addr_PML4map)) -#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) +#define addr_P4Tmap (KV4ADDR(PML4PML4I, 0, 0, 0)) +#define addr_P4Dmap (KV4ADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_P4DPmap (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_P4ML4map (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_P4ML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define P4Tmap ((pt_entry_t *)(addr_P4Tmap)) +#define P4Dmap ((pd_entry_t *)(addr_P4Dmap)) + +#define addr_P5Tmap (KV5ADDR(PML5PML5I, 0, 0, 0, 0)) +#define addr_P5Dmap (KV5ADDR(PML5PML5I, PML5PML5I, 0, 0, 0)) +#define addr_P5DPmap (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, 0, 0)) +#define addr_P5ML4map (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, 0)) +#define addr_P5ML5map \ + (KVADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I)) +#define addr_P5ML5pml5e (addr_P5ML5map + (PML5PML5I * sizeof(pml5_entry_t))) +#define P5Tmap ((pt_entry_t *)(addr_P5Tmap)) +#define P5Dmap ((pd_entry_t *)(addr_P5Dmap)) extern int nkpt; /* Initial number of kernel page tables */ extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ +extern u_int64_t KPML5phys; /* physical address of kernel level 5 */ /* * virtual address to page table entry and @@ -333,8 +361,8 @@ struct pmap_pcids { */ struct pmap { struct mtx pm_mtx; - pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ - pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */ + pml4_entry_t *pm_pmltop; /* KVA of top level page table */ + pml4_entry_t *pm_pmltopu; /* KVA of user top page table */ uint64_t pm_cr3; uint64_t pm_ucr3; TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ @@ -447,6 +475,7 @@ bool pmap_not_in_di(void); boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); void pmap_pinit_pml4(vm_page_t); +void pmap_pinit_pml5(vm_page_t); bool pmap_ps_enabled(pmap_t pmap); void pmap_unmapdev(vm_offset_t, vm_size_t); void pmap_invalidate_page(pmap_t, vm_offset_t); @@ -502,6 +531,13 @@ pmap_pml4e_index(vm_offset_t va) return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } +static __inline vm_pindex_t +pmap_pml5e_index(vm_offset_t va) +{ + + return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1)); +} + #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 75f357c3a53e6..e74f1626a56a2 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -84,6 +84,8 @@ struct mdproc { }; #define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */ +#define P_MD_LA48 0x00000002 /* Request LA48 after exec */ +#define P_MD_LA57 0x00000004 /* Request LA57 after exec */ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 768 diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 2fe349e0beb59..64eed5760357e 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -169,25 +169,32 @@ * 
0xffffffff80000000 KERNBASE */ -#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0) -#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \ +#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0) +#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \ NPDPEPG-1, NPDEPG-1, NPTEPG-1) -#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0) -#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0) +#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0) +#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0) -#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0) -#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0) +#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) +#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) -#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0) +#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0) -#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) -#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0) +#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) +#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0) -#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0) +#define VM_MAXUSER_ADDRESS_LA57 UVADDR(NUPML5E, 0, 0, 0, 0) +#define VM_MAXUSER_ADDRESS_LA48 UVADDR(0, NUP4ML4E, 0, 0, 0) +#define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS_LA57 -#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) -#define USRSTACK SHAREDPAGE +#define SHAREDPAGE_LA57 (VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE) +#define SHAREDPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE) +#define USRSTACK_LA57 SHAREDPAGE_LA57 +#define USRSTACK_LA48 SHAREDPAGE_LA48 +#define USRSTACK USRSTACK_LA48 +#define PS_STRINGS_LA57 (USRSTACK_LA57 - sizeof(struct ps_strings)) +#define PS_STRINGS_LA48 (USRSTACK_LA48 - sizeof(struct ps_strings)) #define VM_MAX_ADDRESS UPT_MAX_ADDRESS #define VM_MIN_ADDRESS (0) diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index 81ccbd75b5cd0..bb80f324868c1 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -739,9 +739,9 @@ struct sysentvec elf_linux_sysvec = { .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, @@ -752,7 +752,7 @@ struct sysentvec elf_linux_sysvec = { .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index f9660024fe0c2..3b26de3d00ffa 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -560,7 +560,7 @@ svm_vminit(struct vm *vm, pmap_t pmap) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; - svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop); /* * Intercept read and write accesses to all MSRs. 
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ddfada8a60819..3fc6ccf28b639 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1030,7 +1030,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) } vmx->vm = vm; - vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop)); /* * Clean up EPTP-tagged guest physical and combined mappings diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c index cf24e6adae3f1..924a59b3d6568 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c @@ -43,6 +43,7 @@ #include <machine/clock.h> #include <machine/cpufunc.h> #include <machine/frame.h> +#include <machine/md_var.h> #include <machine/psl.h> #include <machine/trap.h> #include <vm/pmap.h> @@ -131,7 +132,7 @@ dtrace_invop_uninit(void) void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { - (*func)(0, (uintptr_t) addr_PTmap); + (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap); } void |
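
The pmap.c hunk above adds a read-only `vm.pmap.la57` sysctl (and matching loader tunable) that reports whether the host is running with 5-level paging. As a minimal illustrative sketch, not part of this commit, a userland program on a kernel carrying this change could query it roughly as follows; only the sysctl name comes from the diff, the rest is an assumption for demonstration.

```c
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

/*
 * Sketch: report whether the kernel runs with 5-level paging by reading
 * the vm.pmap.la57 sysctl introduced in the pmap.c change above.
 */
int
main(void)
{
	int la57;
	size_t len = sizeof(la57);

	if (sysctlbyname("vm.pmap.la57", &la57, &len, NULL, 0) == -1)
		err(1, "sysctlbyname(vm.pmap.la57)");
	printf("5-level paging (LA57): %s\n",
	    la57 != 0 ? "enabled" : "disabled");
	return (0);
}
```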