author     Konstantin Belousov <kib@FreeBSD.org>   2020-08-23 20:19:04 +0000
committer  Konstantin Belousov <kib@FreeBSD.org>   2020-08-23 20:19:04 +0000
commit     9ce875d9b59dde81bf116d24e6b8649075674303 (patch)
tree       8f51fc54f9c18becf6cdf42fe0284e9767d9ecfa
parent     4ba405dcdbf0abf9e142fc0b9c3c866359bf4c57 (diff)
-rw-r--r--  sys/amd64/amd64/elf_machdep.c            | 124
-rw-r--r--  sys/amd64/amd64/genassym.c               |   9
-rw-r--r--  sys/amd64/amd64/locore.S                 |  63
-rw-r--r--  sys/amd64/amd64/mp_machdep.c             |  40
-rw-r--r--  sys/amd64/amd64/mpboot.S                 |  27
-rw-r--r--  sys/amd64/amd64/pmap.c                   | 787
-rw-r--r--  sys/amd64/include/md_var.h               |   2
-rw-r--r--  sys/amd64/include/param.h                |   6
-rw-r--r--  sys/amd64/include/pmap.h                 |  72
-rw-r--r--  sys/amd64/include/proc.h                 |   2
-rw-r--r--  sys/amd64/include/vmparam.h              |  31
-rw-r--r--  sys/amd64/linux/linux_sysvec.c           |   8
-rw-r--r--  sys/amd64/vmm/amd/svm.c                  |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                |   2
-rw-r--r--  sys/cddl/dev/dtrace/amd64/dtrace_subr.c  |   3
15 files changed, 951 insertions(+), 227 deletions(-)
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c
index 1ab28676ce7de..3182d1758b18c 100644
--- a/sys/amd64/amd64/elf_machdep.c
+++ b/sys/amd64/amd64/elf_machdep.c
@@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$");
#include <machine/fpu.h>
#include <machine/md_var.h>
-struct sysentvec elf64_freebsd_sysvec = {
+struct sysentvec elf64_freebsd_sysvec_la48 = {
.sv_size = SYS_MAXSYSCALL,
.sv_table = sysent,
.sv_errsize = 0,
@@ -64,9 +64,9 @@ struct sysentvec elf64_freebsd_sysvec = {
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
.sv_copyout_strings = exec_copyout_strings,
@@ -78,14 +78,64 @@ struct sysentvec elf64_freebsd_sysvec = {
.sv_set_syscall_retval = cpu_set_syscall_retval,
.sv_fetch_syscall_args = cpu_fetch_syscall_args,
.sv_syscallnames = syscallnames,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
.sv_stackgap = elf64_stackgap,
};
-INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
+
+struct sysentvec elf64_freebsd_sysvec_la57 = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = __elfN(freebsd_fixup),
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_name = "FreeBSD ELF64",
+ .sv_coredump = __elfN(coredump),
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA57,
+ .sv_usrstack = USRSTACK_LA57,
+ .sv_psstrings = PS_STRINGS_LA57,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP |
+ SV_TIMEKEEP,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_shared_page_base = SHAREDPAGE_LA57,
+ .sv_shared_page_len = PAGE_SIZE,
+ .sv_schedtail = NULL,
+ .sv_thread_detach = NULL,
+ .sv_trap = NULL,
+ .sv_stackgap = elf64_stackgap,
+};
+
+static void
+amd64_init_sysvecs(void *arg)
+{
+ amd64_lower_shared_page(&elf64_freebsd_sysvec_la48);
+ if (la57) {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la57);
+ exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57,
+ &elf64_freebsd_sysvec_la48);
+ } else {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la48);
+ }
+}
+SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL);
void
amd64_lower_shared_page(struct sysentvec *sv)
@@ -98,29 +148,57 @@ amd64_lower_shared_page(struct sysentvec *sv)
}
}
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) amd64_lower_shared_page,
- &elf64_freebsd_sysvec);
+static boolean_t
+freebsd_brand_info_la57_img_compat(struct image_params *imgp,
+ int32_t *osrel __unused, uint32_t *fctl0)
+{
+ if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0)
+ return (TRUE);
+ if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
+ return (FALSE);
+ if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0)
+ return (FALSE);
+ return (TRUE);
+}
-static Elf64_Brandinfo freebsd_brand_info = {
+static Elf64_Brandinfo freebsd_brand_info_la48 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
- .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+};
+
+static Elf64_Brandinfo freebsd_brand_info_la57 = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/libexec/ld-elf.so.1",
+ .sysvec = &elf64_freebsd_sysvec_la57,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_freebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+ .header_supported = freebsd_brand_info_la57_img_compat,
};
+static void
+sysinit_register_elf64_brand_entries(void *arg __unused)
+{
+ /*
+ * _57 must go first so it can either claim the image or hand
+ * it to _48.
+ */
+ if (la57)
+ elf64_insert_brand_entry(&freebsd_brand_info_la57);
+ elf64_insert_brand_entry(&freebsd_brand_info_la48);
+}
SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_info);
+ sysinit_register_elf64_brand_entries, NULL);
static Elf64_Brandinfo freebsd_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
@@ -128,15 +206,14 @@ static Elf64_Brandinfo freebsd_brand_oinfo = {
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/usr/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_oinfo);
+ (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo);
static Elf64_Brandinfo kfreebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
@@ -144,15 +221,14 @@ static Elf64_Brandinfo kfreebsd_brand_info = {
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &kfreebsd_brand_info);
+ (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info);
void
elf64_dump_thread(struct thread *td, void *dst, size_t *off)
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index ec3707ce41f95..75500555105a4 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -99,11 +99,10 @@ ASSYM(TDP_KTHREAD, TDP_KTHREAD);
ASSYM(PAGE_SIZE, PAGE_SIZE);
ASSYM(NPTEPG, NPTEPG);
ASSYM(NPDEPG, NPDEPG);
-ASSYM(addr_PTmap, addr_PTmap);
-ASSYM(addr_PDmap, addr_PDmap);
-ASSYM(addr_PDPmap, addr_PDPmap);
-ASSYM(addr_PML4map, addr_PML4map);
-ASSYM(addr_PML4pml4e, addr_PML4pml4e);
+ASSYM(addr_P4Tmap, addr_P4Tmap);
+ASSYM(addr_P4Dmap, addr_P4Dmap);
+ASSYM(addr_P5Tmap, addr_P5Tmap);
+ASSYM(addr_P5Dmap, addr_P5Dmap);
ASSYM(PDESIZE, sizeof(pd_entry_t));
ASSYM(PTESIZE, sizeof(pt_entry_t));
ASSYM(PAGE_SHIFT, PAGE_SHIFT);
diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
index d070c10693b70..a9a7b5f3972c5 100644
--- a/sys/amd64/amd64/locore.S
+++ b/sys/amd64/amd64/locore.S
@@ -36,13 +36,8 @@
/*
* Compiled KERNBASE location
*/
- .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend
+ .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend
.set kernbase,KERNBASE
- .set loc_PTmap,addr_PTmap
- .set loc_PDmap,addr_PDmap
- .set loc_PDPmap,addr_PDPmap
- .set loc_PML4map,addr_PML4map
- .set loc_PML4pml4e,addr_PML4pml4e
.set dmapbase,DMAP_MIN_ADDRESS
.set dmapend,DMAP_MAX_ADDRESS
@@ -82,6 +77,62 @@ NON_GPROF_ENTRY(btext)
0: hlt
jmp 0b
+/* la57_trampoline(%rdi pml5) */
+NON_GPROF_ENTRY(la57_trampoline)
+ movq %rsp,%r11
+ movq %rbx,%r10
+ leaq la57_trampoline_end(%rip),%rsp
+
+ movq %cr0,%rdx
+ lgdtq la57_trampoline_gdt_desc(%rip)
+
+ pushq $(2<<3)
+ leaq l1(%rip),%rax
+ leaq l2(%rip),%rbx
+
+ pushq %rax
+ lretq
+ .code32
+
+l1: movl $(3<<3),%eax
+ movl %eax,%ss
+
+ movl %edx,%eax
+ andl $~CR0_PG,%eax
+ movl %eax,%cr0
+
+ movl %cr4,%eax
+ orl $CR4_LA57,%eax
+ movl %eax,%cr4
+
+ movl %edi,%cr3
+ movl %edx,%cr0
+
+ pushl $(1<<3)
+ pushl %ebx
+ lretl
+ .code64
+
+l2: movq %r11,%rsp
+ movq %r10,%rbx
+ retq
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt_desc)
+ .word la57_trampoline_end - la57_trampoline_gdt
+ .long 0 /* filled by pmap_bootstrap_la57 */
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt)
+ .long 0x00000000 /* null desc */
+ .long 0x00000000
+ .long 0x00000000 /* 64bit code */
+ .long 0x00209800
+ .long 0x0000ffff /* 32bit code */
+ .long 0x00cf9b00
+ .long 0x0000ffff /* universal data */
+ .long 0x00cf9300
+ .dcb.l 16,0
+NON_GPROF_ENTRY(la57_trampoline_end)
+
.bss
ALIGN_DATA /* just to be sure */
.globl bootstack
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d46362ba9f9c0..844cb49b536cd 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -96,7 +96,7 @@ __FBSDID("$FreeBSD$");
#define GiB(v) (v ## ULL << 30)
-#define AP_BOOTPT_SZ (PAGE_SIZE * 3)
+#define AP_BOOTPT_SZ (PAGE_SIZE * 4)
/* Temporary variables for init_secondary() */
char *doublefault_stack;
@@ -104,6 +104,8 @@ char *mce_stack;
char *nmi_stack;
char *dbg_stack;
+extern u_int mptramp_la57;
+
/*
* Local data and functions.
*/
@@ -240,6 +242,8 @@ cpu_mp_start(void)
assign_cpu_ids();
+ mptramp_la57 = la57;
+
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -395,9 +399,9 @@ mp_realloc_pcpu(int cpuid, int domain)
int
native_start_all_aps(void)
{
- u_int64_t *pt4, *pt3, *pt2;
+ u_int64_t *pt5, *pt4, *pt3, *pt2;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, domain, i;
+ int apic_id, cpu, domain, i, xo;
u_char mpbiosreason;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
@@ -406,18 +410,38 @@ native_start_all_aps(void)
bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
/* Locate the page tables, they'll be below the trampoline */
- pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ if (la57) {
+ pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ xo = 1;
+ } else {
+ xo = 0;
+ }
+ pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
/* Create the initial 1GB replicated page tables */
for (i = 0; i < 512; i++) {
- /* Each slot of the level 4 pages points to the same level 3 page */
- pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
+ if (la57) {
+ pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ PAGE_SIZE);
+ pt5[i] |= PG_V | PG_RW | PG_U;
+ }
+
+ /*
+ * Each slot of the level 4 pages points to the same
+ * level 3 page.
+ */
+ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ (xo + 1) * PAGE_SIZE);
pt4[i] |= PG_V | PG_RW | PG_U;
- /* Each slot of the level 3 pages points to the same level 2 page */
- pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
+ /*
+ * Each slot of the level 3 pages points to the same
+ * level 2 page.
+ */
+ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ ((xo + 2) * PAGE_SIZE));
pt3[i] |= PG_V | PG_RW | PG_U;
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
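For orientation, a minimal user-space sketch (illustrative only, not part of the change; the base address is hypothetical) of how the xo offset above lays out the AP_BOOTPT_SZ block: with LA57 a PML5 page is prepended at mptramp_pagetables and the PML4/PDPT/PD pages shift down by one slot.

#include <stdio.h>

#define PAGE_SIZE 4096ul

/* Mirrors the xo offset logic in native_start_all_aps(); illustrative only. */
static void
ap_bootpt_layout(unsigned long mptramp_pagetables, int la57)
{
	int xo = la57 ? 1 : 0;

	if (la57)
		printf("PML5 at %#lx\n", mptramp_pagetables);
	printf("PML4 at %#lx\n", mptramp_pagetables + xo * PAGE_SIZE);
	printf("PDPT at %#lx\n", mptramp_pagetables + (xo + 1) * PAGE_SIZE);
	printf("PD   at %#lx\n", mptramp_pagetables + (xo + 2) * PAGE_SIZE);
}

int
main(void)
{
	ap_bootpt_layout(0x9000, 0);	/* LA48: three of the four pages used */
	ap_bootpt_layout(0x9000, 1);	/* LA57: all four pages used */
	return (0);
}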
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index 5545fe9290d14..fb75d2b884400 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -90,10 +90,16 @@ protmode:
mov $bootdata-gdt, %eax
mov %ax, %ds
- /* Turn on the PAE bit for when paging is enabled */
+ /*
+ * Turn on the PAE bit and optionally the LA57 bit for when paging
+ * is later enabled.
+ */
mov %cr4, %eax
orl $CR4_PAE, %eax
- mov %eax, %cr4
+ cmpb $0, mptramp_la57-mptramp_start(%ebx)
+ je 1f
+ orl $CR4_LA57, %eax
+1: mov %eax, %cr4
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
@@ -132,9 +138,9 @@ protmode:
/*
* At this point paging is enabled, and we are in "compatibility" mode.
* We do another far jump to reload %cs with the 64 bit selector.
- * %cr3 points to a 4-level page table page.
+ * %cr3 points to a 4- or 5-level page table.
* We cannot yet jump all the way to the kernel because we can only
- * specify a 32 bit linear address. So, yet another trampoline.
+ * specify a 32 bit linear address. So, we use yet another trampoline.
*
* The following instruction is:
* ljmp $kernelcode-gdt, $tramp_64-mptramp_start
@@ -209,6 +215,11 @@ gdtend:
mptramp_pagetables:
.long 0
+ /* 5-level paging ? */
+ .globl mptramp_la57
+mptramp_la57:
+ .long 0
+
/*
* The pseudo descriptor for lgdt to use.
*/
@@ -251,8 +262,12 @@ entry_64:
* Load a real %cr3 that has all the direct map stuff and switches
* off the 1GB replicated mirror. Load a stack pointer and jump
* into AP startup code in C.
- */
+ */
+ cmpl $0, la57
+ jne 2f
movq KPML4phys, %rax
- movq %rax, %cr3
+ jmp 3f
+2: movq KPML5phys, %rax
+3: movq %rax, %cr3
movq bootSTK, %rsp
jmp init_secondary
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index d025beff45186..4b17debd480d3 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -398,6 +398,19 @@ static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&pg_ps_enabled, 0, "Are large page mappings enabled?");
+int __read_frequently la57 = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+ &la57, 0,
+ "5-level paging for host is enabled");
+
+static bool
+pmap_is_la57(pmap_t pmap)
+{
+ if (pmap->pm_type == PT_X86)
+ return (la57);
+ return (false); /* XXXKIB handle EPT */
+}
+
#define PAT_INDEX_SIZE 8
static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
@@ -405,7 +418,10 @@ static u_int64_t KPTphys; /* phys addr of kernel level 1 */
static u_int64_t KPDphys; /* phys addr of kernel level 2 */
u_int64_t KPDPphys; /* phys addr of kernel level 3 */
u_int64_t KPML4phys; /* phys addr of kernel level 4 */
+u_int64_t KPML5phys; /* phys addr of kernel level 5,
+ if supported */
+static pml4_entry_t *kernel_pml4;
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
static int ndmpdpphys; /* number of DMPDPphys pages */
@@ -1257,7 +1273,7 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
- struct rwlock **lockp);
+ struct rwlock **lockp, vm_offset_t va);
static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
@@ -1271,20 +1287,85 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
/* Inline functions */
/********************/
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Return non-clipped indexes for a given VA, i.e. the page table
+ * page indexes at the corresponding levels.
+ */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
return (va >> PDRSHIFT);
}
+static __inline vm_pindex_t
+pmap_pdpe_pindex(vm_offset_t va)
+{
+ return (NUPDE + (va >> PDPSHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml4e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + (va >> PML4SHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml5e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+
+ /* XXX MPASS(pmap_is_la57(pmap)); */
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
+ pml5_entry_t *pml5e;
+ pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
- return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, va);
+ PG_V = pmap_valid_bit(pmap);
+ if ((*pml5e & PG_V) == 0)
+ return (NULL);
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ } else {
+ pml4e = pmap->pm_pmltop;
+ }
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
+{
+ MPASS(!pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
}
/* Return a pointer to the PDP slot that corresponds to a VA */
@@ -1306,7 +1387,7 @@ pmap_pdpe(pmap_t pmap, vm_offset_t va)
PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
- if ((*pml4e & PG_V) == 0)
+ if (pml4e == NULL || (*pml4e & PG_V) == 0)
return (NULL);
return (pmap_pml4e_to_pdpe(pml4e, va));
}
@@ -1387,21 +1468,37 @@ pmap_resident_count_dec(pmap_t pmap, int count)
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
- return (PTmap + ((va >> PAGE_SHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Tmap + ((va >> PAGE_SHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Tmap + ((va >> PAGE_SHIFT) & mask));
+ }
}
static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
- return (PDmap + ((va >> PDRSHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Dmap + ((va >> PDRSHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Dmap + ((va >> PDRSHIFT) & mask));
+ }
}
static u_int64_t
@@ -1658,6 +1755,8 @@ create_pagetables(vm_paddr_t *firstaddr)
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
}
+
+ kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
}
/*
@@ -1730,7 +1829,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
* later unmapped (using pmap_remove()) and freed.
*/
PMAP_LOCK_INIT(kernel_pmap);
- kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ kernel_pmap->pm_pmltop = kernel_pml4;
kernel_pmap->pm_cr3 = KPML4phys;
kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
@@ -1891,6 +1990,148 @@ pmap_init_pat(void)
load_cr4(cr4);
}
+extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
+ la57_trampoline_gdt[], la57_trampoline_end[];
+
+static void
+pmap_bootstrap_la57(void *arg __unused)
+{
+ char *v_code;
+ pml5_entry_t *v_pml5;
+ pml4_entry_t *v_pml4;
+ pdp_entry_t *v_pdp;
+ pd_entry_t *v_pd;
+ pt_entry_t *v_pt;
+ vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
+ void (*la57_tramp)(uint64_t pml5);
+ struct region_descriptor r_gdt;
+
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
+ return;
+ if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
+ la57 = 1;
+ if (!la57)
+ return;
+
+ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+ r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
+
+ m_code = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_code->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_code);
+ v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
+ m_pml5 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml5->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml5);
+ KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
+ v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
+ m_pml4 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml4->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml4);
+ v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
+ m_pdp = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pdp->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pdp);
+ v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+ m_pd = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pd->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pd);
+ v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
+ m_pt = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pt->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pt);
+ v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
+
+ /*
+ * Map m_code 1:1, it appears below 4G in KVA due to physical
+ * address being below 4G. Since kernel KVA is in upper half,
+ * the pml4e should be zero and free for temporary use.
+ */
+ kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Add pml5 entry at top of KVA pointing to existing pml4 table,
+ * entering all existing kernel mappings into level 5 table.
+ */
+ v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
+
+ /*
+ * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
+ */
+ v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Copy and call the 48->57 trampoline, hope we return there, alive.
+ */
+ bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
+ *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
+ la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
+ la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
+ la57_tramp(KPML5phys);
+
+ /*
+ * The GDT was necessarily reset by the trampoline; switch back to our GDT.
+ */
+ lgdt(&r_gdt);
+ wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+ /*
+ * Now unmap the trampoline, and free the pages.
+ * Clear pml5 entry used for 1:1 trampoline mapping.
+ */
+ pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
+ invlpg((vm_offset_t)v_code);
+ vm_page_free(m_code);
+ vm_page_free(m_pdp);
+ vm_page_free(m_pd);
+ vm_page_free(m_pt);
+
+ /*
+ * Recursively map PML5 to itself in order to get PTmap and
+ * PDmap.
+ */
+ v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
+
+ kernel_pmap->pm_cr3 = KPML5phys;
+ kernel_pmap->pm_pmltop = v_pml5;
+}
+SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
+
/*
* Initialize a vm_page's machine-dependent fields.
*/
@@ -2190,7 +2431,8 @@ pmap_init(void)
}
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
- kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
+ /* XXXKIB la57 */
+ kernel_pml4[LMSPML4I + i] = X86_PG_V |
X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
VM_PAGE_TO_PHYS(m);
}
@@ -3566,44 +3808,57 @@ pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t pdpg, pdppg, pml4pg;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
/*
* unmap the page table page
*/
- if (m->pindex >= NUPDE + NUPDPE) {
+ if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
+ /* PML4 page */
+ MPASS(pmap_is_la57(pmap));
+ pml5 = pmap_pml5e(pmap, va);
+ *pml5 = 0;
+ if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
+ pml5 = pmap_pml5e_u(pmap, va);
+ *pml5 = 0;
+ }
+ } else if (m->pindex >= NUPDE + NUPDPE) {
/* PDP page */
- pml4_entry_t *pml4;
pml4 = pmap_pml4e(pmap, va);
*pml4 = 0;
- if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
- pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ va <= VM_MAXUSER_ADDRESS) {
+ pml4 = pmap_pml4e_u(pmap, va);
*pml4 = 0;
}
} else if (m->pindex >= NUPDE) {
/* PD page */
- pdp_entry_t *pdp;
pdp = pmap_pdpe(pmap, va);
*pdp = 0;
} else {
/* PTE page */
- pd_entry_t *pd;
pd = pmap_pde(pmap, va);
*pd = 0;
}
pmap_resident_count_dec(pmap, 1);
if (m->pindex < NUPDE) {
/* We just released a PT, unhold the matching PD */
- vm_page_t pdpg;
-
pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdpg, free);
} else if (m->pindex < NUPDE + NUPDPE) {
/* We just released a PD, unhold the matching PDP */
- vm_page_t pdppg;
-
pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdppg, free);
+ } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
+ /* We just released a PDP, unhold the matching PML4 */
+ pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
+ pmap_unwire_ptp(pmap, va, pml4pg, free);
}
/*
@@ -3659,9 +3914,9 @@ pmap_pinit0(pmap_t pmap)
int i;
PMAP_LOCK_INIT(pmap);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
- pmap->pm_pml4u = NULL;
- pmap->pm_cr3 = KPML4phys;
+ pmap->pm_pmltop = kernel_pmap->pm_pmltop;
+ pmap->pm_pmltopu = NULL;
+ pmap->pm_cr3 = kernel_pmap->pm_cr3;
/* hack to keep pmap_pti_pcid_invalidate() alive */
pmap->pm_ucr3 = PMAP_NO_CR3;
pmap->pm_root.rt_root = 0;
@@ -3714,18 +3969,59 @@ pmap_pinit_pml4(vm_page_t pml4pg)
/* install large map entries if configured */
for (i = 0; i < lm_ents; i++)
- pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
+ pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
+}
+
+void
+pmap_pinit_pml5(vm_page_t pml5pg)
+{
+ pml5_entry_t *pm_pml5;
+
+ pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
+
+ /*
+ * Add pml5 entry at top of KVA pointing to existing pml4 table,
+ * entering all existing kernel mappings into level 5 table.
+ */
+ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+
+ /*
+ * Install self-referential address mapping entry.
+ */
+ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
+ X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
static void
-pmap_pinit_pml4_pti(vm_page_t pml4pg)
+pmap_pinit_pml4_pti(vm_page_t pml4pgu)
{
- pml4_entry_t *pm_pml4;
+ pml4_entry_t *pm_pml4u;
int i;
- pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
for (i = 0; i < NPML4EPG; i++)
- pm_pml4[i] = pti_pml4[i];
+ pm_pml4u[i] = pti_pml4[i];
+}
+
+static void
+pmap_pinit_pml5_pti(vm_page_t pml5pgu)
+{
+ pml5_entry_t *pm_pml5u;
+
+ pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
+
+ /*
+ * Add pml5 entry at top of KVA pointing to existing pml4 pti
+ * table, entering all kernel mappings needed for usermode
+ * into level 5 table.
+ */
+ pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
+ pmap_kextract((vm_offset_t)pti_pml4) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
/*
@@ -3735,29 +4031,30 @@ pmap_pinit_pml4_pti(vm_page_t pml4pg)
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
- vm_page_t pml4pg, pml4pgu;
- vm_paddr_t pml4phys;
+ vm_page_t pmltop_pg, pmltop_pgu;
+ vm_paddr_t pmltop_phys;
int i;
/*
* allocate the page directory page
*/
- pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
- pml4phys = VM_PAGE_TO_PHYS(pml4pg);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
+ pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
+
CPU_FOREACH(i) {
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
pmap->pm_pcids[i].pm_gen = 0;
}
pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
pmap->pm_ucr3 = PMAP_NO_CR3;
- pmap->pm_pml4u = NULL;
+ pmap->pm_pmltopu = NULL;
pmap->pm_type = pm_type;
- if ((pml4pg->flags & PG_ZERO) == 0)
- pagezero(pmap->pm_pml4);
+ if ((pmltop_pg->flags & PG_ZERO) == 0)
+ pagezero(pmap->pm_pmltop);
/*
* Do not install the host kernel mappings in the nested page
@@ -3766,15 +4063,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
* Install minimal kernel mappings in PTI case.
*/
if (pm_type == PT_X86) {
- pmap->pm_cr3 = pml4phys;
- pmap_pinit_pml4(pml4pg);
+ pmap->pm_cr3 = pmltop_phys;
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5(pmltop_pg);
+ else
+ pmap_pinit_pml4(pmltop_pg);
if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
- pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+ pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
- pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
- VM_PAGE_TO_PHYS(pml4pgu));
- pmap_pinit_pml4_pti(pml4pgu);
- pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+ pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
+ VM_PAGE_TO_PHYS(pmltop_pgu));
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5_pti(pmltop_pgu);
+ else
+ pmap_pinit_pml4_pti(pmltop_pgu);
+ pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
}
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
rangeset_init(&pmap->pm_pkru, pkru_dup_range,
@@ -3799,14 +4102,88 @@ pmap_pinit(pmap_t pmap)
return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
}
+static pml4_entry_t *
+pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_pindex_t pml5index;
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ vm_page_t pml4pg;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ if (!pmap_is_la57(pmap))
+ return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
+
+ PG_V = pmap_valid_bit(pmap);
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ if ((*pml5 & PG_V) == 0) {
+ if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
+ pml4 = &pml4[pmap_pml4e_index(va)];
+ if ((*pml4 & PG_V) == 0) {
+ pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
+ if (allocated && !addref)
+ pml4pg->ref_count--;
+ else if (!allocated && addref)
+ pml4pg->ref_count++;
+ }
+ return (pml4);
+}
+
+static pdp_entry_t *
+pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_page_t pdppg;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ PG_V = pmap_valid_bit(pmap);
+
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
+ if (pml4 == NULL)
+ return (NULL);
+
+ if ((*pml4 & PG_V) == 0) {
+ /* Have to allocate a new pdp, recurse */
+ if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pmap_pdpe_index(va)];
+ if ((*pdp & PG_V) == 0) {
+ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
+ if (allocated && !addref)
+ pdppg->ref_count--;
+ else if (!allocated && addref)
+ pdppg->ref_count++;
+ }
+ return (pdp);
+}
+
/*
* This routine is called if the desired page table page does not exist.
*
* If page table page allocation fails, this routine may sleep before
* returning NULL. It sleeps only if a lock pointer was given.
*
- * Note: If a page allocation fails at page table level two or three,
- * one or two pages may be held during the wait, only to be released
+ * Note: If a page allocation fails at page table level two, three, or four,
+ * up to three pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
*
@@ -3823,20 +4200,35 @@ pmap_pinit(pmap_t pmap)
* - for the page directory pointer page,
* ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
* NPML4EPGSHIFT),
- * i.e. index of pml4e is put after the last index of PDPE.
+ * i.e. index of pml4e is put after the last index of PDPE,
+ * - for the PML4 page (if LA57 mode is enabled),
+ * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
+ * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT),
+ * i.e. index of pml5e is put after the last index of PML4E.
*
* Define an order on the paging entries, where all entries of the
* same height are put together, then heights are put from deepest to
* root. Then ptepindex is the sequential number of the
* corresponding paging entry in this order.
*
- * The root page at PML4 does not participate in this indexing scheme, since
- * it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
+ * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
+ * LA57 paging structures even in LA48 paging mode. Moreover, the
+ * ptepindexes are calculated as if the paging structures were 5-level
+ * regardless of the actual mode of operation.
+ *
+ * The root page at PML4/PML5 does not participate in this indexing scheme,
+ * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
*/
static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
+ vm_offset_t va __unused)
{
- vm_page_t m, pdppg, pdpg;
+ vm_pindex_t pml5index, pml4index;
+ pml5_entry_t *pml5, *pml5u;
+ pml4_entry_t *pml4, *pml4u;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t m, pdpg;
pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -3872,16 +4264,38 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
* Map the pagetable page into the process address space, if
* it isn't already there.
*/
+ if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
+ MPASS(pmap_is_la57(pmap));
+
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ KASSERT((*pml5 & PG_V) == 0,
+ ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
+ *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (ptepindex >= (NUPDE + NUPDPE)) {
- pml4_entry_t *pml4, *pml4u;
- vm_pindex_t pml4index;
+ if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
+ if (pmap->pm_ucr3 != PMAP_NO_CR3)
+ *pml5 |= pg_nx;
+ pml5u = &pmap->pm_pmltopu[pml5index];
+ *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+ PG_A | PG_M;
+ }
+ } else if (ptepindex >= NUPDE + NUPDPE) {
+ pml4index = pmap_pml4e_index(va);
/* Wire up a new PDPE page */
- pml4index = ptepindex - (NUPDE + NUPDPE);
- pml4 = &pmap->pm_pml4[pml4index];
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
+ if (pml4 == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ KASSERT((*pml4 & PG_V) == 0,
+ ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
+
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ pml4index < NUPML4E) {
/*
* PTI: Make all user-space mappings in the
* kernel-mode page table no-execute so that
@@ -3892,85 +4306,48 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
if (pmap->pm_ucr3 != PMAP_NO_CR3)
*pml4 |= pg_nx;
- pml4u = &pmap->pm_pml4u[pml4index];
+ pml4u = &pmap->pm_pmltopu[pml4index];
*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
PG_A | PG_M;
}
-
} else if (ptepindex >= NUPDE) {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
-
/* Wire up a new PDE page */
- pdpindex = ptepindex - NUPDE;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- /* Have to allocate a new pdp, recurse */
- if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to pdp page */
- pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- pdppg->ref_count++;
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-
- /* Now find the pdp page */
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ KASSERT((*pdp & PG_V) == 0,
+ ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
-
} else {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
- pd_entry_t *pd;
-
/* Wire up a new PTE page */
- pdpindex = ptepindex >> NPDPEPGSHIFT;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- /* First, find the pdp and check that its valid. */
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ if ((*pdp & PG_V) == 0) {
/* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
+ if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va),
+ lockp, va) == NULL) {
vm_page_unwire_noq(m);
vm_page_free_zero(m);
return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
} else {
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
- if ((*pdp & PG_V) == 0) {
- /* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to the pd page */
- pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
- pdpg->ref_count++;
- }
+ /* Add reference to the pd page */
+ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ pdpg->ref_count++;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
/* Now we know where the page directory page is */
- pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
+ pd = &pd[pmap_pde_index(va)];
+ KASSERT((*pd & PG_V) == 0,
+ ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
}
@@ -4003,7 +4380,7 @@ retry:
} else if (va < VM_MAXUSER_ADDRESS) {
/* Allocate a pd page. */
pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
- pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+ pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va);
if (pdpg == NULL) {
if (lockp != NULL)
goto retry;
@@ -4064,7 +4441,7 @@ retry:
* Here if the pte page isn't mapped, or if it has been
* deallocated.
*/
- m = _pmap_allocpte(pmap, ptepindex, lockp);
+ m = _pmap_allocpte(pmap, ptepindex, lockp, va);
if (m == NULL && lockp != NULL)
goto retry;
}
@@ -4088,28 +4465,35 @@ pmap_release(pmap_t pmap)
int i;
KASSERT(pmap->pm_stats.resident_count == 0,
- ("pmap_release: pmap resident count %ld != 0",
- pmap->pm_stats.resident_count));
+ ("pmap_release: pmap %p resident count %ld != 0",
+ pmap, pmap->pm_stats.resident_count));
KASSERT(vm_radix_is_empty(&pmap->pm_root),
- ("pmap_release: pmap has reserved page table page(s)"));
+ ("pmap_release: pmap %p has reserved page table page(s)",
+ pmap));
KASSERT(CPU_EMPTY(&pmap->pm_active),
("releasing active pmap %p", pmap));
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
- for (i = 0; i < NKPML4E; i++) /* KVA */
- pmap->pm_pml4[KPML4BASE + i] = 0;
- for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
- pmap->pm_pml4[DMPML4I + i] = 0;
- pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
- for (i = 0; i < lm_ents; i++) /* Large Map */
- pmap->pm_pml4[LMSPML4I + i] = 0;
+ if (pmap_is_la57(pmap)) {
+ pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
+ pmap->pm_pmltop[PML5PML5I] = 0;
+ } else {
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pmltop[KPML4BASE + i] = 0;
+ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
+ pmap->pm_pmltop[DMPML4I + i] = 0;
+ pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */
+ for (i = 0; i < lm_ents; i++) /* Large Map */
+ pmap->pm_pmltop[LMSPML4I + i] = 0;
+ }
vm_page_unwire_noq(m);
vm_page_free_zero(m);
- if (pmap->pm_pml4u != NULL) {
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+ if (pmap->pm_pmltopu != NULL) {
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
+ pm_pmltopu));
vm_page_unwire_noq(m);
vm_page_free(m);
}
@@ -5448,6 +5832,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
vm_offset_t va_next;
+ pml5_entry_t *pml5e;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
@@ -5490,7 +5875,18 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
if (pmap->pm_stats.resident_count == 0)
break;
- pml4e = pmap_pml4e(pmap, sva);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, sva);
+ if ((*pml5e & PG_V) == 0) {
+ va_next = (sva + NBPML5) & ~PML5MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
+ } else {
+ pml4e = pmap_pml4e(pmap, sva);
+ }
if ((*pml4e & PG_V) == 0) {
va_next = (sva + NBPML4) & ~PML4MASK;
if (va_next < sva)
@@ -6110,7 +6506,7 @@ retry:
*/
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
- nosleep ? NULL : &lock);
+ nosleep ? NULL : &lock, va);
if (mpte == NULL && nosleep) {
rv = KERN_RESOURCE_SHORTAGE;
goto out;
@@ -6593,7 +6989,8 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
* Pass NULL instead of the PV list lock
* pointer, because we don't intend to sleep.
*/
- mpte = _pmap_allocpte(pmap, ptepindex, NULL);
+ mpte = _pmap_allocpte(pmap, ptepindex, NULL,
+ va);
if (mpte == NULL)
return (mpte);
}
@@ -9346,11 +9743,11 @@ pmap_large_map_pdpe(vm_offset_t va)
("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
"%#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
"LMSPML4I %#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -10425,7 +10822,9 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
}
/*
@@ -10519,7 +10918,9 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -10549,7 +10950,7 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sva |= -1ul << 48;
restart:
- pml4e = kernel_pmap->pm_pml4[i];
+ pml4e = kernel_pml4[i];
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -10632,6 +11033,7 @@ SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;
+ pml5_entry_t *pml5;
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
@@ -10650,8 +11052,20 @@ DB_SHOW_COMMAND(pte, pmap_print_pte)
pmap = PCPU_GET(curpmap);
PG_V = pmap_valid_bit(pmap);
- pml4 = pmap_pml4e(pmap, va);
- db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
+ db_printf("VA 0x%016lx", va);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap_pml5e(pmap, va);
+ db_printf(" pml5e 0x%016lx", *pml5);
+ if ((*pml5 & PG_V) == 0) {
+ db_printf("\n");
+ return;
+ }
+ pml4 = pmap_pml5e_to_pml4e(pml5, va);
+ } else {
+ pml4 = pmap_pml4e(pmap, va);
+ }
+ db_printf(" pml4e 0x%016lx", *pml4);
if ((*pml4 & PG_V) == 0) {
db_printf("\n");
return;
@@ -10683,4 +11097,95 @@ DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
db_printf("show phys2dmap addr\n");
}
}
+
+static void
+ptpages_show_page(int level, int idx, vm_page_t pg)
+{
+ db_printf("l %d i %d pg %p phys %#lx ref %x\n",
+ level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
+}
+
+static void
+ptpages_show_complain(int level, int idx, uint64_t pte)
+{
+ db_printf("l %d i %d pte %#lx\n", level, idx, pte);
+}
+
+static void
+ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
+{
+ vm_page_t pg3, pg2, pg1;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ int i4, i3, i2;
+
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
+ for (i4 = 0; i4 < num_entries; i4++) {
+ if ((pml4[i4] & PG_V) == 0)
+ continue;
+ pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
+ if (pg3 == NULL) {
+ ptpages_show_complain(3, i4, pml4[i4]);
+ continue;
+ }
+ ptpages_show_page(3, i4, pg3);
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
+ for (i3 = 0; i3 < NPDPEPG; i3++) {
+ if ((pdp[i3] & PG_V) == 0)
+ continue;
+ pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
+ if (pg2 == NULL) {
+ ptpages_show_complain(2, i3, pdp[i3]);
+ continue;
+ }
+ ptpages_show_page(2, i3, pg2);
+ pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
+ for (i2 = 0; i2 < NPDEPG; i2++) {
+ if ((pd[i2] & PG_V) == 0)
+ continue;
+ pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
+ if (pg1 == NULL) {
+ ptpages_show_complain(1, i2, pd[i2]);
+ continue;
+ }
+ ptpages_show_page(1, i2, pg1);
+ }
+ }
+ }
+}
+
+DB_SHOW_COMMAND(ptpages, pmap_ptpages)
+{
+ pmap_t pmap;
+ vm_page_t pg;
+ pml5_entry_t *pml5;
+ uint64_t PG_V;
+ int i5;
+
+ if (have_addr)
+ pmap = (pmap_t)addr;
+ else
+ pmap = PCPU_GET(curpmap);
+
+ PG_V = pmap_valid_bit(pmap);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap->pm_pmltop;
+ for (i5 = 0; i5 < NUPML5E; i5++) {
+ if ((pml5[i5] & PG_V) == 0)
+ continue;
+ pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
+ if (pg == NULL) {
+ ptpages_show_complain(4, i5, pml5[i5]);
+ continue;
+ }
+ ptpages_show_page(4, i5, pg);
+ ptpages_show_pml4(pg, NPML4EPG, PG_V);
+ }
+ } else {
+ ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
+ (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
+ }
+}
#endif
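As a side note, the unified ptepindex numbering described in the comment above _pmap_allocpte() can be checked with a small stand-alone sketch (illustrative only, not part of the change); the constants are copies of the amd64 values from param.h and pmap.h introduced here.

#include <stdio.h>

/* Copies of the amd64 constants; see sys/amd64/include/param.h and pmap.h. */
#define PDRSHIFT	21
#define PDPSHIFT	30
#define PML4SHIFT	39
#define PML5SHIFT	48
#define NUPML5E		(512ul / 2)		/* NPML5EPG / 2 */
#define NUPML4E		(NUPML5E * 512)		/* NUPML5E * NPML4EPG */
#define NUPDPE		(NUPML4E * 512)		/* NUPML4E * NPDPEPG */
#define NUPDE		(NUPDPE * 512)		/* NUPDPE * NPDEPG */

int
main(void)
{
	unsigned long va = 0x00007fffffffe000ul;	/* an example user VA */

	/* Same formulas as pmap_{pde,pdpe,pml4e,pml5e}_pindex(). */
	printf("pde   pindex %lu\n", va >> PDRSHIFT);
	printf("pdpe  pindex %lu\n", NUPDE + (va >> PDPSHIFT));
	printf("pml4e pindex %lu\n", NUPDE + NUPDPE + (va >> PML4SHIFT));
	printf("pml5e pindex %lu\n", NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
	return (0);
}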
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index 8d866fee846aa..9a550d7024feb 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -46,6 +46,8 @@ extern int syscall_ret_l1d_flush_mode;
extern vm_paddr_t intel_graphics_stolen_base;
extern vm_paddr_t intel_graphics_stolen_size;
+extern int la57;
+
/*
* The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
* value is the physical address at which the kernel is loaded.
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index ac3df693e4d45..2bd4d913a7b1a 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -118,6 +118,12 @@
#define PML4SHIFT 39 /* LOG2(NBPML4) */
#define NBPML4 (1UL<<PML4SHIFT)/* bytes/page map lev4 table */
#define PML4MASK (NBPML4-1)
+/* Size of the level 5 page-map level-5 table units */
+#define NPML5EPG (PAGE_SIZE/(sizeof (pml5_entry_t)))
+#define NPML5EPGSHIFT 9 /* LOG2(NPML5EPG) */
+#define PML5SHIFT 48 /* LOG2(NBPML5) */
+#define NBPML5 (1UL<<PML5SHIFT)/* bytes/page map lev5 table */
+#define PML5MASK (NBPML5-1)
#define MAXPAGESIZES 3 /* maximum number of supported page sizes */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index e2d7a714511b3..5cdcca66d9735 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -166,14 +166,22 @@
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
*/
-#define KVADDR(l4, l3, l2, l1) ( \
+#define KV4ADDR(l4, l3, l2, l1) ( \
((unsigned long)-1 << 47) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
((unsigned long)(l1) << PAGE_SHIFT))
+#define KV5ADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)-1 << 56) | \
+ ((unsigned long)(l5) << PML5SHIFT) | \
+ ((unsigned long)(l4) << PML4SHIFT) | \
+ ((unsigned long)(l3) << PDPSHIFT) | \
+ ((unsigned long)(l2) << PDRSHIFT) | \
+ ((unsigned long)(l1) << PAGE_SHIFT))
-#define UVADDR(l4, l3, l2, l1) ( \
+#define UVADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)(l5) << PML5SHIFT) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
@@ -187,9 +195,19 @@
*/
#define NKPML4E 4
-#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
-#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
-#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */
+/*
+ * We use the same numbering of the page table pages for 5-level and
+ * 4-level paging structures.
+ */
+#define NUPML5E (NPML5EPG / 2) /* number of userland PML5
+ pages */
+#define NUPML4E (NUPML5E * NPML4EPG) /* number of userland PML4
+ pages */
+#define NUPDPE (NUPML4E * NPDPEPG) /* number of userland PDP
+ pages */
+#define NUPDE (NUPDPE * NPDEPG) /* number of userland PD
+ entries */
+#define NUP4ML4E (NPML4EPG / 2)
/*
* NDMPML4E is the maximum number of PML4 entries that will be
@@ -216,7 +234,8 @@
* Or, in other words, KPML4I provides bits 39..47 of KERNBASE,
* and KPDPI provides bits 30..38.)
*/
-#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
+#define PML4PML4I (NPML4EPG / 2) /* Index of recursive pml4 mapping */
+#define PML5PML5I (NPML5EPG / 2) /* Index of recursive pml5 mapping */
#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
@@ -258,25 +277,34 @@ typedef u_int64_t pd_entry_t;
typedef u_int64_t pt_entry_t;
typedef u_int64_t pdp_entry_t;
typedef u_int64_t pml4_entry_t;
+typedef u_int64_t pml5_entry_t;
/*
* Address of current address space page table maps and directories.
*/
#ifdef _KERNEL
-#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
-#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
-#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
-#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
-#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
-#define PTmap ((pt_entry_t *)(addr_PTmap))
-#define PDmap ((pd_entry_t *)(addr_PDmap))
-#define PDPmap ((pd_entry_t *)(addr_PDPmap))
-#define PML4map ((pd_entry_t *)(addr_PML4map))
-#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
+#define addr_P4Tmap (KV4ADDR(PML4PML4I, 0, 0, 0))
+#define addr_P4Dmap (KV4ADDR(PML4PML4I, PML4PML4I, 0, 0))
+#define addr_P4DPmap (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
+#define addr_P4ML4map (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
+#define addr_P4ML4pml4e (addr_P4ML4map + (PML4PML4I * sizeof(pml4_entry_t)))
+#define P4Tmap ((pt_entry_t *)(addr_P4Tmap))
+#define P4Dmap ((pd_entry_t *)(addr_P4Dmap))
+
+#define addr_P5Tmap (KV5ADDR(PML5PML5I, 0, 0, 0, 0))
+#define addr_P5Dmap (KV5ADDR(PML5PML5I, PML5PML5I, 0, 0, 0))
+#define addr_P5DPmap (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, 0, 0))
+#define addr_P5ML4map (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, 0))
+#define addr_P5ML5map \
+ (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I))
+#define addr_P5ML5pml5e (addr_P5ML5map + (PML5PML5I * sizeof(pml5_entry_t)))
+#define P5Tmap ((pt_entry_t *)(addr_P5Tmap))
+#define P5Dmap ((pd_entry_t *)(addr_P5Dmap))
extern int nkpt; /* Initial number of kernel page tables */
extern u_int64_t KPDPphys; /* physical address of kernel level 3 */
extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
+extern u_int64_t KPML5phys; /* physical address of kernel level 5 */
/*
* virtual address to page table entry and
@@ -333,8 +361,8 @@ struct pmap_pcids {
*/
struct pmap {
struct mtx pm_mtx;
- pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
- pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */
+ pml4_entry_t *pm_pmltop; /* KVA of top level page table */
+ pml4_entry_t *pm_pmltopu; /* KVA of user top page table */
uint64_t pm_cr3;
uint64_t pm_ucr3;
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
@@ -447,6 +475,7 @@ bool pmap_not_in_di(void);
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
+void pmap_pinit_pml5(vm_page_t);
bool pmap_ps_enabled(pmap_t pmap);
void pmap_unmapdev(vm_offset_t, vm_size_t);
void pmap_invalidate_page(pmap_t, vm_offset_t);
@@ -502,6 +531,13 @@ pmap_pml4e_index(vm_offset_t va)
return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}
+static __inline vm_pindex_t
+pmap_pml5e_index(vm_offset_t va)
+{
+
+ return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
+}
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
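To make the recursive self-map bases concrete, the KV4ADDR/KV5ADDR arithmetic above can be evaluated directly. A minimal sketch (illustrative only), assuming PML4PML4I == PML5PML5I == 256 as defined in this header:

#include <stdio.h>

/* Simplified copies of the KV4ADDR/KV5ADDR macros introduced in pmap.h. */
#define KV4ADDR(l4, l3, l2, l1)						\
	(((unsigned long)-1 << 47) | ((unsigned long)(l4) << 39) |	\
	((unsigned long)(l3) << 30) | ((unsigned long)(l2) << 21) |	\
	((unsigned long)(l1) << 12))
#define KV5ADDR(l5, l4, l3, l2, l1)					\
	(((unsigned long)-1 << 56) | ((unsigned long)(l5) << 48) |	\
	((unsigned long)(l4) << 39) | ((unsigned long)(l3) << 30) |	\
	((unsigned long)(l2) << 21) | ((unsigned long)(l1) << 12))

int
main(void)
{
	/* PML4PML4I == PML5PML5I == 256, half of a 512-entry table. */
	printf("addr_P4Tmap %#lx\n", KV4ADDR(256, 0, 0, 0));
	printf("addr_P4Dmap %#lx\n", KV4ADDR(256, 256, 0, 0));
	printf("addr_P5Tmap %#lx\n", KV5ADDR(256, 0, 0, 0, 0));
	printf("addr_P5Dmap %#lx\n", KV5ADDR(256, 256, 0, 0, 0));
	return (0);
}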
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index 75f357c3a53e6..e74f1626a56a2 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -84,6 +84,8 @@ struct mdproc {
};
#define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */
+#define P_MD_LA48 0x00000002 /* Request LA48 after exec */
+#define P_MD_LA57 0x00000004 /* Request LA57 after exec */
#define KINFO_PROC_SIZE 1088
#define KINFO_PROC32_SIZE 768
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 2fe349e0beb59..64eed5760357e 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -169,25 +169,32 @@
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0)
-#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \
+#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0)
+#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \
NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
-#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0)
+#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
-#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0)
-#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0)
+#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
+#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
-#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)
+#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0)
-#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
-#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0)
+#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
+#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0)
-#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA57 UVADDR(NUPML5E, 0, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA48 UVADDR(0, NUP4ML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS_LA57
-#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
-#define USRSTACK SHAREDPAGE
+#define SHAREDPAGE_LA57 (VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE)
+#define SHAREDPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE)
+#define USRSTACK_LA57 SHAREDPAGE_LA57
+#define USRSTACK_LA48 SHAREDPAGE_LA48
+#define USRSTACK USRSTACK_LA48
+#define PS_STRINGS_LA57 (USRSTACK_LA57 - sizeof(struct ps_strings))
+#define PS_STRINGS_LA48 (USRSTACK_LA48 - sizeof(struct ps_strings))
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
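A quick arithmetic check of the resulting user address bounds (illustrative only, using NUP4ML4E == NUPML5E == 256 from pmap.h): LA48 keeps the traditional 128 TiB of user virtual address space, while LA57 raises the limit to 64 PiB.

#include <stdio.h>

#define PML4SHIFT	39
#define PML5SHIFT	48

int
main(void)
{
	/* UVADDR(0, NUP4ML4E, 0, 0, 0) and UVADDR(NUPML5E, 0, 0, 0, 0). */
	unsigned long la48 = 256ul << PML4SHIFT;
	unsigned long la57 = 256ul << PML5SHIFT;

	printf("VM_MAXUSER_ADDRESS_LA48 %#lx (%lu TiB)\n", la48, la48 >> 40);
	printf("VM_MAXUSER_ADDRESS_LA57 %#lx (%lu PiB)\n", la57, la57 >> 50);
	return (0);
}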
diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c
index 81ccbd75b5cd0..bb80f324868c1 100644
--- a/sys/amd64/linux/linux_sysvec.c
+++ b/sys/amd64/linux/linux_sysvec.c
@@ -739,9 +739,9 @@ struct sysentvec elf_linux_sysvec = {
.sv_imgact_try = linux_exec_imgact_try,
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = linux_copyout_auxargs,
.sv_copyout_strings = linux_copyout_strings,
@@ -752,7 +752,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = linux_schedtail,
.sv_thread_detach = linux_thread_detach,
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index f9660024fe0c2..3b26de3d00ffa 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -560,7 +560,7 @@ svm_vminit(struct vm *vm, pmap_t pmap)
panic("contigmalloc of SVM IO bitmap failed");
svm_sc->vm = vm;
- svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+ svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop);
/*
* Intercept read and write accesses to all MSRs.
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index ddfada8a60819..3fc6ccf28b639 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -1030,7 +1030,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
}
vmx->vm = vm;
- vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
+ vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));
/*
* Clean up EPTP-tagged guest physical and combined mappings
diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
index cf24e6adae3f1..924a59b3d6568 100644
--- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -43,6 +43,7 @@
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/frame.h>
+#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <vm/pmap.h>
@@ -131,7 +132,7 @@ dtrace_invop_uninit(void)
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
- (*func)(0, (uintptr_t) addr_PTmap);
+ (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap);
}
void