Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/acpica/acpi_wakeup.c            |  18
-rw-r--r--  sys/amd64/amd64/apic_vector.S             |   9
-rw-r--r--  sys/amd64/amd64/cpu_switch.S              |   4
-rw-r--r--  sys/amd64/amd64/exec_machdep.c            |   2
-rw-r--r--  sys/amd64/amd64/machdep.c                 |   9
-rw-r--r--  sys/amd64/amd64/pmap.c                    | 372
-rw-r--r--  sys/amd64/amd64/support.S                 |  20
-rw-r--r--  sys/amd64/amd64/trap.c                    |  19
-rw-r--r--  sys/amd64/conf/MINIMALUP                  |   4
-rw-r--r--  sys/amd64/include/param.h                 |   2
-rw-r--r--  sys/amd64/include/pmap.h                  |  14
-rw-r--r--  sys/amd64/include/smp.h                   |   3
-rw-r--r--  sys/amd64/include/vmm.h                   |   2
-rw-r--r--  sys/amd64/include/vmm_dev.h               |   7
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  |  25
-rw-r--r--  sys/amd64/include/vmparam.h               |  22
-rw-r--r--  sys/amd64/pt/pt.c                         | 978
-rw-r--r--  sys/amd64/pt/pt.h                         |  49
-rw-r--r--  sys/amd64/vmm/amd/svm.c                   |  90
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                 |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S         |   6
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c      |  34
-rw-r--r--  sys/amd64/vmm/vmm_ioport.c                |  40
23 files changed, 1337 insertions(+), 394 deletions(-)
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
#include <x86/apicreg.h>
#include <x86/apicvar.h>
-#ifdef SMP
#include <machine/smp.h>
#include <machine/vmparam.h>
-#endif
#include <contrib/dev/acpica/include/acpi.h>
@@ -73,19 +71,13 @@ extern int acpi_resume_beep;
extern int acpi_reset_video;
extern int acpi_susp_bounce;
-#ifdef SMP
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-#else
-static struct susppcb **susppcbs;
-#endif
static void acpi_stop_beep(void *);
-#ifdef SMP
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
-#endif
#define ACPI_WAKEPT_PAGES 7
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
timer_spkr_release();
}
-#ifdef SMP
static int
acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
{
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
outb(CMOS_DATA, mpbiosreason);
}
}
-#endif
int
acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
if (sc->acpi_wakeaddr == 0ul)
return (-1); /* couldn't alloc wake memory */
-#ifdef SMP
suspcpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
if (acpi_resume_beep != 0)
timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
pcb = &susppcbs[0]->sp_pcb;
if (savectx(pcb)) {
fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
device_printf(sc->acpi_dev, "Failed to suspend APs\n");
return (0); /* couldn't sleep */
}
-#endif
hw_ibrs_ibpb_active = 0;
hw_ssb_active = 0;
cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
lapic_xapic_mode();
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
acpi_wakeup_cpus(sc);
-#endif
}
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
resume_cpus(suspcpus);
-#endif
/*
* Re-read cpu_stdext_feature3, which was zeroed-out
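With the #ifdef SMP guards gone, the suspend path always computes the set of
application processors to park. A minimal C sketch of the flow, assembled from
the calls visible in this file (illustration only, not part of the commit):

	suspcpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &suspcpus);	/* everyone but the suspending CPU */
	if (savectx(&susppcbs[0]->sp_pcb)) {
		fpususpend(susppcbs[0]->sp_fpususpend);
		if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0)
			return (0);		/* couldn't park the APs */
	}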
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
.text
SUPERALIGN_TEXT
/* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
jmp doreti
#endif
-#ifdef SMP
/*
* Global address space TLB shootdown.
*/
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
INTR_HANDLER justreturn1
call as_lapic_eoi
jmp doreti
-
-#endif /* SMP */
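The deleted LK macro existed solely so uniprocessor builds could omit the lock
prefix; with SMP mandatory, call sites now spell the prefix out directly. The
removed pattern, for reference:

	#ifdef SMP
	#define	LK	lock ;
	#else
	#define	LK
	#endif

The same simplification appears in support.S further down, where conditional
lock prefixes around cmpxchg become unconditional.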
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
movq %r15,TD_LOCK(%r13) /* Release the old thread */
sw1:
leaq TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
movq $blocked_lock, %rdx
movq TD_LOCK(%r12),%rcx
cmpq %rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
END(resumectx)
/* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
sw1wait:
1:
pause
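The sw1wait loop was previously compiled only for SCHED_ULE kernels with SMP;
it now depends on SCHED_ULE alone. In C terms the loop amounts to the following
sketch (simplified from the assembly; blocked_lock is the scheduler's
placeholder lock held while a thread migrates between run queues):

	while (atomic_load_ptr(&td->td_lock) == &blocked_lock)
		cpu_spinwait();		/* the pause instruction above */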
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index f46462b39fa3..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -82,9 +81,7 @@
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -132,9 +129,7 @@
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
@@ -149,6 +144,10 @@
#include <isa/rtc.h>
#include <x86/init.h>
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
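The new guard turns any uniprocessor kernel configuration into an immediate
build-time failure with a clear diagnostic; accordingly, the MINIMALUP
configuration, which set "nooptions SMP", is deleted later in this commit.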
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2c7777e608b9..8df082f6c5dc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/sysarch.h>
#include <machine/tss.h>
@@ -483,6 +481,8 @@ vm_paddr_t KERNend; /* and the end */
struct kva_layout_s kva_layout = {
.kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
.dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
.dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
.lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
@@ -491,18 +491,36 @@ struct kva_layout_s kva_layout = {
.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
NPDEPG - 1, NPTEPG - 1),
.rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
};
struct kva_layout_s kva_layout_la57 = {
.kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */
+ .kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
.dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
.dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
- .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
- .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+ .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+ .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
.km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
NPDEPG - 1, NPTEPG - 1),
.rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
};
/*
@@ -2005,7 +2023,7 @@ create_pagetables(vm_paddr_t *firstaddr)
*/
p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V | pg_nx;
- } else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) {
+ } else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
/* Connect DMAP pml4 pages to PML5. */
p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
X86_PG_RW | X86_PG_V | pg_nx;
@@ -2475,6 +2493,7 @@ pmap_init(void)
struct pmap_preinit_mapping *ppim;
vm_page_t m, mpte;
pml4_entry_t *pml4e;
+ unsigned long lm_max;
int error, i, ret, skz63;
/* L1TF, reserve page @0 unconditionally */
@@ -2600,10 +2619,15 @@ pmap_init(void)
lm_ents = 8;
TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
- if (lm_ents > LMEPML4I - LMSPML4I + 1)
- lm_ents = LMEPML4I - LMSPML4I + 1;
+ lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+ if (lm_ents > lm_max) {
+ printf(
+ "pmap: shrinking large map from requested %d slots to %ld slots\n",
+ lm_ents, lm_max);
+ lm_ents = lm_max;
+ }
#ifdef KMSAN
- if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+ if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
printf(
"pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2615,12 +2639,20 @@ pmap_init(void)
lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
if (lm_ents != 0) {
large_vmem = vmem_create("large", kva_layout.lm_low,
- (vmem_size_t)kva_layout.lm_high - kva_layout.lm_low,
- PAGE_SIZE, 0, M_WAITOK);
+ (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
if (large_vmem == NULL) {
printf("pmap: cannot create large map\n");
lm_ents = 0;
}
+ if (la57) {
+ for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+ lm_ents, NBPML5); i++) {
+ m = pmap_large_map_getptp_unlocked();
+ kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
+ }
+ }
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
@@ -3031,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
* XXX TODO
*/
-#ifdef SMP
/*
* Interrupt the cpus that are executing in the guest context.
* This will force the vcpu to exit and the cached EPT mappings
@@ -3489,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
}
sched_unpin();
}
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- invlpg(va);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- vm_offset_t addr;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap) {
- if (pmap_pcid_enabled && invpcid_works) {
- bzero(&d, sizeof(d));
- invpcid(&d, INVPCID_CTXGLOB);
- } else {
- invltlb_glob();
- }
- } else if (pmap == PCPU_GET(curpmap)) {
- if (pmap_pcid_enabled) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
- invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
- } else {
- kcr3 = pmap->pm_cr3 | pcid;
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else
- load_cr3(kcr3);
- }
- critical_exit();
- } else {
- invltlb();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
- wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
- struct pmap_pcid *pcidp;
-
- pmap_update_pde_store(pmap, pde, newpde);
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
- pmap_update_pde_invalidate(pmap, va, newpde);
- else {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-#endif /* !SMP */
static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -6093,17 +5962,18 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
if (mpte == NULL) {
/*
* Invalidate the 2MB page mapping and return "failure" if the
- * mapping was never accessed.
+ * mapping was never accessed and not wired.
*/
if ((oldpde & PG_A) == 0) {
- KASSERT((oldpde & PG_W) == 0,
- ("pmap_demote_pde: a wired mapping is missing PG_A"));
- pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
- return (false);
- }
-
- mpte = pmap_remove_pt_page(pmap, va);
- if (mpte == NULL) {
+ if ((oldpde & PG_W) == 0) {
+ pmap_demote_pde_abort(pmap, va, pde, oldpde,
+ lockp);
+ return (false);
+ }
+ mpte = pmap_remove_pt_page(pmap, va);
+ /* Fill the PTP with PTEs that have PG_A cleared. */
+ mpte->valid = 0;
+ } else if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL) {
KASSERT((oldpde & PG_W) == 0,
("pmap_demote_pde: page table page for a wired mapping is missing"));
@@ -6155,7 +6025,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
/*
* If the PTP is not leftover from an earlier promotion or it does not
* have PG_A set in every PTE, then fill it. The new PTEs will all
- * have PG_A set.
+ * have PG_A set, unless this is a wired mapping with PG_A clear.
*/
if (!vm_page_all_valid(mpte))
pmap_fill_ptp(firstpte, newpte);
@@ -7561,6 +7431,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
PG_RW = pmap_rw_bit(pmap);
KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
("pmap_enter_pde: newpde is missing PG_M"));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7689,6 +7562,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
if (pdpg != NULL)
pmap_abort_ptp(pmap, va, pdpg);
+ else {
+ KASSERT(va >= VM_MAXUSER_ADDRESS &&
+ (*pde & (PG_PS | PG_V)) == PG_V,
+ ("pmap_enter_pde: invalid kernel PDE"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_pde: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -10333,17 +10214,9 @@ pmap_activate_sw(struct thread *td)
return;
}
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
- CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
}
void
@@ -10384,11 +10257,7 @@ pmap_activate_boot(pmap_t pmap)
MPASS(pmap != kernel_pmap);
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
PCPU_SET(curpmap, pmap);
if (pti) {
kcr3 = pmap->pm_cr3;
@@ -10752,19 +10621,28 @@ pmap_large_map_getptp(void)
static pdp_entry_t *
pmap_large_map_pdpe(vm_offset_t va)
{
+ pml4_entry_t *pml4;
vm_pindex_t pml4_idx;
vm_paddr_t mphys;
- pml4_idx = pmap_pml4e_index(va);
- KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
- ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
- "%#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
- ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
- "LMSPML4I %#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+ (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+ if (la57) {
+ pml4 = pmap_pml4e(kernel_pmap, va);
+ mphys = *pml4 & PG_FRAME;
+ } else {
+ pml4_idx = pmap_pml4e_index(va);
+
+ KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+ ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+ ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ }
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -12023,9 +11901,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range->sva = kva_layout.kva_max;
}
/*
@@ -12066,12 +11942,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
*/
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
- vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
- pt_entry_t pte)
+ vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+ pd_entry_t pde, pt_entry_t pte)
{
pt_entry_t attrs;
- attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ if (la57) {
+ attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+ attrs |= pml4e & pg_nx;
+ attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+ } else {
+ attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ }
attrs |= pdpe & pg_nx;
attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -12104,13 +11986,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
struct pmap_kernel_map_range range;
struct sbuf sbuf, *sb;
+ pml5_entry_t pml5e;
pml4_entry_t pml4e;
pdp_entry_t *pdp, pdpe;
pd_entry_t *pd, pde;
pt_entry_t *pt, pte;
vm_offset_t sva;
vm_paddr_t pa;
- int error, i, j, k, l;
+ int error, j, k, l;
+ bool first;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
@@ -12119,9 +12003,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range.sva = kva_layout.kva_max;
+ pml5e = 0; /* no UB for la48 */
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -12130,41 +12013,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
* Within the large map, ensure that PDP and PD page addresses are
* valid before descending.
*/
- for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
- switch (i) {
- case PML4PML4I:
+ for (first = true, sva = 0; sva != 0 || first; first = false) {
+ if (sva == kva_layout.rec_pt)
sbuf_printf(sb, "\nRecursive map:\n");
- break;
- case DMPML4I:
+ else if (sva == kva_layout.dmap_low)
sbuf_printf(sb, "\nDirect map:\n");
- break;
#ifdef KASAN
- case KASANPML4I:
+ else if (sva == kva_layout.kasan_shadow_low)
sbuf_printf(sb, "\nKASAN shadow map:\n");
- break;
#endif
#ifdef KMSAN
- case KMSANSHADPML4I:
+ else if (sva == kva_layout.kmsan_shadow_low)
sbuf_printf(sb, "\nKMSAN shadow map:\n");
- break;
- case KMSANORIGPML4I:
+ else if (sva == kva_layout.kmsan_origin_low)
sbuf_printf(sb, "\nKMSAN origin map:\n");
- break;
#endif
- case KPML4BASE:
+ else if (sva == kva_layout.km_low)
sbuf_printf(sb, "\nKernel map:\n");
- break;
- case LMSPML4I:
+ else if (sva == kva_layout.lm_low)
sbuf_printf(sb, "\nLarge map:\n");
- break;
- }
/* Convert to canonical form. */
- if (sva == 1ul << 47)
- sva |= -1ul << 48;
+ if (la57) {
+ if (sva == 1ul << 56) {
+ sva |= -1ul << 57;
+ continue;
+ }
+ } else {
+ if (sva == 1ul << 47) {
+ sva |= -1ul << 48;
+ continue;
+ }
+ }
restart:
- pml4e = kernel_pml4[i];
+ if (la57) {
+ pml5e = *pmap_pml5e(kernel_pmap, sva);
+ if ((pml5e & X86_PG_V) == 0) {
+ sva = rounddown2(sva, NBPML5);
+ sysctl_kmaps_dump(sb, &range, sva);
+ sva += NBPML5;
+ continue;
+ }
+ }
+ pml4e = *pmap_pml4e(kernel_pmap, sva);
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -12185,8 +12077,8 @@ restart:
pa = pdpe & PG_FRAME;
if ((pdpe & PG_PS) != 0) {
sva = rounddown2(sva, NBPDP);
- sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
- 0, 0);
+ sysctl_kmaps_check(sb, &range, sva, pml5e,
+ pml4e, pdpe, 0, 0);
range.pdpes++;
sva += NBPDP;
continue;
@@ -12198,6 +12090,7 @@ restart:
* freed. Validate the next-level address
* before descending.
*/
+ sva += NBPDP;
goto restart;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12214,7 +12107,7 @@ restart:
if ((pde & PG_PS) != 0) {
sva = rounddown2(sva, NBPDR);
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, 0);
+ pml5e, pml4e, pdpe, pde, 0);
range.pdes++;
sva += NBPDR;
continue;
@@ -12226,6 +12119,7 @@ restart:
* may be freed. Validate the
* next-level address before descending.
*/
+ sva += NBPDR;
goto restart;
}
pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12239,7 +12133,7 @@ restart:
continue;
}
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, pte);
+ pml5e, pml4e, pdpe, pde, pte);
range.ptes++;
}
}
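The kva_layout tables near the top of this diff are composed with the
KV4ADDR/KV5ADDR helpers, which build a canonical kernel virtual address from
per-level page-table indices. A sketch of their shape, assumed to match the
definitions in pmap.c (the leading -1 bits provide sign extension; the index
fields land at the standard 4-level/5-level paging shifts):

	#define	KV4ADDR(l4, l3, l2, l1) (		\
		((unsigned long)-1 << 47) |		\
		((unsigned long)(l4) << PML4SHIFT) |	\
		((unsigned long)(l3) << PDPSHIFT) |	\
		((unsigned long)(l2) << PDRSHIFT) |	\
		((unsigned long)(l1) << PAGE_SHIFT))

	#define	KV5ADDR(l5, l4, l3, l2, l1) (		\
		((unsigned long)-1 << 56) |		\
		((unsigned long)(l5) << PML5SHIFT) |	\
		((unsigned long)(l4) << PML4SHIFT) |	\
		((unsigned long)(l3) << PDPSHIFT) |	\
		((unsigned long)(l2) << PDRSHIFT) |	\
		((unsigned long)(l1) << PAGE_SHIFT))

For example, KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0) evaluates to
0xff00000000000000, the recursive page-table slot, which is why
kva_layout_la57.kva_min carries the comment "== rec_pt".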
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
ja fusufault
movl %esi,%eax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
setne %cl
/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
movl %esi,%eax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
clac
setne %cl
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
ja fusufault
movq %rsi,%rax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
setne %cl
/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
movq %rsi,%rax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
clac
setne %cl
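casueword(9) is a compare-and-swap on a word in user space; since SMP is now
unconditional, so is the lock prefix. Ignoring the fault recovery and the SMAP
stac/clac bracketing, the primitive behaves roughly like this sketch:

	int
	casueword32_sketch(volatile uint32_t *uaddr, uint32_t old,
	    uint32_t *oldp, uint32_t new)
	{
		uint32_t seen = old;

		/* lock cmpxchgl: replace *uaddr with new iff it equals old. */
		atomic_fcmpset_32(uaddr, &seen, new);
		*oldp = seen;			/* value actually observed */
		return (seen == old ? 0 : 1);	/* 1 == comparison failed */
	}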
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index eefddad2f142..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
/*
* AMD64 Trap and System call handling
*/
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/stack.h>
#include <machine/trap.h>
#include <machine/tss.h>
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
printf("\n\nFatal trap %d: %s while in %s mode\n", type,
type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
TRAPF_USERMODE(frame) ? "user" : "kernel");
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case a pcpu access traps. */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s%s%s, %s\n",
@@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame)
frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
frame->tf_fs, frame->tf_gs,
rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case a pcpu access traps. */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
panic("double fault");
}
diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP
deleted file mode 100644
index 0dbddbe5b341..000000000000
--- a/sys/amd64/conf/MINIMALUP
+++ /dev/null
@@ -1,4 +0,0 @@
-include MINIMAL
-ident MINIMALUP
-nooptions SMP
-nooptions NUMA
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 1bbb302259d6..5a9c3162e14c 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -150,8 +150,6 @@
(((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#ifdef SMP
#define SC_TABLESIZE 1024 /* Must be power of 2. */
-#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 08e96027a5ed..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -202,9 +202,14 @@
#define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E)
#define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E)
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
#define LMSPML4I (PML4PML4I + 1)
#define LMEPML4I (KASANPML4I - 1)
+#define LMSPML5I (DMPML5I + NDMPML5E)
+#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */
/*
* XXX doesn't really belong here I guess...
@@ -552,6 +557,7 @@ pmap_pml5e_index(vm_offset_t va)
struct kva_layout_s {
vm_offset_t kva_min;
+ vm_offset_t kva_max;
vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */
vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */
vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */
@@ -559,6 +565,12 @@ struct kva_layout_s {
vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */
vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */
vm_offset_t rec_pt;
+ vm_offset_t kasan_shadow_low; /* KASAN_MIN_ADDRESS */
+ vm_offset_t kasan_shadow_high; /* KASAN_MAX_ADDRESS */
+ vm_offset_t kmsan_shadow_low; /* KMSAN_SHAD_MIN_ADDRESS */
+ vm_offset_t kmsan_shadow_high; /* KMSAN_SHAD_MAX_ADDRESS */
+ vm_offset_t kmsan_origin_low; /* KMSAN_ORIG_MIN_ADDRESS */
+ vm_offset_t kmsan_origin_high; /* KMSAN_ORIG_MAX_ADDRESS */
};
extern struct kva_layout_s kva_layout;
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 26eb227211da..bff92570ff82 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -13,8 +13,6 @@
#ifdef _KERNEL
-#ifdef SMP
-
#ifndef LOCORE
#include <x86/x86_smp.h>
@@ -39,7 +37,6 @@ void invlop_handler(void);
int start_all_aps(void);
#endif /* !LOCORE */
-#endif /* SMP */
#endif /* _KERNEL */
#endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index a9c73b75213b..0b3daed4f69e 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -649,6 +649,8 @@ struct vm_inout_str {
int addrsize;
enum vm_reg_name seg_name;
struct seg_desc seg_desc;
+ int cs_d;
+ uint64_t cs_base;
};
enum task_switch_reason {
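The new cs_d field records the code segment's default-operand-size bit and
cs_base its base address at the time of the exit; the instruction emulator
needs both to size operands and form linear addresses when emulating INS/OUTS
from 16- or 32-bit guest code. The svm_get_cs_info() helper added to svm.c at
the end of this diff populates them.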
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 1f86538ce5f3..441330fd57b8 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -29,6 +29,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include <sys/domainset.h>
+
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
@@ -52,7 +54,10 @@ struct vm_munmap {
struct vm_memseg {
int segid;
size_t len;
- char name[VM_MAX_SUFFIXLEN + 1];
+ char name[VM_MAX_SUFFIXLEN + 1];
+ domainset_t *ds_mask;
+ size_t ds_mask_size;
+ int ds_policy;
};
struct vm_register {
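The added ds_policy/ds_mask fields let userspace request NUMA placement for a
memory segment's backing pages. A hypothetical caller-side sketch (the policy
value and the ioctl invocation are illustrative assumptions, not taken from
this diff):

	domainset_t mask;
	struct vm_memseg memseg;

	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);			/* domain 0 */

	memset(&memseg, 0, sizeof(memseg));
	memseg.segid = 0;
	memseg.len = 512UL * 1024 * 1024;
	memseg.ds_policy = DOMAINSET_POLICY_PREFER;	/* assumed policy */
	memseg.ds_mask = &mask;
	memseg.ds_mask_size = sizeof(mask);
	/* followed by ioctl(vmfd, VM_ALLOC_MEMSEG, &memseg) */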
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index d5f0363cfb41..1fb0f97682a7 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -31,6 +31,31 @@
#include <sys/mman.h>
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_MOVSX,
+ VIE_OP_TYPE_MOVZX,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_OR,
+ VIE_OP_TYPE_SUB,
+ VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
+ VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
+ VIE_OP_TYPE_GROUP1,
+ VIE_OP_TYPE_STOS,
+ VIE_OP_TYPE_BITTEST,
+ VIE_OP_TYPE_TWOB_GRP15,
+ VIE_OP_TYPE_ADD,
+ VIE_OP_TYPE_TEST,
+ VIE_OP_TYPE_BEXTR,
+ VIE_OP_TYPE_OUTS,
+ VIE_OP_TYPE_LAST
+};
+
/*
* Callback functions to read and write memory regions.
*/
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 59053665dc40..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -181,9 +181,9 @@
* 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole)
* 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot)
* 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots)
- * 0xff21000000000000 - 0xffff807fffffffff unused
- * 0xffff808000000000 - 0xffff847fffffffff large map (can be tuned up)
- * 0xffff848000000000 - 0xfffff77fffffffff unused (large map extends there)
+ * 0xff21000000000000 - 0xff40ffffffffffff large map
+ * 0xff41000000000000 - 0xffff7fffffffffff unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry)
* 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional
* 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional
* 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused
@@ -200,16 +200,14 @@
#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low
#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high
-#define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0)
-#define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define KASAN_MIN_ADDRESS (kva_layout.kasan_shadow_low)
+#define KASAN_MAX_ADDRESS (kva_layout.kasan_shadow_high)
-#define KMSAN_SHAD_MIN_ADDRESS KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define KMSAN_SHAD_MAX_ADDRESS KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
- 0, 0, 0)
+#define KMSAN_SHAD_MIN_ADDRESS (kva_layout.kmsan_shadow_low)
+#define KMSAN_SHAD_MAX_ADDRESS (kva_layout.kmsan_shadow_high)
-#define KMSAN_ORIG_MIN_ADDRESS KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
- 0, 0, 0)
+#define KMSAN_ORIG_MIN_ADDRESS (kva_layout.kmsan_origin_low)
+#define KMSAN_ORIG_MAX_ADDRESS (kva_layout.kmsan_origin_high)
/*
* Formally kernel mapping starts at KERNBASE, but kernel linker
@@ -249,7 +247,7 @@
*/
#define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit)
#define VIRT_IN_DMAP(va) \
- ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high)
+ ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
#define PMAP_HAS_DMAP 1
#define PHYS_TO_DMAP(x) __extension__ ({ \
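VIRT_IN_DMAP is tightened to dmaplimit because only the first dmaplimit bytes
of the direct-map window are actually backed by page tables. Since the direct
map is linear, the translations reduce to offset arithmetic (sketch):

	/* PHYS_TO_DMAP / DMAP_TO_PHYS, schematically: */
	va = kva_layout.dmap_low + pa;		/* valid iff pa < dmaplimit */
	pa = va - kva_layout.dmap_low;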
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ * 'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF 0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+ uint64_t rtit_ctl;
+ uint64_t rtit_output_base;
+ uint64_t rtit_output_mask_ptrs;
+ uint64_t rtit_status;
+ uint64_t rtit_cr3_match;
+ uint64_t rtit_addr0_a;
+ uint64_t rtit_addr0_b;
+ uint64_t rtit_addr1_a;
+ uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ int id;
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+ size_t xsave_area_size;
+ size_t xstate_hdr_offset;
+ size_t pt_xsave_offset;
+} pt_info __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+ return ((struct xstate_hdr *)(ctx->save_area +
+ pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+ return ((struct pt_ext_area *)(ctx->save_area +
+ pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves(save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors(save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu;
+
+ cpu = &pt_pcpu[curcpu];
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ int nranges_supp, n, error = 0;
+
+ pt_ext = pt_ctx_get_ext_area(ctx);
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+ switch (n) {
+ case 2:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+ pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+ pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+ case 1:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+ pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+ pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+ break;
+ default:
+ error = (EINVAL);
+ break;
+ };
+ } else
+ error = (ENXIO);
+
+ return (error);
+}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+ M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx->save_area == NULL)
+ return (ENOMEM);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ free(pt_ctx->save_area, M_PT);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ if (pt_ctx->save_area != NULL)
+ free(pt_ctx->save_area, M_PT);
+ memset(pt_ctx, 0, sizeof(*pt_ctx));
+ pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = pt_ctx_get_ext_area(pt_ctx);
+ hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ XSTATE_XCOMP_BV_COMPACT;
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+ return (-1);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+ return (-1);
+
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return (-1);
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
+ * and releases all previously allocated ToPA metadata.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+ /* Omit M_WAITOK since this might get invoked in a non-sleepable context. */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+ reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+ return (0);
+ }
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+ pt_cpu_toggle_local(ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ printf("pt: failed to setup interrupt line\n");
+ return (ENXIO);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU supports Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+ if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+ printf("pt: CPU does not support managing PT state using XSAVE\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+ printf("pt: CPU does not support XSAVES/XRSTORS\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+ pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+ pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+ XFEATURE_ENABLED_PT, true, true);
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+	pt_pcpu = NULL;
+	pt_pcpu_ctx = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+	case MOD_LOAD:
+		if (!pt_supported() || pt_init() != 0)
+			return (ENXIO);
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
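+/*
+ * Per-CPU tracing configuration handed down from the HWT framework:
+ * the RTIT_CTL value, an optional CR3 filter, up to two IP filter
+ * ranges, and the MTC, cycle threshold, and PSB frequency encodings.
+ */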
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 6c16daaa47c2..2fe6a5bc3584 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -317,6 +317,33 @@ svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset)
#define MSR_AMD7TH_START 0xC0010000UL
#define MSR_AMD7TH_END 0xC0011FFFUL
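+
+/*
+ * Fetch the guest's CS segment from the VMCB and derive the code
+ * segment base and default operand size (the CS.D bit, APMv2 section
+ * 4.8.1) for the current guest CPU mode.
+ */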
+static void
+svm_get_cs_info(struct vmcb *vmcb, struct vm_guest_paging *paging, int *cs_d,
+ uint64_t *base)
+{
+ struct vmcb_segment seg;
+ int error __diagused;
+
+ error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+ KASSERT(error == 0, ("%s: vmcb_seg error %d", __func__, error));
+
+ switch (paging->cpu_mode) {
+ case CPU_MODE_REAL:
+ *base = seg.base;
+ *cs_d = 0;
+ break;
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ *cs_d = !!(seg.attrib & VMCB_CS_ATTRIB_D);
+ *base = seg.base;
+ break;
+ default:
+ *base = 0;
+ *cs_d = 0;
+ break;
+ }
+}
+
/*
* Get the index and bit position for a MSR in permission bitmap.
* Two bits are used for each MSR: lower bit for read and higher bit for write.
@@ -735,10 +762,29 @@ svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in,
if (in) {
vis->seg_name = VM_REG_GUEST_ES;
- } else {
- /* The segment field has standard encoding */
+ } else if (decode_assist()) {
+ /*
+ * The effective segment number in EXITINFO1[12:10] is populated
+ * only if the processor has the DecodeAssist capability.
+ *
+ * XXX this is not specified explicitly in APMv2 but can be
+ * verified empirically.
+ */
s = (info1 >> 10) & 0x7;
+
+ /* The segment field has standard encoding */
vis->seg_name = vm_segment_name(s);
+ } else {
+ /*
+	 * The segment register needs to be decoded manually by fetching
+	 * the instruction at the guest %rip, which cannot be done here
+	 * while interrupts are disabled. Leave the value unset until
+	 * the generic ins/outs handler runs.
+ */
+ vis->seg_name = VM_REG_LAST;
+ svm_get_cs_info(vcpu->vmcb, &vis->paging, &vis->cs_d,
+ &vis->cs_base);
+ return;
}
error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
@@ -798,16 +844,6 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
info1 = ctrl->exitinfo1;
inout_string = info1 & BIT(2) ? 1 : 0;
- /*
- * The effective segment number in EXITINFO1[12:10] is populated
- * only if the processor has the DecodeAssist capability.
- *
- * XXX this is not specified explicitly in APMv2 but can be verified
- * empirically.
- */
- if (inout_string && !decode_assist())
- return (UNHANDLED);
-
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
vmexit->u.inout.string = inout_string;
@@ -825,6 +861,8 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
vis->addrsize = svm_inout_str_addrsize(info1);
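+		/*
+		 * Defaults; svm_inout_str_seginfo() overrides these when
+		 * the segment has to be decoded by the generic ins/outs
+		 * handler.
+		 */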
+ vis->cs_d = 0;
+ vis->cs_base = 0;
svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis);
}
@@ -866,10 +904,9 @@ static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
struct vm_guest_paging *paging;
- struct vmcb_segment seg;
struct vmcb_ctrl *ctrl;
char *inst_bytes;
- int error __diagused, inst_len;
+ int inst_len;
ctrl = &vmcb->ctrl;
paging = &vmexit->u.inst_emul.paging;
@@ -879,29 +916,8 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
svm_paging_info(vmcb, paging);
- error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
- KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
-
- switch(paging->cpu_mode) {
- case CPU_MODE_REAL:
- vmexit->u.inst_emul.cs_base = seg.base;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- case CPU_MODE_PROTECTED:
- case CPU_MODE_COMPATIBILITY:
- vmexit->u.inst_emul.cs_base = seg.base;
-
- /*
- * Section 4.8.1 of APM2, Default Operand Size or D bit.
- */
- vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
- 1 : 0;
- break;
- default:
- vmexit->u.inst_emul.cs_base = 0;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- }
+ svm_get_cs_info(vmcb, paging, &vmexit->u.inst_emul.cs_d,
+ &vmexit->u.inst_emul.cs_base);
/*
* Copy the instruction bytes into 'vie' if available.
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 957217ab2258..842281ab862e 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -2659,6 +2659,8 @@ vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = inout_str_index(vcpu, in);
vis->count = inout_str_count(vcpu, vis->inout.rep);
vis->addrsize = inout_str_addrsize(inst_info);
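+			/*
+			 * VT-x always reports the segment in the VM-exit
+			 * instruction information, so these fields stay
+			 * unused; zero them for consistency with the SVM
+			 * path.
+			 */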
+ vis->cs_d = 0;
+ vis->cs_base = 0;
inout_str_seginfo(vcpu, inst_info, in, vis);
}
SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit);
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
#include "vmx_assym.h"
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index c53e32889000..c54b6e6d0074 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -65,30 +65,6 @@
#include <x86/psl.h>
#include <x86/specialreg.h>
-/* struct vie_op.op_type */
-enum {
- VIE_OP_TYPE_NONE = 0,
- VIE_OP_TYPE_MOV,
- VIE_OP_TYPE_MOVSX,
- VIE_OP_TYPE_MOVZX,
- VIE_OP_TYPE_AND,
- VIE_OP_TYPE_OR,
- VIE_OP_TYPE_SUB,
- VIE_OP_TYPE_TWO_BYTE,
- VIE_OP_TYPE_PUSH,
- VIE_OP_TYPE_CMP,
- VIE_OP_TYPE_POP,
- VIE_OP_TYPE_MOVS,
- VIE_OP_TYPE_GROUP1,
- VIE_OP_TYPE_STOS,
- VIE_OP_TYPE_BITTEST,
- VIE_OP_TYPE_TWOB_GRP15,
- VIE_OP_TYPE_ADD,
- VIE_OP_TYPE_TEST,
- VIE_OP_TYPE_BEXTR,
- VIE_OP_TYPE_LAST
-};
-
/* struct vie_op.op_flags */
#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
@@ -152,6 +128,16 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x3B,
.op_type = VIE_OP_TYPE_CMP,
},
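+	/* OUTS: 0x6E is OUTSB, 0x6F is OUTSW/OUTSD. */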
+ [0x6E] = {
+ .op_byte = 0x6E,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
+ [0x6F] = {
+ .op_byte = 0x6F,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc1ecab9f209..8aab28f5e68e 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -145,9 +145,49 @@ emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
}
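+/*
+ * Recover the segment register used by an OUTS instruction when the
+ * hardware could not report it (seg_name was left as the VM_REG_LAST
+ * sentinel).  The instruction at the guest %rip is fetched and decoded
+ * here, where doing so is safe.  OUTS defaults to %ds unless a segment
+ * override prefix is present.
+ */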
static int
+decode_segment(struct vcpu *vcpu, enum vm_reg_name *segment)
+{
+ struct vm_guest_paging *paging;
+ struct vie vie;
+ struct vm_exit *vme;
+ int err;
+ int fault;
+
+ vme = vm_exitinfo(vcpu);
+ paging = &vme->u.inout_str.paging;
+
+ vie_init(&vie, NULL, 0);
+ err = vmm_fetch_instruction(vcpu, paging,
+ vme->rip + vme->u.inout_str.cs_base, VIE_INST_SIZE, &vie, &fault);
+ if (err || fault)
+ return (err);
+
+ err = vmm_decode_instruction(vcpu, VIE_INVALID_GLA, paging->cpu_mode,
+ vme->u.inout_str.cs_d, &vie);
+
+	if (err)
+		return (err);
+	if (vie.op.op_type != VIE_OP_TYPE_OUTS)
+		return (EINVAL);
+ if (vie.segment_override)
+ *segment = vie.segment_register;
+ else
+ *segment = VM_REG_GUEST_DS;
+
+ return (0);
+}
+
+static int
emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
{
+ int err;
+
*retu = true;
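+	/*
+	 * seg_name == VM_REG_LAST means the CPU did not report the
+	 * segment at exit time; recover it by decoding the instruction
+	 * before returning to userspace.
+	 */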
+ if (vmexit->u.inout_str.seg_name == VM_REG_LAST) {
+ err = decode_segment(vcpu, &vmexit->u.inout_str.seg_name);
+ if (err)
+ return (err);
+ return (vm_get_seg_desc(vcpu, vmexit->u.inout_str.seg_name,
+ &vmexit->u.inout_str.seg_desc));
+ }
return (0); /* Return to userspace to finish emulation */
}