Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/amd64/apic_vector.S    |  11
-rw-r--r--  sys/amd64/amd64/genassym.c       |  12
-rw-r--r--  sys/amd64/amd64/kexec_support.c  | 300
-rw-r--r--  sys/amd64/amd64/kexec_tramp.S    |  91
-rw-r--r--  sys/amd64/amd64/mp_machdep.c     |  13
-rw-r--r--  sys/amd64/amd64/trap.c           |   4
-rw-r--r--  sys/amd64/conf/GENERIC           |   1
-rw-r--r--  sys/amd64/conf/MINIMAL           |   1
-rw-r--r--  sys/amd64/include/kexec.h        |  41
-rw-r--r--  sys/amd64/include/smp.h          |   1
-rw-r--r--  sys/amd64/include/vmm.h          |  31
-rw-r--r--  sys/amd64/include/vmm_dev.h      |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx.c        |   2
-rw-r--r--  sys/amd64/vmm/io/ppt.c           |   7
-rw-r--r--  sys/amd64/vmm/io/ppt.h           |   6
-rw-r--r--  sys/amd64/vmm/io/vlapic.c        |   2
-rw-r--r--  sys/amd64/vmm/vmm.c              | 168
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c        |   2
-rw-r--r--  sys/amd64/vmm/vmm_mem.h          |   5
-rw-r--r--  sys/amd64/vmm/vmm_mem_machdep.c  |  61
20 files changed, 552 insertions(+), 209 deletions(-)
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index e98bae9eb6c5..8691387a5a8e 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -204,6 +204,17 @@ IDTVEC(spuriousint)
jmp doreti
/*
+ * Executed by a CPU when it receives an IPI_OFF from another CPU.
+ * This handler should never return.
+ */
+ INTR_HANDLER cpuoff
+ KMSAN_ENTER
+ call cpuoff_handler
+ call as_lapic_eoi
+ KMSAN_LEAVE
+ jmp doreti
+
+/*
* Executed by a CPU when it receives an IPI_SWI.
*/
INTR_HANDLER ipi_swi
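
Note: cpuoff_handler is supplied by MI/x86 code outside this sys/amd64 diff. As a rough sketch only (the mask name and parking protocol below are assumptions, not the actual implementation), a handler of this shape would record the CPU as offline and halt it:

	#include <sys/param.h>
	#include <sys/cpuset.h>
	#include <sys/pcpu.h>
	#include <machine/cpufunc.h>

	static cpuset_t offlined_cpus;		/* hypothetical bookkeeping mask */

	void
	cpuoff_handler(void)
	{
		/* Mark this CPU offline, then park it with interrupts off. */
		CPU_SET_ATOMIC(PCPU_GET(cpuid), &offlined_cpus);
		disable_intr();
		for (;;)
			halt();		/* "should never return", as above */
	}
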
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index eb1b746f5893..2716784ee871 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -57,6 +57,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <sys/kexec.h>
#include <sys/proc.h>
#include <x86/apicreg.h>
#include <machine/cpu.h>
@@ -65,6 +66,7 @@
#include <machine/proc.h>
#include <machine/segments.h>
#include <machine/efi.h>
+#include <machine/kexec.h>
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13));
ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14));
ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15));
ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags));
+
+/* Kexec */
+ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry));
+ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments));
+ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX);
+ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image));
+ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage));
+ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size));
+ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf));
+ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target));
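
At build time genassym.c is compiled and post-processed into assym.inc, so each ASSYM() above becomes a plain #define that kexec_tramp.S can use from assembly. Illustrative only; the numeric values here are placeholders and the real ones depend on struct kexec_image's layout:

	/* assym.inc (generated; values are placeholders) */
	#define	KEXEC_ENTRY		0x0	/* offsetof(struct kexec_image, entry) */
	#define	KEXEC_SEGMENTS		0x8	/* offsetof(struct kexec_image, segments) */
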
diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c
new file mode 100644
index 000000000000..8189a48e9ae9
--- /dev/null
+++ b/sys/amd64/amd64/kexec_support.c
@@ -0,0 +1,300 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/kexec.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <x86/apicvar.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed
+ *   before disabling the MMU.
+ *
+ * kexec_reboot_md():
+ * - Disable interrupts and the local APIC
+ * - Jump to the trampoline, which copies each segment into its target
+ * - Does not return
+ */
+
+/*
+ * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages.
+ * identity: This is for an identity map, treat `start` as a physical address.
+ * Only valid here if do_pte is false.
+ */
+static void
+kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
+ vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
+{
+ vm_paddr_t mpa;
+ vm_offset_t pg;
+ vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
+ vm_page_t m;
+ vm_pindex_t i, j, k, l;
+
+ pg = start & ~(stride - 1);
+ i = pmap_pml4e_index(pg);
+ j = pmap_pdpe_index(pg);
+ k = pmap_pde_index(pg);
+ l = pmap_pte_index(pg);
+ for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
+ /*
+ * Walk the page-table hierarchy linearly, allocating intermediate
+ * pages from the iterator as needed, one page at a time.
+ */
+ if (root[i] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ root[i] = mpa | PG_RW | PG_V;
+ }
+ pdp_entry_t *pdp =
+ (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
+ for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
+ if (pdp[j] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pdp[j] = mpa | PG_RW | PG_V;
+ }
+ pd_entry_t *pde =
+ (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
+ for (; k < NPDEPG && pg < start + size; k++, l = 0) {
+ if (pde[k] == 0) {
+ if (!do_pte) {
+ pde[k] =
+ (identity ? pg : pmap_kextract(pg)) |
+ PG_RW | PG_PS | PG_V;
+ pg += NBPDR;
+ continue;
+ }
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pde[k] = mpa | PG_V | PG_RW;
+ } else if ((pde[k] & PG_PS) != 0) {
+ pg += NBPDR;
+ continue;
+ }
+ /* Populate the PTEs. */
+ for (; l < NPTEPG && pg < start + size;
+ l++, pg += PAGE_SIZE) {
+ pt_entry_t *pte =
+ (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
+ pte[pmap_pte_index(pg)] =
+ pmap_kextract(pg) | PG_RW | PG_V;
+ }
+ }
+ }
+ }
+}
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+ void (*kexec_do_tramp)(void) = image->md_image;
+
+ intr_disable_all();
+ lapic_disable();
+ kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
+ kexec_do_tramp);
+
+ for (;;)
+ ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ struct pctrie_iter pct_iter;
+ pml4_entry_t *PT4;
+ pdp_entry_t *PDP_l;
+ pd_entry_t *PD_l0;
+ vm_offset_t va;
+ int i;
+
+ /*
+ * Start building the page table.
+ * First part of the page table is standard for all.
+ */
+ vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
+ vm_page_t m;
+
+ if (la57)
+ return (EINVAL);
+
+ vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
+ /* Walk the mapped control pages linearly; `va` tracks our position. */
+ m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
+ va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
+ /* We'll find a place for these later */
+ PT4 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pdp_l = VM_PAGE_TO_PHYS(m);
+ PDP_l = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l0 = VM_PAGE_TO_PHYS(m);
+ PD_l0 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l1 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l2 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l3 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
+ PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
+ PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
+ PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
+ PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */
+ PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
+
+ /* Map the target(s) in 2MB chunks. */
+ for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ struct kexec_segment_stage *s = &image->segments[i];
+
+ if (s->size == 0)
+ break;
+ kexec_generate_page_tables(PT4, s->target, s->size, false,
+ true, &pct_iter);
+ }
+ /* Now create the source page tables */
+ kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
+ false, &pct_iter);
+ kexec_generate_page_tables(PT4,
+ trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
+ PAGE_SIZE, true, false, &pct_iter);
+ KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n"));
+
+ /* MD control pages start at this next page. */
+ image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
+ bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);
+
+ /* Save the image into the MD page(s) right after the trampoline */
+ bcopy(image, (void *)((vm_offset_t)image->md_image +
+ (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
+ sizeof(*image));
+
+ return (0);
+}
+
+/*
+ * Required pages:
+ * - L4 (1) (root)
+ * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map)
+ * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1)
+ * - L1 (PDR) - 1 (kexec trampoline page, first MD page)
+ * - kexec_do_reboot trampoline - 1
+ * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
+ *
+ * Minimum 9 page-table pages for the direct map (13 pages total with the
+ * trampoline and slop pages).
+ */
+int
+kexec_md_pages(struct kexec_segment *seg_in)
+{
+ struct kexec_segment *segs = seg_in;
+ vm_size_t pages = 13; /* Minimum number of starting pages */
+ vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */
+ vm_size_t source_total = 0;
+
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ vm_offset_t start, end;
+ if (segs[i].memsz == 0)
+ break;
+
+ end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
+ start = trunc_2mpage((vm_offset_t)segs[i].mem);
+ start = max(start, cur_addr + 1);
+ /*
+ * Round to cover the full range of page table pages for each
+ * segment.
+ */
+ source_total += round_2mpage(end - start);
+
+ /*
+ * Bottom 4GB are identity mapped already in the count, so skip
+ * any segments that end up there, this will short-circuit that.
+ */
+ if (end <= cur_addr + 1)
+ continue;
+
+ if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
+ /* Need a new 512GB mapping page */
+ pages++;
+ pages += howmany(end - (start & ~PML4MASK), NBPML4);
+ pages += howmany(end - (start & ~PDPMASK), NBPDP);
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+
+ } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
+ pages++;
+ pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+ }
+
+ }
+ /*
+ * Be pessimistic when totaling up source pages.  We likely
+ * can't use superpages, so need to map each page individually.
+ */
+ pages += howmany(source_total, NBPDR);
+ pages += howmany(source_total, NBPDP);
+ pages += howmany(source_total, NBPML4);
+
+ /*
+ * Be intentionally sloppy adding in the extra page table pages. It's
+ * better to go over than under.
+ */
+ pages += howmany(pages * PAGE_SIZE, NBPDR);
+ pages += howmany(pages * PAGE_SIZE, NBPDP);
+ pages += howmany(pages * PAGE_SIZE, NBPML4);
+
+ /* Add in the trampoline pages */
+ pages += howmany(kexec_do_reboot_size, PAGE_SIZE);
+
+ return (pages);
+}
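
To make the pessimistic arithmetic in kexec_md_pages() concrete, here is a self-contained userland rendering (illustrative only) of the per-level counts that the source_total loop adds for a single 100 MB region:

	#include <stdio.h>

	#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))
	#define	NBPDR	(1UL << 21)	/* bytes mapped by one 2 MB PDE */
	#define	NBPDP	(1UL << 30)	/* bytes mapped by one 1 GB PDPE */
	#define	NBPML4	(1UL << 39)	/* bytes mapped by one 512 GB PML4E */

	int
	main(void)
	{
		unsigned long sz = 100UL << 20;	/* 100 MB of source pages */

		/* One PT page per 2 MB, one PD per 1 GB, one PDP per 512 GB. */
		printf("PT %lu PD %lu PDP %lu\n", howmany(sz, NBPDR),
		    howmany(sz, NBPDP), howmany(sz, NBPML4));
		return (0);
	}

This prints "PT 50 PD 1 PDP 1", i.e. 52 page-table pages for that segment, before the deliberately sloppy extra pages are added on top.
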
diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S
new file mode 100644
index 000000000000..6a2de676bc35
--- /dev/null
+++ b/sys/amd64/amd64/kexec_tramp.S
@@ -0,0 +1,91 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+#include "assym.inc"
+
+/*
+ * Take a pointer to the image, copy each segment, and jump to the trampoline.
+ *
+ * Assumptions:
+ * - image is in safe memory
+ * - We're already running out of the new "identity" map.
+ * - All registers are free game, so go nuts
+ * - Interrupts are disabled
+ * - All APs are disabled
+ */
+ENTRY(kexec_do_reboot)
+ /*
+ * %r9:  image pointer
+ * %r10: segment pointer
+ * %r11: segment counter
+ */
+ leaq kexec_stack(%rip), %rsp
+ /* Get the saved kexec_image. */
+ leaq kexec_saved_image(%rip), %r9
+ leaq KEXEC_SEGMENTS(%r9), %r10
+ movq $KEXEC_SEGMENT_MAX, %r11
+copy_segment:
+ movq KEXEC_SEGMENT_SIZE(%r10), %rcx
+ cmpq $0, %rcx
+ je done
+ shrq $3, %rcx
+ movq KEXEC_SEGMENT_TARGET(%r10), %rdi
+ movq KEXEC_SEGMENT_MAP(%r10), %rsi
+ rep
+ movsq
+ addq $KEXEC_STAGED_SEGMENT_SIZE, %r10
+ decq %r11
+ jg copy_segment
+
+done:
+ pushq KEXEC_ENTRY(%r9)
+ ret
+fail:
+ jmp fail
+END(kexec_do_reboot)
+ENTRY(kexec_do_reboot_trampoline)
+ /* Set new page table, clears most of TLB. */
+ movq %rdi, %cr3
+
+ /* Now flush the rest of the TLB, including global pages. */
+ movq %cr4, %rax
+ andq $~CR4_PGE, %rax
+ movq %rax, %cr4
+ jmp *%rsi
+END(kexec_do_reboot_trampoline)
+CNAME(kexec_saved_image):
+ .globl kexec_saved_image
+ .space KEXEC_IMAGE_SIZE
+ .quad 0
+ /* We don't need more than a quad word, so just fill out the page. */
+ .p2align PAGE_SHIFT
+ kexec_stack:
+CNAME(kexec_do_reboot_size):
+ .globl kexec_do_reboot_size
+ .quad . - kexec_do_reboot
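
In C terms the copy loop above does roughly the following; this is a readability sketch only (field names follow the genassym.c entries, types are assumed), since the real code must run from the identity map with no kernel services and therefore uses rep movsq directly:

	/* Sketch of kexec_do_reboot's segment copy, not the code that runs. */
	static void
	kexec_copy_segments(struct kexec_image *image)
	{
		for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
			struct kexec_segment_stage *s = &image->segments[i];

			if (s->size == 0)	/* first empty slot ends the list */
				break;
			memcpy((void *)s->target, s->map_buf, s->size);
		}
		((void (*)(void))image->entry)();	/* the pushq/ret tail-jump */
	}
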
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 00e99f9df192..96ed0a2cc3ba 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -140,6 +140,10 @@ cpu_mp_start(void)
setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
SDT_SYSIGT, SEL_KPL, 0);
+ /* Install an inter-CPU IPI for CPU offline */
+ setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff),
+ SDT_SYSIGT, SEL_KPL, 0);
+
/* Install an inter-CPU IPI for CPU suspend/resume */
setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
SDT_SYSIGT, SEL_KPL, 0);
@@ -176,6 +180,15 @@ cpu_mp_start(void)
#endif
}
+void
+cpu_mp_stop(void)
+{
+ cpuset_t other_cpus = all_cpus;
+
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ offline_cpus(other_cpus);
+}
+
/*
* AP CPU's call this to initialize themselves.
*/
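
cpu_mp_stop() is the amd64 half of taking the APs down before the reboot path runs; the MI caller lives outside this directory. The presumed ordering (an assumption, not shown in this diff) is:

	/* Hypothetical MI-side sequence; only cpu_mp_stop() and
	 * kexec_reboot_md() are defined under sys/amd64. */
	intr_disable();			/* BSP stops taking interrupts */
	cpu_mp_stop();			/* IPI_OFF parks every other CPU */
	kexec_reboot_md(image);		/* copies segments and jumps; no return */
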
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index f3469ed5e2bc..84305ca918df 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -435,9 +435,9 @@ trap(struct trapframe *frame)
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, 0);
printf("EFI RT fault %s\n",
traptype_to_msg(type));
+ trap_diag(frame, 0);
}
frame->tf_rip = (long)curpcb->pcb_onfault;
return;
@@ -870,8 +870,8 @@ after_vmfault:
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, eva);
printf("EFI RT page fault\n");
+ trap_diag(frame, eva);
}
}
frame->tf_rip = (long)curpcb->pcb_onfault;
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC
index 2e41ed26403a..fb8473505128 100644
--- a/sys/amd64/conf/GENERIC
+++ b/sys/amd64/conf/GENERIC
@@ -309,7 +309,6 @@ device wpi # Intel 3945ABG wireless NICs.
device crypto # core crypto support
device aesni # AES-NI OpenCrypto module
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
device vlan # 802.1Q VLAN support
diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL
index 0baf6d6431de..61c713c609a4 100644
--- a/sys/amd64/conf/MINIMAL
+++ b/sys/amd64/conf/MINIMAL
@@ -113,7 +113,6 @@ device uart # Generic UART driver
# Pseudo devices.
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h
new file mode 100644
index 000000000000..70bc2991be3f
--- /dev/null
+++ b/sys/amd64/include/kexec.h
@@ -0,0 +1,41 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_KEXEC_H_
+#define _AMD64_KEXEC_H_
+
+struct kexec_segment;
+struct kexec_image;
+int kexec_md_pages(struct kexec_segment *);
+extern void kexec_do_reboot(void);
+extern long kexec_do_reboot_size;
+extern void *kexec_saved_image;
+extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void));
+#define KEXEC_MD_PAGES(x) kexec_md_pages(x)
+
+#endif /* _AMD64_KEXEC_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index bff92570ff82..28c372a2e556 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -30,6 +30,7 @@ inthand_t
IDTVEC(ipi_intr_bitmap_handler_pti),
IDTVEC(ipi_swi_pti),
IDTVEC(cpustop_pti),
+ IDTVEC(cpuoff_pti),
IDTVEC(cpususpend_pti),
IDTVEC(rendezvous_pti);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index ad67510fecf3..5cf1ae2d769c 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -122,33 +122,7 @@ enum x2apic_state {
#define VM_INTINFO_HWEXCEPTION (3 << 8)
#define VM_INTINFO_SWINTR (4 << 8)
-/*
- * The VM name has to fit into the pathname length constraints of devfs,
- * governed primarily by SPECNAMELEN. The length is the total number of
- * characters in the full path, relative to the mount point and not
- * including any leading '/' characters.
- * A prefix and a suffix are added to the name specified by the user.
- * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters
- * longer for future use.
- * The suffix is a string that identifies a bootrom image or some similar
- * image that is attached to the VM. A separator character gets added to
- * the suffix automatically when generating the full path, so it must be
- * accounted for, reducing the effective length by 1.
- * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37
- * bytes for FreeBSD 12. A minimum length is set for safety and supports
- * a SPECNAMELEN as small as 32 on old systems.
- */
-#define VM_MAX_PREFIXLEN 10
-#define VM_MAX_SUFFIXLEN 15
-#define VM_MIN_NAMELEN 6
-#define VM_MAX_NAMELEN \
- (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1)
-
#ifdef _KERNEL
-#include <sys/kassert.h>
-
-CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN);
-
struct vm;
struct vm_exception;
struct vm_mem;
@@ -232,8 +206,6 @@ struct vmm_ops {
extern const struct vmm_ops vmm_ops_intel;
extern const struct vmm_ops vmm_ops_amd;
-extern u_int vm_maxcpu; /* maximum virtual cpus */
-
int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
@@ -383,7 +355,8 @@ vcpu_should_yield(struct vcpu *vcpu)
#endif
void *vcpu_stats(struct vcpu *vcpu);
-void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr);
+void vcpu_notify_event(struct vcpu *vcpu);
+void vcpu_notify_lapic(struct vcpu *vcpu);
struct vm_mem *vm_mem(struct vm *vm);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 441330fd57b8..f1c07a983a4b 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -34,6 +34,8 @@
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
+#include <dev/vmm/vmm_param.h>
+
struct vm_memmap {
vm_paddr_t gpa;
int segid; /* memory segment */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 842281ab862e..4189c1214b40 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -27,7 +27,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_bhyve_snapshot.h"
#include <sys/param.h>
@@ -58,6 +57,7 @@
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
+#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
index 2cb459fb848f..6feac5dcbbed 100644
--- a/sys/amd64/vmm/io/ppt.c
+++ b/sys/amd64/vmm/io/ppt.c
@@ -336,13 +336,6 @@ ppt_teardown_msix(struct pptdev *ppt)
}
int
-ppt_avail_devices(void)
-{
-
- return (num_pptdevs);
-}
-
-int
ppt_assigned_devices(struct vm *vm)
{
struct pptdev *ppt;
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
index f97c399564d7..9377f34d50e6 100644
--- a/sys/amd64/vmm/io/ppt.h
+++ b/sys/amd64/vmm/io/ppt.h
@@ -43,12 +43,6 @@ int ppt_assigned_devices(struct vm *vm);
bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
- * Returns the number of devices sequestered by the ppt driver for assignment
- * to virtual machines.
- */
-int ppt_avail_devices(void);
-
-/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 9879dfa164a4..afd5045de574 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -456,7 +456,7 @@ vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt)
return (0);
}
if (vlapic_set_intr_ready(vlapic, vec, false))
- vcpu_notify_event(vlapic->vcpu, true);
+ vcpu_notify_lapic(vlapic->vcpu);
break;
case APIC_LVT_DM_NMI:
vm_inject_nmi(vlapic->vcpu);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 473887240b9b..2890e990633d 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -31,7 +31,6 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
-#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
@@ -189,8 +188,6 @@ struct vm {
#define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \
VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4)
-static int vmm_initialized;
-
static void vmmops_panic(void);
static void
@@ -270,11 +267,7 @@ static int trap_wbinvd;
SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0,
"WBINVD triggers a VM-exit");
-u_int vm_maxcpu;
-SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
- &vm_maxcpu, 0, "Maximum number of vCPUs");
-
-static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
+static void vcpu_notify_event_locked(struct vcpu *vcpu);
/* global statistics */
VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus");
@@ -299,14 +292,6 @@ VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
-/*
- * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU
- * counts as well as range of vpid values for VT-x and by the capacity
- * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in
- * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below.
- */
-#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE)
-
#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
@@ -402,22 +387,12 @@ vm_exitinfo_cpuset(struct vcpu *vcpu)
return (&vcpu->exitinfo_cpuset);
}
-static int
-vmm_init(void)
+int
+vmm_modinit(void)
{
if (!vmm_is_hw_supported())
return (ENXIO);
- vm_maxcpu = mp_ncpus;
- TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
-
- if (vm_maxcpu > VM_MAXCPU) {
- printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
- vm_maxcpu = VM_MAXCPU;
- }
- if (vm_maxcpu == 0)
- vm_maxcpu = 1;
-
vmm_host_state_init();
vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
@@ -431,70 +406,17 @@ vmm_init(void)
return (vmmops_modinit(vmm_ipinum));
}
-static int
-vmm_handler(module_t mod, int what, void *arg)
+int
+vmm_modcleanup(void)
{
- int error;
-
- switch (what) {
- case MOD_LOAD:
- if (vmm_is_hw_supported()) {
- error = vmmdev_init();
- if (error != 0)
- break;
- error = vmm_init();
- if (error == 0)
- vmm_initialized = 1;
- else
- (void)vmmdev_cleanup();
- } else {
- error = ENXIO;
- }
- break;
- case MOD_UNLOAD:
- if (vmm_is_hw_supported()) {
- error = vmmdev_cleanup();
- if (error == 0) {
- vmm_suspend_p = NULL;
- vmm_resume_p = NULL;
- iommu_cleanup();
- if (vmm_ipinum != IPI_AST)
- lapic_ipi_free(vmm_ipinum);
- error = vmmops_modcleanup();
- /*
- * Something bad happened - prevent new
- * VMs from being created
- */
- if (error)
- vmm_initialized = 0;
- }
- } else {
- error = 0;
- }
- break;
- default:
- error = 0;
- break;
- }
- return (error);
+ vmm_suspend_p = NULL;
+ vmm_resume_p = NULL;
+ iommu_cleanup();
+ if (vmm_ipinum != IPI_AST)
+ lapic_ipi_free(vmm_ipinum);
+ return (vmmops_modcleanup());
}
-static moduledata_t vmm_kmod = {
- "vmm",
- vmm_handler,
- NULL
-};
-
-/*
- * vmm initialization has the following dependencies:
- *
- * - VT-x initialization requires smp_rendezvous() and therefore must happen
- * after SMP is fully functional (after SI_SUB_SMP).
- * - vmm device initialization requires an initialized devfs.
- */
-DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
-MODULE_VERSION(vmm, 1);
-
static void
vm_init(struct vm *vm, bool create)
{
@@ -573,29 +495,12 @@ vm_unlock_vcpus(struct vm *vm)
sx_unlock(&vm->vcpus_init_lock);
}
-/*
- * The default CPU topology is a single thread per package.
- */
-u_int cores_per_package = 1;
-u_int threads_per_core = 1;
-
int
vm_create(const char *name, struct vm **retvm)
{
struct vm *vm;
int error;
- /*
- * If vmm.ko could not be successfully initialized then don't attempt
- * to create the virtual machine.
- */
- if (!vmm_initialized)
- return (ENXIO);
-
- if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) ==
- VM_MAX_NAMELEN + 1)
- return (EINVAL);
-
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48);
if (error != 0) {
@@ -609,8 +514,8 @@ vm_create(const char *name, struct vm **retvm)
M_ZERO);
vm->sockets = 1;
- vm->cores = cores_per_package; /* XXX backwards compatibility */
- vm->threads = threads_per_core; /* XXX backwards compatibility */
+ vm->cores = 1; /* XXX backwards compatibility */
+ vm->threads = 1; /* XXX backwards compatibility */
vm->maxcpus = vm_maxcpu;
vm_init(vm, true);
@@ -724,12 +629,7 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- vm_object_t obj;
-
- if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL)
- return (ENOMEM);
- else
- return (0);
+ return (vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa));
}
int
@@ -1033,7 +933,7 @@ vcpu_wait_idle(struct vcpu *vcpu)
KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle"));
vcpu->reqidle = 1;
- vcpu_notify_event_locked(vcpu, false);
+ vcpu_notify_event_locked(vcpu);
VMM_CTR1(vcpu, "vcpu state change from %s to "
"idle requested", vcpu_state2str(vcpu->state));
msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
@@ -1514,7 +1414,7 @@ vm_handle_suspend(struct vcpu *vcpu, bool *retu)
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->suspended_cpus)) {
- vcpu_notify_event(vm_vcpu(vm, i), false);
+ vcpu_notify_event(vm_vcpu(vm, i));
}
}
@@ -1588,7 +1488,7 @@ vm_suspend(struct vm *vm, enum vm_suspend_how how)
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
- vcpu_notify_event(vm_vcpu(vm, i), false);
+ vcpu_notify_event(vm_vcpu(vm, i));
}
return (0);
@@ -2068,7 +1968,7 @@ vm_inject_nmi(struct vcpu *vcpu)
{
vcpu->nmi_pending = 1;
- vcpu_notify_event(vcpu, false);
+ vcpu_notify_event(vcpu);
return (0);
}
@@ -2095,7 +1995,7 @@ vm_inject_extint(struct vcpu *vcpu)
{
vcpu->extint_pending = 1;
- vcpu_notify_event(vcpu, false);
+ vcpu_notify_event(vcpu);
return (0);
}
@@ -2266,14 +2166,14 @@ vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
vm->debug_cpus = vm->active_cpus;
for (int i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
- vcpu_notify_event(vm_vcpu(vm, i), false);
+ vcpu_notify_event(vm_vcpu(vm, i));
}
} else {
if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
return (EINVAL);
CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
- vcpu_notify_event(vcpu, false);
+ vcpu_notify_event(vcpu);
}
return (0);
}
@@ -2381,7 +2281,7 @@ vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state)
* to the host_cpu to cause the vcpu to trap into the hypervisor.
*/
static void
-vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
+vcpu_notify_event_locked(struct vcpu *vcpu)
{
int hostcpu;
@@ -2389,12 +2289,7 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
if (vcpu->state == VCPU_RUNNING) {
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
if (hostcpu != curcpu) {
- if (lapic_intr) {
- vlapic_post_intr(vcpu->vlapic, hostcpu,
- vmm_ipinum);
- } else {
- ipi_cpu(hostcpu, vmm_ipinum);
- }
+ ipi_cpu(hostcpu, vmm_ipinum);
} else {
/*
* If the 'vcpu' is running on 'curcpu' then it must
@@ -2412,10 +2307,21 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
}
void
-vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr)
+vcpu_notify_event(struct vcpu *vcpu)
{
vcpu_lock(vcpu);
- vcpu_notify_event_locked(vcpu, lapic_intr);
+ vcpu_notify_event_locked(vcpu);
+ vcpu_unlock(vcpu);
+}
+
+void
+vcpu_notify_lapic(struct vcpu *vcpu)
+{
+ vcpu_lock(vcpu);
+ if (vcpu->state == VCPU_RUNNING && vcpu->hostcpu != curcpu)
+ vlapic_post_intr(vcpu->vlapic, vcpu->hostcpu, vmm_ipinum);
+ else
+ vcpu_notify_event_locked(vcpu);
vcpu_unlock(vcpu);
}
@@ -2477,7 +2383,7 @@ restart:
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &dest))
- vcpu_notify_event(vm_vcpu(vm, i), false);
+ vcpu_notify_event(vm_vcpu(vm, i));
}
return (vm_handle_rendezvous(vcpu));
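
The bool argument is gone from vcpu_notify_event(); callers now choose the function that names their intent, and vcpu_notify_lapic() keeps the posted-interrupt fast path (falling back to a plain IPI when it does not apply). The mechanical translation for callers:

	vcpu_notify_event(vcpu, false);		/* old */
	vcpu_notify_event(vcpu);		/* new */

	vcpu_notify_event(vcpu, true);		/* old */
	vcpu_notify_lapic(vcpu);		/* new */
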
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index 0cae01f172ec..63bdee69bb59 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -61,7 +61,7 @@ lapic_set_intr(struct vcpu *vcpu, int vector, bool level)
vlapic = vm_lapic(vcpu);
if (vlapic_set_intr_ready(vlapic, vector, level))
- vcpu_notify_event(vcpu, true);
+ vcpu_notify_lapic(vcpu);
return (0);
}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
index 41b9bf07c4fc..d905fd37001d 100644
--- a/sys/amd64/vmm/vmm_mem.h
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -30,10 +30,9 @@
#define _VMM_MEM_H_
struct vmspace;
-struct vm_object;
-struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa);
+int vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
+ vm_paddr_t hpa);
void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
diff --git a/sys/amd64/vmm/vmm_mem_machdep.c b/sys/amd64/vmm/vmm_mem_machdep.c
index e96c9e4bdc66..afb3a0274e2a 100644
--- a/sys/amd64/vmm/vmm_mem_machdep.c
+++ b/sys/amd64/vmm/vmm_mem_machdep.c
@@ -36,6 +36,7 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
+#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -45,40 +46,48 @@
#include "vmm_mem.h"
-vm_object_t
+int
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa)
+ vm_paddr_t hpa)
{
- int error;
- vm_object_t obj;
struct sglist *sg;
+ vm_object_t obj;
+ int error;
+
+ if (gpa + len < gpa || hpa + len < hpa || (gpa & PAGE_MASK) != 0 ||
+ (hpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0)
+ return (EINVAL);
sg = sglist_alloc(1, M_WAITOK);
error = sglist_append_phys(sg, hpa, len);
KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
- if (obj != NULL) {
- /*
- * VT-x ignores the MTRR settings when figuring out the
- * memory type for translations obtained through EPT.
- *
- * Therefore we explicitly force the pages provided by
- * this object to be mapped as uncacheable.
- */
- VM_OBJECT_WLOCK(obj);
- error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
- VM_OBJECT_WUNLOCK(obj);
- if (error != KERN_SUCCESS) {
- panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
- error);
- }
- error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
- VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
- if (error != KERN_SUCCESS) {
- vm_object_deallocate(obj);
- obj = NULL;
- }
+ if (obj == NULL)
+ return (ENOMEM);
+
+ /*
+ * VT-x ignores the MTRR settings when figuring out the memory type for
+ * translations obtained through EPT.
+ *
+ * Therefore we explicitly force the pages provided by this object to be
+ * mapped as uncacheable.
+ */
+ VM_OBJECT_WLOCK(obj);
+ error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error != KERN_SUCCESS)
+ panic("vmm_mmio_alloc: vm_object_set_memattr error %d", error);
+
+ vm_map_lock(&vmspace->vm_map);
+ error = vm_map_insert(&vmspace->vm_map, obj, 0, gpa, gpa + len,
+ VM_PROT_RW, VM_PROT_RW, 0);
+ vm_map_unlock(&vmspace->vm_map);
+ if (error != KERN_SUCCESS) {
+ error = vm_mmap_to_errno(error);
+ vm_object_deallocate(obj);
+ } else {
+ error = 0;
}
/*
@@ -94,7 +103,7 @@ vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
*/
sglist_free(sg);
- return (obj);
+ return (error);
}
void
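
The argument checking added to vmm_mmio_alloc() uses the standard unsigned-wraparound idiom. Isolated for clarity (a sketch mirroring the checks above, not an additional function in the patch):

	/* Reject wrapped or unaligned MMIO ranges before allocating anything. */
	static int
	mmio_range_ok(vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
	{
		if (gpa + len < gpa || hpa + len < hpa)	/* unsigned overflow */
			return (0);
		if (((gpa | hpa | len) & PAGE_MASK) != 0)	/* page-aligned? */
			return (0);
		return (1);
	}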