Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/amd64/apic_vector.S    |  11
-rw-r--r--  sys/amd64/amd64/genassym.c       |  12
-rw-r--r--  sys/amd64/amd64/kexec_support.c  | 300
-rw-r--r--  sys/amd64/amd64/kexec_tramp.S    |  91
-rw-r--r--  sys/amd64/amd64/mp_machdep.c     |  13
-rw-r--r--  sys/amd64/amd64/trap.c           |   4
-rw-r--r--  sys/amd64/conf/GENERIC           |   1
-rw-r--r--  sys/amd64/conf/MINIMAL           |   1
-rw-r--r--  sys/amd64/include/kexec.h        |  41
-rw-r--r--  sys/amd64/include/smp.h          |   1
-rw-r--r--  sys/amd64/pt/pt.c                | 221
-rw-r--r--  sys/amd64/sgx/sgx_linux.c        |  11
-rw-r--r--  sys/amd64/vmm/vmm.c              |  10
-rw-r--r--  sys/amd64/vmm/vmm_dev_machdep.c  | 255
-rw-r--r--  sys/amd64/vmm/vmm_mem.h          |   5
-rw-r--r--  sys/amd64/vmm/vmm_mem_machdep.c  |  61
16 files changed, 797 insertions(+), 241 deletions(-)
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index e98bae9eb6c5..8691387a5a8e 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -204,6 +204,17 @@ IDTVEC(spuriousint)
jmp doreti
/*
+ * Executed by a CPU when it receives an IPI_OFF from another CPU.
+ * Should never return.
+ */
+ INTR_HANDLER cpuoff
+ KMSAN_ENTER
+ call cpuoff_handler
+ call as_lapic_eoi
+ KMSAN_LEAVE
+ jmp doreti
+
+/*
* Executed by a CPU when it receives an IPI_SWI.
*/
INTR_HANDLER ipi_swi
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index eb1b746f5893..2716784ee871 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -57,6 +57,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <sys/kexec.h>
#include <sys/proc.h>
#include <x86/apicreg.h>
#include <machine/cpu.h>
@@ -65,6 +66,7 @@
#include <machine/proc.h>
#include <machine/segments.h>
#include <machine/efi.h>
+#include <machine/kexec.h>
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13));
ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14));
ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15));
ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags));
+
+/* Kexec */
+ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry));
+ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments));
+ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX);
+ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image));
+ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage));
+ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size));
+ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf));
+ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target));
diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c
new file mode 100644
index 000000000000..8189a48e9ae9
--- /dev/null
+++ b/sys/amd64/amd64/kexec_support.c
@@ -0,0 +1,300 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/kexec.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <x86/apicvar.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed before
+ * disabling MMU.
+ *
+ * kexec_reboot_md():
+ * - Disable interrupts and the local APIC, then jump to the trampoline,
+ *   which copies each segment's pages into its target
+ * - Does not return
+ */
+
+/*
+ * do_pte: If true, create PTE entries (4k pages); if false, create 2MB
+ *   superpages.
+ * identity: Build an identity map, treating `start` as a physical address.
+ * Only valid here if do_pte is false.
+ */
+static void
+kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
+ vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
+{
+ vm_paddr_t mpa;
+ vm_offset_t pg;
+ vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
+ vm_page_t m;
+ vm_pindex_t i, j, k, l;
+
+ pg = start & ~(stride - 1);
+ i = pmap_pml4e_index(pg);
+ j = pmap_pdpe_index(pg);
+ k = pmap_pde_index(pg);
+ l = pmap_pte_index(pg);
+ for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
+ /*
+ * Walk linearly, as above, but in one fell swoop, one page at
+ * a time.
+ */
+ if (root[i] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ root[i] = mpa | PG_RW | PG_V;
+ }
+ pdp_entry_t *pdp =
+ (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
+ for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
+ if (pdp[j] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pdp[j] = mpa | PG_RW | PG_V;
+ }
+ pd_entry_t *pde =
+ (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
+ for (; k < NPDEPG && pg < start + size; k++, l = 0) {
+ if (pde[k] == 0) {
+ if (!do_pte) {
+ pde[k] =
+ (identity ? pg : pmap_kextract(pg)) |
+ PG_RW | PG_PS | PG_V;
+ pg += NBPDR;
+ continue;
+ }
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pde[k] = mpa | PG_V | PG_RW;
+ } else if ((pde[k] & PG_PS) != 0) {
+ pg += NBPDR;
+ continue;
+ }
+ /* Populate the PTEs. */
+ for (; l < NPTEPG && pg < start + size;
+ l++, pg += PAGE_SIZE) {
+ pt_entry_t *pte =
+ (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
+ pte[pmap_pte_index(pg)] =
+ pmap_kextract(pg) | PG_RW | PG_V;
+ }
+ }
+ }
+ }
+}
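
[Editor's note: the four iterator indices above come straight from the 9-bit-per-level layout of x86-64 virtual addresses, which kexec_generate_page_tables() walks via pmap_pml4e_index() and friends. A minimal standalone sketch of that same decomposition, not part of this patch:

#include <stdint.h>
#include <stdio.h>

/* 4-level paging: 9 index bits per level above a 12-bit page offset. */
#define PT_INDEX(va, shift)	(((uint64_t)(va) >> (shift)) & 0x1ffULL)

int
main(void)
{
	uint64_t va = 0x0000000123456000ULL;	/* arbitrary example VA */

	printf("pml4e=%ju pdpe=%ju pde=%ju pte=%ju\n",
	    (uintmax_t)PT_INDEX(va, 39),	/* pmap_pml4e_index() */
	    (uintmax_t)PT_INDEX(va, 30),	/* pmap_pdpe_index() */
	    (uintmax_t)PT_INDEX(va, 21),	/* pmap_pde_index() */
	    (uintmax_t)PT_INDEX(va, 12));	/* pmap_pte_index() */
	return (0);
}
]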
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+ void (*kexec_do_tramp)(void) = image->md_image;
+
+ intr_disable_all();
+ lapic_disable();
+ kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
+ kexec_do_tramp);
+
+ for (;;)
+ ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ struct pctrie_iter pct_iter;
+ pml4_entry_t *PT4;
+ pdp_entry_t *PDP_l;
+ pd_entry_t *PD_l0;
+ vm_offset_t va;
+ int i;
+
+ /*
+ * Start building the page table.
+ * First part of the page table is standard for all.
+ */
+ vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
+ vm_page_t m;
+
+ if (la57)
+ return (EINVAL);
+
+ vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
+ /* Working in linear space in the mapped space, `va` is our tracker. */
+ m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
+ va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
+ /* We'll find a place for these later */
+ PT4 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pdp_l = VM_PAGE_TO_PHYS(m);
+ PDP_l = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l0 = VM_PAGE_TO_PHYS(m);
+ PD_l0 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l1 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l2 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l3 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
+ PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
+ PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
+ PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
+ PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */
+ PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
+
+ /* Map the target(s) in 2MB chunks. */
+ for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ struct kexec_segment_stage *s = &image->segments[i];
+
+ if (s->size == 0)
+ break;
+ kexec_generate_page_tables(PT4, s->target, s->size, false,
+ true, &pct_iter);
+ }
+ /* Now create the source page tables */
+ kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
+ false, &pct_iter);
+ kexec_generate_page_tables(PT4,
+ trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
+ PAGE_SIZE, true, false, &pct_iter);
+ KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n"));
+
+ /* MD control pages start at this next page. */
+ image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
+ bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);
+
+ /* Save the image into the MD page(s) right after the trampoline */
+ bcopy(image, (void *)((vm_offset_t)image->md_image +
+ (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
+ sizeof(*image));
+
+ return (0);
+}
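
[Editor's note: the 4 * NPDEPG loop above fills four consecutive page-directory pages with 2MB superpage entries, which is exactly what the "1:1 mapping of lower 4G" comment claims. A quick standalone arithmetic check, not part of this patch:

#include <assert.h>
#include <stdint.h>

#define NBPDR	(1ULL << 21)	/* bytes mapped by one 2MB superpage PDE */
#define NPDEPG	512		/* PDEs per page-directory page */

int
main(void)
{
	/* 4 PD pages x 512 entries x 2MB per entry == 4GB. */
	assert(4 * NPDEPG * NBPDR == (1ULL << 32));
	return (0);
}
]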
+
+/*
+ * Required pages:
+ * - L4 (1) (root)
+ * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map)
+ * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1)
+ * - L1 (PDR) - 1 (kexec trampoline page, first MD page)
+ * - kexec_do_reboot trampoline - 1
+ * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
+ *
+ * Minimum 9 pages for the direct map.
+ */
+int
+kexec_md_pages(struct kexec_segment *seg_in)
+{
+ struct kexec_segment *segs = seg_in;
+ vm_size_t pages = 13; /* Minimum number of starting pages */
+ vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */
+ vm_size_t source_total = 0;
+
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ vm_offset_t start, end;
+ if (segs[i].memsz == 0)
+ break;
+
+ end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
+ start = trunc_2mpage((vm_offset_t)segs[i].mem);
+ start = max(start, cur_addr + 1);
+ /*
+ * Round to cover the full range of page table pages for each
+ * segment.
+ */
+ source_total += round_2mpage(end - start);
+
+ /*
+ * Bottom 4GB are identity mapped already in the count, so skip
+ * any segments that end up there, this will short-circuit that.
+ */
+ if (end <= cur_addr + 1)
+ continue;
+
+ if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
+ /* Need a new 512GB mapping page */
+ pages++;
+ pages += howmany(end - (start & ~PML4MASK), NBPML4);
+ pages += howmany(end - (start & ~PDPMASK), NBPDP);
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+
+ } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
+ pages++;
+ pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+ }
+
+ }
+ /*
+  * Be pessimistic when totaling up source pages. We likely
+  * can't use superpages, so need to map each page individually.
+  */
+ pages += howmany(source_total, NBPDR);
+ pages += howmany(source_total, NBPDP);
+ pages += howmany(source_total, NBPML4);
+
+ /*
+ * Be intentionally sloppy adding in the extra page table pages. It's
+ * better to go over than under.
+ */
+ pages += howmany(pages * PAGE_SIZE, NBPDR);
+ pages += howmany(pages * PAGE_SIZE, NBPDP);
+ pages += howmany(pages * PAGE_SIZE, NBPML4);
+
+ /* Add in the trampoline pages */
+ pages += howmany(kexec_do_reboot_size, PAGE_SIZE);
+
+ return (pages);
+}
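
[Editor's note: howmany(x, y) is the round-up division idiom used throughout the estimate above: one page-table page per level per unit of coverage. A standalone sketch of the per-segment count for a hypothetical segment size, not part of this patch:

#include <stdint.h>
#include <stdio.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define NBPDR	(1ULL << 21)	/* bytes mapped per PDE page entry */
#define NBPDP	(1ULL << 30)	/* bytes mapped per PDPE entry */
#define NBPML4	(1ULL << 39)	/* bytes mapped per PML4E entry */

int
main(void)
{
	uint64_t size = 300ULL << 20;	/* hypothetical 300MB segment */
	uint64_t pages;

	/* One page-table page per level that the range touches. */
	pages = howmany(size, NBPDR) + howmany(size, NBPDP) +
	    howmany(size, NBPML4);
	printf("page-table pages needed: %ju\n", (uintmax_t)pages);
	return (0);
}
]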
diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S
new file mode 100644
index 000000000000..6a2de676bc35
--- /dev/null
+++ b/sys/amd64/amd64/kexec_tramp.S
@@ -0,0 +1,91 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+#include "assym.inc"
+
+/*
+ * Take a pointer to the image, copy each segment, and jump to the trampoline.
+ *
+ * Assumptions:
+ * - image is in safe memory
+ * - We're already running out of the new "identity" map.
+ * - All registers are free game, so go nuts
+ * - Interrupts are disabled
+ * - All APs are disabled
+ */
+ENTRY(kexec_do_reboot)
+ /*
+  * r9:  image pointer
+  * r10: segment pointer
+  * r11: segment counter
+ */
+ leaq kexec_stack(%rip), %rsp
+ /* Get the saved kexec_image. */
+ leaq kexec_saved_image(%rip), %r9
+ leaq KEXEC_SEGMENTS(%r9), %r10
+ movq $KEXEC_SEGMENT_MAX, %r11
+copy_segment:
+ movq KEXEC_SEGMENT_SIZE(%r10), %rcx
+ cmpq $0, %rcx
+ je done
+ shrq $3, %rcx
+ movq KEXEC_SEGMENT_TARGET(%r10), %rdi
+ movq KEXEC_SEGMENT_MAP(%r10), %rsi
+ rep
+ movsq
+ addq $KEXEC_STAGED_SEGMENT_SIZE, %r10
+ decq %r11
+ jg copy_segment
+
+done:
+ pushq KEXEC_ENTRY(%r9)
+ ret
+fail:
+ jmp fail
+END(kexec_do_reboot)
+ENTRY(kexec_do_reboot_trampoline)
+ /* Set new page table, clears most of TLB. */
+ movq %rdi, %cr3
+
+ /* Now flush the rest of the TLB, including global pages. */
+ movq %cr4, %rax
+ andq $~CR4_PGE, %rax
+ movq %rax, %cr4
+ jmp *%rsi
+END(kexec_do_reboot_trampoline)
+CNAME(kexec_saved_image):
+ .globl kexec_saved_image
+ .space KEXEC_IMAGE_SIZE
+ .quad 0
+ /* We don't need more than a quad, so just fill out the page. */
+ .p2align PAGE_SHIFT
+ kexec_stack:
+CNAME(kexec_do_reboot_size):
+ .globl kexec_do_reboot_size
+ .quad . - kexec_do_reboot
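
[Editor's note: the copy loop in kexec_do_reboot is a quad-word memcpy per staged segment; shrq $3 converts the byte count to quad words, so sizes are assumed to be multiples of 8. A C rendering of the same loop, with a hypothetical struct mirroring the fields the assembly indexes via the ASSYM constants, not part of this patch:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical mirror of the layout implied by the ASSYM offsets. */
struct kexec_segment_stage {
	void		*map_buf;	/* KEXEC_SEGMENT_MAP */
	uintptr_t	 target;	/* KEXEC_SEGMENT_TARGET */
	size_t		 size;		/* KEXEC_SEGMENT_SIZE */
};

static void
copy_segments(struct kexec_segment_stage *seg, int nsegs)
{
	for (int i = 0; i < nsegs; i++, seg++) {
		if (seg->size == 0)	/* cmpq $0, %rcx / je done */
			break;

		uint64_t *dst = (uint64_t *)seg->target;
		const uint64_t *src = seg->map_buf;

		for (size_t n = seg->size >> 3; n > 0; n--)	/* rep movsq */
			*dst++ = *src++;
	}
}
]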
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 00e99f9df192..96ed0a2cc3ba 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -140,6 +140,10 @@ cpu_mp_start(void)
setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
SDT_SYSIGT, SEL_KPL, 0);
+ /* Install an inter-CPU IPI for CPU offline */
+ setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff),
+ SDT_SYSIGT, SEL_KPL, 0);
+
/* Install an inter-CPU IPI for CPU suspend/resume */
setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
SDT_SYSIGT, SEL_KPL, 0);
@@ -176,6 +180,15 @@ cpu_mp_start(void)
#endif
}
+void
+cpu_mp_stop(void)
+{
+ cpuset_t other_cpus = all_cpus;
+
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ offline_cpus(other_cpus);
+}
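
[Editor's note: cpu_mp_stop() offlines every CPU but the caller: copy the live set, clear the current CPU's bit, IPI the rest. A sketch of the same set manipulation with a plain bitmask standing in for cpuset_t, not part of this patch:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t all_cpus = 0xff;	/* hypothetical 8-CPU system */
	int self = 3;			/* PCPU_GET(cpuid) */
	uint64_t other_cpus = all_cpus & ~(1ULL << self);	/* CPU_CLR */

	printf("IPI_OFF targets: %#jx\n", (uintmax_t)other_cpus);
	return (0);
}
]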
+
/*
* AP CPU's call this to initialize themselves.
*/
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index f3469ed5e2bc..84305ca918df 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -435,9 +435,9 @@ trap(struct trapframe *frame)
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, 0);
printf("EFI RT fault %s\n",
traptype_to_msg(type));
+ trap_diag(frame, 0);
}
frame->tf_rip = (long)curpcb->pcb_onfault;
return;
@@ -870,8 +870,8 @@ after_vmfault:
if ((print_efirt_faults == 1 && cnt == 0) ||
print_efirt_faults == 2) {
- trap_diag(frame, eva);
printf("EFI RT page fault\n");
+ trap_diag(frame, eva);
}
}
frame->tf_rip = (long)curpcb->pcb_onfault;
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC
index 2e41ed26403a..fb8473505128 100644
--- a/sys/amd64/conf/GENERIC
+++ b/sys/amd64/conf/GENERIC
@@ -309,7 +309,6 @@ device wpi # Intel 3945ABG wireless NICs.
device crypto # core crypto support
device aesni # AES-NI OpenCrypto module
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
device vlan # 802.1Q VLAN support
diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL
index 0baf6d6431de..61c713c609a4 100644
--- a/sys/amd64/conf/MINIMAL
+++ b/sys/amd64/conf/MINIMAL
@@ -113,7 +113,6 @@ device uart # Generic UART driver
# Pseudo devices.
device loop # Network loopback
-device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device ether # Ethernet support
diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h
new file mode 100644
index 000000000000..70bc2991be3f
--- /dev/null
+++ b/sys/amd64/include/kexec.h
@@ -0,0 +1,41 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_KEXEC_H_
+#define _AMD64_KEXEC_H_
+
+struct kexec_segment;
+struct kexec_image;
+int kexec_md_pages(struct kexec_segment *);
+extern void kexec_do_reboot(void);
+extern long kexec_do_reboot_size;
+extern void *kexec_saved_image;
+extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void));
+#define KEXEC_MD_PAGES(x) kexec_md_pages(x)
+
+#endif /* _AMD64_KEXEC_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index bff92570ff82..28c372a2e556 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -30,6 +30,7 @@ inthand_t
IDTVEC(ipi_intr_bitmap_handler_pti),
IDTVEC(ipi_swi_pti),
IDTVEC(cpustop_pti),
+ IDTVEC(cpuoff_pti),
IDTVEC(cpususpend_pti),
IDTVEC(rendezvous_pti);
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
index c7b75767680a..6b2296de049c 100644
--- a/sys/amd64/pt/pt.c
+++ b/sys/amd64/pt/pt.c
@@ -42,15 +42,15 @@
*/
#include <sys/systm.h>
+#include <sys/bus.h>
#include <sys/hwt.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
-#include <sys/sdt.h>
#include <sys/smp.h>
-#include <sys/taskqueue.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
@@ -94,12 +94,7 @@
MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
-SDT_PROVIDER_DEFINE(pt);
-SDT_PROBE_DEFINE(pt, , , topa__intr);
-
-TASKQUEUE_FAST_DEFINE_THREAD(pt);
-
-static void pt_send_buffer_record(void *arg, int pending __unused);
+static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);
/*
@@ -122,29 +117,24 @@ struct pt_buffer {
size_t size;
struct mtx lock; /* Lock for fields below. */
vm_offset_t offset;
- uint64_t wrap_count;
- int curpage;
};
struct pt_ctx {
int id;
struct pt_buffer buf; /* ToPA buffer metadata */
- struct task task; /* ToPA buffer notification task */
struct hwt_context *hwt_ctx;
uint8_t *save_area; /* PT XSAVE area */
};
/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;
-enum pt_cpu_state {
- PT_DISABLED = 0,
- PT_STOPPED,
- PT_ACTIVE
-};
+enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };
static struct pt_cpu {
struct pt_ctx *ctx; /* active PT tracing context */
enum pt_cpu_state state; /* used as part of trace stop protocol */
+ void *swi_cookie; /* Software interrupt handler context */
+ int in_pcint_handler;
} *pt_pcpu;
/*
@@ -199,31 +189,28 @@ static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
uint64_t reg;
- int curpage;
+ uint64_t offset;
/* Update buffer offset. */
reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
- curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
- mtx_lock_spin(&buf->lock);
- /* Check if the output wrapped. */
- if (buf->curpage > curpage)
- buf->wrap_count++;
- buf->curpage = curpage;
- buf->offset = reg >> 32;
- mtx_unlock_spin(&buf->lock);
-
- dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
- buf->wrap_count, buf->curpage, buf->offset);
+ offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
+ offset += (reg >> 32);
+
+ atomic_store_rel_64(&buf->offset, offset);
}
static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
struct hwt_record_entry *rec)
{
+ vm_offset_t offset;
+
+ offset = atomic_load_acq_64(&buf->offset);
+
rec->record_type = HWT_RECORD_BUFFER;
rec->buf_id = id;
- rec->curpage = buf->curpage;
- rec->offset = buf->offset + (buf->wrap_count * buf->size);
+ rec->curpage = offset / PAGE_SIZE;
+ rec->offset = offset & PAGE_MASK;
}
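
[Editor's note: the rework above replaces the (curpage, wrap_count, offset) triple with a single linear byte offset: pt_update_buffer() packs page * PAGE_SIZE plus the in-page output offset from the MSR, and pt_fill_buffer_record() unpacks it with a divide and a mask. A standalone round-trip check, not part of this patch:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	uint64_t page = 7, in_page = 123;
	uint64_t offset = page * PAGE_SIZE + in_page;	/* pack */

	assert(offset / PAGE_SIZE == page);		/* rec->curpage */
	assert((offset & PAGE_MASK) == in_page);	/* rec->offset */
	return (0);
}
]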
/*
@@ -273,9 +260,9 @@ pt_cpu_start(void *dummy)
MPASS(cpu->ctx != NULL);
dprintf("%s: curcpu %d\n", __func__, curcpu);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
load_cr4(rcr4() | CR4_XSAVE);
wrmsr(MSR_IA32_RTIT_STATUS, 0);
- pt_cpu_set_state(curcpu, PT_ACTIVE);
pt_cpu_toggle_local(cpu->ctx->save_area, true);
}
@@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy)
struct pt_cpu *cpu;
struct pt_ctx *ctx;
- /* Shutdown may occur before PT gets properly configured. */
- if (pt_cpu_get_state(curcpu) == PT_DISABLED)
- return;
-
cpu = &pt_pcpu[curcpu];
ctx = cpu->ctx;
- MPASS(ctx != NULL);
- dprintf("%s: curcpu %d\n", __func__, curcpu);
- pt_cpu_set_state(curcpu, PT_STOPPED);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ /* Shutdown may occur before PT gets properly configured. */
+ if (ctx == NULL) {
+ dprintf("%s: missing context on cpu %d; bailing\n", __func__,
+ curcpu);
+ return;
+ }
pt_cpu_toggle_local(cpu->ctx->save_area, false);
pt_update_buffer(&ctx->buf);
}
@@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
return (ENOMEM);
dprintf("%s: preparing ToPA buffer\n", __func__);
if (pt_topa_prepare(pt_ctx, vm) != 0) {
- dprintf("%s: failed to prepare ToPA buffer\n", __func__);
free(pt_ctx->save_area, M_PT);
return (ENOMEM);
}
pt_ctx->id = ctx_id;
- TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
return (0);
}
@@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx)
if (pt_ctx->save_area != NULL)
free(pt_ctx->save_area, M_PT);
memset(pt_ctx, 0, sizeof(*pt_ctx));
- pt_ctx->buf.topa_hw = NULL;
}
/*
@@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
XSTATE_XCOMP_BV_COMPACT;
pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
pt_pcpu[cpu_id].ctx = pt_ctx;
- pt_cpu_set_state(cpu_id, PT_STOPPED);
return (0);
}
@@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id)
if (ctx->mode == HWT_MODE_CPU)
return;
-
KASSERT(curcpu == cpu_id,
("%s: attempting to disable PT on another cpu", __func__));
+
+ cpu = &pt_pcpu[cpu_id];
+
+ dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
+ cpu_id);
+ pt_cpu_set_state(cpu_id, PT_INACTIVE);
+ while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
+ ;
+
pt_cpu_stop(NULL);
CPU_CLR(cpu_id, &ctx->cpu_map);
- cpu = &pt_pcpu[cpu_id];
cpu->ctx = NULL;
}
@@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id)
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
-
dprintf("%s\n", __func__);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
if (ctx->mode == HWT_MODE_CPU &&
atomic_swap_32(&cpu_mode_ctr, 1) != 0)
return (-1);
- KASSERT(ctx->mode == HWT_MODE_CPU,
- ("%s: should only be used for CPU mode", __func__));
smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
return (0);
@@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx)
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
+ struct pt_cpu *cpu;
dprintf("%s\n", __func__);
if (ctx->mode == HWT_MODE_CPU &&
@@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx)
dprintf("%s: empty cpu map\n", __func__);
return (-1);
}
+ CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
+ cpu = &pt_pcpu[cpu_id];
+ dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
+ __func__, cpu_id);
+ pt_cpu_set_state(cpu_id, PT_INACTIVE);
+ while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
+ ;
+ }
smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
return (0);
@@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx)
int error;
dprintf("%s\n", __func__);
- if (ctx->mode == HWT_MODE_CPU) {
- TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
- error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
- hwt_cpu->vm, hwt_cpu->cpu_id);
- if (error)
- return (error);
- }
+ if (ctx->mode != HWT_MODE_CPU)
+ return (0);
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
+ hwt_cpu->cpu_id);
+ if (error)
+ return (error);
}
return (0);
@@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx)
pt_deinit_ctx(pt_ctx);
}
} else {
- CPU_FOREACH(cpu_id) {
- if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
+ if (pt_pcpu[cpu_id].ctx == NULL)
continue;
- if (pt_pcpu[cpu_id].ctx != NULL) {
- KASSERT(pt_pcpu[cpu_id].ctx ==
- &pt_pcpu_ctx[cpu_id],
- ("%s: CPU mode tracing with non-cpu mode PT"
- "context active",
- __func__));
- pt_pcpu[cpu_id].ctx = NULL;
- }
- pt_ctx = &pt_pcpu_ctx[cpu_id];
- pt_deinit_ctx(pt_ctx);
- memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
+ pt_pcpu[cpu_id].ctx = NULL;
+ atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
}
}
@@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
uint64_t *data)
{
struct pt_buffer *buf;
+ uint64_t offset;
if (vm->ctx->mode == HWT_MODE_THREAD)
buf = &((struct pt_ctx *)vm->thr->private)->buf;
else
buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
- mtx_lock_spin(&buf->lock);
- *curpage = buf->curpage;
- *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
- mtx_unlock_spin(&buf->lock);
+ offset = atomic_load_acq_64(&buf->offset);
+ *curpage = offset / PAGE_SIZE;
+ *curpage_offset = offset & PAGE_MASK;
return (0);
}
@@ -762,15 +757,13 @@ static struct hwt_backend backend = {
* Used as a taskqueue routine from the ToPA interrupt handler.
*/
static void
-pt_send_buffer_record(void *arg, int pending __unused)
+pt_send_buffer_record(void *arg)
{
+ struct pt_cpu *cpu = (struct pt_cpu *)arg;
struct hwt_record_entry record;
- struct pt_ctx *ctx = (struct pt_ctx *)arg;
- /* Prepare buffer record. */
- mtx_lock_spin(&ctx->buf.lock);
+ struct pt_ctx *ctx = cpu->ctx;
pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
- mtx_unlock_spin(&ctx->buf.lock);
hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
static void
@@ -795,36 +788,40 @@ static int
pt_topa_intr(struct trapframe *tf)
{
struct pt_buffer *buf;
+ struct pt_cpu *cpu;
struct pt_ctx *ctx;
uint64_t reg;
- SDT_PROBE0(pt, , , topa__intr);
-
- if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
- return (0);
- }
+ cpu = &pt_pcpu[curcpu];
reg = rdmsr(MSR_IA_GLOBAL_STATUS);
if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
- /* ACK spurious or leftover interrupt. */
pt_topa_status_clear();
+ return (0);
+ }
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
return (1);
}
+ atomic_set_int(&cpu->in_pcint_handler, 1);
- ctx = pt_pcpu[curcpu].ctx;
+ ctx = cpu->ctx;
+ KASSERT(ctx != NULL,
+ ("%s: cpu %d: ToPA PMI interrupt without an active context",
+ __func__, curcpu));
buf = &ctx->buf;
KASSERT(buf->topa_hw != NULL,
- ("%s: ToPA PMI interrupt with invalid buffer", __func__));
-
+ ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
+ curcpu));
pt_cpu_toggle_local(ctx->save_area, false);
pt_update_buffer(buf);
pt_topa_status_clear();
- taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
- TASKQUEUE_FAIL_IF_PENDING);
if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ swi_sched(cpu->swi_cookie, SWI_FROMNMI);
pt_cpu_toggle_local(ctx->save_area, true);
lapic_reenable_pcint();
}
+ atomic_set_int(&cpu->in_pcint_handler, 0);
return (1);
}
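
[Editor's note: in_pcint_handler is a small publication protocol: the NMI handler raises the flag for the duration of its work and only re-arms tracing while the state is still PT_ACTIVE; the disable path flips the state to PT_INACTIVE first, then waits for the flag to clear. A portable C11 sketch of the same handshake, names hypothetical, not part of this patch:

#include <stdatomic.h>

#define PT_INACTIVE	0
#define PT_ACTIVE	1

static atomic_int state = PT_ACTIVE;
static atomic_int in_handler;

void
handler_side(void)
{
	atomic_store(&in_handler, 1);
	/* ... drain buffer state ... */
	if (atomic_load(&state) == PT_ACTIVE) {
		/* re-arm tracing, reenable the interrupt */
	}
	atomic_store(&in_handler, 0);
}

void
disable_side(void)
{
	atomic_store(&state, PT_INACTIVE);	/* stop further re-arming */
	while (atomic_load(&in_handler) != 0)
		;				/* wait for handler to exit */
}
]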
@@ -839,7 +836,7 @@ static int
pt_init(void)
{
u_int cp[4];
- int error;
+ int error, i;
dprintf("pt: Enumerating part 1\n");
cpuid_count(CPUID_PT_LEAF, 0, cp);
@@ -869,20 +866,38 @@ pt_init(void)
pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
M_ZERO | M_WAITOK);
+ for (i = 0; i < mp_ncpus; i++) {
+ error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
+ &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
+ &pt_pcpu[i].swi_cookie);
+ if (error != 0) {
+ dprintf(
+ "%s: failed to add interrupt handler for cpu: %d\n",
+ __func__, error);
+ goto err;
+ }
+ }
+
nmi_register_handler(pt_topa_intr);
- if (!lapic_enable_pcint()) {
- nmi_remove_handler(pt_topa_intr);
- hwt_backend_unregister(&backend);
- free(pt_pcpu, M_PT);
- free(pt_pcpu_ctx, M_PT);
- pt_pcpu = NULL;
- pt_pcpu_ctx = NULL;
+ if (lapic_enable_pcint()) {
+  initialized = true;
+  return (0);
+ }
printf("pt: failed to setup interrupt line\n");
+ error = ENXIO;
- return (error);
+err:
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+
+ for (i = 0; i < mp_ncpus; i++) {
+ if (pt_pcpu[i].swi_cookie != NULL)
+ swi_remove(pt_pcpu[i].swi_cookie);
}
- initialized = true;
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
- return (0);
+ return (error);
}
/*
@@ -941,14 +956,24 @@ pt_supported(void)
static void
pt_deinit(void)
{
+ int i;
+ struct pt_cpu *cpu;
+
if (!initialized)
return;
nmi_remove_handler(pt_topa_intr);
lapic_disable_pcint();
hwt_backend_unregister(&backend);
+
+ for (i = 0; i < mp_ncpus; i++) {
+ cpu = &pt_pcpu[i];
+ swi_remove(cpu->swi_cookie);
+ }
+
free(pt_pcpu, M_PT);
free(pt_pcpu_ctx, M_PT);
pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
initialized = false;
}
diff --git a/sys/amd64/sgx/sgx_linux.c b/sys/amd64/sgx/sgx_linux.c
index 6ecef9207a38..d389edc1b2b0 100644
--- a/sys/amd64/sgx/sgx_linux.c
+++ b/sys/amd64/sgx/sgx_linux.c
@@ -92,16 +92,7 @@ out:
return (error);
}
-static struct linux_ioctl_handler sgx_linux_handler = {
- sgx_linux_ioctl,
- SGX_LINUX_IOCTL_MIN,
- SGX_LINUX_IOCTL_MAX,
-};
-
-SYSINIT(sgx_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE,
- linux_ioctl_register_handler, &sgx_linux_handler);
-SYSUNINIT(sgx_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
- linux_ioctl_unregister_handler, &sgx_linux_handler);
+LINUX_IOCTL_SET(sgx, SGX_LINUX_IOCTL_MIN, SGX_LINUX_IOCTL_MAX);
static int
sgx_linux_modevent(module_t mod, int type, void *data)
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index f7c59847140b..f2bea0d82b5c 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -724,12 +724,7 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- vm_object_t obj;
-
- if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL)
- return (ENOMEM);
- else
- return (0);
+ return (vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa));
}
int
@@ -870,7 +865,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
-
+ /* Negative values represent VM control structure fields. */
if (reg >= VM_REG_LAST)
return (EINVAL);
@@ -882,6 +877,7 @@ vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
int error;
+ /* Negative values represent VM control structure fields. */
if (reg >= VM_REG_LAST)
return (EINVAL);
diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c
index dfebc9dcadbf..b84be809ea24 100644
--- a/sys/amd64/vmm/vmm_dev_machdep.c
+++ b/sys/amd64/vmm/vmm_dev_machdep.c
@@ -124,12 +124,16 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = {
VMMDEV_IOCTL(VM_SET_KERNEMU_DEV, VMMDEV_IOCTL_LOCK_ONE_VCPU),
VMMDEV_IOCTL(VM_BIND_PPTDEV,
- VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
VMMDEV_IOCTL(VM_UNBIND_PPTDEV,
- VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
- VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
- VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+ VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS |
+ VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
#ifdef BHYVE_SNAPSHOT
#ifdef COMPAT_FREEBSD13
VMMDEV_IOCTL(VM_SNAPSHOT_REQ_13, VMMDEV_IOCTL_LOCK_ALL_VCPUS),
@@ -147,9 +151,9 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = {
VMMDEV_IOCTL(VM_LAPIC_LOCAL_IRQ, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU),
- VMMDEV_IOCTL(VM_PPTDEV_MSI, 0),
- VMMDEV_IOCTL(VM_PPTDEV_MSIX, 0),
- VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, 0),
+ VMMDEV_IOCTL(VM_PPTDEV_MSI, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_PPTDEV_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
+ VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER),
VMMDEV_IOCTL(VM_LAPIC_MSI, 0),
VMMDEV_IOCTL(VM_IOAPIC_ASSERT_IRQ, 0),
VMMDEV_IOCTL(VM_IOAPIC_DEASSERT_IRQ, 0),
@@ -172,40 +176,13 @@ int
vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
int fflag, struct thread *td)
{
- struct vm_seg_desc *vmsegdesc;
- struct vm_run *vmrun;
-#ifdef COMPAT_FREEBSD13
- struct vm_run_13 *vmrun_13;
-#endif
- struct vm_exception *vmexc;
- struct vm_lapic_irq *vmirq;
- struct vm_lapic_msi *vmmsi;
- struct vm_ioapic_irq *ioapic_irq;
- struct vm_isa_irq *isa_irq;
- struct vm_isa_irq_trigger *isa_irq_trigger;
- struct vm_pptdev *pptdev;
- struct vm_pptdev_mmio *pptmmio;
- struct vm_pptdev_msi *pptmsi;
- struct vm_pptdev_msix *pptmsix;
- struct vm_x2apic *x2apic;
- struct vm_gpa_pte *gpapte;
- struct vm_gla2gpa *gg;
- struct vm_intinfo *vmii;
- struct vm_rtc_time *rtctime;
- struct vm_rtc_data *rtcdata;
- struct vm_readwrite_kernemu_device *kernemu;
-#ifdef BHYVE_SNAPSHOT
- struct vm_snapshot_meta *snapshot_meta;
-#ifdef COMPAT_FREEBSD13
- struct vm_snapshot_meta_13 *snapshot_13;
-#endif
-#endif
int error;
error = 0;
switch (cmd) {
case VM_RUN: {
struct vm_exit *vme;
+ struct vm_run *vmrun;
vmrun = (struct vm_run *)data;
vme = vm_exitinfo(vcpu);
@@ -243,6 +220,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
case VM_RUN_13: {
struct vm_exit *vme;
struct vm_exit_13 *vme_13;
+ struct vm_run_13 *vmrun_13;
vmrun_13 = (struct vm_run_13 *)data;
vme_13 = &vmrun_13->vm_exit;
@@ -281,85 +259,123 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
break;
}
#endif
- case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSI: {
+ struct vm_pptdev_msi *pptmsi;
+
pptmsi = (struct vm_pptdev_msi *)data;
- error = ppt_setup_msi(vm,
- pptmsi->bus, pptmsi->slot, pptmsi->func,
- pptmsi->addr, pptmsi->msg,
- pptmsi->numvec);
+ error = ppt_setup_msi(vm, pptmsi->bus, pptmsi->slot,
+ pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec);
break;
- case VM_PPTDEV_MSIX:
+ }
+ case VM_PPTDEV_MSIX: {
+ struct vm_pptdev_msix *pptmsix;
+
pptmsix = (struct vm_pptdev_msix *)data;
- error = ppt_setup_msix(vm,
- pptmsix->bus, pptmsix->slot,
- pptmsix->func, pptmsix->idx,
- pptmsix->addr, pptmsix->msg,
- pptmsix->vector_control);
+ error = ppt_setup_msix(vm, pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg,
+ pptmsix->vector_control);
break;
- case VM_PPTDEV_DISABLE_MSIX:
+ }
+ case VM_PPTDEV_DISABLE_MSIX: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = ppt_disable_msix(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_MAP_PPTDEV_MMIO:
+ }
+ case VM_MAP_PPTDEV_MMIO: {
+ struct vm_pptdev_mmio *pptmmio;
+
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_map_mmio(vm, pptmmio->bus, pptmmio->slot,
- pptmmio->func, pptmmio->gpa, pptmmio->len,
- pptmmio->hpa);
+ pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa);
break;
- case VM_UNMAP_PPTDEV_MMIO:
+ }
+ case VM_UNMAP_PPTDEV_MMIO: {
+ struct vm_pptdev_mmio *pptmmio;
+
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_unmap_mmio(vm, pptmmio->bus, pptmmio->slot,
- pptmmio->func, pptmmio->gpa, pptmmio->len);
+ pptmmio->func, pptmmio->gpa, pptmmio->len);
break;
- case VM_BIND_PPTDEV:
+ }
+ case VM_BIND_PPTDEV: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = vm_assign_pptdev(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_UNBIND_PPTDEV:
+ }
+ case VM_UNBIND_PPTDEV: {
+ struct vm_pptdev *pptdev;
+
pptdev = (struct vm_pptdev *)data;
error = vm_unassign_pptdev(vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ pptdev->func);
break;
- case VM_INJECT_EXCEPTION:
+ }
+ case VM_INJECT_EXCEPTION: {
+ struct vm_exception *vmexc;
+
vmexc = (struct vm_exception *)data;
error = vm_inject_exception(vcpu,
vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
vmexc->restart_instruction);
break;
+ }
case VM_INJECT_NMI:
error = vm_inject_nmi(vcpu);
break;
- case VM_LAPIC_IRQ:
+ case VM_LAPIC_IRQ: {
+ struct vm_lapic_irq *vmirq;
+
vmirq = (struct vm_lapic_irq *)data;
error = lapic_intr_edge(vcpu, vmirq->vector);
break;
- case VM_LAPIC_LOCAL_IRQ:
+ }
+ case VM_LAPIC_LOCAL_IRQ: {
+ struct vm_lapic_irq *vmirq;
+
vmirq = (struct vm_lapic_irq *)data;
error = lapic_set_local_intr(vm, vcpu, vmirq->vector);
break;
- case VM_LAPIC_MSI:
+ }
+ case VM_LAPIC_MSI: {
+ struct vm_lapic_msi *vmmsi;
+
vmmsi = (struct vm_lapic_msi *)data;
error = lapic_intr_msi(vm, vmmsi->addr, vmmsi->msg);
break;
- case VM_IOAPIC_ASSERT_IRQ:
+ }
+ case VM_IOAPIC_ASSERT_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_assert_irq(vm, ioapic_irq->irq);
break;
- case VM_IOAPIC_DEASSERT_IRQ:
+ }
+ case VM_IOAPIC_DEASSERT_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_deassert_irq(vm, ioapic_irq->irq);
break;
- case VM_IOAPIC_PULSE_IRQ:
+ }
+ case VM_IOAPIC_PULSE_IRQ: {
+ struct vm_ioapic_irq *ioapic_irq;
+
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_pulse_irq(vm, ioapic_irq->irq);
break;
+ }
case VM_IOAPIC_PINCOUNT:
*(int *)data = vioapic_pincount(vm);
break;
case VM_SET_KERNEMU_DEV:
case VM_GET_KERNEMU_DEV: {
+ struct vm_readwrite_kernemu_device *kernemu;
mem_region_write_t mwrite;
mem_region_read_t mread;
int size;
@@ -396,60 +412,86 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
error = mread(vcpu, kernemu->gpa,
&kernemu->value, size, &arg);
break;
- }
- case VM_ISA_ASSERT_IRQ:
+ }
+ case VM_ISA_ASSERT_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_assert_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_assert_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_DEASSERT_IRQ:
+ }
+ case VM_ISA_DEASSERT_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_deassert_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_deassert_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_PULSE_IRQ:
+ }
+ case VM_ISA_PULSE_IRQ: {
+ struct vm_isa_irq *isa_irq;
+
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_pulse_irq(vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_pulse_irq(vm, isa_irq->ioapic_irq);
break;
- case VM_ISA_SET_IRQ_TRIGGER:
+ }
+ case VM_ISA_SET_IRQ_TRIGGER: {
+ struct vm_isa_irq_trigger *isa_irq_trigger;
+
isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
error = vatpic_set_irq_trigger(vm,
isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
break;
- case VM_SET_SEGMENT_DESCRIPTOR:
+ }
+ case VM_SET_SEGMENT_DESCRIPTOR: {
+ struct vm_seg_desc *vmsegdesc;
+
vmsegdesc = (struct vm_seg_desc *)data;
- error = vm_set_seg_desc(vcpu,
- vmsegdesc->regnum,
- &vmsegdesc->desc);
+ error = vm_set_seg_desc(vcpu, vmsegdesc->regnum,
+ &vmsegdesc->desc);
break;
- case VM_GET_SEGMENT_DESCRIPTOR:
+ }
+ case VM_GET_SEGMENT_DESCRIPTOR: {
+ struct vm_seg_desc *vmsegdesc;
+
vmsegdesc = (struct vm_seg_desc *)data;
- error = vm_get_seg_desc(vcpu,
- vmsegdesc->regnum,
- &vmsegdesc->desc);
+ error = vm_get_seg_desc(vcpu, vmsegdesc->regnum,
+ &vmsegdesc->desc);
break;
- case VM_SET_X2APIC_STATE:
+ }
+ case VM_SET_X2APIC_STATE: {
+ struct vm_x2apic *x2apic;
+
x2apic = (struct vm_x2apic *)data;
error = vm_set_x2apic_state(vcpu, x2apic->state);
break;
- case VM_GET_X2APIC_STATE:
+ }
+ case VM_GET_X2APIC_STATE: {
+ struct vm_x2apic *x2apic;
+
x2apic = (struct vm_x2apic *)data;
error = vm_get_x2apic_state(vcpu, &x2apic->state);
break;
- case VM_GET_GPA_PMAP:
+ }
+ case VM_GET_GPA_PMAP: {
+ struct vm_gpa_pte *gpapte;
+
gpapte = (struct vm_gpa_pte *)data;
- pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)),
- gpapte->gpa, gpapte->pte, &gpapte->ptenum);
- error = 0;
+ pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), gpapte->gpa,
+ gpapte->pte, &gpapte->ptenum);
break;
+ }
case VM_GET_HPET_CAPABILITIES:
error = vhpet_getcap((struct vm_hpet_cap *)data);
break;
case VM_GLA2GPA: {
+ struct vm_gla2gpa *gg;
+
CTASSERT(PROT_READ == VM_PROT_READ);
CTASSERT(PROT_WRITE == VM_PROT_WRITE);
CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
@@ -460,50 +502,76 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
}
- case VM_GLA2GPA_NOFAULT:
+ case VM_GLA2GPA_NOFAULT: {
+ struct vm_gla2gpa *gg;
+
gg = (struct vm_gla2gpa *)data;
error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
gg->prot, &gg->gpa, &gg->fault);
KASSERT(error == 0 || error == EFAULT,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
- case VM_SET_INTINFO:
+ }
+ case VM_SET_INTINFO: {
+ struct vm_intinfo *vmii;
+
vmii = (struct vm_intinfo *)data;
error = vm_exit_intinfo(vcpu, vmii->info1);
break;
- case VM_GET_INTINFO:
+ }
+ case VM_GET_INTINFO: {
+ struct vm_intinfo *vmii;
+
vmii = (struct vm_intinfo *)data;
error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
break;
- case VM_RTC_WRITE:
+ }
+ case VM_RTC_WRITE: {
+ struct vm_rtc_data *rtcdata;
+
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_write(vm, rtcdata->offset,
rtcdata->value);
break;
- case VM_RTC_READ:
+ }
+ case VM_RTC_READ: {
+ struct vm_rtc_data *rtcdata;
+
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_read(vm, rtcdata->offset,
&rtcdata->value);
break;
- case VM_RTC_SETTIME:
+ }
+ case VM_RTC_SETTIME: {
+ struct vm_rtc_time *rtctime;
+
rtctime = (struct vm_rtc_time *)data;
error = vrtc_set_time(vm, rtctime->secs);
break;
- case VM_RTC_GETTIME:
- error = 0;
+ }
+ case VM_RTC_GETTIME: {
+ struct vm_rtc_time *rtctime;
+
rtctime = (struct vm_rtc_time *)data;
rtctime->secs = vrtc_get_time(vm);
break;
+ }
case VM_RESTART_INSTRUCTION:
error = vm_restart_instruction(vcpu);
break;
#ifdef BHYVE_SNAPSHOT
- case VM_SNAPSHOT_REQ:
+ case VM_SNAPSHOT_REQ: {
+ struct vm_snapshot_meta *snapshot_meta;
+
snapshot_meta = (struct vm_snapshot_meta *)data;
error = vm_snapshot_req(vm, snapshot_meta);
break;
+ }
#ifdef COMPAT_FREEBSD13
- case VM_SNAPSHOT_REQ_13:
+ case VM_SNAPSHOT_REQ_13: {
+ struct vm_snapshot_meta *snapshot_meta;
+ struct vm_snapshot_meta_13 *snapshot_13;
+
/*
* The old structure just has an additional pointer at
* the start that is ignored.
@@ -513,6 +581,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data,
(struct vm_snapshot_meta *)&snapshot_13->dev_data;
error = vm_snapshot_req(vm, snapshot_meta);
break;
+ }
#endif
case VM_RESTORE_TIME:
error = vm_restore_time(vm);
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
index 41b9bf07c4fc..d905fd37001d 100644
--- a/sys/amd64/vmm/vmm_mem.h
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -30,10 +30,9 @@
#define _VMM_MEM_H_
struct vmspace;
-struct vm_object;
-struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa);
+int vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
+ vm_paddr_t hpa);
void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
diff --git a/sys/amd64/vmm/vmm_mem_machdep.c b/sys/amd64/vmm/vmm_mem_machdep.c
index e96c9e4bdc66..afb3a0274e2a 100644
--- a/sys/amd64/vmm/vmm_mem_machdep.c
+++ b/sys/amd64/vmm/vmm_mem_machdep.c
@@ -36,6 +36,7 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
+#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -45,40 +46,48 @@
#include "vmm_mem.h"
-vm_object_t
+int
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
- vm_paddr_t hpa)
+ vm_paddr_t hpa)
{
- int error;
- vm_object_t obj;
struct sglist *sg;
+ vm_object_t obj;
+ int error;
+
+ if (gpa + len < gpa || hpa + len < hpa || (gpa & PAGE_MASK) != 0 ||
+ (hpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0)
+ return (EINVAL);
sg = sglist_alloc(1, M_WAITOK);
error = sglist_append_phys(sg, hpa, len);
KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
- if (obj != NULL) {
- /*
- * VT-x ignores the MTRR settings when figuring out the
- * memory type for translations obtained through EPT.
- *
- * Therefore we explicitly force the pages provided by
- * this object to be mapped as uncacheable.
- */
- VM_OBJECT_WLOCK(obj);
- error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
- VM_OBJECT_WUNLOCK(obj);
- if (error != KERN_SUCCESS) {
- panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
- error);
- }
- error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
- VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
- if (error != KERN_SUCCESS) {
- vm_object_deallocate(obj);
- obj = NULL;
- }
+ if (obj == NULL)
+ return (ENOMEM);
+
+ /*
+ * VT-x ignores the MTRR settings when figuring out the memory type for
+ * translations obtained through EPT.
+ *
+ * Therefore we explicitly force the pages provided by this object to be
+ * mapped as uncacheable.
+ */
+ VM_OBJECT_WLOCK(obj);
+ error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error != KERN_SUCCESS)
+ panic("vmm_mmio_alloc: vm_object_set_memattr error %d", error);
+
+ vm_map_lock(&vmspace->vm_map);
+ error = vm_map_insert(&vmspace->vm_map, obj, 0, gpa, gpa + len,
+ VM_PROT_RW, VM_PROT_RW, 0);
+ vm_map_unlock(&vmspace->vm_map);
+ if (error != KERN_SUCCESS) {
+ error = vm_mmap_to_errno(error);
+ vm_object_deallocate(obj);
+ } else {
+ error = 0;
}
/*
@@ -94,7 +103,7 @@ vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
*/
sglist_free(sg);
- return (obj);
+ return (error);
}
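
[Editor's note: the new up-front validation in vmm_mmio_alloc() rejects address wrap-around (gpa + len < gpa) and any sub-page alignment before allocating the sglist. The same overflow-safe range check in isolation, as a standalone sketch, not part of this patch:

#include <stdbool.h>
#include <stdint.h>

#define PAGE_MASK	0xfffULL

/* Reject wrap-around and unaligned ranges, as vmm_mmio_alloc() now does. */
static bool
mmio_range_ok(uint64_t gpa, uint64_t hpa, uint64_t len)
{
	if (gpa + len < gpa || hpa + len < hpa)
		return (false);		/* address space wrap-around */
	if ((gpa & PAGE_MASK) != 0 || (hpa & PAGE_MASK) != 0 ||
	    (len & PAGE_MASK) != 0)
		return (false);		/* not page-aligned */
	return (true);
}
]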
void