Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/init_sysent.c     |   1
-rw-r--r--  sys/kern/kern_jail.c       |   2
-rw-r--r--  sys/kern/kern_kexec.c      | 350
-rw-r--r--  sys/kern/kern_loginclass.c |   2
-rw-r--r--  sys/kern/kern_prot.c       |  23
-rw-r--r--  sys/kern/kern_racct.c      |  19
-rw-r--r--  sys/kern/subr_bus.c        |   2
-rw-r--r--  sys/kern/subr_smp.c        |  15
-rw-r--r--  sys/kern/subr_syscall.c    |  10
-rw-r--r--  sys/kern/sys_generic.c     |  32
-rw-r--r--  sys/kern/syscalls.c        |   1
-rw-r--r--  sys/kern/syscalls.master   |   8
-rw-r--r--  sys/kern/systrace_args.c   |  34
-rw-r--r--  sys/kern/vfs_aio.c         |  10
-rw-r--r--  sys/kern/vfs_bio.c         |  17
15 files changed, 496 insertions(+), 30 deletions(-)
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index e42e7dcf8b44..cd305de1ed44 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -665,4 +665,5 @@ struct sysent sysent[] = {
{ .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
{ .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */
{ .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */
+ { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */
};
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 267b60ffb5bc..523b7e314a10 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3047,6 +3047,8 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
setsugid(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+#ifdef RCTL
crhold(newcred);
#endif
PROC_UNLOCK(p);
diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c
new file mode 100644
index 000000000000..2efea7dcf9a7
--- /dev/null
+++ b/sys/kern/kern_kexec.c
@@ -0,0 +1,350 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#ifdef INTRNG
+#include <sys/intr.h>
+#endif
+#include <sys/kexec.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+
+#include <machine/kexec.h>
+
+#ifndef KEXEC_MD_PAGES
+/*
+ * Number of MD pages for extra bookkeeping.
+ * This is a macro because it can be a constant (some architectures make it 0).
+ * It accepts an argument, which is an array of
+ * kexec_segment[KEXEC_SEGMENT_MAX].
+ */
+#define KEXEC_MD_PAGES(x) 0
+#endif
+
+/*
+ * Basic design:
+ *
+ * Given an array of "segment descriptors" stage an image to be loaded and
+ * jumped to at reboot, instead of rebooting via firmware.
+ *
+ * Constraints:
+ *  - Each segment's target range ("mem" through "mem" + "memsz") must lie
+ *    within a single vm_phys_seg segment, which can be inspected via the
+ *    `vm.phys_segs` sysctl.  A segment cannot span multiple vm_phys_seg
+ *    segments, even if those vm_phys_seg segments are adjacent.
+ *
+ * Technical details:
+ *
+ * Take advantage of the VM subsystem and create a vm_object to hold the staged
+ * image. When grabbing pages for the object, sort the pages so that if a page
+ * in the object is located in the physical range of any of the kexec segment
+ * targets then it gets placed at the pindex corresponding to that physical
+ * address. This avoids the chance of corruption by writing over the page in
+ * the final copy, or the need for a copy buffer page.
+ */
+
+static struct kexec_image staged_image;
+static vm_offset_t stage_addr;
+static vm_object_t kexec_obj;
+
+static eventhandler_tag kexec_reboot_handler;
+static struct mtx kexec_mutex;
+
+static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");
+
+
+static void
+kexec_reboot(void *junk __unused, int howto)
+{
+ if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
+ return;
+
+#ifdef SMP
+ cpu_mp_stop();
+#endif /* SMP */
+ intr_disable();
+ printf("Starting kexec reboot\n");
+
+ scheduler_stopped = true;
+ kexec_reboot_md(&staged_image);
+}
+
+MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);
+
+/* Sort the segment list once copied in */
+static int
+seg_cmp(const void *seg1, const void *seg2)
+{
+ const struct kexec_segment *s1, *s2;
+
+ s1 = seg1;
+ s2 = seg2;
+
+	return (((uintptr_t)s1->mem > (uintptr_t)s2->mem) - ((uintptr_t)s1->mem < (uintptr_t)s2->mem));
+}
+
+static bool
+segment_fits(struct kexec_segment *seg)
+{
+ vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;
+
+ for (int i = 0; i < vm_phys_nsegs; i++) {
+ if (v >= vm_phys_segs[i].start &&
+ (v + seg->memsz - 1) <= vm_phys_segs[i].end)
+ return (true);
+ }
+
+ return (false);
+}
+
+static vm_paddr_t
+pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
+{
+ for (int i = count; i > 0; --i) {
+ if (pind >= segs[i - 1].pindex)
+			return (ptoa(pind - segs[i - 1].pindex) + segs[i - 1].target);
+ }
+
+ panic("No segment for pindex %ju\n", (uintmax_t)pind);
+}
+
+/*
+ * For now still tied to the system call, so it assumes all memory is userspace.
+ */
+int
+kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
+ struct kexec_segment *seg, u_long flags)
+{
+ static int kexec_loading;
+ struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
+	struct kexec_image *new_image_stage = NULL;
+ vm_object_t new_segments = NULL;
+ uint8_t *buf;
+ int err = 0;
+ int i;
+ const size_t segsize = nseg * sizeof(struct kexec_segment);
+	vm_page_t *page_list = NULL;
+ vm_size_t image_count, md_pages, page_count, tmpsize;
+ vm_offset_t segment_va = 0;
+ /*
+ * - Do any sanity checking
+ * - Load the new segments to temporary
+ * - Remove the old segments
+ * - Install the new segments
+ */
+
+ if (nseg > KEXEC_SEGMENT_MAX)
+ return (EINVAL);
+
+ if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
+ return (EBUSY);
+
+ /* Only do error checking if we're installing new segments. */
+ if (nseg > 0) {
+ /* Create the new kexec object before destroying the old one. */
+ bzero(&segtmp, sizeof(segtmp));
+ err = copyin(seg, segtmp, segsize);
+ if (err != 0)
+ goto out;
+ qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
+ new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO);
+ /*
+ * Sanity checking:
+		 * - No segment may overlap the kernel, so each must be fully
+		 *   enclosed in a single vm_phys_seg segment; a segment cannot
+		 *   span even two adjacent vm_phys_seg segments.
+ */
+ image_count = 0;
+ for (i = 0; i < nseg; i++) {
+ if (!segment_fits(&segtmp[i]) ||
+ segtmp[i].bufsz > segtmp[i].memsz) {
+ err = EINVAL;
+ goto out;
+ }
+ new_image_stage->segments[i].pindex = image_count;
+ new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem;
+ new_image_stage->segments[i].size = segtmp[i].memsz;
+ image_count += atop(segtmp[i].memsz);
+ }
+ md_pages = KEXEC_MD_PAGES(segtmp);
+ page_count = image_count + md_pages;
+ new_segments = vm_object_allocate(OBJT_PHYS, page_count);
+ page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK);
+
+ /*
+ * - Grab all pages for all segments (use pindex to slice it)
+ * - Walk the list (once)
+ * - At each pindex, check if the target PA that corresponds
+ * to that index is in the object. If so, swap the pages.
+ * - At the end of this the list will be "best" sorted.
+ */
+ vm_page_grab_pages_unlocked(new_segments, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
+ page_list, page_count);
+
+ /* Sort the pages to best match the PA */
+ VM_OBJECT_WLOCK(new_segments);
+ for (i = 0; i < image_count; i++) {
+ vm_page_t curpg, otherpg, tmp;
+ vm_pindex_t otheridx;
+
+ curpg = page_list[i];
+ otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments,
+ nseg, curpg->pindex));
+ otheridx = otherpg->pindex;
+
+ if (otherpg->object == new_segments) {
+ /*
+ * Swap 'curpg' and 'otherpg', since 'otherpg'
+ * is at the PA 'curpg' covers.
+ */
+ vm_radix_remove(&new_segments->rtree, otheridx);
+ vm_radix_remove(&new_segments->rtree, i);
+ otherpg->pindex = i;
+ curpg->pindex = otheridx;
+ vm_radix_insert(&new_segments->rtree, curpg);
+ vm_radix_insert(&new_segments->rtree, otherpg);
+ tmp = curpg;
+ page_list[i] = otherpg;
+ page_list[otheridx] = tmp;
+ }
+ }
+ for (i = 0; i < nseg; i++) {
+ new_image_stage->segments[i].first_page =
+ vm_radix_lookup(&new_segments->rtree,
+ new_image_stage->segments[i].pindex);
+ }
+ if (md_pages > 0)
+ new_image_stage->first_md_page =
+ vm_radix_lookup(&new_segments->rtree,
+ page_count - md_pages);
+ else
+ new_image_stage->first_md_page = NULL;
+ VM_OBJECT_WUNLOCK(new_segments);
+
+ /* Map the object to do the copies */
+ err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
+ ptoa(page_count), 0, VMFS_ANY_SPACE,
+ VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
+ if (err != 0)
+ goto out;
+ buf = (void *)segment_va;
+ new_image_stage->map_addr = segment_va;
+ new_image_stage->map_size = ptoa(new_segments->size);
+ new_image_stage->entry = entry;
+ new_image_stage->map_obj = new_segments;
+ for (i = 0; i < nseg; i++) {
+ err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
+ if (err != 0) {
+ goto out;
+ }
+ new_image_stage->segments[i].map_buf = buf;
+ buf += segtmp[i].bufsz;
+ tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
+ if (tmpsize > 0)
+ memset(buf, 0, tmpsize);
+ buf += tmpsize;
+ }
+ /* What's left are the MD pages, so zero them all out. */
+ if (md_pages > 0)
+ bzero(buf, ptoa(md_pages));
+
+ cpu_flush_dcache((void *)segment_va, ptoa(page_count));
+ if ((err = kexec_load_md(new_image_stage)) != 0)
+ goto out;
+ }
+ if (kexec_obj != NULL) {
+		vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0);
+		KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
+		vm_map_remove(kernel_map, stage_addr, stage_addr + ptoa(kexec_obj->size));
+ }
+ kexec_obj = new_segments;
+ bzero(&staged_image, sizeof(staged_image));
+ if (nseg > 0)
+ memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage));
+
+ printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);
+ if (nseg > 0) {
+ if (kexec_reboot_handler == NULL)
+ kexec_reboot_handler =
+ EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL,
+ SHUTDOWN_PRI_DEFAULT - 150);
+ } else {
+ if (kexec_reboot_handler != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler);
+ }
+out:
+	/* Clean up if we got far enough to have something to clean up. */
+ if (err != 0 && new_segments != NULL) {
+ vm_object_unwire(new_segments, 0, new_segments->size, 0);
+ if (segment_va != 0)
+ vm_map_remove(kernel_map, segment_va, segment_va + kexec_obj->size);
+ else
+ vm_object_deallocate(new_segments);
+ }
+ atomic_store_rel_int(&kexec_loading, false);
+ if (new_image_stage != NULL)
+ free(new_image_stage, M_TEMP);
+	if (page_list != NULL)
+ free(page_list, M_TEMP);
+
+ return (err);
+}
+
+int
+sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
+{
+ int error;
+
+	/* FIXME: Do we need a better privilege check than PRIV_REBOOT here? */
+ error = priv_check(td, PRIV_REBOOT);
+ if (error != 0)
+ return (error);
+ return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags));
+}
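The staging flow above is easier to see from the caller's side. Below is a
minimal, hypothetical userland sketch: it assumes <sys/kexec.h> exposes a
struct kexec_segment with the buf/bufsz/mem/memsz fields kern_kexec_load()
reads, that syscall number 599 matches this commit's table entry, and that the
0x200000 target and entry addresses are placeholders chosen to land inside a
single vm.phys_segs range.

#include <sys/syscall.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical userland mirror of the kernel's segment descriptor. */
struct kexec_segment {
	void	*buf;		/* source buffer in userspace */
	size_t	 bufsz;		/* bytes to copy from buf */
	void	*mem;		/* target physical address */
	size_t	 memsz;		/* bytes reserved at mem; >= bufsz */
};

int
main(void)
{
	struct kexec_segment seg = { 0 };
	size_t imgsz = 2 * 1024 * 1024;
	void *image = malloc(imgsz);

	if (image == NULL)
		err(1, "malloc");
	/* ... read the replacement kernel into 'image' here ... */
	seg.buf = image;
	seg.bufsz = imgsz;
	seg.mem = (void *)0x200000;	/* must fit one vm.phys_segs entry */
	seg.memsz = imgsz;		/* the memsz - bufsz tail is zeroed */

	/* entry, nseg, segments, flags -- as in syscalls.master below. */
	if (syscall(599, (uint64_t)0x200000, 1UL, &seg, 0UL) != 0)
		err(1, "kexec_load");
	return (0);
}

A later reboot(2) with RB_KEXEC set would then reach kexec_reboot() through
the shutdown_final handler registered in kern_kexec_load().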
diff --git a/sys/kern/kern_loginclass.c b/sys/kern/kern_loginclass.c
index 55db6c28a1db..0c111c4f78d8 100644
--- a/sys/kern/kern_loginclass.c
+++ b/sys/kern/kern_loginclass.c
@@ -225,6 +225,8 @@ sys_setloginclass(struct thread *td, struct setloginclass_args *uap)
proc_set_cred(p, newcred);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+#ifdef RCTL
crhold(newcred);
#endif
PROC_UNLOCK(p);
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index a4c5bcc52529..3c145851b683 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -696,7 +696,7 @@ kern_setcred(struct thread *const td, const u_int flags,
gid_t *groups = NULL;
gid_t smallgroups[CRED_SMALLGROUPS_NB];
int error;
- bool cred_set;
+ bool cred_set = false;
/* Bail out on unrecognized flags. */
if (flags & ~SETCREDF_MASK)
@@ -839,17 +839,32 @@ kern_setcred(struct thread *const td, const u_int flags,
if (cred_set) {
setsugid(p);
to_free_cred = old_cred;
+#ifdef RACCT
+ racct_proc_ucred_changed(p, old_cred, new_cred);
+#endif
+#ifdef RCTL
+ crhold(new_cred);
+#endif
MPASS(error == 0);
} else
error = EAGAIN;
unlock_finish:
PROC_UNLOCK(p);
+
/*
* Part 3: After releasing the process lock, we perform cleanups and
* finishing operations.
*/
+#ifdef RCTL
+ if (cred_set) {
+ rctl_proc_ucred_changed(p, new_cred);
+ /* Paired with the crhold() just above. */
+ crfree(new_cred);
+ }
+#endif
+
#ifdef MAC
if (mac_set_proc_data != NULL)
mac_set_proc_finish(td, proc_label_set, mac_set_proc_data);
@@ -982,6 +997,8 @@ sys_setuid(struct thread *td, struct setuid_args *uap)
proc_set_cred(p, newcred);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+#ifdef RCTL
crhold(newcred);
#endif
PROC_UNLOCK(p);
@@ -1390,6 +1407,8 @@ sys_setreuid(struct thread *td, struct setreuid_args *uap)
proc_set_cred(p, newcred);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+#ifdef RCTL
crhold(newcred);
#endif
PROC_UNLOCK(p);
@@ -1536,6 +1555,8 @@ sys_setresuid(struct thread *td, struct setresuid_args *uap)
proc_set_cred(p, newcred);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+#ifdef RCTL
crhold(newcred);
#endif
PROC_UNLOCK(p);
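The RACCT/RCTL hunks in this file (and in kern_jail.c and kern_loginclass.c)
all follow one locking pattern: racct must learn about the credential change
while the proc lock is held, while rctl must run without it, so the new
credential is held across PROC_UNLOCK() and released afterwards.
Schematically, with names as in the kern_setcred() hunk above:

	PROC_LOCK(p);
	/* ... swap the credentials ... */
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);	/* needs proc lock */
#endif
#ifdef RCTL
	crhold(newcred);	/* keep newcred alive past the unlock */
#endif
	PROC_UNLOCK(p);
#ifdef RCTL
	rctl_proc_ucred_changed(p, newcred);	/* must not hold proc lock */
	crfree(newcred);	/* paired with the crhold() above */
#endif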
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 2aab151aba08..17b64ad00bb5 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -1236,16 +1236,20 @@ racct_updatepcpu_containers(void)
racct_updatepcpu_post, NULL, NULL);
}
+static bool
+racct_proc_to_skip(const struct proc *p)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ return (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0);
+}
+
static void
racctd(void)
{
struct proc *p;
- struct proc *idle;
ASSERT_RACCT_ENABLED();
- idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc;
-
for (;;) {
racct_decay();
@@ -1253,12 +1257,7 @@ racctd(void)
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
- if (p == idle) {
- PROC_UNLOCK(p);
- continue;
- }
- if (p->p_state != PRS_NORMAL ||
- (p->p_flag & P_IDLEPROC) != 0) {
+ if (racct_proc_to_skip(p)) {
PROC_UNLOCK(p);
continue;
}
@@ -1284,7 +1283,7 @@ racctd(void)
*/
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL) {
+ if (racct_proc_to_skip(p)) {
PROC_UNLOCK(p);
continue;
}
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
index bf5bda7e058d..b84f69cfd03e 100644
--- a/sys/kern/subr_bus.c
+++ b/sys/kern/subr_bus.c
@@ -4633,7 +4633,7 @@ bus_release_resources(device_t dev, const struct resource_spec *rs,
* parent of @p dev.
*/
struct resource *
-bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start,
+(bus_alloc_resource)(device_t dev, int type, int *rid, rman_res_t start,
rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
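The added parentheses around bus_alloc_resource in its definition are the
standard C idiom for keeping a same-named function-like macro (presumably
introduced elsewhere in this change) from expanding at the point of
definition: a macro only expands when its name is directly followed by '('.
A standalone sketch with made-up trace_alloc names:

#include <stddef.h>
#include <stdlib.h>

void	*trace_alloc_impl(size_t sz, const char *file, int line);

/* Function-like macro that ordinary call sites expand. */
#define	trace_alloc(sz)	trace_alloc_impl((sz), __FILE__, __LINE__)

/* Parenthesized name: the macro cannot expand, the real symbol remains. */
void *
(trace_alloc)(size_t sz)
{
	return (trace_alloc_impl(sz, "unknown", 0));
}

void *
trace_alloc_impl(size_t sz, const char *file, int line)
{
	(void)file;
	(void)line;
	return (malloc(sz));
}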
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 1f9577fddf9c..9f5106316018 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
KASSERT(
type == IPI_STOP || type == IPI_STOP_HARD
#if X86
- || type == IPI_SUSPEND
+ || type == IPI_SUSPEND || type == IPI_OFF
#endif
, ("%s: invalid stop type", __func__));
@@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
* will be lost, violating FreeBSD's assumption of reliable
* IPI delivery.
*/
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_lock_spin(&smp_ipi_mtx);
#endif
@@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
#endif
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
cpus = &suspended_cpus;
else
#endif
@@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
}
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_unlock_spin(&smp_ipi_mtx);
#endif
@@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map)
return (generic_stop_cpus(map, IPI_SUSPEND));
}
+
+int
+offline_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_OFF));
+}
#endif
/*
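Like suspend_cpus(), the new offline_cpus() is a thin, typed wrapper around
generic_stop_cpus(). A hedged sketch of how a shutdown path might park the
application processors with it, using the stock all_cpus/curcpu cpuset
machinery (not part of this commit):

static void
park_other_cpus(void)
{
	cpuset_t other;

	other = all_cpus;
	CPU_CLR(curcpu, &other);	/* keep the current CPU running */
	if (!CPU_EMPTY(&other))
		(void)offline_cpus(other);
}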
diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c
index d5b3b62f0821..48896529f685 100644
--- a/sys/kern/subr_syscall.c
+++ b/sys/kern/subr_syscall.c
@@ -55,8 +55,8 @@ syscallenter(struct thread *td)
struct proc *p;
struct syscall_args *sa;
struct sysent *se;
- int error, traced;
- bool sy_thr_static;
+ int error;
+ bool sy_thr_static, traced;
VM_CNT_INC(v_syscall);
p = td->td_proc;
@@ -219,7 +219,7 @@ syscallret(struct thread *td)
struct proc *p;
struct syscall_args *sa;
ksiginfo_t ksi;
- int traced;
+ bool traced;
KASSERT(td->td_errno != ERELOOKUP,
("ERELOOKUP not consumed syscall %d", td->td_sa.code));
@@ -250,9 +250,9 @@ syscallret(struct thread *td)
}
#endif
- traced = 0;
+ traced = false;
if (__predict_false(p->p_flag & P_TRACED)) {
- traced = 1;
+ traced = true;
PROC_LOCK(p);
td->td_dbgflags |= TDB_SCX;
PROC_UNLOCK(p);
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 7d666da9f88b..b84f675d1dcb 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -2345,3 +2345,35 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1,
}
return (eerror);
}
+
+int
+exterr_set_from(const struct kexterr *ke)
+{
+ struct thread *td;
+
+ td = curthread;
+ if ((td->td_pflags2 & TDP2_UEXTERR) != 0) {
+ td->td_pflags2 |= TDP2_EXTERR;
+ td->td_kexterr = *ke;
+ }
+	return (ke->error);
+}
+
+void
+exterr_clear(struct kexterr *ke)
+{
+ memset(ke, 0, sizeof(*ke));
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+void
+exterr_db_print(struct kexterr *ke)
+{
+ db_printf("errno %d cat %d msg %s p1 %#jx p2 %#jx line %d\n",
+ ke->error, ke->cat, ke->msg == NULL ? "<none>" : ke->msg,
+ (uintmax_t)ke->p1, (uintmax_t)ke->p2, ke->src_line);
+}
+#endif
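On the producer side, a subsystem fills a struct kexterr and hands it to
exterr_set_from(), which latches the record on curthread only when the thread
opted in via TDP2_UEXTERR and returns the embedded errno either way. A minimal
sketch using only the fields exterr_db_print() touches; the category constant
and the message are placeholders:

static int
frob_example(void)
{
	struct kexterr ke;

	exterr_clear(&ke);
	ke.error = EINVAL;
	ke.cat = EXTERR_CAT_VFSBIO;	/* placeholder category */
	ke.msg = "frob: bad offset";
	ke.p1 = (uintptr_t)0xdeadc0de;	/* optional detail word */
	ke.src_line = __LINE__;
	return (exterr_set_from(&ke));
}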
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 4cef89cd5219..06a4adc3d8cb 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -604,4 +604,5 @@ const char *syscallnames[] = {
"setgroups", /* 596 = setgroups */
"jail_attach_jd", /* 597 = jail_attach_jd */
"jail_remove_jd", /* 598 = jail_remove_jd */
+ "kexec_load", /* 599 = kexec_load */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 967af1f5313c..ea6d2b5aa1ef 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3394,4 +3394,12 @@
);
}
+599 AUE_NULL STD {
+ int kexec_load(
+ uint64_t entry,
+ u_long nseg,
+ _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments,
+ u_long flags
+ );
+ }
; vim: syntax=off
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index e28fef931ea8..5951cebbe74a 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3514,6 +3514,16 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 1;
break;
}
+ /* kexec_load */
+ case 599: {
+ struct kexec_load_args *p = params;
+ uarg[a++] = p->entry; /* uint64_t */
+ uarg[a++] = p->nseg; /* u_long */
+ uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */
+ uarg[a++] = p->flags; /* u_long */
+ *n_args = 4;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9401,6 +9411,25 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* kexec_load */
+ case 599:
+ switch (ndx) {
+ case 0:
+ p = "uint64_t";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ case 2:
+ p = "userland struct kexec_segment *";
+ break;
+ case 3:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11409,6 +11438,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* kexec_load */
+ case 599:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 60916a9fbd32..02d4b8426757 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -2487,7 +2487,7 @@ aio_biowakeup(struct bio *bp)
long bcount = bp->bio_bcount;
long resid = bp->bio_resid;
int opcode, nblks;
- int bio_error = bp->bio_error;
+ int abio_error = bp->bio_error;
uint16_t flags = bp->bio_flags;
opcode = job->uaiocb.aio_lio_opcode;
@@ -2503,16 +2503,16 @@ aio_biowakeup(struct bio *bp)
* error of whichever failed bio completed last.
*/
if (flags & BIO_ERROR)
- atomic_store_int(&job->error, bio_error);
+ atomic_store_int(&job->error, abio_error);
if (opcode & LIO_WRITE)
atomic_add_int(&job->outblock, nblks);
else
atomic_add_int(&job->inblock, nblks);
if (refcount_release(&job->nbio)) {
- bio_error = atomic_load_int(&job->error);
- if (bio_error != 0)
- aio_complete(job, -1, bio_error);
+ abio_error = atomic_load_int(&job->error);
+ if (abio_error != 0)
+ aio_complete(job, -1, abio_error);
else
aio_complete(job, atomic_load_long(&job->nbytes), 0);
}
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 19c39e42bafa..880cc6b99951 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -44,6 +44,7 @@
* see man buf(9) for more info.
*/
+#define EXTERR_CATEGORY EXTERR_CAT_VFSBIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
@@ -55,6 +56,7 @@
#include <sys/counter.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
+#include <sys/exterrvar.h>
#include <sys/fail.h>
#include <sys/ktr.h>
#include <sys/limits.h>
@@ -1775,7 +1777,6 @@ buf_alloc(struct bufdomain *bd)
bp->b_blkno = bp->b_lblkno = 0;
bp->b_offset = NOOFFSET;
bp->b_iodone = 0;
- bp->b_error = 0;
bp->b_resid = 0;
bp->b_bcount = 0;
bp->b_npages = 0;
@@ -1785,6 +1786,7 @@ buf_alloc(struct bufdomain *bd)
bp->b_fsprivate1 = NULL;
bp->b_fsprivate2 = NULL;
bp->b_fsprivate3 = NULL;
+ exterr_clear(&bp->b_exterr);
LIST_INIT(&bp->b_dep);
return (bp);
@@ -2276,7 +2278,7 @@ breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size,
}
if ((flags & GB_CVTENXIO) != 0)
bp->b_xflags |= BX_CVTENXIO;
- bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR);
if (bp->b_rcred == NOCRED && cred != NOCRED)
bp->b_rcred = crhold(cred);
vfs_busy_pages(bp, 0);
@@ -2353,7 +2355,7 @@ bufwrite(struct buf *bp)
bundirty(bp);
bp->b_flags &= ~B_DONE;
- bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR);
bp->b_flags |= B_CACHE;
bp->b_iocmd = BIO_WRITE;
@@ -4520,8 +4522,11 @@ biowait(struct bio *bp, const char *wmesg)
while ((bp->bio_flags & BIO_DONE) == 0)
msleep(bp, mtxp, PRIBIO, wmesg, 0);
mtx_unlock(mtxp);
- if (bp->bio_error != 0)
+ if (bp->bio_error != 0) {
+ if ((bp->bio_flags & BIO_EXTERR) != 0)
+ return (exterr_set_from(&bp->bio_exterr));
return (bp->bio_error);
+ }
if (!(bp->bio_flags & BIO_ERROR))
return (0);
return (EIO);
@@ -4568,6 +4573,8 @@ bufwait(struct buf *bp)
return (EINTR);
}
if (bp->b_ioflags & BIO_ERROR) {
+ if ((bp->b_ioflags & BIO_EXTERR) != 0)
+ exterr_set_from(&bp->b_exterr);
return (bp->b_error ? bp->b_error : EIO);
} else {
return (0);
@@ -5522,6 +5529,8 @@ DB_SHOW_COMMAND(buffer, db_show_buffer)
db_printf("\n");
}
BUF_LOCKPRINTINFO(bp);
+ if ((bp->b_ioflags & BIO_EXTERR) != 0)
+ exterr_db_print(&bp->b_exterr);
#if defined(FULL_BUF_TRACKING)
db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);