diff options
Diffstat (limited to 'sys/kern')
| -rw-r--r-- | sys/kern/init_sysent.c | 1 | ||||
| -rw-r--r-- | sys/kern/kern_jail.c | 2 | ||||
| -rw-r--r-- | sys/kern/kern_kexec.c | 350 | ||||
| -rw-r--r-- | sys/kern/kern_loginclass.c | 2 | ||||
| -rw-r--r-- | sys/kern/kern_prot.c | 23 | ||||
| -rw-r--r-- | sys/kern/kern_racct.c | 19 | ||||
| -rw-r--r-- | sys/kern/subr_bus.c | 2 | ||||
| -rw-r--r-- | sys/kern/subr_smp.c | 15 | ||||
| -rw-r--r-- | sys/kern/subr_syscall.c | 10 | ||||
| -rw-r--r-- | sys/kern/sys_generic.c | 32 | ||||
| -rw-r--r-- | sys/kern/syscalls.c | 1 | ||||
| -rw-r--r-- | sys/kern/syscalls.master | 8 | ||||
| -rw-r--r-- | sys/kern/systrace_args.c | 34 | ||||
| -rw-r--r-- | sys/kern/vfs_aio.c | 10 | ||||
| -rw-r--r-- | sys/kern/vfs_bio.c | 17 |
15 files changed, 496 insertions, 30 deletions
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index e42e7dcf8b44..cd305de1ed44 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -665,4 +665,5 @@ struct sysent sysent[] = { { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ + { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */ }; diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 267b60ffb5bc..523b7e314a10 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -3047,6 +3047,8 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); +#endif +#ifdef RCTL crhold(newcred); #endif PROC_UNLOCK(p); diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c new file mode 100644 index 000000000000..2efea7dcf9a7 --- /dev/null +++ b/sys/kern/kern_kexec.c @@ -0,0 +1,350 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#ifdef INTRNG +#include <sys/intr.h> +#endif +#include <sys/kexec.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/priv.h> +#include <sys/reboot.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pagequeue.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> + +#include <machine/kexec.h> + +#ifndef KEXEC_MD_PAGES +/* + * Number of MD pages for extra bookkeeping. + * This is a macro because it can be a constant (some architectures make it 0). + * It accepts an argument, which is an array of + * kexec_segment[KEXEC_SEGMENT_MAX]. + */ +#define KEXEC_MD_PAGES(x) 0 +#endif + +/* + * Basic design: + * + * Given an array of "segment descriptors" stage an image to be loaded and + * jumped to at reboot, instead of rebooting via firmware. + * + * Constraints: + * - The segment descriptors' "mem" and "memsz" must each fit within a + * vm_phys_seg segment, which can be obtained via the `vm.phys_segs` sysctl. + * A single segment cannot span multiple vm_phys_seg segments, even if the + * vm_phys_seg segments are adjacent. + * + * Technical details: + * + * Take advantage of the VM subsystem and create a vm_object to hold the staged + * image. When grabbing pages for the object, sort the pages so that if a page + * in the object is located in the physical range of any of the kexec segment + * targets then it gets placed at the pindex corresponding to that physical + * address. This avoids the chance of corruption by writing over the page in + * the final copy, or the need for a copy buffer page. + */ + +static struct kexec_image staged_image; +static vm_offset_t stage_addr; +static vm_object_t kexec_obj; + +static eventhandler_tag kexec_reboot_handler; +static struct mtx kexec_mutex; + +static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments"); + + +static void +kexec_reboot(void *junk __unused, int howto) +{ + if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL) + return; + +#ifdef SMP + cpu_mp_stop(); +#endif /* SMP */ + intr_disable(); + printf("Starting kexec reboot\n"); + + scheduler_stopped = true; + kexec_reboot_md(&staged_image); +} + +MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF); + +/* Sort the segment list once copied in */ +static int +seg_cmp(const void *seg1, const void *seg2) +{ + const struct kexec_segment *s1, *s2; + + s1 = seg1; + s2 = seg2; + + return ((uintptr_t)s1->mem - (uintptr_t)s2->mem); +} + +static bool +segment_fits(struct kexec_segment *seg) +{ + vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem; + + for (int i = 0; i < vm_phys_nsegs; i++) { + if (v >= vm_phys_segs[i].start && + (v + seg->memsz - 1) <= vm_phys_segs[i].end) + return (true); + } + + return (false); +} + +static vm_paddr_t +pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind) +{ + for (int i = count; i > 0; --i) { + if (pind >= segs[i - 1].pindex) + return (ptoa(pind - segs[i-1].pindex) + segs[i - 1].target); + } + + panic("No segment for pindex %ju\n", (uintmax_t)pind); +} + +/* + * For now still tied to the system call, so assumes all memory is userspace. + */ +int +kern_kexec_load(struct thread *td, u_long entry, u_long nseg, + struct kexec_segment *seg, u_long flags) +{ + static int kexec_loading; + struct kexec_segment segtmp[KEXEC_SEGMENT_MAX]; + struct kexec_image *new_image_stage = 0; + vm_object_t new_segments = NULL; + uint8_t *buf; + int err = 0; + int i; + const size_t segsize = nseg * sizeof(struct kexec_segment); + vm_page_t *page_list = 0; + vm_size_t image_count, md_pages, page_count, tmpsize; + vm_offset_t segment_va = 0; + /* + * - Do any sanity checking + * - Load the new segments to temporary + * - Remove the old segments + * - Install the new segments + */ + + if (nseg > KEXEC_SEGMENT_MAX) + return (EINVAL); + + if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0) + return (EBUSY); + + /* Only do error checking if we're installing new segments. */ + if (nseg > 0) { + /* Create the new kexec object before destroying the old one. */ + bzero(&segtmp, sizeof(segtmp)); + err = copyin(seg, segtmp, segsize); + if (err != 0) + goto out; + qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp); + new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO); + /* + * Sanity checking: + * - All segments must not overlap the kernel, so must be fully enclosed + * in a vm_phys_seg (each kexec segment must be in a single + * vm_phys_seg segment, cannot cross even adjacent segments). + */ + image_count = 0; + for (i = 0; i < nseg; i++) { + if (!segment_fits(&segtmp[i]) || + segtmp[i].bufsz > segtmp[i].memsz) { + err = EINVAL; + goto out; + } + new_image_stage->segments[i].pindex = image_count; + new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem; + new_image_stage->segments[i].size = segtmp[i].memsz; + image_count += atop(segtmp[i].memsz); + } + md_pages = KEXEC_MD_PAGES(segtmp); + page_count = image_count + md_pages; + new_segments = vm_object_allocate(OBJT_PHYS, page_count); + page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK); + + /* + * - Grab all pages for all segments (use pindex to slice it) + * - Walk the list (once) + * - At each pindex, check if the target PA that corresponds + * to that index is in the object. If so, swap the pages. + * - At the end of this the list will be "best" sorted. + */ + vm_page_grab_pages_unlocked(new_segments, 0, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO, + page_list, page_count); + + /* Sort the pages to best match the PA */ + VM_OBJECT_WLOCK(new_segments); + for (i = 0; i < image_count; i++) { + vm_page_t curpg, otherpg, tmp; + vm_pindex_t otheridx; + + curpg = page_list[i]; + otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments, + nseg, curpg->pindex)); + otheridx = otherpg->pindex; + + if (otherpg->object == new_segments) { + /* + * Swap 'curpg' and 'otherpg', since 'otherpg' + * is at the PA 'curpg' covers. + */ + vm_radix_remove(&new_segments->rtree, otheridx); + vm_radix_remove(&new_segments->rtree, i); + otherpg->pindex = i; + curpg->pindex = otheridx; + vm_radix_insert(&new_segments->rtree, curpg); + vm_radix_insert(&new_segments->rtree, otherpg); + tmp = curpg; + page_list[i] = otherpg; + page_list[otheridx] = tmp; + } + } + for (i = 0; i < nseg; i++) { + new_image_stage->segments[i].first_page = + vm_radix_lookup(&new_segments->rtree, + new_image_stage->segments[i].pindex); + } + if (md_pages > 0) + new_image_stage->first_md_page = + vm_radix_lookup(&new_segments->rtree, + page_count - md_pages); + else + new_image_stage->first_md_page = NULL; + VM_OBJECT_WUNLOCK(new_segments); + + /* Map the object to do the copies */ + err = vm_map_find(kernel_map, new_segments, 0, &segment_va, + ptoa(page_count), 0, VMFS_ANY_SPACE, + VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT); + if (err != 0) + goto out; + buf = (void *)segment_va; + new_image_stage->map_addr = segment_va; + new_image_stage->map_size = ptoa(new_segments->size); + new_image_stage->entry = entry; + new_image_stage->map_obj = new_segments; + for (i = 0; i < nseg; i++) { + err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz); + if (err != 0) { + goto out; + } + new_image_stage->segments[i].map_buf = buf; + buf += segtmp[i].bufsz; + tmpsize = segtmp[i].memsz - segtmp[i].bufsz; + if (tmpsize > 0) + memset(buf, 0, tmpsize); + buf += tmpsize; + } + /* What's left are the MD pages, so zero them all out. */ + if (md_pages > 0) + bzero(buf, ptoa(md_pages)); + + cpu_flush_dcache((void *)segment_va, ptoa(page_count)); + if ((err = kexec_load_md(new_image_stage)) != 0) + goto out; + } + if (kexec_obj != NULL) { + vm_object_unwire(kexec_obj, 0, kexec_obj->size, 0); + KASSERT(stage_addr != 0, ("Mapped kexec_obj without address")); + vm_map_remove(kernel_map, stage_addr, stage_addr + kexec_obj->size); + } + kexec_obj = new_segments; + bzero(&staged_image, sizeof(staged_image)); + if (nseg > 0) + memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage)); + + printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry); + if (nseg > 0) { + if (kexec_reboot_handler == NULL) + kexec_reboot_handler = + EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL, + SHUTDOWN_PRI_DEFAULT - 150); + } else { + if (kexec_reboot_handler != NULL) + EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler); + } +out: + /* Clean up the mess if we've gotten far. */ + if (err != 0 && new_segments != NULL) { + vm_object_unwire(new_segments, 0, new_segments->size, 0); + if (segment_va != 0) + vm_map_remove(kernel_map, segment_va, segment_va + kexec_obj->size); + else + vm_object_deallocate(new_segments); + } + atomic_store_rel_int(&kexec_loading, false); + if (new_image_stage != NULL) + free(new_image_stage, M_TEMP); + if (page_list != 0) + free(page_list, M_TEMP); + + return (err); +} + +int +sys_kexec_load(struct thread *td, struct kexec_load_args *uap) +{ + int error; + + // FIXME: Do w need a better privilege check than PRIV_REBOOT here? + error = priv_check(td, PRIV_REBOOT); + if (error != 0) + return (error); + return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags)); +} diff --git a/sys/kern/kern_loginclass.c b/sys/kern/kern_loginclass.c index 55db6c28a1db..0c111c4f78d8 100644 --- a/sys/kern/kern_loginclass.c +++ b/sys/kern/kern_loginclass.c @@ -225,6 +225,8 @@ sys_setloginclass(struct thread *td, struct setloginclass_args *uap) proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); +#endif +#ifdef RCTL crhold(newcred); #endif PROC_UNLOCK(p); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index a4c5bcc52529..3c145851b683 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -696,7 +696,7 @@ kern_setcred(struct thread *const td, const u_int flags, gid_t *groups = NULL; gid_t smallgroups[CRED_SMALLGROUPS_NB]; int error; - bool cred_set; + bool cred_set = false; /* Bail out on unrecognized flags. */ if (flags & ~SETCREDF_MASK) @@ -839,17 +839,32 @@ kern_setcred(struct thread *const td, const u_int flags, if (cred_set) { setsugid(p); to_free_cred = old_cred; +#ifdef RACCT + racct_proc_ucred_changed(p, old_cred, new_cred); +#endif +#ifdef RCTL + crhold(new_cred); +#endif MPASS(error == 0); } else error = EAGAIN; unlock_finish: PROC_UNLOCK(p); + /* * Part 3: After releasing the process lock, we perform cleanups and * finishing operations. */ +#ifdef RCTL + if (cred_set) { + rctl_proc_ucred_changed(p, new_cred); + /* Paired with the crhold() just above. */ + crfree(new_cred); + } +#endif + #ifdef MAC if (mac_set_proc_data != NULL) mac_set_proc_finish(td, proc_label_set, mac_set_proc_data); @@ -982,6 +997,8 @@ sys_setuid(struct thread *td, struct setuid_args *uap) proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); +#endif +#ifdef RCTL crhold(newcred); #endif PROC_UNLOCK(p); @@ -1390,6 +1407,8 @@ sys_setreuid(struct thread *td, struct setreuid_args *uap) proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); +#endif +#ifdef RCTL crhold(newcred); #endif PROC_UNLOCK(p); @@ -1536,6 +1555,8 @@ sys_setresuid(struct thread *td, struct setresuid_args *uap) proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); +#endif +#ifdef RCTL crhold(newcred); #endif PROC_UNLOCK(p); diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 2aab151aba08..17b64ad00bb5 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1236,16 +1236,20 @@ racct_updatepcpu_containers(void) racct_updatepcpu_post, NULL, NULL); } +static bool +racct_proc_to_skip(const struct proc *p) +{ + PROC_LOCK_ASSERT(p, MA_OWNED); + return (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0); +} + static void racctd(void) { struct proc *p; - struct proc *idle; ASSERT_RACCT_ENABLED(); - idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc; - for (;;) { racct_decay(); @@ -1253,12 +1257,7 @@ racctd(void) FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); - if (p == idle) { - PROC_UNLOCK(p); - continue; - } - if (p->p_state != PRS_NORMAL || - (p->p_flag & P_IDLEPROC) != 0) { + if (racct_proc_to_skip(p)) { PROC_UNLOCK(p); continue; } @@ -1284,7 +1283,7 @@ racctd(void) */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); - if (p->p_state != PRS_NORMAL) { + if (racct_proc_to_skip(p)) { PROC_UNLOCK(p); continue; } diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index bf5bda7e058d..b84f69cfd03e 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -4633,7 +4633,7 @@ bus_release_resources(device_t dev, const struct resource_spec *rs, * parent of @p dev. */ struct resource * -bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start, +(bus_alloc_resource)(device_t dev, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 1f9577fddf9c..9f5106316018 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type) KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 - || type == IPI_SUSPEND + || type == IPI_SUSPEND || type == IPI_OFF #endif , ("%s: invalid stop type", __func__)); @@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_lock_spin(&smp_ipi_mtx); #endif @@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type) #endif #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) cpus = &suspended_cpus; else #endif @@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type) } #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_unlock_spin(&smp_ipi_mtx); #endif @@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map) return (generic_stop_cpus(map, IPI_SUSPEND)); } + +int +offline_cpus(cpuset_t map) +{ + + return (generic_stop_cpus(map, IPI_OFF)); +} #endif /* diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index d5b3b62f0821..48896529f685 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -55,8 +55,8 @@ syscallenter(struct thread *td) struct proc *p; struct syscall_args *sa; struct sysent *se; - int error, traced; - bool sy_thr_static; + int error; + bool sy_thr_static, traced; VM_CNT_INC(v_syscall); p = td->td_proc; @@ -219,7 +219,7 @@ syscallret(struct thread *td) struct proc *p; struct syscall_args *sa; ksiginfo_t ksi; - int traced; + bool traced; KASSERT(td->td_errno != ERELOOKUP, ("ERELOOKUP not consumed syscall %d", td->td_sa.code)); @@ -250,9 +250,9 @@ syscallret(struct thread *td) } #endif - traced = 0; + traced = false; if (__predict_false(p->p_flag & P_TRACED)) { - traced = 1; + traced = true; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 7d666da9f88b..b84f675d1dcb 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -2345,3 +2345,35 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1, } return (eerror); } + +int +exterr_set_from(const struct kexterr *ke) +{ + struct thread *td; + + td = curthread; + if ((td->td_pflags2 & TDP2_UEXTERR) != 0) { + td->td_pflags2 |= TDP2_EXTERR; + td->td_kexterr = *ke; + } + return (td->td_kexterr.error); +} + +void +exterr_clear(struct kexterr *ke) +{ + memset(ke, 0, sizeof(*ke)); +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +void +exterr_db_print(struct kexterr *ke) +{ + db_printf("errno %d cat %d msg %s p1 %#jx p2 %#jx line %d\n", + ke->error, ke->cat, ke->msg == NULL ? "<none>" : ke->msg, + (uintmax_t)ke->p1, (uintmax_t)ke->p2, ke->src_line); +} +#endif diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 4cef89cd5219..06a4adc3d8cb 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -604,4 +604,5 @@ const char *syscallnames[] = { "setgroups", /* 596 = setgroups */ "jail_attach_jd", /* 597 = jail_attach_jd */ "jail_remove_jd", /* 598 = jail_remove_jd */ + "kexec_load", /* 599 = kexec_load */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 967af1f5313c..ea6d2b5aa1ef 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3394,4 +3394,12 @@ ); } +599 AUE_NULL STD { + int kexec_load( + uint64_t entry, + u_long nseg, + _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments, + u_long flags + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index e28fef931ea8..5951cebbe74a 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3514,6 +3514,16 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 1; break; } + /* kexec_load */ + case 599: { + struct kexec_load_args *p = params; + uarg[a++] = p->entry; /* uint64_t */ + uarg[a++] = p->nseg; /* u_long */ + uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */ + uarg[a++] = p->flags; /* u_long */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -9401,6 +9411,25 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* kexec_load */ + case 599: + switch (ndx) { + case 0: + p = "uint64_t"; + break; + case 1: + p = "u_long"; + break; + case 2: + p = "userland struct kexec_segment *"; + break; + case 3: + p = "u_long"; + break; + default: + break; + }; + break; default: break; }; @@ -11409,6 +11438,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* kexec_load */ + case 599: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 60916a9fbd32..02d4b8426757 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -2487,7 +2487,7 @@ aio_biowakeup(struct bio *bp) long bcount = bp->bio_bcount; long resid = bp->bio_resid; int opcode, nblks; - int bio_error = bp->bio_error; + int abio_error = bp->bio_error; uint16_t flags = bp->bio_flags; opcode = job->uaiocb.aio_lio_opcode; @@ -2503,16 +2503,16 @@ aio_biowakeup(struct bio *bp) * error of whichever failed bio completed last. */ if (flags & BIO_ERROR) - atomic_store_int(&job->error, bio_error); + atomic_store_int(&job->error, abio_error); if (opcode & LIO_WRITE) atomic_add_int(&job->outblock, nblks); else atomic_add_int(&job->inblock, nblks); if (refcount_release(&job->nbio)) { - bio_error = atomic_load_int(&job->error); - if (bio_error != 0) - aio_complete(job, -1, bio_error); + abio_error = atomic_load_int(&job->error); + if (abio_error != 0) + aio_complete(job, -1, abio_error); else aio_complete(job, atomic_load_long(&job->nbytes), 0); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 19c39e42bafa..880cc6b99951 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -44,6 +44,7 @@ * see man buf(9) for more info. */ +#define EXTERR_CATEGORY EXTERR_CAT_VFSBIO #include <sys/param.h> #include <sys/systm.h> #include <sys/asan.h> @@ -55,6 +56,7 @@ #include <sys/counter.h> #include <sys/devicestat.h> #include <sys/eventhandler.h> +#include <sys/exterrvar.h> #include <sys/fail.h> #include <sys/ktr.h> #include <sys/limits.h> @@ -1775,7 +1777,6 @@ buf_alloc(struct bufdomain *bd) bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; - bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; @@ -1785,6 +1786,7 @@ buf_alloc(struct bufdomain *bd) bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; + exterr_clear(&bp->b_exterr); LIST_INIT(&bp->b_dep); return (bp); @@ -2276,7 +2278,7 @@ breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; - bp->b_ioflags &= ~BIO_ERROR; + bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR); if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); @@ -2353,7 +2355,7 @@ bufwrite(struct buf *bp) bundirty(bp); bp->b_flags &= ~B_DONE; - bp->b_ioflags &= ~BIO_ERROR; + bp->b_ioflags &= ~(BIO_ERROR | BIO_EXTERR); bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; @@ -4520,8 +4522,11 @@ biowait(struct bio *bp, const char *wmesg) while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wmesg, 0); mtx_unlock(mtxp); - if (bp->bio_error != 0) + if (bp->bio_error != 0) { + if ((bp->bio_flags & BIO_EXTERR) != 0) + return (exterr_set_from(&bp->bio_exterr)); return (bp->bio_error); + } if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); @@ -4568,6 +4573,8 @@ bufwait(struct buf *bp) return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { + if ((bp->b_ioflags & BIO_EXTERR) != 0) + exterr_set_from(&bp->b_exterr); return (bp->b_error ? bp->b_error : EIO); } else { return (0); @@ -5522,6 +5529,8 @@ DB_SHOW_COMMAND(buffer, db_show_buffer) db_printf("\n"); } BUF_LOCKPRINTINFO(bp); + if ((bp->b_ioflags & BIO_EXTERR) != 0) + exterr_db_print(&bp->b_exterr); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); |
