/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#ifdef INTRNG
#include <sys/intr.h>
#endif
#include <sys/kexec.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>

#include <machine/kexec.h>

#ifndef KEXEC_MD_PAGES
/*
 * Number of MD pages for extra bookkeeping.
 * This is a macro because it can be a constant (some architectures make it 0).
 * It accepts an argument, which is an array of
 * kexec_segment[KEXEC_SEGMENT_MAX].
 */
#define	KEXEC_MD_PAGES(x)	0
#endif

/*
 * Basic design:
 *
 * Given an array of "segment descriptors", stage an image to be loaded and
 * jumped to at reboot, instead of rebooting via firmware.
 *
 * Constraints:
 * - Each segment descriptor's "mem" and "memsz" must fit within a single
 *   vm_phys_seg segment, which can be obtained via the `vm.phys_segs`
 *   sysctl.  A kexec segment cannot span multiple vm_phys_seg segments,
 *   even if those segments are adjacent.
 *
 * Technical details:
 *
 * Take advantage of the VM subsystem and create a vm_object to hold the
 * staged image.  When grabbing pages for the object, sort the pages so that
 * if a page in the object is located in the physical range of any of the
 * kexec segment targets, it gets placed at the pindex corresponding to that
 * physical address.  This avoids the chance of corruption by writing over
 * the page during the final copy, and the need for a copy buffer page.
 */
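
/*
 * Example usage (a hypothetical userspace loader; the kexec_load() wrapper,
 * image layout, and addresses below are illustrative assumptions, not part
 * of this file):
 *
 *	struct kexec_segment seg = {
 *		.buf   = image,			/- staging copy in user memory
 *		.bufsz = image_size,		/- bytes to copy to the target
 *		.mem   = (void *)0x40000000,	/- physical destination
 *		.memsz = roundup2(image_size, PAGE_SIZE),
 *	};
 *
 *	if (kexec_load(entry_pa, 1, &seg, 0) != 0)
 *		err(1, "kexec_load");
 *	reboot(RB_KEXEC);	/- fires the shutdown_final handler below
 *
 * Each segment's [mem, mem + memsz) range must sit inside a single
 * vm_phys_seg; `sysctl vm.phys_segs` shows the usable ranges.  The kernel
 * zero-fills the memsz - bufsz tail of every segment.
 */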

static struct kexec_image staged_image;
static vm_offset_t stage_addr;
static vm_object_t kexec_obj;
static eventhandler_tag kexec_reboot_handler;
static struct mtx kexec_mutex;

static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");

static void
kexec_reboot(void *junk __unused, int howto)
{
	if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
		return;

#ifdef SMP
	cpu_mp_stop();
#endif /* SMP */
	intr_disable();
	printf("Starting kexec reboot\n");

	scheduler_stopped = true;
	kexec_reboot_md(&staged_image);
}
MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);

/* Sort the segment list once copied in */
static int
seg_cmp(const void *seg1, const void *seg2)
{
	const struct kexec_segment *s1, *s2;

	s1 = seg1;
	s2 = seg2;

	/*
	 * Compare by target address.  Avoid returning the raw pointer
	 * difference, which can truncate or change sign when narrowed
	 * to int.
	 */
	if (s1->mem == s2->mem)
		return (0);
	return ((uintptr_t)s1->mem < (uintptr_t)s2->mem ? -1 : 1);
}

static bool
segment_fits(struct kexec_segment *seg)
{
	vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;

	for (int i = 0; i < vm_phys_nsegs; i++) {
		if (v >= vm_phys_segs[i].start &&
		    (v + seg->memsz - 1) <= vm_phys_segs[i].end)
			return (true);
	}

	return (false);
}

static vm_paddr_t
pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
{
	for (int i = count; i > 0; --i) {
		if (pind >= segs[i - 1].pindex)
			return (ptoa(pind - segs[i - 1].pindex) +
			    segs[i - 1].target);
	}

	panic("No segment for pindex %ju\n", (uintmax_t)pind);
}

/*
 * For now this is still tied to the system call, so it assumes all segment
 * buffers are in userspace.
 */
int
kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
    struct kexec_segment *seg, u_long flags)
{
	static int kexec_loading;
	struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
	struct kexec_image *new_image_stage = NULL;
	vm_object_t new_segments = NULL;
	uint8_t *buf;
	int err = 0;
	int i;
	const size_t segsize = nseg * sizeof(struct kexec_segment);
	vm_page_t *page_list = NULL;
	vm_size_t image_count, md_pages, page_count, tmpsize;
	vm_offset_t segment_va = 0;

	/*
	 * - Do any sanity checking
	 * - Load the new segments to temporary storage
	 * - Remove the old segments
	 * - Install the new segments
	 */

	if (nseg > KEXEC_SEGMENT_MAX)
		return (EINVAL);

	if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
		return (EBUSY);

	/* Only do error checking if we're installing new segments. */
	if (nseg > 0) {
		/* Create the new kexec object before destroying the old one. */
		bzero(&segtmp, sizeof(segtmp));
		err = copyin(seg, segtmp, segsize);
		if (err != 0)
			goto out;
		qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
		new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP,
		    M_WAITOK | M_ZERO);
		/*
		 * Sanity checking:
		 * - No segment may overlap the kernel, so each must be fully
		 *   enclosed in a single vm_phys_seg segment; a kexec segment
		 *   cannot cross even adjacent vm_phys_seg segments.
		 */
		image_count = 0;
		for (i = 0; i < nseg; i++) {
			if (!segment_fits(&segtmp[i]) ||
			    segtmp[i].bufsz > segtmp[i].memsz) {
				err = EINVAL;
				goto out;
			}
			new_image_stage->segments[i].pindex = image_count;
			new_image_stage->segments[i].target =
			    (vm_offset_t)segtmp[i].mem;
			new_image_stage->segments[i].size = segtmp[i].memsz;
			image_count += atop(segtmp[i].memsz);
		}
		md_pages = KEXEC_MD_PAGES(segtmp);
		page_count = image_count + md_pages;
		new_segments = vm_object_allocate(OBJT_PHYS, page_count);
		page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP,
		    M_WAITOK);

		/*
		 * - Grab all pages for all segments (use pindex to slice it)
		 * - Walk the list (once):
		 *   - At each pindex, check if the target PA that corresponds
		 *     to that index is in the object.  If so, swap the pages.
		 * - At the end of this the list will be "best" sorted.
		 */
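		/*
		 * Worked example of the swap pass below (addresses made up
		 * for illustration, 4 KB pages assumed):  say segment 0
		 * starts at pindex 0 and targets PA 0x20000000, so pindex 3
		 * must eventually land at 0x20003000.  If the page grabbed
		 * for pindex 7 happens to already be the physical page at
		 * 0x20003000, then while visiting i == 3 we find that
		 * otherpg (the page at the destination PA) belongs to
		 * new_segments with otheridx == 7, and the two pindexes are
		 * swapped in the radix tree and in page_list[].  After the
		 * walk, any grabbed page that lies inside a target range
		 * sits at the pindex matching its own physical address, so
		 * the final copy at reboot never has to relocate it and
		 * never overwrites a page that still holds unconsumed
		 * source data.
		 */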
		vm_page_grab_pages_unlocked(new_segments, 0,
		    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
		    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO, page_list, page_count);

		/* Sort the pages to best match the PA */
		VM_OBJECT_WLOCK(new_segments);
		for (i = 0; i < image_count; i++) {
			vm_page_t curpg, otherpg, tmp;
			vm_pindex_t otheridx;

			curpg = page_list[i];
			otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(
			    new_image_stage->segments, nseg, curpg->pindex));
			otheridx = otherpg->pindex;
			if (otherpg->object == new_segments) {
				/*
				 * Swap 'curpg' and 'otherpg', since 'otherpg'
				 * is at the PA 'curpg' covers.
				 */
				vm_radix_remove(&new_segments->rtree, otheridx);
				vm_radix_remove(&new_segments->rtree, i);
				otherpg->pindex = i;
				curpg->pindex = otheridx;
				vm_radix_insert(&new_segments->rtree, curpg);
				vm_radix_insert(&new_segments->rtree, otherpg);
				tmp = curpg;
				page_list[i] = otherpg;
				page_list[otheridx] = tmp;
			}
		}
		for (i = 0; i < nseg; i++) {
			new_image_stage->segments[i].first_page =
			    vm_radix_lookup(&new_segments->rtree,
			    new_image_stage->segments[i].pindex);
		}
		if (md_pages > 0)
			new_image_stage->first_md_page =
			    vm_radix_lookup(&new_segments->rtree,
			    page_count - md_pages);
		else
			new_image_stage->first_md_page = NULL;
		VM_OBJECT_WUNLOCK(new_segments);

		/* Map the object to do the copies */
		err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
		    ptoa(page_count), 0, VMFS_ANY_SPACE, VM_PROT_RW,
		    VM_PROT_RW, MAP_PREFAULT);
		if (err != 0)
			goto out;
		buf = (void *)segment_va;
		new_image_stage->map_addr = segment_va;
		new_image_stage->map_size = ptoa(new_segments->size);
		new_image_stage->entry = entry;
		new_image_stage->map_obj = new_segments;
		for (i = 0; i < nseg; i++) {
			err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
			if (err != 0)
				goto out;
			new_image_stage->segments[i].map_buf = buf;
			buf += segtmp[i].bufsz;
			tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
			if (tmpsize > 0)
				memset(buf, 0, tmpsize);
			buf += tmpsize;
		}
		/* What's left are the MD pages, so zero them all out. */
		if (md_pages > 0)
			bzero(buf, ptoa(md_pages));

		cpu_flush_dcache((void *)segment_va, ptoa(page_count));
		if ((err = kexec_load_md(new_image_stage)) != 0)
			goto out;
	}

	if (kexec_obj != NULL) {
		vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0);
		KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
		vm_map_remove(kernel_map, stage_addr,
		    stage_addr + ptoa(kexec_obj->size));
	}
	kexec_obj = new_segments;
	bzero(&staged_image, sizeof(staged_image));
	if (nseg > 0)
		memcpy(&staged_image, new_image_stage,
		    sizeof(*new_image_stage));

	printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);

	if (nseg > 0) {
		if (kexec_reboot_handler == NULL)
			kexec_reboot_handler =
			    EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot,
			    NULL, SHUTDOWN_PRI_DEFAULT - 150);
	} else {
		if (kexec_reboot_handler != NULL)
			EVENTHANDLER_DEREGISTER(shutdown_final,
			    kexec_reboot_handler);
	}

out:
	/* Clean up the mess if we've gotten this far. */
	if (err != 0 && new_segments != NULL) {
		vm_object_unwire(new_segments, 0, ptoa(new_segments->size), 0);
		if (segment_va != 0)
			vm_map_remove(kernel_map, segment_va,
			    segment_va + ptoa(new_segments->size));
		else
			vm_object_deallocate(new_segments);
	}
	atomic_store_rel_int(&kexec_loading, false);
	if (new_image_stage != NULL)
		free(new_image_stage, M_TEMP);
	if (page_list != NULL)
		free(page_list, M_TEMP);

	return (err);
}

int
sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
{
	int error;

	/* FIXME: Do we need a better privilege check than PRIV_REBOOT here? */
	error = priv_check(td, PRIV_REBOOT);
	if (error != 0)
		return (error);

	return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments,
	    uap->flags));
}
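
/*
 * MD hook sketch (hypothetical; the per-architecture contract is defined by
 * the MD kexec headers, not here):  an architecture that needs scratch space
 * for a trampoline or page tables could reserve it with something like
 *
 *	#define	KEXEC_MD_PAGES(segs)	1
 *
 * in its MD kexec header, build that state in kexec_load_md() using the
 * zeroed pages starting at image->first_md_page (mapped at the tail of
 * image->map_addr), and consume it in kexec_reboot_md(), which ultimately
 * transfers control to image->entry.
 */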