diff options
Diffstat (limited to 'sys')
67 files changed, 1723 insertions, 1011 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e98bae9eb6c5..8691387a5a8e 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -204,6 +204,17 @@ IDTVEC(spuriousint) jmp doreti /* + * Executed by a CPU when it receives an IPI_OFF from another CPU. + * Should never return + */ + INTR_HANDLER cpuoff + KMSAN_ENTER + call cpuoff_handler + call as_lapic_eoi + KMSAN_LEAVE + jmp doreti + +/* * Executed by a CPU when it receives an IPI_SWI. */ INTR_HANDLER ipi_swi diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index eb1b746f5893..2716784ee871 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -57,6 +57,7 @@ #include <vm/vm_param.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <sys/kexec.h> #include <sys/proc.h> #include <x86/apicreg.h> #include <machine/cpu.h> @@ -65,6 +66,7 @@ #include <machine/proc.h> #include <machine/segments.h> #include <machine/efi.h> +#include <machine/kexec.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13)); ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14)); ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15)); ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags)); + +/* Kexec */ +ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry)); +ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments)); +ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX); +ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image)); +ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage)); +ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size)); +ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf)); +ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target)); diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c new file mode 100644 index 000000000000..8189a48e9ae9 --- /dev/null +++ b/sys/amd64/amd64/kexec_support.c @@ -0,0 +1,300 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_radix.h> + +#include <machine/intr_machdep.h> +#include <machine/kexec.h> +#include <machine/md_var.h> +#include <machine/pmap.h> +#include <x86/apicvar.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. + * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +/* + * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages. + * identity: This is for an identity map, treat `start` as a physical address. + * Only valid here if do_pte is false. + */ +static void +kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start, + vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages) +{ + vm_paddr_t mpa; + vm_offset_t pg; + vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR; + vm_page_t m; + vm_pindex_t i, j, k, l; + + pg = start & ~(stride - 1); + i = pmap_pml4e_index(pg); + j = pmap_pdpe_index(pg); + k = pmap_pde_index(pg); + l = pmap_pte_index(pg); + for (; pg < start + size; i++, j = 0, k = 0, l = 0) { + /* + * Walk linearly, as above, but one fell swoop, one page at a + * time. + */ + if (root[i] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + root[i] = mpa | PG_RW | PG_V; + } + pdp_entry_t *pdp = + (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME)); + for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) { + if (pdp[j] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pdp[j] = mpa | PG_RW | PG_V; + } + pd_entry_t *pde = + (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME)); + for (; k < NPDEPG && pg < start + size; k++, l = 0) { + if (pde[k] == 0) { + if (!do_pte) { + pde[k] = + (identity ? pg : pmap_kextract(pg)) | + PG_RW | PG_PS | PG_V; + pg += NBPDR; + continue; + } + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pde[k] = mpa | PG_V | PG_RW; + } else if ((pde[k] & PG_PS) != 0) { + pg += NBPDR; + continue; + } + /* Populate the PTEs. */ + for (; l < NPTEPG && pg < start + size; + l++, pg += PAGE_SIZE) { + pt_entry_t *pte = + (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME); + pte[pmap_pte_index(pg)] = + pmap_kextract(pg) | PG_RW | PG_V; + } + } + } + } +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + void (*kexec_do_tramp)(void) = image->md_image; + + intr_disable_all(); + lapic_disable(); + kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page), + kexec_do_tramp); + + for (;;) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + struct pctrie_iter pct_iter; + pml4_entry_t *PT4; + pdp_entry_t *PDP_l; + pd_entry_t *PD_l0; + vm_offset_t va; + int i; + + /* + * Start building the page table. + * First part of the page table is standard for all. + */ + vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3; + vm_page_t m; + + if (la57) + return (EINVAL); + + vm_radix_iter_init(&pct_iter, &image->map_obj->rtree); + /* Working in linear space in the mapped space, `va` is our tracker. */ + m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex); + va = (vm_offset_t)image->map_addr + ptoa(m->pindex); + /* We'll find a place for these later */ + PT4 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pdp_l = VM_PAGE_TO_PHYS(m); + PDP_l = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l0 = VM_PAGE_TO_PHYS(m); + PD_l0 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l1 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l2 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l3 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW; + PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW; + PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW; + PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW; + PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */ + PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* Map the target(s) in 2MB chunks. */ + for (i = 0; i < KEXEC_SEGMENT_MAX; i++) { + struct kexec_segment_stage *s = &image->segments[i]; + + if (s->size == 0) + break; + kexec_generate_page_tables(PT4, s->target, s->size, false, + true, &pct_iter); + } + /* Now create the source page tables */ + kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true, + false, &pct_iter); + kexec_generate_page_tables(PT4, + trunc_page((vm_offset_t)kexec_do_reboot_trampoline), + PAGE_SIZE, true, false, &pct_iter); + KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n")); + + /* MD control pages start at this next page. */ + image->md_image = (void *)(image->map_addr + ptoa(m->pindex)); + bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size); + + /* Save the image into the MD page(s) right after the trampoline */ + bcopy(image, (void *)((vm_offset_t)image->md_image + + (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot), + sizeof(*image)); + + return (0); +} + +/* + * Required pages: + * - L4 (1) (root) + * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map) + * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1) + * - L1 (PDR) - 1 (kexec trampoline page, first MD page) + * - kexec_do_reboot trampoline - 1 + * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case) + * + * Minimum 9 pages for the direct map. + */ +int +kexec_md_pages(struct kexec_segment *seg_in) +{ + struct kexec_segment *segs = seg_in; + vm_size_t pages = 13; /* Minimum number of starting pages */ + vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */ + vm_size_t source_total = 0; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + vm_offset_t start, end; + if (segs[i].memsz == 0) + break; + + end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz); + start = trunc_2mpage((vm_offset_t)segs[i].mem); + start = max(start, cur_addr + 1); + /* + * Round to cover the full range of page table pages for each + * segment. + */ + source_total += round_2mpage(end - start); + + /* + * Bottom 4GB are identity mapped already in the count, so skip + * any segments that end up there, this will short-circuit that. + */ + if (end <= cur_addr + 1) + continue; + + if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) { + /* Need a new 512GB mapping page */ + pages++; + pages += howmany(end - (start & ~PML4MASK), NBPML4); + pages += howmany(end - (start & ~PDPMASK), NBPDP); + pages += howmany(end - (start & ~PDRMASK), NBPDR); + + } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) { + pages++; + pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1; + pages += howmany(end - (start & ~PDRMASK), NBPDR); + } + + } + /* Be pessimistic when totaling up source pages. We likely + * can't use superpages, so need to map each page individually. + */ + pages += howmany(source_total, NBPDR); + pages += howmany(source_total, NBPDP); + pages += howmany(source_total, NBPML4); + + /* + * Be intentionally sloppy adding in the extra page table pages. It's + * better to go over than under. + */ + pages += howmany(pages * PAGE_SIZE, NBPDR); + pages += howmany(pages * PAGE_SIZE, NBPDP); + pages += howmany(pages * PAGE_SIZE, NBPML4); + + /* Add in the trampoline pages */ + pages += howmany(kexec_do_reboot_size, PAGE_SIZE); + + return (pages); +} diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S new file mode 100644 index 000000000000..6a2de676bc35 --- /dev/null +++ b/sys/amd64/amd64/kexec_tramp.S @@ -0,0 +1,91 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> +#include "assym.inc" + +/* + * Take a pointer to the image, copy each segment, and jump to the trampoline. + * + * Assumptions: + * - image is in safe memory + * - We're already running out of the new "identity" map. + * - All registers are free game, so go nuts + * - Interrupts are disabled + * - All APs are disabled + */ +ENTRY(kexec_do_reboot) + /* + r9: image pointer + r10: segment pointer + r11: segment counter + */ + leaq kexec_stack(%rip), %rsp + /* Get the saved kexec_image. */ + leaq kexec_saved_image(%rip), %r9 + leaq KEXEC_SEGMENTS(%r9), %r10 + movq $KEXEC_SEGMENT_MAX, %r11 +copy_segment: + movq KEXEC_SEGMENT_SIZE(%r10), %rcx + cmpq $0, %rcx + je done + shrq $3, %rcx + movq KEXEC_SEGMENT_TARGET(%r10), %rdi + movq KEXEC_SEGMENT_MAP(%r10), %rsi + rep + movsq + addq $KEXEC_STAGED_SEGMENT_SIZE, %r10 + decq %r11 + jg copy_segment + +done: + pushq KEXEC_ENTRY(%r9) + ret +fail: + jmp fail +END(kexec_do_reboot) +ENTRY(kexec_do_reboot_trampoline) + /* Set new page table, clears most of TLB. */ + movq %rdi, %cr3 + + /* Now flush the rest of the TLB, including global pages. */ + movq %cr4, %rax + andq $~CR4_PGE, %rax + movq %rax, %cr4 + jmp *%rsi +END(kexec_do_reboot_trampoline) +CNAME(kexec_saved_image): + .globl kexec_saved_image + .space KEXEC_IMAGE_SIZE + .quad 0 + /* We don't need more than quad, so just fill out the page. */ + .p2align PAGE_SHIFT + kexec_stack: +CNAME(kexec_do_reboot_size): + .globl kexec_do_reboot_size + .quad . - kexec_do_reboot diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 00e99f9df192..96ed0a2cc3ba 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,10 @@ cpu_mp_start(void) setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU offline */ + setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff), + SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); @@ -176,6 +180,15 @@ cpu_mp_start(void) #endif } +void +cpu_mp_stop(void) +{ + cpuset_t other_cpus = all_cpus; + + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + offline_cpus(other_cpus); +} + /* * AP CPU's call this to initialize themselves. */ diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h new file mode 100644 index 000000000000..70bc2991be3f --- /dev/null +++ b/sys/amd64/include/kexec.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_KEXEC_H_ +#define _AMD64_KEXEC_H_ + +struct kexec_segment; +struct kexec_image; +int kexec_md_pages(struct kexec_segment *); +extern void kexec_do_reboot(void); +extern long kexec_do_reboot_size; +extern void *kexec_saved_image; +extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void)); +#define KEXEC_MD_PAGES(x) kexec_md_pages(x) + + +#endif /* _AMD64_KEXEC_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index bff92570ff82..28c372a2e556 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -30,6 +30,7 @@ inthand_t IDTVEC(ipi_intr_bitmap_handler_pti), IDTVEC(ipi_swi_pti), IDTVEC(cpustop_pti), + IDTVEC(cpuoff_pti), IDTVEC(cpususpend_pti), IDTVEC(rendezvous_pti); diff --git a/sys/arm/include/kexec.h b/sys/arm/include/kexec.h new file mode 100644 index 000000000000..50391d32812a --- /dev/null +++ b/sys/arm/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ARM_KEXEC_H_ +#define _ARM_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _ARM_KEXEC_H_ */ diff --git a/sys/arm64/arm64/kexec_support.c b/sys/arm64/arm64/kexec_support.c new file mode 100644 index 000000000000..8b9719c05b67 --- /dev/null +++ b/sys/arm64/arm64/kexec_support.c @@ -0,0 +1,188 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> + +#include <machine/armreg.h> +#include <machine/pmap.h> +#include <machine/pte.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. + * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +extern pt_entry_t pagetable_l0_ttbr0_bootstrap[]; +extern unsigned long initstack_end[]; +void switch_stack(void *, void (*)(void *, void *, struct kexec_image *), void *); + +#define SCTLR_EL1_NO_MMU (SCTLR_RES1 | SCTLR_LSMAOE | SCTLR_nTLSMD | \ + SCTLR_EIS | SCTLR_TSCXT | SCTLR_EOS) +#define vm_page_offset(m) ((vm_offset_t)(m) - vm_page_base) +static inline vm_page_t +phys_vm_page(vm_page_t m, vm_offset_t vm_page_v, vm_paddr_t vm_page_p) +{ + return ((vm_page_t)((vm_offset_t)m - vm_page_v + vm_page_p)); +} + +/* First 2 args are filler for switch_stack() */ +static void __aligned(16) __dead2 +kexec_reboot_bottom( void *arg1 __unused, void *arg2 __unused, + struct kexec_image *image) +{ + void (*e)(void) = (void *)image->entry; + vm_offset_t vm_page_base = (vm_offset_t)vm_page_array; + vm_paddr_t vm_page_phys = pmap_kextract((vm_offset_t)vm_page_array); + struct kexec_segment_stage *phys_segs = + (void *)pmap_kextract((vm_offset_t)&image->segments); + vm_paddr_t from_pa, to_pa; + vm_size_t size; + vm_page_t first, m, mp; + struct pctrie_iter pct_i; + + /* + * Create a linked list of all pages in the object before we disable the + * MMU. Once the MMU is disabled we can't use the vm_radix iterators, + * as they rely on virtual address pointers. + */ + first = NULL; + vm_radix_iter_init(&pct_i, &image->map_obj->rtree); + VM_RADIX_FORALL(m, &pct_i) { + if (first == NULL) + first = m; + else + SLIST_INSERT_AFTER(mp, m, plinks.s.ss); + mp = m; + } + + /* + * We're running out of the identity map now, disable the MMU before we + * continue. It's possible page tables can be overwritten, which would + * be very bad if we were running with the MMU enabled. + */ + WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU); + isb(); + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + if (phys_segs[i].size == 0) + break; + to_pa = phys_segs[i].target; + /* Copy the segment here... */ + for (vm_page_t p = phys_segs[i].first_page; + p != NULL && to_pa - phys_segs[i].target < phys_segs[i].size; + p = SLIST_NEXT(p, plinks.s.ss)) { + p = phys_vm_page(p, vm_page_base, vm_page_phys); + from_pa = p->phys_addr; + if (p->phys_addr == to_pa) { + to_pa += PAGE_SIZE; + continue; + } + for (size = PAGE_SIZE / sizeof(register_t); + size > 0; --size) { + *(register_t *)to_pa = *(register_t *)from_pa; + to_pa += sizeof(register_t); + from_pa += sizeof(register_t); + } + } + } + invalidate_icache(); + e(); + while (1) + ; +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + uintptr_t ptr; + register_t reg; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + if (image->segments[i].size > 0) + cpu_dcache_inv_range((void *)PHYS_TO_DMAP(image->segments[i].target), + image->segments[i].size); + } + ptr = pmap_kextract((vm_offset_t)kexec_reboot_bottom); + serror_disable(); + + reg = pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap); + set_ttbr0(reg); + cpu_tlb_flushID(); + + typeof(kexec_reboot_bottom) *p = (void *)ptr; + switch_stack((void *)pmap_kextract((vm_offset_t)initstack_end), + p, image); + while (1) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + vm_paddr_t tmp; + pt_entry_t *pte; + + /* Create L2 page blocks for the trampoline. L0/L1 are from the startup. */ + + /* + * There are exactly 2 pages before the pagetable_l0_ttbr0_bootstrap, so + * move to there. + */ + pte = pagetable_l0_ttbr0_bootstrap; + pte -= (Ln_ENTRIES * 2); /* move to start of L2 pages */ + + /* + * Populate the identity map with symbols we know we'll need before we + * turn off the MMU. + */ + tmp = pmap_kextract((vm_offset_t)kexec_reboot_bottom); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + tmp = pmap_kextract((vm_offset_t)initstack_end); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + /* We'll need vm_page_array for doing offset calculations. */ + tmp = pmap_kextract((vm_offset_t)&vm_page_array); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + + return (0); +} diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index d35e334905a7..3ec12140f139 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -325,6 +325,19 @@ mp_virtdone: b init_secondary LEND(mpentry_common) + +ENTRY(mp_cpu_spinloop) +0: + wfe + ldr x0, mp_cpu_spin_table_release_addr + cbz x0, 0b + blr x0 + .globl mp_cpu_spin_table_release_addr +mp_cpu_spin_table_release_addr: + .quad 0 + .globl mp_cpu_spinloop_end +mp_cpu_spinloop_end: +END(mp_cpu_spinloop) #endif /* @@ -475,6 +488,29 @@ LENTRY(enter_kernel_el) eret LEND(enter_kernel_el) +/* Turn off the MMU. Install ttbr0 from the bootstrap page table, and go there. + * Does not return. + * - x0 - target address to jump to after stopping the MMU. + * - x1 - kernel load address + */ +ENTRY(stop_mmu) + mov x16, x0 /* Save target. */ + ldr x2, =(1f - KERNBASE) + add x17, x1, x2 + ldr x3, =(pagetable_l0_ttbr0_bootstrap - KERNBASE) + add x1, x1, x3 + msr ttbr0_el1, x1 + isb + br x17 +1: + BTI_J + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_M + bic x0, x0, SCTLR_C + msr sctlr_el1, x0 + isb + br x16 +END(stop_mmu) /* * Get the physical address the kernel was loaded at. */ @@ -1094,12 +1130,19 @@ tcr: TCR_SH0_IS | TCR_ORGN0_WBWA | TCR_IRGN0_WBWA) LEND(start_mmu) +ENTRY(switch_stack) + mov sp, x0 + mov x16, x1 + br x16 +END(switch_stack) + ENTRY(abort) b abort END(abort) .bss .align PAGE_SHIFT + .globl initstack_end initstack: .space BOOT_STACK_SIZE initstack_end: @@ -1116,6 +1159,7 @@ initstack_end: * L0 for user */ .globl pagetable_l0_ttbr1 + .globl pagetable_l0_ttbr0_bootstrap pagetable: pagetable_l3_ttbr1: .space (PAGE_SIZE * L3_PAGE_COUNT) diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c index e4d011df3a06..0bdd2ecfd8a7 100644 --- a/sys/arm64/arm64/mp_machdep.c +++ b/sys/arm64/arm64/mp_machdep.c @@ -60,6 +60,7 @@ #include <machine/debug_monitor.h> #include <machine/intr.h> #include <machine/smp.h> +#include <machine/vmparam.h> #ifdef VFP #include <machine/vfp.h> #endif @@ -103,6 +104,7 @@ static void ipi_hardclock(void *); static void ipi_preempt(void *); static void ipi_rendezvous(void *); static void ipi_stop(void *); +static void ipi_off(void *); #ifdef FDT static u_int fdt_cpuid; @@ -193,6 +195,7 @@ release_aps(void *dummy __unused) intr_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL); intr_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL); intr_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL); + intr_ipi_setup(IPI_OFF, "off", ipi_off, NULL); atomic_store_int(&aps_started, 0); atomic_store_rel_int(&aps_ready, 1); @@ -390,6 +393,34 @@ ipi_stop(void *dummy __unused) CTR0(KTR_SMP, "IPI_STOP (restart)"); } +void stop_mmu(vm_paddr_t, vm_paddr_t) __dead2; +extern uint32_t mp_cpu_spinloop[]; +extern uint32_t mp_cpu_spinloop_end[]; +extern uint64_t mp_cpu_spin_table_release_addr; +static void +ipi_off(void *dummy __unused) +{ + CTR0(KTR_SMP, "IPI_OFF"); + if (psci_present) + psci_cpu_off(); + else { + uint64_t release_addr; + vm_size_t size; + + size = (vm_offset_t)&mp_cpu_spin_table_release_addr - + (vm_offset_t)mp_cpu_spinloop; + release_addr = PCPU_GET(release_addr) - size; + isb(); + invalidate_icache(); + /* Go catatonic, don't take any interrupts. */ + intr_disable(); + stop_mmu(release_addr, pmap_kextract(KERNBASE)); + + + } + CTR0(KTR_SMP, "IPI_OFF failed"); +} + struct cpu_group * cpu_topo(void) { @@ -511,6 +542,7 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain, vm_paddr_t release_addr) pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK; bootpcpu = pcpup; + pcpup->pc_release_addr = release_addr; dpcpu[cpuid - 1] = (void *)(pcpup + 1); dpcpu_init(dpcpu[cpuid - 1], cpuid); @@ -752,6 +784,52 @@ cpu_mp_start(void) } } +void +cpu_mp_stop(void) +{ + + /* Short-circuit for single-CPU */ + if (CPU_COUNT(&all_cpus) == 1) + return; + + KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("Not on the first CPU!\n")); + + /* + * If we use spin-table, assume U-boot method for now (single address + * shared by all CPUs). + */ + if (!psci_present) { + int cpu; + vm_paddr_t release_addr; + void *release_vaddr; + vm_size_t size; + + /* Find the shared release address. */ + CPU_FOREACH(cpu) { + release_addr = pcpu_find(cpu)->pc_release_addr; + if (release_addr != 0) + break; + } + /* No release address? No way of notifying other CPUs. */ + if (release_addr == 0) + return; + + size = (vm_offset_t)&mp_cpu_spinloop_end - + (vm_offset_t)&mp_cpu_spinloop; + + release_addr -= (vm_offset_t)&mp_cpu_spin_table_release_addr - + (vm_offset_t)mp_cpu_spinloop; + + release_vaddr = pmap_mapdev(release_addr, size); + bcopy(mp_cpu_spinloop, release_vaddr, size); + cpu_dcache_wbinv_range(release_vaddr, size); + pmap_unmapdev(release_vaddr, size); + invalidate_icache(); + } + ipi_all_but_self(IPI_OFF); + DELAY(1000000); +} + /* Introduce rest of cores to the world */ void cpu_mp_announce(void) diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h index e6e1f682794e..e9eee643216b 100644 --- a/sys/arm64/include/cpufunc.h +++ b/sys/arm64/include/cpufunc.h @@ -96,6 +96,13 @@ serror_enable(void) __asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")"); } +static __inline void +serror_disable(void) +{ + + __asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")"); +} + static __inline register_t get_midr(void) { diff --git a/sys/arm64/include/kexec.h b/sys/arm64/include/kexec.h new file mode 100644 index 000000000000..0a8c7a053331 --- /dev/null +++ b/sys/arm64/include/kexec.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ARM64_KEXEC_H_ +#define _ARM64_KEXEC_H_ + +#define KEXEC_MD_PAGES(x) 0 + +#endif /* _ARM64_KEXEC_H_ */ diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h index 09bd8fa8a966..73399d2c3f8c 100644 --- a/sys/arm64/include/pcpu.h +++ b/sys/arm64/include/pcpu.h @@ -50,7 +50,8 @@ struct debug_monitor_state; struct pmap *pc_curvmpmap; \ uint64_t pc_mpidr; \ u_int pc_bcast_tlbi_workaround; \ - char __pad[197] + uint64_t pc_release_addr; \ + char __pad[189] #ifdef _KERNEL diff --git a/sys/arm64/include/smp.h b/sys/arm64/include/smp.h index 500cd1ef4f02..4a5bfda3ac1c 100644 --- a/sys/arm64/include/smp.h +++ b/sys/arm64/include/smp.h @@ -40,6 +40,7 @@ enum { IPI_STOP, IPI_STOP_HARD, IPI_HARDCLOCK, + IPI_OFF, INTR_IPI_COUNT, }; diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index 54063150eef9..f8ef7e4a20d3 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -517,4 +517,4 @@ #define FREEBSD32_SYS_setgroups 596 #define FREEBSD32_SYS_jail_attach_jd 597 #define FREEBSD32_SYS_jail_remove_jd 598 -#define FREEBSD32_SYS_MAXSYSCALL 599 +#define FREEBSD32_SYS_MAXSYSCALL 600 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index f7cc4c284e4d..645cdccbc02d 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -604,4 +604,5 @@ const char *freebsd32_syscallnames[] = { "setgroups", /* 596 = setgroups */ "jail_attach_jd", /* 597 = jail_attach_jd */ "jail_remove_jd", /* 598 = jail_remove_jd */ + "#599", /* 599 = kexec_load */ }; diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 18f809ef04e3..240b54ae9011 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -666,4 +666,5 @@ struct sysent freebsd32_sysent[] = { { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ + { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 599 = freebsd32_kexec_load */ }; diff --git a/sys/compat/freebsd32/syscalls.conf b/sys/compat/freebsd32/syscalls.conf index 72006631c89e..9308d1529c63 100644 --- a/sys/compat/freebsd32/syscalls.conf +++ b/sys/compat/freebsd32/syscalls.conf @@ -48,10 +48,11 @@ obsol="getkerninfo" # Syscalls without implementations: # __mac_* - should be implemented # afs3_syscall - requires significant porting, probably doesn't make sense +# kexec_load - makes little sense on 64-bit hardware # kldsym - can't be implemented (kernel virtual addresses can't fit in 32-bits) # lgetfh - should be implemented # nlm_syscall - requires significant porting, probably doesn't make sense # nnpfs_syscall - requires significant porting, probably doesn't make sense # ntp_gettime - should be implemented # thr_create - was unimplemented and appears to be unnecessary -unimpl="afs3_syscall kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create" +unimpl="afs3_syscall kexec_load kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create" diff --git a/sys/conf/files b/sys/conf/files index c17451324324..0a24b5e1e39b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3842,6 +3842,7 @@ kern/kern_jaildesc.c standard kern/kern_jailmeta.c standard kern/kern_kcov.c optional kcov \ compile-with "${NOSAN_C} ${MSAN_CFLAGS}" +kern/kern_kexec.c standard kern/kern_khelp.c standard kern/kern_kthread.c standard kern/kern_ktr.c optional ktr @@ -4549,7 +4550,6 @@ netpfil/ipfw/dn_sched_rr.c optional inet dummynet netpfil/ipfw/dn_sched_wf2q.c optional inet dummynet netpfil/ipfw/ip_dummynet.c optional inet dummynet netpfil/ipfw/ip_dn_io.c optional inet dummynet -netpfil/ipfw/ip_dn_glue.c optional inet dummynet netpfil/ipfw/ip_fw2.c optional inet ipfirewall netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index a342242ac66e..e4f01813bc8f 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -77,6 +77,8 @@ amd64/amd64/fpu.c standard amd64/amd64/gdb_machdep.c optional gdb amd64/amd64/initcpu.c standard amd64/amd64/io.c optional io +amd64/amd64/kexec_support.c standard +amd64/amd64/kexec_tramp.S standard amd64/amd64/locore.S standard no-obj amd64/amd64/xen-locore.S optional xenhvm \ compile-with "${NORMAL_S} -g0" \ diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 2f412fa3cb1b..882aca705336 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -55,6 +55,7 @@ arm64/arm64/gic_v3_acpi.c optional acpi arm64/arm64/gic_v3_fdt.c optional fdt arm64/arm64/hyp_stub.S standard arm64/arm64/identcpu.c standard +arm64/arm64/kexec_support.c standard arm64/arm64/locore.S standard no-obj arm64/arm64/machdep.c standard arm64/arm64/machdep_boot.c standard diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c index 3f0a7b40245d..e3ff4f6937d2 100644 --- a/sys/dev/acpica/acpi.c +++ b/sys/dev/acpica/acpi.c @@ -4430,8 +4430,8 @@ acpi_stype_sysctl(SYSCTL_HANDLER_ARGS) return (EINVAL); printf("warning: this sysctl expects a sleep type, but an ACPI S-state has " "been passed to it. This functionality is deprecated; see acpi(4).\n"); - MPASS(sstate < ACPI_S_STATE_COUNT); - if (acpi_supported_sstates[sstate] == false) + if (sstate < ACPI_S_STATE_COUNT && + !acpi_supported_sstates[sstate]) return (EOPNOTSUPP); new_stype = acpi_sstate_to_stype(sstate); } diff --git a/sys/dev/ice/ice_drv_info.h b/sys/dev/ice/ice_drv_info.h index 46965f4124bc..abb11bdb5fd9 100644 --- a/sys/dev/ice/ice_drv_info.h +++ b/sys/dev/ice/ice_drv_info.h @@ -238,6 +238,9 @@ static const pci_vendor_info_t ice_vendor_info_array[] = { ICE_INTEL_VENDOR_ID, 0x0001, 0, "Intel(R) Ethernet Network Adapter E835-XXV-2 for OCP 3.0"), PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0002, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-4"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, ICE_INTEL_VENDOR_ID, 0x0003, 0, "Intel(R) Ethernet Network Adapter E835-XXV-2"), PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, diff --git a/sys/dev/psci/psci.c b/sys/dev/psci/psci.c index 497b23d2d4c3..2b250401ae83 100644 --- a/sys/dev/psci/psci.c +++ b/sys/dev/psci/psci.c @@ -474,6 +474,19 @@ psci_cpu_on(unsigned long cpu, unsigned long entry, unsigned long context_id) return (psci_call(fnid, cpu, entry, context_id)); } +int +psci_cpu_off(void) +{ + uint32_t fnid; + + fnid = PSCI_FNID_CPU_OFF; + if (psci_softc != NULL) + fnid = psci_softc->psci_fnids[PSCI_FN_CPU_OFF]; + + /* Returns PSCI_RETVAL_DENIED on error. */ + return (psci_call(fnid, 0, 0, 0)); +} + static void psci_shutdown(void *xsc, int howto) { diff --git a/sys/dev/psci/psci.h b/sys/dev/psci/psci.h index 451d40c0178d..6704eaf26c71 100644 --- a/sys/dev/psci/psci.h +++ b/sys/dev/psci/psci.h @@ -39,6 +39,7 @@ typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t, extern bool psci_present; int psci_cpu_on(unsigned long, unsigned long, unsigned long); +int psci_cpu_off(void); /* Operates on caller. */ void psci_reset(void); int32_t psci_features(uint32_t); int psci_get_version(void); diff --git a/sys/dev/xilinx/xlnx_pcib.c b/sys/dev/xilinx/xlnx_pcib.c index d549ec445ea9..816b33ec1142 100644 --- a/sys/dev/xilinx/xlnx_pcib.c +++ b/sys/dev/xilinx/xlnx_pcib.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2020 Ruslan Bukin <br@bsdpad.com> + * Copyright (c) 2020-2025 Ruslan Bukin <br@bsdpad.com> * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory (Department of Computer Science and @@ -84,7 +84,7 @@ struct xlnx_pcib_softc { struct generic_pcie_fdt_softc fdt_sc; struct resource *res[4]; struct mtx mtx; - vm_offset_t msi_page; + void *msi_page; struct xlnx_pcib_irqsrc *isrcs; device_t dev; void *intr_cookie[3]; @@ -105,6 +105,12 @@ struct xlnx_pcib_irqsrc { u_int flags; }; +static struct ofw_compat_data compat_data[] = { + { "xlnx,xdma-host-3.00", 1 }, + { "xlnx,axi-pcie-host-1.00.a", 1 }, + { NULL, 0 }, +}; + static void xlnx_pcib_clear_err_interrupts(struct generic_pcie_core_softc *sc) { @@ -333,12 +339,12 @@ xlnx_pcib_fdt_probe(device_t dev) if (!ofw_bus_status_okay(dev)) return (ENXIO); - if (ofw_bus_is_compatible(dev, "xlnx,xdma-host-3.00")) { - device_set_desc(dev, "Xilinx XDMA PCIe Controller"); - return (BUS_PROBE_DEFAULT); - } + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + return (ENXIO); + + device_set_desc(dev, "Xilinx XDMA PCIe Controller"); - return (ENXIO); + return (BUS_PROBE_DEFAULT); } static int @@ -424,8 +430,8 @@ xlnx_pcib_req_valid(struct generic_pcie_core_softc *sc, bus_space_tag_t t; uint32_t val; - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); if ((bus < sc->bus_start) || (bus > sc->bus_end)) return (0); @@ -467,8 +473,8 @@ xlnx_pcib_read_config(device_t dev, u_int bus, u_int slot, return (~0U); offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg); - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); data = bus_space_read_4(t, h, offset & ~3); @@ -512,8 +518,8 @@ xlnx_pcib_write_config(device_t dev, u_int bus, u_int slot, offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg); - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); /* * 32-bit access used due to a bug in the Xilinx bridge that diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index d3b83eb8b94b..983eb8b9226f 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -2212,7 +2212,7 @@ nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode, NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); - if (rlen == 0) { + if (rlen <= 0 || rlen > len) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { @@ -5599,7 +5599,7 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, } *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(20); /* Max operations */ - *tl++ = txdr_unsigned(64); /* Max slots */ + *tl++ = txdr_unsigned(NFSV4_SLOTS); /* Max slots */ *tl = 0; /* No rdma ird */ /* Fill in back channel attributes. */ @@ -5668,6 +5668,11 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, sep->nfsess_maxcache = fxdr_unsigned(int, *tl++); tl++; sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++); + if (sep->nfsess_foreslots == 0) { + error = NFSERR_BADXDR; + goto nfsmout; + } else if (sep->nfsess_foreslots > NFSV4_SLOTS) + sep->nfsess_foreslots = NFSV4_SLOTS; NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots); irdcnt = fxdr_unsigned(int, *tl); if (irdcnt < 0 || irdcnt > 1) { @@ -5681,6 +5686,8 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl += 5; sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl); + if (sep->nfsess_backslots > NFSV4_CBSLOTS) + sep->nfsess_backslots = NFSV4_CBSLOTS; NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots); } error = nd->nd_repstat; @@ -5800,7 +5807,8 @@ nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype, NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); stripecnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt); - if (stripecnt < 1 || stripecnt > 4096) { + if (stripecnt >= MHLEN / NFSX_UNSIGNED || + stripecnt < 1) { printf("pNFS File layout devinfo stripecnt %d:" " out of range\n", stripecnt); error = NFSERR_BADXDR; @@ -7249,7 +7257,7 @@ nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen); - if (rlen == 0) { + if (rlen <= 0 || rlen > len) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { @@ -8246,7 +8254,7 @@ nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp, NFSPROC_T *p) { uint32_t *tl; - char *cp, *str, str0[NFSV4_SMALLSTR + 1]; + char *str, str0[NFSV4_SMALLSTR + 1]; uint32_t len = 0; int error = 0; @@ -8269,9 +8277,9 @@ nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp, str = malloc(len + 1, M_TEMP, M_WAITOK); else str = str0; - NFSM_DISSECT(cp, char *, NFSM_RNDUP(len)); - NFSBCOPY(cp, str, len); - str[len] = '\0'; + error = nfsrv_mtostr(nd, str, len); + if (error != 0) + goto nfsmout; NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str); if (dogrp != 0) error = nfsv4_strtogid(nd, str, len, gidp); diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index 6f3447f26620..67af0cf71175 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -5138,6 +5138,11 @@ nfsrvd_layoutcommit(struct nfsrv_descript *nd, __unused int isdgram, NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); layouttype = fxdr_unsigned(int, *tl++); maxcnt = fxdr_unsigned(int, *tl); + /* There is no limit in the RFC, so use 1000 as a sanity limit. */ + if (maxcnt < 0 || maxcnt > 1000) { + error = NFSERR_BADXDR; + goto nfsmout; + } if (maxcnt > 0) { layp = malloc(maxcnt + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, layp, maxcnt); diff --git a/sys/i386/include/kexec.h b/sys/i386/include/kexec.h new file mode 100644 index 000000000000..9fbdef38ad2e --- /dev/null +++ b/sys/i386/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _I386_KEXEC_H_ +#define _I386_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _I386_KEXEC_H_ */ diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index e42e7dcf8b44..cd305de1ed44 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -665,4 +665,5 @@ struct sysent sysent[] = { { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ + { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */ }; diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c new file mode 100644 index 000000000000..2efea7dcf9a7 --- /dev/null +++ b/sys/kern/kern_kexec.c @@ -0,0 +1,350 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#ifdef INTRNG +#include <sys/intr.h> +#endif +#include <sys/kexec.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/priv.h> +#include <sys/reboot.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pagequeue.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> + +#include <machine/kexec.h> + +#ifndef KEXEC_MD_PAGES +/* + * Number of MD pages for extra bookkeeping. + * This is a macro because it can be a constant (some architectures make it 0). + * It accepts an argument, which is an array of + * kexec_segment[KEXEC_SEGMENT_MAX]. + */ +#define KEXEC_MD_PAGES(x) 0 +#endif + +/* + * Basic design: + * + * Given an array of "segment descriptors" stage an image to be loaded and + * jumped to at reboot, instead of rebooting via firmware. + * + * Constraints: + * - The segment descriptors' "mem" and "memsz" must each fit within a + * vm_phys_seg segment, which can be obtained via the `vm.phys_segs` sysctl. + * A single segment cannot span multiple vm_phys_seg segments, even if the + * vm_phys_seg segments are adjacent. + * + * Technical details: + * + * Take advantage of the VM subsystem and create a vm_object to hold the staged + * image. When grabbing pages for the object, sort the pages so that if a page + * in the object is located in the physical range of any of the kexec segment + * targets then it gets placed at the pindex corresponding to that physical + * address. This avoids the chance of corruption by writing over the page in + * the final copy, or the need for a copy buffer page. + */ + +static struct kexec_image staged_image; +static vm_offset_t stage_addr; +static vm_object_t kexec_obj; + +static eventhandler_tag kexec_reboot_handler; +static struct mtx kexec_mutex; + +static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments"); + + +static void +kexec_reboot(void *junk __unused, int howto) +{ + if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL) + return; + +#ifdef SMP + cpu_mp_stop(); +#endif /* SMP */ + intr_disable(); + printf("Starting kexec reboot\n"); + + scheduler_stopped = true; + kexec_reboot_md(&staged_image); +} + +MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF); + +/* Sort the segment list once copied in */ +static int +seg_cmp(const void *seg1, const void *seg2) +{ + const struct kexec_segment *s1, *s2; + + s1 = seg1; + s2 = seg2; + + return ((uintptr_t)s1->mem - (uintptr_t)s2->mem); +} + +static bool +segment_fits(struct kexec_segment *seg) +{ + vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem; + + for (int i = 0; i < vm_phys_nsegs; i++) { + if (v >= vm_phys_segs[i].start && + (v + seg->memsz - 1) <= vm_phys_segs[i].end) + return (true); + } + + return (false); +} + +static vm_paddr_t +pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind) +{ + for (int i = count; i > 0; --i) { + if (pind >= segs[i - 1].pindex) + return (ptoa(pind - segs[i-1].pindex) + segs[i - 1].target); + } + + panic("No segment for pindex %ju\n", (uintmax_t)pind); +} + +/* + * For now still tied to the system call, so assumes all memory is userspace. + */ +int +kern_kexec_load(struct thread *td, u_long entry, u_long nseg, + struct kexec_segment *seg, u_long flags) +{ + static int kexec_loading; + struct kexec_segment segtmp[KEXEC_SEGMENT_MAX]; + struct kexec_image *new_image_stage = 0; + vm_object_t new_segments = NULL; + uint8_t *buf; + int err = 0; + int i; + const size_t segsize = nseg * sizeof(struct kexec_segment); + vm_page_t *page_list = 0; + vm_size_t image_count, md_pages, page_count, tmpsize; + vm_offset_t segment_va = 0; + /* + * - Do any sanity checking + * - Load the new segments to temporary + * - Remove the old segments + * - Install the new segments + */ + + if (nseg > KEXEC_SEGMENT_MAX) + return (EINVAL); + + if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0) + return (EBUSY); + + /* Only do error checking if we're installing new segments. */ + if (nseg > 0) { + /* Create the new kexec object before destroying the old one. */ + bzero(&segtmp, sizeof(segtmp)); + err = copyin(seg, segtmp, segsize); + if (err != 0) + goto out; + qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp); + new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO); + /* + * Sanity checking: + * - All segments must not overlap the kernel, so must be fully enclosed + * in a vm_phys_seg (each kexec segment must be in a single + * vm_phys_seg segment, cannot cross even adjacent segments). + */ + image_count = 0; + for (i = 0; i < nseg; i++) { + if (!segment_fits(&segtmp[i]) || + segtmp[i].bufsz > segtmp[i].memsz) { + err = EINVAL; + goto out; + } + new_image_stage->segments[i].pindex = image_count; + new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem; + new_image_stage->segments[i].size = segtmp[i].memsz; + image_count += atop(segtmp[i].memsz); + } + md_pages = KEXEC_MD_PAGES(segtmp); + page_count = image_count + md_pages; + new_segments = vm_object_allocate(OBJT_PHYS, page_count); + page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK); + + /* + * - Grab all pages for all segments (use pindex to slice it) + * - Walk the list (once) + * - At each pindex, check if the target PA that corresponds + * to that index is in the object. If so, swap the pages. + * - At the end of this the list will be "best" sorted. + */ + vm_page_grab_pages_unlocked(new_segments, 0, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO, + page_list, page_count); + + /* Sort the pages to best match the PA */ + VM_OBJECT_WLOCK(new_segments); + for (i = 0; i < image_count; i++) { + vm_page_t curpg, otherpg, tmp; + vm_pindex_t otheridx; + + curpg = page_list[i]; + otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments, + nseg, curpg->pindex)); + otheridx = otherpg->pindex; + + if (otherpg->object == new_segments) { + /* + * Swap 'curpg' and 'otherpg', since 'otherpg' + * is at the PA 'curpg' covers. + */ + vm_radix_remove(&new_segments->rtree, otheridx); + vm_radix_remove(&new_segments->rtree, i); + otherpg->pindex = i; + curpg->pindex = otheridx; + vm_radix_insert(&new_segments->rtree, curpg); + vm_radix_insert(&new_segments->rtree, otherpg); + tmp = curpg; + page_list[i] = otherpg; + page_list[otheridx] = tmp; + } + } + for (i = 0; i < nseg; i++) { + new_image_stage->segments[i].first_page = + vm_radix_lookup(&new_segments->rtree, + new_image_stage->segments[i].pindex); + } + if (md_pages > 0) + new_image_stage->first_md_page = + vm_radix_lookup(&new_segments->rtree, + page_count - md_pages); + else + new_image_stage->first_md_page = NULL; + VM_OBJECT_WUNLOCK(new_segments); + + /* Map the object to do the copies */ + err = vm_map_find(kernel_map, new_segments, 0, &segment_va, + ptoa(page_count), 0, VMFS_ANY_SPACE, + VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT); + if (err != 0) + goto out; + buf = (void *)segment_va; + new_image_stage->map_addr = segment_va; + new_image_stage->map_size = ptoa(new_segments->size); + new_image_stage->entry = entry; + new_image_stage->map_obj = new_segments; + for (i = 0; i < nseg; i++) { + err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz); + if (err != 0) { + goto out; + } + new_image_stage->segments[i].map_buf = buf; + buf += segtmp[i].bufsz; + tmpsize = segtmp[i].memsz - segtmp[i].bufsz; + if (tmpsize > 0) + memset(buf, 0, tmpsize); + buf += tmpsize; + } + /* What's left are the MD pages, so zero them all out. */ + if (md_pages > 0) + bzero(buf, ptoa(md_pages)); + + cpu_flush_dcache((void *)segment_va, ptoa(page_count)); + if ((err = kexec_load_md(new_image_stage)) != 0) + goto out; + } + if (kexec_obj != NULL) { + vm_object_unwire(kexec_obj, 0, kexec_obj->size, 0); + KASSERT(stage_addr != 0, ("Mapped kexec_obj without address")); + vm_map_remove(kernel_map, stage_addr, stage_addr + kexec_obj->size); + } + kexec_obj = new_segments; + bzero(&staged_image, sizeof(staged_image)); + if (nseg > 0) + memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage)); + + printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry); + if (nseg > 0) { + if (kexec_reboot_handler == NULL) + kexec_reboot_handler = + EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL, + SHUTDOWN_PRI_DEFAULT - 150); + } else { + if (kexec_reboot_handler != NULL) + EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler); + } +out: + /* Clean up the mess if we've gotten far. */ + if (err != 0 && new_segments != NULL) { + vm_object_unwire(new_segments, 0, new_segments->size, 0); + if (segment_va != 0) + vm_map_remove(kernel_map, segment_va, segment_va + kexec_obj->size); + else + vm_object_deallocate(new_segments); + } + atomic_store_rel_int(&kexec_loading, false); + if (new_image_stage != NULL) + free(new_image_stage, M_TEMP); + if (page_list != 0) + free(page_list, M_TEMP); + + return (err); +} + +int +sys_kexec_load(struct thread *td, struct kexec_load_args *uap) +{ + int error; + + // FIXME: Do w need a better privilege check than PRIV_REBOOT here? + error = priv_check(td, PRIV_REBOOT); + if (error != 0) + return (error); + return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags)); +} diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 1f9577fddf9c..9f5106316018 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type) KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 - || type == IPI_SUSPEND + || type == IPI_SUSPEND || type == IPI_OFF #endif , ("%s: invalid stop type", __func__)); @@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_lock_spin(&smp_ipi_mtx); #endif @@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type) #endif #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) cpus = &suspended_cpus; else #endif @@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type) } #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_unlock_spin(&smp_ipi_mtx); #endif @@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map) return (generic_stop_cpus(map, IPI_SUSPEND)); } + +int +offline_cpus(cpuset_t map) +{ + + return (generic_stop_cpus(map, IPI_OFF)); +} #endif /* diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 4cef89cd5219..06a4adc3d8cb 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -604,4 +604,5 @@ const char *syscallnames[] = { "setgroups", /* 596 = setgroups */ "jail_attach_jd", /* 597 = jail_attach_jd */ "jail_remove_jd", /* 598 = jail_remove_jd */ + "kexec_load", /* 599 = kexec_load */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 967af1f5313c..ea6d2b5aa1ef 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3394,4 +3394,12 @@ ); } +599 AUE_NULL STD { + int kexec_load( + uint64_t entry, + u_long nseg, + _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments, + u_long flags + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index e28fef931ea8..5951cebbe74a 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3514,6 +3514,16 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 1; break; } + /* kexec_load */ + case 599: { + struct kexec_load_args *p = params; + uarg[a++] = p->entry; /* uint64_t */ + uarg[a++] = p->nseg; /* u_long */ + uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */ + uarg[a++] = p->flags; /* u_long */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -9401,6 +9411,25 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* kexec_load */ + case 599: + switch (ndx) { + case 0: + p = "uint64_t"; + break; + case 1: + p = "u_long"; + break; + case 2: + p = "userland struct kexec_segment *"; + break; + case 3: + p = "u_long"; + break; + default: + break; + }; + break; default: break; }; @@ -11409,6 +11438,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* kexec_load */ + case 599: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile index 4ff023e6bca5..a645c1673167 100644 --- a/sys/modules/dummynet/Makefile +++ b/sys/modules/dummynet/Makefile @@ -1,7 +1,6 @@ .PATH: ${SRCTOP}/sys/netpfil/ipfw KMOD= dummynet -SRCS= ip_dummynet.c -SRCS+= ip_dn_glue.c ip_dn_io.c +SRCS= ip_dummynet.c ip_dn_io.c SRCS+= dn_aqm_codel.c dn_aqm_pie.c SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c index fdf39690160b..2333b9ea8678 100644 --- a/sys/net/altq/altq_cbq.c +++ b/sys/net/altq/altq_cbq.c @@ -173,6 +173,8 @@ cbq_request(struct ifaltq *ifq, int req, void *arg) static void get_class_stats(class_stats_t *statsp, struct rm_class *cl) { + memset(statsp, 0, sizeof(*statsp)); + statsp->xmit_cnt = cl->stats_.xmit_cnt; statsp->drop_cnt = cl->stats_.drop_cnt; statsp->over = cl->stats_.over; diff --git a/sys/net/altq/altq_fairq.c b/sys/net/altq/altq_fairq.c index 6069865101a0..0a00168e547e 100644 --- a/sys/net/altq/altq_fairq.c +++ b/sys/net/altq/altq_fairq.c @@ -857,6 +857,8 @@ get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl) { fairq_bucket_t *b; + memset(sp, 0, sizeof(*sp)); + sp->class_handle = cl->cl_handle; sp->qlimit = cl->cl_qlimit; sp->xmit_cnt = cl->cl_xmitcnt; diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c index 026346639b2e..fec488418546 100644 --- a/sys/net/altq/altq_priq.c +++ b/sys/net/altq/altq_priq.c @@ -597,6 +597,8 @@ priq_purgeq(struct priq_class *cl) static void get_class_stats(struct priq_classstats *sp, struct priq_class *cl) { + memset(sp, 0, sizeof(*sp)); + sp->class_handle = cl->cl_handle; sp->qlength = qlen(cl->cl_q); sp->qlimit = qlimit(cl->cl_q); diff --git a/sys/net/if.c b/sys/net/if.c index b6a798aa0fab..cb9c47c14c32 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -2842,15 +2842,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) break; case SIOCAIFGROUP: + { + const char *groupname; + error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); - error = if_addgroup(ifp, - ((struct ifgroupreq *)data)->ifgr_group); + groupname = ((struct ifgroupreq *)data)->ifgr_group; + if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ) + return (EINVAL); + error = if_addgroup(ifp, groupname); if (error != 0) return (error); break; - + } case SIOCGIFGROUP: { struct epoch_tracker et; @@ -2862,15 +2867,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) } case SIOCDIFGROUP: + { + const char *groupname; + error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); - error = if_delgroup(ifp, - ((struct ifgroupreq *)data)->ifgr_group); + groupname = ((struct ifgroupreq *)data)->ifgr_group; + if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ) + return (EINVAL); + error = if_delgroup(ifp, groupname); if (error != 0) return (error); break; - + } default: error = ENOIOCTL; break; @@ -3014,9 +3024,17 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) goto out_noref; case SIOCGIFGMEMB: - error = if_getgroupmembers((struct ifgroupreq *)data); - goto out_noref; + { + struct ifgroupreq *req; + req = (struct ifgroupreq *)data; + if (strnlen(req->ifgr_name, IFNAMSIZ) == IFNAMSIZ) { + error = EINVAL; + goto out_noref; + } + error = if_getgroupmembers(req); + goto out_noref; + } #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: diff --git a/sys/net/if_var.h b/sys/net/if_var.h index f2df612b19c1..961259bb0ca1 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -383,18 +383,18 @@ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; - CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ - CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ + CK_STAILQ_HEAD(, ifg_member) ifg_members; + CK_STAILQ_ENTRY(ifg_group) ifg_next; }; struct ifg_member { - CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ + CK_STAILQ_ENTRY(ifg_member) ifgm_next; if_t ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; - CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ + CK_STAILQ_ENTRY(ifg_list) ifgl_next; }; #ifdef _SYS_EVENTHANDLER_H_ diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 66070faf97e9..bfe608be6b36 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -680,7 +680,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ - case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else @@ -747,9 +746,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: - case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c deleted file mode 100644 index 0412b730e4df..000000000000 --- a/sys/netpfil/ipfw/ip_dn_glue.c +++ /dev/null @@ -1,858 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * - * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 - */ - -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> - -#include <netpfil/ipfw/ip_fw_private.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#ifdef NEW_AQM -#include <netpfil/ipfw/dn_aqm.h> -#endif -#include <netpfil/ipfw/dn_sched.h> - -/* FREEBSD7.2 ip_dummynet.h r191715*/ - -struct dn_heap_entry7 { - int64_t key; /* sorting key. Topmost element is smallest one */ - void *object; /* object pointer */ -}; - -struct dn_heap7 { - int size; - int elements; - int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry7 *p; /* really an array of "size" entries */ -}; - -/* Common to 7.2 and 8 */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DNOLD_HAVE_FLOW_MASK 0x0001 -#define DNOLD_IS_RED 0x0002 -#define DNOLD_IS_GENTLE_RED 0x0004 -#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ -#define DNOLD_IS_PIPE 0x4000 -#define DNOLD_IS_QUEUE 0x8000 - - struct dn_pipe7 *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ - int qsize ; /* queue size in slots or bytes */ - int plr[4] ; /* pkt loss rate (2^31-1 means 100%) */ - - struct ipfw_flow_id flow_mask ; - - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue7 **rq ; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ -#define SCALE_RED 16 -#define SCALE(x) ( (x) << SCALE_RED ) -#define SCALE_VAL(x) ( (x) >> SCALE_RED ) -#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ -}; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); - -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 -struct dn_flow_queue7 { - struct dn_flow_queue7 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - u_long numbytes; - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - u_int32_t q_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe7 { /* a pipe */ - SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - uint32_t bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - int numbytes; - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - - /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. - */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ -}; -SLIST_HEAD(dn_pipe_head7, dn_pipe7); - -/* FREEBSD8 ip_dummynet.h r196045 */ -struct dn_flow_queue8 { - struct dn_flow_queue8 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - int64_t idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe8 { /* a pipe */ - SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - uint32_t bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). */ - uint64_t burst; /* burst size, scaled: bits * hz */ - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - int64_t idle_time; /* start of pipe idle time */ - - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ - - /* fields to simulate a delay profile */ -#define ED_MAX_NAME_LEN 32 - char name[ED_MAX_NAME_LEN]; - int loss_level; - int samples_no; - int *samples; -}; - -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max8 { - struct dn_pipe8 pipe; - int samples[ED_MAX_SAMPLES_NO]; -}; -SLIST_HEAD(dn_pipe_head8, dn_pipe8); - -/* - * Changes from 7.2 to 8: - * dn_pipe: - * numbytes from int to int64_t - * add burst (int64_t) - * add idle_time (int64_t) - * add profile - * add struct dn_pipe_max - * add flag DN_HAS_PROFILE - * - * dn_flow_queue - * numbytes from u_long to int64_t - * add extra_bits (int64_t) - * q_time from u_int32_t to int64_t and name idle_time - * - * dn_flow_set unchanged - * - */ - -/* NOTE:XXX copied from dummynet.c */ -#define O_NEXT(p, len) ((void *)((char *)p + len)) -static void -oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) -{ - oid->len = len; - oid->type = type; - oid->subtype = 0; - oid->id = id; -} -/* make room in the buffer and move the pointer forward */ -static void * -o_next(struct dn_id **o, int len, int type) -{ - struct dn_id *ret = *o; - oid_fill(ret, len, type, 0); - *o = O_NEXT(*o, len); - return ret; -} - -static size_t pipesize7 = sizeof(struct dn_pipe7); -static size_t pipesize8 = sizeof(struct dn_pipe8); -static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); - -/* Indicate 'ipfw' version - * 1: from FreeBSD 7.2 - * 0: from FreeBSD 8 - * -1: unknown (for now is unused) - * - * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives - * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, - * it is suppose to be the FreeBSD 8 version. - */ -static int is7 = 0; - -static int -convertflags2new(int src) -{ - int dst = 0; - - if (src & DNOLD_HAVE_FLOW_MASK) - dst |= DN_HAVE_MASK; - if (src & DNOLD_QSIZE_IS_BYTES) - dst |= DN_QSIZE_BYTES; - if (src & DNOLD_NOERROR) - dst |= DN_NOERROR; - if (src & DNOLD_IS_RED) - dst |= DN_IS_RED; - if (src & DNOLD_IS_GENTLE_RED) - dst |= DN_IS_GENTLE_RED; - if (src & DNOLD_HAS_PROFILE) - dst |= DN_HAS_PROFILE; - - return dst; -} - -static int -convertflags2old(int src) -{ - int dst = 0; - - if (src & DN_HAVE_MASK) - dst |= DNOLD_HAVE_FLOW_MASK; - if (src & DN_IS_RED) - dst |= DNOLD_IS_RED; - if (src & DN_IS_GENTLE_RED) - dst |= DNOLD_IS_GENTLE_RED; - if (src & DN_NOERROR) - dst |= DNOLD_NOERROR; - if (src & DN_HAS_PROFILE) - dst |= DNOLD_HAS_PROFILE; - if (src & DN_QSIZE_BYTES) - dst |= DNOLD_QSIZE_IS_BYTES; - - return dst; -} - -static int -dn_compat_del(void *v) -{ - struct dn_pipe7 *p = (struct dn_pipe7 *) v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; - struct { - struct dn_id oid; - uintptr_t a[1]; /* add more if we want a list */ - } cmd; - - /* XXX DN_API_VERSION ??? */ - oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); - - if (is7) { - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL; - } else { - if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) - return EINVAL; - if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) - return EINVAL; - } - - if (p->pipe_nr != 0) { /* pipe x delete */ - cmd.a[0] = p->pipe_nr; - cmd.oid.subtype = DN_LINK; - } else { /* queue x delete */ - cmd.oid.subtype = DN_FS; - cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; - } - - return do_config(&cmd, cmd.oid.len); -} - -static int -dn_compat_config_queue(struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - struct dn_flow_set *f; - - if (is7) - f = &p7->fs; - else - f = &p8->fs; - - fs->fs_nr = f->fs_nr; - fs->sched_nr = f->parent_nr; - fs->flow_mask = f->flow_mask; - fs->buckets = f->rq_size; - fs->qsize = f->qsize; - fs->plr[0] = f->plr[0]; - fs->plr[1] = f->plr[1]; - fs->plr[2] = f->plr[2]; - fs->plr[3] = f->plr[3]; - fs->par[0] = f->weight; - fs->flags = convertflags2new(f->flags_fs); - if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { - fs->w_q = f->w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->max_p; - } - - return 0; -} - -static int -dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, - struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - int i = p7->pipe_nr; - - sch->sched_nr = i; - sch->oid.subtype = 0; - p->link_nr = i; - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Common to 7 and 8 */ - p->bandwidth = p7->bandwidth; - p->delay = p7->delay; - if (!is7) { - /* FreeBSD 8 has burst */ - p->burst = p8->burst; - } - - /* fill the fifo flowset */ - dn_compat_config_queue(fs, v); - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Move scheduler related parameter from fs to sch */ - sch->buckets = fs->buckets; /*XXX*/ - fs->buckets = 0; - if (fs->flags & DN_HAVE_MASK) { - sch->flags |= DN_HAVE_MASK; - fs->flags &= ~DN_HAVE_MASK; - sch->sched_mask = fs->flow_mask; - bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); - } - - return 0; -} - -static int -dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, - void *v) -{ - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); - - pf->link_nr = p->link_nr; - pf->loss_level = p8->loss_level; -// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? - pf->samples_no = p8->samples_no; - strncpy(pf->name, p8->name,sizeof(pf->name)); - bcopy(p8->samples, pf->samples, sizeof(pf->samples)); - - return 0; -} - -/* - * If p->pipe_nr != 0 the command is 'pipe x config', so need to create - * the three main struct, else only a flowset is created - */ -static int -dn_compat_configure(void *v) -{ - struct dn_id *buf = NULL, *base; - struct dn_sch *sch = NULL; - struct dn_link *p = NULL; - struct dn_fs *fs = NULL; - struct dn_profile *pf = NULL; - int lmax; - int error; - - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - int i; /* number of object to configure */ - - lmax = sizeof(struct dn_id); /* command header */ - lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + - sizeof(struct dn_fs) + sizeof(struct dn_profile); - - base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); - o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); - base->id = DN_API_VERSION; - - /* pipe_nr is the same in p7 and p8 */ - i = p7->pipe_nr; - if (i != 0) { /* pipe config */ - sch = o_next(&buf, sizeof(*sch), DN_SCH); - p = o_next(&buf, sizeof(*p), DN_LINK); - fs = o_next(&buf, sizeof(*fs), DN_FS); - - error = dn_compat_config_pipe(sch, p, fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - if (!is7 && p8->samples_no > 0) { - /* Add profiles*/ - pf = o_next(&buf, sizeof(*pf), DN_PROFILE); - error = dn_compat_config_profile(pf, p, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - } else { /* queue config */ - fs = o_next(&buf, sizeof(*fs), DN_FS); - error = dn_compat_config_queue(fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - error = do_config(base, (char *)buf - (char *)base); - - if (buf) - free(buf, M_DUMMYNET); - return error; -} - -int -dn_compat_calc_size(void) -{ - int need = 0; - /* XXX use FreeBSD 8 struct size */ - /* NOTE: - * - half scheduler: schk_count/2 - * - all flowset: fsk_count - * - all flowset queues: queue_count - * - all pipe queue: si_count - */ - need += V_dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; - need += V_dn_cfg.fsk_count * sizeof(struct dn_flow_set); - need += V_dn_cfg.si_count * sizeof(struct dn_flow_queue8); - need += V_dn_cfg.queue_count * sizeof(struct dn_flow_queue8); - - return need; -} - -int -dn_c_copy_q (void *_ni, void *arg) -{ - struct copy_args *a = arg; - struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; - struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; - struct dn_flow *ni = (struct dn_flow *)_ni; - int size = 0; - - /* XXX hash slot not set */ - /* No difference between 7.2/8 */ - fq7->len = ni->length; - fq7->len_bytes = ni->len_bytes; - fq7->id = ni->fid; - - if (is7) { - size = sizeof(struct dn_flow_queue7); - fq7->tot_pkts = ni->tot_pkts; - fq7->tot_bytes = ni->tot_bytes; - fq7->drops = ni->drops; - } else { - size = sizeof(struct dn_flow_queue8); - fq8->tot_pkts = ni->tot_pkts; - fq8->tot_bytes = ni->tot_bytes; - fq8->drops = ni->drops; - } - - *a->start += size; - return 0; -} - -int -dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) -{ - struct dn_link *l = &s->link; - struct dn_fsk *f = s->fs; - - struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; - struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; - struct dn_flow_set *fs; - int size = 0; - - if (is7) { - fs = &pipe7->fs; - size = sizeof(struct dn_pipe7); - } else { - fs = &pipe8->fs; - size = sizeof(struct dn_pipe8); - } - - /* These 4 field are the same in pipe7 and pipe8 */ - pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; - pipe7->bandwidth = l->bandwidth; - pipe7->delay = l->delay * 1000 / hz; - pipe7->pipe_nr = l->link_nr - DN_MAX_ID; - - if (!is7) { - if (s->profile) { - struct dn_profile *pf = s->profile; - strncpy(pipe8->name, pf->name, sizeof(pf->name)); - pipe8->loss_level = pf->loss_level; - pipe8->samples_no = pf->samples_no; - } - pipe8->burst = div64(l->burst , 8 * hz); - } - - fs->flow_mask = s->sch.sched_mask; - fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; - - fs->parent_nr = l->link_nr - DN_MAX_ID; - fs->qsize = f->fs.qsize; - fs->plr[0] = f->fs.plr[0]; - fs->plr[1] = f->fs.plr[1]; - fs->plr[2] = f->fs.plr[2]; - fs->plr[3] = f->fs.plr[3]; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->rq_elements = nq; - - fs->flags_fs = convertflags2old(f->fs.flags); - - *a->start += size; - return 0; -} - -int -dn_compat_copy_pipe(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int pipe_size = sizeof(struct dn_pipe8); - int queue_size = sizeof(struct dn_flow_queue8); - int n_queue = 0; /* number of queues */ - - struct dn_schk *s = (struct dn_schk *)_o; - /* calculate needed space: - * - struct dn_pipe - * - if there are instances, dn_queue * n_instances - */ - n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : - (s->siht ? 1 : 0)); - need = pipe_size + queue_size * n_queue; - if (have < need) { - D("have %d < need %d", have, need); - return 1; - } - /* copy pipe */ - dn_c_copy_pipe(s, a, n_queue); - - /* copy queues */ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, dn_c_copy_q, a); - else if (s->siht) - dn_c_copy_q(s->siht, a); - return 0; -} - -int -dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) -{ - struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; - - fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs->fs_nr = f->fs.fs_nr; - fs->qsize = f->fs.qsize; - fs->plr[0] = f->fs.plr[0]; - fs->plr[1] = f->fs.plr[1]; - fs->plr[2] = f->fs.plr[2]; - fs->plr[3] = f->fs.plr[3]; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->flow_mask = f->fs.flow_mask; - fs->rq_elements = nq; - fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); - fs->parent_nr = f->fs.sched_nr; - fs->weight = f->fs.par[0]; - - fs->flags_fs = convertflags2old(f->fs.flags); - *a->start += sizeof(struct dn_flow_set); - return 0; -} - -int -dn_compat_copy_queue(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int fs_size = sizeof(struct dn_flow_set); - int queue_size = sizeof(struct dn_flow_queue8); - - struct dn_fsk *fs = (struct dn_fsk *)_o; - int n_queue = 0; /* number of queues */ - - n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : - (fs->qht ? 1 : 0)); - - need = fs_size + queue_size * n_queue; - if (have < need) { - D("have < need"); - return 1; - } - - /* copy flowset */ - dn_c_copy_fs(fs, a, n_queue); - - /* copy queues */ - if (fs->fs.flags & DN_HAVE_MASK) - dn_ht_scan(fs->qht, dn_c_copy_q, a); - else if (fs->qht) - dn_c_copy_q(fs->qht, a); - - return 0; -} - -int -copy_data_helper_compat(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - - if (a->type == DN_COMPAT_PIPE) { - struct dn_schk *s = _o; - if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { - return 0; /* not old type */ - } - /* copy pipe parameters, and if instance exists, copy - * other parameters and eventually queues. - */ - if(dn_compat_copy_pipe(a, _o)) - return DNHT_SCAN_END; - } else if (a->type == DN_COMPAT_QUEUE) { - struct dn_fsk *fs = _o; - if (fs->fs.fs_nr >= DN_MAX_ID) - return 0; - if (dn_compat_copy_queue(a, _o)) - return DNHT_SCAN_END; - } - return 0; -} - -/* Main function to manage old requests */ -int -ip_dummynet_compat(struct sockopt *sopt) -{ - int error=0; - void *v = NULL; - struct dn_id oid; - - /* Length of data, used to found ipfw version... */ - int len = sopt->sopt_valsize; - - /* len can be 0 if command was dummynet_flush */ - if (len == pipesize7) { - D("setting compatibility with FreeBSD 7.2"); - is7 = 1; - } - else if (len == pipesize8 || len == pipesizemax8) { - D("setting compatibility with FreeBSD 8"); - is7 = 0; - } - - switch (sopt->sopt_name) { - default: - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); - do_config(&oid, oid.len); - break; - - case IP_DUMMYNET_DEL: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_del(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_CONFIGURE: - v = malloc(len, M_TEMP, M_NOWAIT); - if (v == NULL) { - error = ENOMEM; - break; - } - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_configure(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_GET: { - void *buf; - int ret; - int original_size = sopt->sopt_valsize; - int size; - - ret = dummynet_get(sopt, &buf); - if (ret) - return 0;//XXX ? - size = sopt->sopt_valsize; - sopt->sopt_valsize = original_size; - D("size=%d, buf=%p", size, buf); - ret = sooptcopyout(sopt, buf, size); - if (ret) - printf(" %s ERROR sooptcopyout\n", __FUNCTION__); - if (buf) - free(buf, M_DUMMYNET); - } - } - - return error; -} diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h index 756a997b6ec3..9a43b86791e0 100644 --- a/sys/netpfil/ipfw/ip_dn_private.h +++ b/sys/netpfil/ipfw/ip_dn_private.h @@ -437,15 +437,7 @@ struct copy_args { }; struct sockopt; -int ip_dummynet_compat(struct sockopt *sopt); -int dummynet_get(struct sockopt *sopt, void **compat); -int dn_c_copy_q (void *_ni, void *arg); -int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); -int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); -int dn_compat_copy_queue(struct copy_args *a, void *_o); -int dn_compat_copy_pipe(struct copy_args *a, void *_o); -int copy_data_helper_compat(void *_o, void *_arg); -int dn_compat_calc_size(void); +int dummynet_get(struct sockopt *sopt); int do_config(void *p, size_t l); /* function to drain idle object */ diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index d522f9da0fbe..61442c617753 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -2198,9 +2198,6 @@ compute_space(struct dn_id *cmd, struct copy_args *a) case DN_FS: /* queue show */ x = DN_C_FS | DN_C_QUEUE; break; - case DN_GET_COMPAT: /* compatibility mode */ - need = dn_compat_calc_size(); - break; } a->flags = x; if (x & DN_C_SCH) { @@ -2226,11 +2223,9 @@ compute_space(struct dn_id *cmd, struct copy_args *a) } /* - * If compat != NULL dummynet_get is called in compatibility mode. - * *compat will be the pointer to the buffer to pass to ipfw */ int -dummynet_get(struct sockopt *sopt, void **compat) +dummynet_get(struct sockopt *sopt) { int have, i, need, error; char *start = NULL, *buf; @@ -2248,37 +2243,28 @@ dummynet_get(struct sockopt *sopt, void **compat) cmd = &r.o; - if (!compat) { - /* copy at least an oid, and possibly a full object */ - error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - l = cmd->len; + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; #ifdef EMULATE_SYSCTL - /* sysctl emulation. */ - if (cmd->type == DN_SYSCTL_GET) - return kesysctl_emu_get(sopt); + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); #endif - if (l > sizeof(r)) { - /* request larger than default, allocate buffer */ - cmd = malloc(l, M_DUMMYNET, M_NOWAIT); - if (cmd == NULL) { - error = ENOMEM; - goto done; - } - error = sooptcopyin(sopt, cmd, l, l); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_NOWAIT); + if (cmd == NULL) { + error = ENOMEM; + goto done; } - } else { /* compatibility */ - error = 0; - cmd->type = DN_CMD_GET; - cmd->len = sizeof(struct dn_id); - cmd->subtype = DN_GET_COMPAT; - // cmd->id = sopt_valsize; - D("compatibility mode"); + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; } #ifdef NEW_AQM @@ -2337,12 +2323,7 @@ dummynet_get(struct sockopt *sopt, void **compat) } if (start == NULL) { - if (compat) { - *compat = NULL; - error = 1; // XXX - } else { - error = sooptcopyout(sopt, cmd, sizeof(*cmd)); - } + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); goto done; } ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " @@ -2355,35 +2336,20 @@ dummynet_get(struct sockopt *sopt, void **compat) sopt->sopt_valsize = sopt_valsize; a.type = cmd->subtype; - if (compat == NULL) { - memcpy(start, cmd, sizeof(*cmd)); - ((struct dn_id*)(start))->len = sizeof(struct dn_id); - buf = start + sizeof(*cmd); - } else - buf = start; + memcpy(start, cmd, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); a.start = &buf; a.end = start + have; /* start copying other objects */ - if (compat) { - a.type = DN_COMPAT_PIPE; - dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper_compat, &a); - a.type = DN_COMPAT_QUEUE; - dn_ht_scan(V_dn_cfg.fshash, copy_data_helper_compat, &a); - } else if (a.type == DN_FS) { + if (a.type == DN_FS) { dn_ht_scan(V_dn_cfg.fshash, copy_data_helper, &a); } else { dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper, &a); } DN_BH_WUNLOCK(); - if (compat) { - *compat = start; - sopt->sopt_valsize = buf - start; - /* free() is done by ip_dummynet_compat() */ - start = NULL; //XXX hack - } else { - error = sooptcopyout(sopt, start, buf - start); - } + error = sooptcopyout(sopt, start, buf - start); done: if (cmd != &r.o) free(cmd, M_DUMMYNET); @@ -2519,17 +2485,9 @@ ip_dn_ctl(struct sockopt *sopt) error = EINVAL; break; - case IP_DUMMYNET_FLUSH: - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: /* remove a pipe or queue */ - case IP_DUMMYNET_GET: - D("dummynet: compat option %d", sopt->sopt_name); - error = ip_dummynet_compat(sopt); - break; - case IP_DUMMYNET3: if (sopt->sopt_dir == SOPT_GET) { - error = dummynet_get(sopt, NULL); + error = dummynet_get(sopt); break; } l = sopt->sopt_valsize; diff --git a/sys/powerpc/include/kexec.h b/sys/powerpc/include/kexec.h new file mode 100644 index 000000000000..a57c50926696 --- /dev/null +++ b/sys/powerpc/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _POWERPC_KEXEC_H_ +#define _POWERPC_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _POWERPC_KEXEC_H_ */ diff --git a/sys/riscv/include/kexec.h b/sys/riscv/include/kexec.h new file mode 100644 index 000000000000..5fb6fd321989 --- /dev/null +++ b/sys/riscv/include/kexec.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RISCV_KEXEC_H_ +#define _RISCV_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _RISCV_KEXEC_H_ */ diff --git a/sys/security/audit/audit_bsm_db.c b/sys/security/audit/audit_bsm_db.c index c9f3d5c8a549..358162544287 100644 --- a/sys/security/audit/audit_bsm_db.c +++ b/sys/security/audit/audit_bsm_db.c @@ -56,6 +56,8 @@ #include <security/audit/audit.h> #include <security/audit/audit_private.h> +#include <contrib/ck/include/ck_queue.h> + /* * Hash table functions for the audit event number to event class mask * mapping. @@ -64,21 +66,21 @@ struct evclass_elem { au_event_t event; au_class_t class; - LIST_ENTRY(evclass_elem) entry; + CK_LIST_ENTRY(evclass_elem) entry; }; struct evclass_list { - LIST_HEAD(, evclass_elem) head; + CK_LIST_HEAD(, evclass_elem) head; }; static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class"); -static struct rwlock evclass_lock; static struct evclass_list evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE]; - -#define EVCLASS_LOCK_INIT() rw_init(&evclass_lock, "evclass_lock") -#define EVCLASS_RLOCK() rw_rlock(&evclass_lock) -#define EVCLASS_RUNLOCK() rw_runlock(&evclass_lock) -#define EVCLASS_WLOCK() rw_wlock(&evclass_lock) -#define EVCLASS_WUNLOCK() rw_wunlock(&evclass_lock) +static struct mtx evclass_mtx; +#define EVCLASS_LOCK_INIT() mtx_init(&evclass_mtx, "evclass_lock", NULL, MTX_DEF) +#define EVCLASS_WLOCK() mtx_lock(&evclass_mtx); +#define EVCLASS_WUNLOCK() mtx_unlock(&evclass_mtx); +/* make these do something if we ever remove entries from the hash */ +#define EVCLASS_RLOCK() {} +#define EVCLASS_RUNLOCK() {} /* * Hash table maintaining a mapping from audit event numbers to audit event @@ -118,7 +120,7 @@ au_event_class(au_event_t event) EVCLASS_RLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; class = 0; - LIST_FOREACH(evc, &evcl->head, entry) { + CK_LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { class = evc->class; goto out; @@ -150,7 +152,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) EVCLASS_WLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; - LIST_FOREACH(evc, &evcl->head, entry) { + CK_LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { evc->class = class; EVCLASS_WUNLOCK(); @@ -161,7 +163,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) evc = evc_new; evc->event = event; evc->class = class; - LIST_INSERT_HEAD(&evcl->head, evc, entry); + CK_LIST_INSERT_HEAD(&evcl->head, evc, entry); EVCLASS_WUNLOCK(); } @@ -172,7 +174,7 @@ au_evclassmap_init(void) EVCLASS_LOCK_INIT(); for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++) - LIST_INIT(&evclass_hash[i].head); + CK_LIST_INIT(&evclass_hash[i].head); /* * Set up the initial event to class mapping for system calls. diff --git a/sys/sys/kexec.h b/sys/sys/kexec.h new file mode 100644 index 000000000000..478193749368 --- /dev/null +++ b/sys/sys/kexec.h @@ -0,0 +1,81 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_KEXEC_H_ +#define _SYS_KEXEC_H_ + +#include <sys/types.h> + +struct kexec_segment { + void *buf; + size_t bufsz; + vm_paddr_t mem; + vm_size_t memsz; +}; + +/* Flags (aligned with Linux) */ +#define KEXEC_ON_CRASH 0x1 + +/* Aligned with Linux's limit */ +#define KEXEC_SEGMENT_MAX 16 + +#ifdef _KERNEL +struct kexec_segment_stage { + vm_page_t first_page; + void *map_buf; + vm_paddr_t target; + vm_size_t size; + vm_pindex_t pindex; +}; + +struct kexec_image { + struct kexec_segment_stage segments[KEXEC_SEGMENT_MAX]; + vm_paddr_t entry; + struct vm_object *map_obj; /* Containing object */ + vm_offset_t map_addr; /* Mapped in kernel space */ + vm_size_t map_size; + vm_page_t first_md_page; + void *md_image; +}; + +#endif + +#ifndef _KERNEL + +__BEGIN_DECLS +int kexec_load(uint64_t, unsigned long, struct kexec_segment *, unsigned long); +__END_DECLS + +#else + +void kexec_reboot_md(struct kexec_image *); +int kexec_load_md(struct kexec_image *); + +#endif + +#endif diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h index 26e78632fb2c..50ad2b78083c 100644 --- a/sys/sys/reboot.h +++ b/sys/sys/reboot.h @@ -61,6 +61,7 @@ #define RB_REROOT 0x200000 /* unmount the rootfs and mount it again */ #define RB_POWERCYCLE 0x400000 /* Power cycle if possible */ #define RB_MUTEMSGS 0x800000 /* start up with console muted after banner */ +#define RB_KEXEC 0x1000000 /* Boot new kernel using kexec */ #define RB_PROBE 0x10000000 /* Probe multiple consoles */ #define RB_MULTIPLE 0x20000000 /* use multiple consoles */ diff --git a/sys/sys/smp.h b/sys/sys/smp.h index 252dc9dc1cae..b642a6014f33 100644 --- a/sys/sys/smp.h +++ b/sys/sys/smp.h @@ -251,6 +251,7 @@ void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); +void cpu_mp_stop(void); /* Go back to single-CPU */ void forward_signal(struct thread *); int restart_cpus(cpuset_t); @@ -259,6 +260,7 @@ int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); +int offline_cpus(cpuset_t); #endif void smp_rendezvous_action(void); diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index cff27b8be316..43f46f063e3e 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -537,4 +537,5 @@ #define SYS_setgroups 596 #define SYS_jail_attach_jd 597 #define SYS_jail_remove_jd 598 -#define SYS_MAXSYSCALL 599 +#define SYS_kexec_load 599 +#define SYS_MAXSYSCALL 600 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index 443dbadcfbff..ce29c050885e 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -440,4 +440,5 @@ MIASM = \ getgroups.o \ setgroups.o \ jail_attach_jd.o \ - jail_remove_jd.o + jail_remove_jd.o \ + kexec_load.o diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index d32690634059..8f106150e193 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -47,6 +47,7 @@ struct image_args; struct jail; struct kevent; struct kevent_copyops; +struct kexec_segment; struct kld_file_stat; struct ksiginfo; struct mbuf; @@ -401,6 +402,8 @@ int kern_writev(struct thread *td, int fd, struct uio *auio); int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv); int kern_unmount(struct thread *td, const char *path, int flags); +int kern_kexec_load(struct thread *td, u_long entry, + u_long nseg, struct kexec_segment *seg, u_long flags); /* flags for kern_sigaction */ #define KSA_OSIGSET 0x0001 /* uses osigact_t */ diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 8dda4b4533ea..5f5524a4519b 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1907,6 +1907,12 @@ struct jail_attach_jd_args { struct jail_remove_jd_args { char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; }; +struct kexec_load_args { + char entry_l_[PADL_(uint64_t)]; uint64_t entry; char entry_r_[PADR_(uint64_t)]; + char nseg_l_[PADL_(u_long)]; u_long nseg; char nseg_r_[PADR_(u_long)]; + char segments_l_[PADL_(struct kexec_segment *)]; struct kexec_segment * segments; char segments_r_[PADR_(struct kexec_segment *)]; + char flags_l_[PADL_(u_long)]; u_long flags; char flags_r_[PADR_(u_long)]; +}; int sys__exit(struct thread *, struct _exit_args *); int sys_fork(struct thread *, struct fork_args *); int sys_read(struct thread *, struct read_args *); @@ -2313,6 +2319,7 @@ int sys_getgroups(struct thread *, struct getgroups_args *); int sys_setgroups(struct thread *, struct setgroups_args *); int sys_jail_attach_jd(struct thread *, struct jail_attach_jd_args *); int sys_jail_remove_jd(struct thread *, struct jail_remove_jd_args *); +int sys_kexec_load(struct thread *, struct kexec_load_args *); #ifdef COMPAT_43 @@ -3311,6 +3318,7 @@ int freebsd14_setgroups(struct thread *, struct freebsd14_setgroups_args *); #define SYS_AUE_setgroups AUE_SETGROUPS #define SYS_AUE_jail_attach_jd AUE_JAIL_ATTACH #define SYS_AUE_jail_remove_jd AUE_JAIL_REMOVE +#define SYS_AUE_kexec_load AUE_NULL #undef PAD_ #undef PADL_ diff --git a/sys/tools/gdb/README.txt b/sys/tools/gdb/README.txt index 8c31565ddc42..ad1544912c3c 100644 --- a/sys/tools/gdb/README.txt +++ b/sys/tools/gdb/README.txt @@ -8,6 +8,9 @@ be automatically loaded by kgdb when opening a vmcore, so if you add new GDB commands or functions, that script should be updated to import them, and you should document them here. +When improving these scripts, you can use the "kgdb-reload" command to reload +them from /usr/lib/debug/boot/kernel/gdb/*. + To provide some rudimentary testing, selftest.py tries to exercise all of the commands and functions defined here. To use it, run selftest.sh to panic the system. Then, create a kernel dump or attach to the panicked kernel, and invoke @@ -15,6 +18,8 @@ the script with "python import selftest" in (k)gdb. Commands: acttrace Display a backtrace for all on-CPU threads +kgdb-reload Reload all gdb modules, useful when developing the modules + themselves. Functions: $PCPU(<field>[, <cpuid>]) Display the value of a PCPU/DPCPU field diff --git a/sys/tools/gdb/acttrace.py b/sys/tools/gdb/acttrace.py index 147effbbddf1..fdd18a4833cd 100644 --- a/sys/tools/gdb/acttrace.py +++ b/sys/tools/gdb/acttrace.py @@ -13,10 +13,8 @@ from pcpu import * class acttrace(gdb.Command): """ - Register an acttrace command with gdb. - - When run, acttrace prints the stack trace of all threads that were on-CPU - at the time of the panic. + Print the stack trace of all threads that were on-CPU at the time of + the panic. """ def __init__(self): super(acttrace, self).__init__("acttrace", gdb.COMMAND_USER) diff --git a/sys/tools/gdb/pcpu.py b/sys/tools/gdb/pcpu.py index aadc4b2d42df..94c451e6eca5 100644 --- a/sys/tools/gdb/pcpu.py +++ b/sys/tools/gdb/pcpu.py @@ -9,7 +9,7 @@ from freebsd import * class pcpu(gdb.Function): """ - Register a function to lookup PCPU and DPCPU variables by name. + A function to look up PCPU and DPCPU fields by name. To look up the value of the PCPU field foo on CPU n, use $PCPU("foo", n). This works for DPCPU fields too. If the CPU ID is diff --git a/sys/tools/gdb/vnet.py b/sys/tools/gdb/vnet.py index 36b4d512a3eb..5f416b2a515a 100644 --- a/sys/tools/gdb/vnet.py +++ b/sys/tools/gdb/vnet.py @@ -10,7 +10,7 @@ from freebsd import * class vnet(gdb.Function): """ - Register a function to look up VNET variables by name. + A function to look up VNET variables by name. To look at the value of a VNET variable V_foo, print $V("foo"). The currently selected thread's VNET is used by default, but can be optionally diff --git a/sys/tools/kernel-gdb.py b/sys/tools/kernel-gdb.py index 8a41ef6efab1..990bdaf31fda 100644 --- a/sys/tools/kernel-gdb.py +++ b/sys/tools/kernel-gdb.py @@ -4,12 +4,40 @@ # SPDX-License-Identifier: BSD-2-Clause # +import importlib import os import sys sys.path.append(os.path.join(os.path.dirname(__file__), "gdb")) -# Import FreeBSD kernel debugging commands and modules below. -import acttrace -import pcpu -import vnet +modules = [ + "acttrace", + "freebsd", + "pcpu", + "vnet" +] + + +def reload_modules(modules): + for mod in modules: + if mod in sys.modules: + importlib.reload(sys.modules[mod]) + else: + importlib.import_module(mod) + +reload_modules(modules) + + +class reload(gdb.Command): + """ + Reload the FreeBSD kernel GDB helper scripts. + """ + def __init__(self): + super(reload, self).__init__("kgdb-reload", gdb.COMMAND_USER) + + def invoke(self, arg, from_tty): + reload_modules(modules) + + +# Register the reload command with gdb. +reload() diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index c537d0ee0cdd..551f5527ac00 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -134,7 +134,8 @@ #define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. */ #define IPI_SWI (APIC_IPI_INTS + 8) /* Run clk_intr_event. */ -#define IPI_DYN_FIRST (APIC_IPI_INTS + 9) +#define IPI_OFF (APIC_IPI_INTS + 9) /* Stop CPU forever */ +#define IPI_DYN_FIRST (APIC_IPI_INTS + 10) #define IPI_DYN_LAST (254) /* IPIs allocated at runtime */ /* diff --git a/sys/x86/include/intr_machdep.h b/sys/x86/include/intr_machdep.h index 9e913440c712..497c89b0a7eb 100644 --- a/sys/x86/include/intr_machdep.h +++ b/sys/x86/include/intr_machdep.h @@ -142,6 +142,7 @@ int intr_add_handler(struct intsrc *isrc, const char *name, int intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(struct intsrc *isrc, void *ih, const char *descr); +void intr_disable_all(void); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h index 8b9eb2ec9b66..f5015e9d8a24 100644 --- a/sys/x86/include/x86_smp.h +++ b/sys/x86/include/x86_smp.h @@ -77,6 +77,7 @@ extern u_long *ipi_rendezvous_counts[MAXCPU]; inthand_t IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ IDTVEC(ipi_swi), /* Runs delayed SWI */ + IDTVEC(cpuoff), /* CPU goes offline until hard reset */ IDTVEC(cpustop), /* CPU stops & waits to be restarted */ IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ IDTVEC(rendezvous); /* handle CPU rendezvous */ @@ -93,6 +94,7 @@ void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); +void cpuoff_handler(void); void init_secondary_tail(void); void init_secondary(void); void ipi_startup(int apic_id, int vector); diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c index 023c3df22580..a16d2ced8dba 100644 --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -245,6 +245,26 @@ intr_register_source(struct intsrc *isrc) return (0); } +void +intr_disable_all(void) +{ + /* + * Disable all external interrupts. This is used by kexec_reboot() to + * prevent problems on the other side when APs are brought up. + */ + for (int v = 0; v < num_io_irqs; v++) { + struct intsrc *is; + + is = interrupt_sources[v]; + if (is == NULL) + continue; + if (is->is_pic->pic_disable_intr != NULL) { + is->is_pic->pic_disable_source(is, PIC_EOI); + is->is_pic->pic_disable_intr(is); + } + } +} + struct intsrc * intr_lookup_source(int vector) { diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index c0da41a4d222..6b1715853763 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1696,6 +1696,28 @@ cpususpend_handler(void) CPU_CLR_ATOMIC(cpu, &toresume_cpus); } +void +cpuoff_handler(void) +{ + u_int cpu; + + cpu = PCPU_GET(cpuid); + + /* Time to go catatonic. A reset will be required to leave. */ + disable_intr(); + lapic_disable(); + CPU_SET_ATOMIC(cpu, &suspended_cpus); + + /* + * There technically should be no need for the `while` here, since it + * cannot be interrupted (interrupts are disabled). Be safe anyway. + * Any interrupt at this point will likely be fatal, as the page tables + * are likely going away shortly. + */ + while (1) + halt(); +} + /* * Handle an IPI_SWI by waking delayed SWI thread. */ diff --git a/sys/x86/x86/msi.c b/sys/x86/x86/msi.c index 9d5a51f9753c..b38247bf6e45 100644 --- a/sys/x86/x86/msi.c +++ b/sys/x86/x86/msi.c @@ -219,6 +219,14 @@ msi_disable_intr(struct intsrc *isrc) struct msi_intsrc *msi = (struct msi_intsrc *)isrc; msi = msi->msi_first; + + /* + * Interrupt sources are always registered, but never unregistered. + * Handle the case where MSIs have all been unregistered. + */ + if (msi == NULL) + return; + msi->msi_enabled--; if (msi->msi_enabled == 0) { for (u_int i = 0; i < msi->msi_count; i++) |
