aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/amd64/apic_vector.S11
-rw-r--r--sys/amd64/amd64/genassym.c12
-rw-r--r--sys/amd64/amd64/kexec_support.c300
-rw-r--r--sys/amd64/amd64/kexec_tramp.S91
-rw-r--r--sys/amd64/amd64/mp_machdep.c13
-rw-r--r--sys/amd64/include/kexec.h41
-rw-r--r--sys/amd64/include/smp.h1
-rw-r--r--sys/arm/include/kexec.h38
-rw-r--r--sys/arm64/arm64/kexec_support.c188
-rw-r--r--sys/arm64/arm64/locore.S44
-rw-r--r--sys/arm64/arm64/mp_machdep.c78
-rw-r--r--sys/arm64/include/cpufunc.h7
-rw-r--r--sys/arm64/include/kexec.h33
-rw-r--r--sys/arm64/include/pcpu.h3
-rw-r--r--sys/arm64/include/smp.h1
-rw-r--r--sys/compat/freebsd32/freebsd32_syscall.h2
-rw-r--r--sys/compat/freebsd32/freebsd32_syscalls.c1
-rw-r--r--sys/compat/freebsd32/freebsd32_sysent.c1
-rw-r--r--sys/compat/freebsd32/syscalls.conf3
-rw-r--r--sys/conf/files2
-rw-r--r--sys/conf/files.amd642
-rw-r--r--sys/conf/files.arm641
-rw-r--r--sys/dev/acpica/acpi.c4
-rw-r--r--sys/dev/ice/ice_drv_info.h3
-rw-r--r--sys/dev/psci/psci.c13
-rw-r--r--sys/dev/psci/psci.h1
-rw-r--r--sys/dev/xilinx/xlnx_pcib.c32
-rw-r--r--sys/fs/nfsclient/nfs_clrpcops.c24
-rw-r--r--sys/fs/nfsserver/nfs_nfsdserv.c5
-rw-r--r--sys/i386/include/kexec.h38
-rw-r--r--sys/kern/init_sysent.c1
-rw-r--r--sys/kern/kern_kexec.c350
-rw-r--r--sys/kern/subr_smp.c15
-rw-r--r--sys/kern/syscalls.c1
-rw-r--r--sys/kern/syscalls.master8
-rw-r--r--sys/kern/systrace_args.c34
-rw-r--r--sys/modules/dummynet/Makefile3
-rw-r--r--sys/net/altq/altq_cbq.c2
-rw-r--r--sys/net/altq/altq_fairq.c2
-rw-r--r--sys/net/altq/altq_priq.c2
-rw-r--r--sys/net/if.c34
-rw-r--r--sys/net/if_var.h8
-rw-r--r--sys/netinet/raw_ip.c4
-rw-r--r--sys/netpfil/ipfw/ip_dn_glue.c858
-rw-r--r--sys/netpfil/ipfw/ip_dn_private.h10
-rw-r--r--sys/netpfil/ipfw/ip_dummynet.c96
-rw-r--r--sys/powerpc/include/kexec.h38
-rw-r--r--sys/riscv/include/kexec.h39
-rw-r--r--sys/security/audit/audit_bsm_db.c28
-rw-r--r--sys/sys/kexec.h81
-rw-r--r--sys/sys/reboot.h1
-rw-r--r--sys/sys/smp.h2
-rw-r--r--sys/sys/syscall.h3
-rw-r--r--sys/sys/syscall.mk3
-rw-r--r--sys/sys/syscallsubr.h3
-rw-r--r--sys/sys/sysproto.h8
-rw-r--r--sys/tools/gdb/README.txt5
-rw-r--r--sys/tools/gdb/acttrace.py6
-rw-r--r--sys/tools/gdb/pcpu.py2
-rw-r--r--sys/tools/gdb/vnet.py2
-rw-r--r--sys/tools/kernel-gdb.py36
-rw-r--r--sys/x86/include/apicvar.h3
-rw-r--r--sys/x86/include/intr_machdep.h1
-rw-r--r--sys/x86/include/x86_smp.h2
-rw-r--r--sys/x86/x86/intr_machdep.c20
-rw-r--r--sys/x86/x86/mp_x86.c22
-rw-r--r--sys/x86/x86/msi.c8
67 files changed, 1723 insertions, 1011 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index e98bae9eb6c5..8691387a5a8e 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -204,6 +204,17 @@ IDTVEC(spuriousint)
jmp doreti
/*
+ * Executed by a CPU when it receives an IPI_OFF from another CPU.
+ * Should never return
+ */
+ INTR_HANDLER cpuoff
+ KMSAN_ENTER
+ call cpuoff_handler
+ call as_lapic_eoi
+ KMSAN_LEAVE
+ jmp doreti
+
+/*
* Executed by a CPU when it receives an IPI_SWI.
*/
INTR_HANDLER ipi_swi
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index eb1b746f5893..2716784ee871 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -57,6 +57,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <sys/kexec.h>
#include <sys/proc.h>
#include <x86/apicreg.h>
#include <machine/cpu.h>
@@ -65,6 +66,7 @@
#include <machine/proc.h>
#include <machine/segments.h>
#include <machine/efi.h>
+#include <machine/kexec.h>
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13));
ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14));
ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15));
ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags));
+
+/* Kexec */
+ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry));
+ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments));
+ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX);
+ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image));
+ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage));
+ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size));
+ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf));
+ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target));
diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c
new file mode 100644
index 000000000000..8189a48e9ae9
--- /dev/null
+++ b/sys/amd64/amd64/kexec_support.c
@@ -0,0 +1,300 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/kexec.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <x86/apicvar.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed before
+ * disabling MMU.
+ *
+ * kexec_reboot_md():
+ * - Copy pages into target(s)
+ * - Do "other stuff"
+ * - Does not return
+ */
+
+/*
+ * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages.
+ * identity: This is for an identity map, treat `start` as a physical address.
+ * Only valid here if do_pte is false.
+ */
+static void
+kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
+ vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
+{
+ vm_paddr_t mpa;
+ vm_offset_t pg;
+ vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
+ vm_page_t m;
+ vm_pindex_t i, j, k, l;
+
+ pg = start & ~(stride - 1);
+ i = pmap_pml4e_index(pg);
+ j = pmap_pdpe_index(pg);
+ k = pmap_pde_index(pg);
+ l = pmap_pte_index(pg);
+ for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
+ /*
+ * Walk linearly, as above, but one fell swoop, one page at a
+ * time.
+ */
+ if (root[i] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ root[i] = mpa | PG_RW | PG_V;
+ }
+ pdp_entry_t *pdp =
+ (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
+ for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
+ if (pdp[j] == 0) {
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pdp[j] = mpa | PG_RW | PG_V;
+ }
+ pd_entry_t *pde =
+ (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
+ for (; k < NPDEPG && pg < start + size; k++, l = 0) {
+ if (pde[k] == 0) {
+ if (!do_pte) {
+ pde[k] =
+ (identity ? pg : pmap_kextract(pg)) |
+ PG_RW | PG_PS | PG_V;
+ pg += NBPDR;
+ continue;
+ }
+ m = vm_radix_iter_next(pages);
+ mpa = VM_PAGE_TO_PHYS(m);
+ pde[k] = mpa | PG_V | PG_RW;
+ } else if ((pde[k] & PG_PS) != 0) {
+ pg += NBPDR;
+ continue;
+ }
+ /* Populate the PTEs. */
+ for (; l < NPTEPG && pg < start + size;
+ l++, pg += PAGE_SIZE) {
+ pt_entry_t *pte =
+ (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
+ pte[pmap_pte_index(pg)] =
+ pmap_kextract(pg) | PG_RW | PG_V;
+ }
+ }
+ }
+ }
+}
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+ void (*kexec_do_tramp)(void) = image->md_image;
+
+ intr_disable_all();
+ lapic_disable();
+ kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
+ kexec_do_tramp);
+
+ for (;;)
+ ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ struct pctrie_iter pct_iter;
+ pml4_entry_t *PT4;
+ pdp_entry_t *PDP_l;
+ pd_entry_t *PD_l0;
+ vm_offset_t va;
+ int i;
+
+ /*
+ * Start building the page table.
+ * First part of the page table is standard for all.
+ */
+ vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
+ vm_page_t m;
+
+ if (la57)
+ return (EINVAL);
+
+ vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
+ /* Working in linear space in the mapped space, `va` is our tracker. */
+ m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
+ va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
+ /* We'll find a place for these later */
+ PT4 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pdp_l = VM_PAGE_TO_PHYS(m);
+ PDP_l = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l0 = VM_PAGE_TO_PHYS(m);
+ PD_l0 = (void *)va;
+ va += PAGE_SIZE;
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l1 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l2 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+ pa_pd_l3 = VM_PAGE_TO_PHYS(m);
+ m = vm_radix_iter_next(&pct_iter);
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
+ PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
+ PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
+ PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
+ PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */
+ PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
+
+ /* Map the target(s) in 2MB chunks. */
+ for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ struct kexec_segment_stage *s = &image->segments[i];
+
+ if (s->size == 0)
+ break;
+ kexec_generate_page_tables(PT4, s->target, s->size, false,
+ true, &pct_iter);
+ }
+ /* Now create the source page tables */
+ kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
+ false, &pct_iter);
+ kexec_generate_page_tables(PT4,
+ trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
+ PAGE_SIZE, true, false, &pct_iter);
+ KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n"));
+
+ /* MD control pages start at this next page. */
+ image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
+ bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);
+
+ /* Save the image into the MD page(s) right after the trampoline */
+ bcopy(image, (void *)((vm_offset_t)image->md_image +
+ (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
+ sizeof(*image));
+
+ return (0);
+}
+
+/*
+ * Required pages:
+ * - L4 (1) (root)
+ * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map)
+ * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1)
+ * - L1 (PDR) - 1 (kexec trampoline page, first MD page)
+ * - kexec_do_reboot trampoline - 1
+ * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
+ *
+ * Minimum 9 pages for the direct map.
+ */
+int
+kexec_md_pages(struct kexec_segment *seg_in)
+{
+ struct kexec_segment *segs = seg_in;
+ vm_size_t pages = 13; /* Minimum number of starting pages */
+ vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */
+ vm_size_t source_total = 0;
+
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ vm_offset_t start, end;
+ if (segs[i].memsz == 0)
+ break;
+
+ end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
+ start = trunc_2mpage((vm_offset_t)segs[i].mem);
+ start = max(start, cur_addr + 1);
+ /*
+ * Round to cover the full range of page table pages for each
+ * segment.
+ */
+ source_total += round_2mpage(end - start);
+
+ /*
+ * Bottom 4GB are identity mapped already in the count, so skip
+ * any segments that end up there, this will short-circuit that.
+ */
+ if (end <= cur_addr + 1)
+ continue;
+
+ if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
+ /* Need a new 512GB mapping page */
+ pages++;
+ pages += howmany(end - (start & ~PML4MASK), NBPML4);
+ pages += howmany(end - (start & ~PDPMASK), NBPDP);
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+
+ } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
+ pages++;
+ pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
+ pages += howmany(end - (start & ~PDRMASK), NBPDR);
+ }
+
+ }
+ /* Be pessimistic when totaling up source pages. We likely
+ * can't use superpages, so need to map each page individually.
+ */
+ pages += howmany(source_total, NBPDR);
+ pages += howmany(source_total, NBPDP);
+ pages += howmany(source_total, NBPML4);
+
+ /*
+ * Be intentionally sloppy adding in the extra page table pages. It's
+ * better to go over than under.
+ */
+ pages += howmany(pages * PAGE_SIZE, NBPDR);
+ pages += howmany(pages * PAGE_SIZE, NBPDP);
+ pages += howmany(pages * PAGE_SIZE, NBPML4);
+
+ /* Add in the trampoline pages */
+ pages += howmany(kexec_do_reboot_size, PAGE_SIZE);
+
+ return (pages);
+}
diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S
new file mode 100644
index 000000000000..6a2de676bc35
--- /dev/null
+++ b/sys/amd64/amd64/kexec_tramp.S
@@ -0,0 +1,91 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+#include "assym.inc"
+
+/*
+ * Take a pointer to the image, copy each segment, and jump to the trampoline.
+ *
+ * Assumptions:
+ * - image is in safe memory
+ * - We're already running out of the new "identity" map.
+ * - All registers are free game, so go nuts
+ * - Interrupts are disabled
+ * - All APs are disabled
+ */
+ENTRY(kexec_do_reboot)
+ /*
+ r9: image pointer
+ r10: segment pointer
+ r11: segment counter
+ */
+ leaq kexec_stack(%rip), %rsp
+ /* Get the saved kexec_image. */
+ leaq kexec_saved_image(%rip), %r9
+ leaq KEXEC_SEGMENTS(%r9), %r10
+ movq $KEXEC_SEGMENT_MAX, %r11
+copy_segment:
+ movq KEXEC_SEGMENT_SIZE(%r10), %rcx
+ cmpq $0, %rcx
+ je done
+ shrq $3, %rcx
+ movq KEXEC_SEGMENT_TARGET(%r10), %rdi
+ movq KEXEC_SEGMENT_MAP(%r10), %rsi
+ rep
+ movsq
+ addq $KEXEC_STAGED_SEGMENT_SIZE, %r10
+ decq %r11
+ jg copy_segment
+
+done:
+ pushq KEXEC_ENTRY(%r9)
+ ret
+fail:
+ jmp fail
+END(kexec_do_reboot)
+ENTRY(kexec_do_reboot_trampoline)
+ /* Set new page table, clears most of TLB. */
+ movq %rdi, %cr3
+
+ /* Now flush the rest of the TLB, including global pages. */
+ movq %cr4, %rax
+ andq $~CR4_PGE, %rax
+ movq %rax, %cr4
+ jmp *%rsi
+END(kexec_do_reboot_trampoline)
+CNAME(kexec_saved_image):
+ .globl kexec_saved_image
+ .space KEXEC_IMAGE_SIZE
+ .quad 0
+ /* We don't need more than quad, so just fill out the page. */
+ .p2align PAGE_SHIFT
+ kexec_stack:
+CNAME(kexec_do_reboot_size):
+ .globl kexec_do_reboot_size
+ .quad . - kexec_do_reboot
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 00e99f9df192..96ed0a2cc3ba 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -140,6 +140,10 @@ cpu_mp_start(void)
setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
SDT_SYSIGT, SEL_KPL, 0);
+ /* Install an inter-CPU IPI for CPU offline */
+ setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff),
+ SDT_SYSIGT, SEL_KPL, 0);
+
/* Install an inter-CPU IPI for CPU suspend/resume */
setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
SDT_SYSIGT, SEL_KPL, 0);
@@ -176,6 +180,15 @@ cpu_mp_start(void)
#endif
}
+void
+cpu_mp_stop(void)
+{
+ cpuset_t other_cpus = all_cpus;
+
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ offline_cpus(other_cpus);
+}
+
/*
* AP CPU's call this to initialize themselves.
*/
diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h
new file mode 100644
index 000000000000..70bc2991be3f
--- /dev/null
+++ b/sys/amd64/include/kexec.h
@@ -0,0 +1,41 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_KEXEC_H_
+#define _AMD64_KEXEC_H_
+
+struct kexec_segment;
+struct kexec_image;
+int kexec_md_pages(struct kexec_segment *);
+extern void kexec_do_reboot(void);
+extern long kexec_do_reboot_size;
+extern void *kexec_saved_image;
+extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void));
+#define KEXEC_MD_PAGES(x) kexec_md_pages(x)
+
+
+#endif /* _AMD64_KEXEC_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index bff92570ff82..28c372a2e556 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -30,6 +30,7 @@ inthand_t
IDTVEC(ipi_intr_bitmap_handler_pti),
IDTVEC(ipi_swi_pti),
IDTVEC(cpustop_pti),
+ IDTVEC(cpuoff_pti),
IDTVEC(cpususpend_pti),
IDTVEC(rendezvous_pti);
diff --git a/sys/arm/include/kexec.h b/sys/arm/include/kexec.h
new file mode 100644
index 000000000000..50391d32812a
--- /dev/null
+++ b/sys/arm/include/kexec.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ARM_KEXEC_H_
+#define _ARM_KEXEC_H_
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ return (ENOSYS);
+}
+
+#define kexec_reboot_md(x) do {} while (0)
+#endif /* _ARM_KEXEC_H_ */
diff --git a/sys/arm64/arm64/kexec_support.c b/sys/arm64/arm64/kexec_support.c
new file mode 100644
index 000000000000..8b9719c05b67
--- /dev/null
+++ b/sys/arm64/arm64/kexec_support.c
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+
+#include <machine/armreg.h>
+#include <machine/pmap.h>
+#include <machine/pte.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update boot page tables (identity map) to include all pages needed before
+ * disabling MMU.
+ *
+ * kexec_reboot_md():
+ * - Copy pages into target(s)
+ * - Do "other stuff"
+ * - Does not return
+ */
+
+extern pt_entry_t pagetable_l0_ttbr0_bootstrap[];
+extern unsigned long initstack_end[];
+void switch_stack(void *, void (*)(void *, void *, struct kexec_image *), void *);
+
+#define SCTLR_EL1_NO_MMU (SCTLR_RES1 | SCTLR_LSMAOE | SCTLR_nTLSMD | \
+ SCTLR_EIS | SCTLR_TSCXT | SCTLR_EOS)
+#define vm_page_offset(m) ((vm_offset_t)(m) - vm_page_base)
+static inline vm_page_t
+phys_vm_page(vm_page_t m, vm_offset_t vm_page_v, vm_paddr_t vm_page_p)
+{
+ return ((vm_page_t)((vm_offset_t)m - vm_page_v + vm_page_p));
+}
+
+/* First 2 args are filler for switch_stack() */
+static void __aligned(16) __dead2
+kexec_reboot_bottom( void *arg1 __unused, void *arg2 __unused,
+ struct kexec_image *image)
+{
+ void (*e)(void) = (void *)image->entry;
+ vm_offset_t vm_page_base = (vm_offset_t)vm_page_array;
+ vm_paddr_t vm_page_phys = pmap_kextract((vm_offset_t)vm_page_array);
+ struct kexec_segment_stage *phys_segs =
+ (void *)pmap_kextract((vm_offset_t)&image->segments);
+ vm_paddr_t from_pa, to_pa;
+ vm_size_t size;
+ vm_page_t first, m, mp;
+ struct pctrie_iter pct_i;
+
+ /*
+ * Create a linked list of all pages in the object before we disable the
+ * MMU. Once the MMU is disabled we can't use the vm_radix iterators,
+ * as they rely on virtual address pointers.
+ */
+ first = NULL;
+ vm_radix_iter_init(&pct_i, &image->map_obj->rtree);
+ VM_RADIX_FORALL(m, &pct_i) {
+ if (first == NULL)
+ first = m;
+ else
+ SLIST_INSERT_AFTER(mp, m, plinks.s.ss);
+ mp = m;
+ }
+
+ /*
+ * We're running out of the identity map now, disable the MMU before we
+ * continue. It's possible page tables can be overwritten, which would
+ * be very bad if we were running with the MMU enabled.
+ */
+ WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU);
+ isb();
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ if (phys_segs[i].size == 0)
+ break;
+ to_pa = phys_segs[i].target;
+ /* Copy the segment here... */
+ for (vm_page_t p = phys_segs[i].first_page;
+ p != NULL && to_pa - phys_segs[i].target < phys_segs[i].size;
+ p = SLIST_NEXT(p, plinks.s.ss)) {
+ p = phys_vm_page(p, vm_page_base, vm_page_phys);
+ from_pa = p->phys_addr;
+ if (p->phys_addr == to_pa) {
+ to_pa += PAGE_SIZE;
+ continue;
+ }
+ for (size = PAGE_SIZE / sizeof(register_t);
+ size > 0; --size) {
+ *(register_t *)to_pa = *(register_t *)from_pa;
+ to_pa += sizeof(register_t);
+ from_pa += sizeof(register_t);
+ }
+ }
+ }
+ invalidate_icache();
+ e();
+ while (1)
+ ;
+}
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+ uintptr_t ptr;
+ register_t reg;
+
+ for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+ if (image->segments[i].size > 0)
+ cpu_dcache_inv_range((void *)PHYS_TO_DMAP(image->segments[i].target),
+ image->segments[i].size);
+ }
+ ptr = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+ serror_disable();
+
+ reg = pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap);
+ set_ttbr0(reg);
+ cpu_tlb_flushID();
+
+ typeof(kexec_reboot_bottom) *p = (void *)ptr;
+ switch_stack((void *)pmap_kextract((vm_offset_t)initstack_end),
+ p, image);
+ while (1)
+ ;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ vm_paddr_t tmp;
+ pt_entry_t *pte;
+
+ /* Create L2 page blocks for the trampoline. L0/L1 are from the startup. */
+
+ /*
+ * There are exactly 2 pages before the pagetable_l0_ttbr0_bootstrap, so
+ * move to there.
+ */
+ pte = pagetable_l0_ttbr0_bootstrap;
+ pte -= (Ln_ENTRIES * 2); /* move to start of L2 pages */
+
+ /*
+ * Populate the identity map with symbols we know we'll need before we
+ * turn off the MMU.
+ */
+ tmp = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+ pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+ tmp = pmap_kextract((vm_offset_t)initstack_end);
+ pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+ /* We'll need vm_page_array for doing offset calculations. */
+ tmp = pmap_kextract((vm_offset_t)&vm_page_array);
+ pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+
+ return (0);
+}
diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S
index d35e334905a7..3ec12140f139 100644
--- a/sys/arm64/arm64/locore.S
+++ b/sys/arm64/arm64/locore.S
@@ -325,6 +325,19 @@ mp_virtdone:
b init_secondary
LEND(mpentry_common)
+
+ENTRY(mp_cpu_spinloop)
+0:
+ wfe
+ ldr x0, mp_cpu_spin_table_release_addr
+ cbz x0, 0b
+ blr x0
+ .globl mp_cpu_spin_table_release_addr
+mp_cpu_spin_table_release_addr:
+ .quad 0
+ .globl mp_cpu_spinloop_end
+mp_cpu_spinloop_end:
+END(mp_cpu_spinloop)
#endif
/*
@@ -475,6 +488,29 @@ LENTRY(enter_kernel_el)
eret
LEND(enter_kernel_el)
+/* Turn off the MMU. Install ttbr0 from the bootstrap page table, and go there.
+ * Does not return.
+ * - x0 - target address to jump to after stopping the MMU.
+ * - x1 - kernel load address
+ */
+ENTRY(stop_mmu)
+ mov x16, x0 /* Save target. */
+ ldr x2, =(1f - KERNBASE)
+ add x17, x1, x2
+ ldr x3, =(pagetable_l0_ttbr0_bootstrap - KERNBASE)
+ add x1, x1, x3
+ msr ttbr0_el1, x1
+ isb
+ br x17
+1:
+ BTI_J
+ mrs x0, sctlr_el1
+ bic x0, x0, SCTLR_M
+ bic x0, x0, SCTLR_C
+ msr sctlr_el1, x0
+ isb
+ br x16
+END(stop_mmu)
/*
* Get the physical address the kernel was loaded at.
*/
@@ -1094,12 +1130,19 @@ tcr:
TCR_SH0_IS | TCR_ORGN0_WBWA | TCR_IRGN0_WBWA)
LEND(start_mmu)
+ENTRY(switch_stack)
+ mov sp, x0
+ mov x16, x1
+ br x16
+END(switch_stack)
+
ENTRY(abort)
b abort
END(abort)
.bss
.align PAGE_SHIFT
+ .globl initstack_end
initstack:
.space BOOT_STACK_SIZE
initstack_end:
@@ -1116,6 +1159,7 @@ initstack_end:
* L0 for user
*/
.globl pagetable_l0_ttbr1
+ .globl pagetable_l0_ttbr0_bootstrap
pagetable:
pagetable_l3_ttbr1:
.space (PAGE_SIZE * L3_PAGE_COUNT)
diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index e4d011df3a06..0bdd2ecfd8a7 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -60,6 +60,7 @@
#include <machine/debug_monitor.h>
#include <machine/intr.h>
#include <machine/smp.h>
+#include <machine/vmparam.h>
#ifdef VFP
#include <machine/vfp.h>
#endif
@@ -103,6 +104,7 @@ static void ipi_hardclock(void *);
static void ipi_preempt(void *);
static void ipi_rendezvous(void *);
static void ipi_stop(void *);
+static void ipi_off(void *);
#ifdef FDT
static u_int fdt_cpuid;
@@ -193,6 +195,7 @@ release_aps(void *dummy __unused)
intr_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL);
intr_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL);
intr_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL);
+ intr_ipi_setup(IPI_OFF, "off", ipi_off, NULL);
atomic_store_int(&aps_started, 0);
atomic_store_rel_int(&aps_ready, 1);
@@ -390,6 +393,34 @@ ipi_stop(void *dummy __unused)
CTR0(KTR_SMP, "IPI_STOP (restart)");
}
+void stop_mmu(vm_paddr_t, vm_paddr_t) __dead2;
+extern uint32_t mp_cpu_spinloop[];
+extern uint32_t mp_cpu_spinloop_end[];
+extern uint64_t mp_cpu_spin_table_release_addr;
+static void
+ipi_off(void *dummy __unused)
+{
+ CTR0(KTR_SMP, "IPI_OFF");
+ if (psci_present)
+ psci_cpu_off();
+ else {
+ uint64_t release_addr;
+ vm_size_t size;
+
+ size = (vm_offset_t)&mp_cpu_spin_table_release_addr -
+ (vm_offset_t)mp_cpu_spinloop;
+ release_addr = PCPU_GET(release_addr) - size;
+ isb();
+ invalidate_icache();
+ /* Go catatonic, don't take any interrupts. */
+ intr_disable();
+ stop_mmu(release_addr, pmap_kextract(KERNBASE));
+
+
+ }
+ CTR0(KTR_SMP, "IPI_OFF failed");
+}
+
struct cpu_group *
cpu_topo(void)
{
@@ -511,6 +542,7 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain, vm_paddr_t release_addr)
pcpu_init(pcpup, cpuid, sizeof(struct pcpu));
pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK;
bootpcpu = pcpup;
+ pcpup->pc_release_addr = release_addr;
dpcpu[cpuid - 1] = (void *)(pcpup + 1);
dpcpu_init(dpcpu[cpuid - 1], cpuid);
@@ -752,6 +784,52 @@ cpu_mp_start(void)
}
}
+void
+cpu_mp_stop(void)
+{
+
+ /* Short-circuit for single-CPU */
+ if (CPU_COUNT(&all_cpus) == 1)
+ return;
+
+ KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("Not on the first CPU!\n"));
+
+ /*
+ * If we use spin-table, assume U-boot method for now (single address
+ * shared by all CPUs).
+ */
+ if (!psci_present) {
+ int cpu;
+ vm_paddr_t release_addr;
+ void *release_vaddr;
+ vm_size_t size;
+
+ /* Find the shared release address. */
+ CPU_FOREACH(cpu) {
+ release_addr = pcpu_find(cpu)->pc_release_addr;
+ if (release_addr != 0)
+ break;
+ }
+ /* No release address? No way of notifying other CPUs. */
+ if (release_addr == 0)
+ return;
+
+ size = (vm_offset_t)&mp_cpu_spinloop_end -
+ (vm_offset_t)&mp_cpu_spinloop;
+
+ release_addr -= (vm_offset_t)&mp_cpu_spin_table_release_addr -
+ (vm_offset_t)mp_cpu_spinloop;
+
+ release_vaddr = pmap_mapdev(release_addr, size);
+ bcopy(mp_cpu_spinloop, release_vaddr, size);
+ cpu_dcache_wbinv_range(release_vaddr, size);
+ pmap_unmapdev(release_vaddr, size);
+ invalidate_icache();
+ }
+ ipi_all_but_self(IPI_OFF);
+ DELAY(1000000);
+}
+
/* Introduce rest of cores to the world */
void
cpu_mp_announce(void)
diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h
index e6e1f682794e..e9eee643216b 100644
--- a/sys/arm64/include/cpufunc.h
+++ b/sys/arm64/include/cpufunc.h
@@ -96,6 +96,13 @@ serror_enable(void)
__asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")");
}
+static __inline void
+serror_disable(void)
+{
+
+ __asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")");
+}
+
static __inline register_t
get_midr(void)
{
diff --git a/sys/arm64/include/kexec.h b/sys/arm64/include/kexec.h
new file mode 100644
index 000000000000..0a8c7a053331
--- /dev/null
+++ b/sys/arm64/include/kexec.h
@@ -0,0 +1,33 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ARM64_KEXEC_H_
+#define _ARM64_KEXEC_H_
+
+#define KEXEC_MD_PAGES(x) 0
+
+#endif /* _ARM64_KEXEC_H_ */
diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h
index 09bd8fa8a966..73399d2c3f8c 100644
--- a/sys/arm64/include/pcpu.h
+++ b/sys/arm64/include/pcpu.h
@@ -50,7 +50,8 @@ struct debug_monitor_state;
struct pmap *pc_curvmpmap; \
uint64_t pc_mpidr; \
u_int pc_bcast_tlbi_workaround; \
- char __pad[197]
+ uint64_t pc_release_addr; \
+ char __pad[189]
#ifdef _KERNEL
diff --git a/sys/arm64/include/smp.h b/sys/arm64/include/smp.h
index 500cd1ef4f02..4a5bfda3ac1c 100644
--- a/sys/arm64/include/smp.h
+++ b/sys/arm64/include/smp.h
@@ -40,6 +40,7 @@ enum {
IPI_STOP,
IPI_STOP_HARD,
IPI_HARDCLOCK,
+ IPI_OFF,
INTR_IPI_COUNT,
};
diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h
index 54063150eef9..f8ef7e4a20d3 100644
--- a/sys/compat/freebsd32/freebsd32_syscall.h
+++ b/sys/compat/freebsd32/freebsd32_syscall.h
@@ -517,4 +517,4 @@
#define FREEBSD32_SYS_setgroups 596
#define FREEBSD32_SYS_jail_attach_jd 597
#define FREEBSD32_SYS_jail_remove_jd 598
-#define FREEBSD32_SYS_MAXSYSCALL 599
+#define FREEBSD32_SYS_MAXSYSCALL 600
diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c
index f7cc4c284e4d..645cdccbc02d 100644
--- a/sys/compat/freebsd32/freebsd32_syscalls.c
+++ b/sys/compat/freebsd32/freebsd32_syscalls.c
@@ -604,4 +604,5 @@ const char *freebsd32_syscallnames[] = {
"setgroups", /* 596 = setgroups */
"jail_attach_jd", /* 597 = jail_attach_jd */
"jail_remove_jd", /* 598 = jail_remove_jd */
+ "#599", /* 599 = kexec_load */
};
diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c
index 18f809ef04e3..240b54ae9011 100644
--- a/sys/compat/freebsd32/freebsd32_sysent.c
+++ b/sys/compat/freebsd32/freebsd32_sysent.c
@@ -666,4 +666,5 @@ struct sysent freebsd32_sysent[] = {
{ .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
{ .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */
{ .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */
+ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 599 = freebsd32_kexec_load */
};
diff --git a/sys/compat/freebsd32/syscalls.conf b/sys/compat/freebsd32/syscalls.conf
index 72006631c89e..9308d1529c63 100644
--- a/sys/compat/freebsd32/syscalls.conf
+++ b/sys/compat/freebsd32/syscalls.conf
@@ -48,10 +48,11 @@ obsol="getkerninfo"
# Syscalls without implementations:
# __mac_* - should be implemented
# afs3_syscall - requires significant porting, probably doesn't make sense
+# kexec_load - makes little sense on 64-bit hardware
# kldsym - can't be implemented (kernel virtual addresses can't fit in 32-bits)
# lgetfh - should be implemented
# nlm_syscall - requires significant porting, probably doesn't make sense
# nnpfs_syscall - requires significant porting, probably doesn't make sense
# ntp_gettime - should be implemented
# thr_create - was unimplemented and appears to be unnecessary
-unimpl="afs3_syscall kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create"
+unimpl="afs3_syscall kexec_load kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create"
diff --git a/sys/conf/files b/sys/conf/files
index c17451324324..0a24b5e1e39b 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3842,6 +3842,7 @@ kern/kern_jaildesc.c standard
kern/kern_jailmeta.c standard
kern/kern_kcov.c optional kcov \
compile-with "${NOSAN_C} ${MSAN_CFLAGS}"
+kern/kern_kexec.c standard
kern/kern_khelp.c standard
kern/kern_kthread.c standard
kern/kern_ktr.c optional ktr
@@ -4549,7 +4550,6 @@ netpfil/ipfw/dn_sched_rr.c optional inet dummynet
netpfil/ipfw/dn_sched_wf2q.c optional inet dummynet
netpfil/ipfw/ip_dummynet.c optional inet dummynet
netpfil/ipfw/ip_dn_io.c optional inet dummynet
-netpfil/ipfw/ip_dn_glue.c optional inet dummynet
netpfil/ipfw/ip_fw2.c optional inet ipfirewall
netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall
netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index a342242ac66e..e4f01813bc8f 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -77,6 +77,8 @@ amd64/amd64/fpu.c standard
amd64/amd64/gdb_machdep.c optional gdb
amd64/amd64/initcpu.c standard
amd64/amd64/io.c optional io
+amd64/amd64/kexec_support.c standard
+amd64/amd64/kexec_tramp.S standard
amd64/amd64/locore.S standard no-obj
amd64/amd64/xen-locore.S optional xenhvm \
compile-with "${NORMAL_S} -g0" \
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 2f412fa3cb1b..882aca705336 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -55,6 +55,7 @@ arm64/arm64/gic_v3_acpi.c optional acpi
arm64/arm64/gic_v3_fdt.c optional fdt
arm64/arm64/hyp_stub.S standard
arm64/arm64/identcpu.c standard
+arm64/arm64/kexec_support.c standard
arm64/arm64/locore.S standard no-obj
arm64/arm64/machdep.c standard
arm64/arm64/machdep_boot.c standard
diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c
index 3f0a7b40245d..e3ff4f6937d2 100644
--- a/sys/dev/acpica/acpi.c
+++ b/sys/dev/acpica/acpi.c
@@ -4430,8 +4430,8 @@ acpi_stype_sysctl(SYSCTL_HANDLER_ARGS)
return (EINVAL);
printf("warning: this sysctl expects a sleep type, but an ACPI S-state has "
"been passed to it. This functionality is deprecated; see acpi(4).\n");
- MPASS(sstate < ACPI_S_STATE_COUNT);
- if (acpi_supported_sstates[sstate] == false)
+ if (sstate < ACPI_S_STATE_COUNT &&
+ !acpi_supported_sstates[sstate])
return (EOPNOTSUPP);
new_stype = acpi_sstate_to_stype(sstate);
}
diff --git a/sys/dev/ice/ice_drv_info.h b/sys/dev/ice/ice_drv_info.h
index 46965f4124bc..abb11bdb5fd9 100644
--- a/sys/dev/ice/ice_drv_info.h
+++ b/sys/dev/ice/ice_drv_info.h
@@ -238,6 +238,9 @@ static const pci_vendor_info_t ice_vendor_info_array[] = {
ICE_INTEL_VENDOR_ID, 0x0001, 0,
"Intel(R) Ethernet Network Adapter E835-XXV-2 for OCP 3.0"),
PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP,
+ ICE_INTEL_VENDOR_ID, 0x0002, 0,
+ "Intel(R) Ethernet Network Adapter E835-XXV-4"),
+ PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP,
ICE_INTEL_VENDOR_ID, 0x0003, 0,
"Intel(R) Ethernet Network Adapter E835-XXV-2"),
PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP,
diff --git a/sys/dev/psci/psci.c b/sys/dev/psci/psci.c
index 497b23d2d4c3..2b250401ae83 100644
--- a/sys/dev/psci/psci.c
+++ b/sys/dev/psci/psci.c
@@ -474,6 +474,19 @@ psci_cpu_on(unsigned long cpu, unsigned long entry, unsigned long context_id)
return (psci_call(fnid, cpu, entry, context_id));
}
+int
+psci_cpu_off(void)
+{
+ uint32_t fnid;
+
+ fnid = PSCI_FNID_CPU_OFF;
+ if (psci_softc != NULL)
+ fnid = psci_softc->psci_fnids[PSCI_FN_CPU_OFF];
+
+ /* Returns PSCI_RETVAL_DENIED on error. */
+ return (psci_call(fnid, 0, 0, 0));
+}
+
static void
psci_shutdown(void *xsc, int howto)
{
diff --git a/sys/dev/psci/psci.h b/sys/dev/psci/psci.h
index 451d40c0178d..6704eaf26c71 100644
--- a/sys/dev/psci/psci.h
+++ b/sys/dev/psci/psci.h
@@ -39,6 +39,7 @@ typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t,
extern bool psci_present;
int psci_cpu_on(unsigned long, unsigned long, unsigned long);
+int psci_cpu_off(void); /* Operates on caller. */
void psci_reset(void);
int32_t psci_features(uint32_t);
int psci_get_version(void);
diff --git a/sys/dev/xilinx/xlnx_pcib.c b/sys/dev/xilinx/xlnx_pcib.c
index d549ec445ea9..816b33ec1142 100644
--- a/sys/dev/xilinx/xlnx_pcib.c
+++ b/sys/dev/xilinx/xlnx_pcib.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
- * Copyright (c) 2020 Ruslan Bukin <br@bsdpad.com>
+ * Copyright (c) 2020-2025 Ruslan Bukin <br@bsdpad.com>
*
* This software was developed by SRI International and the University of
* Cambridge Computer Laboratory (Department of Computer Science and
@@ -84,7 +84,7 @@ struct xlnx_pcib_softc {
struct generic_pcie_fdt_softc fdt_sc;
struct resource *res[4];
struct mtx mtx;
- vm_offset_t msi_page;
+ void *msi_page;
struct xlnx_pcib_irqsrc *isrcs;
device_t dev;
void *intr_cookie[3];
@@ -105,6 +105,12 @@ struct xlnx_pcib_irqsrc {
u_int flags;
};
+static struct ofw_compat_data compat_data[] = {
+ { "xlnx,xdma-host-3.00", 1 },
+ { "xlnx,axi-pcie-host-1.00.a", 1 },
+ { NULL, 0 },
+};
+
static void
xlnx_pcib_clear_err_interrupts(struct generic_pcie_core_softc *sc)
{
@@ -333,12 +339,12 @@ xlnx_pcib_fdt_probe(device_t dev)
if (!ofw_bus_status_okay(dev))
return (ENXIO);
- if (ofw_bus_is_compatible(dev, "xlnx,xdma-host-3.00")) {
- device_set_desc(dev, "Xilinx XDMA PCIe Controller");
- return (BUS_PROBE_DEFAULT);
- }
+ if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "Xilinx XDMA PCIe Controller");
- return (ENXIO);
+ return (BUS_PROBE_DEFAULT);
}
static int
@@ -424,8 +430,8 @@ xlnx_pcib_req_valid(struct generic_pcie_core_softc *sc,
bus_space_tag_t t;
uint32_t val;
- t = sc->bst;
- h = sc->bsh;
+ t = rman_get_bustag(sc->res);
+ h = rman_get_bushandle(sc->res);
if ((bus < sc->bus_start) || (bus > sc->bus_end))
return (0);
@@ -467,8 +473,8 @@ xlnx_pcib_read_config(device_t dev, u_int bus, u_int slot,
return (~0U);
offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg);
- t = sc->bst;
- h = sc->bsh;
+ t = rman_get_bustag(sc->res);
+ h = rman_get_bushandle(sc->res);
data = bus_space_read_4(t, h, offset & ~3);
@@ -512,8 +518,8 @@ xlnx_pcib_write_config(device_t dev, u_int bus, u_int slot,
offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg);
- t = sc->bst;
- h = sc->bsh;
+ t = rman_get_bustag(sc->res);
+ h = rman_get_bushandle(sc->res);
/*
* 32-bit access used due to a bug in the Xilinx bridge that
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c
index d3b83eb8b94b..983eb8b9226f 100644
--- a/sys/fs/nfsclient/nfs_clrpcops.c
+++ b/sys/fs/nfsclient/nfs_clrpcops.c
@@ -2212,7 +2212,7 @@ nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode,
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED
+ NFSX_VERF);
rlen = fxdr_unsigned(int, *tl++);
- if (rlen == 0) {
+ if (rlen <= 0 || rlen > len) {
error = NFSERR_IO;
goto nfsmout;
} else if (rlen < len) {
@@ -5599,7 +5599,7 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep,
}
*tl++ = txdr_unsigned(4096); /* Max response size cached */
*tl++ = txdr_unsigned(20); /* Max operations */
- *tl++ = txdr_unsigned(64); /* Max slots */
+ *tl++ = txdr_unsigned(NFSV4_SLOTS); /* Max slots */
*tl = 0; /* No rdma ird */
/* Fill in back channel attributes. */
@@ -5668,6 +5668,11 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep,
sep->nfsess_maxcache = fxdr_unsigned(int, *tl++);
tl++;
sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++);
+ if (sep->nfsess_foreslots == 0) {
+ error = NFSERR_BADXDR;
+ goto nfsmout;
+ } else if (sep->nfsess_foreslots > NFSV4_SLOTS)
+ sep->nfsess_foreslots = NFSV4_SLOTS;
NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots);
irdcnt = fxdr_unsigned(int, *tl);
if (irdcnt < 0 || irdcnt > 1) {
@@ -5681,6 +5686,8 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep,
NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED);
tl += 5;
sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl);
+ if (sep->nfsess_backslots > NFSV4_CBSLOTS)
+ sep->nfsess_backslots = NFSV4_CBSLOTS;
NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots);
}
error = nd->nd_repstat;
@@ -5800,7 +5807,8 @@ nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype,
NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
stripecnt = fxdr_unsigned(int, *tl);
NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt);
- if (stripecnt < 1 || stripecnt > 4096) {
+ if (stripecnt >= MHLEN / NFSX_UNSIGNED ||
+ stripecnt < 1) {
printf("pNFS File layout devinfo stripecnt %d:"
" out of range\n", stripecnt);
error = NFSERR_BADXDR;
@@ -7249,7 +7257,7 @@ nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
rlen = fxdr_unsigned(int, *tl++);
NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen);
- if (rlen == 0) {
+ if (rlen <= 0 || rlen > len) {
error = NFSERR_IO;
goto nfsmout;
} else if (rlen < len) {
@@ -8246,7 +8254,7 @@ nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp,
NFSPROC_T *p)
{
uint32_t *tl;
- char *cp, *str, str0[NFSV4_SMALLSTR + 1];
+ char *str, str0[NFSV4_SMALLSTR + 1];
uint32_t len = 0;
int error = 0;
@@ -8269,9 +8277,9 @@ nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp,
str = malloc(len + 1, M_TEMP, M_WAITOK);
else
str = str0;
- NFSM_DISSECT(cp, char *, NFSM_RNDUP(len));
- NFSBCOPY(cp, str, len);
- str[len] = '\0';
+ error = nfsrv_mtostr(nd, str, len);
+ if (error != 0)
+ goto nfsmout;
NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str);
if (dogrp != 0)
error = nfsv4_strtogid(nd, str, len, gidp);
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c
index 6f3447f26620..67af0cf71175 100644
--- a/sys/fs/nfsserver/nfs_nfsdserv.c
+++ b/sys/fs/nfsserver/nfs_nfsdserv.c
@@ -5138,6 +5138,11 @@ nfsrvd_layoutcommit(struct nfsrv_descript *nd, __unused int isdgram,
NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
layouttype = fxdr_unsigned(int, *tl++);
maxcnt = fxdr_unsigned(int, *tl);
+ /* There is no limit in the RFC, so use 1000 as a sanity limit. */
+ if (maxcnt < 0 || maxcnt > 1000) {
+ error = NFSERR_BADXDR;
+ goto nfsmout;
+ }
if (maxcnt > 0) {
layp = malloc(maxcnt + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, layp, maxcnt);
diff --git a/sys/i386/include/kexec.h b/sys/i386/include/kexec.h
new file mode 100644
index 000000000000..9fbdef38ad2e
--- /dev/null
+++ b/sys/i386/include/kexec.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _I386_KEXEC_H_
+#define _I386_KEXEC_H_
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ return (ENOSYS);
+}
+
+#define kexec_reboot_md(x) do {} while (0)
+#endif /* _I386_KEXEC_H_ */
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index e42e7dcf8b44..cd305de1ed44 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -665,4 +665,5 @@ struct sysent sysent[] = {
{ .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */
{ .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */
{ .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */
+ { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */
};
diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c
new file mode 100644
index 000000000000..2efea7dcf9a7
--- /dev/null
+++ b/sys/kern/kern_kexec.c
@@ -0,0 +1,350 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#ifdef INTRNG
+#include <sys/intr.h>
+#endif
+#include <sys/kexec.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+
+#include <machine/kexec.h>
+
+#ifndef KEXEC_MD_PAGES
+/*
+ * Number of MD pages for extra bookkeeping.
+ * This is a macro because it can be a constant (some architectures make it 0).
+ * It accepts an argument, which is an array of
+ * kexec_segment[KEXEC_SEGMENT_MAX].
+ */
+#define KEXEC_MD_PAGES(x) 0
+#endif
+
+/*
+ * Basic design:
+ *
+ * Given an array of "segment descriptors" stage an image to be loaded and
+ * jumped to at reboot, instead of rebooting via firmware.
+ *
+ * Constraints:
+ * - The segment descriptors' "mem" and "memsz" must each fit within a
+ * vm_phys_seg segment, which can be obtained via the `vm.phys_segs` sysctl.
+ * A single segment cannot span multiple vm_phys_seg segments, even if the
+ * vm_phys_seg segments are adjacent.
+ *
+ * Technical details:
+ *
+ * Take advantage of the VM subsystem and create a vm_object to hold the staged
+ * image. When grabbing pages for the object, sort the pages so that if a page
+ * in the object is located in the physical range of any of the kexec segment
+ * targets then it gets placed at the pindex corresponding to that physical
+ * address. This avoids the chance of corruption by writing over the page in
+ * the final copy, or the need for a copy buffer page.
+ */
+
+static struct kexec_image staged_image;
+static vm_offset_t stage_addr;
+static vm_object_t kexec_obj;
+
+static eventhandler_tag kexec_reboot_handler;
+static struct mtx kexec_mutex;
+
+static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");
+
+
+static void
+kexec_reboot(void *junk __unused, int howto)
+{
+ if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
+ return;
+
+#ifdef SMP
+ cpu_mp_stop();
+#endif /* SMP */
+ intr_disable();
+ printf("Starting kexec reboot\n");
+
+ scheduler_stopped = true;
+ kexec_reboot_md(&staged_image);
+}
+
+MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);
+
+/* Sort the segment list once copied in */
+static int
+seg_cmp(const void *seg1, const void *seg2)
+{
+ const struct kexec_segment *s1, *s2;
+
+ s1 = seg1;
+ s2 = seg2;
+
+ return ((uintptr_t)s1->mem - (uintptr_t)s2->mem);
+}
+
+static bool
+segment_fits(struct kexec_segment *seg)
+{
+ vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;
+
+ for (int i = 0; i < vm_phys_nsegs; i++) {
+ if (v >= vm_phys_segs[i].start &&
+ (v + seg->memsz - 1) <= vm_phys_segs[i].end)
+ return (true);
+ }
+
+ return (false);
+}
+
+static vm_paddr_t
+pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
+{
+ for (int i = count; i > 0; --i) {
+ if (pind >= segs[i - 1].pindex)
+ return (ptoa(pind - segs[i-1].pindex) + segs[i - 1].target);
+ }
+
+ panic("No segment for pindex %ju\n", (uintmax_t)pind);
+}
+
+/*
+ * For now still tied to the system call, so assumes all memory is userspace.
+ */
+int
+kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
+ struct kexec_segment *seg, u_long flags)
+{
+ static int kexec_loading;
+ struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
+ struct kexec_image *new_image_stage = 0;
+ vm_object_t new_segments = NULL;
+ uint8_t *buf;
+ int err = 0;
+ int i;
+ const size_t segsize = nseg * sizeof(struct kexec_segment);
+ vm_page_t *page_list = 0;
+ vm_size_t image_count, md_pages, page_count, tmpsize;
+ vm_offset_t segment_va = 0;
+ /*
+ * - Do any sanity checking
+ * - Load the new segments to temporary
+ * - Remove the old segments
+ * - Install the new segments
+ */
+
+ if (nseg > KEXEC_SEGMENT_MAX)
+ return (EINVAL);
+
+ if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
+ return (EBUSY);
+
+ /* Only do error checking if we're installing new segments. */
+ if (nseg > 0) {
+ /* Create the new kexec object before destroying the old one. */
+ bzero(&segtmp, sizeof(segtmp));
+ err = copyin(seg, segtmp, segsize);
+ if (err != 0)
+ goto out;
+ qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
+ new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO);
+ /*
+ * Sanity checking:
+ * - All segments must not overlap the kernel, so must be fully enclosed
+ * in a vm_phys_seg (each kexec segment must be in a single
+ * vm_phys_seg segment, cannot cross even adjacent segments).
+ */
+ image_count = 0;
+ for (i = 0; i < nseg; i++) {
+ if (!segment_fits(&segtmp[i]) ||
+ segtmp[i].bufsz > segtmp[i].memsz) {
+ err = EINVAL;
+ goto out;
+ }
+ new_image_stage->segments[i].pindex = image_count;
+ new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem;
+ new_image_stage->segments[i].size = segtmp[i].memsz;
+ image_count += atop(segtmp[i].memsz);
+ }
+ md_pages = KEXEC_MD_PAGES(segtmp);
+ page_count = image_count + md_pages;
+ new_segments = vm_object_allocate(OBJT_PHYS, page_count);
+ page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK);
+
+ /*
+ * - Grab all pages for all segments (use pindex to slice it)
+ * - Walk the list (once)
+ * - At each pindex, check if the target PA that corresponds
+ * to that index is in the object. If so, swap the pages.
+ * - At the end of this the list will be "best" sorted.
+ */
+ vm_page_grab_pages_unlocked(new_segments, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
+ page_list, page_count);
+
+ /* Sort the pages to best match the PA */
+ VM_OBJECT_WLOCK(new_segments);
+ for (i = 0; i < image_count; i++) {
+ vm_page_t curpg, otherpg, tmp;
+ vm_pindex_t otheridx;
+
+ curpg = page_list[i];
+ otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments,
+ nseg, curpg->pindex));
+ otheridx = otherpg->pindex;
+
+ if (otherpg->object == new_segments) {
+ /*
+ * Swap 'curpg' and 'otherpg', since 'otherpg'
+ * is at the PA 'curpg' covers.
+ */
+ vm_radix_remove(&new_segments->rtree, otheridx);
+ vm_radix_remove(&new_segments->rtree, i);
+ otherpg->pindex = i;
+ curpg->pindex = otheridx;
+ vm_radix_insert(&new_segments->rtree, curpg);
+ vm_radix_insert(&new_segments->rtree, otherpg);
+ tmp = curpg;
+ page_list[i] = otherpg;
+ page_list[otheridx] = tmp;
+ }
+ }
+ for (i = 0; i < nseg; i++) {
+ new_image_stage->segments[i].first_page =
+ vm_radix_lookup(&new_segments->rtree,
+ new_image_stage->segments[i].pindex);
+ }
+ if (md_pages > 0)
+ new_image_stage->first_md_page =
+ vm_radix_lookup(&new_segments->rtree,
+ page_count - md_pages);
+ else
+ new_image_stage->first_md_page = NULL;
+ VM_OBJECT_WUNLOCK(new_segments);
+
+ /* Map the object to do the copies */
+ err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
+ ptoa(page_count), 0, VMFS_ANY_SPACE,
+ VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
+ if (err != 0)
+ goto out;
+ buf = (void *)segment_va;
+ new_image_stage->map_addr = segment_va;
+ new_image_stage->map_size = ptoa(new_segments->size);
+ new_image_stage->entry = entry;
+ new_image_stage->map_obj = new_segments;
+ for (i = 0; i < nseg; i++) {
+ err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
+ if (err != 0) {
+ goto out;
+ }
+ new_image_stage->segments[i].map_buf = buf;
+ buf += segtmp[i].bufsz;
+ tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
+ if (tmpsize > 0)
+ memset(buf, 0, tmpsize);
+ buf += tmpsize;
+ }
+ /* What's left are the MD pages, so zero them all out. */
+ if (md_pages > 0)
+ bzero(buf, ptoa(md_pages));
+
+ cpu_flush_dcache((void *)segment_va, ptoa(page_count));
+ if ((err = kexec_load_md(new_image_stage)) != 0)
+ goto out;
+ }
+ if (kexec_obj != NULL) {
+ vm_object_unwire(kexec_obj, 0, kexec_obj->size, 0);
+ KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
+ vm_map_remove(kernel_map, stage_addr, stage_addr + kexec_obj->size);
+ }
+ kexec_obj = new_segments;
+ bzero(&staged_image, sizeof(staged_image));
+ if (nseg > 0)
+ memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage));
+
+ printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);
+ if (nseg > 0) {
+ if (kexec_reboot_handler == NULL)
+ kexec_reboot_handler =
+ EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL,
+ SHUTDOWN_PRI_DEFAULT - 150);
+ } else {
+ if (kexec_reboot_handler != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler);
+ }
+out:
+ /* Clean up the mess if we've gotten far. */
+ if (err != 0 && new_segments != NULL) {
+ vm_object_unwire(new_segments, 0, new_segments->size, 0);
+ if (segment_va != 0)
+ vm_map_remove(kernel_map, segment_va, segment_va + kexec_obj->size);
+ else
+ vm_object_deallocate(new_segments);
+ }
+ atomic_store_rel_int(&kexec_loading, false);
+ if (new_image_stage != NULL)
+ free(new_image_stage, M_TEMP);
+ if (page_list != 0)
+ free(page_list, M_TEMP);
+
+ return (err);
+}
+
+int
+sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
+{
+ int error;
+
+ // FIXME: Do w need a better privilege check than PRIV_REBOOT here?
+ error = priv_check(td, PRIV_REBOOT);
+ if (error != 0)
+ return (error);
+ return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags));
+}
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 1f9577fddf9c..9f5106316018 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
KASSERT(
type == IPI_STOP || type == IPI_STOP_HARD
#if X86
- || type == IPI_SUSPEND
+ || type == IPI_SUSPEND || type == IPI_OFF
#endif
, ("%s: invalid stop type", __func__));
@@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
* will be lost, violating FreeBSD's assumption of reliable
* IPI delivery.
*/
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_lock_spin(&smp_ipi_mtx);
#endif
@@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
#endif
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
cpus = &suspended_cpus;
else
#endif
@@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type)
}
#if X86
- if (type == IPI_SUSPEND)
+ if (type == IPI_SUSPEND || type == IPI_OFF)
mtx_unlock_spin(&smp_ipi_mtx);
#endif
@@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map)
return (generic_stop_cpus(map, IPI_SUSPEND));
}
+
+int
+offline_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_OFF));
+}
#endif
/*
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 4cef89cd5219..06a4adc3d8cb 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -604,4 +604,5 @@ const char *syscallnames[] = {
"setgroups", /* 596 = setgroups */
"jail_attach_jd", /* 597 = jail_attach_jd */
"jail_remove_jd", /* 598 = jail_remove_jd */
+ "kexec_load", /* 599 = kexec_load */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 967af1f5313c..ea6d2b5aa1ef 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3394,4 +3394,12 @@
);
}
+599 AUE_NULL STD {
+ int kexec_load(
+ uint64_t entry,
+ u_long nseg,
+ _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments,
+ u_long flags
+ );
+ }
; vim: syntax=off
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index e28fef931ea8..5951cebbe74a 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3514,6 +3514,16 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 1;
break;
}
+ /* kexec_load */
+ case 599: {
+ struct kexec_load_args *p = params;
+ uarg[a++] = p->entry; /* uint64_t */
+ uarg[a++] = p->nseg; /* u_long */
+ uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */
+ uarg[a++] = p->flags; /* u_long */
+ *n_args = 4;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9401,6 +9411,25 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* kexec_load */
+ case 599:
+ switch (ndx) {
+ case 0:
+ p = "uint64_t";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ case 2:
+ p = "userland struct kexec_segment *";
+ break;
+ case 3:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11409,6 +11438,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* kexec_load */
+ case 599:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile
index 4ff023e6bca5..a645c1673167 100644
--- a/sys/modules/dummynet/Makefile
+++ b/sys/modules/dummynet/Makefile
@@ -1,7 +1,6 @@
.PATH: ${SRCTOP}/sys/netpfil/ipfw
KMOD= dummynet
-SRCS= ip_dummynet.c
-SRCS+= ip_dn_glue.c ip_dn_io.c
+SRCS= ip_dummynet.c ip_dn_io.c
SRCS+= dn_aqm_codel.c dn_aqm_pie.c
SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c
SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c
diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c
index fdf39690160b..2333b9ea8678 100644
--- a/sys/net/altq/altq_cbq.c
+++ b/sys/net/altq/altq_cbq.c
@@ -173,6 +173,8 @@ cbq_request(struct ifaltq *ifq, int req, void *arg)
static void
get_class_stats(class_stats_t *statsp, struct rm_class *cl)
{
+ memset(statsp, 0, sizeof(*statsp));
+
statsp->xmit_cnt = cl->stats_.xmit_cnt;
statsp->drop_cnt = cl->stats_.drop_cnt;
statsp->over = cl->stats_.over;
diff --git a/sys/net/altq/altq_fairq.c b/sys/net/altq/altq_fairq.c
index 6069865101a0..0a00168e547e 100644
--- a/sys/net/altq/altq_fairq.c
+++ b/sys/net/altq/altq_fairq.c
@@ -857,6 +857,8 @@ get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl)
{
fairq_bucket_t *b;
+ memset(sp, 0, sizeof(*sp));
+
sp->class_handle = cl->cl_handle;
sp->qlimit = cl->cl_qlimit;
sp->xmit_cnt = cl->cl_xmitcnt;
diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c
index 026346639b2e..fec488418546 100644
--- a/sys/net/altq/altq_priq.c
+++ b/sys/net/altq/altq_priq.c
@@ -597,6 +597,8 @@ priq_purgeq(struct priq_class *cl)
static void
get_class_stats(struct priq_classstats *sp, struct priq_class *cl)
{
+ memset(sp, 0, sizeof(*sp));
+
sp->class_handle = cl->cl_handle;
sp->qlength = qlen(cl->cl_q);
sp->qlimit = qlimit(cl->cl_q);
diff --git a/sys/net/if.c b/sys/net/if.c
index b6a798aa0fab..cb9c47c14c32 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -2842,15 +2842,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
break;
case SIOCAIFGROUP:
+ {
+ const char *groupname;
+
error = priv_check(td, PRIV_NET_ADDIFGROUP);
if (error)
return (error);
- error = if_addgroup(ifp,
- ((struct ifgroupreq *)data)->ifgr_group);
+ groupname = ((struct ifgroupreq *)data)->ifgr_group;
+ if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ)
+ return (EINVAL);
+ error = if_addgroup(ifp, groupname);
if (error != 0)
return (error);
break;
-
+ }
case SIOCGIFGROUP:
{
struct epoch_tracker et;
@@ -2862,15 +2867,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
}
case SIOCDIFGROUP:
+ {
+ const char *groupname;
+
error = priv_check(td, PRIV_NET_DELIFGROUP);
if (error)
return (error);
- error = if_delgroup(ifp,
- ((struct ifgroupreq *)data)->ifgr_group);
+ groupname = ((struct ifgroupreq *)data)->ifgr_group;
+ if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ)
+ return (EINVAL);
+ error = if_delgroup(ifp, groupname);
if (error != 0)
return (error);
break;
-
+ }
default:
error = ENOIOCTL;
break;
@@ -3014,9 +3024,17 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
goto out_noref;
case SIOCGIFGMEMB:
- error = if_getgroupmembers((struct ifgroupreq *)data);
- goto out_noref;
+ {
+ struct ifgroupreq *req;
+ req = (struct ifgroupreq *)data;
+ if (strnlen(req->ifgr_name, IFNAMSIZ) == IFNAMSIZ) {
+ error = EINVAL;
+ goto out_noref;
+ }
+ error = if_getgroupmembers(req);
+ goto out_noref;
+ }
#if defined(INET) || defined(INET6)
case SIOCSVH:
case SIOCGVH:
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index f2df612b19c1..961259bb0ca1 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -383,18 +383,18 @@ struct ifg_group {
char ifg_group[IFNAMSIZ];
u_int ifg_refcnt;
void *ifg_pf_kif;
- CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */
- CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */
+ CK_STAILQ_HEAD(, ifg_member) ifg_members;
+ CK_STAILQ_ENTRY(ifg_group) ifg_next;
};
struct ifg_member {
- CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */
+ CK_STAILQ_ENTRY(ifg_member) ifgm_next;
if_t ifgm_ifp;
};
struct ifg_list {
struct ifg_group *ifgl_group;
- CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */
+ CK_STAILQ_ENTRY(ifg_list) ifgl_next;
};
#ifdef _SYS_EVENTHANDLER_H_
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 66070faf97e9..bfe608be6b36 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -680,7 +680,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
case IP_DUMMYNET3: /* generic dummynet v.3 functions */
- case IP_DUMMYNET_GET:
if (ip_dn_ctl_ptr != NULL)
error = ip_dn_ctl_ptr(sopt);
else
@@ -747,9 +746,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
case IP_DUMMYNET3: /* generic dummynet v.3 functions */
- case IP_DUMMYNET_CONFIGURE:
- case IP_DUMMYNET_DEL:
- case IP_DUMMYNET_FLUSH:
if (ip_dn_ctl_ptr != NULL)
error = ip_dn_ctl_ptr(sopt);
else
diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c
deleted file mode 100644
index 0412b730e4df..000000000000
--- a/sys/netpfil/ipfw/ip_dn_glue.c
+++ /dev/null
@@ -1,858 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- *
- * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
- */
-
-#include "opt_inet6.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/rwlock.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/time.h>
-#include <sys/taskqueue.h>
-#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <netinet/in.h>
-#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
-#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
-
-#include <netpfil/ipfw/ip_fw_private.h>
-#include <netpfil/ipfw/dn_heap.h>
-#include <netpfil/ipfw/ip_dn_private.h>
-#ifdef NEW_AQM
-#include <netpfil/ipfw/dn_aqm.h>
-#endif
-#include <netpfil/ipfw/dn_sched.h>
-
-/* FREEBSD7.2 ip_dummynet.h r191715*/
-
-struct dn_heap_entry7 {
- int64_t key; /* sorting key. Topmost element is smallest one */
- void *object; /* object pointer */
-};
-
-struct dn_heap7 {
- int size;
- int elements;
- int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
- struct dn_heap_entry7 *p; /* really an array of "size" entries */
-};
-
-/* Common to 7.2 and 8 */
-struct dn_flow_set {
- SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
-
- u_short fs_nr ; /* flow_set number */
- u_short flags_fs;
-#define DNOLD_HAVE_FLOW_MASK 0x0001
-#define DNOLD_IS_RED 0x0002
-#define DNOLD_IS_GENTLE_RED 0x0004
-#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
-#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
-#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
-#define DNOLD_IS_PIPE 0x4000
-#define DNOLD_IS_QUEUE 0x8000
-
- struct dn_pipe7 *pipe ; /* pointer to parent pipe */
- u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
-
- int weight ; /* WFQ queue weight */
- int qsize ; /* queue size in slots or bytes */
- int plr[4] ; /* pkt loss rate (2^31-1 means 100%) */
-
- struct ipfw_flow_id flow_mask ;
-
- /* hash table of queues onto this flow_set */
- int rq_size ; /* number of slots */
- int rq_elements ; /* active elements */
- struct dn_flow_queue7 **rq ; /* array of rq_size entries */
-
- u_int32_t last_expired ; /* do not expire too frequently */
- int backlogged ; /* #active queues for this flowset */
-
- /* RED parameters */
-#define SCALE_RED 16
-#define SCALE(x) ( (x) << SCALE_RED )
-#define SCALE_VAL(x) ( (x) >> SCALE_RED )
-#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
- int w_q ; /* queue weight (scaled) */
- int max_th ; /* maximum threshold for queue (scaled) */
- int min_th ; /* minimum threshold for queue (scaled) */
- int max_p ; /* maximum value for p_b (scaled) */
- u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
- u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
- u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
- u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
- u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
- u_int lookup_depth ; /* depth of lookup table */
- int lookup_step ; /* granularity inside the lookup table */
- int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
- int avg_pkt_size ; /* medium packet size */
- int max_pkt_size ; /* max packet size */
-};
-SLIST_HEAD(dn_flow_set_head, dn_flow_set);
-
-#define DN_IS_PIPE 0x4000
-#define DN_IS_QUEUE 0x8000
-struct dn_flow_queue7 {
- struct dn_flow_queue7 *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
-
- u_long numbytes;
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- u_int32_t q_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- int64_t sched_time ; /* current time when queue enters ready_heap */
-
- int64_t S,F ; /* start time, finish time */
-};
-
-struct dn_pipe7 { /* a pipe */
- SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- uint32_t bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
- struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
-
- int64_t V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- int numbytes;
-
- int64_t sched_time ; /* time pipe was scheduled in ready_heap */
-
- /*
- * When the tx clock come from an interface (if_name[0] != '\0'), its name
- * is stored below, whereas the ifp is filled when the rule is configured.
- */
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
-
- struct dn_flow_set fs ; /* used with fixed-rate flows */
-};
-SLIST_HEAD(dn_pipe_head7, dn_pipe7);
-
-/* FREEBSD8 ip_dummynet.h r196045 */
-struct dn_flow_queue8 {
- struct dn_flow_queue8 *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
-
- uint64_t numbytes ; /* credit for transmission (dynamic queues) */
- int64_t extra_bits; /* extra bits simulating unavailable channel */
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- int64_t idle_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- int64_t sched_time ; /* current time when queue enters ready_heap */
-
- int64_t S,F ; /* start time, finish time */
-};
-
-struct dn_pipe8 { /* a pipe */
- SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- uint32_t bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
- struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
-
- int64_t V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- /* Same as in dn_flow_queue, numbytes can become large */
- int64_t numbytes; /* bits I can transmit (more or less). */
- uint64_t burst; /* burst size, scaled: bits * hz */
-
- int64_t sched_time ; /* time pipe was scheduled in ready_heap */
- int64_t idle_time; /* start of pipe idle time */
-
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
-
- struct dn_flow_set fs ; /* used with fixed-rate flows */
-
- /* fields to simulate a delay profile */
-#define ED_MAX_NAME_LEN 32
- char name[ED_MAX_NAME_LEN];
- int loss_level;
- int samples_no;
- int *samples;
-};
-
-#define ED_MAX_SAMPLES_NO 1024
-struct dn_pipe_max8 {
- struct dn_pipe8 pipe;
- int samples[ED_MAX_SAMPLES_NO];
-};
-SLIST_HEAD(dn_pipe_head8, dn_pipe8);
-
-/*
- * Changes from 7.2 to 8:
- * dn_pipe:
- * numbytes from int to int64_t
- * add burst (int64_t)
- * add idle_time (int64_t)
- * add profile
- * add struct dn_pipe_max
- * add flag DN_HAS_PROFILE
- *
- * dn_flow_queue
- * numbytes from u_long to int64_t
- * add extra_bits (int64_t)
- * q_time from u_int32_t to int64_t and name idle_time
- *
- * dn_flow_set unchanged
- *
- */
-
-/* NOTE:XXX copied from dummynet.c */
-#define O_NEXT(p, len) ((void *)((char *)p + len))
-static void
-oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
-{
- oid->len = len;
- oid->type = type;
- oid->subtype = 0;
- oid->id = id;
-}
-/* make room in the buffer and move the pointer forward */
-static void *
-o_next(struct dn_id **o, int len, int type)
-{
- struct dn_id *ret = *o;
- oid_fill(ret, len, type, 0);
- *o = O_NEXT(*o, len);
- return ret;
-}
-
-static size_t pipesize7 = sizeof(struct dn_pipe7);
-static size_t pipesize8 = sizeof(struct dn_pipe8);
-static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
-
-/* Indicate 'ipfw' version
- * 1: from FreeBSD 7.2
- * 0: from FreeBSD 8
- * -1: unknown (for now is unused)
- *
- * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives
- * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
- * it is suppose to be the FreeBSD 8 version.
- */
-static int is7 = 0;
-
-static int
-convertflags2new(int src)
-{
- int dst = 0;
-
- if (src & DNOLD_HAVE_FLOW_MASK)
- dst |= DN_HAVE_MASK;
- if (src & DNOLD_QSIZE_IS_BYTES)
- dst |= DN_QSIZE_BYTES;
- if (src & DNOLD_NOERROR)
- dst |= DN_NOERROR;
- if (src & DNOLD_IS_RED)
- dst |= DN_IS_RED;
- if (src & DNOLD_IS_GENTLE_RED)
- dst |= DN_IS_GENTLE_RED;
- if (src & DNOLD_HAS_PROFILE)
- dst |= DN_HAS_PROFILE;
-
- return dst;
-}
-
-static int
-convertflags2old(int src)
-{
- int dst = 0;
-
- if (src & DN_HAVE_MASK)
- dst |= DNOLD_HAVE_FLOW_MASK;
- if (src & DN_IS_RED)
- dst |= DNOLD_IS_RED;
- if (src & DN_IS_GENTLE_RED)
- dst |= DNOLD_IS_GENTLE_RED;
- if (src & DN_NOERROR)
- dst |= DNOLD_NOERROR;
- if (src & DN_HAS_PROFILE)
- dst |= DNOLD_HAS_PROFILE;
- if (src & DN_QSIZE_BYTES)
- dst |= DNOLD_QSIZE_IS_BYTES;
-
- return dst;
-}
-
-static int
-dn_compat_del(void *v)
-{
- struct dn_pipe7 *p = (struct dn_pipe7 *) v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
- struct {
- struct dn_id oid;
- uintptr_t a[1]; /* add more if we want a list */
- } cmd;
-
- /* XXX DN_API_VERSION ??? */
- oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
-
- if (is7) {
- if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
- return EINVAL;
- if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
- return EINVAL;
- } else {
- if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
- return EINVAL;
- if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
- return EINVAL;
- }
-
- if (p->pipe_nr != 0) { /* pipe x delete */
- cmd.a[0] = p->pipe_nr;
- cmd.oid.subtype = DN_LINK;
- } else { /* queue x delete */
- cmd.oid.subtype = DN_FS;
- cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
- }
-
- return do_config(&cmd, cmd.oid.len);
-}
-
-static int
-dn_compat_config_queue(struct dn_fs *fs, void* v)
-{
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
- struct dn_flow_set *f;
-
- if (is7)
- f = &p7->fs;
- else
- f = &p8->fs;
-
- fs->fs_nr = f->fs_nr;
- fs->sched_nr = f->parent_nr;
- fs->flow_mask = f->flow_mask;
- fs->buckets = f->rq_size;
- fs->qsize = f->qsize;
- fs->plr[0] = f->plr[0];
- fs->plr[1] = f->plr[1];
- fs->plr[2] = f->plr[2];
- fs->plr[3] = f->plr[3];
- fs->par[0] = f->weight;
- fs->flags = convertflags2new(f->flags_fs);
- if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
- fs->w_q = f->w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->max_p;
- }
-
- return 0;
-}
-
-static int
-dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
- struct dn_fs *fs, void* v)
-{
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
- int i = p7->pipe_nr;
-
- sch->sched_nr = i;
- sch->oid.subtype = 0;
- p->link_nr = i;
- fs->fs_nr = i + 2*DN_MAX_ID;
- fs->sched_nr = i + DN_MAX_ID;
-
- /* Common to 7 and 8 */
- p->bandwidth = p7->bandwidth;
- p->delay = p7->delay;
- if (!is7) {
- /* FreeBSD 8 has burst */
- p->burst = p8->burst;
- }
-
- /* fill the fifo flowset */
- dn_compat_config_queue(fs, v);
- fs->fs_nr = i + 2*DN_MAX_ID;
- fs->sched_nr = i + DN_MAX_ID;
-
- /* Move scheduler related parameter from fs to sch */
- sch->buckets = fs->buckets; /*XXX*/
- fs->buckets = 0;
- if (fs->flags & DN_HAVE_MASK) {
- sch->flags |= DN_HAVE_MASK;
- fs->flags &= ~DN_HAVE_MASK;
- sch->sched_mask = fs->flow_mask;
- bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
- }
-
- return 0;
-}
-
-static int
-dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
- void *v)
-{
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
-
- p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
-
- pf->link_nr = p->link_nr;
- pf->loss_level = p8->loss_level;
-// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
- pf->samples_no = p8->samples_no;
- strncpy(pf->name, p8->name,sizeof(pf->name));
- bcopy(p8->samples, pf->samples, sizeof(pf->samples));
-
- return 0;
-}
-
-/*
- * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
- * the three main struct, else only a flowset is created
- */
-static int
-dn_compat_configure(void *v)
-{
- struct dn_id *buf = NULL, *base;
- struct dn_sch *sch = NULL;
- struct dn_link *p = NULL;
- struct dn_fs *fs = NULL;
- struct dn_profile *pf = NULL;
- int lmax;
- int error;
-
- struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
- struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
-
- int i; /* number of object to configure */
-
- lmax = sizeof(struct dn_id); /* command header */
- lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
- sizeof(struct dn_fs) + sizeof(struct dn_profile);
-
- base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO);
- o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
- base->id = DN_API_VERSION;
-
- /* pipe_nr is the same in p7 and p8 */
- i = p7->pipe_nr;
- if (i != 0) { /* pipe config */
- sch = o_next(&buf, sizeof(*sch), DN_SCH);
- p = o_next(&buf, sizeof(*p), DN_LINK);
- fs = o_next(&buf, sizeof(*fs), DN_FS);
-
- error = dn_compat_config_pipe(sch, p, fs, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- if (!is7 && p8->samples_no > 0) {
- /* Add profiles*/
- pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
- error = dn_compat_config_profile(pf, p, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- }
- } else { /* queue config */
- fs = o_next(&buf, sizeof(*fs), DN_FS);
- error = dn_compat_config_queue(fs, v);
- if (error) {
- free(buf, M_DUMMYNET);
- return error;
- }
- }
- error = do_config(base, (char *)buf - (char *)base);
-
- if (buf)
- free(buf, M_DUMMYNET);
- return error;
-}
-
-int
-dn_compat_calc_size(void)
-{
- int need = 0;
- /* XXX use FreeBSD 8 struct size */
- /* NOTE:
- * - half scheduler: schk_count/2
- * - all flowset: fsk_count
- * - all flowset queues: queue_count
- * - all pipe queue: si_count
- */
- need += V_dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
- need += V_dn_cfg.fsk_count * sizeof(struct dn_flow_set);
- need += V_dn_cfg.si_count * sizeof(struct dn_flow_queue8);
- need += V_dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
-
- return need;
-}
-
-int
-dn_c_copy_q (void *_ni, void *arg)
-{
- struct copy_args *a = arg;
- struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
- struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
- struct dn_flow *ni = (struct dn_flow *)_ni;
- int size = 0;
-
- /* XXX hash slot not set */
- /* No difference between 7.2/8 */
- fq7->len = ni->length;
- fq7->len_bytes = ni->len_bytes;
- fq7->id = ni->fid;
-
- if (is7) {
- size = sizeof(struct dn_flow_queue7);
- fq7->tot_pkts = ni->tot_pkts;
- fq7->tot_bytes = ni->tot_bytes;
- fq7->drops = ni->drops;
- } else {
- size = sizeof(struct dn_flow_queue8);
- fq8->tot_pkts = ni->tot_pkts;
- fq8->tot_bytes = ni->tot_bytes;
- fq8->drops = ni->drops;
- }
-
- *a->start += size;
- return 0;
-}
-
-int
-dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
-{
- struct dn_link *l = &s->link;
- struct dn_fsk *f = s->fs;
-
- struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
- struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
- struct dn_flow_set *fs;
- int size = 0;
-
- if (is7) {
- fs = &pipe7->fs;
- size = sizeof(struct dn_pipe7);
- } else {
- fs = &pipe8->fs;
- size = sizeof(struct dn_pipe8);
- }
-
- /* These 4 field are the same in pipe7 and pipe8 */
- pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
- pipe7->bandwidth = l->bandwidth;
- pipe7->delay = l->delay * 1000 / hz;
- pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
-
- if (!is7) {
- if (s->profile) {
- struct dn_profile *pf = s->profile;
- strncpy(pipe8->name, pf->name, sizeof(pf->name));
- pipe8->loss_level = pf->loss_level;
- pipe8->samples_no = pf->samples_no;
- }
- pipe8->burst = div64(l->burst , 8 * hz);
- }
-
- fs->flow_mask = s->sch.sched_mask;
- fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
-
- fs->parent_nr = l->link_nr - DN_MAX_ID;
- fs->qsize = f->fs.qsize;
- fs->plr[0] = f->fs.plr[0];
- fs->plr[1] = f->fs.plr[1];
- fs->plr[2] = f->fs.plr[2];
- fs->plr[3] = f->fs.plr[3];
- fs->w_q = f->fs.w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->fs.max_p;
- fs->rq_elements = nq;
-
- fs->flags_fs = convertflags2old(f->fs.flags);
-
- *a->start += size;
- return 0;
-}
-
-int
-dn_compat_copy_pipe(struct copy_args *a, void *_o)
-{
- int have = a->end - *a->start;
- int need = 0;
- int pipe_size = sizeof(struct dn_pipe8);
- int queue_size = sizeof(struct dn_flow_queue8);
- int n_queue = 0; /* number of queues */
-
- struct dn_schk *s = (struct dn_schk *)_o;
- /* calculate needed space:
- * - struct dn_pipe
- * - if there are instances, dn_queue * n_instances
- */
- n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
- (s->siht ? 1 : 0));
- need = pipe_size + queue_size * n_queue;
- if (have < need) {
- D("have %d < need %d", have, need);
- return 1;
- }
- /* copy pipe */
- dn_c_copy_pipe(s, a, n_queue);
-
- /* copy queues */
- if (s->sch.flags & DN_HAVE_MASK)
- dn_ht_scan(s->siht, dn_c_copy_q, a);
- else if (s->siht)
- dn_c_copy_q(s->siht, a);
- return 0;
-}
-
-int
-dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
-{
- struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
-
- fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
- fs->fs_nr = f->fs.fs_nr;
- fs->qsize = f->fs.qsize;
- fs->plr[0] = f->fs.plr[0];
- fs->plr[1] = f->fs.plr[1];
- fs->plr[2] = f->fs.plr[2];
- fs->plr[3] = f->fs.plr[3];
- fs->w_q = f->fs.w_q;
- fs->max_th = f->max_th;
- fs->min_th = f->min_th;
- fs->max_p = f->fs.max_p;
- fs->flow_mask = f->fs.flow_mask;
- fs->rq_elements = nq;
- fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
- fs->parent_nr = f->fs.sched_nr;
- fs->weight = f->fs.par[0];
-
- fs->flags_fs = convertflags2old(f->fs.flags);
- *a->start += sizeof(struct dn_flow_set);
- return 0;
-}
-
-int
-dn_compat_copy_queue(struct copy_args *a, void *_o)
-{
- int have = a->end - *a->start;
- int need = 0;
- int fs_size = sizeof(struct dn_flow_set);
- int queue_size = sizeof(struct dn_flow_queue8);
-
- struct dn_fsk *fs = (struct dn_fsk *)_o;
- int n_queue = 0; /* number of queues */
-
- n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
- (fs->qht ? 1 : 0));
-
- need = fs_size + queue_size * n_queue;
- if (have < need) {
- D("have < need");
- return 1;
- }
-
- /* copy flowset */
- dn_c_copy_fs(fs, a, n_queue);
-
- /* copy queues */
- if (fs->fs.flags & DN_HAVE_MASK)
- dn_ht_scan(fs->qht, dn_c_copy_q, a);
- else if (fs->qht)
- dn_c_copy_q(fs->qht, a);
-
- return 0;
-}
-
-int
-copy_data_helper_compat(void *_o, void *_arg)
-{
- struct copy_args *a = _arg;
-
- if (a->type == DN_COMPAT_PIPE) {
- struct dn_schk *s = _o;
- if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
- return 0; /* not old type */
- }
- /* copy pipe parameters, and if instance exists, copy
- * other parameters and eventually queues.
- */
- if(dn_compat_copy_pipe(a, _o))
- return DNHT_SCAN_END;
- } else if (a->type == DN_COMPAT_QUEUE) {
- struct dn_fsk *fs = _o;
- if (fs->fs.fs_nr >= DN_MAX_ID)
- return 0;
- if (dn_compat_copy_queue(a, _o))
- return DNHT_SCAN_END;
- }
- return 0;
-}
-
-/* Main function to manage old requests */
-int
-ip_dummynet_compat(struct sockopt *sopt)
-{
- int error=0;
- void *v = NULL;
- struct dn_id oid;
-
- /* Length of data, used to found ipfw version... */
- int len = sopt->sopt_valsize;
-
- /* len can be 0 if command was dummynet_flush */
- if (len == pipesize7) {
- D("setting compatibility with FreeBSD 7.2");
- is7 = 1;
- }
- else if (len == pipesize8 || len == pipesizemax8) {
- D("setting compatibility with FreeBSD 8");
- is7 = 0;
- }
-
- switch (sopt->sopt_name) {
- default:
- printf("dummynet: -- unknown option %d", sopt->sopt_name);
- error = EINVAL;
- break;
-
- case IP_DUMMYNET_FLUSH:
- oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
- do_config(&oid, oid.len);
- break;
-
- case IP_DUMMYNET_DEL:
- v = malloc(len, M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, v, len, len);
- if (error)
- break;
- error = dn_compat_del(v);
- free(v, M_TEMP);
- break;
-
- case IP_DUMMYNET_CONFIGURE:
- v = malloc(len, M_TEMP, M_NOWAIT);
- if (v == NULL) {
- error = ENOMEM;
- break;
- }
- error = sooptcopyin(sopt, v, len, len);
- if (error)
- break;
- error = dn_compat_configure(v);
- free(v, M_TEMP);
- break;
-
- case IP_DUMMYNET_GET: {
- void *buf;
- int ret;
- int original_size = sopt->sopt_valsize;
- int size;
-
- ret = dummynet_get(sopt, &buf);
- if (ret)
- return 0;//XXX ?
- size = sopt->sopt_valsize;
- sopt->sopt_valsize = original_size;
- D("size=%d, buf=%p", size, buf);
- ret = sooptcopyout(sopt, buf, size);
- if (ret)
- printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
- if (buf)
- free(buf, M_DUMMYNET);
- }
- }
-
- return error;
-}
diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h
index 756a997b6ec3..9a43b86791e0 100644
--- a/sys/netpfil/ipfw/ip_dn_private.h
+++ b/sys/netpfil/ipfw/ip_dn_private.h
@@ -437,15 +437,7 @@ struct copy_args {
};
struct sockopt;
-int ip_dummynet_compat(struct sockopt *sopt);
-int dummynet_get(struct sockopt *sopt, void **compat);
-int dn_c_copy_q (void *_ni, void *arg);
-int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
-int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
-int dn_compat_copy_queue(struct copy_args *a, void *_o);
-int dn_compat_copy_pipe(struct copy_args *a, void *_o);
-int copy_data_helper_compat(void *_o, void *_arg);
-int dn_compat_calc_size(void);
+int dummynet_get(struct sockopt *sopt);
int do_config(void *p, size_t l);
/* function to drain idle object */
diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c
index d522f9da0fbe..61442c617753 100644
--- a/sys/netpfil/ipfw/ip_dummynet.c
+++ b/sys/netpfil/ipfw/ip_dummynet.c
@@ -2198,9 +2198,6 @@ compute_space(struct dn_id *cmd, struct copy_args *a)
case DN_FS: /* queue show */
x = DN_C_FS | DN_C_QUEUE;
break;
- case DN_GET_COMPAT: /* compatibility mode */
- need = dn_compat_calc_size();
- break;
}
a->flags = x;
if (x & DN_C_SCH) {
@@ -2226,11 +2223,9 @@ compute_space(struct dn_id *cmd, struct copy_args *a)
}
/*
- * If compat != NULL dummynet_get is called in compatibility mode.
- * *compat will be the pointer to the buffer to pass to ipfw
*/
int
-dummynet_get(struct sockopt *sopt, void **compat)
+dummynet_get(struct sockopt *sopt)
{
int have, i, need, error;
char *start = NULL, *buf;
@@ -2248,37 +2243,28 @@ dummynet_get(struct sockopt *sopt, void **compat)
cmd = &r.o;
- if (!compat) {
- /* copy at least an oid, and possibly a full object */
- error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
- sopt->sopt_valsize = sopt_valsize;
- if (error)
- goto done;
- l = cmd->len;
+ /* copy at least an oid, and possibly a full object */
+ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ l = cmd->len;
#ifdef EMULATE_SYSCTL
- /* sysctl emulation. */
- if (cmd->type == DN_SYSCTL_GET)
- return kesysctl_emu_get(sopt);
+ /* sysctl emulation. */
+ if (cmd->type == DN_SYSCTL_GET)
+ return kesysctl_emu_get(sopt);
#endif
- if (l > sizeof(r)) {
- /* request larger than default, allocate buffer */
- cmd = malloc(l, M_DUMMYNET, M_NOWAIT);
- if (cmd == NULL) {
- error = ENOMEM;
- goto done;
- }
- error = sooptcopyin(sopt, cmd, l, l);
- sopt->sopt_valsize = sopt_valsize;
- if (error)
- goto done;
+ if (l > sizeof(r)) {
+ /* request larger than default, allocate buffer */
+ cmd = malloc(l, M_DUMMYNET, M_NOWAIT);
+ if (cmd == NULL) {
+ error = ENOMEM;
+ goto done;
}
- } else { /* compatibility */
- error = 0;
- cmd->type = DN_CMD_GET;
- cmd->len = sizeof(struct dn_id);
- cmd->subtype = DN_GET_COMPAT;
- // cmd->id = sopt_valsize;
- D("compatibility mode");
+ error = sooptcopyin(sopt, cmd, l, l);
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
}
#ifdef NEW_AQM
@@ -2337,12 +2323,7 @@ dummynet_get(struct sockopt *sopt, void **compat)
}
if (start == NULL) {
- if (compat) {
- *compat = NULL;
- error = 1; // XXX
- } else {
- error = sooptcopyout(sopt, cmd, sizeof(*cmd));
- }
+ error = sooptcopyout(sopt, cmd, sizeof(*cmd));
goto done;
}
ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
@@ -2355,35 +2336,20 @@ dummynet_get(struct sockopt *sopt, void **compat)
sopt->sopt_valsize = sopt_valsize;
a.type = cmd->subtype;
- if (compat == NULL) {
- memcpy(start, cmd, sizeof(*cmd));
- ((struct dn_id*)(start))->len = sizeof(struct dn_id);
- buf = start + sizeof(*cmd);
- } else
- buf = start;
+ memcpy(start, cmd, sizeof(*cmd));
+ ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+ buf = start + sizeof(*cmd);
a.start = &buf;
a.end = start + have;
/* start copying other objects */
- if (compat) {
- a.type = DN_COMPAT_PIPE;
- dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper_compat, &a);
- a.type = DN_COMPAT_QUEUE;
- dn_ht_scan(V_dn_cfg.fshash, copy_data_helper_compat, &a);
- } else if (a.type == DN_FS) {
+ if (a.type == DN_FS) {
dn_ht_scan(V_dn_cfg.fshash, copy_data_helper, &a);
} else {
dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper, &a);
}
DN_BH_WUNLOCK();
- if (compat) {
- *compat = start;
- sopt->sopt_valsize = buf - start;
- /* free() is done by ip_dummynet_compat() */
- start = NULL; //XXX hack
- } else {
- error = sooptcopyout(sopt, start, buf - start);
- }
+ error = sooptcopyout(sopt, start, buf - start);
done:
if (cmd != &r.o)
free(cmd, M_DUMMYNET);
@@ -2519,17 +2485,9 @@ ip_dn_ctl(struct sockopt *sopt)
error = EINVAL;
break;
- case IP_DUMMYNET_FLUSH:
- case IP_DUMMYNET_CONFIGURE:
- case IP_DUMMYNET_DEL: /* remove a pipe or queue */
- case IP_DUMMYNET_GET:
- D("dummynet: compat option %d", sopt->sopt_name);
- error = ip_dummynet_compat(sopt);
- break;
-
case IP_DUMMYNET3:
if (sopt->sopt_dir == SOPT_GET) {
- error = dummynet_get(sopt, NULL);
+ error = dummynet_get(sopt);
break;
}
l = sopt->sopt_valsize;
diff --git a/sys/powerpc/include/kexec.h b/sys/powerpc/include/kexec.h
new file mode 100644
index 000000000000..a57c50926696
--- /dev/null
+++ b/sys/powerpc/include/kexec.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _POWERPC_KEXEC_H_
+#define _POWERPC_KEXEC_H_
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ return (ENOSYS);
+}
+
+#define kexec_reboot_md(x) do {} while (0)
+#endif /* _POWERPC_KEXEC_H_ */
diff --git a/sys/riscv/include/kexec.h b/sys/riscv/include/kexec.h
new file mode 100644
index 000000000000..5fb6fd321989
--- /dev/null
+++ b/sys/riscv/include/kexec.h
@@ -0,0 +1,39 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _RISCV_KEXEC_H_
+#define _RISCV_KEXEC_H_
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+ return (ENOSYS);
+}
+
+#define kexec_reboot_md(x) do {} while (0)
+#endif /* _RISCV_KEXEC_H_ */
diff --git a/sys/security/audit/audit_bsm_db.c b/sys/security/audit/audit_bsm_db.c
index c9f3d5c8a549..358162544287 100644
--- a/sys/security/audit/audit_bsm_db.c
+++ b/sys/security/audit/audit_bsm_db.c
@@ -56,6 +56,8 @@
#include <security/audit/audit.h>
#include <security/audit/audit_private.h>
+#include <contrib/ck/include/ck_queue.h>
+
/*
* Hash table functions for the audit event number to event class mask
* mapping.
@@ -64,21 +66,21 @@
struct evclass_elem {
au_event_t event;
au_class_t class;
- LIST_ENTRY(evclass_elem) entry;
+ CK_LIST_ENTRY(evclass_elem) entry;
};
struct evclass_list {
- LIST_HEAD(, evclass_elem) head;
+ CK_LIST_HEAD(, evclass_elem) head;
};
static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class");
-static struct rwlock evclass_lock;
static struct evclass_list evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE];
-
-#define EVCLASS_LOCK_INIT() rw_init(&evclass_lock, "evclass_lock")
-#define EVCLASS_RLOCK() rw_rlock(&evclass_lock)
-#define EVCLASS_RUNLOCK() rw_runlock(&evclass_lock)
-#define EVCLASS_WLOCK() rw_wlock(&evclass_lock)
-#define EVCLASS_WUNLOCK() rw_wunlock(&evclass_lock)
+static struct mtx evclass_mtx;
+#define EVCLASS_LOCK_INIT() mtx_init(&evclass_mtx, "evclass_lock", NULL, MTX_DEF)
+#define EVCLASS_WLOCK() mtx_lock(&evclass_mtx);
+#define EVCLASS_WUNLOCK() mtx_unlock(&evclass_mtx);
+/* make these do something if we ever remove entries from the hash */
+#define EVCLASS_RLOCK() {}
+#define EVCLASS_RUNLOCK() {}
/*
* Hash table maintaining a mapping from audit event numbers to audit event
@@ -118,7 +120,7 @@ au_event_class(au_event_t event)
EVCLASS_RLOCK();
evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE];
class = 0;
- LIST_FOREACH(evc, &evcl->head, entry) {
+ CK_LIST_FOREACH(evc, &evcl->head, entry) {
if (evc->event == event) {
class = evc->class;
goto out;
@@ -150,7 +152,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class)
EVCLASS_WLOCK();
evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE];
- LIST_FOREACH(evc, &evcl->head, entry) {
+ CK_LIST_FOREACH(evc, &evcl->head, entry) {
if (evc->event == event) {
evc->class = class;
EVCLASS_WUNLOCK();
@@ -161,7 +163,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class)
evc = evc_new;
evc->event = event;
evc->class = class;
- LIST_INSERT_HEAD(&evcl->head, evc, entry);
+ CK_LIST_INSERT_HEAD(&evcl->head, evc, entry);
EVCLASS_WUNLOCK();
}
@@ -172,7 +174,7 @@ au_evclassmap_init(void)
EVCLASS_LOCK_INIT();
for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++)
- LIST_INIT(&evclass_hash[i].head);
+ CK_LIST_INIT(&evclass_hash[i].head);
/*
* Set up the initial event to class mapping for system calls.
diff --git a/sys/sys/kexec.h b/sys/sys/kexec.h
new file mode 100644
index 000000000000..478193749368
--- /dev/null
+++ b/sys/sys/kexec.h
@@ -0,0 +1,81 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_KEXEC_H_
+#define _SYS_KEXEC_H_
+
+#include <sys/types.h>
+
+struct kexec_segment {
+ void *buf;
+ size_t bufsz;
+ vm_paddr_t mem;
+ vm_size_t memsz;
+};
+
+/* Flags (aligned with Linux) */
+#define KEXEC_ON_CRASH 0x1
+
+/* Aligned with Linux's limit */
+#define KEXEC_SEGMENT_MAX 16
+
+#ifdef _KERNEL
+struct kexec_segment_stage {
+ vm_page_t first_page;
+ void *map_buf;
+ vm_paddr_t target;
+ vm_size_t size;
+ vm_pindex_t pindex;
+};
+
+struct kexec_image {
+ struct kexec_segment_stage segments[KEXEC_SEGMENT_MAX];
+ vm_paddr_t entry;
+ struct vm_object *map_obj; /* Containing object */
+ vm_offset_t map_addr; /* Mapped in kernel space */
+ vm_size_t map_size;
+ vm_page_t first_md_page;
+ void *md_image;
+};
+
+#endif
+
+#ifndef _KERNEL
+
+__BEGIN_DECLS
+int kexec_load(uint64_t, unsigned long, struct kexec_segment *, unsigned long);
+__END_DECLS
+
+#else
+
+void kexec_reboot_md(struct kexec_image *);
+int kexec_load_md(struct kexec_image *);
+
+#endif
+
+#endif
diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h
index 26e78632fb2c..50ad2b78083c 100644
--- a/sys/sys/reboot.h
+++ b/sys/sys/reboot.h
@@ -61,6 +61,7 @@
#define RB_REROOT 0x200000 /* unmount the rootfs and mount it again */
#define RB_POWERCYCLE 0x400000 /* Power cycle if possible */
#define RB_MUTEMSGS 0x800000 /* start up with console muted after banner */
+#define RB_KEXEC 0x1000000 /* Boot new kernel using kexec */
#define RB_PROBE 0x10000000 /* Probe multiple consoles */
#define RB_MULTIPLE 0x20000000 /* use multiple consoles */
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
index 252dc9dc1cae..b642a6014f33 100644
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -251,6 +251,7 @@ void cpu_mp_announce(void);
int cpu_mp_probe(void);
void cpu_mp_setmaxid(void);
void cpu_mp_start(void);
+void cpu_mp_stop(void); /* Go back to single-CPU */
void forward_signal(struct thread *);
int restart_cpus(cpuset_t);
@@ -259,6 +260,7 @@ int stop_cpus_hard(cpuset_t);
#if defined(__amd64__) || defined(__i386__)
int suspend_cpus(cpuset_t);
int resume_cpus(cpuset_t);
+int offline_cpus(cpuset_t);
#endif
void smp_rendezvous_action(void);
diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h
index cff27b8be316..43f46f063e3e 100644
--- a/sys/sys/syscall.h
+++ b/sys/sys/syscall.h
@@ -537,4 +537,5 @@
#define SYS_setgroups 596
#define SYS_jail_attach_jd 597
#define SYS_jail_remove_jd 598
-#define SYS_MAXSYSCALL 599
+#define SYS_kexec_load 599
+#define SYS_MAXSYSCALL 600
diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk
index 443dbadcfbff..ce29c050885e 100644
--- a/sys/sys/syscall.mk
+++ b/sys/sys/syscall.mk
@@ -440,4 +440,5 @@ MIASM = \
getgroups.o \
setgroups.o \
jail_attach_jd.o \
- jail_remove_jd.o
+ jail_remove_jd.o \
+ kexec_load.o
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index d32690634059..8f106150e193 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -47,6 +47,7 @@ struct image_args;
struct jail;
struct kevent;
struct kevent_copyops;
+struct kexec_segment;
struct kld_file_stat;
struct ksiginfo;
struct mbuf;
@@ -401,6 +402,8 @@ int kern_writev(struct thread *td, int fd, struct uio *auio);
int kern_socketpair(struct thread *td, int domain, int type, int protocol,
int *rsv);
int kern_unmount(struct thread *td, const char *path, int flags);
+int kern_kexec_load(struct thread *td, u_long entry,
+ u_long nseg, struct kexec_segment *seg, u_long flags);
/* flags for kern_sigaction */
#define KSA_OSIGSET 0x0001 /* uses osigact_t */
diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h
index 8dda4b4533ea..5f5524a4519b 100644
--- a/sys/sys/sysproto.h
+++ b/sys/sys/sysproto.h
@@ -1907,6 +1907,12 @@ struct jail_attach_jd_args {
struct jail_remove_jd_args {
char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
};
+struct kexec_load_args {
+ char entry_l_[PADL_(uint64_t)]; uint64_t entry; char entry_r_[PADR_(uint64_t)];
+ char nseg_l_[PADL_(u_long)]; u_long nseg; char nseg_r_[PADR_(u_long)];
+ char segments_l_[PADL_(struct kexec_segment *)]; struct kexec_segment * segments; char segments_r_[PADR_(struct kexec_segment *)];
+ char flags_l_[PADL_(u_long)]; u_long flags; char flags_r_[PADR_(u_long)];
+};
int sys__exit(struct thread *, struct _exit_args *);
int sys_fork(struct thread *, struct fork_args *);
int sys_read(struct thread *, struct read_args *);
@@ -2313,6 +2319,7 @@ int sys_getgroups(struct thread *, struct getgroups_args *);
int sys_setgroups(struct thread *, struct setgroups_args *);
int sys_jail_attach_jd(struct thread *, struct jail_attach_jd_args *);
int sys_jail_remove_jd(struct thread *, struct jail_remove_jd_args *);
+int sys_kexec_load(struct thread *, struct kexec_load_args *);
#ifdef COMPAT_43
@@ -3311,6 +3318,7 @@ int freebsd14_setgroups(struct thread *, struct freebsd14_setgroups_args *);
#define SYS_AUE_setgroups AUE_SETGROUPS
#define SYS_AUE_jail_attach_jd AUE_JAIL_ATTACH
#define SYS_AUE_jail_remove_jd AUE_JAIL_REMOVE
+#define SYS_AUE_kexec_load AUE_NULL
#undef PAD_
#undef PADL_
diff --git a/sys/tools/gdb/README.txt b/sys/tools/gdb/README.txt
index 8c31565ddc42..ad1544912c3c 100644
--- a/sys/tools/gdb/README.txt
+++ b/sys/tools/gdb/README.txt
@@ -8,6 +8,9 @@ be automatically loaded by kgdb when opening a vmcore, so if you add new GDB
commands or functions, that script should be updated to import them, and you
should document them here.
+When improving these scripts, you can use the "kgdb-reload" command to reload
+them from /usr/lib/debug/boot/kernel/gdb/*.
+
To provide some rudimentary testing, selftest.py tries to exercise all of the
commands and functions defined here. To use it, run selftest.sh to panic the
system. Then, create a kernel dump or attach to the panicked kernel, and invoke
@@ -15,6 +18,8 @@ the script with "python import selftest" in (k)gdb.
Commands:
acttrace Display a backtrace for all on-CPU threads
+kgdb-reload Reload all gdb modules, useful when developing the modules
+ themselves.
Functions:
$PCPU(<field>[, <cpuid>]) Display the value of a PCPU/DPCPU field
diff --git a/sys/tools/gdb/acttrace.py b/sys/tools/gdb/acttrace.py
index 147effbbddf1..fdd18a4833cd 100644
--- a/sys/tools/gdb/acttrace.py
+++ b/sys/tools/gdb/acttrace.py
@@ -13,10 +13,8 @@ from pcpu import *
class acttrace(gdb.Command):
"""
- Register an acttrace command with gdb.
-
- When run, acttrace prints the stack trace of all threads that were on-CPU
- at the time of the panic.
+ Print the stack trace of all threads that were on-CPU at the time of
+ the panic.
"""
def __init__(self):
super(acttrace, self).__init__("acttrace", gdb.COMMAND_USER)
diff --git a/sys/tools/gdb/pcpu.py b/sys/tools/gdb/pcpu.py
index aadc4b2d42df..94c451e6eca5 100644
--- a/sys/tools/gdb/pcpu.py
+++ b/sys/tools/gdb/pcpu.py
@@ -9,7 +9,7 @@ from freebsd import *
class pcpu(gdb.Function):
"""
- Register a function to lookup PCPU and DPCPU variables by name.
+ A function to look up PCPU and DPCPU fields by name.
To look up the value of the PCPU field foo on CPU n, use
$PCPU("foo", n). This works for DPCPU fields too. If the CPU ID is
diff --git a/sys/tools/gdb/vnet.py b/sys/tools/gdb/vnet.py
index 36b4d512a3eb..5f416b2a515a 100644
--- a/sys/tools/gdb/vnet.py
+++ b/sys/tools/gdb/vnet.py
@@ -10,7 +10,7 @@ from freebsd import *
class vnet(gdb.Function):
"""
- Register a function to look up VNET variables by name.
+ A function to look up VNET variables by name.
To look at the value of a VNET variable V_foo, print $V("foo"). The
currently selected thread's VNET is used by default, but can be optionally
diff --git a/sys/tools/kernel-gdb.py b/sys/tools/kernel-gdb.py
index 8a41ef6efab1..990bdaf31fda 100644
--- a/sys/tools/kernel-gdb.py
+++ b/sys/tools/kernel-gdb.py
@@ -4,12 +4,40 @@
# SPDX-License-Identifier: BSD-2-Clause
#
+import importlib
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "gdb"))
-# Import FreeBSD kernel debugging commands and modules below.
-import acttrace
-import pcpu
-import vnet
+modules = [
+ "acttrace",
+ "freebsd",
+ "pcpu",
+ "vnet"
+]
+
+
+def reload_modules(modules):
+ for mod in modules:
+ if mod in sys.modules:
+ importlib.reload(sys.modules[mod])
+ else:
+ importlib.import_module(mod)
+
+reload_modules(modules)
+
+
+class reload(gdb.Command):
+ """
+ Reload the FreeBSD kernel GDB helper scripts.
+ """
+ def __init__(self):
+ super(reload, self).__init__("kgdb-reload", gdb.COMMAND_USER)
+
+ def invoke(self, arg, from_tty):
+ reload_modules(modules)
+
+
+# Register the reload command with gdb.
+reload()
diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h
index c537d0ee0cdd..551f5527ac00 100644
--- a/sys/x86/include/apicvar.h
+++ b/sys/x86/include/apicvar.h
@@ -134,7 +134,8 @@
#define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */
#define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. */
#define IPI_SWI (APIC_IPI_INTS + 8) /* Run clk_intr_event. */
-#define IPI_DYN_FIRST (APIC_IPI_INTS + 9)
+#define IPI_OFF (APIC_IPI_INTS + 9) /* Stop CPU forever */
+#define IPI_DYN_FIRST (APIC_IPI_INTS + 10)
#define IPI_DYN_LAST (254) /* IPIs allocated at runtime */
/*
diff --git a/sys/x86/include/intr_machdep.h b/sys/x86/include/intr_machdep.h
index 9e913440c712..497c89b0a7eb 100644
--- a/sys/x86/include/intr_machdep.h
+++ b/sys/x86/include/intr_machdep.h
@@ -142,6 +142,7 @@ int intr_add_handler(struct intsrc *isrc, const char *name,
int intr_config_intr(struct intsrc *isrc, enum intr_trigger trig,
enum intr_polarity pol);
int intr_describe(struct intsrc *isrc, void *ih, const char *descr);
+void intr_disable_all(void);
void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
u_int intr_next_cpu(int domain);
struct intsrc *intr_lookup_source(int vector);
diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h
index 8b9eb2ec9b66..f5015e9d8a24 100644
--- a/sys/x86/include/x86_smp.h
+++ b/sys/x86/include/x86_smp.h
@@ -77,6 +77,7 @@ extern u_long *ipi_rendezvous_counts[MAXCPU];
inthand_t
IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */
IDTVEC(ipi_swi), /* Runs delayed SWI */
+ IDTVEC(cpuoff), /* CPU goes offline until hard reset */
IDTVEC(cpustop), /* CPU stops & waits to be restarted */
IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */
IDTVEC(rendezvous); /* handle CPU rendezvous */
@@ -93,6 +94,7 @@ void assign_cpu_ids(void);
void cpu_add(u_int apic_id, char boot_cpu);
void cpustop_handler(void);
void cpususpend_handler(void);
+void cpuoff_handler(void);
void init_secondary_tail(void);
void init_secondary(void);
void ipi_startup(int apic_id, int vector);
diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c
index 023c3df22580..a16d2ced8dba 100644
--- a/sys/x86/x86/intr_machdep.c
+++ b/sys/x86/x86/intr_machdep.c
@@ -245,6 +245,26 @@ intr_register_source(struct intsrc *isrc)
return (0);
}
+void
+intr_disable_all(void)
+{
+ /*
+ * Disable all external interrupts. This is used by kexec_reboot() to
+ * prevent problems on the other side when APs are brought up.
+ */
+ for (int v = 0; v < num_io_irqs; v++) {
+ struct intsrc *is;
+
+ is = interrupt_sources[v];
+ if (is == NULL)
+ continue;
+ if (is->is_pic->pic_disable_intr != NULL) {
+ is->is_pic->pic_disable_source(is, PIC_EOI);
+ is->is_pic->pic_disable_intr(is);
+ }
+ }
+}
+
struct intsrc *
intr_lookup_source(int vector)
{
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index c0da41a4d222..6b1715853763 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1696,6 +1696,28 @@ cpususpend_handler(void)
CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}
+void
+cpuoff_handler(void)
+{
+ u_int cpu;
+
+ cpu = PCPU_GET(cpuid);
+
+ /* Time to go catatonic. A reset will be required to leave. */
+ disable_intr();
+ lapic_disable();
+ CPU_SET_ATOMIC(cpu, &suspended_cpus);
+
+ /*
+ * There technically should be no need for the `while` here, since it
+ * cannot be interrupted (interrupts are disabled). Be safe anyway.
+ * Any interrupt at this point will likely be fatal, as the page tables
+ * are likely going away shortly.
+ */
+ while (1)
+ halt();
+}
+
/*
* Handle an IPI_SWI by waking delayed SWI thread.
*/
diff --git a/sys/x86/x86/msi.c b/sys/x86/x86/msi.c
index 9d5a51f9753c..b38247bf6e45 100644
--- a/sys/x86/x86/msi.c
+++ b/sys/x86/x86/msi.c
@@ -219,6 +219,14 @@ msi_disable_intr(struct intsrc *isrc)
struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
msi = msi->msi_first;
+
+ /*
+ * Interrupt sources are always registered, but never unregistered.
+ * Handle the case where MSIs have all been unregistered.
+ */
+ if (msi == NULL)
+ return;
+
msi->msi_enabled--;
if (msi->msi_enabled == 0) {
for (u_int i = 0; i < msi->msi_count; i++)