diff options
122 files changed, 2415 insertions, 1486 deletions
diff --git a/contrib/blocklist/bin/blacklistd.c b/contrib/blocklist/bin/blacklistd.c index b5f9358122ef..cb6ce6578d9c 100644 --- a/contrib/blocklist/bin/blacklistd.c +++ b/contrib/blocklist/bin/blacklistd.c @@ -1,4 +1,4 @@ -/* $NetBSD: blocklistd.c,v 1.11 2025/10/25 16:55:23 christos Exp $ */ +/* $NetBSD: blocklistd.c,v 1.12 2025/10/25 18:43:51 christos Exp $ */ /*- * Copyright (c) 2015 The NetBSD Foundation, Inc. @@ -35,7 +35,7 @@ #ifdef HAVE_SYS_CDEFS_H #include <sys/cdefs.h> #endif -__RCSID("$NetBSD: blocklistd.c,v 1.11 2025/10/25 16:55:23 christos Exp $"); +__RCSID("$NetBSD: blocklistd.c,v 1.12 2025/10/25 18:43:51 christos Exp $"); #include <sys/types.h> #include <sys/socket.h> @@ -329,8 +329,8 @@ again: (*lfun)(LOG_INFO, "released %s/%d:%d after %d seconds", buf, c.c_lmask, c.c_port, c.c_duration); } - state_del(state, &c); - goto again; + if (state_del(state, &c) == 0) + goto again; } } diff --git a/contrib/blocklist/bin/blocklistctl.8 b/contrib/blocklist/bin/blocklistctl.8 index 75228599a9de..d241f2844471 100644 --- a/contrib/blocklist/bin/blocklistctl.8 +++ b/contrib/blocklist/bin/blocklistctl.8 @@ -1,4 +1,4 @@ -.\" $NetBSD: blocklistctl.8,v 1.5 2025/10/25 16:56:27 christos Exp $ +.\" $NetBSD: blocklistctl.8,v 1.6 2025/10/26 13:49:22 christos Exp $ .\" .\" Copyright (c) 2015 The NetBSD Foundation, Inc. .\" All rights reserved. diff --git a/contrib/blocklist/bin/blocklistd.c b/contrib/blocklist/bin/blocklistd.c index c78c560613fc..47c145c7aae1 100644 --- a/contrib/blocklist/bin/blocklistd.c +++ b/contrib/blocklist/bin/blocklistd.c @@ -1,4 +1,4 @@ -/* $NetBSD: blocklistd.c,v 1.11 2025/10/25 16:55:23 christos Exp $ */ +/* $NetBSD: blocklistd.c,v 1.12 2025/10/25 18:43:51 christos Exp $ */ /*- * Copyright (c) 2015 The NetBSD Foundation, Inc. 
@@ -35,7 +35,7 @@ #ifdef HAVE_SYS_CDEFS_H #include <sys/cdefs.h> #endif -__RCSID("$NetBSD: blocklistd.c,v 1.11 2025/10/25 16:55:23 christos Exp $"); +__RCSID("$NetBSD: blocklistd.c,v 1.12 2025/10/25 18:43:51 christos Exp $"); #include <sys/types.h> #include <sys/socket.h> @@ -329,8 +329,8 @@ again: (*lfun)(LOG_INFO, "released %s/%d:%d after %d seconds", buf, c.c_lmask, c.c_port, c.c_duration); } - state_del(state, &c); - goto again; + if (state_del(state, &c) == 0) + goto again; } } diff --git a/contrib/blocklist/bin/state.c b/contrib/blocklist/bin/state.c index 08e2622e223f..bb93904f3489 100644 --- a/contrib/blocklist/bin/state.c +++ b/contrib/blocklist/bin/state.c @@ -1,4 +1,4 @@ -/* $NetBSD: state.c,v 1.2 2025/02/11 17:48:30 christos Exp $ */ +/* $NetBSD: state.c,v 1.3 2025/10/25 18:43:51 christos Exp $ */ /*- * Copyright (c) 2015 The NetBSD Foundation, Inc. @@ -35,7 +35,7 @@ #ifdef HAVE_SYS_CDEFS_H #include <sys/cdefs.h> #endif -__RCSID("$NetBSD: state.c,v 1.2 2025/02/11 17:48:30 christos Exp $"); +__RCSID("$NetBSD: state.c,v 1.3 2025/10/25 18:43:51 christos Exp $"); #include <sys/types.h> #include <sys/socket.h> @@ -131,7 +131,7 @@ state_del(DB *db, const struct conf *c) (*lfun)(LOG_DEBUG, "%s: returns %d", __func__, rv); (*db->sync)(db, 0); } - return 0; + return rv; default: (*lfun)(LOG_ERR, "%s: failed (%m)", __func__); return -1; diff --git a/lib/libsys/_libsys.h b/lib/libsys/_libsys.h index 6bd768708a78..12417b572a60 100644 --- a/lib/libsys/_libsys.h +++ b/lib/libsys/_libsys.h @@ -32,6 +32,7 @@ struct itimerspec; struct itimerval; struct jail; struct kevent; +struct kexec_segment; struct kld_file_stat; struct mac; struct module_stat; @@ -470,6 +471,7 @@ typedef int (__sys_getgroups_t)(int, gid_t *); typedef int (__sys_setgroups_t)(int, const gid_t *); typedef int (__sys_jail_attach_jd_t)(int); typedef int (__sys_jail_remove_jd_t)(int); +typedef int (__sys_kexec_load_t)(uint64_t, u_long, struct kexec_segment *, u_long); _Noreturn void __sys__exit(int 
rval); int __sys_fork(void); @@ -876,6 +878,7 @@ int __sys_getgroups(int gidsetsize, gid_t * gidset); int __sys_setgroups(int gidsetsize, const gid_t * gidset); int __sys_jail_attach_jd(int fd); int __sys_jail_remove_jd(int fd); +int __sys_kexec_load(uint64_t entry, u_long nseg, struct kexec_segment * segments, u_long flags); __END_DECLS #endif /* __LIBSYS_H_ */ diff --git a/lib/libsys/closefrom.2 b/lib/libsys/closefrom.2 index 1885a6fdeaa8..e6b4a5a3e9d7 100644 --- a/lib/libsys/closefrom.2 +++ b/lib/libsys/closefrom.2 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd May 17, 2025 +.Dd October 27, 2025 .Dt CLOSEFROM 2 .Os .Sh NAME @@ -94,6 +94,11 @@ function first appeared in .Fx 8.0 . .Pp The +.Fn close_range +function first appeared in +.Fx 12.2 . +.Pp +The .Dv CLOSE_RANGE_CLOFORK flag appeared in .Fx 15.0 . diff --git a/lib/libsys/syscalls.map b/lib/libsys/syscalls.map index b5400b9849b3..d00c862eb462 100644 --- a/lib/libsys/syscalls.map +++ b/lib/libsys/syscalls.map @@ -817,4 +817,6 @@ FBSDprivate_1.0 { __sys_jail_attach_jd; _jail_remove_jd; __sys_jail_remove_jd; + _kexec_load; + __sys_kexec_load; }; diff --git a/libexec/rc/rc.d/blacklistd b/libexec/rc/rc.d/blacklistd index 9157e258f43f..175e3e8c56b3 100755 --- a/libexec/rc/rc.d/blacklistd +++ b/libexec/rc/rc.d/blacklistd @@ -29,7 +29,7 @@ # # PROVIDE: blacklistd -# REQUIRE: netif pf +# REQUIRE: netif ipfilter ipfw pf . /etc/rc.subr diff --git a/libexec/rc/rc.d/blocklistd b/libexec/rc/rc.d/blocklistd index 24cbae77fd40..f979162ec3e0 100644..100755 --- a/libexec/rc/rc.d/blocklistd +++ b/libexec/rc/rc.d/blocklistd @@ -29,7 +29,7 @@ # # PROVIDE: blocklistd -# REQUIRE: netif pf +# REQUIRE: netif ipfilter ipfw pf . 
/etc/rc.subr diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c index eed390ba5bec..27ccaea2c78f 100644 --- a/sbin/ipfw/ipfw2.c +++ b/sbin/ipfw/ipfw2.c @@ -648,11 +648,7 @@ do_cmd(int optname, void *optval, uintptr_t optlen) if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); - if (optname == IP_FW_GET || - optname == IP_FW_ADD || optname == IP_FW3 || - optname == IP_FW_NAT_GET_CONFIG || - optname < 0 || - optname == IP_FW_NAT_GET_LOG) { + if (optname == IP_FW3 || optname < 0) { if (optname < 0) optname = -optname; i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, @@ -5802,7 +5798,7 @@ ipfw_add(char *av[]) sz = default_off + sizeof(ipfw_obj_ctlv) + tlen + rlen; if ((tbuf = calloc(1, sz)) == NULL) - err(EX_UNAVAILABLE, "malloc() failed for IP_FW_ADD"); + err(EX_UNAVAILABLE, "malloc() failed for IP_FW_XADD"); op3 = (ip_fw3_opheader *)tbuf; /* Tables first */ ctlv = (ipfw_obj_ctlv *)(op3 + 1); diff --git a/share/man/man9/PCI_IOV_ADD_VF.9 b/share/man/man9/PCI_IOV_ADD_VF.9 index 512b0b8668cc..95bf5a218e8e 100644 --- a/share/man/man9/PCI_IOV_ADD_VF.9 +++ b/share/man/man9/PCI_IOV_ADD_VF.9 @@ -41,7 +41,7 @@ The .Fn PCI_IOV_ADD_VF method is called by the PCI Single-Root I/O Virtualization .Pq SR-IOV -infrastructure when it is initializating a new Virtual Function (VF) as a child +infrastructure when it is initializing a new Virtual Function (VF) as a child of the given Physical Function (PF) device. This method will not be called until a successful call to .Xr PCI_IOV_INIT 9 diff --git a/share/man/man9/atomic.9 b/share/man/man9/atomic.9 index df24cd4a4d2b..c9133c6311a5 100644 --- a/share/man/man9/atomic.9 +++ b/share/man/man9/atomic.9 @@ -272,7 +272,7 @@ In C11, a release fence by one thread synchronizes with an acquire fence by another thread when an atomic load that is prior to the acquire fence (by program order) reads the value written by an atomic store that is subsequent to the release fence. 
-In constrast, in +In contrast, in .Fx , because of the atomicity of ordinary, naturally aligned loads and stores, fences can also be synchronized by ordinary loads diff --git a/share/man/man9/bhnd.9 b/share/man/man9/bhnd.9 index 722ae6b6a393..ed3007ea748d 100644 --- a/share/man/man9/bhnd.9 +++ b/share/man/man9/bhnd.9 @@ -2350,7 +2350,7 @@ function retains and returns a reference to the provider registered for .Fa service with the parent .Xr bhnd 4 -bus of devce +bus of device .Fa dev , if available. On success, the caller is responsible for releasing this provider reference diff --git a/share/man/man9/bus_dma.9 b/share/man/man9/bus_dma.9 index b644eeb2a476..0bf27eb5eb22 100644 --- a/share/man/man9/bus_dma.9 +++ b/share/man/man9/bus_dma.9 @@ -197,7 +197,7 @@ in addition to restrictions that differ between unrelated groups of transactions, the driver can first create a .Dq parent -tag that decribes the common restrictions. +tag that describes the common restrictions. The per-group tags can then inherit these restrictions from this .Dq parent tag rather than having to list them explicitly when creating the per-group tags. @@ -733,7 +733,7 @@ Fills in the selected fields of the template with a variable number of key-value parameters. The macros listed below take an argument of the specified type and encapsulate it into a key-value structure that is directly usable as a parameter argument. -Muliple parameters may be provided at once. +Multiple parameters may be provided at once. 
.Bd -literal BD_PARENT() void * BD_ALIGNMENT() uintmax_t diff --git a/share/man/man9/copy.9 b/share/man/man9/copy.9 index 8b2d0dc67727..3a3105ddf644 100644 --- a/share/man/man9/copy.9 +++ b/share/man/man9/copy.9 @@ -43,7 +43,7 @@ .Nm copyout_nofault , .Nm copystr , .Nm copyinstr -.Nd heterogenous address space copy functions +.Nd heterogeneous address space copy functions .Sh SYNOPSIS .In sys/types.h .In sys/systm.h diff --git a/share/man/man9/crypto_request.9 b/share/man/man9/crypto_request.9 index af62b9089561..77e6a60b46de 100644 --- a/share/man/man9/crypto_request.9 +++ b/share/man/man9/crypto_request.9 @@ -383,7 +383,7 @@ depending on whether the driver is implemented by software or hardware. dispatches the request asynchronously. If the driver is inherently synchronous, the request is queued to a taskqueue backed by a pool of worker threads. -This can increase througput by allowing requests from a single producer to be +This can increase throughput by allowing requests from a single producer to be processed in parallel. By default the pool is sized to provide one thread for each CPU. Worker threads dequeue requests and pass them to the driver asynchronously. diff --git a/share/man/man9/domain.9 b/share/man/man9/domain.9 index dab8cff89e12..d7e743eaf247 100644 --- a/share/man/man9/domain.9 +++ b/share/man/man9/domain.9 @@ -173,7 +173,7 @@ Once a domain is added it cannot be completely unloaded. This is because there is no reference counting system in place to determine if there are any active references from sockets within that domain. -However, the exprimental +However, the experimental .Fn domain_remove exists, and unloadable domains may be supported in the future. .Pp diff --git a/share/man/man9/efirt.9 b/share/man/man9/efirt.9 index c31f52bf2245..e085916801d5 100644 --- a/share/man/man9/efirt.9 +++ b/share/man/man9/efirt.9 @@ -183,7 +183,7 @@ is NULL. .It Dv EIO The variable could not be retrieved due to a hardware error. 
.It Dv EDOOFUS -The variable could not be retireved due to an authentication failure. +The variable could not be retrieved due to an authentication failure. .El .Pp The diff --git a/share/man/man9/g_geom.9 b/share/man/man9/g_geom.9 index c5b0c0aded2d..99d0ba074e89 100644 --- a/share/man/man9/g_geom.9 +++ b/share/man/man9/g_geom.9 @@ -66,7 +66,7 @@ function is very similar to .Fn g_new_geomf except that it accepts a regular string instead of a .Xr printf 3 Ns --like format strng as the geom's name. +-like format string as the geom's name. .Pp The .Fn g_destroy_geom diff --git a/share/man/man9/gone_in.9 b/share/man/man9/gone_in.9 index 7521adfda204..1b60e1eb10c2 100644 --- a/share/man/man9/gone_in.9 +++ b/share/man/man9/gone_in.9 @@ -1,6 +1,6 @@ .\" Copyright (c) 2021 The FreeBSD Foundation .\" -.\" This document was written by Ed Maste under sponsorhip from +.\" This document was written by Ed Maste under sponsorship from .\" The FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without @@ -23,7 +23,7 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd August 16, 2021 +.Dd June 24, 2025 .Dt GONE_IN 9 .Os .Sh NAME @@ -33,14 +33,15 @@ .Sh SYNOPSIS .In sys/systm.h .Ft void -.Fn gone_in "int major" "const char *msg" +.Fn gone_in "int major" "const char *msg" "..." .Ft void -.Fn gone_in_dev "device_t dev" "int major" "const char *msg" +.Fn gone_in_dev "device_t dev" "int major" "const char *msg" "..." .Sh DESCRIPTION The -.Fn gone_in -functions are used to provide a notice that the kernel is using a driver or -some other functionality that is deprecated, and will be removed in a future +.Nm gone_in +functions are used to provide a notice that the kernel is actively using a +driver or some other functionality that is deprecated, and is planned for +removal in a future .Fx release. 
The notice is sent to the kernel @@ -51,30 +52,29 @@ The argument specifies the major version of the .Fx release that will remove the deprecated functionality. +The notice shall be printed only once, thus +.Nm +functions are safe to use in often executed code paths. +.Pp +.Nm gone_in_dev +will prepend driver name before the notice. .Pp In releases before .Fa major -the deprecation notice states -.Do -Deprecated code (to be removed in FreeBSD -.Fa major Ns ): -.Fa msg -.Dc . -In releases equal to and after -.Fa major -the notice states +the provided notice will be appended with .Do -Obsolete code will be removed soon: -.Fa msg +To be removed in FreeBSD +.Fa major Ns .Dc . .Sh EXAMPLES .Bd -literal -offset indent void -sample_init(void) +example_api(foo_t *args) { - /* Initializaiton code omitted. */ + gone_in(16, "Warning! %s[%u] uses obsolete API. ", + curthread->td_proc->p_comm, curthread->td_proc->p_pid); - gone_in(14, "Giant-locked filesystem"); + /* API implementation omitted. */ } int @@ -82,7 +82,7 @@ example_driver_attach(struct example_driver_softc *sc) { /* Attach code omitted. */ - gone_in_dev(sc->dev, 14, "Giant-locked driver"); + gone_in_dev(sc->dev, 16, "driver is deprecated"); } .Ed .Sh HISTORY diff --git a/share/man/man9/ifnet.9 b/share/man/man9/ifnet.9 index 3c45e4f29e2d..e81c2990c13c 100644 --- a/share/man/man9/ifnet.9 +++ b/share/man/man9/ifnet.9 @@ -482,7 +482,7 @@ This is initialized by driver at attach. .It Fn if_getaddrlen .It Fn if_gethwaddr .It Fn if_getbroadcastaddr Fn if_setbroadcastaddr -Access the interface broadcast addess. +Access the interface broadcast address. .It Fn if_setmtu .It Fn if_getmtu Access the interface MTU. diff --git a/share/man/man9/nvmem.9 b/share/man/man9/nvmem.9 index 812cd09a5e35..fa88cbb91854 100644 --- a/share/man/man9/nvmem.9 +++ b/share/man/man9/nvmem.9 @@ -59,19 +59,19 @@ Get the size of the cell base on the reg property on the node. 
Return the size or ENOENT if the cell name wasn't found .It Fn nvmem_read_cell_by_name "phandle_t node" "const char *name" "void *cell" "size_t buflen" Get the cell content based on the name. -Return 0 on sucess or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, +Return 0 on success or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, EINVAL if the size isn't correct. .It Fn nvmem_read_cell_by_idx "phandle_t node" "int idx" "void *cell" "size_t buflen" Get the cell content based on the id. -Return 0 on sucess or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, +Return 0 on success or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, EINVAL if the size isn't correct. .It Fn nvmem_write_cell_by_name "phandle_t node" "const char *name" "void *cell" "size_t buflen" Write the cell content based on the name. -Return 0 on sucess or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, +Return 0 on success or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, EINVAL if the size isn't correct. .It Fn nvmem_write_cell_by_idx "phandle_t node" "int idx" "void *cell" "size_t buflen" Write the cell content based on the id. -Return 0 on sucess or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, +Return 0 on success or ENOENT if the cell doesn't exists, ENXIO if no provider device was found, EINVAL if the size isn't correct. .El .Sh DEVICE METHODS diff --git a/share/man/man9/ofw_bus_is_compatible.9 b/share/man/man9/ofw_bus_is_compatible.9 index 1b44807e84e8..fcfe8755cf86 100644 --- a/share/man/man9/ofw_bus_is_compatible.9 +++ b/share/man/man9/ofw_bus_is_compatible.9 @@ -61,7 +61,7 @@ older revisions of the driver. If hardware revision B is backward compatible with revision A device tree node can signal this compatibility by providing both "vndr,hrdwrA" and "vndr,hrdwrB" strings in -the "compatibile" property value. 
+the "compatible" property value. This way older driver can use features available only in revision A, and the new version of the driver can take advantage of revision B feature set. diff --git a/share/man/man9/pci.9 b/share/man/man9/pci.9 index eeb62a63a2bd..871f69f887a6 100644 --- a/share/man/man9/pci.9 +++ b/share/man/man9/pci.9 @@ -523,7 +523,7 @@ device is not a PCI-express device, returns zero. When completion timeouts are disabled for .Fa dev , -this function returns the maxmimum timeout that would be used if timeouts +this function returns the maximum timeout that would be used if timeouts were enabled. .Pp The diff --git a/share/man/man9/refcount.9 b/share/man/man9/refcount.9 index 78631f9a865a..7375f429a607 100644 --- a/share/man/man9/refcount.9 +++ b/share/man/man9/refcount.9 @@ -106,7 +106,7 @@ but additionally checks that the value does not overflow as result of the operation. It returns .Dv true -if the reference was sucessfully obtained, and +if the reference was successfully obtained, and .Dv false if it was not, due to the overflow. .Pp diff --git a/share/man/man9/seqc.9 b/share/man/man9/seqc.9 index e13d73ecb5d7..b1e59b6b7af1 100644 --- a/share/man/man9/seqc.9 +++ b/share/man/man9/seqc.9 @@ -81,7 +81,7 @@ repeated. In case when sequence number is odd the object change is in progress and the reader will wait until the write will the sequence number will become even. .Sh EXAMPLES -The following example for a writer changees the +The following example for a writer changes the .Va var1 and .Va var2 diff --git a/share/man/man9/style.9 b/share/man/man9/style.9 index 26c7a3b2aa64..c9c3af23864a 100644 --- a/share/man/man9/style.9 +++ b/share/man/man9/style.9 @@ -816,7 +816,7 @@ If no local variables are declared, the first line should be a statement. Older versions of this .Nm document required a blank line before code. -Such lines should be removed when signficant changes are made to the code. 
+Such lines should be removed when significant changes are made to the code. .Pp Use .Xr printf 3 , @@ -947,7 +947,7 @@ namespace foo::bar { .Ed .Pp Member function declarations should follow the same style used for standalone -function protoypes except that a space should be used between a function's +function prototypes except that a space should be used between a function's return type and name. .Pp Function definitions at the top level should use a newline after the function diff --git a/share/man/man9/vn_fullpath.9 b/share/man/man9/vn_fullpath.9 index 9815abc3c86c..af459ed281c0 100644 --- a/share/man/man9/vn_fullpath.9 +++ b/share/man/man9/vn_fullpath.9 @@ -107,7 +107,7 @@ than one name (hard links), not all file systems use the name cache be used for more than one file (in the context of file systems covering other file systems); a file may have no name (if deleted but still open or referenced). -However, the resulting string may still be more useable to a user than +However, the resulting string may still be more usable to a user than a vnode pointer value, or a device number and inode number. Code consuming the results of this function should anticipate (and properly handle) failure. @@ -187,7 +187,7 @@ otherwise, an error number is returned. .Xr free 9 .Sh AUTHORS .An -nosplit -This manual page was initally written by +This manual page was initially written by .An Robert Watson Aq Mt rwatson@FreeBSD.org to describe the .Fn vn_fullpath diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e98bae9eb6c5..8691387a5a8e 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -204,6 +204,17 @@ IDTVEC(spuriousint) jmp doreti /* + * Executed by a CPU when it receives an IPI_OFF from another CPU. + * Should never return + */ + INTR_HANDLER cpuoff + KMSAN_ENTER + call cpuoff_handler + call as_lapic_eoi + KMSAN_LEAVE + jmp doreti + +/* * Executed by a CPU when it receives an IPI_SWI. 
*/ INTR_HANDLER ipi_swi diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index eb1b746f5893..2716784ee871 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -57,6 +57,7 @@ #include <vm/vm_param.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <sys/kexec.h> #include <sys/proc.h> #include <x86/apicreg.h> #include <machine/cpu.h> @@ -65,6 +66,7 @@ #include <machine/proc.h> #include <machine/segments.h> #include <machine/efi.h> +#include <machine/kexec.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13)); ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14)); ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15)); ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags)); + +/* Kexec */ +ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry)); +ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments)); +ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX); +ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image)); +ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage)); +ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size)); +ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf)); +ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target)); diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c new file mode 100644 index 000000000000..8189a48e9ae9 --- /dev/null +++ b/sys/amd64/amd64/kexec_support.c @@ -0,0 +1,300 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_radix.h> + +#include <machine/intr_machdep.h> +#include <machine/kexec.h> +#include <machine/md_var.h> +#include <machine/pmap.h> +#include <x86/apicvar.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. + * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +/* + * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages. + * identity: This is for an identity map, treat `start` as a physical address. + * Only valid here if do_pte is false. 
+ */ +static void +kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start, + vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages) +{ + vm_paddr_t mpa; + vm_offset_t pg; + vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR; + vm_page_t m; + vm_pindex_t i, j, k, l; + + pg = start & ~(stride - 1); + i = pmap_pml4e_index(pg); + j = pmap_pdpe_index(pg); + k = pmap_pde_index(pg); + l = pmap_pte_index(pg); + for (; pg < start + size; i++, j = 0, k = 0, l = 0) { + /* + * Walk linearly, as above, but one fell swoop, one page at a + * time. + */ + if (root[i] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + root[i] = mpa | PG_RW | PG_V; + } + pdp_entry_t *pdp = + (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME)); + for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) { + if (pdp[j] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pdp[j] = mpa | PG_RW | PG_V; + } + pd_entry_t *pde = + (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME)); + for (; k < NPDEPG && pg < start + size; k++, l = 0) { + if (pde[k] == 0) { + if (!do_pte) { + pde[k] = + (identity ? pg : pmap_kextract(pg)) | + PG_RW | PG_PS | PG_V; + pg += NBPDR; + continue; + } + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pde[k] = mpa | PG_V | PG_RW; + } else if ((pde[k] & PG_PS) != 0) { + pg += NBPDR; + continue; + } + /* Populate the PTEs. 
*/ + for (; l < NPTEPG && pg < start + size; + l++, pg += PAGE_SIZE) { + pt_entry_t *pte = + (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME); + pte[pmap_pte_index(pg)] = + pmap_kextract(pg) | PG_RW | PG_V; + } + } + } + } +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + void (*kexec_do_tramp)(void) = image->md_image; + + intr_disable_all(); + lapic_disable(); + kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page), + kexec_do_tramp); + + for (;;) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + struct pctrie_iter pct_iter; + pml4_entry_t *PT4; + pdp_entry_t *PDP_l; + pd_entry_t *PD_l0; + vm_offset_t va; + int i; + + /* + * Start building the page table. + * First part of the page table is standard for all. + */ + vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3; + vm_page_t m; + + if (la57) + return (EINVAL); + + vm_radix_iter_init(&pct_iter, &image->map_obj->rtree); + /* Working in linear space in the mapped space, `va` is our tracker. 
*/ + m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex); + va = (vm_offset_t)image->map_addr + ptoa(m->pindex); + /* We'll find a place for these later */ + PT4 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pdp_l = VM_PAGE_TO_PHYS(m); + PDP_l = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l0 = VM_PAGE_TO_PHYS(m); + PD_l0 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l1 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l2 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l3 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW; + PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW; + PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW; + PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW; + PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */ + PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* Map the target(s) in 2MB chunks. */ + for (i = 0; i < KEXEC_SEGMENT_MAX; i++) { + struct kexec_segment_stage *s = &image->segments[i]; + + if (s->size == 0) + break; + kexec_generate_page_tables(PT4, s->target, s->size, false, + true, &pct_iter); + } + /* Now create the source page tables */ + kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true, + false, &pct_iter); + kexec_generate_page_tables(PT4, + trunc_page((vm_offset_t)kexec_do_reboot_trampoline), + PAGE_SIZE, true, false, &pct_iter); + KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n")); + + /* MD control pages start at this next page. 
*/ + image->md_image = (void *)(image->map_addr + ptoa(m->pindex)); + bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size); + + /* Save the image into the MD page(s) right after the trampoline */ + bcopy(image, (void *)((vm_offset_t)image->md_image + + (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot), + sizeof(*image)); + + return (0); +} + +/* + * Required pages: + * - L4 (1) (root) + * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map) + * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1) + * - L1 (PDR) - 1 (kexec trampoline page, first MD page) + * - kexec_do_reboot trampoline - 1 + * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case) + * + * Minimum 9 pages for the direct map. + */ +int +kexec_md_pages(struct kexec_segment *seg_in) +{ + struct kexec_segment *segs = seg_in; + vm_size_t pages = 13; /* Minimum number of starting pages */ + vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */ + vm_size_t source_total = 0; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + vm_offset_t start, end; + if (segs[i].memsz == 0) + break; + + end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz); + start = trunc_2mpage((vm_offset_t)segs[i].mem); + start = max(start, cur_addr + 1); + /* + * Round to cover the full range of page table pages for each + * segment. + */ + source_total += round_2mpage(end - start); + + /* + * Bottom 4GB are identity mapped already in the count, so skip + * any segments that end up there, this will short-circuit that. 
+ */ + if (end <= cur_addr + 1) + continue; + + if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) { + /* Need a new 512GB mapping page */ + pages++; + pages += howmany(end - (start & ~PML4MASK), NBPML4); + pages += howmany(end - (start & ~PDPMASK), NBPDP); + pages += howmany(end - (start & ~PDRMASK), NBPDR); + + } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) { + pages++; + pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1; + pages += howmany(end - (start & ~PDRMASK), NBPDR); + } + + } + /* Be pessimistic when totaling up source pages. We likely + * can't use superpages, so need to map each page individually. + */ + pages += howmany(source_total, NBPDR); + pages += howmany(source_total, NBPDP); + pages += howmany(source_total, NBPML4); + + /* + * Be intentionally sloppy adding in the extra page table pages. It's + * better to go over than under. + */ + pages += howmany(pages * PAGE_SIZE, NBPDR); + pages += howmany(pages * PAGE_SIZE, NBPDP); + pages += howmany(pages * PAGE_SIZE, NBPML4); + + /* Add in the trampoline pages */ + pages += howmany(kexec_do_reboot_size, PAGE_SIZE); + + return (pages); +} diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S new file mode 100644 index 000000000000..6a2de676bc35 --- /dev/null +++ b/sys/amd64/amd64/kexec_tramp.S @@ -0,0 +1,91 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> +#include "assym.inc" + +/* + * Take a pointer to the image, copy each segment, and jump to the trampoline. + * + * Assumptions: + * - image is in safe memory + * - We're already running out of the new "identity" map. + * - All registers are free game, so go nuts + * - Interrupts are disabled + * - All APs are disabled + */ +ENTRY(kexec_do_reboot) + /* + r9: image pointer + r10: segment pointer + r11: segment counter + */ + leaq kexec_stack(%rip), %rsp + /* Get the saved kexec_image. */ + leaq kexec_saved_image(%rip), %r9 + leaq KEXEC_SEGMENTS(%r9), %r10 + movq $KEXEC_SEGMENT_MAX, %r11 +copy_segment: + movq KEXEC_SEGMENT_SIZE(%r10), %rcx + cmpq $0, %rcx + je done + shrq $3, %rcx + movq KEXEC_SEGMENT_TARGET(%r10), %rdi + movq KEXEC_SEGMENT_MAP(%r10), %rsi + rep + movsq + addq $KEXEC_STAGED_SEGMENT_SIZE, %r10 + decq %r11 + jg copy_segment + +done: + pushq KEXEC_ENTRY(%r9) + ret +fail: + jmp fail +END(kexec_do_reboot) +ENTRY(kexec_do_reboot_trampoline) + /* Set new page table, clears most of TLB. */ + movq %rdi, %cr3 + + /* Now flush the rest of the TLB, including global pages. 
*/ + movq %cr4, %rax + andq $~CR4_PGE, %rax + movq %rax, %cr4 + jmp *%rsi +END(kexec_do_reboot_trampoline) +CNAME(kexec_saved_image): + .globl kexec_saved_image + .space KEXEC_IMAGE_SIZE + .quad 0 + /* We don't need more than quad, so just fill out the page. */ + .p2align PAGE_SHIFT + kexec_stack: +CNAME(kexec_do_reboot_size): + .globl kexec_do_reboot_size + .quad . - kexec_do_reboot diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 00e99f9df192..96ed0a2cc3ba 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,10 @@ cpu_mp_start(void) setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU offline */ + setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff), + SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); @@ -176,6 +180,15 @@ cpu_mp_start(void) #endif } +void +cpu_mp_stop(void) +{ + cpuset_t other_cpus = all_cpus; + + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + offline_cpus(other_cpus); +} + /* * AP CPU's call this to initialize themselves. */ diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h new file mode 100644 index 000000000000..70bc2991be3f --- /dev/null +++ b/sys/amd64/include/kexec.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_KEXEC_H_ +#define _AMD64_KEXEC_H_ + +struct kexec_segment; +struct kexec_image; +int kexec_md_pages(struct kexec_segment *); +extern void kexec_do_reboot(void); +extern long kexec_do_reboot_size; +extern void *kexec_saved_image; +extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void)); +#define KEXEC_MD_PAGES(x) kexec_md_pages(x) + + +#endif /* _AMD64_KEXEC_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index bff92570ff82..28c372a2e556 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -30,6 +30,7 @@ inthand_t IDTVEC(ipi_intr_bitmap_handler_pti), IDTVEC(ipi_swi_pti), IDTVEC(cpustop_pti), + IDTVEC(cpuoff_pti), IDTVEC(cpususpend_pti), IDTVEC(rendezvous_pti); diff --git a/sys/arm/include/kexec.h b/sys/arm/include/kexec.h new file mode 100644 index 000000000000..50391d32812a --- /dev/null +++ b/sys/arm/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper 
Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ARM_KEXEC_H_ +#define _ARM_KEXEC_H_ +/* Stub: always fails with ENOSYS. static inline so each includer gets a private copy. */ +static inline int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _ARM_KEXEC_H_ */ diff --git a/sys/arm64/arm64/db_disasm.c b/sys/arm64/arm64/db_disasm.c index ab1002560b20..14ae2acc2ce6 100644 --- a/sys/arm64/arm64/db_disasm.c +++ b/sys/arm64/arm64/db_disasm.c @@ -31,6 +31,7 @@ #include <ddb/db_access.h> #include <ddb/db_sym.h> +#include <machine/armreg.h> #include <machine/disassem.h> static u_int db_disasm_read_word(vm_offset_t); diff --git a/sys/arm64/arm64/kexec_support.c b/sys/arm64/arm64/kexec_support.c new file mode 100644 index 000000000000..8b9719c05b67 --- /dev/null +++ b/sys/arm64/arm64/kexec_support.c @@ -0,0 +1,188 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> + +#include <machine/armreg.h> +#include <machine/pmap.h> +#include <machine/pte.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. 
+ * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +extern pt_entry_t pagetable_l0_ttbr0_bootstrap[]; +extern unsigned long initstack_end[]; +void switch_stack(void *, void (*)(void *, void *, struct kexec_image *), void *); + +#define SCTLR_EL1_NO_MMU (SCTLR_RES1 | SCTLR_LSMAOE | SCTLR_nTLSMD | \ + SCTLR_EIS | SCTLR_TSCXT | SCTLR_EOS) +#define vm_page_offset(m) ((vm_offset_t)(m) - vm_page_base) +static inline vm_page_t +phys_vm_page(vm_page_t m, vm_offset_t vm_page_v, vm_paddr_t vm_page_p) +{ + return ((vm_page_t)((vm_offset_t)m - vm_page_v + vm_page_p)); +} + +/* First 2 args are filler for switch_stack() */ +static void __aligned(16) __dead2 +kexec_reboot_bottom( void *arg1 __unused, void *arg2 __unused, + struct kexec_image *image) +{ + void (*e)(void) = (void *)image->entry; + vm_offset_t vm_page_base = (vm_offset_t)vm_page_array; + vm_paddr_t vm_page_phys = pmap_kextract((vm_offset_t)vm_page_array); + struct kexec_segment_stage *phys_segs = + (void *)pmap_kextract((vm_offset_t)&image->segments); + vm_paddr_t from_pa, to_pa; + vm_size_t size; + vm_page_t first, m, mp; + struct pctrie_iter pct_i; + + /* + * Create a linked list of all pages in the object before we disable the + * MMU. Once the MMU is disabled we can't use the vm_radix iterators, + * as they rely on virtual address pointers. + */ + first = NULL; + vm_radix_iter_init(&pct_i, &image->map_obj->rtree); + VM_RADIX_FORALL(m, &pct_i) { + if (first == NULL) + first = m; + else + SLIST_INSERT_AFTER(mp, m, plinks.s.ss); + mp = m; + } + + /* + * We're running out of the identity map now, disable the MMU before we + * continue. It's possible page tables can be overwritten, which would + * be very bad if we were running with the MMU enabled. + */ + WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU); + isb(); + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + if (phys_segs[i].size == 0) + break; + to_pa = phys_segs[i].target; + /* Copy the segment here... 
*/ + for (vm_page_t p = phys_segs[i].first_page; + p != NULL && to_pa - phys_segs[i].target < phys_segs[i].size; + p = SLIST_NEXT(p, plinks.s.ss)) { + p = phys_vm_page(p, vm_page_base, vm_page_phys); + from_pa = p->phys_addr; + if (p->phys_addr == to_pa) { + to_pa += PAGE_SIZE; + continue; + } + for (size = PAGE_SIZE / sizeof(register_t); + size > 0; --size) { + *(register_t *)to_pa = *(register_t *)from_pa; + to_pa += sizeof(register_t); + from_pa += sizeof(register_t); + } + } + } + invalidate_icache(); + e(); + while (1) + ; +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + uintptr_t ptr; + register_t reg; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + if (image->segments[i].size > 0) + cpu_dcache_inv_range((void *)PHYS_TO_DMAP(image->segments[i].target), + image->segments[i].size); + } + ptr = pmap_kextract((vm_offset_t)kexec_reboot_bottom); + serror_disable(); + + reg = pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap); + set_ttbr0(reg); + cpu_tlb_flushID(); + + typeof(kexec_reboot_bottom) *p = (void *)ptr; + switch_stack((void *)pmap_kextract((vm_offset_t)initstack_end), + p, image); + while (1) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + vm_paddr_t tmp; + pt_entry_t *pte; + + /* Create L2 page blocks for the trampoline. L0/L1 are from the startup. */ + + /* + * There are exactly 2 pages before the pagetable_l0_ttbr0_bootstrap, so + * move to there. + */ + pte = pagetable_l0_ttbr0_bootstrap; + pte -= (Ln_ENTRIES * 2); /* move to start of L2 pages */ + + /* + * Populate the identity map with symbols we know we'll need before we + * turn off the MMU. + */ + tmp = pmap_kextract((vm_offset_t)kexec_reboot_bottom); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + tmp = pmap_kextract((vm_offset_t)initstack_end); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + /* We'll need vm_page_array for doing offset calculations. 
*/ + tmp = pmap_kextract((vm_offset_t)&vm_page_array); + pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN); + + return (0); +} diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index d35e334905a7..3ec12140f139 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -325,6 +325,19 @@ mp_virtdone: b init_secondary LEND(mpentry_common) + +ENTRY(mp_cpu_spinloop) +0: + wfe + ldr x0, mp_cpu_spin_table_release_addr + cbz x0, 0b + blr x0 + .globl mp_cpu_spin_table_release_addr +mp_cpu_spin_table_release_addr: + .quad 0 + .globl mp_cpu_spinloop_end +mp_cpu_spinloop_end: +END(mp_cpu_spinloop) #endif /* @@ -475,6 +488,29 @@ LENTRY(enter_kernel_el) eret LEND(enter_kernel_el) +/* Turn off the MMU. Install ttbr0 from the bootstrap page table, and go there. + * Does not return. + * - x0 - target address to jump to after stopping the MMU. + * - x1 - kernel load address + */ +ENTRY(stop_mmu) + mov x16, x0 /* Save target. */ + ldr x2, =(1f - KERNBASE) + add x17, x1, x2 + ldr x3, =(pagetable_l0_ttbr0_bootstrap - KERNBASE) + add x1, x1, x3 + msr ttbr0_el1, x1 + isb + br x17 +1: + BTI_J + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_M + bic x0, x0, SCTLR_C + msr sctlr_el1, x0 + isb + br x16 +END(stop_mmu) /* * Get the physical address the kernel was loaded at. 
*/ @@ -1094,12 +1130,19 @@ tcr: TCR_SH0_IS | TCR_ORGN0_WBWA | TCR_IRGN0_WBWA) LEND(start_mmu) +ENTRY(switch_stack) + mov sp, x0 + mov x16, x1 + br x16 +END(switch_stack) + ENTRY(abort) b abort END(abort) .bss .align PAGE_SHIFT + .globl initstack_end initstack: .space BOOT_STACK_SIZE initstack_end: @@ -1116,6 +1159,7 @@ initstack_end: * L0 for user */ .globl pagetable_l0_ttbr1 + .globl pagetable_l0_ttbr0_bootstrap pagetable: pagetable_l3_ttbr1: .space (PAGE_SIZE * L3_PAGE_COUNT) diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c index e4d011df3a06..0bdd2ecfd8a7 100644 --- a/sys/arm64/arm64/mp_machdep.c +++ b/sys/arm64/arm64/mp_machdep.c @@ -60,6 +60,7 @@ #include <machine/debug_monitor.h> #include <machine/intr.h> #include <machine/smp.h> +#include <machine/vmparam.h> #ifdef VFP #include <machine/vfp.h> #endif @@ -103,6 +104,7 @@ static void ipi_hardclock(void *); static void ipi_preempt(void *); static void ipi_rendezvous(void *); static void ipi_stop(void *); +static void ipi_off(void *); #ifdef FDT static u_int fdt_cpuid; @@ -193,6 +195,7 @@ release_aps(void *dummy __unused) intr_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL); intr_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL); intr_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL); + intr_ipi_setup(IPI_OFF, "off", ipi_off, NULL); atomic_store_int(&aps_started, 0); atomic_store_rel_int(&aps_ready, 1); @@ -390,6 +393,34 @@ ipi_stop(void *dummy __unused) CTR0(KTR_SMP, "IPI_STOP (restart)"); } +void stop_mmu(vm_paddr_t, vm_paddr_t) __dead2; +extern uint32_t mp_cpu_spinloop[]; +extern uint32_t mp_cpu_spinloop_end[]; +extern uint64_t mp_cpu_spin_table_release_addr; +static void +ipi_off(void *dummy __unused) +{ + CTR0(KTR_SMP, "IPI_OFF"); + if (psci_present) + psci_cpu_off(); + else { + uint64_t release_addr; + vm_size_t size; + + size = (vm_offset_t)&mp_cpu_spin_table_release_addr - + (vm_offset_t)mp_cpu_spinloop; + release_addr = PCPU_GET(release_addr) - size; + isb(); + 
invalidate_icache(); + /* Go catatonic, don't take any interrupts. */ + intr_disable(); + stop_mmu(release_addr, pmap_kextract(KERNBASE)); + + + } + CTR0(KTR_SMP, "IPI_OFF failed"); +} + struct cpu_group * cpu_topo(void) { @@ -511,6 +542,7 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain, vm_paddr_t release_addr) pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK; bootpcpu = pcpup; + pcpup->pc_release_addr = release_addr; dpcpu[cpuid - 1] = (void *)(pcpup + 1); dpcpu_init(dpcpu[cpuid - 1], cpuid); @@ -752,6 +784,52 @@ cpu_mp_start(void) } } +void +cpu_mp_stop(void) +{ + + /* Short-circuit for single-CPU */ + if (CPU_COUNT(&all_cpus) == 1) + return; + + KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("Not on the first CPU!\n")); + + /* + * If we use spin-table, assume U-boot method for now (single address + * shared by all CPUs). + */ + if (!psci_present) { + int cpu; + vm_paddr_t release_addr; + void *release_vaddr; + vm_size_t size; + + /* Find the shared release address. */ + CPU_FOREACH(cpu) { + release_addr = pcpu_find(cpu)->pc_release_addr; + if (release_addr != 0) + break; + } + /* No release address? No way of notifying other CPUs. 
*/ + if (release_addr == 0) + return; + + size = (vm_offset_t)&mp_cpu_spinloop_end - + (vm_offset_t)&mp_cpu_spinloop; + + release_addr -= (vm_offset_t)&mp_cpu_spin_table_release_addr - + (vm_offset_t)mp_cpu_spinloop; + + release_vaddr = pmap_mapdev(release_addr, size); + bcopy(mp_cpu_spinloop, release_vaddr, size); + cpu_dcache_wbinv_range(release_vaddr, size); + pmap_unmapdev(release_vaddr, size); + invalidate_icache(); + } + ipi_all_but_self(IPI_OFF); + DELAY(1000000); +} + /* Introduce rest of cores to the world */ void cpu_mp_announce(void) diff --git a/sys/arm64/include/_armreg.h b/sys/arm64/include/_armreg.h new file mode 100644 index 000000000000..0f5134e5a978 --- /dev/null +++ b/sys/arm64/include/_armreg.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2013, 2014 Andrew Turner + * Copyright (c) 2015,2021 The FreeBSD Foundation + * + * Portions of this software were developed by Andrew Turner + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(_MACHINE_ARMREG_H_) && \ + !defined(_MACHINE_CPU_H_) && \ + !defined(_MACHINE_HYPERVISOR_H_) +#error Do not include this file directly +#endif + +#ifndef _MACHINE__ARMREG_H_ +#define _MACHINE__ARMREG_H_ + +#define __MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) \ + S##op0##_##op1##_C##crn##_C##crm##_##op2 +#define _MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) \ + __MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) +#define MRS_REG_ALT_NAME(reg) \ + _MRS_REG_ALT_NAME(reg##_op0, reg##_op1, reg##_CRn, reg##_CRm, reg##_op2) + + +#define READ_SPECIALREG(reg) \ +({ uint64_t _val; \ + __asm __volatile("mrs %0, " __STRING(reg) : "=&r" (_val)); \ + _val; \ +}) +#define WRITE_SPECIALREG(reg, _val) \ + __asm __volatile("msr " __STRING(reg) ", %0" : : "r"((uint64_t)_val)) + +#define UL(x) UINT64_C(x) + +#endif /* !_MACHINE__ARMREG_H_ */ diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h index aca3d4c07450..aa9b672ad85a 100644 --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -34,25 +34,9 @@ #ifndef _MACHINE_ARMREG_H_ #define _MACHINE_ARMREG_H_ -#define INSN_SIZE 4 - -#define __MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) \ - S##op0##_##op1##_C##crn##_C##crm##_##op2 -#define _MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) \ - __MRS_REG_ALT_NAME(op0, op1, crn, crm, op2) -#define MRS_REG_ALT_NAME(reg) \ - _MRS_REG_ALT_NAME(reg##_op0, reg##_op1, reg##_CRn, reg##_CRm, reg##_op2) - +#include <machine/_armreg.h> -#define 
READ_SPECIALREG(reg) \ -({ uint64_t _val; \ - __asm __volatile("mrs %0, " __STRING(reg) : "=&r" (_val)); \ - _val; \ -}) -#define WRITE_SPECIALREG(reg, _val) \ - __asm __volatile("msr " __STRING(reg) ", %0" : : "r"((uint64_t)_val)) - -#define UL(x) UINT64_C(x) +#define INSN_SIZE 4 /* AFSR0_EL1 - Auxiliary Fault Status Register 0 */ #define AFSR0_EL1_REG MRS_REG_ALT_NAME(AFSR0_EL1) diff --git a/sys/arm64/include/cpu.h b/sys/arm64/include/cpu.h index 124da8c215ed..b15210633d37 100644 --- a/sys/arm64/include/cpu.h +++ b/sys/arm64/include/cpu.h @@ -43,10 +43,10 @@ #define _MACHINE_CPU_H_ #if !defined(__ASSEMBLER__) +#include <machine/_armreg.h> #include <machine/atomic.h> #include <machine/frame.h> #endif -#include <machine/armreg.h> #define TRAPF_PC(tfp) ((tfp)->tf_elr) #define TRAPF_USERMODE(tfp) (((tfp)->tf_spsr & PSR_M_MASK) == PSR_M_EL0t) diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h index e6e1f682794e..e9eee643216b 100644 --- a/sys/arm64/include/cpufunc.h +++ b/sys/arm64/include/cpufunc.h @@ -96,6 +96,13 @@ serror_enable(void) __asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")"); } +static __inline void +serror_disable(void) +{ + + __asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")"); +} + static __inline register_t get_midr(void) { diff --git a/sys/arm64/include/db_machdep.h b/sys/arm64/include/db_machdep.h index 5dc496ca851d..3ef95f7802ea 100644 --- a/sys/arm64/include/db_machdep.h +++ b/sys/arm64/include/db_machdep.h @@ -31,7 +31,6 @@ #ifndef _MACHINE_DB_MACHDEP_H_ #define _MACHINE_DB_MACHDEP_H_ -#include <machine/armreg.h> #include <machine/frame.h> #include <machine/trap.h> diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 8feabd2b981b..7d405e63cd8d 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -30,6 +30,8 @@ #ifndef _MACHINE_HYPERVISOR_H_ #define _MACHINE_HYPERVISOR_H_ +#include <machine/_armreg.h> + /* * These registers are only useful when in 
hypervisor context, * e.g. specific to EL2, or controlling the hypervisor. diff --git a/sys/arm64/include/kexec.h b/sys/arm64/include/kexec.h new file mode 100644 index 000000000000..0a8c7a053331 --- /dev/null +++ b/sys/arm64/include/kexec.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ARM64_KEXEC_H_ +#define _ARM64_KEXEC_H_ + +#define KEXEC_MD_PAGES(x) 0 + +#endif /* _ARM64_KEXEC_H_ */ diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h index 09bd8fa8a966..73399d2c3f8c 100644 --- a/sys/arm64/include/pcpu.h +++ b/sys/arm64/include/pcpu.h @@ -50,7 +50,8 @@ struct debug_monitor_state; struct pmap *pc_curvmpmap; \ uint64_t pc_mpidr; \ u_int pc_bcast_tlbi_workaround; \ - char __pad[197] + uint64_t pc_release_addr; \ + char __pad[189] #ifdef _KERNEL diff --git a/sys/arm64/include/smp.h b/sys/arm64/include/smp.h index 500cd1ef4f02..4a5bfda3ac1c 100644 --- a/sys/arm64/include/smp.h +++ b/sys/arm64/include/smp.h @@ -40,6 +40,7 @@ enum { IPI_STOP, IPI_STOP_HARD, IPI_HARDCLOCK, + IPI_OFF, INTR_IPI_COUNT, }; diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c index 67afb3374815..023406c64182 100644 --- a/sys/arm64/vmm/io/vgic_v3.c +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -47,7 +47,6 @@ #include <dev/ofw/openfirm.h> -#include <machine/armreg.h> #include <machine/atomic.h> #include <machine/bus.h> #include <machine/cpufunc.h> diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c index da0f0d96c431..7c7fbb49e691 100644 --- a/sys/arm64/vmm/io/vtimer.c +++ b/sys/arm64/vmm/io/vtimer.c @@ -44,7 +44,6 @@ #include <machine/bus.h> #include <machine/machdep.h> #include <machine/vmm.h> -#include <machine/armreg.h> #include <arm64/vmm/arm64.h> diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index 14ea26c3668c..e7b2b5d8c360 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -51,7 +51,6 @@ #include <vm/vm_extern.h> #include <vm/vm_param.h> -#include <machine/armreg.h> #include <machine/cpu.h> #include <machine/fpu.h> #include <machine/machdep.h> diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index 618f4afaf8ee..006239431f29 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -47,7 +47,6 @@ #include <vm/vm_page.h> #include <vm/vm_param.h> -#include 
<machine/armreg.h> #include <machine/vm.h> #include <machine/cpufunc.h> #include <machine/cpu.h> diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index b8c6d2ab7a9a..0ad7930e9a87 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -32,7 +32,6 @@ #include <sys/types.h> #include <sys/proc.h> -#include <machine/armreg.h> #include "arm64.h" #include "hyp.h" diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index 1240c3ed16ec..0e4910ea87b4 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -31,7 +31,6 @@ #include <sys/kernel.h> #include <sys/lock.h> -#include <machine/armreg.h> #include <machine/cpu.h> #include <machine/hypervisor.h> diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index 54063150eef9..f8ef7e4a20d3 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -517,4 +517,4 @@ #define FREEBSD32_SYS_setgroups 596 #define FREEBSD32_SYS_jail_attach_jd 597 #define FREEBSD32_SYS_jail_remove_jd 598 -#define FREEBSD32_SYS_MAXSYSCALL 599 +#define FREEBSD32_SYS_MAXSYSCALL 600 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index f7cc4c284e4d..645cdccbc02d 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -604,4 +604,5 @@ const char *freebsd32_syscallnames[] = { "setgroups", /* 596 = setgroups */ "jail_attach_jd", /* 597 = jail_attach_jd */ "jail_remove_jd", /* 598 = jail_remove_jd */ + "#599", /* 599 = kexec_load */ }; diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 18f809ef04e3..240b54ae9011 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -666,4 +666,5 @@ struct sysent freebsd32_sysent[] = { { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, 
.sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ + { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 599 = freebsd32_kexec_load */ }; diff --git a/sys/compat/freebsd32/syscalls.conf b/sys/compat/freebsd32/syscalls.conf index 72006631c89e..9308d1529c63 100644 --- a/sys/compat/freebsd32/syscalls.conf +++ b/sys/compat/freebsd32/syscalls.conf @@ -48,10 +48,11 @@ obsol="getkerninfo" # Syscalls without implementations: # __mac_* - should be implemented # afs3_syscall - requires significant porting, probably doesn't make sense +# kexec_load - makes little sense on 64-bit hardware # kldsym - can't be implemented (kernel virtual addresses can't fit in 32-bits) # lgetfh - should be implemented # nlm_syscall - requires significant porting, probably doesn't make sense # nnpfs_syscall - requires significant porting, probably doesn't make sense # ntp_gettime - should be implemented # thr_create - was unimplemented and appears to be unnecessary -unimpl="afs3_syscall kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create" +unimpl="afs3_syscall kexec_load kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create" diff --git a/sys/conf/files b/sys/conf/files index c17451324324..0a24b5e1e39b 100644 --- a/sys/conf/files +++ 
b/sys/conf/files @@ -3842,6 +3842,7 @@ kern/kern_jaildesc.c standard kern/kern_jailmeta.c standard kern/kern_kcov.c optional kcov \ compile-with "${NOSAN_C} ${MSAN_CFLAGS}" +kern/kern_kexec.c standard kern/kern_khelp.c standard kern/kern_kthread.c standard kern/kern_ktr.c optional ktr @@ -4549,7 +4550,6 @@ netpfil/ipfw/dn_sched_rr.c optional inet dummynet netpfil/ipfw/dn_sched_wf2q.c optional inet dummynet netpfil/ipfw/ip_dummynet.c optional inet dummynet netpfil/ipfw/ip_dn_io.c optional inet dummynet -netpfil/ipfw/ip_dn_glue.c optional inet dummynet netpfil/ipfw/ip_fw2.c optional inet ipfirewall netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index a342242ac66e..e4f01813bc8f 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -77,6 +77,8 @@ amd64/amd64/fpu.c standard amd64/amd64/gdb_machdep.c optional gdb amd64/amd64/initcpu.c standard amd64/amd64/io.c optional io +amd64/amd64/kexec_support.c standard +amd64/amd64/kexec_tramp.S standard amd64/amd64/locore.S standard no-obj amd64/amd64/xen-locore.S optional xenhvm \ compile-with "${NORMAL_S} -g0" \ diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 2f412fa3cb1b..882aca705336 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -55,6 +55,7 @@ arm64/arm64/gic_v3_acpi.c optional acpi arm64/arm64/gic_v3_fdt.c optional fdt arm64/arm64/hyp_stub.S standard arm64/arm64/identcpu.c standard +arm64/arm64/kexec_support.c standard arm64/arm64/locore.S standard no-obj arm64/arm64/machdep.c standard arm64/arm64/machdep_boot.c standard diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c index 2e30dd7dc3de..e7459a5553e4 100644 --- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c +++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c @@ -392,9 +392,9 @@ make_service_callback(VCHIQ_SERVICE_T *service, VCHIQ_REASON_T 
reason, VCHIQ_HEADER_T *header, void *bulk_userdata) { VCHIQ_STATUS_T status; - vchiq_log_trace(vchiq_core_log_level, "%d: callback:%d (%s, %x, %x)", + vchiq_log_trace(vchiq_core_log_level, "%d: callback:%d (%s, %p, %p)", service->state->id, service->localport, reason_names[reason], - (unsigned int)header, (unsigned int)bulk_userdata); + header, bulk_userdata); status = service->base.callback(reason, header, service->handle, bulk_userdata); if (status == VCHIQ_ERROR) { @@ -640,8 +640,8 @@ process_free_queue(VCHIQ_STATE_T *state) rmb(); - vchiq_log_trace(vchiq_core_log_level, "%d: pfq %d=%x %x %x", - state->id, slot_index, (unsigned int)data, + vchiq_log_trace(vchiq_core_log_level, "%d: pfq %d=%p %x %x", + state->id, slot_index, data, local->slot_queue_recycle, slot_queue_available); /* Initialise the bitmask for services which have used this @@ -675,13 +675,13 @@ process_free_queue(VCHIQ_STATE_T *state) vchiq_log_error(vchiq_core_log_level, "service %d " "message_use_count=%d " - "(header %x, msgid %x, " + "(header %p, msgid %x, " "header->msgid %x, " "header->size %x)", port, service_quota-> message_use_count, - (unsigned int)header, msgid, + header, msgid, header->msgid, header->size); WARN(1, "invalid message use count\n"); @@ -704,24 +704,24 @@ process_free_queue(VCHIQ_STATE_T *state) up(&service_quota->quota_event); vchiq_log_trace( vchiq_core_log_level, - "%d: pfq:%d %x@%x - " + "%d: pfq:%d %x@%p - " "slot_use->%d", state->id, port, header->size, - (unsigned int)header, + header, count - 1); } else { vchiq_log_error( vchiq_core_log_level, "service %d " "slot_use_count" - "=%d (header %x" + "=%d (header %p" ", msgid %x, " "header->msgid" " %x, header->" "size %x)", port, count, - (unsigned int)header, + header, msgid, header->msgid, header->size); @@ -735,9 +735,9 @@ process_free_queue(VCHIQ_STATE_T *state) pos += calc_stride(header->size); if (pos > VCHIQ_SLOT_SIZE) { vchiq_log_error(vchiq_core_log_level, - "pfq - pos %x: header %x, msgid %x, " + "pfq - pos 
%x: header %p, msgid %x, " "header->msgid %x, header->size %x", - pos, (unsigned int)header, msgid, + pos, header, msgid, header->msgid, header->size); WARN(1, "invalid slot position\n"); } @@ -885,17 +885,16 @@ queue_message(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, int slot_use_count; vchiq_log_info(vchiq_core_log_level, - "%d: qm %s@%x,%x (%d->%d)", + "%d: qm %s@%p,%x (%d->%d)", state->id, msg_type_str(VCHIQ_MSG_TYPE(msgid)), - (unsigned int)header, size, + header, size, VCHIQ_MSG_SRCPORT(msgid), VCHIQ_MSG_DSTPORT(msgid)); BUG_ON(!service); BUG_ON((flags & (QMFLAGS_NO_MUTEX_LOCK | QMFLAGS_NO_MUTEX_UNLOCK)) != 0); - for (i = 0, pos = 0; i < (unsigned int)count; pos += elements[i++].size) if (elements[i].size) { @@ -951,9 +950,9 @@ queue_message(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, VCHIQ_SERVICE_STATS_ADD(service, ctrl_tx_bytes, size); } else { vchiq_log_info(vchiq_core_log_level, - "%d: qm %s@%x,%x (%d->%d)", state->id, + "%d: qm %s@%p,%x (%d->%d)", state->id, msg_type_str(VCHIQ_MSG_TYPE(msgid)), - (unsigned int)header, size, + header, size, VCHIQ_MSG_SRCPORT(msgid), VCHIQ_MSG_DSTPORT(msgid)); if (size != 0) { @@ -1036,9 +1035,9 @@ queue_message_sync(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, int i, pos; vchiq_log_info(vchiq_sync_log_level, - "%d: qms %s@%x,%x (%d->%d)", state->id, + "%d: qms %s@%p,%x (%d->%d)", state->id, msg_type_str(VCHIQ_MSG_TYPE(msgid)), - (unsigned int)header, size, + header, size, VCHIQ_MSG_SRCPORT(msgid), VCHIQ_MSG_DSTPORT(msgid)); @@ -1065,9 +1064,9 @@ queue_message_sync(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, VCHIQ_SERVICE_STATS_ADD(service, ctrl_tx_bytes, size); } else { vchiq_log_info(vchiq_sync_log_level, - "%d: qms %s@%x,%x (%d->%d)", state->id, + "%d: qms %s@%p,%x (%d->%d)", state->id, msg_type_str(VCHIQ_MSG_TYPE(msgid)), - (unsigned int)header, size, + header, size, VCHIQ_MSG_SRCPORT(msgid), VCHIQ_MSG_DSTPORT(msgid)); if (size != 0) { @@ -1368,26 +1367,26 @@ resolve_bulks(VCHIQ_SERVICE_T *service, 
VCHIQ_BULK_QUEUE_T *queue) "Send Bulk to" : "Recv Bulk from"; if (bulk->actual != VCHIQ_BULK_ACTUAL_ABORTED) vchiq_log_info(SRVTRACE_LEVEL(service), - "%s %c%c%c%c d:%d len:%d %x<->%x", + "%s %c%c%c%c d:%d len:%d %p<->%p", header, VCHIQ_FOURCC_AS_4CHARS( service->base.fourcc), service->remoteport, bulk->size, - (unsigned int)bulk->data, - (unsigned int)bulk->remote_data); + bulk->data, + bulk->remote_data); else vchiq_log_info(SRVTRACE_LEVEL(service), "%s %c%c%c%c d:%d ABORTED - tx len:%d," - " rx len:%d %x<->%x", + " rx len:%d %p<->%p", header, VCHIQ_FOURCC_AS_4CHARS( service->base.fourcc), service->remoteport, bulk->size, bulk->remote_size, - (unsigned int)bulk->data, - (unsigned int)bulk->remote_data); + bulk->data, + bulk->remote_data); } vchiq_complete_bulk(bulk); @@ -1522,8 +1521,8 @@ parse_open(VCHIQ_STATE_T *state, VCHIQ_HEADER_T *header) fourcc = payload->fourcc; vchiq_log_info(vchiq_core_log_level, - "%d: prs OPEN@%x (%d->'%c%c%c%c')", - state->id, (unsigned int)header, + "%d: prs OPEN@%p (%d->'%c%c%c%c')", + state->id, header, localport, VCHIQ_FOURCC_AS_4CHARS(fourcc)); @@ -1661,7 +1660,7 @@ parse_rx_slots(VCHIQ_STATE_T *state) header = (VCHIQ_HEADER_T *)(state->rx_data + (state->rx_pos & VCHIQ_SLOT_MASK)); - DEBUG_VALUE(PARSE_HEADER, (int)header); + DEBUG_VALUE(PARSE_HEADER, (size_t)header); msgid = header->msgid; DEBUG_VALUE(PARSE_MSGID, msgid); size = header->size; @@ -1695,20 +1694,20 @@ parse_rx_slots(VCHIQ_STATE_T *state) remoteport); if (service) vchiq_log_warning(vchiq_core_log_level, - "%d: prs %s@%x (%d->%d) - " + "%d: prs %s@%p (%d->%d) - " "found connected service %d", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, service->localport); } if (!service) { vchiq_log_error(vchiq_core_log_level, - "%d: prs %s@%x (%d->%d) - " + "%d: prs %s@%p (%d->%d) - " /* XXX */ "invalid/closed service %d", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, localport); goto 
skip_message; } @@ -1734,12 +1733,12 @@ parse_rx_slots(VCHIQ_STATE_T *state) min(16, size)); } - if (((unsigned int)header & VCHIQ_SLOT_MASK) + calc_stride(size) + if (((size_t)header & VCHIQ_SLOT_MASK) + calc_stride(size) > VCHIQ_SLOT_SIZE) { vchiq_log_error(vchiq_core_log_level, - "header %x (msgid %x) - size %x too big for " + "header %p (msgid %x) - size %x too big for " "slot", - (unsigned int)header, (unsigned int)msgid, + header, (unsigned int)msgid, (unsigned int)size); WARN(1, "oversized for slot\n"); } @@ -1758,8 +1757,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) service->peer_version = payload->version; } vchiq_log_info(vchiq_core_log_level, - "%d: prs OPENACK@%x,%x (%d->%d) v:%d", - state->id, (unsigned int)header, size, + "%d: prs OPENACK@%p,%x (%d->%d) v:%d", + state->id, header, size, remoteport, localport, service->peer_version); if (service->srvstate == VCHIQ_SRVSTATE_OPENING) { @@ -1776,8 +1775,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) WARN_ON(size != 0); /* There should be no data */ vchiq_log_info(vchiq_core_log_level, - "%d: prs CLOSE@%x (%d->%d)", - state->id, (unsigned int)header, + "%d: prs CLOSE@%p (%d->%d)", + state->id, header, remoteport, localport); mark_service_closing_internal(service, 1); @@ -1794,8 +1793,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) break; case VCHIQ_MSG_DATA: vchiq_log_info(vchiq_core_log_level, - "%d: prs DATA@%x,%x (%d->%d)", - state->id, (unsigned int)header, size, + "%d: prs DATA@%p,%x (%d->%d)", + state->id, header, size, remoteport, localport); if ((service->remoteport == remoteport) @@ -1819,8 +1818,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) break; case VCHIQ_MSG_CONNECT: vchiq_log_info(vchiq_core_log_level, - "%d: prs CONNECT@%x", - state->id, (unsigned int)header); + "%d: prs CONNECT@%p", + state->id, header); state->version_common = ((VCHIQ_SLOT_ZERO_T *) state->slot_data)->version; up(&state->connect); @@ -1854,12 +1853,12 @@ parse_rx_slots(VCHIQ_STATE_T *state) wmb(); vchiq_log_info(vchiq_core_log_level, - "%d: 
prs %s@%x (%d->%d) %x@%x", + "%d: prs %s@%p (%d->%d) %x@%p", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, bulk->remote_size, - (unsigned int)bulk->remote_data); + bulk->remote_data); queue->remote_insert++; @@ -1912,10 +1911,10 @@ parse_rx_slots(VCHIQ_STATE_T *state) if ((int)(queue->remote_insert - queue->local_insert) >= 0) { vchiq_log_error(vchiq_core_log_level, - "%d: prs %s@%x (%d->%d) " + "%d: prs %s@%p (%d->%d) " "unexpected (ri=%d,li=%d)", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, queue->remote_insert, queue->local_insert); @@ -1932,11 +1931,11 @@ parse_rx_slots(VCHIQ_STATE_T *state) queue->remote_insert++; vchiq_log_info(vchiq_core_log_level, - "%d: prs %s@%x (%d->%d) %x@%x", + "%d: prs %s@%p (%d->%d) %x@%p", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, - bulk->actual, (unsigned int)bulk->data); + bulk->actual, bulk->data); vchiq_log_trace(vchiq_core_log_level, "%d: prs:%d %cx li=%x ri=%x p=%x", @@ -1958,14 +1957,14 @@ parse_rx_slots(VCHIQ_STATE_T *state) break; case VCHIQ_MSG_PADDING: vchiq_log_trace(vchiq_core_log_level, - "%d: prs PADDING@%x,%x", - state->id, (unsigned int)header, size); + "%d: prs PADDING@%p,%x", + state->id, header, size); break; case VCHIQ_MSG_PAUSE: /* If initiated, signal the application thread */ vchiq_log_trace(vchiq_core_log_level, - "%d: prs PAUSE@%x,%x", - state->id, (unsigned int)header, size); + "%d: prs PAUSE@%p,%x", + state->id, header, size); if (state->conn_state == VCHIQ_CONNSTATE_PAUSED) { vchiq_log_error(vchiq_core_log_level, "%d: PAUSE received in state PAUSED", @@ -1988,8 +1987,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) break; case VCHIQ_MSG_RESUME: vchiq_log_trace(vchiq_core_log_level, - "%d: prs RESUME@%x,%x", - state->id, (unsigned int)header, size); + "%d: prs RESUME@%p,%x", + state->id, header, size); /* Release the slot mutex */ lmutex_unlock(&state->slot_mutex); if 
(state->is_master) @@ -2010,8 +2009,8 @@ parse_rx_slots(VCHIQ_STATE_T *state) default: vchiq_log_error(vchiq_core_log_level, - "%d: prs invalid msgid %x@%x,%x", - state->id, msgid, (unsigned int)header, size); + "%d: prs invalid msgid %x@%p,%x", + state->id, msgid, header, size); WARN(1, "invalid message\n"); break; } @@ -2179,10 +2178,10 @@ sync_func(void *v) if (!service) { vchiq_log_error(vchiq_sync_log_level, - "%d: sf %s@%x (%d->%d) - " + "%d: sf %s@%p (%d->%d) - " "invalid/closed service %d", state->id, msg_type_str(type), - (unsigned int)header, + header, remoteport, localport, localport); release_message_sync(state, header); continue; @@ -2213,8 +2212,8 @@ sync_func(void *v) service->peer_version = payload->version; } vchiq_log_info(vchiq_sync_log_level, - "%d: sf OPENACK@%x,%x (%d->%d) v:%d", - state->id, (unsigned int)header, size, + "%d: sf OPENACK@%p,%x (%d->%d) v:%d", + state->id, header, size, remoteport, localport, service->peer_version); if (service->srvstate == VCHIQ_SRVSTATE_OPENING) { service->remoteport = remoteport; @@ -2228,8 +2227,8 @@ sync_func(void *v) case VCHIQ_MSG_DATA: vchiq_log_trace(vchiq_sync_log_level, - "%d: sf DATA@%x,%x (%d->%d)", - state->id, (unsigned int)header, size, + "%d: sf DATA@%p,%x (%d->%d)", + state->id, header, size, remoteport, localport); if ((service->remoteport == remoteport) && @@ -2248,8 +2247,8 @@ sync_func(void *v) default: vchiq_log_error(vchiq_sync_log_level, - "%d: sf unexpected msgid %x@%x,%x", - state->id, msgid, (unsigned int)header, size); + "%d: sf unexpected msgid %x@%p,%x", + state->id, msgid, header, size); release_message_sync(state, header); break; } @@ -2334,8 +2333,8 @@ vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, if (slot_zero->magic != VCHIQ_MAGIC) { vchiq_loud_error_header(); vchiq_loud_error("Invalid VCHIQ magic value found."); - vchiq_loud_error("slot_zero=%x: magic=%x (expected %x)", - (unsigned int)slot_zero, slot_zero->magic, VCHIQ_MAGIC); + 
vchiq_loud_error("slot_zero=%p: magic=%x (expected %x)", + slot_zero, slot_zero->magic, VCHIQ_MAGIC); vchiq_loud_error_footer(); return VCHIQ_ERROR; } @@ -2348,9 +2347,9 @@ vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, if (slot_zero->version < VCHIQ_VERSION_MIN) { vchiq_loud_error_header(); vchiq_loud_error("Incompatible VCHIQ versions found."); - vchiq_loud_error("slot_zero=%x: VideoCore version=%d " + vchiq_loud_error("slot_zero=%p: VideoCore version=%d " "(minimum %d)", - (unsigned int)slot_zero, slot_zero->version, + slot_zero, slot_zero->version, VCHIQ_VERSION_MIN); vchiq_loud_error("Restart with a newer VideoCore image."); vchiq_loud_error_footer(); @@ -2360,9 +2359,9 @@ vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, if (VCHIQ_VERSION < slot_zero->version_min) { vchiq_loud_error_header(); vchiq_loud_error("Incompatible VCHIQ versions found."); - vchiq_loud_error("slot_zero=%x: version=%d (VideoCore " + vchiq_loud_error("slot_zero=%p: version=%d (VideoCore " "minimum %d)", - (unsigned int)slot_zero, VCHIQ_VERSION, + slot_zero, VCHIQ_VERSION, slot_zero->version_min); vchiq_loud_error("Restart with a newer kernel."); vchiq_loud_error_footer(); @@ -2375,25 +2374,25 @@ vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, (slot_zero->max_slots_per_side != VCHIQ_MAX_SLOTS_PER_SIDE)) { vchiq_loud_error_header(); if (slot_zero->slot_zero_size != sizeof(VCHIQ_SLOT_ZERO_T)) - vchiq_loud_error("slot_zero=%x: slot_zero_size=%x " + vchiq_loud_error("slot_zero=%p: slot_zero_size=%x " "(expected %zx)", - (unsigned int)slot_zero, + slot_zero, slot_zero->slot_zero_size, sizeof(VCHIQ_SLOT_ZERO_T)); if (slot_zero->slot_size != VCHIQ_SLOT_SIZE) - vchiq_loud_error("slot_zero=%x: slot_size=%d " + vchiq_loud_error("slot_zero=%p: slot_size=%d " "(expected %d", - (unsigned int)slot_zero, slot_zero->slot_size, + slot_zero, slot_zero->slot_size, VCHIQ_SLOT_SIZE); if (slot_zero->max_slots != VCHIQ_MAX_SLOTS) - 
vchiq_loud_error("slot_zero=%x: max_slots=%d " + vchiq_loud_error("slot_zero=%p: max_slots=%d " "(expected %d)", - (unsigned int)slot_zero, slot_zero->max_slots, + slot_zero, slot_zero->max_slots, VCHIQ_MAX_SLOTS); if (slot_zero->max_slots_per_side != VCHIQ_MAX_SLOTS_PER_SIDE) - vchiq_loud_error("slot_zero=%x: max_slots_per_side=%d " + vchiq_loud_error("slot_zero=%p: max_slots_per_side=%d " "(expected %d)", - (unsigned int)slot_zero, + slot_zero, slot_zero->max_slots_per_side, VCHIQ_MAX_SLOTS_PER_SIDE); vchiq_loud_error_footer(); @@ -2775,18 +2774,18 @@ release_service_messages(VCHIQ_SERVICE_T *service) if ((port == service->localport) && (msgid & VCHIQ_MSGID_CLAIMED)) { vchiq_log_info(vchiq_core_log_level, - " fsi - hdr %x", - (unsigned int)header); + " fsi - hdr %p", + header); release_slot(state, slot_info, header, NULL); } pos += calc_stride(header->size); if (pos > VCHIQ_SLOT_SIZE) { vchiq_log_error(vchiq_core_log_level, - "fsi - pos %x: header %x, " + "fsi - pos %x: header %p, " "msgid %x, header->msgid %x, " "header->size %x", - pos, (unsigned int)header, + pos, header, msgid, header->msgid, header->size); WARN(1, "invalid slot position\n"); @@ -3360,10 +3359,10 @@ vchiq_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, wmb(); vchiq_log_info(vchiq_core_log_level, - "%d: bt (%d->%d) %cx %x@%x %x", + "%d: bt (%d->%d) %cx %x@%p %p", state->id, service->localport, service->remoteport, dir_char, - size, (unsigned int)bulk->data, (unsigned int)userdata); + size, bulk->data, userdata); /* The slot mutex must be held when the service is being closed, so claim it here to ensure that isn't happening */ @@ -3710,12 +3709,12 @@ vchiq_dump_state(void *dump_context, VCHIQ_STATE_T *state) vchiq_dump(dump_context, buf, len + 1); len = snprintf(buf, sizeof(buf), - " tx_pos=%x(@%x), rx_pos=%x(@%x)", + " tx_pos=%x(@%p), rx_pos=%x(@%p)", state->local->tx_pos, - (uint32_t)state->tx_data + + state->tx_data + (state->local_tx_pos & VCHIQ_SLOT_MASK), state->rx_pos, - 
(uint32_t)state->rx_data + + state->rx_data + (state->rx_pos & VCHIQ_SLOT_MASK)); vchiq_dump(dump_context, buf, len + 1); @@ -3817,21 +3816,21 @@ vchiq_dump_service_state(void *dump_context, VCHIQ_SERVICE_T *service) vchiq_dump(dump_context, buf, len + 1); len = snprintf(buf, sizeof(buf), - " Ctrl: tx_count=%d, tx_bytes=%llu, " - "rx_count=%d, rx_bytes=%llu", + " Ctrl: tx_count=%d, tx_bytes=%ju, " + "rx_count=%d, rx_bytes=%ju", service->stats.ctrl_tx_count, - service->stats.ctrl_tx_bytes, + (uintmax_t) service->stats.ctrl_tx_bytes, service->stats.ctrl_rx_count, - service->stats.ctrl_rx_bytes); + (uintmax_t) service->stats.ctrl_rx_bytes); vchiq_dump(dump_context, buf, len + 1); len = snprintf(buf, sizeof(buf), - " Bulk: tx_count=%d, tx_bytes=%llu, " - "rx_count=%d, rx_bytes=%llu", + " Bulk: tx_count=%d, tx_bytes=%ju, " + "rx_count=%d, rx_bytes=%ju", service->stats.bulk_tx_count, - service->stats.bulk_tx_bytes, + (uintmax_t) service->stats.bulk_tx_bytes, service->stats.bulk_rx_count, - service->stats.bulk_rx_bytes); + (uintmax_t) service->stats.bulk_rx_bytes); vchiq_dump(dump_context, buf, len + 1); len = snprintf(buf, sizeof(buf), diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c index 1f849a09d854..4eddcf3b43b2 100644 --- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c +++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c @@ -151,9 +151,9 @@ VCHIQ_STATUS_T vchiq_shutdown(VCHIQ_INSTANCE_T instance) list); list_del(pos); vchiq_log_info(vchiq_arm_log_level, - "bulk_waiter - cleaned up %x " + "bulk_waiter - cleaned up %p " "for pid %d", - (unsigned int)waiter, waiter->pid); + waiter, waiter->pid); _sema_destroy(&waiter->bulk_waiter.event); kfree(waiter); @@ -454,8 +454,8 @@ vchiq_blocking_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, void *data, list_add(&waiter->list, &instance->bulk_waiter_list); lmutex_unlock(&instance->bulk_waiter_list_mutex); 
vchiq_log_info(vchiq_arm_log_level, - "saved bulk_waiter %x for pid %d", - (unsigned int)waiter, current->p_pid); + "saved bulk_waiter %p for pid %d", + waiter, current->p_pid); } return status; diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c index 3f0a7b40245d..e3ff4f6937d2 100644 --- a/sys/dev/acpica/acpi.c +++ b/sys/dev/acpica/acpi.c @@ -4430,8 +4430,8 @@ acpi_stype_sysctl(SYSCTL_HANDLER_ARGS) return (EINVAL); printf("warning: this sysctl expects a sleep type, but an ACPI S-state has " "been passed to it. This functionality is deprecated; see acpi(4).\n"); - MPASS(sstate < ACPI_S_STATE_COUNT); - if (acpi_supported_sstates[sstate] == false) + if (sstate < ACPI_S_STATE_COUNT && + !acpi_supported_sstates[sstate]) return (EOPNOTSUPP); new_stype = acpi_sstate_to_stype(sstate); } diff --git a/sys/dev/ice/ice_drv_info.h b/sys/dev/ice/ice_drv_info.h index 46965f4124bc..abb11bdb5fd9 100644 --- a/sys/dev/ice/ice_drv_info.h +++ b/sys/dev/ice/ice_drv_info.h @@ -238,6 +238,9 @@ static const pci_vendor_info_t ice_vendor_info_array[] = { ICE_INTEL_VENDOR_ID, 0x0001, 0, "Intel(R) Ethernet Network Adapter E835-XXV-2 for OCP 3.0"), PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0002, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-4"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, ICE_INTEL_VENDOR_ID, 0x0003, 0, "Intel(R) Ethernet Network Adapter E835-XXV-2"), PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, diff --git a/sys/dev/ofw/ofw_cpu.c b/sys/dev/ofw/ofw_cpu.c index 888af0440746..4b12f2e994e3 100644 --- a/sys/dev/ofw/ofw_cpu.c +++ b/sys/dev/ofw/ofw_cpu.c @@ -85,7 +85,8 @@ static driver_t ofw_cpulist_driver = { sizeof(struct ofw_cpulist_softc) }; -DRIVER_MODULE(ofw_cpulist, ofwbus, ofw_cpulist_driver, 0, 0); +EARLY_DRIVER_MODULE(ofw_cpulist, ofwbus, ofw_cpulist_driver, 0, 0, + BUS_PASS_CPU + BUS_PASS_ORDER_MIDDLE); static int ofw_cpulist_probe(device_t dev) @@ -180,7 +181,8 @@ static driver_t ofw_cpu_driver = { 
sizeof(struct ofw_cpu_softc) }; -DRIVER_MODULE(ofw_cpu, cpulist, ofw_cpu_driver, 0, 0); +EARLY_DRIVER_MODULE(ofw_cpu, cpulist, ofw_cpu_driver, 0, 0, + BUS_PASS_CPU + BUS_PASS_ORDER_MIDDLE); static bool ofw_cpu_is_runnable(phandle_t node) @@ -330,6 +332,7 @@ ofw_cpu_attach(device_t dev) device_printf(dev, "Nominal frequency %dMhz\n", sc->sc_nominal_mhz); + OF_device_register_xref(OF_xref_from_node(node), dev); bus_identify_children(dev); bus_attach_children(dev); return (0); diff --git a/sys/dev/psci/psci.c b/sys/dev/psci/psci.c index 497b23d2d4c3..2b250401ae83 100644 --- a/sys/dev/psci/psci.c +++ b/sys/dev/psci/psci.c @@ -474,6 +474,19 @@ psci_cpu_on(unsigned long cpu, unsigned long entry, unsigned long context_id) return (psci_call(fnid, cpu, entry, context_id)); } +int +psci_cpu_off(void) +{ + uint32_t fnid; + + fnid = PSCI_FNID_CPU_OFF; + if (psci_softc != NULL) + fnid = psci_softc->psci_fnids[PSCI_FN_CPU_OFF]; + + /* Returns PSCI_RETVAL_DENIED on error. */ + return (psci_call(fnid, 0, 0, 0)); +} + static void psci_shutdown(void *xsc, int howto) { diff --git a/sys/dev/psci/psci.h b/sys/dev/psci/psci.h index 451d40c0178d..6704eaf26c71 100644 --- a/sys/dev/psci/psci.h +++ b/sys/dev/psci/psci.h @@ -39,6 +39,7 @@ typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t, extern bool psci_present; int psci_cpu_on(unsigned long, unsigned long, unsigned long); +int psci_cpu_off(void); /* Operates on caller. 
*/ void psci_reset(void); int32_t psci_features(uint32_t); int psci_get_version(void); diff --git a/sys/dev/sound/dummy.c b/sys/dev/sound/dummy.c index 1f2d69708eec..39214a141bf9 100644 --- a/sys/dev/sound/dummy.c +++ b/sys/dev/sound/dummy.c @@ -104,9 +104,10 @@ dummy_chan_io(void *arg) ch = &sc->chans[i]; if (!ch->run) continue; - if (ch->dir == PCMDIR_PLAY) + if (ch->dir == PCMDIR_PLAY) { ch->ptr += sndbuf_getblksz(ch->buf); - else + ch->ptr %= sndbuf_getsize(ch->buf); + } else sndbuf_fillsilence(ch->buf); snd_mtxunlock(sc->lock); chn_intr(ch->chan); diff --git a/sys/dev/xilinx/xlnx_pcib.c b/sys/dev/xilinx/xlnx_pcib.c index d549ec445ea9..816b33ec1142 100644 --- a/sys/dev/xilinx/xlnx_pcib.c +++ b/sys/dev/xilinx/xlnx_pcib.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2020 Ruslan Bukin <br@bsdpad.com> + * Copyright (c) 2020-2025 Ruslan Bukin <br@bsdpad.com> * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory (Department of Computer Science and @@ -84,7 +84,7 @@ struct xlnx_pcib_softc { struct generic_pcie_fdt_softc fdt_sc; struct resource *res[4]; struct mtx mtx; - vm_offset_t msi_page; + void *msi_page; struct xlnx_pcib_irqsrc *isrcs; device_t dev; void *intr_cookie[3]; @@ -105,6 +105,12 @@ struct xlnx_pcib_irqsrc { u_int flags; }; +static struct ofw_compat_data compat_data[] = { + { "xlnx,xdma-host-3.00", 1 }, + { "xlnx,axi-pcie-host-1.00.a", 1 }, + { NULL, 0 }, +}; + static void xlnx_pcib_clear_err_interrupts(struct generic_pcie_core_softc *sc) { @@ -333,12 +339,12 @@ xlnx_pcib_fdt_probe(device_t dev) if (!ofw_bus_status_okay(dev)) return (ENXIO); - if (ofw_bus_is_compatible(dev, "xlnx,xdma-host-3.00")) { - device_set_desc(dev, "Xilinx XDMA PCIe Controller"); - return (BUS_PROBE_DEFAULT); - } + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + return (ENXIO); + + device_set_desc(dev, "Xilinx XDMA PCIe Controller"); - return (ENXIO); + return 
(BUS_PROBE_DEFAULT); } static int @@ -424,8 +430,8 @@ xlnx_pcib_req_valid(struct generic_pcie_core_softc *sc, bus_space_tag_t t; uint32_t val; - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); if ((bus < sc->bus_start) || (bus > sc->bus_end)) return (0); @@ -467,8 +473,8 @@ xlnx_pcib_read_config(device_t dev, u_int bus, u_int slot, return (~0U); offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg); - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); data = bus_space_read_4(t, h, offset & ~3); @@ -512,8 +518,8 @@ xlnx_pcib_write_config(device_t dev, u_int bus, u_int slot, offset = PCIE_ADDR_OFFSET(bus - sc->bus_start, slot, func, reg); - t = sc->bst; - h = sc->bsh; + t = rman_get_bustag(sc->res); + h = rman_get_bushandle(sc->res); /* * 32-bit access used due to a bug in the Xilinx bridge that diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index d3b83eb8b94b..983eb8b9226f 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -2212,7 +2212,7 @@ nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode, NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); - if (rlen == 0) { + if (rlen <= 0 || rlen > len) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { @@ -5599,7 +5599,7 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, } *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(20); /* Max operations */ - *tl++ = txdr_unsigned(64); /* Max slots */ + *tl++ = txdr_unsigned(NFSV4_SLOTS); /* Max slots */ *tl = 0; /* No rdma ird */ /* Fill in back channel attributes. 
*/ @@ -5668,6 +5668,11 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, sep->nfsess_maxcache = fxdr_unsigned(int, *tl++); tl++; sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++); + if (sep->nfsess_foreslots == 0) { + error = NFSERR_BADXDR; + goto nfsmout; + } else if (sep->nfsess_foreslots > NFSV4_SLOTS) + sep->nfsess_foreslots = NFSV4_SLOTS; NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots); irdcnt = fxdr_unsigned(int, *tl); if (irdcnt < 0 || irdcnt > 1) { @@ -5681,6 +5686,8 @@ nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl += 5; sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl); + if (sep->nfsess_backslots > NFSV4_CBSLOTS) + sep->nfsess_backslots = NFSV4_CBSLOTS; NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots); } error = nd->nd_repstat; @@ -5800,7 +5807,8 @@ nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype, NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); stripecnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt); - if (stripecnt < 1 || stripecnt > 4096) { + if (stripecnt >= MHLEN / NFSX_UNSIGNED || + stripecnt < 1) { printf("pNFS File layout devinfo stripecnt %d:" " out of range\n", stripecnt); error = NFSERR_BADXDR; @@ -7249,7 +7257,7 @@ nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen); - if (rlen == 0) { + if (rlen <= 0 || rlen > len) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { @@ -8246,7 +8254,7 @@ nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp, NFSPROC_T *p) { uint32_t *tl; - char *cp, *str, str0[NFSV4_SMALLSTR + 1]; + char *str, str0[NFSV4_SMALLSTR + 1]; uint32_t len = 0; int error = 0; @@ -8269,9 +8277,9 @@ nfsrv_parseug(struct nfsrv_descript *nd, int 
dogrp, uid_t *uidp, gid_t *gidp, str = malloc(len + 1, M_TEMP, M_WAITOK); else str = str0; - NFSM_DISSECT(cp, char *, NFSM_RNDUP(len)); - NFSBCOPY(cp, str, len); - str[len] = '\0'; + error = nfsrv_mtostr(nd, str, len); + if (error != 0) + goto nfsmout; NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str); if (dogrp != 0) error = nfsv4_strtogid(nd, str, len, gidp); diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index 6f3447f26620..67af0cf71175 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -5138,6 +5138,11 @@ nfsrvd_layoutcommit(struct nfsrv_descript *nd, __unused int isdgram, NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); layouttype = fxdr_unsigned(int, *tl++); maxcnt = fxdr_unsigned(int, *tl); + /* There is no limit in the RFC, so use 1000 as a sanity limit. */ + if (maxcnt < 0 || maxcnt > 1000) { + error = NFSERR_BADXDR; + goto nfsmout; + } if (maxcnt > 0) { layp = malloc(maxcnt + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, layp, maxcnt); diff --git a/sys/i386/include/kexec.h b/sys/i386/include/kexec.h new file mode 100644 index 000000000000..9fbdef38ad2e --- /dev/null +++ b/sys/i386/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _I386_KEXEC_H_ +#define _I386_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _I386_KEXEC_H_ */ diff --git a/sys/isa/isa_common.c b/sys/isa/isa_common.c index 41a63a3c676c..91a0ee1f2f3d 100644 --- a/sys/isa/isa_common.c +++ b/sys/isa/isa_common.c @@ -569,8 +569,8 @@ isa_probe_children(device_t dev) if (err == 0 && idev->id_vendorid == 0 && strcmp(kern_ident, "GENERIC") == 0 && device_is_attached(child)) - device_printf(child, - "non-PNP ISA device will be removed from GENERIC in FreeBSD 16.\n"); + gone_in_dev(child, 16, + "WARNING: non-PNP ISA device will be removed from GENERIC\n"); } /* diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index e42e7dcf8b44..cd305de1ed44 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -665,4 +665,5 @@ struct sysent sysent[] = { { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, 
.sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ + { .sy_narg = AS(kexec_load_args), .sy_call = (sy_call_t *)sys_kexec_load, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 599 = kexec_load */ }; diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c new file mode 100644 index 000000000000..2efea7dcf9a7 --- /dev/null +++ b/sys/kern/kern_kexec.c @@ -0,0 +1,350 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#ifdef INTRNG +#include <sys/intr.h> +#endif +#include <sys/kexec.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/priv.h> +#include <sys/reboot.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pagequeue.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> + +#include <machine/kexec.h> + +#ifndef KEXEC_MD_PAGES +/* + * Number of MD pages for extra bookkeeping. + * This is a macro because it can be a constant (some architectures make it 0). + * It accepts an argument, which is an array of + * kexec_segment[KEXEC_SEGMENT_MAX]. + */ +#define KEXEC_MD_PAGES(x) 0 +#endif + +/* + * Basic design: + * + * Given an array of "segment descriptors" stage an image to be loaded and + * jumped to at reboot, instead of rebooting via firmware. + * + * Constraints: + * - The segment descriptors' "mem" and "memsz" must each fit within a + * vm_phys_seg segment, which can be obtained via the `vm.phys_segs` sysctl. + * A single segment cannot span multiple vm_phys_seg segments, even if the + * vm_phys_seg segments are adjacent. 
+ * + * Technical details: + * + * Take advantage of the VM subsystem and create a vm_object to hold the staged + * image. When grabbing pages for the object, sort the pages so that if a page + * in the object is located in the physical range of any of the kexec segment + * targets then it gets placed at the pindex corresponding to that physical + * address. This avoids the chance of corruption by writing over the page in + * the final copy, or the need for a copy buffer page. + */ + +static struct kexec_image staged_image; +static vm_offset_t stage_addr; +static vm_object_t kexec_obj; + +static eventhandler_tag kexec_reboot_handler; +static struct mtx kexec_mutex; + +static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments"); + + +static void +kexec_reboot(void *junk __unused, int howto) +{ + if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL) + return; + +#ifdef SMP + cpu_mp_stop(); +#endif /* SMP */ + intr_disable(); + printf("Starting kexec reboot\n"); + + scheduler_stopped = true; + kexec_reboot_md(&staged_image); +} + +MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF); + +/* qsort(3) comparator: order segments by target address, comparing (not subtracting) so nothing is truncated to int. */ +static int +seg_cmp(const void *seg1, const void *seg2) +{ + const struct kexec_segment *s1 = seg1, *s2 = seg2; + uintptr_t m1, m2; + + m1 = (uintptr_t)s1->mem; + m2 = (uintptr_t)s2->mem; + return ((m1 > m2) - (m1 < m2)); +} + +static bool +segment_fits(struct kexec_segment *seg) +{ + vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem; + + for (int i = 0; i < vm_phys_nsegs; i++) { /* vm_phys_seg.end is the first address past the segment */ + if (v >= vm_phys_segs[i].start && v < vm_phys_segs[i].end && + seg->memsz <= vm_phys_segs[i].end - v) + return (true); + } + + return (false); +} + +static vm_paddr_t +pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind) +{ + for (int i = count; i > 0; --i) { + if (pind >= segs[i - 1].pindex) + return (ptoa(pind - segs[i-1].pindex) + segs[i - 1].target); + } + + panic("No segment for pindex %ju\n", (uintmax_t)pind); +} + +/* + * For now still tied to the system call, so assumes all memory is userspace.
+ */ +int +kern_kexec_load(struct thread *td, u_long entry, u_long nseg, + struct kexec_segment *seg, u_long flags) +{ + static int kexec_loading; + struct kexec_segment segtmp[KEXEC_SEGMENT_MAX]; + struct kexec_image *new_image_stage = 0; + vm_object_t new_segments = NULL; + uint8_t *buf; + int err = 0; + int i; + const size_t segsize = nseg * sizeof(struct kexec_segment); + vm_page_t *page_list = 0; + vm_size_t image_count, md_pages, page_count, tmpsize; + vm_offset_t segment_va = 0; + /* + * - Do any sanity checking + * - Load the new segments to temporary + * - Remove the old segments + * - Install the new segments + */ + + if (nseg > KEXEC_SEGMENT_MAX) + return (EINVAL); + + if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0) + return (EBUSY); + + /* Only do error checking if we're installing new segments. */ + if (nseg > 0) { + /* Create the new kexec object before destroying the old one. */ + bzero(&segtmp, sizeof(segtmp)); + err = copyin(seg, segtmp, segsize); + if (err != 0) + goto out; + qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp); + new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP, M_WAITOK | M_ZERO); + /* + * Sanity checking: + * - All segments must not overlap the kernel, so must be fully enclosed + * in a vm_phys_seg (each kexec segment must be in a single + * vm_phys_seg segment, cannot cross even adjacent segments). 
+ */ + image_count = 0; + for (i = 0; i < nseg; i++) { + if (!segment_fits(&segtmp[i]) || (segtmp[i].memsz & PAGE_MASK) != 0 || + segtmp[i].bufsz > segtmp[i].memsz) { + err = EINVAL; + goto out; + } + new_image_stage->segments[i].pindex = image_count; + new_image_stage->segments[i].target = (vm_offset_t)segtmp[i].mem; + new_image_stage->segments[i].size = segtmp[i].memsz; + image_count += atop(segtmp[i].memsz); /* exact: memsz was checked to be page-aligned */ + } + md_pages = KEXEC_MD_PAGES(segtmp); + page_count = image_count + md_pages; + new_segments = vm_object_allocate(OBJT_PHYS, page_count); + page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP, M_WAITOK); + + /* + * - Grab all pages for all segments (use pindex to slice it) + * - Walk the list (once) + * - At each pindex, check if the target PA that corresponds + * to that index is in the object. If so, swap the pages. + * - At the end of this the list will be "best" sorted. + */ + vm_page_grab_pages_unlocked(new_segments, 0, + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO, + page_list, page_count); + + /* Sort the pages to best match the PA */ + VM_OBJECT_WLOCK(new_segments); + for (i = 0; i < image_count; i++) { + vm_page_t curpg, otherpg, tmp; + vm_pindex_t otheridx; + + curpg = page_list[i]; + otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(new_image_stage->segments, + nseg, curpg->pindex)); + otheridx = otherpg->pindex; + + if (otherpg->object == new_segments) { + /* + * Swap 'curpg' and 'otherpg', since 'otherpg' + * is at the PA 'curpg' covers.
+ */ + vm_radix_remove(&new_segments->rtree, otheridx); + vm_radix_remove(&new_segments->rtree, i); + otherpg->pindex = i; + curpg->pindex = otheridx; + vm_radix_insert(&new_segments->rtree, curpg); + vm_radix_insert(&new_segments->rtree, otherpg); + tmp = curpg; + page_list[i] = otherpg; + page_list[otheridx] = tmp; + } + } + for (i = 0; i < nseg; i++) { + new_image_stage->segments[i].first_page = + vm_radix_lookup(&new_segments->rtree, + new_image_stage->segments[i].pindex); + } + if (md_pages > 0) + new_image_stage->first_md_page = + vm_radix_lookup(&new_segments->rtree, + page_count - md_pages); + else + new_image_stage->first_md_page = NULL; + VM_OBJECT_WUNLOCK(new_segments); + + /* Map the object to do the copies */ + err = vm_map_find(kernel_map, new_segments, 0, &segment_va, + ptoa(page_count), 0, VMFS_ANY_SPACE, + VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT); + if (err != 0) + goto out; + buf = (void *)segment_va; + new_image_stage->map_addr = segment_va; + new_image_stage->map_size = ptoa(new_segments->size); + new_image_stage->entry = entry; + new_image_stage->map_obj = new_segments; + for (i = 0; i < nseg; i++) { + err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz); + if (err != 0) { + goto out; + } + new_image_stage->segments[i].map_buf = buf; + buf += segtmp[i].bufsz; + tmpsize = segtmp[i].memsz - segtmp[i].bufsz; + if (tmpsize > 0) + memset(buf, 0, tmpsize); + buf += tmpsize; + } + /* What's left are the MD pages, so zero them all out. 
*/ + if (md_pages > 0) + bzero(buf, ptoa(md_pages)); + + cpu_flush_dcache((void *)segment_va, ptoa(page_count)); + if ((err = kexec_load_md(new_image_stage)) != 0) + goto out; + } + if (kexec_obj != NULL) { + vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0); + KASSERT(stage_addr != 0, ("Mapped kexec_obj without address")); + vm_map_remove(kernel_map, stage_addr, stage_addr + ptoa(kexec_obj->size)); + } + kexec_obj = new_segments; + bzero(&staged_image, sizeof(staged_image)); + if (nseg > 0) + memcpy(&staged_image, new_image_stage, sizeof(*new_image_stage)); + stage_addr = segment_va; /* lets the next load unmap this image; 0 when unloading */ + printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry); + if (nseg > 0 && kexec_reboot_handler == NULL) { + kexec_reboot_handler = + EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot, NULL, + SHUTDOWN_PRI_DEFAULT - 150); + } else if (nseg == 0 && kexec_reboot_handler != NULL) { + EVENTHANDLER_DEREGISTER(shutdown_final, kexec_reboot_handler); + /* Drop the stale tag so a later load registers the handler again. */ + kexec_reboot_handler = NULL; + } +out: + /* Clean up the mess if we've gotten far; sizes come from the failed object, not kexec_obj. */ + if (err != 0 && new_segments != NULL) { + vm_object_unwire(new_segments, 0, ptoa(new_segments->size), 0); + if (segment_va != 0) + vm_map_remove(kernel_map, segment_va, segment_va + ptoa(new_segments->size)); + else + vm_object_deallocate(new_segments); + } + atomic_store_rel_int(&kexec_loading, false); + if (new_image_stage != NULL) + free(new_image_stage, M_TEMP); + if (page_list != 0) + free(page_list, M_TEMP); + + return (err); +} + +int +sys_kexec_load(struct thread *td, struct kexec_load_args *uap) +{ + int error; + + // FIXME: Do we need a better privilege check than PRIV_REBOOT here?
+ error = priv_check(td, PRIV_REBOOT); + if (error != 0) + return (error); + return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments, uap->flags)); +} diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 1f9577fddf9c..9f5106316018 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -242,7 +242,7 @@ generic_stop_cpus(cpuset_t map, u_int type) KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 - || type == IPI_SUSPEND + || type == IPI_SUSPEND || type == IPI_OFF #endif , ("%s: invalid stop type", __func__)); @@ -260,7 +260,7 @@ generic_stop_cpus(cpuset_t map, u_int type) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_lock_spin(&smp_ipi_mtx); #endif @@ -280,7 +280,7 @@ generic_stop_cpus(cpuset_t map, u_int type) #endif #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) cpus = &suspended_cpus; else #endif @@ -298,7 +298,7 @@ generic_stop_cpus(cpuset_t map, u_int type) } #if X86 - if (type == IPI_SUSPEND) + if (type == IPI_SUSPEND || type == IPI_OFF) mtx_unlock_spin(&smp_ipi_mtx); #endif @@ -327,6 +327,13 @@ suspend_cpus(cpuset_t map) return (generic_stop_cpus(map, IPI_SUSPEND)); } + +int +offline_cpus(cpuset_t map) +{ + + return (generic_stop_cpus(map, IPI_OFF)); +} #endif /* diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 4cef89cd5219..06a4adc3d8cb 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -604,4 +604,5 @@ const char *syscallnames[] = { "setgroups", /* 596 = setgroups */ "jail_attach_jd", /* 597 = jail_attach_jd */ "jail_remove_jd", /* 598 = jail_remove_jd */ + "kexec_load", /* 599 = kexec_load */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 967af1f5313c..ea6d2b5aa1ef 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3394,4 +3394,12 @@ ); } +599 AUE_NULL STD { + int kexec_load( + uint64_t entry, + u_long nseg, + 
_In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments, + u_long flags + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index e28fef931ea8..5951cebbe74a 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3514,6 +3514,16 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 1; break; } + /* kexec_load */ + case 599: { + struct kexec_load_args *p = params; + uarg[a++] = p->entry; /* uint64_t */ + uarg[a++] = p->nseg; /* u_long */ + uarg[a++] = (intptr_t)p->segments; /* struct kexec_segment * */ + uarg[a++] = p->flags; /* u_long */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -9401,6 +9411,25 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* kexec_load */ + case 599: + switch (ndx) { + case 0: + p = "uint64_t"; + break; + case 1: + p = "u_long"; + break; + case 2: + p = "userland struct kexec_segment *"; + break; + case 3: + p = "u_long"; + break; + default: + break; + }; + break; default: break; }; @@ -11409,6 +11438,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* kexec_load */ + case 599: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile index 4ff023e6bca5..a645c1673167 100644 --- a/sys/modules/dummynet/Makefile +++ b/sys/modules/dummynet/Makefile @@ -1,7 +1,6 @@ .PATH: ${SRCTOP}/sys/netpfil/ipfw KMOD= dummynet -SRCS= ip_dummynet.c -SRCS+= ip_dn_glue.c ip_dn_io.c +SRCS= ip_dummynet.c ip_dn_io.c SRCS+= dn_aqm_codel.c dn_aqm_pie.c SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c index fdf39690160b..2333b9ea8678 100644 --- a/sys/net/altq/altq_cbq.c +++ b/sys/net/altq/altq_cbq.c 
@@ -173,6 +173,8 @@ cbq_request(struct ifaltq *ifq, int req, void *arg) static void get_class_stats(class_stats_t *statsp, struct rm_class *cl) { + memset(statsp, 0, sizeof(*statsp)); + statsp->xmit_cnt = cl->stats_.xmit_cnt; statsp->drop_cnt = cl->stats_.drop_cnt; statsp->over = cl->stats_.over; diff --git a/sys/net/altq/altq_fairq.c b/sys/net/altq/altq_fairq.c index 6069865101a0..0a00168e547e 100644 --- a/sys/net/altq/altq_fairq.c +++ b/sys/net/altq/altq_fairq.c @@ -857,6 +857,8 @@ get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl) { fairq_bucket_t *b; + memset(sp, 0, sizeof(*sp)); + sp->class_handle = cl->cl_handle; sp->qlimit = cl->cl_qlimit; sp->xmit_cnt = cl->cl_xmitcnt; diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c index 026346639b2e..fec488418546 100644 --- a/sys/net/altq/altq_priq.c +++ b/sys/net/altq/altq_priq.c @@ -597,6 +597,8 @@ priq_purgeq(struct priq_class *cl) static void get_class_stats(struct priq_classstats *sp, struct priq_class *cl) { + memset(sp, 0, sizeof(*sp)); + sp->class_handle = cl->cl_handle; sp->qlength = qlen(cl->cl_q); sp->qlimit = qlimit(cl->cl_q); diff --git a/sys/net/if.c b/sys/net/if.c index b6a798aa0fab..cb9c47c14c32 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -2842,15 +2842,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) break; case SIOCAIFGROUP: + { + const char *groupname; + error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); - error = if_addgroup(ifp, - ((struct ifgroupreq *)data)->ifgr_group); + groupname = ((struct ifgroupreq *)data)->ifgr_group; + if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ) + return (EINVAL); + error = if_addgroup(ifp, groupname); if (error != 0) return (error); break; - + } case SIOCGIFGROUP: { struct epoch_tracker et; @@ -2862,15 +2867,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) } case SIOCDIFGROUP: + { + const char *groupname; + error = priv_check(td, PRIV_NET_DELIFGROUP); 
if (error) return (error); - error = if_delgroup(ifp, - ((struct ifgroupreq *)data)->ifgr_group); + groupname = ((struct ifgroupreq *)data)->ifgr_group; + if (strnlen(groupname, IFNAMSIZ) == IFNAMSIZ) + return (EINVAL); + error = if_delgroup(ifp, groupname); if (error != 0) return (error); break; - + } default: error = ENOIOCTL; break; @@ -3014,9 +3024,17 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) goto out_noref; case SIOCGIFGMEMB: - error = if_getgroupmembers((struct ifgroupreq *)data); - goto out_noref; + { + struct ifgroupreq *req; + req = (struct ifgroupreq *)data; + if (strnlen(req->ifgr_name, IFNAMSIZ) == IFNAMSIZ) { + error = EINVAL; + goto out_noref; + } + error = if_getgroupmembers(req); + goto out_noref; + } #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: diff --git a/sys/net/if_var.h b/sys/net/if_var.h index f2df612b19c1..961259bb0ca1 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -383,18 +383,18 @@ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; - CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ - CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ + CK_STAILQ_HEAD(, ifg_member) ifg_members; + CK_STAILQ_ENTRY(ifg_group) ifg_next; }; struct ifg_member { - CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ + CK_STAILQ_ENTRY(ifg_member) ifgm_next; if_t ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; - CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ + CK_STAILQ_ENTRY(ifg_list) ifgl_next; }; #ifdef _SYS_EVENTHANDLER_H_ diff --git a/sys/netgraph/ng_device.c b/sys/netgraph/ng_device.c index 582f877ff3ed..066e3be29694 100644 --- a/sys/netgraph/ng_device.c +++ b/sys/netgraph/ng_device.c @@ -3,6 +3,7 @@ * * Copyright (c) 2002 Mark Santcroos <marks@ripe.net> * Copyright (c) 2004-2005 Gleb Smirnoff <glebius@FreeBSD.org> + * Copyright (c) 2025 Quentin Thébault <quentin.thebault@defenso.fr> * * Redistribution and use in source and binary forms, with or without * 
modification, are permitted provided that the following conditions @@ -50,6 +51,7 @@ #include <sys/poll.h> #include <sys/proc.h> #include <sys/queue.h> +#include <sys/selinfo.h> #include <sys/socket.h> #include <sys/syslog.h> #include <sys/uio.h> @@ -117,12 +119,15 @@ struct ngd_private { struct ng_node *node; struct ng_hook *hook; struct cdev *ngddev; + struct selinfo rsel; + struct selinfo wsel; struct mtx ngd_mtx; int unit; int ether_align; uint16_t flags; #define NGDF_OPEN 0x0001 #define NGDF_RWAIT 0x0002 +#define NGDF_DYING 0x0004 }; typedef struct ngd_private *priv_p; @@ -138,6 +143,24 @@ static d_read_t ngdread; static d_write_t ngdwrite; static d_ioctl_t ngdioctl; static d_poll_t ngdpoll; +static d_kqfilter_t ngdkqfilter; + +static int ngd_kqread_event(struct knote *, long); +static int ngd_kqwrite_event(struct knote *, long); +static void ngd_kqread_detach(struct knote *); +static void ngd_kqwrite_detach(struct knote *); + +static const struct filterops ngd_read_filterops = { + .f_isfd = 1, + .f_detach = ngd_kqread_detach, + .f_event = ngd_kqread_event +}; + +static const struct filterops ngd_write_filterops = { + .f_isfd = 1, + .f_detach = ngd_kqwrite_detach, + .f_event = ngd_kqwrite_event +}; static struct cdevsw ngd_cdevsw = { .d_version = D_VERSION, @@ -146,6 +169,7 @@ static struct cdevsw ngd_cdevsw = { .d_read = ngdread, .d_write = ngdwrite, .d_ioctl = ngdioctl, + .d_kqfilter = ngdkqfilter, .d_poll = ngdpoll, .d_name = NG_DEVICE_DEVNAME, }; @@ -198,6 +222,9 @@ ng_device_constructor(node_p node) mtx_init(&priv->readq.ifq_mtx, "ng_device queue", NULL, MTX_DEF); IFQ_SET_MAXLEN(&priv->readq, ifqmaxlen); + knlist_init_mtx(&priv->rsel.si_note, &priv->ngd_mtx); + knlist_init_mtx(&priv->wsel.si_note, &priv->ngd_mtx); + /* Link everything together */ NG_NODE_SET_PRIVATE(node, priv); priv->node = node; @@ -206,6 +233,8 @@ ng_device_constructor(node_p node) GID_WHEEL, 0600, NG_DEVICE_DEVNAME "%d", priv->unit); if (priv->ngddev == NULL) { printf("%s(): 
make_dev() failed\n", __func__); + knlist_destroy(&priv->rsel.si_note); + knlist_destroy(&priv->wsel.si_note); mtx_destroy(&priv->ngd_mtx); mtx_destroy(&priv->readq.ifq_mtx); free_unr(ngd_unit, priv->unit); @@ -319,6 +348,8 @@ ng_device_rcvdata(hook_p hook, item_p item) priv->flags &= ~NGDF_RWAIT; wakeup(priv); } + selwakeup(&priv->rsel); + KNOTE_LOCKED(&priv->rsel.si_note, 0); mtx_unlock(&priv->ngd_mtx); return (0); @@ -334,9 +365,22 @@ ng_device_disconnect(hook_p hook) DBG; + mtx_lock(&priv->ngd_mtx); + priv->flags |= NGDF_DYING; + wakeup(priv); + mtx_unlock(&priv->ngd_mtx); + destroy_dev(priv->ngddev); + + knlist_clear(&priv->rsel.si_note, 0); + knlist_clear(&priv->wsel.si_note, 0); + knlist_destroy(&priv->rsel.si_note); + knlist_destroy(&priv->wsel.si_note); mtx_destroy(&priv->ngd_mtx); + seldrain(&priv->rsel); + seldrain(&priv->wsel); + IF_DRAIN(&priv->readq); mtx_destroy(&(priv)->readq.ifq_mtx); @@ -493,9 +537,13 @@ ngdread(struct cdev *dev, struct uio *uio, int flag) return (EWOULDBLOCK); mtx_lock(&priv->ngd_mtx); priv->flags |= NGDF_RWAIT; - if ((error = msleep(priv, &priv->ngd_mtx, - PDROP | PCATCH | PZERO, - "ngdread", 0)) != 0) + if (priv->flags & NGDF_DYING) { + mtx_unlock(&priv->ngd_mtx); + error = ENXIO; + } else + error = mtx_sleep(priv, &priv->ngd_mtx, + PDROP | PCATCH, "ngdread", 0); + if (error != 0) return (error); } } while (m == NULL); @@ -538,9 +586,12 @@ ngdwrite(struct cdev *dev, struct uio *uio, int flag) if (m == NULL) return (ENOBUFS); + /* Setting VNET is required if connecting to a ng_bridge. 
*/ + CURVNET_SET(priv->node->nd_vnet); NET_EPOCH_ENTER(et); NG_SEND_DATA_ONLY(error, priv->hook, m); NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); return (error); } @@ -561,3 +612,72 @@ ngdpoll(struct cdev *dev, int events, struct thread *td) return (revents); } + +static void +ngd_kqread_detach(struct knote *kn) +{ + priv_p priv = (priv_p)kn->kn_hook; + + knlist_remove(&priv->rsel.si_note, kn, 0); +} + +static int +ngd_kqread_event(struct knote *kn, long hint) +{ + priv_p priv = (priv_p)kn->kn_hook; + struct mbuf *m; + + IFQ_LOCK(&priv->readq); + if (IFQ_IS_EMPTY(&priv->readq)) { + kn->kn_data = 0; + } else { + /* + * Since the queue does not store the total number of bytes that + * could be read across all packets and we do not want to + * traverse the whole queue, we only report the number of bytes + * for the first packet in the queue. + */ + IF_POLL(&priv->readq, m); + kn->kn_data = m->m_len; + } + IFQ_UNLOCK(&priv->readq); + + return (kn->kn_data > 0); +} + +static void +ngd_kqwrite_detach(struct knote *kn) +{ + priv_p priv = (priv_p)kn->kn_hook; + + knlist_remove(&priv->wsel.si_note, kn, 0); +} + +static int +ngd_kqwrite_event(struct knote *kn, long hint) +{ + kn->kn_data = IP_MAXPACKET; + + return (1); +} + +static int +ngdkqfilter(struct cdev *dev, struct knote *kn) +{ + priv_p priv = (priv_p)dev->si_drv1; + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &ngd_read_filterops; + kn->kn_hook = priv; + knlist_add(&priv->rsel.si_note, kn, 0); + return (0); + case EVFILT_WRITE: + kn->kn_fop = &ngd_write_filterops; + kn->kn_hook = priv; + knlist_add(&priv->wsel.si_note, kn, 0); + return (0); + default: + return (EINVAL); + } +} diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 66070faf97e9..bfe608be6b36 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -680,7 +680,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ - case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr 
!= NULL) error = ip_dn_ctl_ptr(sopt); else @@ -747,9 +746,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: - case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 3cb538f7054d..3a7755e9f09e 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1380,6 +1380,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct tcpcb *tp; struct socket *rv = NULL; struct syncache *sc = NULL; + struct ucred *cred; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; @@ -1408,6 +1409,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); + cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { @@ -1636,16 +1638,16 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* * sc_cred is only used in syncache_pcblist() to list TCP endpoints in * TCPS_SYN_RECEIVED state when V_tcp_syncache.see_other is false. - * Therefore, store the credentials and take a reference count only - * when needed: + * Therefore, store the credentials only when needed: * - sc is allocated from the zone and not using the on stack instance. * - the sysctl variable net.inet.tcp.syncache.see_other is false. * The reference count is decremented when a zone allocated sc is * freed in syncache_free(). 
*/ - if (sc != &scs && !V_tcp_syncache.see_other) - sc->sc_cred = crhold(so->so_cred); - else + if (sc != &scs && !V_tcp_syncache.see_other) { + sc->sc_cred = cred; + cred = NULL; + } else sc->sc_cred = NULL; sc->sc_port = port; sc->sc_ipopts = ipopts; @@ -1783,6 +1785,8 @@ donenoprobe: tcp_fastopen_decrement_counter(tfo_pending); tfo_expanded: + if (cred != NULL) + crfree(cred); if (sc == NULL || sc == &scs) { #ifdef MAC mac_syncache_destroy(&maclabel); diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 04d01099d54a..f1d952037d5a 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1172,7 +1172,19 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, else INP_RLOCK(inp); NET_EPOCH_ENTER(et); +#ifdef INET6 + if ((flags & PRUS_IPV6) != 0) { + if ((inp->in6p_outputopts != NULL) && + (inp->in6p_outputopts->ip6po_tclass != -1)) + tos = (u_char)inp->in6p_outputopts->ip6po_tclass; + else + tos = 0; + } else { + tos = inp->inp_ip_tos; + } +#else tos = inp->inp_ip_tos; +#endif if (control != NULL) { /* * XXX: Currently, we assume all the optional information is @@ -1196,6 +1208,23 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, error = udp_v4mapped_pktinfo(cm, &src, inp, flags); if (error != 0) break; + if (((flags & PRUS_IPV6) != 0) && + (cm->cmsg_level == IPPROTO_IPV6) && + (cm->cmsg_type == IPV6_TCLASS)) { + int tclass; + + if (cm->cmsg_len != CMSG_LEN(sizeof(int))) { + error = EINVAL; + break; + } + tclass = *(int *)CMSG_DATA(cm); + if (tclass < -1 || tclass > 255) { + error = EINVAL; + break; + } + if (tclass != -1) + tos = (u_char)tclass; + } #endif if (cm->cmsg_level != IPPROTO_IP) continue; diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c deleted file mode 100644 index 0412b730e4df..000000000000 --- a/sys/netpfil/ipfw/ip_dn_glue.c +++ /dev/null @@ -1,858 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2010 
Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -/* - * - * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 - */ - -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> - -#include <netpfil/ipfw/ip_fw_private.h> -#include <netpfil/ipfw/dn_heap.h> -#include <netpfil/ipfw/ip_dn_private.h> -#ifdef NEW_AQM -#include <netpfil/ipfw/dn_aqm.h> -#endif -#include <netpfil/ipfw/dn_sched.h> - -/* FREEBSD7.2 ip_dummynet.h r191715*/ - -struct dn_heap_entry7 { - int64_t key; /* sorting key. Topmost element is smallest one */ - void *object; /* object pointer */ -}; - -struct dn_heap7 { - int size; - int elements; - int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry7 *p; /* really an array of "size" entries */ -}; - -/* Common to 7.2 and 8 */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DNOLD_HAVE_FLOW_MASK 0x0001 -#define DNOLD_IS_RED 0x0002 -#define DNOLD_IS_GENTLE_RED 0x0004 -#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. 
*/ -#define DNOLD_IS_PIPE 0x4000 -#define DNOLD_IS_QUEUE 0x8000 - - struct dn_pipe7 *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ - int qsize ; /* queue size in slots or bytes */ - int plr[4] ; /* pkt loss rate (2^31-1 means 100%) */ - - struct ipfw_flow_id flow_mask ; - - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue7 **rq ; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ -#define SCALE_RED 16 -#define SCALE(x) ( (x) << SCALE_RED ) -#define SCALE_VAL(x) ( (x) >> SCALE_RED ) -#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ -}; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); - -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 -struct dn_flow_queue7 { - struct dn_flow_queue7 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - u_long numbytes; - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t 
drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - u_int32_t q_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe7 { /* a pipe */ - SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - uint32_t bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - int numbytes; - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - - /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. 
- */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ -}; -SLIST_HEAD(dn_pipe_head7, dn_pipe7); - -/* FREEBSD8 ip_dummynet.h r196045 */ -struct dn_flow_queue8 { - struct dn_flow_queue8 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - int64_t idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe8 { /* a pipe */ - SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - uint32_t bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). 
*/ - uint64_t burst; /* burst size, scaled: bits * hz */ - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - int64_t idle_time; /* start of pipe idle time */ - - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ - - /* fields to simulate a delay profile */ -#define ED_MAX_NAME_LEN 32 - char name[ED_MAX_NAME_LEN]; - int loss_level; - int samples_no; - int *samples; -}; - -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max8 { - struct dn_pipe8 pipe; - int samples[ED_MAX_SAMPLES_NO]; -}; -SLIST_HEAD(dn_pipe_head8, dn_pipe8); - -/* - * Changes from 7.2 to 8: - * dn_pipe: - * numbytes from int to int64_t - * add burst (int64_t) - * add idle_time (int64_t) - * add profile - * add struct dn_pipe_max - * add flag DN_HAS_PROFILE - * - * dn_flow_queue - * numbytes from u_long to int64_t - * add extra_bits (int64_t) - * q_time from u_int32_t to int64_t and name idle_time - * - * dn_flow_set unchanged - * - */ - -/* NOTE:XXX copied from dummynet.c */ -#define O_NEXT(p, len) ((void *)((char *)p + len)) -static void -oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) -{ - oid->len = len; - oid->type = type; - oid->subtype = 0; - oid->id = id; -} -/* make room in the buffer and move the pointer forward */ -static void * -o_next(struct dn_id **o, int len, int type) -{ - struct dn_id *ret = *o; - oid_fill(ret, len, type, 0); - *o = O_NEXT(*o, len); - return ret; -} - -static size_t pipesize7 = sizeof(struct dn_pipe7); -static size_t pipesize8 = sizeof(struct dn_pipe8); -static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); - -/* Indicate 'ipfw' version - * 1: from FreeBSD 7.2 - * 0: from FreeBSD 8 - * -1: unknown (for now is unused) - * - * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives - * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, - * it is suppose to be the FreeBSD 8 
version. - */ -static int is7 = 0; - -static int -convertflags2new(int src) -{ - int dst = 0; - - if (src & DNOLD_HAVE_FLOW_MASK) - dst |= DN_HAVE_MASK; - if (src & DNOLD_QSIZE_IS_BYTES) - dst |= DN_QSIZE_BYTES; - if (src & DNOLD_NOERROR) - dst |= DN_NOERROR; - if (src & DNOLD_IS_RED) - dst |= DN_IS_RED; - if (src & DNOLD_IS_GENTLE_RED) - dst |= DN_IS_GENTLE_RED; - if (src & DNOLD_HAS_PROFILE) - dst |= DN_HAS_PROFILE; - - return dst; -} - -static int -convertflags2old(int src) -{ - int dst = 0; - - if (src & DN_HAVE_MASK) - dst |= DNOLD_HAVE_FLOW_MASK; - if (src & DN_IS_RED) - dst |= DNOLD_IS_RED; - if (src & DN_IS_GENTLE_RED) - dst |= DNOLD_IS_GENTLE_RED; - if (src & DN_NOERROR) - dst |= DNOLD_NOERROR; - if (src & DN_HAS_PROFILE) - dst |= DNOLD_HAS_PROFILE; - if (src & DN_QSIZE_BYTES) - dst |= DNOLD_QSIZE_IS_BYTES; - - return dst; -} - -static int -dn_compat_del(void *v) -{ - struct dn_pipe7 *p = (struct dn_pipe7 *) v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; - struct { - struct dn_id oid; - uintptr_t a[1]; /* add more if we want a list */ - } cmd; - - /* XXX DN_API_VERSION ??? */ - oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); - - if (is7) { - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL; - } else { - if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) - return EINVAL; - if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) - return EINVAL; - } - - if (p->pipe_nr != 0) { /* pipe x delete */ - cmd.a[0] = p->pipe_nr; - cmd.oid.subtype = DN_LINK; - } else { /* queue x delete */ - cmd.oid.subtype = DN_FS; - cmd.a[0] = (is7) ? 
p->fs.fs_nr : p8->fs.fs_nr; - } - - return do_config(&cmd, cmd.oid.len); -} - -static int -dn_compat_config_queue(struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - struct dn_flow_set *f; - - if (is7) - f = &p7->fs; - else - f = &p8->fs; - - fs->fs_nr = f->fs_nr; - fs->sched_nr = f->parent_nr; - fs->flow_mask = f->flow_mask; - fs->buckets = f->rq_size; - fs->qsize = f->qsize; - fs->plr[0] = f->plr[0]; - fs->plr[1] = f->plr[1]; - fs->plr[2] = f->plr[2]; - fs->plr[3] = f->plr[3]; - fs->par[0] = f->weight; - fs->flags = convertflags2new(f->flags_fs); - if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { - fs->w_q = f->w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->max_p; - } - - return 0; -} - -static int -dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, - struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - int i = p7->pipe_nr; - - sch->sched_nr = i; - sch->oid.subtype = 0; - p->link_nr = i; - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Common to 7 and 8 */ - p->bandwidth = p7->bandwidth; - p->delay = p7->delay; - if (!is7) { - /* FreeBSD 8 has burst */ - p->burst = p8->burst; - } - - /* fill the fifo flowset */ - dn_compat_config_queue(fs, v); - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Move scheduler related parameter from fs to sch */ - sch->buckets = fs->buckets; /*XXX*/ - fs->buckets = 0; - if (fs->flags & DN_HAVE_MASK) { - sch->flags |= DN_HAVE_MASK; - fs->flags &= ~DN_HAVE_MASK; - sch->sched_mask = fs->flow_mask; - bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); - } - - return 0; -} - -static int -dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, - void *v) -{ - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); - - pf->link_nr = p->link_nr; - 
pf->loss_level = p8->loss_level; -// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? - pf->samples_no = p8->samples_no; - strncpy(pf->name, p8->name,sizeof(pf->name)); - bcopy(p8->samples, pf->samples, sizeof(pf->samples)); - - return 0; -} - -/* - * If p->pipe_nr != 0 the command is 'pipe x config', so need to create - * the three main struct, else only a flowset is created - */ -static int -dn_compat_configure(void *v) -{ - struct dn_id *buf = NULL, *base; - struct dn_sch *sch = NULL; - struct dn_link *p = NULL; - struct dn_fs *fs = NULL; - struct dn_profile *pf = NULL; - int lmax; - int error; - - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - int i; /* number of object to configure */ - - lmax = sizeof(struct dn_id); /* command header */ - lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + - sizeof(struct dn_fs) + sizeof(struct dn_profile); - - base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); - o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); - base->id = DN_API_VERSION; - - /* pipe_nr is the same in p7 and p8 */ - i = p7->pipe_nr; - if (i != 0) { /* pipe config */ - sch = o_next(&buf, sizeof(*sch), DN_SCH); - p = o_next(&buf, sizeof(*p), DN_LINK); - fs = o_next(&buf, sizeof(*fs), DN_FS); - - error = dn_compat_config_pipe(sch, p, fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - if (!is7 && p8->samples_no > 0) { - /* Add profiles*/ - pf = o_next(&buf, sizeof(*pf), DN_PROFILE); - error = dn_compat_config_profile(pf, p, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - } else { /* queue config */ - fs = o_next(&buf, sizeof(*fs), DN_FS); - error = dn_compat_config_queue(fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - error = do_config(base, (char *)buf - (char *)base); - - if (buf) - free(buf, M_DUMMYNET); - return error; -} - -int -dn_compat_calc_size(void) -{ - int need = 0; - /* XXX use FreeBSD 8 struct size */ - /* NOTE: - * 
- half scheduler: schk_count/2 - * - all flowset: fsk_count - * - all flowset queues: queue_count - * - all pipe queue: si_count - */ - need += V_dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; - need += V_dn_cfg.fsk_count * sizeof(struct dn_flow_set); - need += V_dn_cfg.si_count * sizeof(struct dn_flow_queue8); - need += V_dn_cfg.queue_count * sizeof(struct dn_flow_queue8); - - return need; -} - -int -dn_c_copy_q (void *_ni, void *arg) -{ - struct copy_args *a = arg; - struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; - struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; - struct dn_flow *ni = (struct dn_flow *)_ni; - int size = 0; - - /* XXX hash slot not set */ - /* No difference between 7.2/8 */ - fq7->len = ni->length; - fq7->len_bytes = ni->len_bytes; - fq7->id = ni->fid; - - if (is7) { - size = sizeof(struct dn_flow_queue7); - fq7->tot_pkts = ni->tot_pkts; - fq7->tot_bytes = ni->tot_bytes; - fq7->drops = ni->drops; - } else { - size = sizeof(struct dn_flow_queue8); - fq8->tot_pkts = ni->tot_pkts; - fq8->tot_bytes = ni->tot_bytes; - fq8->drops = ni->drops; - } - - *a->start += size; - return 0; -} - -int -dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) -{ - struct dn_link *l = &s->link; - struct dn_fsk *f = s->fs; - - struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; - struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; - struct dn_flow_set *fs; - int size = 0; - - if (is7) { - fs = &pipe7->fs; - size = sizeof(struct dn_pipe7); - } else { - fs = &pipe8->fs; - size = sizeof(struct dn_pipe8); - } - - /* These 4 field are the same in pipe7 and pipe8 */ - pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; - pipe7->bandwidth = l->bandwidth; - pipe7->delay = l->delay * 1000 / hz; - pipe7->pipe_nr = l->link_nr - DN_MAX_ID; - - if (!is7) { - if (s->profile) { - struct dn_profile *pf = s->profile; - strncpy(pipe8->name, pf->name, sizeof(pf->name)); - pipe8->loss_level = pf->loss_level; - pipe8->samples_no = 
pf->samples_no; - } - pipe8->burst = div64(l->burst , 8 * hz); - } - - fs->flow_mask = s->sch.sched_mask; - fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; - - fs->parent_nr = l->link_nr - DN_MAX_ID; - fs->qsize = f->fs.qsize; - fs->plr[0] = f->fs.plr[0]; - fs->plr[1] = f->fs.plr[1]; - fs->plr[2] = f->fs.plr[2]; - fs->plr[3] = f->fs.plr[3]; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->rq_elements = nq; - - fs->flags_fs = convertflags2old(f->fs.flags); - - *a->start += size; - return 0; -} - -int -dn_compat_copy_pipe(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int pipe_size = sizeof(struct dn_pipe8); - int queue_size = sizeof(struct dn_flow_queue8); - int n_queue = 0; /* number of queues */ - - struct dn_schk *s = (struct dn_schk *)_o; - /* calculate needed space: - * - struct dn_pipe - * - if there are instances, dn_queue * n_instances - */ - n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : - (s->siht ? 1 : 0)); - need = pipe_size + queue_size * n_queue; - if (have < need) { - D("have %d < need %d", have, need); - return 1; - } - /* copy pipe */ - dn_c_copy_pipe(s, a, n_queue); - - /* copy queues */ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, dn_c_copy_q, a); - else if (s->siht) - dn_c_copy_q(s->siht, a); - return 0; -} - -int -dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) -{ - struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; - - fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs->fs_nr = f->fs.fs_nr; - fs->qsize = f->fs.qsize; - fs->plr[0] = f->fs.plr[0]; - fs->plr[1] = f->fs.plr[1]; - fs->plr[2] = f->fs.plr[2]; - fs->plr[3] = f->fs.plr[3]; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->flow_mask = f->fs.flow_mask; - fs->rq_elements = nq; - fs->rq_size = (f->fs.buckets ? 
f->fs.buckets : 1); - fs->parent_nr = f->fs.sched_nr; - fs->weight = f->fs.par[0]; - - fs->flags_fs = convertflags2old(f->fs.flags); - *a->start += sizeof(struct dn_flow_set); - return 0; -} - -int -dn_compat_copy_queue(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int fs_size = sizeof(struct dn_flow_set); - int queue_size = sizeof(struct dn_flow_queue8); - - struct dn_fsk *fs = (struct dn_fsk *)_o; - int n_queue = 0; /* number of queues */ - - n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : - (fs->qht ? 1 : 0)); - - need = fs_size + queue_size * n_queue; - if (have < need) { - D("have < need"); - return 1; - } - - /* copy flowset */ - dn_c_copy_fs(fs, a, n_queue); - - /* copy queues */ - if (fs->fs.flags & DN_HAVE_MASK) - dn_ht_scan(fs->qht, dn_c_copy_q, a); - else if (fs->qht) - dn_c_copy_q(fs->qht, a); - - return 0; -} - -int -copy_data_helper_compat(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - - if (a->type == DN_COMPAT_PIPE) { - struct dn_schk *s = _o; - if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { - return 0; /* not old type */ - } - /* copy pipe parameters, and if instance exists, copy - * other parameters and eventually queues. - */ - if(dn_compat_copy_pipe(a, _o)) - return DNHT_SCAN_END; - } else if (a->type == DN_COMPAT_QUEUE) { - struct dn_fsk *fs = _o; - if (fs->fs.fs_nr >= DN_MAX_ID) - return 0; - if (dn_compat_copy_queue(a, _o)) - return DNHT_SCAN_END; - } - return 0; -} - -/* Main function to manage old requests */ -int -ip_dummynet_compat(struct sockopt *sopt) -{ - int error=0; - void *v = NULL; - struct dn_id oid; - - /* Length of data, used to found ipfw version... 
*/ - int len = sopt->sopt_valsize; - - /* len can be 0 if command was dummynet_flush */ - if (len == pipesize7) { - D("setting compatibility with FreeBSD 7.2"); - is7 = 1; - } - else if (len == pipesize8 || len == pipesizemax8) { - D("setting compatibility with FreeBSD 8"); - is7 = 0; - } - - switch (sopt->sopt_name) { - default: - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); - do_config(&oid, oid.len); - break; - - case IP_DUMMYNET_DEL: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_del(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_CONFIGURE: - v = malloc(len, M_TEMP, M_NOWAIT); - if (v == NULL) { - error = ENOMEM; - break; - } - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_configure(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_GET: { - void *buf; - int ret; - int original_size = sopt->sopt_valsize; - int size; - - ret = dummynet_get(sopt, &buf); - if (ret) - return 0;//XXX ? 
- size = sopt->sopt_valsize; - sopt->sopt_valsize = original_size; - D("size=%d, buf=%p", size, buf); - ret = sooptcopyout(sopt, buf, size); - if (ret) - printf(" %s ERROR sooptcopyout\n", __FUNCTION__); - if (buf) - free(buf, M_DUMMYNET); - } - } - - return error; -} diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h index 756a997b6ec3..9a43b86791e0 100644 --- a/sys/netpfil/ipfw/ip_dn_private.h +++ b/sys/netpfil/ipfw/ip_dn_private.h @@ -437,15 +437,7 @@ struct copy_args { }; struct sockopt; -int ip_dummynet_compat(struct sockopt *sopt); -int dummynet_get(struct sockopt *sopt, void **compat); -int dn_c_copy_q (void *_ni, void *arg); -int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); -int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); -int dn_compat_copy_queue(struct copy_args *a, void *_o); -int dn_compat_copy_pipe(struct copy_args *a, void *_o); -int copy_data_helper_compat(void *_o, void *_arg); -int dn_compat_calc_size(void); +int dummynet_get(struct sockopt *sopt); int do_config(void *p, size_t l); /* function to drain idle object */ diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index d522f9da0fbe..61442c617753 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -2198,9 +2198,6 @@ compute_space(struct dn_id *cmd, struct copy_args *a) case DN_FS: /* queue show */ x = DN_C_FS | DN_C_QUEUE; break; - case DN_GET_COMPAT: /* compatibility mode */ - need = dn_compat_calc_size(); - break; } a->flags = x; if (x & DN_C_SCH) { @@ -2226,11 +2223,9 @@ compute_space(struct dn_id *cmd, struct copy_args *a) } /* - * If compat != NULL dummynet_get is called in compatibility mode. 
- * *compat will be the pointer to the buffer to pass to ipfw */ int -dummynet_get(struct sockopt *sopt, void **compat) +dummynet_get(struct sockopt *sopt) { int have, i, need, error; char *start = NULL, *buf; @@ -2248,37 +2243,28 @@ dummynet_get(struct sockopt *sopt, void **compat) cmd = &r.o; - if (!compat) { - /* copy at least an oid, and possibly a full object */ - error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - l = cmd->len; + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; #ifdef EMULATE_SYSCTL - /* sysctl emulation. */ - if (cmd->type == DN_SYSCTL_GET) - return kesysctl_emu_get(sopt); + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); #endif - if (l > sizeof(r)) { - /* request larger than default, allocate buffer */ - cmd = malloc(l, M_DUMMYNET, M_NOWAIT); - if (cmd == NULL) { - error = ENOMEM; - goto done; - } - error = sooptcopyin(sopt, cmd, l, l); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_NOWAIT); + if (cmd == NULL) { + error = ENOMEM; + goto done; } - } else { /* compatibility */ - error = 0; - cmd->type = DN_CMD_GET; - cmd->len = sizeof(struct dn_id); - cmd->subtype = DN_GET_COMPAT; - // cmd->id = sopt_valsize; - D("compatibility mode"); + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; } #ifdef NEW_AQM @@ -2337,12 +2323,7 @@ dummynet_get(struct sockopt *sopt, void **compat) } if (start == NULL) { - if (compat) { - *compat = NULL; - error = 1; // XXX - } else { - error = sooptcopyout(sopt, cmd, sizeof(*cmd)); - } + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); goto done; } ND("have %d:%d sched %d, %d:%d links %d, 
%d:%d flowsets %d, " @@ -2355,35 +2336,20 @@ dummynet_get(struct sockopt *sopt, void **compat) sopt->sopt_valsize = sopt_valsize; a.type = cmd->subtype; - if (compat == NULL) { - memcpy(start, cmd, sizeof(*cmd)); - ((struct dn_id*)(start))->len = sizeof(struct dn_id); - buf = start + sizeof(*cmd); - } else - buf = start; + memcpy(start, cmd, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); a.start = &buf; a.end = start + have; /* start copying other objects */ - if (compat) { - a.type = DN_COMPAT_PIPE; - dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper_compat, &a); - a.type = DN_COMPAT_QUEUE; - dn_ht_scan(V_dn_cfg.fshash, copy_data_helper_compat, &a); - } else if (a.type == DN_FS) { + if (a.type == DN_FS) { dn_ht_scan(V_dn_cfg.fshash, copy_data_helper, &a); } else { dn_ht_scan(V_dn_cfg.schedhash, copy_data_helper, &a); } DN_BH_WUNLOCK(); - if (compat) { - *compat = start; - sopt->sopt_valsize = buf - start; - /* free() is done by ip_dummynet_compat() */ - start = NULL; //XXX hack - } else { - error = sooptcopyout(sopt, start, buf - start); - } + error = sooptcopyout(sopt, start, buf - start); done: if (cmd != &r.o) free(cmd, M_DUMMYNET); @@ -2519,17 +2485,9 @@ ip_dn_ctl(struct sockopt *sopt) error = EINVAL; break; - case IP_DUMMYNET_FLUSH: - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: /* remove a pipe or queue */ - case IP_DUMMYNET_GET: - D("dummynet: compat option %d", sopt->sopt_name); - error = ip_dummynet_compat(sopt); - break; - case IP_DUMMYNET3: if (sopt->sopt_dir == SOPT_GET) { - error = dummynet_get(sopt, NULL); + error = dummynet_get(sopt); break; } l = sopt->sopt_valsize; diff --git a/sys/powerpc/include/kexec.h b/sys/powerpc/include/kexec.h new file mode 100644 index 000000000000..a57c50926696 --- /dev/null +++ b/sys/powerpc/include/kexec.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _POWERPC_KEXEC_H_ +#define _POWERPC_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _POWERPC_KEXEC_H_ */ diff --git a/sys/riscv/include/kexec.h b/sys/riscv/include/kexec.h new file mode 100644 index 000000000000..5fb6fd321989 --- /dev/null +++ b/sys/riscv/include/kexec.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RISCV_KEXEC_H_ +#define _RISCV_KEXEC_H_ + +int +kexec_load_md(struct kexec_image *image) +{ + return (ENOSYS); +} + +#define kexec_reboot_md(x) do {} while (0) +#endif /* _RISCV_KEXEC_H_ */ diff --git a/sys/security/audit/audit_bsm_db.c b/sys/security/audit/audit_bsm_db.c index c9f3d5c8a549..358162544287 100644 --- a/sys/security/audit/audit_bsm_db.c +++ b/sys/security/audit/audit_bsm_db.c @@ -56,6 +56,8 @@ #include <security/audit/audit.h> #include <security/audit/audit_private.h> +#include <contrib/ck/include/ck_queue.h> + /* * Hash table functions for the audit event number to event class mask * mapping. 
@@ -64,21 +66,21 @@ struct evclass_elem { au_event_t event; au_class_t class; - LIST_ENTRY(evclass_elem) entry; + CK_LIST_ENTRY(evclass_elem) entry; }; struct evclass_list { - LIST_HEAD(, evclass_elem) head; + CK_LIST_HEAD(, evclass_elem) head; }; static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class"); -static struct rwlock evclass_lock; static struct evclass_list evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE]; - -#define EVCLASS_LOCK_INIT() rw_init(&evclass_lock, "evclass_lock") -#define EVCLASS_RLOCK() rw_rlock(&evclass_lock) -#define EVCLASS_RUNLOCK() rw_runlock(&evclass_lock) -#define EVCLASS_WLOCK() rw_wlock(&evclass_lock) -#define EVCLASS_WUNLOCK() rw_wunlock(&evclass_lock) +static struct mtx evclass_mtx; +#define EVCLASS_LOCK_INIT() mtx_init(&evclass_mtx, "evclass_lock", NULL, MTX_DEF) +#define EVCLASS_WLOCK() mtx_lock(&evclass_mtx); +#define EVCLASS_WUNLOCK() mtx_unlock(&evclass_mtx); +/* make these do something if we ever remove entries from the hash */ +#define EVCLASS_RLOCK() {} +#define EVCLASS_RUNLOCK() {} /* * Hash table maintaining a mapping from audit event numbers to audit event @@ -118,7 +120,7 @@ au_event_class(au_event_t event) EVCLASS_RLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; class = 0; - LIST_FOREACH(evc, &evcl->head, entry) { + CK_LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { class = evc->class; goto out; @@ -150,7 +152,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) EVCLASS_WLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; - LIST_FOREACH(evc, &evcl->head, entry) { + CK_LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { evc->class = class; EVCLASS_WUNLOCK(); @@ -161,7 +163,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) evc = evc_new; evc->event = event; evc->class = class; - LIST_INSERT_HEAD(&evcl->head, evc, entry); + CK_LIST_INSERT_HEAD(&evcl->head, evc, entry); EVCLASS_WUNLOCK(); } @@ -172,7 +174,7 @@ 
au_evclassmap_init(void) EVCLASS_LOCK_INIT(); for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++) - LIST_INIT(&evclass_hash[i].head); + CK_LIST_INIT(&evclass_hash[i].head); /* * Set up the initial event to class mapping for system calls. diff --git a/sys/sys/kexec.h b/sys/sys/kexec.h new file mode 100644 index 000000000000..478193749368 --- /dev/null +++ b/sys/sys/kexec.h @@ -0,0 +1,81 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SYS_KEXEC_H_ +#define _SYS_KEXEC_H_ + +#include <sys/types.h> + +struct kexec_segment { + void *buf; + size_t bufsz; + vm_paddr_t mem; + vm_size_t memsz; +}; + +/* Flags (aligned with Linux) */ +#define KEXEC_ON_CRASH 0x1 + +/* Aligned with Linux's limit */ +#define KEXEC_SEGMENT_MAX 16 + +#ifdef _KERNEL +struct kexec_segment_stage { + vm_page_t first_page; + void *map_buf; + vm_paddr_t target; + vm_size_t size; + vm_pindex_t pindex; +}; + +struct kexec_image { + struct kexec_segment_stage segments[KEXEC_SEGMENT_MAX]; + vm_paddr_t entry; + struct vm_object *map_obj; /* Containing object */ + vm_offset_t map_addr; /* Mapped in kernel space */ + vm_size_t map_size; + vm_page_t first_md_page; + void *md_image; +}; + +#endif + +#ifndef _KERNEL + +__BEGIN_DECLS +int kexec_load(uint64_t, unsigned long, struct kexec_segment *, unsigned long); +__END_DECLS + +#else + +void kexec_reboot_md(struct kexec_image *); +int kexec_load_md(struct kexec_image *); + +#endif + +#endif diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h index 26e78632fb2c..50ad2b78083c 100644 --- a/sys/sys/reboot.h +++ b/sys/sys/reboot.h @@ -61,6 +61,7 @@ #define RB_REROOT 0x200000 /* unmount the rootfs and mount it again */ #define RB_POWERCYCLE 0x400000 /* Power cycle if possible */ #define RB_MUTEMSGS 0x800000 /* start up with console muted after banner */ +#define RB_KEXEC 0x1000000 /* Boot new kernel using kexec */ #define RB_PROBE 0x10000000 /* Probe multiple consoles */ #define RB_MULTIPLE 0x20000000 /* use multiple consoles */ diff --git a/sys/sys/smp.h b/sys/sys/smp.h index 252dc9dc1cae..b642a6014f33 100644 --- a/sys/sys/smp.h +++ b/sys/sys/smp.h @@ -251,6 +251,7 @@ void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); +void cpu_mp_stop(void); /* Go back to single-CPU */ void forward_signal(struct thread *); int restart_cpus(cpuset_t); @@ -259,6 +260,7 @@ int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) 
int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); +int offline_cpus(cpuset_t); #endif void smp_rendezvous_action(void); diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index cff27b8be316..43f46f063e3e 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -537,4 +537,5 @@ #define SYS_setgroups 596 #define SYS_jail_attach_jd 597 #define SYS_jail_remove_jd 598 -#define SYS_MAXSYSCALL 599 +#define SYS_kexec_load 599 +#define SYS_MAXSYSCALL 600 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index 443dbadcfbff..ce29c050885e 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -440,4 +440,5 @@ MIASM = \ getgroups.o \ setgroups.o \ jail_attach_jd.o \ - jail_remove_jd.o + jail_remove_jd.o \ + kexec_load.o diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index d32690634059..8f106150e193 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -47,6 +47,7 @@ struct image_args; struct jail; struct kevent; struct kevent_copyops; +struct kexec_segment; struct kld_file_stat; struct ksiginfo; struct mbuf; @@ -401,6 +402,8 @@ int kern_writev(struct thread *td, int fd, struct uio *auio); int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv); int kern_unmount(struct thread *td, const char *path, int flags); +int kern_kexec_load(struct thread *td, u_long entry, + u_long nseg, struct kexec_segment *seg, u_long flags); /* flags for kern_sigaction */ #define KSA_OSIGSET 0x0001 /* uses osigact_t */ diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 8dda4b4533ea..5f5524a4519b 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1907,6 +1907,12 @@ struct jail_attach_jd_args { struct jail_remove_jd_args { char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; }; +struct kexec_load_args { + char entry_l_[PADL_(uint64_t)]; uint64_t entry; char entry_r_[PADR_(uint64_t)]; + char nseg_l_[PADL_(u_long)]; u_long nseg; char nseg_r_[PADR_(u_long)]; + char segments_l_[PADL_(struct kexec_segment *)]; struct 
kexec_segment * segments; char segments_r_[PADR_(struct kexec_segment *)]; + char flags_l_[PADL_(u_long)]; u_long flags; char flags_r_[PADR_(u_long)]; +}; int sys__exit(struct thread *, struct _exit_args *); int sys_fork(struct thread *, struct fork_args *); int sys_read(struct thread *, struct read_args *); @@ -2313,6 +2319,7 @@ int sys_getgroups(struct thread *, struct getgroups_args *); int sys_setgroups(struct thread *, struct setgroups_args *); int sys_jail_attach_jd(struct thread *, struct jail_attach_jd_args *); int sys_jail_remove_jd(struct thread *, struct jail_remove_jd_args *); +int sys_kexec_load(struct thread *, struct kexec_load_args *); #ifdef COMPAT_43 @@ -3311,6 +3318,7 @@ int freebsd14_setgroups(struct thread *, struct freebsd14_setgroups_args *); #define SYS_AUE_setgroups AUE_SETGROUPS #define SYS_AUE_jail_attach_jd AUE_JAIL_ATTACH #define SYS_AUE_jail_remove_jd AUE_JAIL_REMOVE +#define SYS_AUE_kexec_load AUE_NULL #undef PAD_ #undef PADL_ diff --git a/sys/tools/gdb/README.txt b/sys/tools/gdb/README.txt index 8c31565ddc42..ad1544912c3c 100644 --- a/sys/tools/gdb/README.txt +++ b/sys/tools/gdb/README.txt @@ -8,6 +8,9 @@ be automatically loaded by kgdb when opening a vmcore, so if you add new GDB commands or functions, that script should be updated to import them, and you should document them here. +When improving these scripts, you can use the "kgdb-reload" command to reload +them from /usr/lib/debug/boot/kernel/gdb/*. + To provide some rudimentary testing, selftest.py tries to exercise all of the commands and functions defined here. To use it, run selftest.sh to panic the system. Then, create a kernel dump or attach to the panicked kernel, and invoke @@ -15,6 +18,8 @@ the script with "python import selftest" in (k)gdb. Commands: acttrace Display a backtrace for all on-CPU threads +kgdb-reload Reload all gdb modules, useful when developing the modules + themselves. 
Functions: $PCPU(<field>[, <cpuid>]) Display the value of a PCPU/DPCPU field diff --git a/sys/tools/gdb/acttrace.py b/sys/tools/gdb/acttrace.py index 147effbbddf1..fdd18a4833cd 100644 --- a/sys/tools/gdb/acttrace.py +++ b/sys/tools/gdb/acttrace.py @@ -13,10 +13,8 @@ from pcpu import * class acttrace(gdb.Command): """ - Register an acttrace command with gdb. - - When run, acttrace prints the stack trace of all threads that were on-CPU - at the time of the panic. + Print the stack trace of all threads that were on-CPU at the time of + the panic. """ def __init__(self): super(acttrace, self).__init__("acttrace", gdb.COMMAND_USER) diff --git a/sys/tools/gdb/pcpu.py b/sys/tools/gdb/pcpu.py index aadc4b2d42df..94c451e6eca5 100644 --- a/sys/tools/gdb/pcpu.py +++ b/sys/tools/gdb/pcpu.py @@ -9,7 +9,7 @@ from freebsd import * class pcpu(gdb.Function): """ - Register a function to lookup PCPU and DPCPU variables by name. + A function to look up PCPU and DPCPU fields by name. To look up the value of the PCPU field foo on CPU n, use $PCPU("foo", n). This works for DPCPU fields too. If the CPU ID is diff --git a/sys/tools/gdb/vnet.py b/sys/tools/gdb/vnet.py index 36b4d512a3eb..5f416b2a515a 100644 --- a/sys/tools/gdb/vnet.py +++ b/sys/tools/gdb/vnet.py @@ -10,7 +10,7 @@ from freebsd import * class vnet(gdb.Function): """ - Register a function to look up VNET variables by name. + A function to look up VNET variables by name. To look at the value of a VNET variable V_foo, print $V("foo"). The currently selected thread's VNET is used by default, but can be optionally diff --git a/sys/tools/kernel-gdb.py b/sys/tools/kernel-gdb.py index 8a41ef6efab1..990bdaf31fda 100644 --- a/sys/tools/kernel-gdb.py +++ b/sys/tools/kernel-gdb.py @@ -4,12 +4,40 @@ # SPDX-License-Identifier: BSD-2-Clause # +import importlib import os import sys sys.path.append(os.path.join(os.path.dirname(__file__), "gdb")) -# Import FreeBSD kernel debugging commands and modules below. 
-import acttrace -import pcpu -import vnet +modules = [ + "acttrace", + "freebsd", + "pcpu", + "vnet" +] + + +def reload_modules(modules): + for mod in modules: + if mod in sys.modules: + importlib.reload(sys.modules[mod]) + else: + importlib.import_module(mod) + +reload_modules(modules) + + +class reload(gdb.Command): + """ + Reload the FreeBSD kernel GDB helper scripts. + """ + def __init__(self): + super(reload, self).__init__("kgdb-reload", gdb.COMMAND_USER) + + def invoke(self, arg, from_tty): + reload_modules(modules) + + +# Register the reload command with gdb. +reload() diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index c537d0ee0cdd..551f5527ac00 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -134,7 +134,8 @@ #define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. */ #define IPI_SWI (APIC_IPI_INTS + 8) /* Run clk_intr_event. */ -#define IPI_DYN_FIRST (APIC_IPI_INTS + 9) +#define IPI_OFF (APIC_IPI_INTS + 9) /* Stop CPU forever */ +#define IPI_DYN_FIRST (APIC_IPI_INTS + 10) #define IPI_DYN_LAST (254) /* IPIs allocated at runtime */ /* diff --git a/sys/x86/include/intr_machdep.h b/sys/x86/include/intr_machdep.h index 9e913440c712..497c89b0a7eb 100644 --- a/sys/x86/include/intr_machdep.h +++ b/sys/x86/include/intr_machdep.h @@ -142,6 +142,7 @@ int intr_add_handler(struct intsrc *isrc, const char *name, int intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(struct intsrc *isrc, void *ih, const char *descr); +void intr_disable_all(void); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h index 8b9eb2ec9b66..f5015e9d8a24 100644 --- a/sys/x86/include/x86_smp.h +++ b/sys/x86/include/x86_smp.h @@ -77,6 +77,7 @@ 
extern u_long *ipi_rendezvous_counts[MAXCPU]; inthand_t IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ IDTVEC(ipi_swi), /* Runs delayed SWI */ + IDTVEC(cpuoff), /* CPU goes offline until hard reset */ IDTVEC(cpustop), /* CPU stops & waits to be restarted */ IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ IDTVEC(rendezvous); /* handle CPU rendezvous */ @@ -93,6 +94,7 @@ void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); +void cpuoff_handler(void); void init_secondary_tail(void); void init_secondary(void); void ipi_startup(int apic_id, int vector); diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c index 023c3df22580..a16d2ced8dba 100644 --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -245,6 +245,26 @@ intr_register_source(struct intsrc *isrc) return (0); } +void +intr_disable_all(void) +{ + /* + * Disable all external interrupts. This is used by kexec_reboot() to + * prevent problems on the other side when APs are brought up. + */ + for (int v = 0; v < num_io_irqs; v++) { + struct intsrc *is; + + is = interrupt_sources[v]; + if (is == NULL) + continue; + if (is->is_pic->pic_disable_intr != NULL) { + is->is_pic->pic_disable_source(is, PIC_EOI); + is->is_pic->pic_disable_intr(is); + } + } +} + struct intsrc * intr_lookup_source(int vector) { diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index c0da41a4d222..6b1715853763 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1696,6 +1696,28 @@ cpususpend_handler(void) CPU_CLR_ATOMIC(cpu, &toresume_cpus); } +void +cpuoff_handler(void) +{ + u_int cpu; + + cpu = PCPU_GET(cpuid); + + /* Time to go catatonic. A reset will be required to leave. */ + disable_intr(); + lapic_disable(); + CPU_SET_ATOMIC(cpu, &suspended_cpus); + + /* + * There technically should be no need for the `while` here, since it + * cannot be interrupted (interrupts are disabled). Be safe anyway. 
+ * Any interrupt at this point will likely be fatal, as the page tables + * are likely going away shortly. + */ + while (1) + halt(); +} + /* * Handle an IPI_SWI by waking delayed SWI thread. */ diff --git a/sys/x86/x86/msi.c b/sys/x86/x86/msi.c index 9d5a51f9753c..b38247bf6e45 100644 --- a/sys/x86/x86/msi.c +++ b/sys/x86/x86/msi.c @@ -219,6 +219,14 @@ msi_disable_intr(struct intsrc *isrc) struct msi_intsrc *msi = (struct msi_intsrc *)isrc; msi = msi->msi_first; + + /* + * Interrupt sources are always registered, but never unregistered. + * Handle the case where MSIs have all been unregistered. + */ + if (msi == NULL) + return; + msi->msi_enabled--; if (msi->msi_enabled == 0) { for (u_int i = 0; i < msi->msi_count; i++) diff --git a/tests/sys/file/Makefile b/tests/sys/file/Makefile index beb4452359b7..c1fcef68d08e 100644 --- a/tests/sys/file/Makefile +++ b/tests/sys/file/Makefile @@ -3,7 +3,7 @@ TESTSDIR= ${TESTSBASE}/sys/file BINDIR= ${TESTSDIR} ATF_TESTS_C+= path_test -TAP_TESTS_C+= closefrom_test +ATF_TESTS_C+= closefrom_test TAP_TESTS_C+= dup_test ATF_TESTS_C+= fcntlflags_test TAP_TESTS_SH+= flock_test diff --git a/tests/sys/file/closefrom_test.c b/tests/sys/file/closefrom_test.c index 212d048d7566..a51e1630e24d 100644 --- a/tests/sys/file/closefrom_test.c +++ b/tests/sys/file/closefrom_test.c @@ -25,13 +25,13 @@ * SUCH DAMAGE. */ -#include <sys/cdefs.h> /* * Regression tests for the closefrom(2) system call. */ #include <sys/param.h> #include <sys/mman.h> +#include <sys/stat.h> #include <sys/user.h> #include <sys/wait.h> #include <errno.h> @@ -44,67 +44,57 @@ #include <string.h> #include <unistd.h> -struct shared_info { - int failed; - char tag[64]; - char message[0]; -}; +#include <atf-c.h> -static int test = 1; +static char *shared_page; -static void -ok(const char *descr) +/* + * A variant of ATF_REQUIRE that is suitable for use in child + * processes. 
Since these tests close stderr, errors are reported to + * a shared page of memory checked by the parent process. + */ +#define CHILD_REQUIRE(exp) do { \ + if (!(exp)) \ + child_fail_require(__FILE__, __LINE__, \ + #exp " not met"); \ +} while (0) + +static __dead2 __printflike(3, 4) void +child_fail_require(const char *file, int line, const char *fmt, ...) { + FILE *fp; + va_list ap; - printf("ok %d - %s\n", test, descr); - test++; -} + fp = fmemopen(shared_page, PAGE_SIZE - 1, "w"); + if (fp == NULL) + exit(1); -static void -fail(const char *descr, const char *fmt, ...) -{ - va_list ap; + fprintf(fp, "%s:%d: ", file, line); + va_start(ap, fmt); + vfprintf(fp, fmt, ap); + va_end(ap); + fclose(fp); - printf("not ok %d - %s", test, descr); - test++; - if (fmt) { - va_start(ap, fmt); - printf(" # "); - vprintf(fmt, ap); - va_end(ap); - } - printf("\n"); - exit(1); + exit(0); } -#define fail_err(descr) fail((descr), "%s", strerror(errno)) - -static void -cok(struct shared_info *info, const char *descr) +static pid_t +child_fork(void) { - - info->failed = 0; - strlcpy(info->tag, descr, sizeof(info->tag)); - exit(0); + shared_page = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | + MAP_SHARED, -1, 0); + ATF_REQUIRE_MSG(shared_page != MAP_FAILED, "mmap: %s", strerror(errno)); + return (atf_utils_fork()); } static void -cfail(struct shared_info *info, const char *descr, const char *fmt, ...) +child_wait(pid_t pid) { - va_list ap; - - info->failed = 1; - strlcpy(info->tag, descr, sizeof(info->tag)); - if (fmt) { - va_start(ap, fmt); - vsprintf(info->message, fmt, ap); - va_end(ap); - } - exit(0); + atf_utils_wait(pid, 0, "", ""); + if (shared_page[0] != '\0') + atf_tc_fail("%s", shared_page); } -#define cfail_err(info, descr) cfail((info), (descr), "%s", strerror(errno)) - /* * Use kinfo_getfile() to fetch the list of file descriptors and figure out * the highest open file descriptor. 
@@ -116,9 +106,8 @@ highest_fd(void) int cnt, i, highest; kif = kinfo_getfile(getpid(), &cnt); - if (kif == NULL) - fail_err("kinfo_getfile"); - highest = INT_MIN; + ATF_REQUIRE_MSG(kif != NULL, "kinfo_getfile: %s", strerror(errno)); + highest = -1; for (i = 0; i < cnt; i++) if (kif[i].kf_fd > highest) highest = kif[i].kf_fd; @@ -132,262 +121,253 @@ devnull(void) int fd; fd = open(_PATH_DEVNULL, O_RDONLY); - if (fd < 0) - fail_err("open(\" "_PATH_DEVNULL" \")"); + ATF_REQUIRE_MSG(fd != -1, "open(\" "_PATH_DEVNULL" \"): %s", + strerror(errno)); return (fd); } -int -main(void) +ATF_TC_WITHOUT_HEAD(closefrom_simple); +ATF_TC_BODY(closefrom_simple, tc) { - struct shared_info *info; - pid_t pid; - int fd, flags, i, start; - - printf("1..22\n"); + int fd, start; /* We'd better start up with fd's 0, 1, and 2 open. */ - start = devnull(); - if (start < 3) - fail("open", "bad descriptor %d", start); - ok("open"); + start = highest_fd(); + ATF_REQUIRE(start >= 2); + + fd = devnull(); + ATF_REQUIRE(fd > start); /* Make sure highest_fd() works. */ - fd = highest_fd(); - if (start != fd) - fail("highest_fd", "bad descriptor %d != %d", start, fd); - ok("highest_fd"); - - /* Try to use closefrom() for just closing fd 3. */ - closefrom(start); - fd = highest_fd(); - if (fd != start - 1) - fail("closefrom", "highest fd %d", fd); - ok("closefrom"); + ATF_REQUIRE_INTEQ(fd, highest_fd()); + + /* Try to use closefrom() to close just the new fd. */ + closefrom(fd); + ATF_REQUIRE_INTEQ(start, highest_fd()); +} + +ATF_TC_WITHOUT_HEAD(closefrom_with_holes); +ATF_TC_BODY(closefrom_with_holes, tc) +{ + int i, start; + + start = highest_fd(); /* Eat up 16 descriptors. */ for (i = 0; i < 16; i++) (void)devnull(); - fd = highest_fd(); - if (fd != start + 15) - fail("open 16", "highest fd %d", fd); - ok("open 16"); + + ATF_REQUIRE_INTEQ(start + 16, highest_fd()); /* Close half of them. 
*/ - closefrom(11); - fd = highest_fd(); - if (fd != 10) - fail("closefrom", "highest fd %d", fd); - ok("closefrom"); - - /* Explicitly close descriptors 6 and 8 to create holes. */ - if (close(6) < 0 || close(8) < 0) - fail_err("close2 "); - ok("close 2"); - - /* Verify that close on 6 and 8 fails with EBADF. */ - if (close(6) == 0) - fail("close(6)", "did not fail"); - if (errno != EBADF) - fail_err("close(6)"); - ok("close(6)"); - if (close(8) == 0) - fail("close(8)", "did not fail"); - if (errno != EBADF) - fail_err("close(8)"); - ok("close(8)"); - - /* Close from 4 on. */ - closefrom(4); - fd = highest_fd(); - if (fd != 3) - fail("closefrom", "highest fd %d", fd); - ok("closefrom"); - - /* Allocate a small SHM region for IPC with our child. */ - info = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_ANON | - MAP_SHARED, -1, 0); - if (info == MAP_FAILED) - fail_err("mmap"); - ok("mmap"); - - /* Fork a child process to test closefrom(0). */ - pid = fork(); - if (pid < 0) - fail_err("fork"); + closefrom(start + 9); + ATF_REQUIRE_INTEQ(start + 8, highest_fd()); + + /* Explicitly close two descriptors to create holes. */ + ATF_REQUIRE_MSG(close(start + 3) == 0, "close(start + 3): %s", + strerror(errno)); + ATF_REQUIRE_MSG(close(start + 5) == 0, "close(start + 5): %s", + strerror(errno)); + + /* Verify that close on the closed descriptors fails with EBADF. */ + ATF_REQUIRE_ERRNO(EBADF, close(start + 3) == -1); + ATF_REQUIRE_ERRNO(EBADF, close(start + 5) == -1); + + /* Close most remaining descriptors. */ + closefrom(start + 2); + ATF_REQUIRE_INTEQ(start + 1, highest_fd()); +} + +ATF_TC_WITHOUT_HEAD(closefrom_zero); +ATF_TC_BODY(closefrom_zero, tc) +{ + pid_t pid; + int fd; + + /* Ensure standard descriptors are open. */ + ATF_REQUIRE(highest_fd() >= 2); + + pid = child_fork(); if (pid == 0) { /* Child. 
*/ closefrom(0); fd = highest_fd(); - if (fd >= 0) - cfail(info, "closefrom(0)", "highest fd %d", fd); - cok(info, "closefrom(0)"); + CHILD_REQUIRE(fd == -1); + exit(0); } - if (wait(NULL) < 0) - fail_err("wait"); - if (info->failed) - fail(info->tag, "%s", info->message); - ok(info->tag); - - /* Fork a child process to test closefrom(-1). */ - pid = fork(); - if (pid < 0) - fail_err("fork"); + + child_wait(pid); +} + +ATF_TC_WITHOUT_HEAD(closefrom_negative_one); +ATF_TC_BODY(closefrom_negative_one, tc) +{ + pid_t pid; + int fd; + + /* Ensure standard descriptors are open. */ + ATF_REQUIRE(highest_fd() >= 2); + + pid = child_fork(); if (pid == 0) { /* Child. */ closefrom(-1); fd = highest_fd(); - if (fd >= 0) - cfail(info, "closefrom(-1)", "highest fd %d", fd); - cok(info, "closefrom(-1)"); + CHILD_REQUIRE(fd == -1); + exit(0); } - if (wait(NULL) < 0) - fail_err("wait"); - if (info->failed) - fail(info->tag, "%s", info->message); - ok(info->tag); - - /* Dup stdout to 6. */ - if (dup2(1, 6) < 0) - fail_err("dup2"); - fd = highest_fd(); - if (fd != 6) - fail("dup2", "highest fd %d", fd); - ok("dup2"); + + child_wait(pid); +} + +ATF_TC_WITHOUT_HEAD(closefrom_in_holes); +ATF_TC_BODY(closefrom_in_holes, tc) +{ + int start; + + start = highest_fd(); + ATF_REQUIRE(start >= 2); + + /* Dup stdout to a higher fd. */ + ATF_REQUIRE_INTEQ(start + 4, dup2(1, start + 4)); + ATF_REQUIRE_INTEQ(start + 4, highest_fd()); /* Do a closefrom() starting in a hole. */ - closefrom(4); - fd = highest_fd(); - if (fd != 3) - fail("closefrom", "highest fd %d", fd); - ok("closefrom"); + closefrom(start + 2); + ATF_REQUIRE_INTEQ(start, highest_fd()); /* Do a closefrom() beyond our highest open fd. 
*/ - closefrom(32); - fd = highest_fd(); - if (fd != 3) - fail("closefrom", "highest fd %d", fd); - ok("closefrom"); + closefrom(start + 32); + ATF_REQUIRE_INTEQ(start, highest_fd()); +} + +ATF_TC_WITHOUT_HEAD(closerange_basic); +ATF_TC_BODY(closerange_basic, tc) +{ + struct stat sb; + int i, start; - /* Chew up another 8 fd */ + start = highest_fd(); + + /* Open 8 file descriptors */ for (i = 0; i < 8; i++) (void)devnull(); - fd = highest_fd(); - start = fd - 7; + ATF_REQUIRE_INTEQ(start + 8, highest_fd()); /* close_range() a hole in the middle */ - close_range(start + 3, start + 5, 0); - for (i = start + 3; i < start + 6; ++i) { - if (close(i) == 0 || errno != EBADF) { - --i; - break; - } - } - if (i != start + 6) - fail("close_range", "failed to close at %d in %d - %d", i + 1, - start + 3, start + 6); - ok("close_range"); + ATF_REQUIRE_INTEQ(0, close_range(start + 3, start + 5, 0)); + for (i = start + 3; i < start + 6; ++i) + ATF_REQUIRE_ERRNO(EBADF, fstat(i, &sb) == -1); /* close_range from the middle of the hole */ - close_range(start + 4, start + 6, 0); - if ((i = highest_fd()) != fd) - fail("close_range", "highest fd %d", i); - ok("close_range"); + ATF_REQUIRE_INTEQ(0, close_range(start + 4, start + 6, 0)); + ATF_REQUIRE_INTEQ(start + 8, highest_fd()); /* close_range to the end; effectively closefrom(2) */ - close_range(start + 3, ~0L, 0); - if ((i = highest_fd()) != start + 2) - fail("close_range", "highest fd %d", i); - ok("close_range"); + ATF_REQUIRE_INTEQ(0, close_range(start + 3, ~0L, 0)); + ATF_REQUIRE_INTEQ(start + 2, highest_fd()); /* Now close the rest */ - close_range(start, start + 4, 0); - fd = highest_fd(); - if (fd != 3) - fail("close_range", "highest fd %d", fd); - ok("close_range"); - - /* Fork a child process to test closefrom(0) twice. 
*/ - pid = fork(); - if (pid < 0) - fail_err("fork"); + ATF_REQUIRE_INTEQ(0, close_range(start + 1, start + 4, 0)); + ATF_REQUIRE_INTEQ(start, highest_fd()); +} + +ATF_TC_WITHOUT_HEAD(closefrom_zero_twice); +ATF_TC_BODY(closefrom_zero_twice, tc) +{ + pid_t pid; + int fd; + + /* Ensure standard descriptors are open. */ + ATF_REQUIRE(highest_fd() >= 2); + + pid = child_fork(); if (pid == 0) { /* Child. */ closefrom(0); + fd = highest_fd(); + CHILD_REQUIRE(fd == -1); closefrom(0); - cok(info, "closefrom(0)"); + fd = highest_fd(); + CHILD_REQUIRE(fd == -1); + exit(0); } - if (wait(NULL) < 0) - fail_err("wait"); - if (info->failed) - fail(info->tag, "%s", info->message); - ok(info->tag); - /* test CLOSE_RANGE_CLOEXEC */ + child_wait(pid); +} + +static void +require_fd_flag(int fd, const char *descr, const char *descr2, int flag, + bool set) +{ + int flags; + + flags = fcntl(fd, F_GETFD); + ATF_REQUIRE_MSG(flags >= 0, "fcntl(.., F_GETFD): %s", strerror(errno)); + + if (set) { + ATF_REQUIRE_MSG((flags & flag) == flag, + "%s did not set %s on fd %d", descr, descr2, fd); + } else { + ATF_REQUIRE_MSG((flags & flag) == 0, + "%s set %s when it should not have on fd %d", descr, descr2, + fd); + } +} + +ATF_TC_WITHOUT_HEAD(closerange_CLOEXEC); +ATF_TC_BODY(closerange_CLOEXEC, tc) +{ + int i, start; + + start = highest_fd(); + ATF_REQUIRE(start >= 2); + for (i = 0; i < 8; i++) (void)devnull(); - fd = highest_fd(); - start = fd - 8; - if (close_range(start + 1, start + 4, CLOSE_RANGE_CLOEXEC) < 0) - fail_err("close_range(..., CLOSE_RANGE_CLOEXEC)"); - flags = fcntl(start, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOEXEC) != 0) - fail("close_range", "CLOSE_RANGE_CLOEXEC set close-on-exec " - "when it should not have on fd %d", start); - for (i = start + 1; i <= start + 4; i++) { - flags = fcntl(i, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOEXEC) == 0) - fail("close_range", "CLOSE_RANGE_CLOEXEC did not set 
" - "close-on-exec on fd %d", i); - } - for (; i < start + 8; i++) { - flags = fcntl(i, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOEXEC) != 0) - fail("close_range", "CLOSE_RANGE_CLOEXEC set close-on-exec " - "when it should not have on fd %d", i); + ATF_REQUIRE_INTEQ(start + 8, highest_fd()); + + ATF_REQUIRE_INTEQ(0, close_range(start + 2, start + 5, + CLOSE_RANGE_CLOEXEC)); + for (i = 1; i < 9; i++) { + require_fd_flag(start + i, "CLOSE_RANGE_CLOEXEC", + "close-on-exec", FD_CLOEXEC, i >= 2 && i <= 5); } - if (close_range(start, start + 8, 0) < 0) - fail_err("close_range"); - ok("close_range(..., CLOSE_RANGE_CLOEXEC)"); + ATF_REQUIRE_INTEQ(0, close_range(start + 1, start + 8, 0)); +} + +ATF_TC_WITHOUT_HEAD(closerange_CLOFORK); +ATF_TC_BODY(closerange_CLOFORK, tc) +{ + int i, start; + + start = highest_fd(); + ATF_REQUIRE(start >= 2); - /* test CLOSE_RANGE_CLOFORK */ for (i = 0; i < 8; i++) (void)devnull(); - fd = highest_fd(); - start = fd - 8; - if (close_range(start + 1, start + 4, CLOSE_RANGE_CLOFORK) < 0) - fail_err("close_range(..., CLOSE_RANGE_CLOFORK)"); - flags = fcntl(start, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOFORK) != 0) - fail("close_range", "CLOSE_RANGE_CLOFORK set close-on-exec " - "when it should not have on fd %d", start); - for (i = start + 1; i <= start + 4; i++) { - flags = fcntl(i, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOFORK) == 0) - fail("close_range", "CLOSE_RANGE_CLOFORK did not set " - "close-on-exec on fd %d", i); - } - for (; i < start + 8; i++) { - flags = fcntl(i, F_GETFD); - if (flags < 0) - fail_err("fcntl(.., F_GETFD)"); - if ((flags & FD_CLOFORK) != 0) - fail("close_range", "CLOSE_RANGE_CLOFORK set close-on-exec " - "when it should not have on fd %d", i); + ATF_REQUIRE_INTEQ(start + 8, highest_fd()); + + ATF_REQUIRE_INTEQ(0, close_range(start + 2, start + 5, + CLOSE_RANGE_CLOFORK)); + for (i = 1; i < 9; i++) 
{ + require_fd_flag(start + i, "CLOSE_RANGE_CLOFORK", + "close-on-fork", FD_CLOFORK, i >= 2 && i <= 5); } - if (close_range(start, start + 8, 0) < 0) - fail_err("close_range"); - ok("close_range(..., CLOSE_RANGE_CLOFORK)"); + ATF_REQUIRE_INTEQ(0, close_range(start + 1, start + 8, 0)); +} - return (0); +ATF_TP_ADD_TCS(tp) +{ + ATF_TP_ADD_TC(tp, closefrom_simple); + ATF_TP_ADD_TC(tp, closefrom_with_holes); + ATF_TP_ADD_TC(tp, closefrom_zero); + ATF_TP_ADD_TC(tp, closefrom_negative_one); + ATF_TP_ADD_TC(tp, closefrom_in_holes); + ATF_TP_ADD_TC(tp, closerange_basic); + ATF_TP_ADD_TC(tp, closefrom_zero_twice); + ATF_TP_ADD_TC(tp, closerange_CLOEXEC); + ATF_TP_ADD_TC(tp, closerange_CLOFORK); + + return (atf_no_error()); } diff --git a/tests/sys/fs/fusefs/bad_server.cc b/tests/sys/fs/fusefs/bad_server.cc index c3d195735446..825523cac2bb 100644 --- a/tests/sys/fs/fusefs/bad_server.cc +++ b/tests/sys/fs/fusefs/bad_server.cc @@ -64,12 +64,12 @@ TEST_F(BadServer, ShortWrite) out.header.error = 0; out.header.unique = 0; // Asynchronous notification out.expected_errno = EINVAL; - m_mock->write_response(out); /* - * Tell the event loop to quit. The kernel has already disconnected us + * Tell the event loop to quit. The kernel will disconnect us * because of the short write. */ - m_mock->m_quit = true; + m_mock->m_expect_unmount = true; + m_mock->write_response(out); } /* @@ -98,7 +98,7 @@ TEST_F(BadServer, ErrorWithPayload) out.push_back(std::move(out1)); // The kernel may disconnect us for bad behavior, so don't try - // to read any more. + // to read or write any more. 
m_mock->m_quit = true; })); diff --git a/tests/sys/fs/fusefs/mockfs.cc b/tests/sys/fs/fusefs/mockfs.cc index 55c191716629..b6a32d9b60af 100644 --- a/tests/sys/fs/fusefs/mockfs.cc +++ b/tests/sys/fs/fusefs/mockfs.cc @@ -433,7 +433,8 @@ MockFS::MockFS(int max_read, int max_readahead, bool allow_other, m_child_pid(-1), m_maxwrite(MIN(max_write, max_max_write)), m_nready(-1), - m_quit(false) + m_quit(false), + m_expect_unmount(false) { struct sigaction sa; struct iovec *iov = NULL; @@ -979,7 +980,7 @@ void MockFS::read_request(mockfs_buf_in &in, ssize_t &res) { } res = read(m_fuse_fd, &in, sizeof(in)); - if (res < 0 && !m_quit) { + if (res < 0 && errno != EBADF && !m_quit && !m_expect_unmount) { m_quit = true; FAIL() << "read: " << strerror(errno); } diff --git a/tests/sys/fs/fusefs/mockfs.hh b/tests/sys/fs/fusefs/mockfs.hh index 4b0628d34dd7..f98a5337c9d1 100644 --- a/tests/sys/fs/fusefs/mockfs.hh +++ b/tests/sys/fs/fusefs/mockfs.hh @@ -360,6 +360,9 @@ class MockFS { /* Tell the daemon to shut down ASAP */ bool m_quit; + /* Tell the daemon that the server might forcibly unmount us */ + bool m_expect_unmount; + /* Create a new mockfs and mount it to a tempdir */ MockFS(int max_read, int max_readahead, bool allow_other, bool default_permissions, bool push_symlinks_in, bool ro, diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index c902c265da9e..3df9f79c3b2c 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd August 21, 2024 +.Dd September 16, 2025 .Dt BHYVE 8 .Os .Sh NAME @@ -518,6 +518,8 @@ considered unconnected. .Op Cm \&,mtu= Ar N .Xc .It +.Cm ngd Ar N +.It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME @@ -542,6 +544,19 @@ The MAC address is an ASCII string in .Xr ethers 5 format. 
.Pp +A +.Cm ngd +device can be used to connect a guest to a +.Xr netgraph 4 +through a +.Xr ng_device 4 +node. +This can be used to run bhyve in a +.Xr VNET 9 +jail, and give it access to the host's netgraph, that cannot be reached +directly, by exposing the ng_device through +.Xr devfs 8 . +.Pp With .Cm virtio-net devices, the @@ -572,7 +587,9 @@ must comply with .Xr netgraph 4 addressing rules. .Pp -The slirp backend can be used to provide a NATed network to the guest. +The +.Cm slirp +backend can be used to provide a NATed network to the guest. This backend has poor performance but does not require any network configuration on the host system. It depends on the diff --git a/usr.sbin/bhyve/net_backends.c b/usr.sbin/bhyve/net_backends.c index 2d11c45f217a..95909d1f8ea2 100644 --- a/usr.sbin/bhyve/net_backends.c +++ b/usr.sbin/bhyve/net_backends.c @@ -119,7 +119,8 @@ tap_init(struct net_backend *be, const char *devname, goto error; } - if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { + if (strncmp("ngd", be->prefix, 3) && + ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { EPRINTLN("tap device link up failed"); goto error; } @@ -273,8 +274,24 @@ static struct net_backend vmnet_backend = { .set_cap = tap_set_cap, }; +/* A clone of the tap backend, with a different prefix. */ +static struct net_backend ngd_backend = { + .prefix = "ngd", + .priv_size = sizeof(struct tap_priv), + .init = tap_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + DATA_SET(net_backend_set, tap_backend); DATA_SET(net_backend_set, vmnet_backend); +DATA_SET(net_backend_set, ngd_backend); int netbe_legacy_config(nvlist_t *nvl, const char *opts) |
