40 files changed, 3076 insertions, 1016 deletions
diff --git a/Makefile.inc1 b/Makefile.inc1 index e079a23552f1..a86dead09aa1 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1964,6 +1964,7 @@ REPODIR?= ${OBJROOT}repo PKG_FORMAT?= tzst PKG_LEVEL?= -1 PKG_CLEVEL?= ${"${PKG_FORMAT:Mtar}" != "":?:-l ${PKG_LEVEL}} +PKG_CTHREADS?= 0 PKG_REPO_SIGNING_KEY?= # empty PKG_OUTPUT_DIR?= ${PKG_VERSION} PKG_ABI_FILE?= ${WSTAGEDIR}/usr/bin/uname @@ -2144,7 +2145,7 @@ create-source-src-package: _pkgbootstrap .PHONY ${SSTAGEDIR}/src.ucl ${PKG_CMD} -o ABI=${PKG_ABI} \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${SSTAGEDIR}/src.ucl \ -p ${SSTAGEDIR}/src.plist \ -r ${SRCDIR} \ @@ -2170,7 +2171,7 @@ create-source-src-sys-package: _pkgbootstrap .PHONY ${SSTAGEDIR}/src-sys.ucl ${PKG_CMD} -o ABI=${PKG_ABI} \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${SSTAGEDIR}/src-sys.ucl \ -p ${SSTAGEDIR}/src-sys.plist \ -r ${SRCDIR} \ @@ -2210,7 +2211,7 @@ create-world-package-${pkgname}: .PHONY ' ${WSTAGEDIR}/${pkgname}.ucl ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${WSTAGEDIR}/${pkgname}.ucl \ -p ${WSTAGEDIR}/${pkgname}.plist \ -r ${WSTAGEDIR} \ @@ -2229,7 +2230,7 @@ create-sets-packages: .PHONY @for manifest in ${WSTAGEDIR}/set-*.ucl; do \ echo "--> Processing manifest: $$manifest"; \ ${PKG_CMD} -o ABI=${PKG_ABI} -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M $$manifest \ -o "${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR}" \ || exit 1; \ @@ -2259,7 +2260,7 @@ create-dtb-package: .PHONY ${KSTAGEDIR}/${DISTDIR}/dtb.ucl ; \ ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${KSTAGEDIR}/${DISTDIR}/dtb.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/dtb.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ @@ -2296,7 +2297,7 @@ create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ @@ -2339,7 +2340,7 @@ create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kerne ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} ${PKG_CLEVEL} \ + create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \ -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ -p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \ -r ${KSTAGEDIR}/kernel.${_kernel} \ @@ -11,7 +11,7 @@ newline. Entries should be separated by a newline. Changes to this file should not be MFCed. 
5000d023a446, 03da141d59ae: - Add a "-f" option to "kadmin -l dump" with can be used to + Add a "-f" option to "kadmin -l dump" which can be used to dump the Heimdal KDC database in a format that can be loaded into the MIT KDC. See https://wiki.freebsd.org/Kerberos/Heimdal2MIT_KDC_Migration diff --git a/lib/libsys/socket.2 b/lib/libsys/socket.2 index b211611c6354..48b8f4e87489 100644 --- a/lib/libsys/socket.2 +++ b/lib/libsys/socket.2 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd May 17, 2025 +.Dd September 28, 2025 .Dt SOCKET 2 .Os .Sh NAME @@ -64,7 +64,7 @@ PF_NETGRAPH Netgraph sockets, PF_NETLINK Netlink protocols, PF_BLUETOOTH Bluetooth protocols, PF_INET_SDP OFED socket direct protocol (IPv4), -AF_HYPERV HyperV sockets +PF_HYPERV HyperV sockets .Ed .Pp Each protocol family is connected to an address family, which has the @@ -89,32 +89,6 @@ SOCK_RAW Raw-protocol interface, SOCK_SEQPACKET Sequenced packet stream .Ed .Pp -A -.Dv SOCK_STREAM -type provides sequenced, reliable, -two-way connection based byte streams. -An out-of-band data transmission mechanism may be supported. -A -.Dv SOCK_DGRAM -socket supports -datagrams (connectionless, unreliable messages of -a fixed (typically small) maximum length). -A -.Dv SOCK_SEQPACKET -socket may provide a sequenced, reliable, -two-way connection-based data transmission path for datagrams -of fixed maximum length; a consumer may be required to read -an entire packet with each read system call. -This facility may have protocol-specific properties. -.Dv SOCK_RAW -sockets provide access to internal network protocols and interfaces. -The -.Dv SOCK_RAW -type is available only to the super-user and is described in -.Xr ip 4 -and -.Xr ip6 4 . -.Pp Additionally, the following flags are allowed in the .Fa type argument: @@ -140,32 +114,23 @@ particular to the in which communication is to take place; see .Xr protocols 5 . -.Pp The .Fa protocol argument may be set to zero (0) to request the default implementation of a socket type for the protocol, if any. -.Pp -Sockets of type +.Sh STREAM SOCKET TYPE +The +.Dv SOCK_STREAM +socket type provides reliable, sequenced, full-duplex octet streams between +the socket and a peer to which the socket is connected. +A socket of type .Dv SOCK_STREAM -are full-duplex byte streams, similar -to pipes. -A stream socket must be in a +needs to be in a .Em connected -state before any data may be sent or received -on it. +state before any data can be sent or received. A connection to another socket is created with a .Xr connect 2 system call. -Once connected, data may be transferred using -.Xr read 2 -and -.Xr write 2 -calls or some variant of the -.Xr send 2 -and -.Xr recv 2 -functions. (Some protocol families, such as the Internet family, support the notion of an .Dq implied connect , @@ -173,62 +138,210 @@ which permits data to be sent piggybacked onto a connect operation by using the .Xr sendto 2 system call.) -When a session has been completed a -.Xr close 2 -may be performed. -Out-of-band data may also be transmitted as described in +Once connected, data may be sent using +.Xr send 2 , +.Xr sendto 2 , +.Xr sendmsg 2 +and +.Xr write 2 +system calls. +Data may be received using +.Xr recv 2 , +.Xr recvfrom 2 , +.Xr recvmsg 2 , +and +.Xr read 2 +system calls. +Record boundaries are not maintained; data sent on a stream socket using output +operations of one size can be received using input operations of smaller or +larger sizes without loss of data. 
+Data may be buffered; successful return from an output function does not imply +that the data has been delivered to the peer or even transmitted from the local +system. +For certain protocols out-of-band data may also be transmitted as described in .Xr send 2 and received as described in .Xr recv 2 . .Pp -The communications protocols used to implement a -.Dv SOCK_STREAM -ensure that data -is not lost or duplicated. -If a piece of data for which the -peer protocol has buffer space cannot be successfully transmitted -within a reasonable length of time, then -the connection is considered broken and calls -will indicate an error with --1 returns and with -.Er ETIMEDOUT -as the specific code -in the global variable -.Va errno . -The protocols optionally keep sockets -.Dq warm -by forcing transmissions -roughly every minute in the absence of other activity. -An error is then indicated if no response can be -elicited on an otherwise -idle connection for an extended period (e.g.\& 5 minutes). -By default, a +If data cannot be successfully transmitted within a given time, then the +connection is considered broken, and subsequent operations shall fail with +a protocol-specific error code. +A .Dv SIGPIPE -signal is raised if a process sends -on a broken stream, but this behavior may be inhibited via +signal is raised if a thread attempts to send data on a broken stream (one that +is no longer connected). +The signal can be suppressed by the +.Dv MSG_NOSIGNAL +flag passed to individual +.Xr send 2 , +.Xr sendto 2 , +and +.Xr sendmsg 2 +system calls, or by the +.Dv SO_NOSIGPIPE +socket option set on the socket with .Xr setsockopt 2 . .Pp -.Dv SOCK_SEQPACKET -sockets employ the same system calls -as +The .Dv SOCK_STREAM -sockets. -The only difference -is that -.Xr read 2 -calls will return only the amount of data requested, -and any remaining in the arriving packet will be discarded. +socket is supported by the following protocol families: +.Dv PF_INET , +.Dv PF_INET6 , +.Dv PF_UNIX , +.Dv PF_BLUETOOTH , +.Dv PF_HYPERV , +and +.Dv PF_INET_SDP . +An out-of-band data transmission mechanism is supported for stream sockets of +the +.Dv PF_INET +and +.Dv PF_INET6 +protocol families. +.Sh DATAGRAM SOCKET TYPE +The +.Dv SOCK_DGRAM +socket type supports connectionless data transfer which is not necessarily +acknowledged or reliable. +Datagrams can be sent to the address specified (possibly multicast or +broadcast) in each output operation, and incoming datagrams can be received +from multiple sources. +The source address of each datagram is available when receiving the datagram +with +.Xr recvfrom 2 +or +.Xr recvmsg 2 . +An application can also pre-specify a peer address with +.Xr connect 2 , +in which case calls to output functions that do not specify a peer address +shall send to the pre-specified peer. +If a peer has been specified, only datagrams from that peer shall be received. +A datagram shall be sent in a single output operation, and must be received +in a single input operation. +The maximum size of a datagram is protocol-specific. +Output datagrams may be buffered within the system; thus, a successful return +from an output function does not guarantee that a datagram is actually sent or +received. .Pp +The .Dv SOCK_DGRAM +socket is supported by the following protocol families: +.Dv PF_INET , +.Dv PF_INET6 , +.Dv PF_UNIX , +.Dv PF_NETGRAPH , and -.Dv SOCK_RAW -sockets allow sending of datagrams to correspondents -named in +.Dv PF_NETLINK .
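The datagram semantics described above (per-operation destinations versus a peer pre-specified with connect(2)) can be illustrated with a short C sketch; it is not taken from the manual page, the loopback destination and port 9999 are arbitrary illustrative choices, and error handling is reduced to err(3):

/*
 * Minimal SOCK_DGRAM sketch: one datagram is sent with an explicit
 * destination (sendto), then a peer is pre-specified with connect(2)
 * so that plain send(2) may be used.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in dst;
	int s;

	if ((s = socket(PF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_len = sizeof(dst);
	dst.sin_port = htons(9999);		/* arbitrary example port */
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	/* Destination named in the output operation itself. */
	if (sendto(s, "ping", 4, 0, (struct sockaddr *)&dst,
	    sizeof(dst)) == -1)
		err(1, "sendto");

	/* Pre-specify the peer; subsequent send(2) calls omit the address. */
	if (connect(s, (struct sockaddr *)&dst, sizeof(dst)) == -1)
		err(1, "connect");
	if (send(s, "ping", 4, 0) == -1)
		err(1, "send");

	close(s);
	return (0);
}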
+.Sh SEQUENCED PACKET SOCKET TYPE +The +.Dv SOCK_SEQPACKET +socket type is similar to the +.Dv SOCK_STREAM +type, and is also connection-oriented. +The only difference between these types is that record boundaries are +maintained using the +.Dv SOCK_SEQPACKET +type. +A record can be sent using one or more output operations and received using one +or more input operations, but a single operation never transfers parts of more +than one record. +Record boundaries are set by the sender with the +.Dv MSG_EOR +flag of the .Xr send 2 -calls. -Datagrams are generally received with +or +.Xr sendmsg 2 +functions. +It is not possible to set a record boundary with +.Xr write 2 . +Record boundaries are visible to the receiver via the +.Dv MSG_EOR +flag in the received message flags returned by the +.Xr recvmsg 2 +function. +It is protocol-specific whether a maximum record size is imposed. +.Pp +The +.Dv SOCK_SEQPACKET +socket is supported by the following protocol families: +.Dv PF_INET , +.Dv PF_INET6 , +and +.Dv PF_UNIX . +.Sh RAW SOCKET TYPE +The +.Dv SOCK_RAW +socket type provides access to internal network protocols and interfaces. +It is a datagram socket by nature, and thus has the same semantics for +read and write operations. +The +.Dv SOCK_RAW +type is available only to the super-user and is described in +.Xr ip 4 +and +.Xr ip6 4 . +.Sh NON-BLOCKING MODE +A socket can be created in +.Em non-blocking mode +with the +.Dv SOCK_NONBLOCK +flag. +Alternatively, non-blocking mode on a socket can be turned on and off with +the +.Dv O_NONBLOCK +flag of the +.Xr fcntl 2 +system call. +.Pp +When a non-blocking socket does not have enough data in its receive buffer to +fill the buffer supplied by the application, data receiving system calls like +.Xr recv 2 , .Xr recvfrom 2 , -which returns the next datagram with its return address. +.Xr recvmsg 2 +and +.Xr read 2 +will not block waiting for the data but return immediately. +The return value will indicate the number of bytes read into the supplied +buffer. +If no data is available, +.Va errno +will be set to +.Dv EAGAIN +.Po +which has the same value as +.Dv EWOULDBLOCK +.Pc . +.Pp +If an application tries to send more data on a non-blocking socket than the +socket send buffer can accommodate with +.Xr send 2 , +.Xr sendto 2 , +.Xr sendmsg 2 +or +.Xr write 2 +system calls, partial data will be sent. +The return value will indicate the number of bytes sent. +If no data can be sent at all, +.Va errno +will be set to +.Dv EAGAIN . +Note that sockets of the +.Dv SOCK_DGRAM +type are unreliable; for these sockets sending operations will never fail +with +.Dv EAGAIN +in non-blocking mode, nor will they block in blocking mode. +.Sh OTHER OPERATIONS ON SOCKETS +Since socket descriptors are file descriptors, many generic file operations +performed by +.Xr fcntl 2 +apply. +Socket descriptors can be used with all event engines, such as +.Xr kevent 2 , +.Xr select 2 , +and +.Xr poll 2 . .Pp An .Xr fcntl 2 @@ -250,6 +363,12 @@ The and .Xr getsockopt 2 system calls are used to set and get options, respectively. +.Pp +The connection associated with a socket can be terminated by the +.Xr close 2 +system call. +One direction of communication can be disabled with +.Xr shutdown 2 . .Sh RETURN VALUES A -1 is returned if an error occurs, otherwise the return value is a descriptor referencing the socket. @@ -282,16 +401,23 @@ The socket type is not supported by the protocol.
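To make the non-blocking behaviour described above concrete, here is a minimal C sketch (not part of the manual page) of a receive loop on a socket created with SOCK_NONBLOCK; it treats EAGAIN as "no data yet" and waits for readability with poll(2), which is one common pattern rather than the only one, and the bound port 9999 is an arbitrary example value:

/*
 * Minimal non-blocking receive sketch: the socket is created with
 * SOCK_NONBLOCK, so recv(2) returns -1 with errno set to EAGAIN
 * (equal to EWOULDBLOCK) instead of blocking when no data is queued.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	char buf[1024];
	ssize_t n;
	int s;

	if ((s = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0)) == -1)
		err(1, "socket");

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);
	sin.sin_port = htons(9999);		/* arbitrary example port */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "bind");

	for (;;) {
		n = recv(s, buf, sizeof(buf), 0);
		if (n >= 0) {
			printf("received %zd bytes\n", n);
			continue;
		}
		if (errno != EAGAIN)		/* same value as EWOULDBLOCK */
			err(1, "recv");
		/* No data queued; wait for readability instead of spinning. */
		pfd.fd = s;
		pfd.events = POLLIN;
		if (poll(&pfd, 1, -1) == -1)
			err(1, "poll");
	}
}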
.Sh SEE ALSO .Xr accept 2 , .Xr bind 2 , +.Xr close 2 , .Xr connect 2 , +.Xr fcntl 2 , .Xr getpeername 2 , .Xr getsockname 2 , .Xr getsockopt 2 , .Xr ioctl 2 , +.Xr kevent 2 , .Xr listen 2 , +.Xr poll 2 , .Xr read 2 , .Xr recv 2 , .Xr select 2 , .Xr send 2 , +.Xr sendmsg 2 , +.Xr sendto 2 , +.Xr signal 3 , .Xr shutdown 2 , .Xr socketpair 2 , .Xr write 2 , diff --git a/sbin/geom/Makefile b/sbin/geom/Makefile index 078503d3ae67..61561ef1ff1b 100644 --- a/sbin/geom/Makefile +++ b/sbin/geom/Makefile @@ -9,7 +9,7 @@ MAN= geom.8 CFLAGS+= -I${.CURDIR} -I${.CURDIR}/core CFLAGS+= -DGEOM_CLASS_DIR=\"${GEOM_CLASS_DIR}\" -LIBADD= geom util +LIBADD= geom util xo .if defined(RESCUE) .PATH: ${SRCTOP}/lib/geom/part \ diff --git a/sbin/geom/core/geom.c b/sbin/geom/core/geom.c index b78021194ddd..496123f08274 100644 --- a/sbin/geom/core/geom.c +++ b/sbin/geom/core/geom.c @@ -49,9 +49,12 @@ #include <assert.h> #include <libgeom.h> #include <geom.h> +#include <libxo/xo.h> #include "misc/subr.h" +#define GEOM_XO_VERSION "1" + #ifdef STATIC_GEOM_CLASSES extern uint32_t gpart_version; extern struct g_command gpart_class_commands[]; @@ -513,6 +516,7 @@ run_command(int argc, char *argv[]) gctl_free(req); if (verbose) printf("Done.\n"); + xo_finish(); exit(EXIT_SUCCESS); } @@ -810,6 +814,10 @@ main(int argc, char *argv[]) provider_name = NULL; tflag = false; + argc = xo_parse_args(argc, argv); + if (argc < 0) + return (argc); + if (strcmp(getprogname(), "geom") == 0) { while ((ch = getopt(argc, argv, "hp:t")) != -1) { switch (ch) { @@ -831,6 +839,7 @@ main(int argc, char *argv[]) * Don't adjust argc and argv, it would break get_class(). */ } + xo_set_version(GEOM_XO_VERSION); if (tflag && provider_name != NULL) { errx(EXIT_FAILURE, @@ -839,6 +848,7 @@ main(int argc, char *argv[]) if (provider_name != NULL) { list_one_geom_by_provider(provider_name); + xo_finish(); return (0); } @@ -882,29 +892,33 @@ find_geom(struct gclass *classp, const char *name) } static void -list_one_provider(struct gprovider *pp, const char *prefix) +list_one_provider(struct gprovider *pp, const char *padding) { struct gconfig *conf; char buf[5]; - printf("Name: %s\n", pp->lg_name); + xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name); humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); - printf("%sMediasize: %jd (%s)\n", prefix, (intmax_t)pp->lg_mediasize, - buf); - printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize); + xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n", + padding, (intmax_t)pp->lg_mediasize, buf); + xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u} \n", + padding, pp->lg_sectorsize); if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) { - printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize); - printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset); + xo_emit("{P:/%s}{Lcw:Stripesize}{Stripesize/%ju}\n", + padding, pp->lg_stripesize); + xo_emit("{P:/%s}{Lcw:Stripeoffset}{Stripeoffset/%ju}\n", + padding, pp->lg_stripeoffset); } - printf("%sMode: %s\n", prefix, pp->lg_mode); + xo_emit("{P:/%s}{Lcw:Mode}{Mode}\n", padding, pp->lg_mode); LIST_FOREACH(conf, &pp->lg_config, lg_config) { - printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val); + xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name, + conf->lg_name, conf->lg_val); } } static void -list_one_consumer(struct gconsumer *cp, const char *prefix) +list_one_consumer(struct gconsumer *cp, const char *padding) { struct gprovider *pp; struct gconfig *conf; @@ -915,20 +929,24 @@ 
list_one_consumer(struct gconsumer *cp, const char *prefix) else { char buf[5]; - printf("Name: %s\n", pp->lg_name); + xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name); humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); - printf("%sMediasize: %jd (%s)\n", prefix, - (intmax_t)pp->lg_mediasize, buf); - printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize); + xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n", + padding, (intmax_t)pp->lg_mediasize, buf); + xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u}\n", + padding, pp->lg_sectorsize); if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) { - printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize); - printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset); + xo_emit("{P:/%s}{Lcw:Stripesize}{:Stripesize/%ju}\n", + padding, pp->lg_stripesize); + xo_emit("{P:/%s}{Lcw:Stripeoffset}{:Stripesize/%ju}\n", + padding, pp->lg_stripeoffset); } - printf("%sMode: %s\n", prefix, cp->lg_mode); + xo_emit("{P:/%s}{Lcw:Mode}{:Mode}\n", padding, pp->lg_mode); } LIST_FOREACH(conf, &cp->lg_config, lg_config) { - printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val); + xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name, + conf->lg_name, conf->lg_val); } } @@ -940,27 +958,36 @@ list_one_geom(struct ggeom *gp) struct gconfig *conf; unsigned n; - printf("Geom name: %s\n", gp->lg_name); + xo_emit("{Lcw:Geom name}{:Name}\n", gp->lg_name); LIST_FOREACH(conf, &gp->lg_config, lg_config) { - printf("%s: %s\n", conf->lg_name, conf->lg_val); + xo_emit("{Lcwa:}{a:}\n", conf->lg_name, conf->lg_name, + conf->lg_val); } if (!LIST_EMPTY(&gp->lg_provider)) { - printf("Providers:\n"); + xo_open_list("Providers"); + xo_emit("{Tc:Providers}\n"); n = 1; LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { - printf("%u. ", n++); + xo_emit("{T:/%u} ", n++); + xo_open_instance("provider"); list_one_provider(pp, " "); + xo_close_instance("provider"); } + xo_close_list("Providers"); } if (!LIST_EMPTY(&gp->lg_consumer)) { - printf("Consumers:\n"); + xo_open_list("Consumers"); + xo_emit("{Tc:Consumers}\n"); n = 1; LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) { - printf("%u. 
", n++); + xo_emit("{T:/%u} ", n++); + xo_open_instance("consumer"); list_one_consumer(cp, " "); + xo_close_instance("consumer"); } + xo_close_list("Consumers"); } - printf("\n"); + xo_emit("\n"); } static void @@ -978,8 +1005,10 @@ list_one_geom_by_provider(const char *provider_name) if (gp == NULL) errx(EXIT_FAILURE, "Cannot find provider '%s'.", provider_name); - printf("Geom class: %s\n", gp->lg_class->lg_name); + xo_open_container("Geom"); + xo_emit("{Lwc:Geom class}{:Class}\n", gp->lg_class->lg_name); list_one_geom(gp); + xo_close_container("Geom"); } static void @@ -1038,14 +1067,20 @@ std_list(struct gctl_req *req, unsigned flags __unused) "an instance named '%s'.", gclass_name, name); } + xo_open_container("Geom"); list_one_geom(gp); + xo_close_container("Geom"); } } else { + xo_open_list("Geoms"); LIST_FOREACH(gp, &classp->lg_geom, lg_geom) { if (LIST_EMPTY(&gp->lg_provider) && !all) continue; + xo_open_instance("geom"); list_one_geom(gp); + xo_close_instance("geom"); } + xo_close_list("Geoms"); } geom_deletetree(&mesh); } @@ -1115,34 +1150,24 @@ status_update_len_prs(struct ggeom *gp, int *name_len, int *status_len) } static char * -status_one_consumer(struct gconsumer *cp) +status_one_consumer(struct gconsumer *cp, const char *value) { - static char buf[256]; struct gprovider *pp; struct gconfig *conf; - const char *state, *syncr; + char *ret; pp = cp->lg_provider; if (pp == NULL) return (NULL); - state = NULL; - syncr = NULL; + ret = NULL; LIST_FOREACH(conf, &cp->lg_config, lg_config) { - if (strcasecmp(conf->lg_name, "state") == 0) - state = conf->lg_val; - if (strcasecmp(conf->lg_name, "synchronized") == 0) - syncr = conf->lg_val; - } - if (state == NULL && syncr == NULL) - snprintf(buf, sizeof(buf), "%s", pp->lg_name); - else if (state != NULL && syncr != NULL) { - snprintf(buf, sizeof(buf), "%s (%s, %s)", pp->lg_name, - state, syncr); - } else { - snprintf(buf, sizeof(buf), "%s (%s)", pp->lg_name, - state ? 
state : syncr); + if (strcasecmp(conf->lg_name, value) == 0) + ret = conf->lg_val; } - return (buf); + + if (ret == NULL) + return (NULL); + return (ret); } static void @@ -1150,8 +1175,8 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len) { struct gconsumer *cp; struct gconfig *conf; - const char *name, *status, *component; - int gotone; + const char *name, *status, *cstate, *csyncr; + int gotone, len; name = gp->lg_name; status = "N/A"; @@ -1161,21 +1186,49 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len) break; } } - gotone = 0; + gotone = len = 0; + xo_open_instance("status"); LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) { - component = status_one_consumer(cp); - if (component == NULL) + cstate = status_one_consumer(cp, "state"); + csyncr = status_one_consumer(cp, "synchronized"); + if (cstate == NULL && csyncr == NULL) continue; + if (!gotone || script) { + if (!gotone) { + xo_emit("{:name/%*s} {:status/%*s} ", + name_len, name, status_len, status); + } else { + xo_emit("{d:name/%*s} {d:status/%*s} ", + name_len, name, status_len, status); + } + xo_open_list("components"); + } + + xo_open_instance("components"); + if (cstate != NULL && csyncr != NULL) { + xo_emit("{P:/%*s}{:compontent} ({:state}, {:synchronized})\n", + len, "", cp->lg_provider->lg_name, cstate, csyncr); + } else if (cstate != NULL) { + xo_emit("{P:/%*s}{:compontent} ({:state})\n", + len, "", cp->lg_provider->lg_name, cstate); + } else { + xo_emit("{P:/%*s}{:compontent} ({:synchronized})\n", + len, "", cp->lg_provider->lg_name, csyncr); + } + xo_close_instance("components"); gotone = 1; - printf("%*s %*s %s\n", name_len, name, status_len, status, - component); - if (!script) - name = status = ""; + if (!len && !script) + len = name_len + status_len + 4; } if (!gotone) { - printf("%*s %*s %s\n", name_len, name, status_len, status, - "N/A"); + xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status); + xo_open_list("components"); + xo_open_instance("components"); + xo_emit("{P:/%*s}{d:compontent}\n", len, "", "N/A"); + xo_close_instance("components"); } + xo_close_list("components"); + xo_close_instance("status"); } static void @@ -1184,9 +1237,10 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len) struct gprovider *pp; struct gconsumer *cp; struct gconfig *conf; - const char *name, *status, *component; - int gotone; + const char *name, *status, *cstate, *csyncr; + int gotone, len; + xo_open_instance("status"); LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { name = pp->lg_name; status = "N/A"; @@ -1202,22 +1256,50 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len) break; } } - gotone = 0; + gotone = len = 0; LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) { - component = status_one_consumer(cp); - if (component == NULL) + cstate = status_one_consumer(cp, "state"); + csyncr = status_one_consumer(cp, "synchronized"); + if (cstate == NULL && csyncr == NULL) continue; + + if (!gotone || script) { + if (!gotone) { + xo_emit("{:name/%*s} {:status/%*s} ", + name_len, name, status_len, status); + } else { + xo_emit("{d:name/%*s} {d:status/%*s} ", + name_len, name, status_len, status); + } + xo_open_list("components"); + } + + xo_open_instance("component"); + if (cstate != NULL && csyncr != NULL) { + xo_emit("{P:/%*s}{:compontent} ({:state}, {:synchronized})\n", + len, "", cp->lg_provider->lg_name, cstate, csyncr); + } else if (cstate != NULL) { + xo_emit("{P:/%*s}{:compontent} 
({:state})\n", + len, "", cp->lg_provider->lg_name, cstate); + } else { + xo_emit("{P:/%*s}{:compontent} ({:synchronized})\n", + len, "", cp->lg_provider->lg_name, csyncr); + } + xo_close_instance("component"); gotone = 1; - printf("%*s %*s %s\n", name_len, name, - status_len, status, component); - if (!script) - name = status = ""; + if (!len && !script) + len = name_len + status_len + 4; } if (!gotone) { - printf("%*s %*s %s\n", name_len, name, - status_len, status, "N/A"); + xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status); + xo_open_list("components"); + xo_open_instance("components"); + xo_emit("{P:/%*s}{d:compontent}\n", len, "", "N/A"); + xo_close_instance("components"); } + xo_close_list("components"); } + xo_close_instance("status"); } static void @@ -1240,13 +1322,9 @@ std_status(struct gctl_req *req, unsigned flags __unused) all = gctl_get_int(req, "all"); geoms = gctl_get_int(req, "geoms"); script = gctl_get_int(req, "script"); - if (script) { - name_len = 0; - status_len = 0; - } else { - name_len = strlen("Name"); - status_len = strlen("Status"); - } + name_len = strlen("Name"); + status_len = strlen("Status"); + if (nargs > 0) { for (i = 0, n = 0; i < nargs; i++) { name = gctl_get_ascii(req, "arg%d", i); @@ -1282,9 +1360,10 @@ std_status(struct gctl_req *req, unsigned flags __unused) goto end; } if (!script) { - printf("%*s %*s %s\n", name_len, "Name", status_len, "Status", - "Components"); + xo_emit("{T:/%*s} {T:/%*s} {T:Components}\n", + name_len, "Name", status_len, "Status"); } + xo_open_list("status"); if (nargs > 0) { for (i = 0; i < nargs; i++) { name = gctl_get_ascii(req, "arg%d", i); @@ -1312,6 +1391,7 @@ std_status(struct gctl_req *req, unsigned flags __unused) } } } + xo_close_list("status"); end: geom_deletetree(&mesh); } diff --git a/sbin/ipfw/tables.c b/sbin/ipfw/tables.c index 7c3b1bb35a01..245c0c9e0399 100644 --- a/sbin/ipfw/tables.c +++ b/sbin/ipfw/tables.c @@ -1037,9 +1037,6 @@ table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add, } } - /* Get real OS error */ - error = errno; - /* Report results back */ ptent = tent_buf; for (i = 0; i < count; ptent++, i++) { diff --git a/sbin/ping/tests/Makefile b/sbin/ping/tests/Makefile index 0520b1d634cf..7d3ab02b9a86 100644 --- a/sbin/ping/tests/Makefile +++ b/sbin/ping/tests/Makefile @@ -1,5 +1,6 @@ ATF_TESTS_C+= in_cksum_test -SRCS.in_cksum_test= in_cksum_test.c ../utils.c +.PATH: ${.CURDIR:H} +SRCS.in_cksum_test= in_cksum_test.c utils.c PACKAGE= tests diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 6cc2d58bbbcc..933f1ac0051f 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -179,7 +179,7 @@ freebsd_brand_info_la57_img_compat(const struct image_params *imgp, return (!prefer_uva_la48); } -static Elf64_Brandinfo freebsd_brand_info_la48 = { +static const Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -190,7 +190,7 @@ static Elf64_Brandinfo freebsd_brand_info_la48 = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, }; -static Elf64_Brandinfo freebsd_brand_info_la57 = { +static const Elf64_Brandinfo freebsd_brand_info_la57 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -216,7 +216,7 @@ sysinit_register_elf64_brand_entries(void *arg __unused) SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, sysinit_register_elf64_brand_entries, NULL); -static Elf64_Brandinfo freebsd_brand_oinfo = { +static const Elf64_Brandinfo 
freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -226,11 +226,10 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); -static Elf64_Brandinfo kfreebsd_brand_info = { +static const Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -240,8 +239,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index c8579c5da4ad..890cf01c46a0 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -857,7 +857,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -866,7 +866,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -877,7 +877,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_glibc2brandshort = { +static const Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -888,7 +888,7 @@ static Elf64_Brandinfo linux_glibc2brandshort = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_muslbrand = { +static const Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -900,7 +900,7 @@ static Elf64_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf64_Brandinfo *linux_brandlist[] = { +static const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, @@ -910,7 +910,7 @@ static Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; + const Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 8fac626f9053..735ebb151017 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -954,7 +954,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux32_brandnote = { +static const Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -963,7 +963,7 @@ static Elf_Brandnote linux32_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf32_Brandinfo linux_brand = { +static const Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -974,7 +974,7 @@ static Elf32_Brandinfo linux_brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo 
linux_glibc2brand = { +static const Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -985,7 +985,7 @@ static Elf32_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_muslbrand = { +static const Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -997,7 +997,7 @@ static Elf32_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf32_Brandinfo *linux_brandlist[] = { +static const Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, @@ -1007,7 +1007,7 @@ static Elf32_Brandinfo *linux_brandlist[] = { static int linux_elf_modevent(module_t mod, int type, void *data) { - Elf32_Brandinfo **brandinfo; + const Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c index 13af5c5065d6..207b37180a26 100644 --- a/sys/arm64/arm64/elf_machdep.c +++ b/sys/arm64/arm64/elf_machdep.c @@ -121,7 +121,7 @@ static struct sysentvec elf64_freebsd_sysvec = { }; INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); -static Elf64_Brandinfo freebsd_brand_info = { +static const Elf64_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_AARCH64, .compat_3_brand = "FreeBSD", @@ -131,8 +131,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); static bool @@ -336,7 +335,7 @@ elf_cpu_parse_dynamic(caddr_t loadbase __unused, Elf_Dyn *dynamic __unused) return (0); } -static Elf_Note gnu_property_note = { +static const Elf_Note gnu_property_note = { .n_namesz = sizeof(GNU_ABI_VENDOR), .n_descsz = 16, .n_type = NT_GNU_PROPERTY_TYPE_0, diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index 084b7a11b01f..ac05820f89bc 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -584,7 +584,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -593,7 +593,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", @@ -604,7 +604,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -Elf64_Brandinfo *linux_brandlist[] = { +const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; @@ -612,8 +612,8 @@ Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; - struct linux_ioctl_handler**lihp; + const Elf64_Brandinfo **brandinfo; + struct linux_ioctl_handler **lihp; int error; error = 0; diff --git a/sys/compat/ia32/ia32_sysvec.c b/sys/compat/ia32/ia32_sysvec.c index 0ea7d072e911..b9dada4eee7b 100644 --- a/sys/compat/ia32/ia32_sysvec.c +++ b/sys/compat/ia32/ia32_sysvec.c @@ -145,7 +145,7 @@ struct sysentvec ia32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf_ia32_sysvec, &ia32_freebsd_sysvec); -static Elf32_Brandinfo 
ia32_brand_info = { +static const Elf32_Brandinfo ia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -155,12 +155,10 @@ static Elf32_Brandinfo ia32_brand_info = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_info); -SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_info); - -static Elf32_Brandinfo ia32_brand_oinfo = { +static const Elf32_Brandinfo ia32_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -170,12 +168,10 @@ static Elf32_Brandinfo ia32_brand_oinfo = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_oinfo); -SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_oinfo); - -static Elf32_Brandinfo kia32_brand_info = { +static const Elf32_Brandinfo kia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -184,10 +180,8 @@ static Elf32_Brandinfo kia32_brand_info = { .brand_note = &elf32_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &kia32_brand_info); +C_SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &kia32_brand_info); void elf32_dump_thread(struct thread *td, void *dst, size_t *off) diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 37d0142bae8b..0586eb55a8f3 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -251,7 +251,7 @@ linux_futex(struct thread *td, struct linux_futex_args *args) * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. */ p = td->td_proc; - Elf_Brandinfo *bi = p->p_elf_brandinfo; + const Elf_Brandinfo *bi = p->p_elf_brandinfo; if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) return (EINVAL); args->val3_compare = false; diff --git a/sys/conf/NOTES b/sys/conf/NOTES index ea9b2667607e..a25ee8f6e1af 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support. 
options TCP_RFC7413 # TCP Fast Open options TCPHPTS +#options TCP_HPTS_KTEST # Add KTEST support for HPTS # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration diff --git a/sys/conf/options b/sys/conf/options index b48ad1cf42cf..0b795a8d28fb 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS +TCP_HPTS_KTEST opt_inet.h TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_global.h TCP_BBR opt_inet.h diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 5c28db29fc63..683ee2f7ad56 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -284,7 +284,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) struct mount *mp = vnode_mount(vp); int err; - if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH)) + if (fsess_not_impl(mp, FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); @@ -292,7 +292,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) return err; if (fufh->fuse_open_flags & FOPEN_NOFLUSH && - (!fsess_opt_writeback(vnode_mount(vp)))) + (!fsess_opt_writeback(mp))) return (0); fdisp_init(&fdi, sizeof(*ffi)); diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index a14f9ca74305..b6d6db60ca3d 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -587,6 +587,7 @@ unionfs_find_node_status(struct unionfs_node *unp, struct thread *td) struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); @@ -612,6 +613,7 @@ unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; KASSERT(NULL != unspp, ("%s: NULL status", __func__)); diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index 627b2f6e9a1d..26fa14603c85 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -814,7 +814,7 @@ unionfs_close(struct vop_close_args *ap) unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; - unsp = unionfs_find_node_status(unp, td); + unsp = (td != NULL) ? 
unionfs_find_node_status(unp, td) : NULL; if (unsp == NULL || (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) { diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 1bc2491a1a12..c53707a1286c 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -92,7 +92,7 @@ #define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE) static int __elfN(check_header)(const Elf_Ehdr *hdr); -static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, +static const Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry); @@ -104,7 +104,7 @@ static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static bool __elfN(check_note)(struct image_params *imgp, - Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, + const Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); @@ -227,7 +227,7 @@ SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, "Allow pages to be mapped simultaneously writable and executable"); -static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; +static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) @@ -286,7 +286,7 @@ kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) } int -__elfN(insert_brand_entry)(Elf_Brandinfo *entry) +__elfN(insert_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -305,7 +305,7 @@ __elfN(insert_brand_entry)(Elf_Brandinfo *entry) } int -__elfN(remove_brand_entry)(Elf_Brandinfo *entry) +__elfN(remove_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -321,7 +321,7 @@ __elfN(remove_brand_entry)(Elf_Brandinfo *entry) } bool -__elfN(brand_inuse)(Elf_Brandinfo *entry) +__elfN(brand_inuse)(const Elf_Brandinfo *entry) { struct proc *p; bool rval = false; @@ -338,12 +338,12 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry) return (rval); } -static Elf_Brandinfo * +static const Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; - Elf_Brandinfo *bi, *bi_m; + const Elf_Brandinfo *bi, *bi_m; bool ret, has_fctl0; int i, interp_name_len; @@ -492,7 +492,7 @@ __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr) static int __elfN(check_header)(const Elf_Ehdr *hdr) { - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || @@ -1109,7 +1109,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) struct vmspace *vmspace; vm_map_t map; char *interp; - Elf_Brandinfo *brand_info; + const Elf_Brandinfo *brand_info; struct sysentvec *sv; u_long addr, baddr, entry, proghdr; u_long maxalign, maxsalign, mapsz, maxv, maxv1, anon_loc; @@ -1925,7 +1925,7 @@ __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, Elf_Phdr *phdr; Elf_Shdr *shdr; struct phdr_closure phc; - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; ehdr = (Elf_Ehdr *)hdr; bi = td->td_proc->p_elf_brandinfo; @@ -2861,7 +2861,7 @@ ret: } struct brandnote_cb_arg { - Elf_Brandnote *brandnote; + const Elf_Brandnote *brandnote; int32_t *osrel; }; @@ -2883,7 +2883,7 @@ brandnote_cb(const Elf_Note *note, void *arg0, bool *res) return (true); } -static Elf_Note 
fctl_note = { +static const Elf_Note fctl_note = { .n_namesz = sizeof(FREEBSD_ABI_VENDOR), .n_descsz = sizeof(uint32_t), .n_type = NT_FREEBSD_FEATURE_CTL, @@ -2918,7 +2918,7 @@ note_fctl_cb(const Elf_Note *note, void *arg0, bool *res) * as for headers. */ static bool -__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, +__elfN(check_note)(struct image_params *imgp, const Elf_Brandnote *brandnote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0) { const Elf_Phdr *phdr; diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 0fc2d0e7f1bc..2bdd6faa025a 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -418,7 +418,7 @@ do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, #endif int error, i, orig_osrel; uint32_t orig_fctl0; - Elf_Brandinfo *orig_brandinfo; + const Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; @@ -1314,7 +1314,7 @@ exec_map_stack(struct image_params *imgp) MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; - vm_map_fixed(map, obj, 0, + error = vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 63bbe4bba11b..c54459bb5f01 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -39,15 +39,14 @@ * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules - * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The - * slot is the time from now that the stack wants to be called but it - * must be converted to tcp_hpts's notion of slot. This is done with - * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical + * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The + * usecs is the time from now that the stack wants to be called and is + * passing time directly in microseconds. So a typical * call from the tcp_output() routine might look like: * - * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); + * tcp_hpts_insert(tp, 550, NULL); * - * The above would schedule tcp_output() to be called in 550 useconds. + * The above would schedule tcp_output() to be called in 550 microseconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: @@ -149,27 +148,44 @@ #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #include <netinet/tcp_log_buf.h> #ifdef tcp_offload #include <netinet/tcp_offload.h> #endif -/* - * The hpts uses a 102400 wheel. The wheel - * defines the time in 10 usec increments (102400 x 10). - * This gives a range of 10usec - 1024ms to place - * an entry within. If the user requests more than - * 1.024 second, a remaineder is attached and the hpts - * when seeing the remainder will re-insert the - * inpcb forward in time from where it is until - * the remainder is zero. - */ +/* Global instance for TCP HPTS */ +struct tcp_hptsi *tcp_hptsi_pace; + +/* Default function table for production use. 
*/ +const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = { + .microuptime = microuptime, + .swi_add = swi_add, + .swi_remove = swi_remove, + .swi_sched = swi_sched, + .intr_event_bind = intr_event_bind, + .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset, + .callout_init = callout_init, + .callout_reset_sbt_on = callout_reset_sbt_on, + ._callout_stop_safe = _callout_stop_safe, +}; -#define NUM_OF_HPTSI_SLOTS 102400 +#ifdef TCP_HPTS_KTEST +#define microuptime pace->funcs->microuptime +#define swi_add pace->funcs->swi_add +#define swi_remove pace->funcs->swi_remove +#define swi_sched pace->funcs->swi_sched +#define intr_event_bind pace->funcs->intr_event_bind +#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset +#define callout_init pace->funcs->callout_init +#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on +#define _callout_stop_safe pace->funcs->_callout_stop_safe +#endif -/* The number of connections after which the dynamic sleep logic kicks in. */ -#define DEFAULT_CONNECTION_THRESHOLD 100 +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); + +static void tcp_hpts_thread(void *ctx); /* * When using the hpts, a TCP stack must make sure @@ -204,87 +220,22 @@ * * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh * then we do a dynamic adjustment on the time we sleep. - * Our threshold is if the lateness of the first client served (in ticks) is + * Our threshold is if the lateness of the first client served (in slots) is * greater than or equal too slots_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * slots_indicate_more_sleep (100 ticks or 1000usecs). + * or 10000 slots). If we were that late, the actual sleep time + * is adjusted down by 50%. If the slots_ran is less than + * slots_indicate_more_sleep (100 slots or 1000usecs). * */ -/* Each hpts has its own p_mtx which is used for locking */ -#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) -#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) -#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx) -#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) -struct tcp_hpts_entry { - /* Cache line 0x00 */ - struct mtx p_mtx; /* Mutex for hpts */ - struct timeval p_mysleep; /* Our min sleep time */ - uint64_t syscall_cnt; - uint64_t sleeping; /* What the actual sleep was (if sleeping) */ - uint16_t p_hpts_active; /* Flag that says hpts is awake */ - uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ - uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ - uint32_t p_runningslot; /* Current tick we are at if we are running */ - uint32_t p_prev_slot; /* Previous slot we were on */ - uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ - uint32_t p_nxt_slot; /* The next slot outside the current range of - * slots that the hpts is running on. 
*/ - int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t p_lasttick; /* Last tick before the current one */ - uint8_t p_direct_wake :1, /* boolean */ - p_on_min_sleep:1, /* boolean */ - p_hpts_wake_scheduled:1, /* boolean */ - hit_callout_thresh:1, - p_avail:4; - uint8_t p_fill[3]; /* Fill to 32 bits */ - /* Cache line 0x40 */ - struct hptsh { - TAILQ_HEAD(, tcpcb) head; - uint32_t count; - uint32_t gencnt; - } *p_hptss; /* Hptsi wheel */ - uint32_t p_hpts_sleep_time; /* Current sleep interval having a max - * of 255ms */ - uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ - uint32_t saved_lasttick; /* for logging */ - uint32_t saved_curtick; /* for logging */ - uint32_t saved_curslot; /* for logging */ - uint32_t saved_prev_slot; /* for logging */ - uint32_t p_delayed_by; /* How much were we delayed by */ - /* Cache line 0x80 */ - struct sysctl_ctx_list hpts_ctx; - struct sysctl_oid *hpts_root; - struct intr_event *ie; - void *ie_cookie; - uint16_t p_num; /* The hpts number one per cpu */ - uint16_t p_cpu; /* The hpts CPU */ - /* There is extra space in here */ - /* Cache line 0x100 */ - struct callout co __aligned(CACHE_LINE_SIZE); -} __aligned(CACHE_LINE_SIZE); - -static struct tcp_hptsi { - struct cpu_group **grps; - struct tcp_hpts_entry **rp_ent; /* Array of hptss */ - uint32_t *cts_last_ran; - uint32_t grp_cnt; - uint32_t rp_num_hptss; /* Number of hpts threads */ -} tcp_pace; - -static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -static int tcp_bind_threads = 1; +int tcp_bind_threads = 1; #else -static int tcp_bind_threads = 2; +int tcp_bind_threads = 2; #endif static int tcp_use_irq_cpu = 0; static int hpts_does_tp_logging = 0; - -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); -static void tcp_hpts_thread(void *ctx); - +static int32_t tcp_hpts_precision = 120; int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; @@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP Hpts statistics"); -#define timersub(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ - (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ - if ((vvp)->tv_usec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_usec += 1000000; \ - } \ - } while (0) - -static int32_t tcp_hpts_precision = 120; - -static struct hpts_domain_info { - int count; - int cpu[MAXCPU]; -} hpts_domains[MAXMEMDOM]; - counter_u64_t hpts_hopelessly_behind; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, "When we are over the threshold on the pacer do we prohibit wakeups?"); -static uint16_t -hpts_random_cpu(void) +uint16_t +tcp_hptsi_random_cpu(struct tcp_hptsi *pace) { uint16_t cpuid; uint32_t ran; ran = arc4random(); - cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); + cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss); return (cpuid); } @@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, log.u_bbr.flex2 = hpts->p_cur_slot; log.u_bbr.flex3 = hpts->p_prev_slot; log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; log.u_bbr.flex7 = 
hpts->p_cpu; log.u_bbr.flex8 = (uint8_t)from_callout; log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usec(tv); log.u_bbr.epoch = hpts->saved_curslot; log.u_bbr.lt_epoch = hpts->saved_prev_slot; @@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, } } +/* + * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI + * for the HPTS entry to run. + */ static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) +tcp_hpts_sleep_timeout(void *arg) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + struct tcp_hpts_entry *hpts; + + hpts = (struct tcp_hpts_entry *)arg; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + swi_sched(hpts->ie_cookie, 0); +} + +/* + * Reset the HPTS callout timer with the provided timeval. Returns the results + * of the callout_reset_sbt_on() function. + */ +static int +tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + sbintime_t sb; + +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + + /* Store off to make visible the actual sleep time */ + hpts->sleeping = tv->tv_usec; + + sb = tvtosbt(*tv); + return (callout_reset_sbt_on( + &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu, + (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)))); +} + +/* + * Schedules the SWI for the HTPS entry to run, if not already scheduled or + * running. + */ +void +tcp_hpts_wake(struct tcp_hpts_entry *hpts) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + HPTS_MTX_ASSERT(hpts); +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { hpts->p_direct_wake = 0; return; @@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts) } static void -hpts_timeout_swi(void *arg) -{ - struct tcp_hpts_entry *hpts; - - hpts = (struct tcp_hpts_entry *)arg; - swi_sched(hpts->ie_cookie, 0); -} - -static void tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) { struct inpcb *inp = tptoinpcb(tp); @@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) } static struct tcp_hpts_entry * -tcp_hpts_lock(struct tcpcb *tp) +tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; INP_LOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; + hpts = pace->rp_ent[tp->t_hpts_cpu]; HPTS_LOCK(hpts); return (hpts); @@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp) * and has never received a first packet. */ void -tcp_hpts_init(struct tcpcb *tp) +__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp) { - if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { - tp->t_hpts_cpu = hpts_random_cpu(); + tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace); MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET)); } } @@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp) * INP lock and then get the hpts lock. 
*/ void -tcp_hpts_remove(struct tcpcb *tp) +__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; struct hptsh *hptsh; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_ONQUEUE) { hptsh = &hpts->p_hptss[tp->t_hpts_slot]; tp->t_hpts_request = 0; @@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot - * is that plus ticks out? + * is that plus slots out? */ - KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); + KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot)); return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int -tick_to_wheel(uint32_t cts_in_wticks) +cts_to_wheel(uint32_t cts) { /* - * Given a timestamp in ticks (so by - * default to get it to a real time one - * would multiply by 10.. i.e the number - * of ticks in a slot) map it to our limited - * space wheel. + * Given a timestamp in useconds map it to our limited space wheel. */ - return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); + return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS); } static inline int @@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { end_slot = hpts->p_runningslot; - /* Back up one tick */ + /* Back up one slot */ if (end_slot == 0) end_slot = NUM_OF_HPTSI_SLOTS - 1; else @@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * * not active, or we have * completed the pass over * the wheel, we can use the - * prev tick and subtract one from it. This puts us + * prev slot and subtract one from it. This puts us * as far out as possible on the wheel. */ end_slot = hpts->p_prev_slot; @@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note - * that wheel_tick, passed in, should be the current time + * that wheel_slot, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ if (hpts->p_prev_slot != wheel_slot) @@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * #ifdef INVARIANTS static void check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, - uint32_t hptsslot, int line) + uint32_t hptsslot) { /* * Sanity checks for the pacer with invariants @@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, } #endif -uint32_t -tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) +void +__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; struct timeval tv; - uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; + uint32_t slot, wheel_cts, last_slot, need_new_to = 0; int32_t wheel_slot, maxslots; bool need_wakeup = false; @@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE)); /* + * Convert microseconds to slots for internal use. * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. 
*/ - hpts = tcp_hpts_lock(tp); + slot = HPTS_USEC_TO_SLOTS(usecs); + hpts = tcp_hpts_lock(pace, tp); microuptime(&tv); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); @@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; - diag->p_curtick = hpts->p_curtick; - diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; @@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ * timeout is not 1. */ hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - return (slot_on); + return; } - /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hpts_slot(&tv); - /* Map it onto the wheel */ - wheel_slot = tick_to_wheel(wheel_cts); + /* Get the current time stamp and map it onto the wheel */ + wheel_cts = tcp_tv_to_usec(&tv); + wheel_slot = cts_to_wheel(wheel_cts); /* Now what's the max we can place it at? */ maxslots = max_slots_available(hpts, wheel_slot, &last_slot); if (diag) { @@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tp->t_hpts_slot = last_slot; } if (diag) { - diag->slot_remaining = tp->t_hpts_request; + diag->time_remaining = tp->t_hpts_request; diag->inp_hptsslot = tp->t_hpts_slot; } #ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); + check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot); #endif if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) tcp_hpts_insert_internal(tp, hpts); @@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } /* * Now how far is the hpts sleeping to? if active is 1, its - * up and ticking we do nothing, otherwise we may need to + * up and running we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; @@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } else if (need_new_to) { int32_t co_ret; struct timeval tv; - sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; @@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } - tv.tv_usec = need_new_to; - sb = tvtosbt(tv); - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? 
*/ + co_ret = tcp_hpts_sleep(hpts, &tv); if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - - return (slot_on); } static uint16_t -hpts_cpuid(struct tcpcb *tp, int *failed) +hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed) { struct inpcb *inp = tptoinpcb(tp); u_int cpuid; @@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); else return (cpuid); #endif @@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) */ if (inp->inp_flowtype == M_HASHTYPE_NONE) { counter_u64_add(cpu_uses_random, 1); - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); } /* * Hash to a thread based on the flowid. If we are using numa, @@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef NUMA } else { /* Hash into the cpu's that use that domain */ - di = &hpts_domains[inp->inp_numa_domain]; + di = &pace->domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } #endif @@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) } } -static int32_t +static bool +tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run) +{ + return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT)); +} + +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) { + struct tcp_hptsi *pace; struct tcpcb *tp; struct timeval tv; int32_t slots_to_run, i, error; @@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; int32_t orig_exit_slot; + uint32_t cts, cts_last_run; bool completed_measure, seen_endpoint; completed_measure = false; @@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); + + pace = hpts->p_hptsi; + MPASS(pace != NULL); + /* record previous info for any logging */ - hpts->saved_lasttick = hpts->p_lasttick; - hpts->saved_curtick = hpts->p_curtick; hpts->saved_curslot = hpts->p_cur_slot; hpts->saved_prev_slot = hpts->p_prev_slot; - hpts->p_lasttick = hpts->p_curtick; - hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); - orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + microuptime(&tv); + cts_last_run = pace->cts_last_ran[hpts->p_cpu]; + pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv); + + orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts); if ((hpts->p_on_queue_cnt == 0) || - (hpts->p_lasttick == hpts->p_curtick)) { + !tcp_hpts_different_slots(cts, cts_last_run)) { /* - * No time has yet passed, - * or nothing to do. + * Not enough time has yet passed or nothing to do. 
*/ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; goto no_run; } again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && - (hpts->p_on_queue_cnt != 0)) { + if ((hpts->p_on_queue_cnt != 0) && + ((cts - cts_last_run) > + ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) { /* * Wheel wrap is occuring, basically we * are behind and the distance between @@ -1238,7 +1214,7 @@ again: uint32_t runningslot; /* - * Calculate our delay, if there are no extra ticks there + * Calculate our delay, if there are no extra slots there * was not any (i.e. if slots_to_run == 1, no delay). */ hpts->p_delayed_by = (slots_to_run - (i + 1)) * @@ -1391,7 +1367,7 @@ again: * gets added to the hpts (not this one) * :-) */ - tcp_set_hpts(tp); + __tcp_set_hpts(pace, tp); } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ @@ -1450,10 +1426,12 @@ no_one: hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run - * more ticks (if we did not hit eno-bufs). + * more slots (if we did not hit eno-bufs). */ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; + microuptime(&tv); + cts_last_run = cts; + cts = tcp_tv_to_usec(&tv); if (!from_callout || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have @@ -1465,7 +1443,7 @@ no_one: * can never catch up :( * * We will just lie to this thread - * and let it thing p_curtick is + * and let it think p_curslot is * correct. When it next awakens * it will find itself further behind. */ @@ -1473,20 +1451,19 @@ no_one: counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } - hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + + hpts->p_cur_slot = cts_to_wheel(cts); if (!seen_endpoint) { /* We saw no endpoint but we may be looping */ orig_exit_slot = hpts->p_cur_slot; } - if ((wrap_loop_cnt < 2) && - (hpts->p_lasttick != hpts->p_curtick)) { + if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) { counter_u64_add(hpts_loops, 1); loop_cnt++; goto again; } no_run: - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); + pace->cts_last_ran[hpts->p_cpu] = cts; /* * Set flag to tell that we are done for * any slot input that happens during @@ -1494,25 +1471,36 @@ no_run: */ hpts->p_wheel_complete = 1; /* - * Now did we spend too long running input and need to run more ticks? - * Note that if wrap_loop_cnt < 2 then we should have the conditions - * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt - * is greater than 2, then the condtion most likely are *not* true. - * Also if we are called not from the callout, we don't run the wheel - * multiple times so the slots may not align either. - */ - KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || - (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, - hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(((hpts->p_lasttick == hpts->p_curtick) - || (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, - hpts->p_lasttick, hpts->p_curtick)); - if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { - hpts->p_curtick = tcp_gethptstick(&tv); + * If enough time has elapsed that we should be processing the next + * slot(s), then we should have kept running and not marked the wheel as + * complete. 
+ * + * But there are several other conditions where we would have stopped + * processing, so the prev/cur slots and cts variables won't match. + * These conditions are: + * + * - Calls not from callouts don't run multiple times + * - The wheel is empty + * - We've processed more than max_pacer_loops times + * - We've wrapped more than 2 times + * + * This assert catches when the logic above has violated this design. + * + */ + KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) || + (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) || + ((hpts->p_prev_slot == hpts->p_cur_slot) && + !tcp_hpts_different_slots(cts, cts_last_run))), + ("H:%p Shouldn't be done! prev_slot:%u, cur_slot:%u, " + "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d", + hpts, hpts->p_prev_slot, hpts->p_cur_slot, + cts_last_run, cts, loop_cnt, wrap_loop_cnt)); + + if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) { + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + hpts->p_cur_slot = cts_to_wheel(cts); counter_u64_add(hpts_loops, 1); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } @@ -1526,16 +1514,16 @@ no_run: } void -tcp_set_hpts(struct tcpcb *tp) +__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { - tp->t_hpts_cpu = hpts_cpuid(tp, &failed); + tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed); if (failed == 0) tp->t_flags2 |= TF2_HPTS_CPU_SET; } @@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp) } static struct tcp_hpts_entry * -tcp_choose_hpts_to_run(void) +tcp_choose_hpts_to_run(struct tcp_hptsi *pace) { + struct timeval tv; int i, oldest_idx, start, end; uint32_t cts, time_since_ran, calc; - cts = tcp_get_usecs(NULL); + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); time_since_ran = 0; /* Default is all one group */ start = 0; - end = tcp_pace.rp_num_hptss; + end = pace->rp_num_hptss; /* * If we have more than one L3 group figure out which one * this CPU is in. 
*/ - if (tcp_pace.grp_cnt > 1) { - for (i = 0; i < tcp_pace.grp_cnt; i++) { - if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { - start = tcp_pace.grps[i]->cg_first; - end = (tcp_pace.grps[i]->cg_last + 1); + if (pace->grp_cnt > 1) { + for (i = 0; i < pace->grp_cnt; i++) { + if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) { + start = pace->grps[i]->cg_first; + end = (pace->grps[i]->cg_last + 1); break; } } } oldest_idx = -1; for (i = start; i < end; i++) { - if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) - calc = cts - tcp_pace.cts_last_ran[i]; + if (TSTMP_GT(cts, pace->cts_last_ran[i])) + calc = cts - pace->cts_last_ran[i]; else calc = 0; if (calc > time_since_ran) { @@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void) } } if (oldest_idx >= 0) - return(tcp_pace.rp_ent[oldest_idx]); + return(pace->rp_ent[oldest_idx]); else - return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); + return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]); } static void @@ -1588,9 +1578,9 @@ __tcp_run_hpts(void) { struct epoch_tracker et; struct tcp_hpts_entry *hpts; - int ticks_ran; + int slots_ran; - hpts = tcp_choose_hpts_to_run(); + hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace); if (hpts->p_hpts_active) { /* Already active */ @@ -1606,12 +1596,11 @@ __tcp_run_hpts(void) hpts->syscall_cnt++; counter_u64_add(hpts_direct_call, 1); hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, false); + slots_ran = tcp_hptsi(hpts, false); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { - if (ticks_ran > slots_indicate_less_sleep) { + if (slots_ran > slots_indicate_less_sleep) { struct timeval tv; - sbintime_t sb; hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) @@ -1635,13 +1624,8 @@ __tcp_run_hpts(void) * the dynamic value and set the on_min_sleep * flag so we will not be awoken. */ - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else if (ticks_ran < slots_indicate_more_sleep) { + (void)tcp_hpts_sleep(hpts, &tv); + } else if (slots_ran < slots_indicate_more_sleep) { /* For the further sleep, don't reschedule hpts */ hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1658,17 +1642,22 @@ out_with_mtx: static void tcp_hpts_thread(void *ctx) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif struct tcp_hpts_entry *hpts; struct epoch_tracker et; struct timeval tv; - sbintime_t sb; - int ticks_ran; + int slots_ran; hpts = (struct tcp_hpts_entry *)ctx; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif HPTS_LOCK(hpts); if (hpts->p_direct_wake) { /* Signaled by input or output with low occupancy count. */ - callout_stop(&hpts->co); + _callout_stop_safe(&hpts->co, 0); counter_u64_add(hpts_direct_awakening, 1); } else { /* Timed out, the normal case. */ @@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx) } hpts->sleeping = 0; hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, true); + slots_ran = tcp_hptsi(hpts, true); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx) * Only adjust sleep time if we were * called from the callout i.e. direct_wake == 0. 
*/ - if (ticks_ran < slots_indicate_more_sleep) { + if (slots_ran < slots_indicate_more_sleep) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; - } else if (ticks_ran > slots_indicate_less_sleep) { + } else if (slots_ran > slots_indicate_less_sleep) { hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx) hpts->p_hpts_active = 0; back_to_sleep: hpts->p_direct_wake = 0; - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + (void)tcp_hpts_sleep(hpts, &tv); NET_EPOCH_EXIT(et); HPTS_UNLOCK(hpts); } -#undef timersub - static int32_t hpts_count_level(struct cpu_group *cg) { @@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g } } -static void -tcp_hpts_mod_load(void) +/* + * Initialize a tcp_hptsi structure. This performs the core initialization + * without starting threads. + */ +struct tcp_hptsi* +tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl) { + struct tcp_hptsi *pace; struct cpu_group *cpu_top; - int32_t error __diagused; - int32_t i, j, bound = 0, created = 0; + uint32_t i, j, cts; + int32_t count; size_t sz, asz; struct timeval tv; - sbintime_t sb; struct tcp_hpts_entry *hpts; - struct pcpu *pc; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; - int count, domain; + KASSERT(funcs != NULL, ("funcs is NULL")); + + /* Allocate the main structure */ + pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO); + if (pace == NULL) + return (NULL); + + memset(pace, 0, sizeof(*pace)); + pace->funcs = funcs; + + /* Setup CPU topology information */ #ifdef SMP cpu_top = smp_topo(); #else cpu_top = NULL; #endif - tcp_pace.rp_num_hptss = ncpus; - hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); - hpts_loops = counter_u64_alloc(M_WAITOK); - back_tosleep = counter_u64_alloc(M_WAITOK); - combined_wheel_wrap = counter_u64_alloc(M_WAITOK); - wheel_wrap = counter_u64_alloc(M_WAITOK); - hpts_wake_timeout = counter_u64_alloc(M_WAITOK); - hpts_direct_awakening = counter_u64_alloc(M_WAITOK); - hpts_back_tosleep = counter_u64_alloc(M_WAITOK); - hpts_direct_call = counter_u64_alloc(M_WAITOK); - cpu_uses_flowid = counter_u64_alloc(M_WAITOK); - cpu_uses_random = counter_u64_alloc(M_WAITOK); + pace->rp_num_hptss = ncpus; - sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); - tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); - tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); - tcp_pace.grp_cnt = 0; + /* Allocate hpts entry array */ + sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *)); + pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); + + /* Allocate timestamp tracking array */ + sz = (sizeof(uint32_t) * pace->rp_num_hptss); + pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); + + /* Setup CPU groups */ if (cpu_top == NULL) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = 1; } else { /* Find out how many cache level 3 domains we have */ count = 0; - tcp_pace.grp_cnt = hpts_count_level(cpu_top); - if (tcp_pace.grp_cnt == 0) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = hpts_count_level(cpu_top); + if (pace->grp_cnt == 0) { + pace->grp_cnt = 1; 
} - sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); - tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); + sz = (pace->grp_cnt * sizeof(struct cpu_group *)); + pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK); /* Now populate the groups */ - if (tcp_pace.grp_cnt == 1) { + if (pace->grp_cnt == 1) { /* * All we need is the top level all cpu's are in * the same cache so when we use grp[0]->cg_mask @@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void) * all cpu's in it. The level here is probably * zero which is ok. */ - tcp_pace.grps[0] = cpu_top; + pace->grps[0] = cpu_top; } else { /* * Here we must find all the level three cache domains * and setup our pointers to them. */ count = 0; - hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); + hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top); } } + + /* Cache the current time for initializing the hpts entries */ + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + + /* Initialize each hpts entry */ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), + for (i = 0; i < pace->rp_num_hptss; i++) { + pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); - tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); - hpts = tcp_pace.rp_ent[i]; - /* - * Init all the hpts structures that are not specifically - * zero'd by the allocations. Also lets attach them to the - * appropriate sysctl block as well. - */ - mtx_init(&hpts->p_mtx, "tcp_hpts_lck", - "hpts", MTX_DEF | MTX_DUPOK); - for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { - TAILQ_INIT(&hpts->p_hptss[j].head); - hpts->p_hptss[j].count = 0; - hpts->p_hptss[j].gencnt = 0; - } - sysctl_ctx_init(&hpts->hpts_ctx); - sprintf(unit, "%d", i); - hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), - OID_AUTO, - unit, - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - ""); - SYSCTL_ADD_INT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "out_qcnt", CTLFLAG_RD, - &hpts->p_on_queue_cnt, 0, - "Count TCB's awaiting output processing"); - SYSCTL_ADD_U16(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "active", CTLFLAG_RD, - &hpts->p_hpts_active, 0, - "Is the hpts active"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curslot", CTLFLAG_RD, - &hpts->p_cur_slot, 0, - "What the current running pacers goal"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "runtick", CTLFLAG_RD, - &hpts->p_runningslot, 0, - "What the running pacers current slot is"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curtick", CTLFLAG_RD, - &hpts->p_curtick, 0, - "What the running pacers last tick mapped to the wheel was"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "lastran", CTLFLAG_RD, - &tcp_pace.cts_last_ran[i], 0, - "The last usec tick that this hpts ran"); - SYSCTL_ADD_LONG(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "cur_min_sleep", CTLFLAG_RD, - &hpts->p_mysleep.tv_usec, - "What the running pacers is using for p_mysleep.tv_usec"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "now_sleeping", CTLFLAG_RD, - &hpts->sleeping, 0, - "What the running pacers is actually sleeping for"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "syscall_cnt", CTLFLAG_RD, - 
&hpts->syscall_cnt, 0, - "How many times we had syscalls on this hpts"); + pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, + M_WAITOK | M_ZERO); + hpts = pace->rp_ent[i]; + /* Basic initialization */ hpts->p_hpts_sleep_time = hpts_sleep_max; - hpts->p_num = i; - hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv); - hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); - hpts->p_cpu = 0xffff; + hpts->p_cpu = i; + pace->cts_last_ran[i] = cts; + hpts->p_cur_slot = cts_to_wheel(cts); + hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); + hpts->p_hptsi = pace; + mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", + MTX_DEF | MTX_DUPOK); + for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { + TAILQ_INIT(&hpts->p_hptss[j].head); + } + + /* Setup SYSCTL if requested */ + if (enable_sysctl) { + sysctl_ctx_init(&hpts->hpts_ctx); + sprintf(unit, "%d", i); + hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), + OID_AUTO, + unit, + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + ""); + SYSCTL_ADD_INT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "out_qcnt", CTLFLAG_RD, + &hpts->p_on_queue_cnt, 0, + "Count TCB's awaiting output processing"); + SYSCTL_ADD_U16(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "active", CTLFLAG_RD, + &hpts->p_hpts_active, 0, + "Is the hpts active"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "curslot", CTLFLAG_RD, + &hpts->p_cur_slot, 0, + "What the current running pacers goal"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "runslot", CTLFLAG_RD, + &hpts->p_runningslot, 0, + "What the running pacers current slot is"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "lastran", CTLFLAG_RD, + &pace->cts_last_ran[i], 0, + "The last usec timestamp that this hpts ran"); + SYSCTL_ADD_LONG(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "cur_min_sleep", CTLFLAG_RD, + &hpts->p_mysleep.tv_usec, + "What the running pacers is using for p_mysleep.tv_usec"); + SYSCTL_ADD_U64(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "now_sleeping", CTLFLAG_RD, + &hpts->sleeping, 0, + "What the running pacers is actually sleeping for"); + SYSCTL_ADD_U64(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "syscall_cnt", CTLFLAG_RD, + &hpts->syscall_cnt, 0, + "How many times we had syscalls on this hpts"); + } } - /* Don't try to bind to NUMA domains if we don't have any */ - if (vm_ndomains == 1 && tcp_bind_threads == 2) - tcp_bind_threads = 0; - /* - * Now lets start ithreads to handle the hptss. - */ - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - hpts->p_cpu = i; + return (pace); +} + +/* + * Create threads for a tcp_hptsi structure and starts timers for the current + * (minimum) sleep interval. 
+ */ +void +tcp_hptsi_start(struct tcp_hptsi *pace) +{ + struct tcp_hpts_entry *hpts; + struct pcpu *pc; + struct timeval tv; + uint32_t i, j; + int count, domain; + int error __diagused; + + KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL")); + + /* Start threads for each hpts entry */ + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + + KASSERT(hpts->ie_cookie == NULL, + ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i)); error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); KASSERT(error == 0, - ("Can't add hpts:%p i:%d err:%d", - hpts, i, error)); - created++; - hpts->p_mysleep.tv_sec = 0; - hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; + ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); + if (tcp_bind_threads == 1) { - if (intr_event_bind(hpts->ie, i) == 0) - bound++; + (void)intr_event_bind(hpts->ie, i); } else if (tcp_bind_threads == 2) { /* Find the group for this CPU (i) and bind into it */ - for (j = 0; j < tcp_pace.grp_cnt; j++) { - if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { + for (j = 0; j < pace->grp_cnt; j++) { + if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) { if (intr_event_bind_ithread_cpuset(hpts->ie, - &tcp_pace.grps[j]->cg_mask) == 0) { - bound++; + &pace->grps[j]->cg_mask) == 0) { pc = pcpu_find(i); domain = pc->pc_domain; - count = hpts_domains[domain].count; - hpts_domains[domain].cpu[count] = i; - hpts_domains[domain].count++; + count = pace->domains[domain].count; + pace->domains[domain].cpu[count] = i; + pace->domains[domain].count++; break; } } } } + + hpts->p_mysleep.tv_sec = 0; + hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; - hpts->sleeping = tv.tv_usec; - sb = tvtosbt(tv); - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } - /* - * If we somehow have an empty domain, fall back to choosing - * among all htps threads. - */ - for (i = 0; i < vm_ndomains; i++) { - if (hpts_domains[i].count == 0) { - tcp_bind_threads = 0; - break; - } + (void)tcp_hpts_sleep(hpts, &tv); } - tcp_hpts_softclock = __tcp_run_hpts; - tcp_lro_hpts_init(); - printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", - created, bound, - tcp_bind_threads == 2 ? "NUMA domains" : "cpus"); } -static void -tcp_hpts_mod_unload(void) +/* + * Stop all callouts/threads for a tcp_hptsi structure. + */ +void +tcp_hptsi_stop(struct tcp_hptsi *pace) { + struct tcp_hpts_entry *hpts; int rv __diagused; + uint32_t i; - tcp_lro_hpts_uninit(); - atomic_store_ptr(&tcp_hpts_softclock, NULL); + KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL")); - for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { - struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i)); + KASSERT(hpts->ie_cookie != NULL, + ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i)); - rv = callout_drain(&hpts->co); + rv = _callout_stop_safe(&hpts->co, CS_DRAIN); MPASS(rv != 0); rv = swi_remove(hpts->ie_cookie); MPASS(rv == 0); + hpts->ie_cookie = NULL; + } +} - rv = sysctl_ctx_free(&hpts->hpts_ctx); - MPASS(rv == 0); +/* + * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create. 
+ */ +void +tcp_hptsi_destroy(struct tcp_hptsi *pace) +{ + struct tcp_hpts_entry *hpts; + uint32_t i; + + KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL")); + KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL")); + + /* Cleanup each hpts entry */ + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + if (hpts != NULL) { + /* Cleanup SYSCTL if it was initialized */ + if (hpts->hpts_root != NULL) { + sysctl_ctx_free(&hpts->hpts_ctx); + } - mtx_destroy(&hpts->p_mtx); - free(hpts->p_hptss, M_TCPHPTS); - free(hpts, M_TCPHPTS); + mtx_destroy(&hpts->p_mtx); + free(hpts->p_hptss, M_TCPHPTS); + free(hpts, M_TCPHPTS); + } } - free(tcp_pace.rp_ent, M_TCPHPTS); - free(tcp_pace.cts_last_ran, M_TCPHPTS); + /* Cleanup main arrays */ + free(pace->rp_ent, M_TCPHPTS); + free(pace->cts_last_ran, M_TCPHPTS); #ifdef SMP - free(tcp_pace.grps, M_TCPHPTS); + free(pace->grps, M_TCPHPTS); #endif + /* Free the main structure */ + free(pace, M_TCPHPTS); +} + +static int +tcp_hpts_mod_load(void) +{ + int i; + + /* Don't try to bind to NUMA domains if we don't have any */ + if (vm_ndomains == 1 && tcp_bind_threads == 2) + tcp_bind_threads = 0; + + /* Create the tcp_hptsi structure */ + tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true); + if (tcp_hptsi_pace == NULL) + return (ENOMEM); + + /* Initialize global counters */ + hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); + hpts_loops = counter_u64_alloc(M_WAITOK); + back_tosleep = counter_u64_alloc(M_WAITOK); + combined_wheel_wrap = counter_u64_alloc(M_WAITOK); + wheel_wrap = counter_u64_alloc(M_WAITOK); + hpts_wake_timeout = counter_u64_alloc(M_WAITOK); + hpts_direct_awakening = counter_u64_alloc(M_WAITOK); + hpts_back_tosleep = counter_u64_alloc(M_WAITOK); + hpts_direct_call = counter_u64_alloc(M_WAITOK); + cpu_uses_flowid = counter_u64_alloc(M_WAITOK); + cpu_uses_random = counter_u64_alloc(M_WAITOK); + + /* Start the threads */ + tcp_hptsi_start(tcp_hptsi_pace); + + /* Enable the global HPTS softclock function */ + tcp_hpts_softclock = __tcp_run_hpts; + + /* Initialize LRO HPTS */ + tcp_lro_hpts_init(); + + /* + * If we somehow have an empty domain, fall back to choosing among all + * HPTS threads. + */ + for (i = 0; i < vm_ndomains; i++) { + if (tcp_hptsi_pace->domains[i].count == 0) { + tcp_bind_threads = 0; + break; + } + } + + printf("TCP HPTS started %u (%s) swi interrupt threads\n", + tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ? + "(unbounded)" : + (tcp_bind_threads == 1 ? 
"per-cpu" : "per-NUMA-domain")); + + return (0); +} + +static void +tcp_hpts_mod_unload(void) +{ + tcp_lro_hpts_uninit(); + + /* Disable the global HPTS softclock function */ + atomic_store_ptr(&tcp_hpts_softclock, NULL); + + tcp_hptsi_stop(tcp_hptsi_pace); + tcp_hptsi_destroy(tcp_hptsi_pace); + tcp_hptsi_pace = NULL; + + /* Cleanup global counters */ counter_u64_free(hpts_hopelessly_behind); counter_u64_free(hpts_loops); counter_u64_free(back_tosleep); @@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void) } static int -tcp_hpts_modevent(module_t mod, int what, void *arg) +tcp_hpts_mod_event(module_t mod, int what, void *arg) { - switch (what) { case MOD_LOAD: - tcp_hpts_mod_load(); - return (0); + return (tcp_hpts_mod_load()); case MOD_QUIESCE: /* * Since we are a dependency of TCP stack modules, they should @@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg) static moduledata_t tcp_hpts_module = { .name = "tcphpts", - .evhand = tcp_hpts_modevent, + .evhand = tcp_hpts_mod_event, }; DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index 6172baf2a062..6b05f9701ac2 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -28,19 +28,11 @@ /* Number of useconds represented by an hpts slot */ #define HPTS_USECS_PER_SLOT 10 -#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) -#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 static inline uint32_t -tcp_tv_to_hpts_slot(const struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT)); -} - -static inline uint32_t tcp_tv_to_usec(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -66,7 +58,7 @@ struct hpts_diag { uint32_t p_runningslot; /* bbr->inflight */ uint32_t slot_req; /* bbr->flex3 x */ uint32_t inp_hptsslot; /* bbr->flex4 x */ - uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t time_remaining; /* bbr->flex5 x */ uint32_t have_slept; /* bbr->epoch x */ uint32_t hpts_sleep_time; /* bbr->applimited x */ uint32_t yet_to_sleep; /* bbr->lt_epoch x */ @@ -75,8 +67,6 @@ struct hpts_diag { uint32_t maxslots; /* bbr->delRate x */ uint32_t wheel_cts; /* bbr->rttProp x */ int32_t co_ret; /* bbr->pkts_out x */ - uint32_t p_curtick; /* upper bbr->cur_del_rate */ - uint32_t p_lasttick; /* lower bbr->cur_del_rate */ uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; @@ -92,13 +82,18 @@ struct hpts_diag { #ifdef _KERNEL +extern struct tcp_hptsi *tcp_hptsi_pace; + /* * The following are the definitions for the kernel HPTS interface for managing * the HPTS ring and the TCBs on it. */ -void tcp_hpts_init(struct tcpcb *); -void tcp_hpts_remove(struct tcpcb *); +void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp) + +void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp) static inline bool tcp_in_hpts(struct tcpcb *tp) @@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp) * that INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK(). 
*/ -uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, - struct hpts_diag *diag); -#define tcp_hpts_insert(inp, slot) \ - tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) +void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag); +#define tcp_hpts_insert(tp, usecs, diag) \ + __tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag)) -void tcp_set_hpts(struct tcpcb *tp); +void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp); +#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp) extern int32_t tcp_min_hptsi_time; @@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT); } -static inline uint32_t -tcp_gethptstick(struct timeval *sv) -{ - struct timeval tv; - - if (sv == NULL) - sv = &tv; - microuptime(sv); - return (tcp_tv_to_hpts_slot(sv)); -} - static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { @@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv) return (tcp_tv_to_usec(tv)); } -/* - * LRO HPTS initialization and uninitialization, only for internal use by the - * HPTS code. - */ -void tcp_lro_hpts_init(void); -void tcp_lro_hpts_uninit(void); - #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h new file mode 100644 index 000000000000..8b33e03a6981 --- /dev/null +++ b/sys/netinet/tcp_hpts_internal.h @@ -0,0 +1,184 @@ +/*- + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __tcp_hpts_internal_h__ +#define __tcp_hpts_internal_h__ + +/* + * TCP High Precision Timer System (HPTS) - Internal Definitions + * + * This header contains internal structures, constants, and interfaces that are + * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of + * the HPTS subsystem. + */ + +#if defined(_KERNEL) + +/* + * The hpts uses a 102400 wheel. The wheel + * defines the time in 10 usec increments (102400 x 10). + * This gives a range of 10usec - 1024ms to place + * an entry within. 
If the user requests more than + * 1.024 seconds, a remainder is attached and the hpts + * when seeing the remainder will re-insert the + * inpcb forward in time from where it is until + * the remainder is zero. + */ + +#define NUM_OF_HPTSI_SLOTS 102400 + +/* Convert microseconds to HPTS slots */ +#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) + +/* The number of connections after which the dynamic sleep logic kicks in. */ +#define DEFAULT_CONNECTION_THRESHOLD 100 + +extern int tcp_bind_threads; /* Thread binding configuration + * (0=none, 1=cpu, 2=numa) */ + +/* + * Abstraction layer controlling time, interrupts and callouts. + */ +struct tcp_hptsi_funcs { + void (*microuptime)(struct timeval *tv); + int (*swi_add)(struct intr_event **eventp, const char *name, + driver_intr_t handler, void *arg, int pri, enum intr_type flags, + void **cookiep); + int (*swi_remove)(void *cookie); + void (*swi_sched)(void *cookie, int flags); + int (*intr_event_bind)(struct intr_event *ie, int cpu); + int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie, + struct _cpuset *mask); + void (*callout_init)(struct callout *c, int mpsafe); + int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt, + sbintime_t precision, void (*func)(void *), void *arg, int cpu, + int flags); + int (*_callout_stop_safe)(struct callout *c, int flags); +}; + +/* Default function table for system operation */ +extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs; + +/* Each hpts has its own p_mtx which is used for locking */ +#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) +#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) +#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx) +#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) + +struct tcp_hpts_entry { + /* Cache line 0x00 */ + struct mtx p_mtx; /* Mutex for hpts */ + struct timeval p_mysleep; /* Our min sleep time */ + uint64_t syscall_cnt; + uint64_t sleeping; /* What the actual sleep was (if sleeping) */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_runningslot; /* Current slot we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ + uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ + uint32_t p_nxt_slot; /* The next slot outside the current range + * of slots that the hpts is running on.
*/ + int32_t p_on_queue_cnt; /* Count on queue in this hpts */ + uint8_t p_direct_wake :1, /* boolean */ + p_on_min_sleep:1, /* boolean */ + p_hpts_wake_scheduled:1,/* boolean */ + hit_callout_thresh:1, + p_avail:4; + uint8_t p_fill[3]; /* Fill to 32 bits */ + /* Cache line 0x40 */ + struct hptsh { + TAILQ_HEAD(, tcpcb) head; + uint32_t count; + uint32_t gencnt; + } *p_hptss; /* Hptsi wheel */ + uint32_t p_hpts_sleep_time; /* Current sleep interval having a max + * of 255ms */ + uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ + uint32_t saved_curslot; /* for logging */ + uint32_t saved_prev_slot; /* for logging */ + uint32_t p_delayed_by; /* How much were we delayed by */ + /* Cache line 0x80 */ + struct sysctl_ctx_list hpts_ctx; + struct sysctl_oid *hpts_root; + struct intr_event *ie; + void *ie_cookie; + uint16_t p_cpu; /* The hpts CPU */ + struct tcp_hptsi *p_hptsi; /* Back pointer to parent hptsi structure */ + /* There is extra space in here */ + /* Cache line 0x100 */ + struct callout co __aligned(CACHE_LINE_SIZE); +} __aligned(CACHE_LINE_SIZE); + +struct tcp_hptsi { + struct cpu_group **grps; + struct tcp_hpts_entry **rp_ent; /* Array of hptss */ + uint32_t *cts_last_ran; + uint32_t grp_cnt; + uint32_t rp_num_hptss; /* Number of hpts threads */ + struct hpts_domain_info { + int count; + int cpu[MAXCPU]; + } domains[MAXMEMDOM]; /* Per-NUMA domain CPU assignments */ + const struct tcp_hptsi_funcs *funcs; /* Function table for testability */ +}; + +/* + * Core tcp_hptsi structure manipulation functions. + */ +struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, + bool enable_sysctl); +void tcp_hptsi_destroy(struct tcp_hptsi *pace); +void tcp_hptsi_start(struct tcp_hptsi *pace); +void tcp_hptsi_stop(struct tcp_hptsi *pace); +uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace); +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); + +void tcp_hpts_wake(struct tcp_hpts_entry *hpts); + +/* + * LRO HPTS initialization and uninitialization, only for internal use by the + * HPTS code. + */ +void tcp_lro_hpts_init(void); +void tcp_lro_hpts_uninit(void); + +#endif /* defined(_KERNEL) */ +#endif /* __tcp_hpts_internal_h__ */ diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c index 085da37e44c8..bab5827e0572 100644 --- a/sys/netinet/tcp_hpts_test.c +++ b/sys/netinet/tcp_hpts_test.c @@ -28,48 +28,1634 @@ #include <tests/ktest.h> #include <sys/cdefs.h> #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/errno.h> #include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> #include <sys/socket.h> +#include <sys/sysctl.h> #include <sys/systm.h> #include <netinet/in.h> #include <netinet/tcp.h> -#define _WANT_INPCB #include <netinet/in_pcb.h> #include <netinet/tcp_seq.h> -#define _WANT_TCPCB #include <netinet/tcp_var.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #include <dev/tcp_log/tcp_log_dev.h> #include <netinet/tcp_log_buf.h> -#define KTEST_ERR(_ctx, _fmt, ...) 
\ - KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__) +#undef tcp_hpts_init +#undef tcp_hpts_remove +#undef tcp_hpts_insert +#undef tcp_set_hpts + +/* Custom definitions that take the tcp_hptsi */ +#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp)) +#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp)) +#define tcp_hpts_insert(pace, tp, usecs, diag) \ + __tcp_hpts_insert((pace), (tp), (usecs), (diag)) +#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp)) + +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test"); + +static int test_exit_on_failure = true; +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TCP HPTS test controls"); +SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW, + &test_exit_on_failure, 0, + "Exit HPTS test immediately on first failure (1) or continue running all tests (0)"); #define KTEST_VERIFY(x) do { \ if (!(x)) { \ KTEST_ERR(ctx, "FAIL: %s", #x); \ - return (EINVAL); \ + if (test_exit_on_failure) \ + return (EINVAL); \ } else { \ KTEST_LOG(ctx, "PASS: %s", #x); \ } \ } while (0) +#define KTEST_EQUAL(x, y) do { \ + if ((x) != (y)) { \ + KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_NEQUAL(x, y) do { \ + if ((x) == (y)) { \ + KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_GREATER_THAN(x, y) do { \ + if ((x) <= (y)) { \ + KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_VERIFY_RET(x, y) do { \ + if (!(x)) { \ + KTEST_ERR(ctx, "FAIL: %s", #x); \ + if (test_exit_on_failure) \ + return (y); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s", #x); \ + } \ +} while (0) + +static void +dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts) +{ + KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts); + KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot); + KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot); + KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot); + KTEST_LOG(ctx, " p_runningslot: %u", hpts->p_runningslot); + KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt); + KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active); + KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete); + KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake); + KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep); + KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled); + KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh); + KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time); + KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by); + KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep); + KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot); + KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot); + KTEST_LOG(ctx, " syscall_cnt: %lu", hpts->syscall_cnt); + KTEST_LOG(ctx, " sleeping: %lu", hpts->sleeping); + KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu); + KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie); + KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi); + KTEST_LOG(ctx, " p_mysleep: %ld.%06ld", 
hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec); +} + +static void +dump_tcpcb(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct inpcb *inp = &tp->t_inpcb; + + KTEST_LOG(ctx, "tcp_control_block(%p)", tp); + + /* HPTS-specific fields */ + KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts); + KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu); + KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot); + KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt); + KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request); + + /* LRO CPU field */ + KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu); + + /* TCP flags that affect HPTS */ + KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2); + KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO"); + + /* Input PCB fields that HPTS uses */ + KTEST_LOG(ctx, " inp_flags: 0x%x", inp->inp_flags); + KTEST_LOG(ctx, " INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO"); + KTEST_LOG(ctx, " inp_flowid: 0x%x", inp->inp_flowid); + KTEST_LOG(ctx, " inp_flowtype: %u", inp->inp_flowtype); + KTEST_LOG(ctx, " inp_numa_domain: %d", inp->inp_numa_domain); +} + +/* Enum for call counting indices */ +enum test_call_counts { + CCNT_MICROUPTIME = 0, + CCNT_SWI_ADD, + CCNT_SWI_REMOVE, + CCNT_SWI_SCHED, + CCNT_INTR_EVENT_BIND, + CCNT_INTR_EVENT_BIND_CPUSET, + CCNT_CALLOUT_INIT, + CCNT_CALLOUT_RESET_SBT_ON, + CCNT_CALLOUT_STOP_SAFE, + CCNT_TCP_OUTPUT, + CCNT_TCP_TFB_DO_QUEUED_SEGMENTS, + CCNT_MAX +}; + +static uint32_t call_counts[CCNT_MAX]; + +static uint64_t test_time_usec = 0; + +/* + * Reset all test global variables to a clean state. 
+ */ +static void +test_hpts_init(void) +{ + memset(call_counts, 0, sizeof(call_counts)); + test_time_usec = 0; +} + +static void +test_microuptime(struct timeval *tv) +{ + call_counts[CCNT_MICROUPTIME]++; + tv->tv_sec = test_time_usec / 1000000; + tv->tv_usec = test_time_usec % 1000000; +} + static int -test_hpts_init(struct ktest_test_context *ctx) +test_swi_add(struct intr_event **eventp, const char *name, + driver_intr_t handler, void *arg, int pri, enum intr_type flags, + void **cookiep) +{ + call_counts[CCNT_SWI_ADD]++; + /* Simulate successful SWI creation */ + *eventp = (struct intr_event *)0xfeedface; /* Mock event */ + *cookiep = (void *)0xdeadbeef; /* Mock cookie */ + return (0); +} + +static int +test_swi_remove(void *cookie) +{ + call_counts[CCNT_SWI_REMOVE]++; + /* Simulate successful removal */ + return (0); +} + +static void +test_swi_sched(void *cookie, int flags) +{ + call_counts[CCNT_SWI_SCHED]++; + /* Simulate successful SWI scheduling */ +} + +static int +test_intr_event_bind(struct intr_event *ie, int cpu) +{ + call_counts[CCNT_INTR_EVENT_BIND]++; + /* Simulate successful binding */ + return (0); +} + +static int +test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask) +{ + call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++; + /* Simulate successful cpuset binding */ + return (0); +} + +static void +test_callout_init(struct callout *c, int mpsafe) +{ + call_counts[CCNT_CALLOUT_INIT]++; + memset(c, 0, sizeof(*c)); +} + +static int +test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*func)(void *), void *arg, int cpu, int flags) +{ + call_counts[CCNT_CALLOUT_RESET_SBT_ON]++; + /* Return 1 to simulate successful timer scheduling */ + return (1); +} + +static int +test_callout_stop_safe(struct callout *c, int flags) +{ + call_counts[CCNT_CALLOUT_STOP_SAFE]++; + /* Return 1 to simulate successful timer stopping */ + return (1); +} + +static const struct tcp_hptsi_funcs test_funcs = { + .microuptime = test_microuptime, + .swi_add = test_swi_add, + .swi_remove = test_swi_remove, + .swi_sched = test_swi_sched, + .intr_event_bind = test_intr_event_bind, + .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset, + .callout_init = test_callout_init, + .callout_reset_sbt_on = test_callout_reset_sbt_on, + ._callout_stop_safe = test_callout_stop_safe, +}; + +#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare +#define TP_LOG_TEST(tp) tp->t_log_state_set + +static int +test_tcp_output(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_OUTPUT]++; + if (TP_LOG_TEST(tp)) { + KTEST_LOG(ctx, "=> tcp_output(%p)", tp); + dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */ + return (-1); /* Simulate tcp_output error */ + } + + return (0); +} + +static int +test_tfb_do_queued_segments(struct tcpcb *tp, int flag) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++; + KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag); + 
dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + + if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */ + return (-1); /* Simulate do_queued_segments error */ + } + + return (0); +} + +static struct tcp_function_block test_tcp_fb = { + .tfb_tcp_block_name = "hpts_test_tcp", + .tfb_tcp_output = test_tcp_output, + .tfb_do_queued_segments = test_tfb_do_queued_segments, +}; + +/* + * Create a minimally initialized tcpcb that can be safely inserted into HPTS. + * This function allocates and initializes all the fields that HPTS code + * reads or writes. + */ +static struct tcpcb * +test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace) +{ + struct tcpcb *tp; + + tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO); + if (tp) { + rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp", + RW_RECURSE | RW_DUPOK); + refcount_init(&tp->t_inpcb.inp_refcount, 1); + tp->t_inpcb.inp_pcbinfo = &V_tcbinfo; + tp->t_fb = &test_tcp_fb; + tp->t_hpts_cpu = HPTS_CPU_NONE; + STAILQ_INIT(&tp->t_inqueue); + tcp_hpts_init(pace, tp); + + /* Stuff some pointers in the tcb for test purposes. */ + tp->t_fb_ptr = ctx; + tp->t_tfo_pending = (unsigned int*)pace; + } + + return (tp); +} + +/* + * Free a test tcpcb created by test_hpts_create_tcpcb() + */ +static void +test_hpts_free_tcpcb(struct tcpcb *tp) +{ + if (tp == NULL) + return; + + INP_LOCK_DESTROY(&tp->t_inpcb); + free(tp, M_TCPHPTS); +} + +/* + * *********************************************** + * * KTEST functions for testing the HPTS module * + * *********************************************** + */ + +/* + * Validates that the HPTS module is properly loaded and initialized by checking + * that the minimum HPTS time is configured. + */ +KTEST_FUNC(module_load) +{ + test_hpts_init(); + KTEST_NEQUAL(tcp_min_hptsi_time, 0); + KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2); + KTEST_NEQUAL(tcp_hptsi_pace, NULL); + return (0); +} + +/* + * Validates the creation and destruction of tcp_hptsi structures, ensuring + * proper initialization of internal fields and clean destruction. + */ +KTEST_FUNC(hptsi_create_destroy) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_NEQUAL(pace->rp_ent, NULL); + KTEST_NEQUAL(pace->cts_last_ran, NULL); + KTEST_VERIFY(pace->rp_num_hptss > 0); + KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */ + KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */ + KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */ + + /* Verify individual HPTS entries are properly initialized */ + for (uint32_t i = 0; i < pace->rp_num_hptss; i++) { + KTEST_NEQUAL(pace->rp_ent[i], NULL); + KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i); + KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace); + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + } + + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcp_hptsi structures can be started and stopped properly, + * including verification that threads are created during start and cleaned up + * during stop operations. 
+ */ +KTEST_FUNC(hptsi_start_stop) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + tcp_hptsi_start(pace); + + /* Verify that entries have threads started */ + struct tcp_hpts_entry *hpts = pace->rp_ent[0]; + KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */ + KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */ + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that multiple tcp_hptsi instances can coexist independently, with + * different configurations and CPU assignments without interfering with each + * other. + */ +KTEST_FUNC(hptsi_independence) +{ + struct tcp_hptsi *pace1, *pace2; + uint16_t cpu1, cpu2; + + test_hpts_init(); + + pace1 = tcp_hptsi_create(&test_funcs, false); + pace2 = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace1, NULL); + KTEST_NEQUAL(pace2, NULL); + KTEST_NEQUAL(pace2->rp_ent, NULL); + + cpu1 = tcp_hptsi_random_cpu(pace1); + cpu2 = tcp_hptsi_random_cpu(pace2); + KTEST_VERIFY(cpu1 < pace1->rp_num_hptss); + KTEST_VERIFY(cpu2 < pace2->rp_num_hptss); + + /* Verify both instances have independent entry arrays */ + KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent); + /* Verify they may have different CPU counts but both reasonable */ + KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU); + KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU); + + tcp_hptsi_destroy(pace1); + tcp_hptsi_destroy(pace2); + + return (0); +} + +/* + * Validates that custom function injection works correctly, ensuring that + * test-specific implementations of microuptime and others are properly + * called by the HPTS system. + */ +KTEST_FUNC(function_injection) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_EQUAL(pace->funcs, &test_funcs); + KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0); + + tcp_hptsi_start(pace); + KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0); + KTEST_VERIFY(tcp_bind_threads == 0 || + call_counts[CCNT_INTR_EVENT_BIND] > 0 || + call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0); + + tcp_hptsi_stop(pace); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0); + KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0); + + tcp_hptsi_destroy(pace); + + /* Verify we have a reasonable balance of create/destroy calls */ + KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]); + + return (0); +} + +/* + * Validates that a tcpcb can be properly initialized for HPTS compatibility, + * ensuring all required fields are set correctly and function pointers are + * valid for safe HPTS operations. 
+ */ +KTEST_FUNC(tcpcb_initialization) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Verify the tcpcb is properly initialized for HPTS */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_NEQUAL(tp->t_fb, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0); + + /* Verify that HPTS-specific fields are initialized */ + KTEST_EQUAL(tp->t_hpts_gencnt, 0); + KTEST_EQUAL(tp->t_hpts_slot, 0); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_lro_cpu, 0); + KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss); + KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1); + KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED)); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcb structures can be successfully inserted into and removed + * from the HPTS wheel, with proper state tracking and slot assignment during + * the process. + */ +KTEST_FUNC(tcpcb_insertion) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + uint32_t timeout_usecs = 10; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0); + + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); + tcp_hpts_insert(pace, tp, timeout_usecs, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + KTEST_VERIFY(tcp_in_hpts(tp)); + KTEST_VERIFY(tp->t_hpts_slot >= 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs)); + //KTEST_EQUAL(tp->t_hpts_gencnt, 1); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_VERIFY(!tcp_in_hpts(tp)); + + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the core HPTS timer functionality by verifying that scheduled + * tcpcb entries trigger tcp_output calls at appropriate times, simulating + * real-world timer-driven TCP processing. 
+ */ +KTEST_FUNC(timer_functionality) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp; + int32_t slots_ran; + uint32_t i; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + dump_tcpcb(tp); + TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */ + + KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */ + tcp_hpts_insert(pace, tp, 500, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + dump_tcpcb(tp); + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(hpts->p_prev_slot, 0); + KTEST_EQUAL(hpts->p_cur_slot, 0); + KTEST_EQUAL(hpts->p_runningslot, 0); + KTEST_EQUAL(hpts->p_nxt_slot, 1); + KTEST_EQUAL(hpts->p_hpts_active, 0); + + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + + /* Set our test flag to indicate the tcpcb should be removed from the + * wheel when tcp_output is called. */ + TP_REMOVE_FROM_HPTS(tp) = 1; + + /* Test early exit condition: advance time by insufficient amount */ + KTEST_LOG(ctx, "Testing early exit with insufficient time advancement"); + test_time_usec += 1; /* Very small advancement - should cause early exit */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 slots due to insufficient time advancement */ + KTEST_EQUAL(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */ + + /* Wait for 498 more usecs and trigger the HPTS workers and verify + * nothing happens yet (total 499 usec) */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 498; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49); + KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Wait for 1 more usec and trigger the HPTS workers and verify it + * triggers tcp_output this time */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 1; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50); + 
KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into + * the HPTS wheel, testing performance under high load conditions. + */ +KTEST_FUNC(scalability_tcpcbs) +{ + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, num_tcpcbs = 100000, total_queued = 0; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to all tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create a LOT of tcpcbs */ + KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs); + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + if (tcpcbs[i] == NULL) { + KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL"); + return (EINVAL); + } + } + + /* Insert all created tcpcbs into HPTS */ + KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + /* Insert with varying future timeouts to distribute across slots */ + tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify total queue counts across all CPUs */ + for (i = 0; i < pace->rp_num_hptss; i++) { + total_queued += pace->rp_ent[i]->p_on_queue_cnt; + } + KTEST_EQUAL(total_queued, num_tcpcbs); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Remove all tcpcbs from HPTS */ + KTEST_LOG(ctx, "Removing all tcpcbs from HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify all queues are now empty */ + for (i = 0; i < pace->rp_num_hptss; i++) { + if (pace->rp_ent[i]->p_on_queue_cnt != 0) { + KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0"); + return (EINVAL); + } + } + + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates wheel wrap scenarios where the timer falls significantly behind + * and needs to process more than one full wheel revolution worth of slots. 
+ */ +KTEST_FUNC(wheel_wrap_recovery) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, timeout_usecs, num_tcpcbs = 500; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create tcpcbs and insert them across many slots */ + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; + + timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */ + + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Fast forward time significantly to trigger wheel wrap */ + test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; + + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i); + KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS-1); /* Should process all slots */ + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot, + pace->rp_ent[i]->p_prev_slot); + } + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs + * when a tcpcb is being processed by the HPTS thread but gets removed. 
+ */ +KTEST_FUNC(tcpcb_moving_state) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create two tcpcbs on the same CPU/slot */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force them to the same CPU for predictable testing */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert both into the same slot */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + hpts = pace->rp_ent[0]; + + /* Manually transition tp1 to MOVING state to simulate race condition */ + HPTS_LOCK(hpts); + tp1->t_in_hpts = IHPTS_MOVING; + tp1->t_hpts_slot = -1; /* Mark for removal */ + HPTS_UNLOCK(hpts); + + /* Set time and run HPTS to process the moving state */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Shouldn't call on both */ + + /* tp1 should be cleaned up and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + /* tp2 should have been processed normally */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are + * properly handled and re-inserted into appropriate future slots after + * the wheel processes enough slots to accommodate the original request. 
+ */ +KTEST_FUNC(deferred_requests) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp, *tp2; + struct tcp_hpts_entry *hpts; + uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */ + uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */ + uint32_t initial_request; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Insert with a request that exceeds current wheel capacity */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + /* Verify it was inserted with a deferred request */ + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_request > 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Advance time to process deferred requests */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + + /* Process the wheel to handle deferred requests */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, hpts); + KTEST_GREATER_THAN(slots_ran, 0); + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_hpts_request, 0); + + /* Test incremental deferred request processing over multiple cycles */ + KTEST_LOG(ctx, "Testing incremental deferred request processing"); + + /* Create a new connection with an even larger request */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */ + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify initial deferred request */ + initial_request = tp2->t_hpts_request; + KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS); + + /* Process one wheel cycle - should reduce but not eliminate request */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Request should be reduced but not zero */ + KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */ + + /* For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT, we need ~3 cycles to complete. + * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS. 
*/ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */ + KTEST_VERIFY(tp2->t_hpts_request < initial_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */ + + /* Clean up second connection */ + INP_WLOCK(&tp2->t_inpcb); + if (tp2->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp2); + } + INP_WUNLOCK(&tp2->t_inpcb); + test_hpts_free_tcpcb(tp2); + + /* Clean up */ + INP_WLOCK(&tp->t_inpcb); + if (tp->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp); + } + INP_WUNLOCK(&tp->t_inpcb); + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates CPU assignment and affinity mechanisms, including flowid-based + * assignment, random fallback scenarios, and explicit CPU setting. Tests + * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts. + */ +KTEST_FUNC(cpu_assignment) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2, *tp2_dup, *tp3; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + /* Test random CPU assignment (no flowid) */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE; + INP_WLOCK(&tp1->t_inpcb); + tcp_set_hpts(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET); + + /* Test flowid-based assignment */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2->t_inpcb); + tcp_set_hpts(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET); + + /* With the same flowid, should get same CPU assignment */ + tp2_dup = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2_dup, NULL); + tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2_dup->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2_dup->t_inpcb); + tcp_set_hpts(pace, tp2_dup); + INP_WUNLOCK(&tp2_dup->t_inpcb); + KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu); + + /* Test explicit CPU setting */ + tp3 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp3, NULL); + tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */ + tp3->t_flags2 |= TF2_HPTS_CPU_SET; + INP_WLOCK(&tp3->t_inpcb); + tcp_set_hpts(pace, tp3); + INP_WUNLOCK(&tp3->t_inpcb); + KTEST_EQUAL(tp3->t_hpts_cpu, 1); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + test_hpts_free_tcpcb(tp2_dup); + test_hpts_free_tcpcb(tp3); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates edge cases in slot calculation including boundary conditions + * around slot 0, maximum slots, and slot wrapping arithmetic. 
+ */ +KTEST_FUNC(slot_boundary_conditions) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Test insertion at slot 0 */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */ + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test insertion at maximum slot value */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test very small timeout values */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 1, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */ + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS behavior under high load conditions, including proper + * processing of many connections and connection count tracking. + */ +KTEST_FUNC(dynamic_sleep_adjustment) { - /* TODO: Refactor HPTS code so that it may be tested. */ - KTEST_VERIFY(tcp_min_hptsi_time != 0); + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + struct tcp_hpts_entry *hpts; + uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create many connections to exceed threshold */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */ + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */ + tcp_hpts_insert(pace, tcpcbs[i], 100, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + hpts = pace->rp_ent[0]; + dump_hpts_entry(ctx, hpts); + + /* Verify we're above threshold */ + KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD); + + /* Run HPTS to process many connections */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify HPTS processed slots and connections correctly */ + KTEST_GREATER_THAN(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs); + + /* Verify all connections were removed from queue */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates 
handling of concurrent insert/remove operations and race conditions + * between HPTS processing and user operations. + */ +KTEST_FUNC(concurrent_operations) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force all to CPU 0 */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert tp1 */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Insert tp2 into same slot */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify both are inserted */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify they're both assigned to the same slot */ + KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot); + + /* Verify queue count reflects both connections */ + KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */ + hpts = pace->rp_ent[tp1->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 2); + + /* Remove tp1 while tp2 is still there */ + INP_WLOCK(&tp1->t_inpcb); + tcp_hpts_remove(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify tp1 removed, tp2 still there */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify queue count decreased by one */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Remove tp2 */ + INP_WLOCK(&tp2->t_inpcb); + tcp_hpts_remove(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + /* Verify queue is now completely empty */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the queued segments processing path via tfb_do_queued_segments, + * which is an alternative to direct tcp_output calls. 
+ */ +KTEST_FUNC(queued_segments_processing) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + struct mbuf *fake_mbuf; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Create a minimal fake mbuf that has valid STAILQ pointers */ + fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_NEQUAL(fake_mbuf, NULL); + + /* Set up for queued segments path */ + tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ); + STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_insert(pace, tp, 100, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Run HPTS and verify queued segments path is taken */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1); + + /* Connection should be removed from HPTS after processing */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + + /* Clean up the fake mbuf if it's still in the queue */ + if (!STAILQ_EMPTY(&tp->t_inqueue)) { + struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue); + STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); + free(m, M_TCPHPTS); + } + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the direct wake mechanism and wake inhibition logic when + * the connection count exceeds thresholds. + */ +KTEST_FUNC(direct_wake_mechanism) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Test direct wake when not over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 50; /* Below threshold */ + hpts->p_hpts_wake_scheduled = 0; + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + HPTS_UNLOCK(hpts); + + /* Reset for next test */ + hpts->p_hpts_wake_scheduled = 0; + call_counts[CCNT_SWI_SCHED] = 0; + + /* Test wake inhibition when over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 200; /* Above threshold */ + hpts->p_direct_wake = 1; /* Request direct wake */ + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */ + KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */ + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */ + HPTS_UNLOCK(hpts); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS collision detection when attempting to run HPTS while + * it's already active. 
+ */ +KTEST_FUNC(hpts_collision_detection) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Mark HPTS as active */ + HPTS_LOCK(hpts); + hpts->p_hpts_active = 1; + HPTS_UNLOCK(hpts); + + /* Attempt to run HPTS again - should detect collision */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */ + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 indicating no work done due to collision */ + KTEST_EQUAL(slots_ran, 0); + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates generation count handling for race condition detection between + * HPTS processing and connection insertion/removal operations. + */ +KTEST_FUNC(generation_count_validation) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp1, *tp2; + uint32_t initial_gencnt, slot_to_test = 10; + uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT; + uint32_t tp2_original_gencnt; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Record initial generation count for the test slot */ + initial_gencnt = hpts->p_hptss[slot_to_test].gencnt; + + /* Create and insert first connection */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_hpts_cpu = 0; /* Force to CPU 0 */ + + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, timeout_usecs, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify connection stored the generation count */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test); + KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt); + + /* Create second connection but don't insert yet */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = 0; /* Force to CPU 0 */ + + /* Force generation count increment by processing the slot */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify processing occurred */ + KTEST_VERIFY(slots_ran > 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + + /* Verify generation count was incremented */ + KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1); + + /* Verify first connection was processed and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + + /* Insert second connection and record its generation count */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify connection was inserted successfully */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Record the generation count that tp2 received */ + tp2_original_gencnt = tp2->t_hpts_gencnt; + + /* Test generation count mismatch detection during processing */ + /* Manually set stale generation count to simulate race condition */ + tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */ + + /* Process the slot to trigger generation count validation */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + 
NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Connection should be processed despite generation count mismatch */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */ + + /* The key test: HPTS should handle mismatched generation counts gracefully */ + KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */ + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + return (0); } static const struct ktest_test_info tests[] = { - { - .name = "test_hpts_init", - .desc = "Tests HPTS initialization and cleanup", - .func = &test_hpts_init, - }, + KTEST_INFO(module_load), + KTEST_INFO(hptsi_create_destroy), + KTEST_INFO(hptsi_start_stop), + KTEST_INFO(hptsi_independence), + KTEST_INFO(function_injection), + KTEST_INFO(tcpcb_initialization), + KTEST_INFO(tcpcb_insertion), + KTEST_INFO(timer_functionality), + KTEST_INFO(scalability_tcpcbs), + KTEST_INFO(wheel_wrap_recovery), + KTEST_INFO(tcpcb_moving_state), + KTEST_INFO(deferred_requests), + KTEST_INFO(cpu_assignment), + KTEST_INFO(slot_boundary_conditions), + KTEST_INFO(dynamic_sleep_adjustment), + KTEST_INFO(concurrent_operations), + KTEST_INFO(queued_segments_processing), + KTEST_INFO(direct_wake_mechanism), + KTEST_INFO(hpts_collision_detection), + KTEST_INFO(generation_count_validation), }; KTEST_MODULE_DECLARE(ktest_tcphpts, tests); diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index 43587285fe26..ac1a27a4290a 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -29,6 +29,8 @@ #include "opt_inet6.h" #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> @@ -62,6 +64,7 @@ #include <netinet/tcp_lro.h> #include <netinet/tcp_var.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #ifdef TCP_BLACKBOX #include <netinet/tcp_log_buf.h> #endif diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f2d7867df9b4..66983edcdd73 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -480,7 +480,7 @@ bbr_find_lowest_rsm(struct tcp_bbr *bbr); static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which); static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, @@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); static void @@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr) } static void -bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) +bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len) { struct inpcb *inp = tptoinpcb(tp); struct hpts_diag diag; @@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = 0; prev_delay = 
bbr->r_ctl.rc_last_delay_val; if (bbr->r_ctl.rc_last_delay_val && - (slot == 0)) { + (pacing_delay == 0)) { /* * If a previous pacer delay was in place we * are not coming from the output side (where * we calculate a delay, more likely a timer). */ - slot = bbr->r_ctl.rc_last_delay_val; + pacing_delay = bbr->r_ctl.rc_last_delay_val; if (TSTMP_GT(cts, bbr->rc_pacer_started)) { /* Compensate for time passed */ delay_calc = cts - bbr->rc_pacer_started; - if (delay_calc <= slot) - slot -= delay_calc; + if (delay_calc <= pacing_delay) + pacing_delay -= delay_calc; } } /* Do we have early to make up for by pushing out the pacing time? */ if (bbr->r_agg_early_set) { - bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); - slot += bbr->r_ctl.rc_agg_early; + bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2); + pacing_delay += bbr->r_ctl.rc_agg_early; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; } /* Are we running a total debt that needs to be compensated for? */ if (bbr->r_ctl.rc_hptsi_agg_delay) { - if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { + if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) { /* We nuke the delay */ - slot -= bbr->r_ctl.rc_hptsi_agg_delay; + pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } else { /* We nuke some of the delay, put in a minimal 100usecs */ - bbr->r_ctl.rc_hptsi_agg_delay -= slot; - bbr->r_ctl.rc_last_delay_val = slot = 100; + bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay; + bbr->r_ctl.rc_last_delay_val = pacing_delay = 100; } } - bbr->r_ctl.rc_last_delay_val = slot; + bbr->r_ctl.rc_last_delay_val = pacing_delay; hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (bbr->rc_in_persist == 0) { @@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; hpts_timeout = delayed_ack; } - if (slot) { + if (pacing_delay) { /* Mark that we have a pacing timer up */ BBR_STAT_INC(bbr_paced_segments); bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; @@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (pacing_delay == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ if (left < hpts_timeout) hpts_timeout = left; } - if (bbr->r_ctl.rc_incr_tmrs && slot && + if (bbr->r_ctl.rc_incr_tmrs && pacing_delay && (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * If configured to do so, and the timer is either @@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * this extra delay but this is easier and being more * conservative is probably better. */ - hpts_timeout += slot; + hpts_timeout += pacing_delay; } if (hpts_timeout) { /* @@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } else bbr->r_ctl.rc_timer_exp = 0; - if ((slot) && + if ((pacing_delay) && (bbr->rc_use_google || bbr->output_error_seen || - (slot <= hpts_timeout)) ) { + (pacing_delay <= hpts_timeout)) ) { /* * Tell LRO that it can queue packets while * we pace. 
@@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, pacing_delay, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); /* - * We add the flag here as well if the slot is set, + * We add the flag here as well if the pacing delay is set, * since hpts will call in to clear the queue first before * calling the output routine (which does our timers). * We don't want to set the flag if its just a timer @@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * on a keep-alive timer and a request comes in for * more data. */ - if (slot) + if (pacing_delay) bbr->rc_pacer_started = cts; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { @@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ TF2_DONT_SACK_QUEUE); } bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0); bbr_log_hpts_diag(bbr, cts, &diag); bbr->rc_timer_first = 1; } bbr->rc_tmr_stopped = 0; - bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); + bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay); } static void @@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock } /* * Ok the timer originally started is not what we want now. We will - * force the hpts to be stopped if any, and restart with the slot - * set to what was in the saved slot. + * force the hpts to be stopped if any, and restart with the pacing + * delay set to what was in the saved delay. 
*/ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { @@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.bw_inuse = diag->wheel_slot; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, @@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, } static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; @@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_tp->t_flags2; @@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, } static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = del_by; log.u_bbr.flex3 = prev_delay; log.u_bbr.flex4 = line; @@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t left = bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); - tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); return (1); } bbr->rc_tmr_stopped = 0; @@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) else time_since_send = 0; if (bbr->r_ctl.rc_last_delay_val > time_since_send) { - /* Cut down our slot time */ + /* Cut down our pacing_delay time */ bbr->r_ctl.rc_last_delay_val -= time_since_send; } else { bbr->r_ctl.rc_last_delay_val = 0; @@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next - * slot (11). + * pacing delay (11). 
* */ INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) struct bbr_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; struct inpcb *inp; struct sockbuf *sb; bool hpts_calling; @@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) delay_calc -= bbr->r_ctl.rc_last_delay_val; else { /* - * We are early setup to adjust - * our slot time. + * We are early setup to adjust our pacing delay. */ uint64_t merged_val; @@ -12104,7 +12098,7 @@ again: #endif error = 0; tso = 0; - slot = 0; + pacing_delay = 0; mtu = 0; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sb_offset = tp->snd_max - tp->snd_una; @@ -12126,7 +12120,7 @@ recheck_resend: tot_len = tp->t_maxseg; if (hpts_calling) /* Retry in a ms */ - slot = 1001; + pacing_delay = 1001; goto just_return_nolock; } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); @@ -12699,9 +12693,9 @@ just_return: SOCK_SENDBUF_UNLOCK(so); just_return_nolock: if (tot_len) - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; if (tot_len == 0) { if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= tp->snd_wnd) { @@ -12751,7 +12745,7 @@ just_return_nolock: /* Dont update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len); bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ @@ -12787,7 +12781,7 @@ send: flags &= ~TH_FIN; if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { /* Lets not send this */ - slot = 0; + pacing_delay = 0; goto just_return; } } @@ -13053,7 +13047,7 @@ send: /* * We have outstanding data, don't send a fin by itself!.
*/ - slot = 0; + pacing_delay = 0; goto just_return; } /* @@ -13763,7 +13757,7 @@ nomore: if (tp->snd_cwnd < maxseg) tp->snd_cwnd = maxseg; } - slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; BBR_STAT_INC(bbr_saw_enobuf); if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_hdwr_pacing_enobuf, 1); @@ -13812,18 +13806,18 @@ nomore: } /* * Nuke all other things that can interfere - * with slot + * with pacing delay */ if ((tot_len + len) && (len >= tp->t_maxseg)) { - slot = bbr_get_pacing_delay(bbr, + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, (tot_len + len), cts, 0); - if (slot < bbr_error_base_paceout) - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + if (pacing_delay < bbr_error_base_paceout) + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; } else - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 10, slot, + bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay, tot_len); return (error); } @@ -13841,9 +13835,9 @@ nomore: } /* FALLTHROUGH */ default: - slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); + bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0); return (error); } #ifdef STATS @@ -13981,12 +13975,12 @@ skip_again: tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* - * Calculate/Re-Calculate the hptsi slot in usecs based on + * Calculate/Re-Calculate the hptsi timeout in usecs based on * what we have sent so far */ - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: @@ -13999,8 +13993,8 @@ enobufs: (more_to_rxt || ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { /* Rack cheats and shotguns out all rxt's 1ms apart */ - if (slot > 1000) - slot = 1000; + if (pacing_delay > 1000) + pacing_delay = 1000; } if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { /* @@ -14014,7 +14008,7 @@ enobufs: tcp_bbr_tso_size_check(bbr, cts); } } - bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; @@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); bbr_log_hpts_diag(bbr, cts, &diag); } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 11ef5ba706c5..c7962b57a69e 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -250,11 +250,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co static int32_t rack_persist_min = 250000; /* 250usec */ static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */ -static uint32_t rack_max_reduce = 10; /* 
Percent we can reduce slot by */ +static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ @@ -278,7 +278,7 @@ static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ -static int32_t rack_slot_reduction = 4; +static int32_t rack_pacing_delay_reduction = 4; static int32_t rack_wma_divisor = 8; /* For WMA calculation */ static int32_t rack_cwnd_block_ends_measure = 0; static int32_t rack_rwnd_block_ends_measure = 0; @@ -478,7 +478,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint16_t flex7, uint8_t mod); static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality); static struct rack_sendmap * @@ -1107,7 +1107,7 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "burst_reduces", CTLFLAG_RW, - &rack_slot_reduction, 4, + &rack_pacing_delay_reduction, 4, "When doing only burst mitigation what is the reduce divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1399,7 +1399,7 @@ rack_init_sysctls(void) SYSCTL_CHILDREN(rack_timers), OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, &rack_max_reduce, 10, - "Max percentage we will reduce slot by for pacing when we are behind"); + "Max percentage we will reduce pacing delay by for pacing when we are behind"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, @@ -2700,7 +2700,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t } static void -rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -2710,7 +2710,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.flex1 = rack->rc_tp->t_srtt; log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; @@ -3034,14 +3034,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, } static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, 
int line) { if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; else @@ -3139,7 +3139,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg } static void -rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay, uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) { if (tcp_bblogging_on(rack->rc_tp)) { @@ -3148,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; if (rack->rack_no_prr) @@ -6482,7 +6482,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -6497,9 +6497,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.timeStamp = cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -6532,14 +6529,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin static void rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, - int32_t slot, uint32_t tot_len_this_send, int sup_rack) + int32_t usecs, uint32_t tot_len_this_send, int sup_rack) { struct hpts_diag diag; struct inpcb *inp = tptoinpcb(tp); struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; - uint32_t entry_slot = slot; + uint32_t entry_usecs = usecs; uint8_t stopped; uint32_t left = 0; uint32_t us_cts; @@ -6560,7 +6557,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, rack->r_ctl.rc_hpts_flags = 0; us_cts = tcp_get_usecs(&tv); /* Now early/late accounting */ - rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0); if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { /* * We have a early carry over set, @@ -6571,7 +6568,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * penalize the next timer for being awoke * by an ack aka the rc_agg_early (non-paced mode). */ - slot += rack->r_ctl.rc_agg_early; + usecs += rack->r_ctl.rc_agg_early; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; } @@ -6583,29 +6580,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * really depends on what * the current pacing time is. */ - if (rack->r_ctl.rc_agg_delayed >= slot) { + if (rack->r_ctl.rc_agg_delayed >= usecs) { /* * We can't compensate for it all. * And we have to have some time * on the clock. We always have a min - * 10 slots (10 x 10 i.e. 100 usecs). 
+ * 10 HPTS timer units (10 x 10 i.e. 100 usecs). */ - if (slot <= HPTS_USECS_PER_SLOT) { + if (usecs <= HPTS_USECS_PER_SLOT) { /* We gain delay */ - rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs); + usecs = HPTS_USECS_PER_SLOT; } else { /* We take off some */ - rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT); + usecs = HPTS_USECS_PER_SLOT; } } else { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; /* Make sure we have 100 useconds at minimum */ - if (slot < HPTS_USECS_PER_SLOT) { - rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; - slot = HPTS_USECS_PER_SLOT; + if (usecs < HPTS_USECS_PER_SLOT) { + rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs; + usecs = HPTS_USECS_PER_SLOT; } if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; @@ -6614,17 +6611,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, /* r_use_hpts_min is on and so is DGP */ uint32_t max_red; - max_red = (slot * rack->r_ctl.max_reduction) / 100; + max_red = (usecs * rack->r_ctl.max_reduction) / 100; if (max_red >= rack->r_ctl.rc_agg_delayed) { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; } else { - slot -= max_red; + usecs -= max_red; rack->r_ctl.rc_agg_delayed -= max_red; } } if ((rack->r_use_hpts_min == 1) && - (slot > 0) && + (usecs > 0) && (rack->dgp_on == 1)) { /* * We are enforcing a min pacing timer @@ -6633,8 +6630,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, uint32_t min; min = get_hpts_min_sleep_time(); - if (min > slot) { - slot = min; + if (min > usecs) { + usecs = min; } } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); @@ -6652,7 +6649,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (usecs == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -6709,10 +6706,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } - rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); if ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0) && - (hpts_timeout < slot) && + (hpts_timeout < usecs) && (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * We have no good estimate yet for the @@ -6722,7 +6719,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * pace that long since we know the calculation * so far is not accurate. */ - slot = hpts_timeout; + usecs = hpts_timeout; } /** * Turn off all the flags for queuing by default. The @@ -6754,11 +6751,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * so LRO can call into us. 
*/ tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); - if (slot) { + if (usecs) { rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; - rack->r_ctl.rc_last_output_to = us_cts + slot; + rack->r_ctl.rc_last_output_to = us_cts + usecs; /* - * A pacing timer (slot) is being set, in + * A pacing timer (usecs microseconds) is being set, in * such a case we cannot send (we are blocked by * the timer). So lets tell LRO that it should not * wake us unless there is a SACK. Note this only @@ -6799,20 +6796,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } if ((rack->use_rack_rr) && (rack->r_rr_config < 2) && - ((hpts_timeout) && (hpts_timeout < slot))) { + ((hpts_timeout) && (hpts_timeout < usecs))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, usecs, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 1); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 1); } } else if (hpts_timeout) { /* @@ -6824,22 +6819,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * at the start of this block) are good enough. */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { - panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", - tp, rack, tot_len_this_send, cts, slot, hpts_timeout); + panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?", + tp, rack, tot_len_this_send, cts, usecs, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; - if (slot) - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); + if (usecs) + rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__); } static void @@ -8016,7 +8010,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; ret = -3; left = rack->r_ctl.rc_timer_exp - cts; - tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); rack_log_to_processing(rack, cts, ret, left); return (1); } @@ -14377,8 +14371,7 @@ rack_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); rack_log_hpts_diag(rack, cts, &diag, &tv); } @@ -14973,8 +14966,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (tov) { struct hpts_diag diag; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), - __LINE__, &diag); + tcp_hpts_insert(tp, tov, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); } } @@ -16367,7 +16359,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct rack_sendmap *rsm; int32_t prev_state = 0; 
int no_output = 0; - int slot_remaining = 0; + int time_remaining = 0; #ifdef TCP_ACCOUNTING int ack_val_set = 0xf; #endif @@ -16416,7 +16408,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * could be, if a sack is present, we want to be awoken and * so should process the packets. */ - slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; + time_remaining = rack->r_ctl.rc_last_output_to - us_cts; if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { no_output = 1; } else { @@ -16436,7 +16428,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, (*ts_ptr == TCP_LRO_TS_OPTION))) no_output = 1; } - if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { + if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) { /* * It is unrealistic to think we can pace in less than * the minimum granularity of the pacer (def:250usec). So @@ -16919,10 +16911,10 @@ do_output_now: (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use - * the remaining time (slot_remaining) to restart the timer. + * the remaining time (time_remaining) to restart the timer. */ - KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); - rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); + KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); + rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0); rack_free_trim(rack); } /* Clear the flag, it may have been cleared by output but we may not have */ @@ -17102,7 +17094,7 @@ check_it: } static void -rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality) { @@ -17125,7 +17117,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, } } memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; @@ -17284,25 +17276,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin } static int32_t -pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) { uint64_t lentim, fill_bw; rack->r_via_fill_cw = 0; if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) - return (slot); + return (pacing_delay); if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) - return (slot); + return (pacing_delay); if (rack->r_ctl.rc_last_us_rtt == 0) - return (slot); + return (pacing_delay); if (rack->rc_pace_fill_if_rttin_range && (rack->r_ctl.rc_last_us_rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { /* The rtt is huge, N * smallest, lets not fill */ - return (slot); + return (pacing_delay); } if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) - return (slot); + return (pacing_delay); /* * first lets calculate the b/w based on the last us-rtt * and the the smallest send window. 
@@ -17368,7 +17360,7 @@ at_lt_bw: if (non_paced) *rate_wanted = fill_bw; if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) - return (slot); + return (pacing_delay); rack->r_via_fill_cw = 1; if (rack->r_rack_hw_rate_caps && (rack->r_ctl.crte != NULL)) { @@ -17423,19 +17415,19 @@ at_lt_bw: lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; lentim /= fill_bw; *rate_wanted = fill_bw; - if (non_paced || (lentim < slot)) { - rack_log_pacing_delay_calc(rack, len, slot, fill_bw, + if (non_paced || (lentim < pacing_delay)) { + rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw, 0, lentim, 12, __LINE__, NULL, 0); return ((int32_t)lentim); } else - return (slot); + return (pacing_delay); } static int32_t rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) { uint64_t srtt; - int32_t slot = 0; + int32_t pacing_delay = 0; int can_start_hw_pacing = 1; int err; int pace_one; @@ -17483,25 +17475,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * cwnd. Which in that case we are just waiting for * a ACK. */ - slot = len / tr_perms; + pacing_delay = len / tr_perms; /* Now do we reduce the time so we don't run dry? */ - if (slot && rack_slot_reduction) { - reduce = (slot / rack_slot_reduction); - if (reduce < slot) { - slot -= reduce; + if (pacing_delay && rack_pacing_delay_reduction) { + reduce = (pacing_delay / rack_pacing_delay_reduction); + if (reduce < pacing_delay) { + pacing_delay -= reduce; } else - slot = 0; + pacing_delay = 0; } else reduce = 0; - slot *= HPTS_USEC_IN_MSEC; + pacing_delay *= HPTS_USEC_IN_MSEC; if (rack->rc_pace_to_cwnd) { uint64_t rate_wanted = 0; - slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1); rack->rc_ack_can_sendout_data = 1; - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); } else - rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); /*******************************************************/ /* RRS: We insert non-paced call to stats here for len */ /*******************************************************/ @@ -17575,7 +17567,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str segs *= oh; lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; res = lentim / rate_wanted; - slot = (uint32_t)res; + pacing_delay = (uint32_t)res; if (rack_hw_rate_min && (rate_wanted < rack_hw_rate_min)) { can_start_hw_pacing = 0; @@ -17635,7 +17627,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * We want to pace at our rate *or* faster to * fill the cwnd to the max if its not full. 
*/ - slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0); /* Re-check to make sure we are not exceeding our max b/w */ if ((rack->r_ctl.crte != NULL) && (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { @@ -17786,15 +17778,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str srtt = rack->rc_tp->t_srtt; else srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ - if (srtt < (uint64_t)slot) { - rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); - slot = srtt; + if (srtt < (uint64_t)pacing_delay) { + rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); + pacing_delay = srtt; } } /*******************************************************************/ /* RRS: We insert paced call to stats here for len and rate_wanted */ /*******************************************************************/ - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); } if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { /* @@ -17811,9 +17803,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str hw_boost_delay = rack_enobuf_hw_max; else if (hw_boost_delay < rack_enobuf_hw_min) hw_boost_delay = rack_enobuf_hw_min; - slot += hw_boost_delay; + pacing_delay += hw_boost_delay; } - return (slot); + return (pacing_delay); } static void @@ -18482,7 +18474,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma struct tcpopt to; u_char opt[TCP_MAXOLEN]; uint32_t hdrlen, optlen; - int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; + int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0; uint16_t flags; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; @@ -18688,9 +18680,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } if (rack->r_ctl.crte != NULL) { /* See if we can send via the hw queue */ - slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); + pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); /* If there is nothing in queue (no pacing time) we can send via the hw queue */ - if (slot == 0) + if (pacing_delay == 0) ip_sendflag = 0; } tcp_set_flags(th, flags); @@ -18955,20 +18947,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma rack_log_queue_level(tp, rack, len, tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); } counter_u64_add(rack_saw_enobuf, 1); } else { - slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); } - rack_start_hpts_timer(rack, tp, cts, slot, len, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0); #ifdef 
TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19071,7 +19063,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, #ifdef TCP_ACCOUNTING int cnt_thru = 1; #endif - int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; + int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; uint16_t flags; uint32_t s_soff; uint32_t if_hw_tsomaxsegcount = 0, startseq; @@ -19519,8 +19511,8 @@ again: } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); - rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); + pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19707,7 +19699,7 @@ rack_output(struct tcpcb *tp) struct rack_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; int32_t sup_rack = 0; uint32_t cts, ms_cts, delayed, early; uint32_t add_flag = RACK_SENT_SP; @@ -20070,7 +20062,7 @@ again: if (rsm == NULL) { if (hpts_calling) /* Retry in a ms */ - slot = (1 * HPTS_USEC_IN_MSEC); + pacing_delay = (1 * HPTS_USEC_IN_MSEC); so = inp->inp_socket; sb = &so->so_snd; goto just_return_nolock; @@ -20877,7 +20869,7 @@ just_return_nolock: } if (tot_len_this_send > 0) { rack->r_ctl.fsb.recwin = recwin; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); if ((error == 0) && rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && @@ -21060,8 +21052,8 @@ just_return_nolock: /* Yes lets make sure to move to persist before timer-start */ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); } - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); - rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack); + rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use); } #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && @@ -21100,8 +21092,8 @@ send: * we come around to again, the flag will be clear. 
*/ check_done = 1; - slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); - if (slot) { + pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); + if (pacing_delay) { rack->r_ctl.rc_agg_delayed = 0; rack->r_ctl.rc_agg_early = 0; rack->r_early = 0; @@ -22358,11 +22350,11 @@ nomore: rack_log_queue_level(tp, rack, len, &tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); @@ -22389,8 +22381,8 @@ nomore: goto again; } } - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22412,8 +22404,8 @@ nomore: } /* FALLTHROUGH */ default: - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22456,18 +22448,18 @@ enobufs: /* * We don't send again after sending a RST. */ - slot = 0; + pacing_delay = 0; sendalot = 0; if (error == 0) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); - } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { + } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) { /* * Get our pacing rate, if an error * occurred in sending (ENOBUF) we would * hit the else if with slot preset. Other * errors return. 
*/ - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); } /* We have sent clear the flag */ rack->r_ent_rec_ns = 0; @@ -22499,7 +22491,7 @@ enobufs: */ tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); } - if (slot) { + if (pacing_delay) { /* set the rack tcb into the slot N */ if ((error == 0) && rack_use_rfo && @@ -22564,7 +22556,7 @@ skip_all_send: /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount() - ts_val; if (tot_len_this_send) { diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c index 1cee7873de31..8bd27f6885ab 100644 --- a/sys/netpfil/ipfw/ip_fw_nat.c +++ b/sys/netpfil/ipfw/ip_fw_nat.c @@ -999,9 +999,11 @@ ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; - int i; + int error, i; - sooptcopyin(sopt, &i, sizeof i, sizeof i); + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error != 0) + return (error); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); @@ -1104,7 +1106,7 @@ ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; - int i, size; + int error, i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; @@ -1134,9 +1136,9 @@ ipfw_nat_get_log(struct sockopt *sopt) i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); - sooptcopyout(sopt, data, size); + error = sooptcopyout(sopt, data, size); free(data, M_IPFW); - return(0); + return (error); } static int diff --git a/sys/rpc/auth.h b/sys/rpc/auth.h index 33c33ffd594d..648fb99a3a27 100644 --- a/sys/rpc/auth.h +++ b/sys/rpc/auth.h @@ -354,6 +354,10 @@ __END_DECLS #define RPCSEC_GSS 6 /* RPCSEC_GSS */ #define AUTH_TLS 7 /* Initiate RPC-over-TLS */ +/* RFC 5531's prescribed limits for variable-lenth arrays. */ +#define AUTH_SYS_MAX_HOSTNAME 255 +#define AUTH_SYS_MAX_GROUPS 16 /* Supplementary groups. */ + /* * Pseudo auth flavors for RPCSEC_GSS. */ diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c index b107d5541c50..ff4c12c3f52e 100644 --- a/sys/rpc/authunix_prot.c +++ b/sys/rpc/authunix_prot.c @@ -30,7 +30,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> /* * authunix_prot.c * XDR for UNIX style authentication parameters for RPC @@ -40,8 +39,7 @@ #include <sys/param.h> #include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/systm.h> +#include <sys/libkern.h> #include <sys/ucred.h> #include <rpc/types.h> @@ -50,9 +48,6 @@ #include <rpc/rpc_com.h> -/* gids compose part of a credential; there may not be more than 16 of them */ -#define NGRPS 16 - /* * XDR for unix authentication parameters. */ @@ -60,25 +55,23 @@ bool_t xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) { uint32_t namelen; - uint32_t ngroups, i; + uint32_t supp_ngroups, i; uint32_t junk; char hostbuf[MAXHOSTNAMELEN]; + if (xdrs->x_op == XDR_FREE) + /* This function does not allocate auxiliary memory. */ + return (TRUE); + if (xdrs->x_op == XDR_ENCODE) { - /* - * Restrict name length to 255 according to RFC 1057. 
- */ getcredhostname(NULL, hostbuf, sizeof(hostbuf)); namelen = strlen(hostbuf); - if (namelen > 255) - namelen = 255; - } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + namelen = AUTH_SYS_MAX_HOSTNAME; + } else namelen = 0; - } - junk = 0; - if (!xdr_uint32_t(xdrs, time) - || !xdr_uint32_t(xdrs, &namelen)) + if (!xdr_uint32_t(xdrs, time) || !xdr_uint32_t(xdrs, &namelen)) return (FALSE); /* @@ -88,43 +81,65 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (!xdr_opaque(xdrs, hostbuf, namelen)) return (FALSE); } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + return (FALSE); xdr_setpos(xdrs, xdr_getpos(xdrs) + RNDUP(namelen)); } if (!xdr_uint32_t(xdrs, &cred->cr_uid)) return (FALSE); + + /* + * Safety check: The protocol needs at least one group (access to + * 'cr_gid', decrementation of 'cr_ngroups' below). + */ + if (xdrs->x_op == XDR_ENCODE && cred->cr_ngroups == 0) + return (FALSE); if (!xdr_uint32_t(xdrs, &cred->cr_gid)) return (FALSE); if (xdrs->x_op == XDR_ENCODE) { /* - * Note that this is a `struct xucred`, which maintains its - * historical layout of preserving the egid in cr_ngroups and - * cr_groups[0] == egid. + * Note that this is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. We substract 1 to obtain + * the number of "supplementary" groups, passed in the AUTH_SYS + * credentials variable-length array called gids[] in RFC 5531. */ - ngroups = cred->cr_ngroups - 1; - if (ngroups > NGRPS) - ngroups = NGRPS; + MPASS(cred->cr_ngroups <= XU_NGROUPS); + supp_ngroups = cred->cr_ngroups - 1; + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + /* With current values, this should never execute. */ + supp_ngroups = AUTH_SYS_MAX_GROUPS; } - if (!xdr_uint32_t(xdrs, &ngroups)) + if (!xdr_uint32_t(xdrs, &supp_ngroups)) return (FALSE); - for (i = 0; i < ngroups; i++) { - if (i < ngroups_max) { - if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1])) - return (FALSE); - } else { - if (!xdr_uint32_t(xdrs, &junk)) - return (FALSE); - } - } - if (xdrs->x_op == XDR_DECODE) { - if (ngroups > ngroups_max) - cred->cr_ngroups = ngroups_max + 1; - else - cred->cr_ngroups = ngroups + 1; - } + /* + * Because we cannot store more than XU_NGROUPS in total (16 at time of + * this writing), for now we choose to be strict with respect to RFC + * 5531's maximum number of supplementary groups (AUTH_SYS_MAX_GROUPS). + * That would also be an accidental DoS prevention measure if the + * request handling code didn't try to reassemble it in full without any + * size limits. Although AUTH_SYS_MAX_GROUPS and XU_NGROUPS are equal, + * since the latter includes the "effective" GID, we cannot store the + * last group of a message with exactly AUTH_SYS_MAX_GROUPS + * supplementary groups. We accept such messages so as not to violate + * the protocol, silently dropping the last group on the floor. + */ + + if (xdrs->x_op != XDR_ENCODE && supp_ngroups > AUTH_SYS_MAX_GROUPS) + return (FALSE); + + junk = 0; + for (i = 0; i < supp_ngroups; ++i) + if (!xdr_uint32_t(xdrs, i < XU_NGROUPS - 1 ? 
+ &cred->cr_sgroups[i] : &junk)) + return (FALSE); + + if (xdrs->x_op != XDR_ENCODE) + cred->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); return (TRUE); } diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c index 963f4f272964..aa0fc585865f 100644 --- a/sys/rpc/svc_auth_unix.c +++ b/sys/rpc/svc_auth_unix.c @@ -41,18 +41,12 @@ */ #include <sys/param.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/systm.h> #include <sys/ucred.h> #include <rpc/rpc.h> #include <rpc/rpc_com.h> -#define MAX_MACHINE_NAME 255 -#define NGRPS 16 - /* * Unix longhand authenticator */ @@ -62,11 +56,8 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) enum auth_stat stat; XDR xdrs; int32_t *buf; - uint32_t time; struct xucred *xcr; - u_int auth_len; - size_t str_len, gid_len; - u_int i; + uint32_t auth_len, time; xcr = rqst->rq_clntcred; auth_len = (u_int)msg->rm_call.cb_cred.oa_length; @@ -74,51 +65,58 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) XDR_DECODE); buf = XDR_INLINE(&xdrs, auth_len); if (buf != NULL) { + /* 'time', 'str_len', UID, GID and 'supp_ngroups'. */ + const uint32_t min_len = 5 * BYTES_PER_XDR_UNIT; + uint32_t str_len, supp_ngroups; + + if (auth_len < min_len) + goto badcred; time = IXDR_GET_UINT32(buf); - str_len = (size_t)IXDR_GET_UINT32(buf); - if (str_len > MAX_MACHINE_NAME) { - stat = AUTH_BADCRED; - goto done; - } + str_len = IXDR_GET_UINT32(buf); + if (str_len > AUTH_SYS_MAX_HOSTNAME) + goto badcred; str_len = RNDUP(str_len); + /* + * Recheck message length now that we know the value of + * 'str_len' (and that it won't cause an overflow in additions + * below) to protect access to the credentials part. + */ + if (auth_len < min_len + str_len) + goto badcred; buf += str_len / sizeof (int32_t); xcr->cr_uid = IXDR_GET_UINT32(buf); xcr->cr_gid = IXDR_GET_UINT32(buf); - gid_len = (size_t)IXDR_GET_UINT32(buf); - if (gid_len > NGRPS) { - stat = AUTH_BADCRED; - goto done; - } - for (i = 0; i < gid_len; i++) { - /* - * Note that this is a `struct xucred`, which maintains - * its historical layout of preserving the egid in - * cr_ngroups and cr_groups[0] == egid. - */ - if (i + 1 < XU_NGROUPS) - xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf); - else - buf++; - } - if (gid_len + 1 > XU_NGROUPS) - xcr->cr_ngroups = XU_NGROUPS; - else - xcr->cr_ngroups = gid_len + 1; + supp_ngroups = IXDR_GET_UINT32(buf); + /* + * See the herald comment before a similar test at the end of + * xdr_authunix_parms() for why we strictly respect RFC 5531 and + * why we may have to drop the last supplementary group when + * there are AUTH_SYS_MAX_GROUPS of them. + */ + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + goto badcred; + /* + * Final message length check, as we now know how much we will + * read in total. + */ + if (auth_len < min_len + str_len + + supp_ngroups * BYTES_PER_XDR_UNIT) + goto badcred; /* - * five is the smallest unix credentials structure - - * timestamp, hostname len (0), uid, gid, and gids len (0). + * Note that 'xcr' is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. */ - if ((5 + gid_len) * BYTES_PER_XDR_UNIT + str_len > auth_len) { - (void) printf("bad auth_len gid %ld str %ld auth %u\n", - (long)gid_len, (long)str_len, auth_len); - stat = AUTH_BADCRED; - goto done; + for (uint32_t i = 0; i < supp_ngroups; ++i) { + if (i < XU_NGROUPS - 1) + xcr->cr_sgroups[i] = IXDR_GET_INT32(buf); + else + buf++; } - } else if (! 
xdr_authunix_parms(&xdrs, &time, xcr)) { - stat = AUTH_BADCRED; - goto done; - } + xcr->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); + } else if (!xdr_authunix_parms(&xdrs, &time, xcr)) + goto badcred; rqst->rq_verf = _null_auth; stat = AUTH_OK; @@ -126,6 +124,10 @@ done: XDR_DESTROY(&xdrs); return (stat); + +badcred: + stat = AUTH_BADCRED; + goto done; } diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index 2845a9dbc1e2..9e2a233248b4 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -86,7 +86,7 @@ typedef struct { struct sysentvec *sysvec; const char *interp_newpath; int flags; - Elf_Brandnote *brand_note; + const Elf_Brandnote *brand_note; bool (*header_supported)(const struct image_params *, const int32_t *, const uint32_t *); /* High 8 bits of flags is private to the ABI */ @@ -111,9 +111,9 @@ struct sseg_closure { size_t size; /* Total size of all writable segments. */ }; -bool __elfN(brand_inuse)(Elf_Brandinfo *entry); -int __elfN(insert_brand_entry)(Elf_Brandinfo *entry); -int __elfN(remove_brand_entry)(Elf_Brandinfo *entry); +bool __elfN(brand_inuse)(const Elf_Brandinfo *entry); +int __elfN(insert_brand_entry)(const Elf_Brandinfo *entry); +int __elfN(remove_brand_entry)(const Elf_Brandinfo *entry); int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *); int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int); size_t __elfN(populate_note)(int, void *, void *, size_t, void **); diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 9140cee56885..8c0729d3ec66 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -741,7 +741,7 @@ struct proc { reaper which spawned our subtree. */ uint64_t p_elf_flags; /* (x) ELF flags */ - void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for + const void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for non ELF binaries. */ sbintime_t p_umtx_min_timeout; /* End area that is copied on creation. */ diff --git a/sys/sys/socket.h b/sys/sys/socket.h index cdd4fa3b4b89..cf1d95da6168 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -396,6 +396,7 @@ struct sockproto { #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP +#define PF_HYPERV AF_HYPERV #define PF_DIVERT AF_DIVERT #define PF_IPFWLOG AF_IPFWLOG diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h index 8af92f1e0c18..75d7a75e2fff 100644 --- a/sys/tests/ktest.h +++ b/sys/tests/ktest.h @@ -57,6 +57,8 @@ struct ktest_test_info { ktest_parse_t parse; }; +#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx) + struct ktest_module_info { const char *name; const struct ktest_test_info *tests; @@ -64,6 +66,8 @@ struct ktest_module_info { void *module_ptr; }; +#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL } + int ktest_default_modevent(module_t mod, int type, void *arg); bool ktest_start_msg(struct ktest_test_context *ctx); @@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx); #define KTEST_LOG(_ctx, _fmt, ...) \ KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__) +#define KTEST_ERR(_ctx, _fmt, ...) 
\ + KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__) + #define KTEST_MAX_BUF 512 #define KTEST_MODULE_DECLARE(_n, _t) \ diff --git a/tests/sys/fs/fusefs/bad_server.cc b/tests/sys/fs/fusefs/bad_server.cc index af2ca146e431..c3d195735446 100644 --- a/tests/sys/fs/fusefs/bad_server.cc +++ b/tests/sys/fs/fusefs/bad_server.cc @@ -65,6 +65,11 @@ TEST_F(BadServer, ShortWrite) out.header.unique = 0; // Asynchronous notification out.expected_errno = EINVAL; m_mock->write_response(out); + /* + * Tell the event loop to quit. The kernel has already disconnected us + * because of the short write. + */ + m_mock->m_quit = true; } /* diff --git a/usr.bin/sockstat/main.c b/usr.bin/sockstat/main.c index 7fedfd5b8724..d1ea6b1bc958 100644 --- a/usr.bin/sockstat/main.c +++ b/usr.bin/sockstat/main.c @@ -1789,9 +1789,11 @@ main(int argc, char *argv[]) argc = xo_parse_args(argc, argv); if (argc < 0) exit(1); - if (xo_get_style(NULL) != XO_STYLE_TEXT && - xo_get_style(NULL) != XO_STYLE_HTML) - is_xo_style_encoding = true; + if (xo_get_style(NULL) != XO_STYLE_TEXT) { + show_path_state = true; + if (xo_get_style(NULL) != XO_STYLE_HTML) + is_xo_style_encoding = true; + } opt_j = -1; while ((o = getopt(argc, argv, "46AbCcfIij:Llnp:P:qSsUuvw")) != -1) switch (o) { diff --git a/usr.bin/sockstat/sockstat.1 b/usr.bin/sockstat/sockstat.1 index d14eb967ad0f..1498fb1d88f7 100644 --- a/usr.bin/sockstat/sockstat.1 +++ b/usr.bin/sockstat/sockstat.1 @@ -25,7 +25,7 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd October 9, 2025 +.Dd October 14, 2025 .Dt SOCKSTAT 1 .Os .Sh NAME @@ -205,7 +205,8 @@ is specified (only for SCTP or TCP). The path state if .Fl s is specified (only for SCTP). -This column is only shown when there is at least one path state shown. +When using traditional text output, this column is only shown when there is at +least one path state to show. .It Li CONN STATE The connection state if .Fl s diff --git a/usr.sbin/certctl/certctl.8 b/usr.sbin/certctl/certctl.8 index edf993e1361a..e58da8e7ff84 100644 --- a/usr.sbin/certctl/certctl.8 +++ b/usr.sbin/certctl/certctl.8 @@ -24,7 +24,7 @@ .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd August 18, 2025 +.Dd October 9, 2025 .Dt CERTCTL 8 .Os .Sh NAME @@ -110,7 +110,7 @@ A copy of each trusted certificate is placed in and each untrusted certificate in .Ev UNTRUSTDESTDIR . In addition, a bundle containing the trusted certificates is placed in -.Ev BUNDLEFILE . +.Ev BUNDLE . .It Ic untrust Add the specified file to the untrusted list. .It Ic trust @@ -151,6 +151,8 @@ Default: .Pa ${DESTDIR}${DISTBASE}/etc/ssl/untrusted .It Ev BUNDLE File name of bundle to produce. +Default: +.Pa ${DESTDIR}${DISTBASE}/etc/ssl/cert.pem .El .Sh SEE ALSO .Xr openssl 1 |
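
The sys/tests/ktest.h hunks above add the KTEST_FUNC(), KTEST_INFO() and KTEST_ERR() convenience macros. Below is a minimal sketch of a test module written against them; it is illustrative only and not part of this change. The module name, test names, include list and the KTEST_MODULE_DECLARE() call are assumptions modelled on existing ktest consumers; only the three macros themselves come from this diff.

/*
 * Hypothetical consumer of the new ktest helper macros; a sketch only,
 * not part of this commit.  Per the diff, KTEST_FUNC(X) expands to the
 * header "static int __ktest_X(struct ktest_test_context *ctx)" and
 * KTEST_INFO(X) expands to the ktest_test_info initializer
 * { "test_X", "Test X", __ktest_X, NULL }.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/syslog.h>

#include <tests/ktest.h>

KTEST_FUNC(arith)
{
	/* KTEST_ERR() is the new LOG_ERR variant of KTEST_LOG(). */
	if (2 + 2 != 4) {
		KTEST_ERR(ctx, "arithmetic is broken: 2 + 2 != 4");
		return (EINVAL);
	}
	return (0);
}

KTEST_FUNC(noop)
{
	KTEST_LOG(ctx, "nothing to check");
	return (0);
}

static const struct ktest_test_info ktest_example_tests[] = {
	KTEST_INFO(arith),
	KTEST_INFO(noop),
};

/* Declaration pattern assumed from existing ktest modules. */
KTEST_MODULE_DECLARE(ktest_example, ktest_example_tests);

The point of the helper macros is to derive the test function name, the exported "test_X" name and the description from a single identifier, instead of spelling out each ktest_test_info entry by hand; KTEST_ERR() simply complements the existing KTEST_LOG() with a LOG_ERR severity.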