-rw-r--r--  Makefile.inc1 | 16
-rw-r--r--  RELNOTES | 2
-rw-r--r--  UPDATING | 8
-rw-r--r--  contrib/blocklist/bin/blacklistctl.8 | 136
-rw-r--r--  contrib/blocklist/bin/blacklistd.8 | 308
-rw-r--r--  contrib/blocklist/bin/blacklistd.conf.5 | 242
-rw-r--r--  contrib/blocklist/lib/libblacklist.3 | 188
-rw-r--r--  lib/libblacklist/Makefile | 17
-rw-r--r--  lib/libc/stdlib/realpath.3 | 12
-rw-r--r--  lib/libc/stdlib/realpath.c | 14
-rw-r--r--  lib/libc/tests/gen/realpath2_test.c | 106
-rw-r--r--  lib/libpam/modules/pam_xdg/pam_xdg.8 | 1
-rw-r--r--  sbin/geom/Makefile | 2
-rw-r--r--  sbin/geom/core/geom.c | 232
-rw-r--r--  sbin/ifconfig/ifconfig.8 | 4
-rw-r--r--  sbin/ipfw/tables.c | 3
-rwxr-xr-x  sbin/mdconfig/tests/mdconfig_test.sh | 15
-rw-r--r--  sbin/ping/tests/Makefile | 3
-rw-r--r--  share/man/man4/bridge.4 | 210
-rw-r--r--  sys/arm/arm/pmap-v6.c | 2
-rw-r--r--  sys/arm/arm/unwind.c | 4
-rw-r--r--  sys/arm64/coresight/coresight.c | 2
-rw-r--r--  sys/cam/scsi/scsi_all.c | 4
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris.c | 2
-rw-r--r--  sys/conf/NOTES | 1
-rw-r--r--  sys/conf/options | 1
-rw-r--r--  sys/dev/fdt/fdt_slicer.c | 6
-rw-r--r--  sys/dev/iommu/iommu_gas.c | 2
-rw-r--r--  sys/dev/nvme/nvme.c | 4
-rw-r--r--  sys/dev/nvme/nvme.h | 4
-rw-r--r--  sys/dev/nvme/nvme_sim.c | 4
-rw-r--r--  sys/dev/xdma/xdma.c | 2
-rw-r--r--  sys/dev/xen/bus/xen_intr.c | 6
-rw-r--r--  sys/fs/p9fs/p9_transport.c | 3
-rw-r--r--  sys/i386/i386/machdep.c | 2
-rw-r--r--  sys/i386/i386/pmap.c | 2
-rw-r--r--  sys/kern/kern_boottrace.c | 2
-rw-r--r--  sys/kern/kern_devctl.c | 2
-rw-r--r--  sys/kern/kern_event.c | 4
-rw-r--r--  sys/kern/kern_jailmeta.c | 8
-rw-r--r--  sys/kern/kern_linker.c | 2
-rw-r--r--  sys/kern/kern_malloc.c | 2
-rw-r--r--  sys/kern/kern_racct.c | 4
-rw-r--r--  sys/kern/kern_rangelock.c | 2
-rw-r--r--  sys/kern/kern_rctl.c | 4
-rw-r--r--  sys/kern/kern_sharedpage.c | 3
-rw-r--r--  sys/kern/kern_sig.c | 4
-rw-r--r--  sys/kern/kern_time.c | 4
-rw-r--r--  sys/kern/subr_pcpu.c | 2
-rw-r--r--  sys/kern/sys_socket.c | 2
-rw-r--r--  sys/libkern/arc4random.c | 4
-rw-r--r--  sys/libkern/x86/crc32_sse42.c | 4
-rw-r--r--  sys/modules/ktest/Makefile | 3
-rw-r--r--  sys/modules/ktest/ktest_tcphpts/Makefile | 13
-rw-r--r--  sys/net/route.c | 2
-rw-r--r--  sys/net/route/route_tables.c | 2
-rw-r--r--  sys/net/rtsock.c | 2
-rw-r--r--  sys/net80211/ieee80211_ht.c | 2
-rw-r--r--  sys/net80211/ieee80211_hwmp.c | 2
-rw-r--r--  sys/net80211/ieee80211_mesh.c | 2
-rw-r--r--  sys/net80211/ieee80211_phy.c | 2
-rw-r--r--  sys/net80211/ieee80211_proto.c | 2
-rw-r--r--  sys/net80211/ieee80211_vht.c | 2
-rw-r--r--  sys/netinet/cc/cc.c | 2
-rw-r--r--  sys/netinet/in_fib_algo.c | 2
-rw-r--r--  sys/netinet/tcp_hpts.c | 933
-rw-r--r--  sys/netinet/tcp_hpts.h | 50
-rw-r--r--  sys/netinet/tcp_hpts_internal.h | 184
-rw-r--r--  sys/netinet/tcp_hpts_test.c | 1662
-rw-r--r--  sys/netinet/tcp_lro_hpts.c | 3
-rw-r--r--  sys/netinet/tcp_stacks/bbr.c | 131
-rw-r--r--  sys/netinet/tcp_stacks/rack.c | 252
-rw-r--r--  sys/netinet6/in6_fib_algo.c | 2
-rw-r--r--  sys/netipsec/xform_ipcomp.c | 4
-rw-r--r--  sys/netpfil/ipfw/ip_fw2.c | 9
-rw-r--r--  sys/netpfil/ipfw/ip_fw_nat.c | 4
-rw-r--r--  sys/netpfil/pf/pf_ioctl.c | 4
-rw-r--r--  sys/nfs/nfs_diskless.c | 2
-rw-r--r--  sys/powerpc/aim/mmu_oea64.c | 4
-rw-r--r--  sys/powerpc/cpufreq/pmcr.c | 3
-rw-r--r--  sys/security/audit/audit.c | 2
-rw-r--r--  sys/security/mac/mac_framework.c | 4
-rw-r--r--  sys/sys/sysent.h | 3
-rw-r--r--  sys/tests/ktest.h | 10
-rw-r--r--  sys/vm/vm_meter.c | 2
-rw-r--r--  sys/vm/vm_pageout.c | 4
-rw-r--r--  sys/x86/x86/tsc.c | 2
-rw-r--r--  sys/x86/xen/xen_apic.c | 2
-rw-r--r--  tests/atf_python/ktest.py | 12
-rw-r--r--  tests/sys/netinet/Makefile | 1
-rw-r--r--  tests/sys/netinet/tcp_hpts_test.py | 4
-rw-r--r--  tests/sys/vm/mmap_test.c | 31
-rw-r--r--  usr.bin/login/login.conf | 4
-rw-r--r--  usr.sbin/blacklistctl/Makefile | 3
-rw-r--r--  usr.sbin/blacklistd/Makefile | 4
-rw-r--r--  usr.sbin/fwget/pci/pci_network_mediatek | 36
96 files changed, 4062 insertions, 970 deletions
diff --git a/Makefile.inc1 b/Makefile.inc1
index 74c4598dd092..a86dead09aa1 100644
--- a/Makefile.inc1
+++ b/Makefile.inc1
@@ -1964,6 +1964,7 @@ REPODIR?= ${OBJROOT}repo
PKG_FORMAT?= tzst
PKG_LEVEL?= -1
PKG_CLEVEL?= ${"${PKG_FORMAT:Mtar}" != "":?:-l ${PKG_LEVEL}}
+PKG_CTHREADS?= 0
PKG_REPO_SIGNING_KEY?= # empty
PKG_OUTPUT_DIR?= ${PKG_VERSION}
PKG_ABI_FILE?= ${WSTAGEDIR}/usr/bin/uname
@@ -2094,6 +2095,7 @@ create-packages-world: _pkgbootstrap _repodir .PHONY
.ORDER: create-packages-world create-packages-sets
.ORDER: create-packages-kernel create-packages-sets
+.ORDER: create-packages-source create-packages-sets
create-packages-sets: _pkgbootstrap _repodir .PHONY
${_+_}@cd ${.CURDIR}; \
${MAKE} -f Makefile.inc1 \
@@ -2143,7 +2145,7 @@ create-source-src-package: _pkgbootstrap .PHONY
${SSTAGEDIR}/src.ucl
${PKG_CMD} -o ABI=${PKG_ABI} \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${SSTAGEDIR}/src.ucl \
-p ${SSTAGEDIR}/src.plist \
-r ${SRCDIR} \
@@ -2169,7 +2171,7 @@ create-source-src-sys-package: _pkgbootstrap .PHONY
${SSTAGEDIR}/src-sys.ucl
${PKG_CMD} -o ABI=${PKG_ABI} \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${SSTAGEDIR}/src-sys.ucl \
-p ${SSTAGEDIR}/src-sys.plist \
-r ${SRCDIR} \
@@ -2209,7 +2211,7 @@ create-world-package-${pkgname}: .PHONY
' ${WSTAGEDIR}/${pkgname}.ucl
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${WSTAGEDIR}/${pkgname}.ucl \
-p ${WSTAGEDIR}/${pkgname}.plist \
-r ${WSTAGEDIR} \
@@ -2228,7 +2230,7 @@ create-sets-packages: .PHONY
@for manifest in ${WSTAGEDIR}/set-*.ucl; do \
echo "--> Processing manifest: $$manifest"; \
${PKG_CMD} -o ABI=${PKG_ABI} -o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M $$manifest \
-o "${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR}" \
|| exit 1; \
@@ -2258,7 +2260,7 @@ create-dtb-package: .PHONY
${KSTAGEDIR}/${DISTDIR}/dtb.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/${DISTDIR}/dtb.ucl \
-p ${KSTAGEDIR}/${DISTDIR}/dtb.plist \
-r ${KSTAGEDIR}/${DISTDIR} \
@@ -2295,7 +2297,7 @@ create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap
${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \
-p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \
-r ${KSTAGEDIR}/${DISTDIR} \
@@ -2338,7 +2340,7 @@ create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kerne
${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \
-p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \
-r ${KSTAGEDIR}/kernel.${_kernel} \
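The new PKG_CTHREADS knob is passed straight to "pkg create" as -T, selecting how many compression threads pkg(8) may use; the default of 0 keeps pkg's own behavior. A hedged usage sketch (target name and values are illustrative, not part of this commit):

    # Build the base system packages, asking pkg(8) to compress with
    # 8 threads at compression level 19 (values illustrative).
    make -C /usr/src packages PKG_FORMAT=tzst PKG_LEVEL=19 PKG_CTHREADS=8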
diff --git a/RELNOTES b/RELNOTES
index 174ce12e4148..e34a5b23a005 100644
--- a/RELNOTES
+++ b/RELNOTES
@@ -11,7 +11,7 @@ newline. Entries should be separated by a newline.
Changes to this file should not be MFCed.
5000d023a446, 03da141d59ae:
- Add a "-f" option to "kadmin -l dump" with can be used to
+ Add a "-f" option to "kadmin -l dump" which can be used to
dump the Heimdal KDC database in a format that can be loaded
into the MIT KDC.
See https://wiki.freebsd.org/Kerberos/Heimdal2MIT_KDC_Migration
diff --git a/UPDATING b/UPDATING
index 4460898fca2d..9c8bd3a9fd6b 100644
--- a/UPDATING
+++ b/UPDATING
@@ -27,6 +27,12 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 16.x IS SLOW:
world, or to merely disable the most expensive debugging functionality
at runtime, run "ln -s 'abort:false,junk:false' /etc/malloc.conf".)
+20251012:
+ Blacklist has been renamed upstream to Blocklist. If you have it
+	configured, rename all configuration files, firewall anchors, and
+	sentinel files to reflect the new nomenclature.  Old setups will
+	continue to work, emitting a warning.
+
20251002:
Audio-related utilities including mixer(8) and virtual_oss(8) have
moved to the new FreeBSD-sound package. If you have set-optional or
@@ -700,7 +706,7 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 16.x IS SLOW:
Bump _FreeBSD_version to 1400078 to be able to detect this change.
20221212:
- llvm-objump is now always installed as objdump. Previously there was
+ llvm-objdump is now always installed as objdump. Previously there was
no /usr/bin/objdump unless the WITH_LLVM_BINUTILS knob was used.
Some LLVM objdump options have a different output format compared to
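For the 20251012 entry above, a hedged migration sketch (the exact file set depends on local configuration; per the entry, old names keep working with a warning):

    # Illustrative only: adopt the new blocklist names where present.
    mv /etc/blacklistd.conf /etc/blocklistd.conf
    mv /var/db/blacklistd.db /var/db/blocklistd.db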
diff --git a/contrib/blocklist/bin/blacklistctl.8 b/contrib/blocklist/bin/blacklistctl.8
new file mode 100644
index 000000000000..4d557c0c979d
--- /dev/null
+++ b/contrib/blocklist/bin/blacklistctl.8
@@ -0,0 +1,136 @@
+.\" $NetBSD: blocklistctl.8,v 1.4 2025/02/07 01:35:38 kre Exp $
+.\"
+.\" Copyright (c) 2015 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Christos Zoulas.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd January 27, 2025
+.Dt BLACKLISTCTL 8
+.Os
+.Sh NAME
+.Nm blacklistctl
+.Nd display and change the state of the blacklistd database
+.Sh SYNOPSIS
+.Nm
+.Cm dump
+.Op Fl abdnrw
+.Op Fl D Ar dbname
+.Sh DESCRIPTION
+.Nm
+is a program used to display and change the state of the
+.Xr blacklistd 8
+database.
+The following sub-commands are supported:
+.Ss dump
+.Pp
+The following options are available for the
+.Cm dump
+sub-command:
+.Bl -tag -width indent
+.It Fl a
+Show all database entries; by default only the active ones are shown.
+Inactive entries will be shown with a last-access (or, with
+.Fl r ,
+the remaining) time of
+.Ql never .
+.It Fl b
+Show only the blocked entries.
+.It Fl D Ar dbname
+Specify the location of the
+.Ic blacklistd
+database file to use.
+The default is
+.Pa /var/db/blocklistd.db .
+.It Fl d
+Increase debugging level.
+.It Fl n
+Don't display a header.
+.It Fl r
+Show the remaining blocked time instead of the last activity time.
+.It Fl w
+Normally the width of the address column is only sufficient for IPv4
+addresses; the
+.Fl w
+flag makes the display wide enough for IPv6 addresses.
+.El
+.Pp
+The output of the
+.Cm dump
+sub-command consists of a header (unless
+.Fl n
+was given) and one line for each record in the database, where each line
+has the following columns:
+.Bl -tag -width indent
+.It Ql address/ma:port
+The remote address, mask, and local port number of the client connection
+associated with the database entry.
+.It Ql id
+The identifier for the packet filter rule associated with the database
+entry, though this may only be the word
+.Ql OK
+for packet filters which do not create a unique identifier for each rule.
+.It Ql nfail
+The number of
+.Em failures
+reported for the client on the noted port, as well as the number of
+failures allowed before blocking (or, with
+.Fl a ,
+an asterisk
+.Aq * ) .
+.It So last access Sc | So remaining time Sc
+The last time the client was reported as attempting access, or, with
+.Fl r ,
+the time remaining before the rule blocking the client will be removed.
+.El
+.Sh SEE ALSO
+.Xr blacklistd 8
+.Sh NOTES
+The
+.Nm
+program has been renamed to
+.Xr blocklistctl 8 .
+.Pp
+Sometimes the reported number of failed attempts can exceed the number
+of attempts that
+.Xr blacklistd 8
+is configured to block.
+This can happen either because the rule has been removed manually, or
+because there were more attempts in flight while the rule block was being
+added.
+This condition is normal; in that case
+.Xr blacklistd 8
+will first attempt to remove the existing rule, and then it will re-add
+it to make sure that there is only one rule active.
+.Sh HISTORY
+.Nm
+first appeared in
+.Nx 7 .
+.Fx
+support for
+.Nm
+was implemented in
+.Fx 11 .
+.Sh AUTHORS
+.An Christos Zoulas
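As a hedged illustration of the dump columns described above (the address, counts, and time are invented, not captured program output):

    # blacklistctl dump -r
            address/ma:port         id      nfail   remaining time
        192.0.2.7/32:22             OK      3/3     5m32s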
diff --git a/contrib/blocklist/bin/blacklistd.8 b/contrib/blocklist/bin/blacklistd.8
new file mode 100644
index 000000000000..9ca886e9c4d3
--- /dev/null
+++ b/contrib/blocklist/bin/blacklistd.8
@@ -0,0 +1,308 @@
+.\" $NetBSD: blocklistd.8,v 1.8 2025/02/25 22:13:34 christos Exp $
+.\"
+.\" Copyright (c) 2015 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Christos Zoulas.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd February 25, 2025
+.Dt BLACKLISTD 8
+.Os
+.Sh NAME
+.Nm blacklistd
+.Nd block and release ports on demand to avoid DoS abuse
+.Sh SYNOPSIS
+.Nm
+.Op Fl dfrv
+.Op Fl C Ar controlprog
+.Op Fl c Ar configfile
+.Op Fl D Ar dbfile
+.Op Fl P Ar sockpathsfile
+.Op Fl R Ar rulename
+.Op Fl s Ar sockpath
+.Op Fl t Ar timeout
+.Sh DESCRIPTION
+.Nm
+is a daemon similar to
+.Xr syslogd 8
+that listens to sockets at paths specified in the
+.Ar sockpathsfile
+for notifications from other daemons about successful or failed connection
+attempts.
+If no such file is specified, then it only listens to the socket path
+specified by
+.Ar sockpath
+or, if that is not specified, to
+.Pa /var/run/blocklistd.sock .
+Each notification contains an (action, port, protocol, address, owner) tuple
+that identifies the remote connection and the action.
+This tuple is consulted against entries from the
+.Ar configfile ,
+with the syntax specified in
+.Xr blacklistd.conf 5 .
+If an entry is matched, a state entry is created for that tuple.
+Each entry contains a number of tries limit and a duration.
+.Pp
+If
+.Ar configfile
+is a directory, or a directory exists with the same name as
+.Ar configfile
+with
+.Qq .d
+appended to it, each file in the directory will be read as a configuration file.
+If
+.Ar configfile
+exists as a file it will be processed before the contents of the
+.Ar configfile Ns .d
+directory if that also exists.
+.Pp
+The way
+.Nm
+matches configuration entries by having the client pass the file
+descriptor associated with the connection it wants to blacklist, along
+with its socket credentials.
+.Pp
+The file descriptor is used to retrieve information (address and port)
+about the remote side with
+.Xr getpeername 2
+and the local side with
+.Xr getsockname 2 .
+.Pp
+By examining the port of the local side,
+.Nm
+can determine if the client program
+.Dq owns
+the port.
+By examining the optional address portion on the local side, it can match
+interfaces.
+By examining the remote address, it can match specific allow or deny rules.
+.Pp
+Finally
+.Nm
+can examine the socket credentials to match the user in the configuration file.
+.Pp
+While this works well for TCP sockets, it cannot be relied on for unbound
+UDP sockets.
+It is also less meaningful when it comes to connections using non-privileged
+ports.
+On the other hand, if we receive a request that has a local endpoint indicating
+a UDP privileged port, we can presume that the client was privileged to be
+able to acquire that port.
+.Pp
+Once an entry is matched
+.Nm
+can perform various actions.
+If the action is
+.Dq add
+and the number of tries limit is reached, then a
+control script
+.Ar controlprog
+is invoked with arguments:
+.Bd -literal -offset indent
+control add <rulename> <proto> <address> <mask> <port>
+.Ed
+.Pp
+and should invoke a packet filter command to block the connection
+specified by the arguments.
+The
+.Ar rulename
+argument can be set from the command line (default
+.Dv blacklistd ) .
+The script could print a numerical id to stdout as a handle for
+the rule that can be used later to remove that connection, but
+that is not required as all information to remove the rule is
+kept.
+.Pp
+If the action is
+.Dq rem
+then the same control script is invoked as:
+.Bd -literal -offset indent
+control rem <rulename> <proto> <address> <mask> <port> <id>
+.Ed
+.Pp
+where
+.Ar id
+is the number returned from the
+.Dq add
+action.
+.Pp
+.Nm
+maintains a database of known connections in
+.Ar dbfile .
+On startup it reads entries from that file, and updates its internal state.
+.Pp
+.Nm
+checks the list of active entries every
+.Ar timeout
+seconds (default
+.Dv 15 )
+and removes entries and block rules using the control program as necessary.
+.Pp
+The following options are available:
+.Bl -tag -width indent
+.It Fl C Ar controlprog
+Use
+.Ar controlprog
+to communicate with the packet filter, instead of the default, which is
+.Pa /usr/libexec/blacklistd-helper .
+The following arguments are passed to the control program:
+.Bl -tag -width protocol
+.It action
+The action to perform:
+.Dv add ,
+.Dv rem ,
+or
+.Dv flush ;
+to add, remove or flush a firewall rule.
+.It name
+The rule name.
+.It protocol
+The optional protocol name (can be empty):
+.Dv tcp ,
+.Dv tcp6 ,
+.Dv udp ,
+.Dv udp6 .
+.It address
+The IPv4 or IPv6 numeric address to be blocked or released.
+.It mask
+The numeric mask to be applied to the blocked or released address.
+.It port
+The optional numeric port to be blocked (can be empty).
+.It id
+For packet filters that support removal of rules by rule identifier, the
+identifier of the rule to be removed.
+The add command is expected to return the rule identifier string to stdout.
+.El
+.It Fl c Ar configuration
+The name of the configuration file to read.
+The default when
+.Fl c
+is not given is
+.Pa /etc/blacklistd.conf .
+.It Fl D Ar dbfile
+The Berkeley DB file where
+.Nm
+stores its state.
+It defaults to
+.Pa /var/db/blocklistd.db .
+.It Fl d
+Normally,
+.Nm
+disassociates itself from the terminal unless the
+.Fl d
+flag is specified, in which case it stays in the foreground.
+.It Fl f
+Truncate the state database and flush all the rules named
+.Ar rulename
+by invoking the control script as:
+.Bd -literal -offset indent
+control flush <rulename>
+.Ed
+.It Fl P Ar sockpathsfile
+A file containing a list of pathnames, one per line, that
+.Nm
+will create sockets to listen to.
+This is useful for chrooted environments.
+.It Fl R Ar rulename
+Specify the default rule name for the packet filter rules, usually
+.Dv blacklistd .
+.It Fl r
+Re-read the firewall rules from the internal database, then
+remove and re-add them.
+This helps for packet filters that do not retain state across reboots.
+.It Fl s Ar sockpath
+Add
+.Ar sockpath
+to the list of Unix sockets
+.Nm
+listens to.
+.It Fl t Ar timeout
+The interval, in seconds, at which
+.Nm
+polls the state file to update the rules.
+.It Fl v
+Cause
+.Nm
+to print
+diagnostic messages to
+.Dv stdout
+instead of
+.Xr syslogd 8 .
+.El
+.Sh SIGNAL HANDLING
+.Nm
+deals with the following signals:
+.Bl -tag -width "USR2"
+.It Dv HUP
+Receipt of this signal causes
+.Nm
+to re-read the configuration file.
+.It Dv INT , Dv TERM & Dv QUIT
+These signals tell
+.Nm
+to exit in an orderly fashion.
+.It Dv USR1
+This signal tells
+.Nm
+to increase the internal debugging level by 1.
+.It Dv USR2
+This signal tells
+.Nm
+to decrease the internal debugging level by 1.
+.El
+.Sh FILES
+.Bl -tag -width /usr/libexec/blacklistd-helper -compact
+.It Pa /usr/libexec/blacklistd-helper
+Shell script invoked to interface with the packet filter.
+.It Pa /etc/blacklistd.conf
+Configuration file.
+.It Pa /var/db/blocklistd.db
+Database of current connection entries.
+.It Pa /var/run/blocklistd.sock
+Socket to receive connection notifications.
+.El
+.Sh SEE ALSO
+.Xr blacklistd.conf 5 ,
+.Xr blacklistctl 8 ,
+.Xr ipf 8 ,
+.Xr ipfw 8 ,
+.Xr pfctl 8 ,
+.Xr syslogd 8
+.Sh NOTES
+The
+.Nm
+daemon has been renamed to
+.Xr blocklistd 8 .
+.Sh HISTORY
+.Nm
+first appeared in
+.Nx 7 .
+.Fx
+support for
+.Nm
+was implemented in
+.Fx 11 .
+.Sh AUTHORS
+.An Christos Zoulas
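The control-program contract documented above (argument order for add, rem, and flush, with the rule handle echoed on stdout) can be sketched in a few lines of sh; the pfctl table commands below are purely illustrative and do not reproduce the shipped blacklistd-helper logic:

    #!/bin/sh
    # Minimal illustrative blacklistd(8) control program, not the real helper.
    # Invoked as: control <action> <rulename> [<proto> <address> <mask> <port> [<id>]]
    action=$1 rulename=$2 proto=$3 addr=$4 mask=$5 port=$6 id=$7
    case "$action" in
    add)
            pfctl -t "$rulename" -T add "${addr}/${mask}" >/dev/null 2>&1
            echo "${addr}/${mask}"      # handle printed for a later "rem"
            ;;
    rem)
            pfctl -t "$rulename" -T delete "$id" >/dev/null 2>&1
            ;;
    flush)
            pfctl -t "$rulename" -T flush >/dev/null 2>&1
            ;;
    esac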
diff --git a/contrib/blocklist/bin/blacklistd.conf.5 b/contrib/blocklist/bin/blacklistd.conf.5
new file mode 100644
index 000000000000..e775d30e7e8e
--- /dev/null
+++ b/contrib/blocklist/bin/blacklistd.conf.5
@@ -0,0 +1,242 @@
+.\" $NetBSD: blocklistd.conf.5,v 1.7 2025/02/11 17:47:05 christos Exp $
+.\"
+.\" Copyright (c) 2015, 2025 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Christos Zoulas.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd February 5, 2025
+.Dt BLACKLISTD.CONF 5
+.Os
+.Sh NAME
+.Nm blacklistd.conf
+.Nd configuration file format for blacklistd
+.Sh DESCRIPTION
+The
+.Nm
+file contains configuration entries for
+.Xr blacklistd 8
+in a fashion similar to
+.Xr inetd.conf 5 .
+Only one entry per line is permitted.
+Every entry must have all fields populated.
+Each field can be separated by a tab or a space.
+Comments are denoted by a
+.Dq #
+at the beginning of a line.
+.Pp
+There are two kinds of configuration lines,
+.Va [local]
+and
+.Va [remote] .
+By default, configuration lines are
+.Va [local] ,
+i.e. the address specified refers to the addresses on the local machine.
+To switch between
+.Va [local]
+and
+.Va [remote]
+configuration lines you can specify the stanzas:
+.Dq [local]
+and
+.Dq [remote] .
+.Pp
+On
+.Va [local]
+and
+.Va [remote]
+lines
+.Dq *
+means use the default, or wildcard match.
+In addition, for
+.Va [remote]
+lines
+.Dq =
+means use the values from the matched
+.Va [local]
+configuration line.
+.Pp
+The first four fields,
+.Va location ,
+.Va type ,
+.Va proto ,
+and
+.Va owner
+are used to match the
+.Va [local]
+or
+.Va [remote]
+addresses, whereas the last three fields
+.Va name ,
+.Va nfail ,
+and
+.Va duration
+are used to modify the filtering action.
+.Pp
+The first field denotes the
+.Va location
+as an address, mask, and port.
+The syntax for the
+.Va location
+is:
+.Bd -literal -offset indent
+ [<address>|<interface>][/<mask>][:<port>]
+.Ed
+.Pp
+The
+.Dv address
+can be an IPv4 address in numeric format, an IPv6 address
+in numeric format and enclosed by square brackets, or an interface name.
+Mask modifiers are not allowed on interfaces because interfaces
+can have multiple addresses in different protocols where the mask has a
+different size.
+.Pp
+The
+.Dv mask
+is always numeric, but the
+.Dv port
+can be either numeric or symbolic.
+.Pp
+The second field is the socket
+.Va type :
+.Dv stream ,
+.Dv dgram ,
+or numeric.
+The third field is the
+.Va protocol :
+.Dv tcp ,
+.Dv udp ,
+.Dv tcp6 ,
+.Dv udp6 ,
+or numeric.
+The fourth field is the effective user
+.Va ( owner )
+of the daemon process reporting the event,
+either as a username or a userid.
+.Pp
+The rest of the fields control the behavior of the filter.
+.Pp
+The
+.Va name
+field is the name of the packet filter rule to be used.
+If the
+.Va name
+starts with a hyphen
+.Pq Dq - ,
+then the default rulename is prepended to the given name.
+If the
+.Dv name
+contains a
+.Dq / ,
+the remaining portion of the name is interpreted as the mask to be
+applied to the address specified in the rule, causing a single rule violation to
+block the entire subnet for the configured prefix.
+.Pp
+The
+.Va nfail
+field contains the number of failed attempts before access is blocked,
+defaulting to
+.Dq *
+meaning never, and the last field
+.Va duration
+specifies the amount of time since the last access that the blocking
+rule should be active, defaulting to
+.Dq *
+meaning forever.
+The default unit for
+.Va duration
+is seconds, but one can specify suffixes for different units, such as
+.Dq m
+for minutes,
+.Dq h
+for hours, and
+.Dq d
+for days.
+.Pp
+Matching is done first by checking the
+.Va [local]
+rules individually, in the order of the most specific to the least specific.
+If a match is found, then the matching
+.Va [remote]
+rules are applied.
+The
+.Va name ,
+.Va nfail ,
+and
+.Va duration
+fields can be altered by the
+.Va [remote]
+rule that matched.
+.Pp
+The
+.Va [remote]
+rules can be used for allowing specific addresses, changing the mask
+size (via
+.Va name ) ,
+the rule that the packet filter uses (also via
+.Va name ) ,
+the number of failed attempts (via
+.Va nfail ) ,
+or the duration to block (via
+.Va duration ) .
+.Sh FILES
+.Bl -tag -width /etc/blacklistd.conf -compact
+.It Pa /etc/blacklistd.conf
+Configuration file.
+.El
+.Sh EXAMPLES
+.Bd -literal -offset 8n
+# Block ssh, after 3 attempts for 6 hours on the bnx0 interface
+[local]
+# location type proto owner name nfail duration
+bnx0:ssh * * * * 3 6h
+[remote]
+# Never block 1.2.3.4
+1.2.3.4:ssh * * * * * *
+# Never block the example IPv6 subnet either
+[2001:db8::]/32:ssh * * * * * *
+# For addresses coming from 8.8.0.0/16, block whole /24 networks instead
+# of individual hosts, but keep the rest of the blocking parameters the same.
+8.8.0.0/16:ssh * * * /24 = =
+.Ed
+.Sh SEE ALSO
+.Xr blacklistctl 8 ,
+.Xr blacklistd 8
+.Sh NOTES
+The
+.Nm
+file has been renamed to
+.Xr blocklistd.conf 5 .
+.Sh HISTORY
+.Nm
+first appeared in
+.Nx 7 .
+.Fx
+support for
+.Nm
+was implemented in
+.Fx 11 .
+.Sh AUTHORS
+.An Christos Zoulas
diff --git a/contrib/blocklist/lib/libblacklist.3 b/contrib/blocklist/lib/libblacklist.3
new file mode 100644
index 000000000000..5bc093c38f79
--- /dev/null
+++ b/contrib/blocklist/lib/libblacklist.3
@@ -0,0 +1,188 @@
+.\" $NetBSD: libblocklist.3,v 1.7 2025/02/05 20:14:30 christos Exp $
+.\"
+.\" Copyright (c) 2015 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Christos Zoulas.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd February 5, 2025
+.Dt LIBBLACKLIST 3
+.Os
+.Sh NAME
+.Nm blacklist_open ,
+.Nm blacklist_open2 ,
+.Nm blacklist_close ,
+.Nm blacklist_r ,
+.Nm blacklist ,
+.Nm blacklist_sa ,
+.Nm blacklist_sa_r
+.Nd Blacklistd notification library
+.Sh LIBRARY
+.Lb libblacklist
+.Sh SYNOPSIS
+.In blacklist.h
+.Ft struct blacklist *
+.Fn blacklist_open "void"
+.Ft struct blacklist *
+.Fn blacklist_open2 "void (*logger)(int, struct syslog_data *, va_list)"
+.Ft void
+.Fn blacklist_close "struct blacklist *cookie"
+.Ft int
+.Fn blacklist "int action" "int fd" "const char *msg"
+.Ft int
+.Fn blacklist_r "struct blacklist *cookie" "int action" "int fd" "const char *msg"
+.Ft int
+.Fn blacklist_sa "int action" "int fd" "const struct sockaddr *sa" "socklen_t salen" "const char *msg"
+.Ft int
+.Fn blacklist_sa_r "struct blacklist *cookie" "int action" "int fd" "const struct sockaddr *sa" "socklen_t salen" "const char *msg"
+.Sh DESCRIPTION
+These functions can be used by daemons to notify
+.Xr blacklistd 8
+about successful and failed remote connections so that blacklistd can
+block or release port access to prevent Denial of Service attacks.
+.Pp
+The function
+.Fn blacklist_open
+creates the necessary state to communicate with
+.Xr blacklistd 8
+and returns a pointer to it, or
+.Dv NULL
+on failure.
+.Pp
+The function
+.Fn blacklist_open2
+is similar to
+.Fn blacklist_open
+but allows a
+.Fa logger
+to be specified.
+If the
+.Fa logger
+is
+.Dv NULL ,
+then no logging is performed.
+.Pp
+The
+.Fn blacklist_close
+function frees all memory and resources used.
+.Pp
+The
+.Fn blacklist
+function sends a message to
+.Xr blacklistd 8 ,
+with an integer
+.Ar action
+argument specifying the type of notification,
+a file descriptor
+.Ar fd
+specifying the accepted file descriptor connected to the client,
+and an optional message in the
+.Ar msg
+argument.
+.Pp
+The
+.Ar action
+parameter can take these values:
+.Bl -tag -width ".Dv BLACKLIST_ABUSIVE_BEHAVIOR"
+.It Dv BLACKLIST_AUTH_FAIL
+There was an unsuccessful authentication attempt.
+.It Dv BLACKLIST_AUTH_OK
+A user successfully authenticated.
+.It Dv BLACKLIST_ABUSIVE_BEHAVIOR
+The sending daemon has detected abusive behavior
+from the remote system.
+The remote address should
+be blocked as soon as possible.
+.It Dv BLACKLIST_BAD_USER
+The sending daemon has determined the username
+presented for authentication is invalid.
+The
+.Xr blacklistd 8
+daemon compares the username to a configured list of forbidden
+usernames and
+blocks the address immediately if a forbidden username matches.
+(The
+.Dv BLACKLIST_BAD_USER
+support is not currently available.)
+.El
+.Pp
+The
+.Fn blacklist_r
+function is more efficient because it keeps the blacklist state around.
+.Pp
+The
+.Fn blacklist_sa
+and
+.Fn blacklist_sa_r
+functions can be used with unconnected sockets, where
+.Xr getpeername 2
+will not work; the server will pass the peer name in the message.
+.Pp
+In all cases the file descriptor passed in the
+.Fa fd
+argument must be pointing to a valid socket so that
+.Xr blacklistd 8
+can establish ownership of the local endpoint
+using
+.Xr getsockname 2 .
+.Pp
+By default,
+.Xr syslogd 8
+is used for message logging.
+The internal
+.Fn bl_create
+function can be used to create the required internal
+state and specify a custom logging function.
+.Sh RETURN VALUES
+The function
+.Fn blacklist_open
+returns a cookie on success and
+.Dv NULL
+on failure, setting
+.Dv errno
+to an appropriate value.
+.Pp
+The functions
+.Fn blacklist ,
+.Fn blacklist_sa ,
+and
+.Fn blacklist_sa_r
+return
+.Dv 0
+on success and
+.Dv \-1
+on failure, setting
+.Dv errno
+to an appropriate value.
+.Sh NOTES
+The
+.Lb libblacklist
+has been renamed to
+.Xr libblocklist 3 .
+.Sh SEE ALSO
+.Xr blacklistd.conf 5 ,
+.Xr blacklistd 8
+.Sh AUTHORS
+.An Christos Zoulas
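A short hedged C sketch of the notification flow this manual describes; the reporting helper and message text are invented, while the libblacklist calls and constants are those documented above:

    #include <blacklist.h>
    #include <stdio.h>

    /*
     * Illustrative only: report one failed login on an accepted socket.
     * "fd" must be the accepted connection so that blacklistd(8) can
     * identify the local and remote endpoints.
     */
    static void
    report_auth_failure(int fd, const char *user)
    {
            struct blacklist *bl;
            char msg[128];

            if ((bl = blacklist_open()) == NULL)
                    return;         /* daemon unavailable; nothing to do */
            snprintf(msg, sizeof(msg), "auth failure for %.64s", user);
            if (blacklist_r(bl, BLACKLIST_AUTH_FAIL, fd, msg) == -1)
                    perror("blacklist_r");
            blacklist_close(bl);
    }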
diff --git a/lib/libblacklist/Makefile b/lib/libblacklist/Makefile
index 07c770883eab..cac023d69bb7 100644
--- a/lib/libblacklist/Makefile
+++ b/lib/libblacklist/Makefile
@@ -18,14 +18,13 @@ CFLAGS+=-I${BLOCKLIST_DIR}/include -I${BLOCKLIST_DIR}/port \
SRCS= old_bl.c blacklist.c vsyslog_r.c
INCS= blacklist.h
-MAN= libblocklist.3
-
-MLINKS+=libblocklist.3 libblacklist.3 \
- libblocklist.3 blacklist_open.3 \
- libblocklist.3 blacklist_close.3 \
- libblocklist.3 blacklist.3 \
- libblocklist.3 blacklist_r.3 \
- libblocklist.3 blacklist_sa.3 \
- libblocklist.3 blacklist_sa_r.3
+MAN= libblacklist.3
+
+MLINKS= libblacklist.3 blacklist_open.3 \
+ libblacklist.3 blacklist_close.3 \
+ libblacklist.3 blacklist.3 \
+ libblacklist.3 blacklist_r.3 \
+ libblacklist.3 blacklist_sa.3 \
+ libblacklist.3 blacklist_sa_r.3
.include <bsd.lib.mk>
diff --git a/lib/libc/stdlib/realpath.3 b/lib/libc/stdlib/realpath.3
index 065ba312c2ef..76f40249963b 100644
--- a/lib/libc/stdlib/realpath.3
+++ b/lib/libc/stdlib/realpath.3
@@ -28,7 +28,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd May 11, 2012
+.Dd October 10, 2025
.Dt REALPATH 3
.Os
.Sh NAME
@@ -108,11 +108,11 @@ and
.Xr getcwd 3 .
.Sh SEE ALSO
.Xr getcwd 3
-.\" .Sh STANDARDS
-.\" The
-.\" .Fn realpath
-.\" function conforms to
-.\" .St -p1003.1-2001 .
+.Sh STANDARDS
+The
+.Fn realpath
+function conforms to
+.St -p1003.1-2001 .
.Sh HISTORY
The
.Fn realpath
diff --git a/lib/libc/stdlib/realpath.c b/lib/libc/stdlib/realpath.c
index 4c52b73319ab..18f29e95ee6b 100644
--- a/lib/libc/stdlib/realpath.c
+++ b/lib/libc/stdlib/realpath.c
@@ -49,7 +49,7 @@ realpath1(const char *path, char *resolved)
{
struct stat sb;
char *p, *q;
- size_t left_len, resolved_len, next_token_len;
+ size_t left_len, prev_len, resolved_len, next_token_len;
unsigned symlinks;
ssize_t slen;
char left[PATH_MAX], next_token[PATH_MAX], symlink[PATH_MAX];
@@ -98,6 +98,7 @@ realpath1(const char *path, char *resolved)
left_len = 0;
}
+ prev_len = resolved_len;
if (resolved[resolved_len - 1] != '/') {
if (resolved_len + 1 >= PATH_MAX) {
errno = ENAMETOOLONG;
@@ -133,8 +134,17 @@ realpath1(const char *path, char *resolved)
errno = ENAMETOOLONG;
return (NULL);
}
- if (lstat(resolved, &sb) != 0)
+ if (lstat(resolved, &sb) != 0) {
+ /*
+ * EACCES means the parent directory is not
+ * readable, while ENOTDIR means the parent
+ * directory is not a directory. Rewind the path
+ * to correctly indicate where the error lies.
+ */
+ if (errno == EACCES || errno == ENOTDIR)
+ resolved[prev_len] = '\0';
return (NULL);
+ }
if (S_ISLNK(sb.st_mode)) {
if (symlinks++ > MAXSYMLINKS) {
errno = ELOOP;
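Given the rewind behavior added above (and the partial result realpath(3) already leaves behind on other errors, as the tests below verify), a caller can name the component where resolution stopped; a hedged sketch with an invented helper:

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Illustrative: on failure, "resolved" holds the prefix resolved
     * before the error, which makes for a more precise diagnostic.
     */
    static int
    resolve_or_complain(const char *path)
    {
            char resolved[PATH_MAX] = "";

            if (realpath(path, resolved) == NULL) {
                    fprintf(stderr, "cannot resolve %s: %s (stopped at %s)\n",
                        path, strerror(errno), resolved);
                    return (-1);
            }
            printf("%s -> %s\n", path, resolved);
            return (0);
    }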
diff --git a/lib/libc/tests/gen/realpath2_test.c b/lib/libc/tests/gen/realpath2_test.c
index f89dd99cbb72..431df8721ae0 100644
--- a/lib/libc/tests/gen/realpath2_test.c
+++ b/lib/libc/tests/gen/realpath2_test.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2017 Jan Kokemüller
* All rights reserved.
+ * Copyright (c) 2025 Klara, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -25,6 +26,8 @@
*/
#include <sys/param.h>
+#include <sys/stat.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -34,6 +37,31 @@
#include <atf-c.h>
+ATF_TC(realpath_null);
+ATF_TC_HEAD(realpath_null, tc)
+{
+ atf_tc_set_md_var(tc, "descr", "Test null input");
+}
+ATF_TC_BODY(realpath_null, tc)
+{
+ ATF_REQUIRE_ERRNO(EINVAL, realpath(NULL, NULL) == NULL);
+}
+
+ATF_TC(realpath_empty);
+ATF_TC_HEAD(realpath_empty, tc)
+{
+ atf_tc_set_md_var(tc, "descr", "Test empty input");
+}
+ATF_TC_BODY(realpath_empty, tc)
+{
+ char resb[PATH_MAX] = "";
+
+ ATF_REQUIRE_EQ(0, mkdir("foo", 0755));
+ ATF_REQUIRE_EQ(0, chdir("foo"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("", resb) == NULL);
+ ATF_REQUIRE_STREQ("", resb);
+}
+
ATF_TC(realpath_buffer_overflow);
ATF_TC_HEAD(realpath_buffer_overflow, tc)
{
@@ -44,16 +72,11 @@ ATF_TC_HEAD(realpath_buffer_overflow, tc)
ATF_TC_BODY(realpath_buffer_overflow, tc)
{
- char path[MAXPATHLEN] = { 0 };
- char resb[MAXPATHLEN] = { 0 };
- size_t i;
+ char path[PATH_MAX] = "";
+ char resb[PATH_MAX] = "";
- path[0] = 'a';
+ memset(path, 'a', sizeof(path) - 1);
path[1] = '/';
- for (i = 2; i < sizeof(path) - 1; ++i) {
- path[i] = 'a';
- }
-
ATF_REQUIRE(realpath(path, resb) == NULL);
}
@@ -66,9 +89,9 @@ ATF_TC_HEAD(realpath_empty_symlink, tc)
ATF_TC_BODY(realpath_empty_symlink, tc)
{
- char path[MAXPATHLEN] = { 0 };
- char slnk[MAXPATHLEN] = { 0 };
- char resb[MAXPATHLEN] = { 0 };
+ char path[PATH_MAX] = "";
+ char slnk[PATH_MAX] = "";
+ char resb[PATH_MAX] = "";
int fd;
(void)strlcat(slnk, "empty_symlink", sizeof(slnk));
@@ -89,11 +112,70 @@ ATF_TC_BODY(realpath_empty_symlink, tc)
ATF_REQUIRE(unlink(slnk) == 0);
}
-ATF_TP_ADD_TCS(tp)
+ATF_TC(realpath_partial);
+ATF_TC_HEAD(realpath_partial, tc)
+{
+ atf_tc_set_md_var(tc, "descr",
+ "Test that failure leaves a partial result");
+ atf_tc_set_md_var(tc, "require.user", "unprivileged");
+}
+
+ATF_TC_BODY(realpath_partial, tc)
{
+ char resb[PATH_MAX] = "";
+ size_t len;
+
+ /* scenario 1: missing directory */
+ ATF_REQUIRE_EQ(0, mkdir("foo", 0755));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 8 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar", resb + len - 8);
+
+ /* scenario 2: dead link 1 */
+ ATF_REQUIRE_EQ(0, symlink("nix", "foo/bar"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 8 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/nix", resb + len - 8);
+
+ /* scenario 3: missing file */
+ ATF_REQUIRE_EQ(0, unlink("foo/bar"));
+ ATF_REQUIRE_EQ(0, mkdir("foo/bar", 0755));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 12 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar/baz", resb + len - 12);
+
+ /* scenario 4: dead link 2 */
+ ATF_REQUIRE_EQ(0, symlink("nix", "foo/bar/baz"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 12 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar/nix", resb + len - 12);
+
+ /* scenario 5: unreadable directory */
+ ATF_REQUIRE_EQ(0, chmod("foo", 000));
+ ATF_REQUIRE_ERRNO(EACCES, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 4 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo", resb + len - 4);
+
+ /* scenario 6: not a directory */
+ ATF_REQUIRE_EQ(0, close(creat("bar", 0644)));
+ ATF_REQUIRE_ERRNO(ENOTDIR, realpath("bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 4 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/bar", resb + len - 4);
+}
+ATF_TP_ADD_TCS(tp)
+{
+ ATF_TP_ADD_TC(tp, realpath_null);
+ ATF_TP_ADD_TC(tp, realpath_empty);
ATF_TP_ADD_TC(tp, realpath_buffer_overflow);
ATF_TP_ADD_TC(tp, realpath_empty_symlink);
+ ATF_TP_ADD_TC(tp, realpath_partial);
return atf_no_error();
}
diff --git a/lib/libpam/modules/pam_xdg/pam_xdg.8 b/lib/libpam/modules/pam_xdg/pam_xdg.8
index 9b335751a9fb..031010953e98 100644
--- a/lib/libpam/modules/pam_xdg/pam_xdg.8
+++ b/lib/libpam/modules/pam_xdg/pam_xdg.8
@@ -50,7 +50,6 @@ Use an alternate base directory
.Bl -tag -width indent
.It Ev XDG_RUNTIME_DIR
The location of the runtime files base directory created by this module.
-Note that the module does not set this environment variable.
.El
.Sh STANDARDS
The directory created by this module conforms to the
diff --git a/sbin/geom/Makefile b/sbin/geom/Makefile
index 078503d3ae67..61561ef1ff1b 100644
--- a/sbin/geom/Makefile
+++ b/sbin/geom/Makefile
@@ -9,7 +9,7 @@ MAN= geom.8
CFLAGS+= -I${.CURDIR} -I${.CURDIR}/core
CFLAGS+= -DGEOM_CLASS_DIR=\"${GEOM_CLASS_DIR}\"
-LIBADD= geom util
+LIBADD= geom util xo
.if defined(RESCUE)
.PATH: ${SRCTOP}/lib/geom/part \
diff --git a/sbin/geom/core/geom.c b/sbin/geom/core/geom.c
index b78021194ddd..496123f08274 100644
--- a/sbin/geom/core/geom.c
+++ b/sbin/geom/core/geom.c
@@ -49,9 +49,12 @@
#include <assert.h>
#include <libgeom.h>
#include <geom.h>
+#include <libxo/xo.h>
#include "misc/subr.h"
+#define GEOM_XO_VERSION "1"
+
#ifdef STATIC_GEOM_CLASSES
extern uint32_t gpart_version;
extern struct g_command gpart_class_commands[];
@@ -513,6 +516,7 @@ run_command(int argc, char *argv[])
gctl_free(req);
if (verbose)
printf("Done.\n");
+ xo_finish();
exit(EXIT_SUCCESS);
}
@@ -810,6 +814,10 @@ main(int argc, char *argv[])
provider_name = NULL;
tflag = false;
+ argc = xo_parse_args(argc, argv);
+ if (argc < 0)
+ return (argc);
+
if (strcmp(getprogname(), "geom") == 0) {
while ((ch = getopt(argc, argv, "hp:t")) != -1) {
switch (ch) {
@@ -831,6 +839,7 @@ main(int argc, char *argv[])
* Don't adjust argc and argv, it would break get_class().
*/
}
+ xo_set_version(GEOM_XO_VERSION);
if (tflag && provider_name != NULL) {
errx(EXIT_FAILURE,
@@ -839,6 +848,7 @@ main(int argc, char *argv[])
if (provider_name != NULL) {
list_one_geom_by_provider(provider_name);
+ xo_finish();
return (0);
}
@@ -882,29 +892,33 @@ find_geom(struct gclass *classp, const char *name)
}
static void
-list_one_provider(struct gprovider *pp, const char *prefix)
+list_one_provider(struct gprovider *pp, const char *padding)
{
struct gconfig *conf;
char buf[5];
- printf("Name: %s\n", pp->lg_name);
+ xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name);
humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "",
HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL);
- printf("%sMediasize: %jd (%s)\n", prefix, (intmax_t)pp->lg_mediasize,
- buf);
- printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize);
+ xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n",
+ padding, (intmax_t)pp->lg_mediasize, buf);
+ xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u} \n",
+ padding, pp->lg_sectorsize);
if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) {
- printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize);
- printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset);
+ xo_emit("{P:/%s}{Lcw:Stripesize}{Stripesize/%ju}\n",
+ padding, pp->lg_stripesize);
+ xo_emit("{P:/%s}{Lcw:Stripeoffset}{Stripeoffset/%ju}\n",
+ padding, pp->lg_stripeoffset);
}
- printf("%sMode: %s\n", prefix, pp->lg_mode);
+ xo_emit("{P:/%s}{Lcw:Mode}{Mode}\n", padding, pp->lg_mode);
LIST_FOREACH(conf, &pp->lg_config, lg_config) {
- printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val);
+ xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name,
+ conf->lg_name, conf->lg_val);
}
}
static void
-list_one_consumer(struct gconsumer *cp, const char *prefix)
+list_one_consumer(struct gconsumer *cp, const char *padding)
{
struct gprovider *pp;
struct gconfig *conf;
@@ -915,20 +929,24 @@ list_one_consumer(struct gconsumer *cp, const char *prefix)
else {
char buf[5];
- printf("Name: %s\n", pp->lg_name);
+ xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name);
humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "",
HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL);
- printf("%sMediasize: %jd (%s)\n", prefix,
- (intmax_t)pp->lg_mediasize, buf);
- printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize);
+ xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n",
+ padding, (intmax_t)pp->lg_mediasize, buf);
+ xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u}\n",
+ padding, pp->lg_sectorsize);
if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) {
- printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize);
- printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset);
+ xo_emit("{P:/%s}{Lcw:Stripesize}{:Stripesize/%ju}\n",
+ padding, pp->lg_stripesize);
+ xo_emit("{P:/%s}{Lcw:Stripeoffset}{:Stripesize/%ju}\n",
+ padding, pp->lg_stripeoffset);
}
- printf("%sMode: %s\n", prefix, cp->lg_mode);
+ xo_emit("{P:/%s}{Lcw:Mode}{:Mode}\n", padding, pp->lg_mode);
}
LIST_FOREACH(conf, &cp->lg_config, lg_config) {
- printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val);
+ xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name,
+ conf->lg_name, conf->lg_val);
}
}
@@ -940,27 +958,36 @@ list_one_geom(struct ggeom *gp)
struct gconfig *conf;
unsigned n;
- printf("Geom name: %s\n", gp->lg_name);
+ xo_emit("{Lcw:Geom name}{:Name}\n", gp->lg_name);
LIST_FOREACH(conf, &gp->lg_config, lg_config) {
- printf("%s: %s\n", conf->lg_name, conf->lg_val);
+ xo_emit("{Lcwa:}{a:}\n", conf->lg_name, conf->lg_name,
+ conf->lg_val);
}
if (!LIST_EMPTY(&gp->lg_provider)) {
- printf("Providers:\n");
+ xo_open_list("Providers");
+ xo_emit("{Tc:Providers}\n");
n = 1;
LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
- printf("%u. ", n++);
+ xo_emit("{T:/%u} ", n++);
+ xo_open_instance("provider");
list_one_provider(pp, " ");
+ xo_close_instance("provider");
}
+ xo_close_list("Providers");
}
if (!LIST_EMPTY(&gp->lg_consumer)) {
- printf("Consumers:\n");
+ xo_open_list("Consumers");
+ xo_emit("{Tc:Consumers}\n");
n = 1;
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- printf("%u. ", n++);
+ xo_emit("{T:/%u} ", n++);
+ xo_open_instance("consumer");
list_one_consumer(cp, " ");
+ xo_close_instance("consumer");
}
+ xo_close_list("Consumers");
}
- printf("\n");
+ xo_emit("\n");
}
static void
@@ -978,8 +1005,10 @@ list_one_geom_by_provider(const char *provider_name)
if (gp == NULL)
errx(EXIT_FAILURE, "Cannot find provider '%s'.", provider_name);
- printf("Geom class: %s\n", gp->lg_class->lg_name);
+ xo_open_container("Geom");
+ xo_emit("{Lwc:Geom class}{:Class}\n", gp->lg_class->lg_name);
list_one_geom(gp);
+ xo_close_container("Geom");
}
static void
@@ -1038,14 +1067,20 @@ std_list(struct gctl_req *req, unsigned flags __unused)
"an instance named '%s'.",
gclass_name, name);
}
+ xo_open_container("Geom");
list_one_geom(gp);
+ xo_close_container("Geom");
}
} else {
+ xo_open_list("Geoms");
LIST_FOREACH(gp, &classp->lg_geom, lg_geom) {
if (LIST_EMPTY(&gp->lg_provider) && !all)
continue;
+ xo_open_instance("geom");
list_one_geom(gp);
+ xo_close_instance("geom");
}
+ xo_close_list("Geoms");
}
geom_deletetree(&mesh);
}
@@ -1115,34 +1150,24 @@ status_update_len_prs(struct ggeom *gp, int *name_len, int *status_len)
}
static char *
-status_one_consumer(struct gconsumer *cp)
+status_one_consumer(struct gconsumer *cp, const char *value)
{
- static char buf[256];
struct gprovider *pp;
struct gconfig *conf;
- const char *state, *syncr;
+ char *ret;
pp = cp->lg_provider;
if (pp == NULL)
return (NULL);
- state = NULL;
- syncr = NULL;
+ ret = NULL;
LIST_FOREACH(conf, &cp->lg_config, lg_config) {
- if (strcasecmp(conf->lg_name, "state") == 0)
- state = conf->lg_val;
- if (strcasecmp(conf->lg_name, "synchronized") == 0)
- syncr = conf->lg_val;
- }
- if (state == NULL && syncr == NULL)
- snprintf(buf, sizeof(buf), "%s", pp->lg_name);
- else if (state != NULL && syncr != NULL) {
- snprintf(buf, sizeof(buf), "%s (%s, %s)", pp->lg_name,
- state, syncr);
- } else {
- snprintf(buf, sizeof(buf), "%s (%s)", pp->lg_name,
- state ? state : syncr);
+ if (strcasecmp(conf->lg_name, value) == 0)
+ ret = conf->lg_val;
}
- return (buf);
+
+ if (ret == NULL)
+ return (NULL);
+ return (ret);
}
static void
@@ -1150,8 +1175,8 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len)
{
struct gconsumer *cp;
struct gconfig *conf;
- const char *name, *status, *component;
- int gotone;
+ const char *name, *status, *cstate, *csyncr;
+ int gotone, len;
name = gp->lg_name;
status = "N/A";
@@ -1161,21 +1186,49 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len)
break;
}
}
- gotone = 0;
+ gotone = len = 0;
+ xo_open_instance("status");
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- component = status_one_consumer(cp);
- if (component == NULL)
+ cstate = status_one_consumer(cp, "state");
+ csyncr = status_one_consumer(cp, "synchronized");
+ if (cstate == NULL && csyncr == NULL)
continue;
+ if (!gotone || script) {
+ if (!gotone) {
+ xo_emit("{:name/%*s} {:status/%*s} ",
+ name_len, name, status_len, status);
+ } else {
+ xo_emit("{d:name/%*s} {d:status/%*s} ",
+ name_len, name, status_len, status);
+ }
+ xo_open_list("components");
+ }
+
+ xo_open_instance("components");
+ if (cstate != NULL && csyncr != NULL) {
+ xo_emit("{P:/%*s}{:compontent} ({:state}, {:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, cstate, csyncr);
+ } else if (cstate != NULL) {
+ xo_emit("{P:/%*s}{:compontent} ({:state})\n",
+ len, "", cp->lg_provider->lg_name, cstate);
+ } else {
+ xo_emit("{P:/%*s}{:compontent} ({:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, csyncr);
+ }
+ xo_close_instance("components");
gotone = 1;
- printf("%*s %*s %s\n", name_len, name, status_len, status,
- component);
- if (!script)
- name = status = "";
+ if (!len && !script)
+ len = name_len + status_len + 4;
}
if (!gotone) {
- printf("%*s %*s %s\n", name_len, name, status_len, status,
- "N/A");
+ xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status);
+ xo_open_list("components");
+ xo_open_instance("components");
+ xo_emit("{P:/%*s}{d:compontent}\n", len, "", "N/A");
+ xo_close_instance("components");
}
+ xo_close_list("components");
+ xo_close_instance("status");
}
static void
@@ -1184,9 +1237,10 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len)
struct gprovider *pp;
struct gconsumer *cp;
struct gconfig *conf;
- const char *name, *status, *component;
- int gotone;
+ const char *name, *status, *cstate, *csyncr;
+ int gotone, len;
+ xo_open_instance("status");
LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
name = pp->lg_name;
status = "N/A";
@@ -1202,22 +1256,50 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len)
break;
}
}
- gotone = 0;
+ gotone = len = 0;
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- component = status_one_consumer(cp);
- if (component == NULL)
+ cstate = status_one_consumer(cp, "state");
+ csyncr = status_one_consumer(cp, "synchronized");
+ if (cstate == NULL && csyncr == NULL)
continue;
+
+ if (!gotone || script) {
+ if (!gotone) {
+ xo_emit("{:name/%*s} {:status/%*s} ",
+ name_len, name, status_len, status);
+ } else {
+ xo_emit("{d:name/%*s} {d:status/%*s} ",
+ name_len, name, status_len, status);
+ }
+ xo_open_list("components");
+ }
+
+ xo_open_instance("component");
+ if (cstate != NULL && csyncr != NULL) {
+ xo_emit("{P:/%*s}{:compontent} ({:state}, {:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, cstate, csyncr);
+ } else if (cstate != NULL) {
+ xo_emit("{P:/%*s}{:compontent} ({:state})\n",
+ len, "", cp->lg_provider->lg_name, cstate);
+ } else {
+ xo_emit("{P:/%*s}{:compontent} ({:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, csyncr);
+ }
+ xo_close_instance("component");
gotone = 1;
- printf("%*s %*s %s\n", name_len, name,
- status_len, status, component);
- if (!script)
- name = status = "";
+ if (!len && !script)
+ len = name_len + status_len + 4;
}
if (!gotone) {
- printf("%*s %*s %s\n", name_len, name,
- status_len, status, "N/A");
+ xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status);
+ xo_open_list("components");
+ xo_open_instance("components");
+ xo_emit("{P:/%*s}{d:compontent}\n", len, "", "N/A");
+ xo_close_instance("components");
}
+ xo_close_list("components");
}
+ xo_close_instance("status");
}
static void
@@ -1240,13 +1322,9 @@ std_status(struct gctl_req *req, unsigned flags __unused)
all = gctl_get_int(req, "all");
geoms = gctl_get_int(req, "geoms");
script = gctl_get_int(req, "script");
- if (script) {
- name_len = 0;
- status_len = 0;
- } else {
- name_len = strlen("Name");
- status_len = strlen("Status");
- }
+ name_len = strlen("Name");
+ status_len = strlen("Status");
+
if (nargs > 0) {
for (i = 0, n = 0; i < nargs; i++) {
name = gctl_get_ascii(req, "arg%d", i);
@@ -1282,9 +1360,10 @@ std_status(struct gctl_req *req, unsigned flags __unused)
goto end;
}
if (!script) {
- printf("%*s %*s %s\n", name_len, "Name", status_len, "Status",
- "Components");
+ xo_emit("{T:/%*s} {T:/%*s} {T:Components}\n",
+ name_len, "Name", status_len, "Status");
}
+ xo_open_list("status");
if (nargs > 0) {
for (i = 0; i < nargs; i++) {
name = gctl_get_ascii(req, "arg%d", i);
@@ -1312,6 +1391,7 @@ std_status(struct gctl_req *req, unsigned flags __unused)
}
}
}
+ xo_close_list("status");
end:
geom_deletetree(&mesh);
}
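Since xo_parse_args() now filters the argument vector, geom(8) should honor the standard libxo output selectors; hedged example invocations (class and device names invented):

    # Illustrative structured-output invocations after this change:
    geom disk list --libxo json,pretty
    geom -p ada0 --libxo xml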
diff --git a/sbin/ifconfig/ifconfig.8 b/sbin/ifconfig/ifconfig.8
index fafb77e1ca6c..d4f8d2b5747a 100644
--- a/sbin/ifconfig/ifconfig.8
+++ b/sbin/ifconfig/ifconfig.8
@@ -28,7 +28,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd September 12, 2025
+.Dd October 12, 2025
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -2799,7 +2799,7 @@ Set the untagged VLAN identifier for an interface.
Frames received on this interface without an 802.1Q tag will be assigned
to this VLAN instead of the default VLAN 0,
and outgoing frames on this VLAN will have their 802.1Q tag removed.
-.It Cm -ifuntagged Ar interface Ar vlan-id
+.It Cm -ifuntagged Ar interface
Clear the untagged VLAN identifier for an interface.
.It Cm defuntagged Ar vlan-id
Enable the
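A hedged configuration sketch combining the ifuntagged/-ifuntagged options documented above (interface names and VLAN number invented):

    # Untagged frames arriving on em0 join VLAN 10; em1 keeps its tags.
    ifconfig bridge0 create up
    ifconfig bridge0 addm em0 ifuntagged em0 10
    ifconfig bridge0 addm em1
    # Later, clear the untagged VLAN assignment for em0:
    ifconfig bridge0 -ifuntagged em0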
diff --git a/sbin/ipfw/tables.c b/sbin/ipfw/tables.c
index 7c3b1bb35a01..245c0c9e0399 100644
--- a/sbin/ipfw/tables.c
+++ b/sbin/ipfw/tables.c
@@ -1037,9 +1037,6 @@ table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add,
}
}
- /* Get real OS error */
- error = errno;
-
/* Report results back */
ptent = tent_buf;
for (i = 0; i < count; ptent++, i++) {
diff --git a/sbin/mdconfig/tests/mdconfig_test.sh b/sbin/mdconfig/tests/mdconfig_test.sh
index ea87ff5d542d..cc29c188cbd8 100755
--- a/sbin/mdconfig/tests/mdconfig_test.sh
+++ b/sbin/mdconfig/tests/mdconfig_test.sh
@@ -274,22 +274,23 @@ attach_size_rounddown()
attach_size_rounddown_body()
{
local md
- local ss=8192
- local ms=$(($ss + 4096))
- local ms2=$((2 * $ss + 4096))
+ local pgsz=$(pagesize)
+ local ss=$(($pgsz * 2))
+ local ms=$(($ss + $pgsz))
+ local ms2=$((2 * $ss + $pgsz))
- # Use a sector size that's a likely multiple of PAGE_SIZE, as md(4)
+ # Use a sector size that's a multiple of the kernel page size, as md(4)
# expects that for swap MDs.
atf_check -s exit:0 -o save:mdconfig.out -e empty \
-x "mdconfig -a -t swap -S $ss -s ${ms}b"
md=$(cat mdconfig.out)
- # 12288 bytes should be rounded down to one sector.
- check_diskinfo "$md" 8192 1 $ss
+ # One sector plus one page should be rounded down to one sector.
+ check_diskinfo "$md" $ss 1 $ss
# Resize and verify that the new size was also rounded down.
atf_check -s exit:0 -o empty -e empty \
-x "mdconfig -r -u ${md#md} -s ${ms2}b"
- check_diskinfo "$md" 16384 2 $ss
+ check_diskinfo "$md" $((2 * $ss)) 2 $ss
}
attach_size_rounddown_cleanup()
{
diff --git a/sbin/ping/tests/Makefile b/sbin/ping/tests/Makefile
index 0520b1d634cf..7d3ab02b9a86 100644
--- a/sbin/ping/tests/Makefile
+++ b/sbin/ping/tests/Makefile
@@ -1,5 +1,6 @@
ATF_TESTS_C+= in_cksum_test
-SRCS.in_cksum_test= in_cksum_test.c ../utils.c
+.PATH: ${.CURDIR:H}
+SRCS.in_cksum_test= in_cksum_test.c utils.c
PACKAGE= tests
diff --git a/share/man/man4/bridge.4 b/share/man/man4/bridge.4
index 7048df4593bf..3af952256d3a 100644
--- a/share/man/man4/bridge.4
+++ b/share/man/man4/bridge.4
@@ -36,7 +36,7 @@
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd July 28, 2025
+.Dd October 13, 2025
.Dt IF_BRIDGE 4
.Os
.Sh NAME
@@ -272,53 +272,149 @@ by setting the
node using
.Xr sysctl 8 .
.Sh VLAN SUPPORT
-The
+Virtual LANs (VLANs), defined in the IEEE 802.1Q standard, allow traffic
+on a bridge to be segregated into separate logical networks which cannot
+communicate with each other.
+For example, two interfaces in VLAN 10 would be able to communicate
+with each other, but not with another interface in VLAN 20.
+.Pp
+Each VLAN is identified by a number between 1 and 4094 inclusive.
+By default, all traffic on the bridge is assigned to "VLAN 0",
+a pseudo-VLAN used for historical compatibility.
+When VLANs are in use on a bridge, it is recommended to explicitly
+assign all traffic to a VLAN rather than using VLAN 0.
+.Pp
+The bridge implements Independent VLAN Learning (IVL), meaning that
+host addresses are learned separately for each VLAN, and the same host
+address may exist on several different ports in different VLANs.
+.Pp
+If a
+.Xr vlan 4
+interface is configured on an interface which is also an
.Nm
-driver has full support for virtual LANs (VLANs).
-The bridge implements independent VLAN learning, i.e. MAC addresses are
-learned on a per-VLAN basis, and the same MAC address may be learned on
-multiple interfaces on different VLANs.
-Incoming frames with an 802.1Q tag will be assigned to the appropriate
-VLAN.
-.Pp
-Traffic sent to or from the host is not assigned to a VLAN by default.
-To allow the host to communicate on a VLAN, configure a
+member interface, all tagged frames will be processed by the
.Xr vlan 4
-interface on the bridge and (if necessary) assign IP addresses there.
-.Pp
-By default no access control is enabled, so any interface may
-participate in any VLAN.
-.Pp
-VLAN filtering may be enabled on a bridge using the
+interface and will not be visible to the bridge.
+This configuration is not recommended and may be unsupported in a
+future release.
+.Ss Tagged and untagged traffic
+Incoming frames on a member interface may be either tagged or untagged.
+Tagged frames contain an 802.1Q header indicating which VLAN the
+frame belongs to, while untagged frames do not.
+When a tagged frame is received, the frame is automatically assigned to
+the VLAN in the tag (subject to any configured VLAN access list),
+while untagged frames are assigned to the interface's configured
+Port VLAN ID (PVID), or to VLAN 0 if no PVID is configured.
+.Ss Assigning interfaces to VLANs
+An interface's PVID may be configured using the
.Xr ifconfig 8
-.Cm vlanfilter
-option.
-When VLAN filtering is enabled, an interface may only send and receive
-frames based on its configured VLAN access list.
+.Cm ifuntagged
+command:
+.Bd -literal -offset indent
+ifconfig bridge0 ifuntagged ix0 10
+.Ed
.Pp
-The interface's untagged VLAN ID may be configured using the
-.Xr ifconfig 8
+Or by using the
.Cm untagged
-option.
-If an untagged VLAN ID is configured, incoming frames will be assigned
-to that VLAN, and the interface may receive outgoing untagged frames
-in that VLAN.
-.Pp
-The tagged VLAN access list may be configured using the
-.Cm tagged ,
-.Cm +tagged
-and
-.Cm -tagged
-options to
-.Xr ifconfig 8 .
-An interface may send and receive tagged frames for any VLAN in its
-access list.
+option to
+.Cm addm :
+.Bd -literal -offset indent
+ifconfig bridge0 addm ix0 untagged 10
+.Ed
.Pp
-The bridge will automatically insert or remove 802.1q tags as needed,
-based on the interface configuration, when forwarding frames between
-interfaces.
-This tag processing is only done for interfaces with VLAN filtering
-enabled.
+This will assign all untagged traffic received on the interface to the
+specified VLAN, and any traffic transmitted on the interface in this
+VLAN will have its VLAN tag (if present) removed.
+Conversely, any traffic transmitted on the interface in a different
+VLAN will have a tag added, to allow the remote system to assign the
+traffic to the appropriate VLAN.
+.Ss Host communication in a VLAN
+Sometimes it is useful to allow the host itself to communicate in a VLAN,
+for example to provide routing to other hosts in the VLAN.
+To do this, create a
+.Xr vlan 4
+interface on top of the
+.Nm
+interface with the appropriate VLAN tag.
+For example, to allow the host to communicate in VLAN 10:
+.Bd -literal -offset indent
+ifconfig bridge0.10 create inet6 2001:db8::1/64
+.Ed
+.Ss Configuring the VLAN access list (VLAN filtering)
+For historical reasons, the default
+.Nm
+configuration allows all interfaces to send tagged traffic for any VLAN,
+meaning that VLANs do not provide security separation.
+To restrict which interfaces may communicate in which VLANs,
+enable VLAN filtering on the bridge:
+.Bd -literal -offset indent
+ifconfig bridge0 vlanfilter
+.Ed
+.Pp
+This has the following effects on bridge members:
+.Bl -bullet -offset indent
+.It
+No untagged frames will be accepted from a member interface unless
+the interface has a PVID configured.
+.It
+No tagged frames will be accepted from a member interface unless
+the VLAN identifier is present in the interface's VLAN access list.
+.It
+Frames with stacked tags (Q-in-Q) will not be accepted from a
+member interface unless the
+.Cm qinq
+option (see below) has been configured for that member.
+.El
+.Pp
+To configure the VLAN access list, use the
+.Xr ifconfig 8
+.Cm iftagged ,
+.Cm +iftagged
+or
+.Cm -iftagged
+commands.
+For example, to allow an interface to communicate in VLANs 10, 20,
+and any VLAN from 100 to 199:
+.Bd -literal -offset indent
+ifconfig bridge0 iftagged ix0 10,20,100-199
+.Ed
+.Ss IEEE 802.1ad (Q-in-Q) configuration
+IEEE 802.1ad, also called Q-in-Q or
+.Dq tag stacking ,
+allows a single Ethernet frame to contain multiple tags.
+This allows one Ethernet network to transport traffic between endpoints
+using its own VLAN tags without interfering with any pre-existing tags,
+and is often used in service provider networks to provide
+.Dq virtual wire
+Ethernet services.
+.Pp
+When VLAN filtering is enabled,
+.Nm
+does not permit member interfaces to send Q-in-Q frames, because in
+certain configurations this allows
+.Dq VLAN-hopping
+attacks on the bridge.
+For example, consider a bridge with port ix0 configured as a tagged
+port in VLAN 10, and port ix1 configured as untagged in VLAN 10 and
+tagged in VLAN 20.
+If ix0 is allowed to send Q-in-Q frames, then it can send a frame with
+two tags: one for VLAN 10, followed by one for VLAN 20.
+When the bridge forwards the frame to ix1, it will strip the VLAN tag
+for VLAN 10, then forward the frame to ix1 with the tag for VLAN 20
+intact, effectively allowing ix0 to send traffic on VLAN 20 even
+though the bridge configuration should not permit that.
+.Pp
+To permit an interface to send Q-in-Q frames, set the
+.Xr ifconfig 8
+.Cm qinq
+flag on the interface.
+This is only required on the interface which will send Q-in-Q frames,
+not the interface receiving the frames.
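+For example, to permit
+.Dq Li ix0
+to send Q-in-Q frames:
+.Bd -literal -offset indent
+ifconfig bridge0 qinq ix0
+.Ed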
+.Pp
+Alternatively, set the
+.Cm defqinq
+flag on the bridge itself to enable Q-in-Q for all newly-added
+interfaces by default.
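+For example:
+.Bd -literal -offset indent
+ifconfig bridge0 defqinq
+.Ed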
.Sh PACKET FILTERING
Packet filtering can be used with any firewall package that hooks in via the
.Xr pfil 9
@@ -537,6 +633,36 @@ ifconfig_wlan0="up ssid my_ap mode 11g"
ifconfig_fxp0="up"
.Ed
.Pp
+The following will cause a bridge to be created with two VLANs,
+10 and 20, where the
+.Dq Li em
+interfaces can only communicate in their assigned VLANs,
+while
+.Dq Li ix0
+is a trunk port which can communicate in either VLAN:
+.Bd -literal -offset indent
+cloned_interfaces="bridge0"
+ifconfig_bridge0="vlanfilter \e
+ addm em0 untagged 10 \e
+ addm em1 untagged 10 \e
+ addm em2 untagged 20 \e
+ addm em3 untagged 20 \e
+ addm ix0 tagged 10,20"
+ifconfig_em0="up"
+ifconfig_em1="up"
+ifconfig_em2="up"
+ifconfig_em3="up"
+ifconfig_ix0="up"
+.Ed
+.Pp
+The previous example could be extended to allow the host to
+communicate in VLANs 10 and 20:
+.Bd -literal -offset indent
+vlans_bridge0="10 20"
+ifconfig_bridge0_10_ipv6="inet6 2001:db8:0:10::1/64"
+ifconfig_bridge0_20_ipv6="inet6 2001:db8:0:20::1/64"
+.Ed
+.Pp
Consider a system with two 4-port Ethernet boards.
The following will cause a bridge consisting of all 8 ports with
Rapid Spanning Tree enabled to be created:
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 78883296c5b7..6a0ece1e4d98 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -1246,7 +1246,7 @@ pmap_bootstrap(vm_offset_t firstaddr)
}
static void
-pmap_init_reserved_pages(void)
+pmap_init_reserved_pages(void *dummy __unused)
{
struct pcpu *pc;
vm_offset_t pages;
diff --git a/sys/arm/arm/unwind.c b/sys/arm/arm/unwind.c
index 7ad91a3e01a5..0d77074fae34 100644
--- a/sys/arm/arm/unwind.c
+++ b/sys/arm/arm/unwind.c
@@ -278,7 +278,7 @@ unwind_module_unloaded(struct linker_file *lf)
* the unwind tables might be stripped, so instead we have to use the
* _exidx_start/end symbols created by ldscript.arm.
*/
-static int
+static void
module_info_init(void *arg __unused)
{
struct linker_file thekernel;
@@ -291,8 +291,6 @@ module_info_init(void *arg __unused)
thekernel.exidx_addr = CADDR(&_exidx_start);
thekernel.exidx_size = UADDR(&_exidx_end) - UADDR(&_exidx_start);
populate_module_info(create_module_info(), &thekernel);
-
- return (0);
}
SYSINIT(unwind_init, SI_SUB_KMEM, SI_ORDER_ANY, module_info_init, NULL);
diff --git a/sys/arm64/coresight/coresight.c b/sys/arm64/coresight/coresight.c
index 5928c153f4ae..9b9d3c65ecc9 100644
--- a/sys/arm64/coresight/coresight.c
+++ b/sys/arm64/coresight/coresight.c
@@ -113,7 +113,7 @@ coresight_get_output_device(struct endpoint *endp, struct endpoint **out_endp)
}
static void
-coresight_init(void)
+coresight_init(void *dummy __unused)
{
mtx_init(&cs_mtx, "ARM Coresight", NULL, MTX_DEF);
diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c
index b518f84454ad..9c097f52d136 100644
--- a/sys/cam/scsi/scsi_all.c
+++ b/sys/cam/scsi/scsi_all.c
@@ -112,7 +112,7 @@ static void fetchtableentries(int sense_key, int asc, int ascq,
const struct asc_table_entry **);
#ifdef _KERNEL
-static void init_scsi_delay(void);
+static void init_scsi_delay(void *);
static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS);
static int set_scsi_delay(int delay);
#endif
@@ -9379,7 +9379,7 @@ scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id)
}
static void
-init_scsi_delay(void)
+init_scsi_delay(void *dummy __unused)
{
int delay;
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris.c b/sys/cddl/compat/opensolaris/kern/opensolaris.c
index 10924977c20d..898b2ea49f96 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris.c
@@ -67,7 +67,7 @@ opensolaris_load(void *dummy)
SYSINIT(opensolaris_register, SI_SUB_OPENSOLARIS, SI_ORDER_FIRST, opensolaris_load, NULL);
static void
-opensolaris_unload(void)
+opensolaris_unload(void *dummy __unused)
{
mutex_destroy(&cpu_lock);
}
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ea9b2667607e..a25ee8f6e1af 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support.
options TCP_RFC7413 # TCP Fast Open
options TCPHPTS
+#options TCP_HPTS_KTEST # Add KTEST support for HPTS
# In order to enable IPSEC you MUST also add device crypto to
# your kernel configuration
diff --git a/sys/conf/options b/sys/conf/options
index b48ad1cf42cf..0b795a8d28fb 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS
+TCP_HPTS_KTEST opt_inet.h
TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_global.h
TCP_BBR opt_inet.h
diff --git a/sys/dev/fdt/fdt_slicer.c b/sys/dev/fdt/fdt_slicer.c
index 3ba4eddf8b61..50112db5cfae 100644
--- a/sys/dev/fdt/fdt_slicer.c
+++ b/sys/dev/fdt/fdt_slicer.c
@@ -45,7 +45,7 @@
static int fill_slices(device_t dev, const char *provider,
struct flash_slice *slices, int *slices_num);
-static void fdt_slicer_init(void);
+static void fdt_slicer_init(void *);
static int
fill_slices_from_node(phandle_t node, struct flash_slice *slices, int *count)
@@ -138,7 +138,7 @@ fill_slices(device_t dev, const char *provider __unused,
}
static void
-fdt_slicer_init(void)
+fdt_slicer_init(void *dummy __unused)
{
flash_register_slicer(fill_slices, FLASH_SLICES_TYPE_NAND, false);
@@ -147,7 +147,7 @@ fdt_slicer_init(void)
}
static void
-fdt_slicer_cleanup(void)
+fdt_slicer_cleanup(void *dummy __unused)
{
flash_register_slicer(NULL, FLASH_SLICES_TYPE_NAND, true);
diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c
index ffa8dc096adc..80e37341b3dc 100644
--- a/sys/dev/iommu/iommu_gas.c
+++ b/sys/dev/iommu/iommu_gas.c
@@ -77,7 +77,7 @@ static int iommu_check_free;
#endif
static void
-intel_gas_init(void)
+intel_gas_init(void *dummy __unused)
{
iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c
index ead91f0d01fe..d119f9877aaa 100644
--- a/sys/dev/nvme/nvme.c
+++ b/sys/dev/nvme/nvme.c
@@ -51,7 +51,7 @@ int32_t nvme_retry_count;
MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations");
static void
-nvme_init(void)
+nvme_init(void *dummy __unused)
{
uint32_t i;
@@ -62,7 +62,7 @@ nvme_init(void)
SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL);
static void
-nvme_uninit(void)
+nvme_uninit(void *dummy __unused)
{
}
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index 57cb37907e65..f4ea08f129c0 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -2153,8 +2153,6 @@ static inline
void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused)
{
#if _BYTE_ORDER != _LITTLE_ENDIAN
- int i;
-
s->nsze = le64toh(s->nsze);
s->ncap = le64toh(s->ncap);
s->nuse = le64toh(s->nuse);
@@ -2173,7 +2171,7 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused)
s->anagrpid = le32toh(s->anagrpid);
s->nvmsetid = le16toh(s->nvmsetid);
s->endgid = le16toh(s->endgid);
- for (i = 0; i < 64; i++)
+ for (unsigned i = 0; i < nitems(s->lbaf); i++)
s->lbaf[i] = le32toh(s->lbaf[i]);
#endif
}
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index a06774a64761..7693aa6d54d3 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -391,7 +391,7 @@ nvme_sim_controller_fail(void *ctrlr_arg)
struct nvme_consumer *consumer_cookie;
static void
-nvme_sim_init(void)
+nvme_sim_init(void *dummy __unused)
{
if (nvme_use_nvd)
return;
@@ -404,7 +404,7 @@ SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY,
nvme_sim_init, NULL);
static void
-nvme_sim_uninit(void)
+nvme_sim_uninit(void *dummy __unused)
{
if (nvme_use_nvd)
return;
diff --git a/sys/dev/xdma/xdma.c b/sys/dev/xdma/xdma.c
index 62b781159d03..cdd9ad0b8f39 100644
--- a/sys/dev/xdma/xdma.c
+++ b/sys/dev/xdma/xdma.c
@@ -555,7 +555,7 @@ xdma_put(xdma_controller_t *xdma)
}
static void
-xdma_init(void)
+xdma_init(void *dummy __unused)
{
mtx_init(&xdma_mtx, "xDMA", NULL, MTX_DEF);
diff --git a/sys/dev/xen/bus/xen_intr.c b/sys/dev/xen/bus/xen_intr.c
index cb30b6efa484..2b5fa8fb7cd1 100644
--- a/sys/dev/xen/bus/xen_intr.c
+++ b/sys/dev/xen/bus/xen_intr.c
@@ -460,7 +460,7 @@ xen_intr_handle_upcall(void *unused __unused)
return (FILTER_HANDLED);
}
-static int
+static void
xen_intr_init(void *dummy __unused)
{
shared_info_t *s = HYPERVISOR_shared_info;
@@ -468,7 +468,7 @@ xen_intr_init(void *dummy __unused)
int i;
if (!xen_domain())
- return (0);
+ return;
_Static_assert(is_valid_evtchn(0),
"is_valid_evtchn(0) fails (unused by Xen, but valid by interface");
@@ -502,8 +502,6 @@ xen_intr_init(void *dummy __unused)
if (bootverbose)
printf("Xen interrupt system initialized\n");
-
- return (0);
}
SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
diff --git a/sys/fs/p9fs/p9_transport.c b/sys/fs/p9fs/p9_transport.c
index c82d81fedcd7..25eee984265c 100644
--- a/sys/fs/p9fs/p9_transport.c
+++ b/sys/fs/p9fs/p9_transport.c
@@ -34,9 +34,8 @@
TAILQ_HEAD(, p9_trans_module) transports;
static void
-p9_transport_init(void)
+p9_transport_init(void *dummy __unused)
{
-
TAILQ_INIT(&transports);
}
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index 6aac0e968362..3f659432552c 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -1605,7 +1605,7 @@ init386(int first)
}
static void
-machdep_init_trampoline(void)
+machdep_init_trampoline(void *dummy __unused)
{
struct region_descriptor r_gdt, r_idt;
struct i386tss *tss;
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index b44f5e08bbcf..1cf0867d57c3 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -720,7 +720,7 @@ __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr)
}
static void
-pmap_init_reserved_pages(void)
+pmap_init_reserved_pages(void *dummy __unused)
{
struct pcpu *pc;
vm_offset_t pages;
diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c
index 1fa87955a299..c83255bc74ee 100644
--- a/sys/kern/kern_boottrace.c
+++ b/sys/kern/kern_boottrace.c
@@ -579,7 +579,7 @@ sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
}
static void
-boottrace_init(void)
+boottrace_init(void *dummy __unused)
{
if (!boottrace_enabled)
diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c
index 7a2818c29b1a..a1696225df32 100644
--- a/sys/kern/kern_devctl.c
+++ b/sys/kern/kern_devctl.c
@@ -140,7 +140,7 @@ static struct devctlbridge {
} devctl_notify_hook = { .send_f = NULL };
static void
-devctl_init(void)
+devctl_init(void *dummy __unused)
{
int reserve;
uma_zone_t z;
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 23d8dc9cf54a..a6333d8011b1 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -156,7 +156,7 @@ static void knote_drop(struct knote *kn, struct thread *td);
static void knote_drop_detached(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
-static void knote_init(void);
+static void knote_init(void *);
static struct knote *knote_alloc(int mflag);
static void knote_free(struct knote *kn);
@@ -2887,7 +2887,7 @@ knote_dequeue(struct knote *kn)
}
static void
-knote_init(void)
+knote_init(void *dummy __unused)
{
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
diff --git a/sys/kern/kern_jailmeta.c b/sys/kern/kern_jailmeta.c
index 4e37eccad03a..91bb7155820d 100644
--- a/sys/kern/kern_jailmeta.c
+++ b/sys/kern/kern_jailmeta.c
@@ -599,22 +599,18 @@ SYSCTL_PROC(_security_jail, OID_AUTO, env,
/* Setup and tear down. */
-static int
+static void
jm_sysinit(void *arg __unused)
{
meta.osd_slot = osd_jail_register(jm_osd_destructor, meta.methods);
env.osd_slot = osd_jail_register(jm_osd_destructor, env.methods);
-
- return (0);
}
-static int
+static void
jm_sysuninit(void *arg __unused)
{
osd_jail_deregister(meta.osd_slot);
osd_jail_deregister(env.osd_slot);
-
- return (0);
}
SYSINIT(jailmeta, SI_SUB_DRIVERS, SI_ORDER_ANY, jm_sysinit, NULL);
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
index d566bc01bc5e..e2f63cbc0c5a 100644
--- a/sys/kern/kern_linker.c
+++ b/sys/kern/kern_linker.c
@@ -435,7 +435,7 @@ linker_file_register_modules(linker_file_t lf)
}
static void
-linker_init_kernel_modules(void)
+linker_init_kernel_modules(void *dummy __unused)
{
sx_xlock(&kld_sx);
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 653ce1ee556b..e919b15543b2 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -303,7 +303,7 @@ sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS)
*/
#if MALLOC_DEBUG_MAXZONES > 1
static void
-tunable_set_numzones(void)
+tunable_set_numzones(void *dummy __unused)
{
TUNABLE_INT_FETCH("debug.malloc.numzones",
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7351e9cb6313..2aab151aba08 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -1312,7 +1312,7 @@ static struct kproc_desc racctd_kp = {
};
static void
-racctd_init(void)
+racctd_init(void *dummy __unused)
{
if (!racct_enable)
return;
@@ -1322,7 +1322,7 @@ racctd_init(void)
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
static void
-racct_init(void)
+racct_init(void *dummy __unused)
{
if (!racct_enable)
return;
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
index 3854ffbeec29..cd66bff62608 100644
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -300,7 +300,7 @@ static void rangelock_free_free(struct rl_q_entry *free);
static void rangelock_noncheating_destroy(struct rangelock *lock);
static void
-rangelock_sys_init(void)
+rangelock_sys_init(void *dummy __unused)
{
rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 4232c71f86fb..682ba86d23ff 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -209,7 +209,7 @@ static struct dict actionnames[] = {
{ "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
-static void rctl_init(void);
+static void rctl_init(void *);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
static uma_zone_t rctl_rule_zone;
@@ -2175,7 +2175,7 @@ rctl_racct_release(struct racct *racct)
}
static void
-rctl_init(void)
+rctl_init(void *dummy __unused)
{
if (!racct_enable)
diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c
index 5b8398caaca9..f48d0e3d616b 100644
--- a/sys/kern/kern_sharedpage.c
+++ b/sys/kern/kern_sharedpage.c
@@ -130,8 +130,7 @@ shared_page_init(void *dummy __unused)
shared_page_mapping = (char *)addr;
}
-SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
- NULL);
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, shared_page_init, NULL);
/*
* Push the timehands update to the shared page.
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 8efc0886988b..21f765b17f62 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -113,7 +113,7 @@ static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock);
-static void sigqueue_start(void);
+static void sigqueue_start(void *);
static void sigfastblock_setpend(struct thread *td, bool resched);
static void sig_handle_first_stop(struct thread *td, struct proc *p,
int sig);
@@ -344,7 +344,7 @@ ast_sigsuspend(struct thread *td, int tda __unused)
}
static void
-sigqueue_start(void)
+sigqueue_start(void *dummy __unused)
{
ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 2a6f0989f6aa..5b7485c25cd7 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -90,7 +90,7 @@ static int user_clock_nanosleep(struct thread *td, clockid_t clock_id,
int flags, const struct timespec *ua_rqtp,
struct timespec *ua_rmtp);
-static void itimer_start(void);
+static void itimer_start(void *);
static int itimer_init(void *, int, int);
static void itimer_fini(void *, int);
static void itimer_enter(struct itimer *);
@@ -1170,7 +1170,7 @@ eventratecheck(struct timeval *lasttime, int *cureps, int maxeps)
}
static void
-itimer_start(void)
+itimer_start(void *dummy __unused)
{
static const struct kclock rt_clock = {
.timer_create = realtimer_create,
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
index 5c14e15830f4..c9a387a5e87b 100644
--- a/sys/kern/subr_pcpu.c
+++ b/sys/kern/subr_pcpu.c
@@ -140,7 +140,7 @@ uma_zone_t pcpu_zone_32;
uma_zone_t pcpu_zone_64;
static void
-pcpu_zones_startup(void)
+pcpu_zones_startup(void *dummy __unused)
{
pcpu_zone_4 = uma_zcreate("pcpu-4", 4,
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index c221106ae067..bc0725230cca 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -586,7 +586,7 @@ soaio_enqueue(struct task *task)
}
static void
-soaio_init(void)
+soaio_init(void *dummy __unused)
{
soaio_lifetime = AIOD_LIFETIME_DEFAULT;
diff --git a/sys/libkern/arc4random.c b/sys/libkern/arc4random.c
index 016822e9f03c..6fca7c3c4e9d 100644
--- a/sys/libkern/arc4random.c
+++ b/sys/libkern/arc4random.c
@@ -156,7 +156,7 @@ chacha20_randomstir(struct chacha20_s *chacha20)
* Initialize the contexts.
*/
static void
-chacha20_init(void)
+chacha20_init(void *dummy __unused)
{
struct chacha20_s *chacha20;
@@ -176,7 +176,7 @@ SYSINIT(chacha20, SI_SUB_LOCK, SI_ORDER_ANY, chacha20_init, NULL);
static void
-chacha20_uninit(void)
+chacha20_uninit(void *dummy __unused)
{
struct chacha20_s *chacha20;
diff --git a/sys/libkern/x86/crc32_sse42.c b/sys/libkern/x86/crc32_sse42.c
index b79c7afbeeb1..94ffdc178910 100644
--- a/sys/libkern/x86/crc32_sse42.c
+++ b/sys/libkern/x86/crc32_sse42.c
@@ -199,8 +199,10 @@ crc32c_shift(uint32_t zeros[][256], uint32_t crc)
static void
#ifndef _KERNEL
__attribute__((__constructor__))
-#endif
crc32c_init_hw(void)
+#else
+crc32c_init_hw(void *dummy __unused)
+#endif
{
crc32c_zeros(crc32c_long, LONG);
crc32c_zeros(crc32c_2long, 2 * LONG);
diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile
index a3052efa9ed9..d5f15576f38b 100644
--- a/sys/modules/ktest/Makefile
+++ b/sys/modules/ktest/Makefile
@@ -1,5 +1,6 @@
SUBDIR= ktest \
ktest_example \
- ktest_netlink_message_writer
+ ktest_netlink_message_writer \
+ ktest_tcphpts
.include <bsd.subdir.mk>
diff --git a/sys/modules/ktest/ktest_tcphpts/Makefile b/sys/modules/ktest/ktest_tcphpts/Makefile
new file mode 100644
index 000000000000..b642c0cb4209
--- /dev/null
+++ b/sys/modules/ktest/ktest_tcphpts/Makefile
@@ -0,0 +1,13 @@
+PACKAGE= tests
+WARNS?= 6
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/netinet
+
+KMOD= ktest_tcphpts
+SRCS= tcp_hpts_test.c
+
+.include <bsd.kmod.mk>
+
diff --git a/sys/net/route.c b/sys/net/route.c
index 7a50bcc43e06..d2c9f3e39c17 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -89,7 +89,7 @@ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
* SI_ORDER_MIDDLE.
*/
static void
-route_init(void)
+route_init(void *dummy __unused)
{
nhops_init();
diff --git a/sys/net/route/route_tables.c b/sys/net/route/route_tables.c
index 176ca43fa1c5..3b7bb1385d0e 100644
--- a/sys/net/route/route_tables.c
+++ b/sys/net/route/route_tables.c
@@ -186,7 +186,7 @@ rtables_prison_destructor(void *data)
}
static void
-rtables_init(void)
+rtables_init(void *dummy __unused)
{
osd_method_t methods[PR_MAXMETHOD] = {
[PR_METHOD_ATTACH] = rtables_check_proc_fib,
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index f0dcc973ca7c..be858428bb3e 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -309,7 +309,7 @@ rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
}
static void
-rtsock_init(void)
+rtsock_init(void *dummy __unused)
{
rtsbridge_orig_p = rtsock_callback_p;
rtsock_callback_p = &rtsbridge;
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 3af56a228295..a8a767785fce 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -167,7 +167,7 @@ static ieee80211_send_action_func ht_send_action_ba_delba;
static ieee80211_send_action_func ht_send_action_ht_txchwidth;
static void
-ieee80211_ht_init(void)
+ieee80211_ht_init(void *dummy __unused)
{
/*
* Setup HT parameters that depends on the clock frequency.
diff --git a/sys/net80211/ieee80211_hwmp.c b/sys/net80211/ieee80211_hwmp.c
index b69210768c54..084e67da13db 100644
--- a/sys/net80211/ieee80211_hwmp.c
+++ b/sys/net80211/ieee80211_hwmp.c
@@ -212,7 +212,7 @@ SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact,
"mesh route inactivity timeout (ms)");
static void
-ieee80211_hwmp_init(void)
+ieee80211_hwmp_init(void *dummy __unused)
{
/* Default values as per amendment */
ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000);
diff --git a/sys/net80211/ieee80211_mesh.c b/sys/net80211/ieee80211_mesh.c
index 3f0410a69e3c..7f2e8bdcb963 100644
--- a/sys/net80211/ieee80211_mesh.c
+++ b/sys/net80211/ieee80211_mesh.c
@@ -548,7 +548,7 @@ mesh_gatemode_cb(void *arg)
}
static void
-ieee80211_mesh_init(void)
+ieee80211_mesh_init(void *dummy __unused)
{
memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths));
diff --git a/sys/net80211/ieee80211_phy.c b/sys/net80211/ieee80211_phy.c
index 7f53c717152b..b4d9b16907d2 100644
--- a/sys/net80211/ieee80211_phy.c
+++ b/sys/net80211/ieee80211_phy.c
@@ -348,7 +348,7 @@ ieee80211_setup_ratetable(struct ieee80211_rate_table *rt)
/* Setup all rate tables */
static void
-ieee80211_phy_init(void)
+ieee80211_phy_init(void *dummy __unused)
{
static struct ieee80211_rate_table * const ratetables[] = {
&ieee80211_half_table,
diff --git a/sys/net80211/ieee80211_proto.c b/sys/net80211/ieee80211_proto.c
index 0c161d98a55a..4918bf7d025f 100644
--- a/sys/net80211/ieee80211_proto.c
+++ b/sys/net80211/ieee80211_proto.c
@@ -459,7 +459,7 @@ static const struct ieee80211_authenticator auth_internal = {
* Setup internal authenticators once; they are never unregistered.
*/
static void
-ieee80211_auth_setup(void)
+ieee80211_auth_setup(void *dummy __unused)
{
ieee80211_authenticator_register(IEEE80211_AUTH_OPEN, &auth_internal);
ieee80211_authenticator_register(IEEE80211_AUTH_SHARED, &auth_internal);
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index 10a5fc7f08ab..095c4108c768 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -102,7 +102,7 @@ vht_send_action_placeholder(struct ieee80211_node *ni,
}
static void
-ieee80211_vht_init(void)
+ieee80211_vht_init(void *dummy __unused)
{
ieee80211_recv_action_register(IEEE80211_ACTION_CAT_VHT,
diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c
index c20a20cd983d..bc06616dbf93 100644
--- a/sys/netinet/cc/cc.c
+++ b/sys/netinet/cc/cc.c
@@ -271,7 +271,7 @@ cc_check_default(struct cc_algo *remove_cc)
* Initialise CC subsystem on system boot.
*/
static void
-cc_init(void)
+cc_init(void *dummy __unused)
{
CC_LIST_LOCK_INIT();
STAILQ_INIT(&cc_list);
diff --git a/sys/netinet/in_fib_algo.c b/sys/netinet/in_fib_algo.c
index 123dacb409e7..95621c300064 100644
--- a/sys/netinet/in_fib_algo.c
+++ b/sys/netinet/in_fib_algo.c
@@ -767,7 +767,7 @@ struct fib_lookup_module flm_radix4 = {
};
static void
-fib4_algo_init(void)
+fib4_algo_init(void *dummy __unused)
{
fib_module_register(&flm_bsearch4);
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 63bbe4bba11b..c54459bb5f01 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -39,15 +39,14 @@
* First, and probably the main thing its used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
- * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
- * slot is the time from now that the stack wants to be called but it
- * must be converted to tcp_hpts's notion of slot. This is done with
- * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The
+ * usecs argument is the time from now, expressed directly in microseconds,
+ * at which the stack wants to be called. So a typical
* call from the tcp_output() routine might look like:
*
- * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ * tcp_hpts_insert(tp, 550, NULL);
*
- * The above would schedule tcp_output() to be called in 550 useconds.
+ * The above would schedule tcp_output() to be called in 550 microseconds.
* Note that if using this mechanism the stack will want to add near
* its top a check to prevent unwanted calls (from user land or the
* arrival of incoming ack's). So it would add something like:
@@ -149,27 +148,44 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#include <netinet/tcp_log_buf.h>
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remaineder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/* Default function table for production use. */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+ .microuptime = microuptime,
+ .swi_add = swi_add,
+ .swi_remove = swi_remove,
+ .swi_sched = swi_sched,
+ .intr_event_bind = intr_event_bind,
+ .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+ .callout_init = callout_init,
+ .callout_reset_sbt_on = callout_reset_sbt_on,
+ ._callout_stop_safe = _callout_stop_safe,
+};
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
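+
+/*
+ * Illustrative sketch (test-side usage, with assumed stub names): a
+ * TCP_HPTS_KTEST build can drive a private tcp_hptsi instance against
+ * stubbed primitives by supplying its own function table, e.g.
+ *
+ *	static const struct tcp_hptsi_funcs test_funcs = {
+ *		.microuptime = test_microuptime,	(a test-provided stub)
+ *		...
+ *	};
+ *
+ * because the defines above redirect each primitive call in this file
+ * through pace->funcs whenever TCP_HPTS_KTEST is defined.
+ */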
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
/*
* When using the hpts, a TCP stack must make sure
@@ -204,87 +220,22 @@
*
* When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
* then we do a dynamic adjustment on the time we sleep.
- * Our threshold is if the lateness of the first client served (in ticks) is
+ * Our threshold is if the lateness of the first client served (in slots) is
 * greater than or equal to slots_indicate_less_sleep (10ms
- * or 10000 ticks). If we were that late, the actual sleep time
- * is adjusted down by 50%. If the ticks_ran is less than
- * slots_indicate_more_sleep (100 ticks or 1000usecs).
+ * or 1000 slots). If we were that late, the actual sleep time
+ * is adjusted down by 50%. If the slots_ran is less than
+ * slots_indicate_more_sleep (100 slots or 1000 usecs), the sleep
+ * time is instead doubled.
*
*/
-/* Each hpts has its own p_mtx which is used for locking */
-#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
-#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
-#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
- /* Cache line 0x00 */
- struct mtx p_mtx; /* Mutex for hpts */
- struct timeval p_mysleep; /* Our min sleep time */
- uint64_t syscall_cnt;
- uint64_t sleeping; /* What the actual sleep was (if sleeping) */
- uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
- uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningslot; /* Current tick we are at if we are running */
- uint32_t p_prev_slot; /* Previous slot we were on */
- uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
- uint32_t p_nxt_slot; /* The next slot outside the current range of
- * slots that the hpts is running on. */
- int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t p_lasttick; /* Last tick before the current one */
- uint8_t p_direct_wake :1, /* boolean */
- p_on_min_sleep:1, /* boolean */
- p_hpts_wake_scheduled:1, /* boolean */
- hit_callout_thresh:1,
- p_avail:4;
- uint8_t p_fill[3]; /* Fill to 32 bits */
- /* Cache line 0x40 */
- struct hptsh {
- TAILQ_HEAD(, tcpcb) head;
- uint32_t count;
- uint32_t gencnt;
- } *p_hptss; /* Hptsi wheel */
- uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
- * of 255ms */
- uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
- uint32_t saved_lasttick; /* for logging */
- uint32_t saved_curtick; /* for logging */
- uint32_t saved_curslot; /* for logging */
- uint32_t saved_prev_slot; /* for logging */
- uint32_t p_delayed_by; /* How much were we delayed by */
- /* Cache line 0x80 */
- struct sysctl_ctx_list hpts_ctx;
- struct sysctl_oid *hpts_root;
- struct intr_event *ie;
- void *ie_cookie;
- uint16_t p_num; /* The hpts number one per cpu */
- uint16_t p_cpu; /* The hpts CPU */
- /* There is extra space in here */
- /* Cache line 0x100 */
- struct callout co __aligned(CACHE_LINE_SIZE);
-} __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
- struct cpu_group **grps;
- struct tcp_hpts_entry **rp_ent; /* Array of hptss */
- uint32_t *cts_last_ran;
- uint32_t grp_cnt;
- uint32_t rp_num_hptss; /* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
#else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"TCP Hpts statistics");
-#define timersub(tvp, uvp, vvp) \
- do { \
- (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
- (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
- if ((vvp)->tv_usec < 0) { \
- (vvp)->tv_sec--; \
- (vvp)->tv_usec += 1000000; \
- } \
- } while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
- int count;
- int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
counter_u64_t hpts_hopelessly_behind;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
"When we are over the threshold on the pacer do we prohibit wakeups?");
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
{
uint16_t cpuid;
uint32_t ran;
ran = arc4random();
- cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+ cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
return (cpuid);
}
@@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
log.u_bbr.flex2 = hpts->p_cur_slot;
log.u_bbr.flex3 = hpts->p_prev_slot;
log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
log.u_bbr.flex7 = hpts->p_cpu;
log.u_bbr.flex8 = (uint8_t)from_callout;
log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usec(tv);
log.u_bbr.epoch = hpts->saved_curslot;
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
@@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
}
}
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ */
static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the results
+ * of the callout_reset_sbt_on() function.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv->tv_usec;
+
+ sb = tvtosbt(*tv);
+ return (callout_reset_sbt_on(
+ &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HPTS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+
HPTS_MTX_ASSERT(hpts);
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
hpts->p_direct_wake = 0;
return;
@@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
}
static void
-hpts_timeout_swi(void *arg)
-{
- struct tcp_hpts_entry *hpts;
-
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
-}
-
-static void
tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
struct inpcb *inp = tptoinpcb(tp);
@@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
}
static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
INP_LOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
HPTS_LOCK(hpts);
return (hpts);
@@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp)
* and has never received a first packet.
*/
void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
{
-
if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
- tp->t_hpts_cpu = hpts_random_cpu();
+ tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
}
}
@@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp)
* INP lock and then get the hpts lock.
*/
void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_ONQUEUE) {
hptsh = &hpts->p_hptss[tp->t_hpts_slot];
tp->t_hpts_request = 0;
@@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
- * is that plus ticks out?
+ * is that plus slots out?
*/
- KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));
return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
-tick_to_wheel(uint32_t cts_in_wticks)
+cts_to_wheel(uint32_t cts)
{
/*
- * Given a timestamp in ticks (so by
- * default to get it to a real time one
- * would multiply by 10.. i.e the number
- * of ticks in a slot) map it to our limited
- * space wheel.
+ * Given a timestamp in microseconds, map it to our limited-space wheel.
*/
- return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+ return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);
}
static inline int
@@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
end_slot = hpts->p_runningslot;
- /* Back up one tick */
+ /* Back up one slot */
if (end_slot == 0)
end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
@@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
* not active, or we have
* completed the pass over
* the wheel, we can use the
- * prev tick and subtract one from it. This puts us
+ * prev slot and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
end_slot = hpts->p_prev_slot;
@@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
- * that wheel_tick, passed in, should be the current time
+ * that wheel_slot, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
if (hpts->p_prev_slot != wheel_slot)
@@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
- uint32_t hptsslot, int line)
+ uint32_t hptsslot)
{
/*
* Sanity checks for the pacer with invariants
@@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
}
#endif
-uint32_t
-tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+void
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs,
+ struct hpts_diag *diag)
{
struct tcp_hpts_entry *hpts;
struct timeval tv;
- uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
+ uint32_t slot, wheel_cts, last_slot, need_new_to = 0;
int32_t wheel_slot, maxslots;
bool need_wakeup = false;
@@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));
/*
+ * Convert microseconds to slots for internal use.
* We now return the next-slot the hpts will be on, beyond its
* current run (if up) or where it was when it stopped if it is
* sleeping.
*/
- hpts = tcp_hpts_lock(tp);
+ slot = HPTS_USEC_TO_SLOTS(usecs);
+ hpts = tcp_hpts_lock(pace, tp);
microuptime(&tv);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
@@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
- diag->p_curtick = hpts->p_curtick;
- diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
@@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
* timeout is not 1.
*/
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
- return (slot_on);
+ return;
}
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hpts_slot(&tv);
- /* Map it onto the wheel */
- wheel_slot = tick_to_wheel(wheel_cts);
+ /* Get the current time stamp and map it onto the wheel */
+ wheel_cts = tcp_tv_to_usec(&tv);
+ wheel_slot = cts_to_wheel(wheel_cts);
/* Now what's the max we can place it at? */
maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
if (diag) {
@@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tp->t_hpts_slot = last_slot;
}
if (diag) {
- diag->slot_remaining = tp->t_hpts_request;
+ diag->time_remaining = tp->t_hpts_request;
diag->inp_hptsslot = tp->t_hpts_slot;
}
#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
+ check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot);
#endif
if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
tcp_hpts_insert_internal(tp, hpts);
@@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
}
/*
* Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
+ * up and running we do nothing, otherwise we may need to
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
if (diag) {
diag->need_new_to = 0;
diag->co_ret = 0xffff0000;
@@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
} else if (need_new_to) {
int32_t co_ret;
struct timeval tv;
- sbintime_t sb;
tv.tv_sec = 0;
tv.tv_usec = 0;
@@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tv.tv_sec++;
need_new_to -= HPTS_USEC_IN_SEC;
}
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+ co_ret = tcp_hpts_sleep(hpts, &tv);
if (diag) {
diag->need_new_to = need_new_to;
diag->co_ret = co_ret;
}
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
-
- return (slot_on);
}
static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
{
struct inpcb *inp = tptoinpcb(tp);
u_int cpuid;
@@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
if (cpuid == NETISR_CPUID_NONE)
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
else
return (cpuid);
#endif
@@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
*/
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
counter_u64_add(cpu_uses_random, 1);
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
}
/*
* Hash to a thread based on the flowid. If we are using numa,
@@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef NUMA
} else {
/* Hash into the cpu's that use that domain */
- di = &hpts_domains[inp->inp_numa_domain];
+ di = &pace->domains[inp->inp_numa_domain];
cpuid = di->cpu[inp->inp_flowid % di->count];
}
#endif
@@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
}
}
-static int32_t
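+/*
+ * Returns true when the two timestamps fall into different wheel slots.
+ * For example, with HPTS_USECS_PER_SLOT being 10, timestamps of 1009 and
+ * 1011 usecs map to slots 100 and 101 and thus differ, while 1011 and
+ * 1019 usecs both map to slot 101.
+ */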
+static bool
+tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
+{
+ return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
+}
+
+int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
{
+ struct tcp_hptsi *pace;
struct tcpcb *tp;
struct timeval tv;
int32_t slots_to_run, i, error;
@@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
int32_t wrap_loop_cnt = 0;
int32_t slot_pos_of_endpoint = 0;
int32_t orig_exit_slot;
+ uint32_t cts, cts_last_run;
bool completed_measure, seen_endpoint;
completed_measure = false;
@@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
+
+ pace = hpts->p_hptsi;
+ MPASS(pace != NULL);
+
/* record previous info for any logging */
- hpts->saved_lasttick = hpts->p_lasttick;
- hpts->saved_curtick = hpts->p_curtick;
hpts->saved_curslot = hpts->p_cur_slot;
hpts->saved_prev_slot = hpts->p_prev_slot;
- hpts->p_lasttick = hpts->p_curtick;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
- orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ microuptime(&tv);
+ cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+ pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
+
+ orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
if ((hpts->p_on_queue_cnt == 0) ||
- (hpts->p_lasttick == hpts->p_curtick)) {
+ !tcp_hpts_different_slots(cts, cts_last_run)) {
/*
- * No time has yet passed,
- * or nothing to do.
+ * Not enough time has yet passed or nothing to do.
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
goto no_run;
}
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) &&
- (hpts->p_on_queue_cnt != 0)) {
+ if ((hpts->p_on_queue_cnt != 0) &&
+ ((cts - cts_last_run) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) {
/*
* Wheel wrap is occurring, basically we
* are behind and the distance between
@@ -1238,7 +1214,7 @@ again:
uint32_t runningslot;
/*
- * Calculate our delay, if there are no extra ticks there
+ * Calculate our delay, if there are no extra slots there
* was not any (i.e. if slots_to_run == 1, no delay).
*/
hpts->p_delayed_by = (slots_to_run - (i + 1)) *
@@ -1391,7 +1367,7 @@ again:
* gets added to the hpts (not this one)
* :-)
*/
- tcp_set_hpts(tp);
+ __tcp_set_hpts(pace, tp);
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
@@ -1450,10 +1426,12 @@ no_one:
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
- * more ticks (if we did not hit eno-bufs).
+ * more slots (if we did not hit eno-bufs).
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
+ microuptime(&tv);
+ cts_last_run = cts;
+ cts = tcp_tv_to_usec(&tv);
if (!from_callout || (loop_cnt > max_pacer_loops)) {
/*
* Something is seriously slow; we have
@@ -1465,7 +1443,7 @@ no_one:
* can never catch up :(
*
* We will just lie to this thread
- * and let it thing p_curtick is
+ * and let it think p_curslot is
* correct. When it next awakens
* it will find itself further behind.
*/
@@ -1473,20 +1451,19 @@ no_one:
counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
- hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+
+ hpts->p_cur_slot = cts_to_wheel(cts);
if (!seen_endpoint) {
/* We saw no endpoint but we may be looping */
orig_exit_slot = hpts->p_cur_slot;
}
- if ((wrap_loop_cnt < 2) &&
- (hpts->p_lasttick != hpts->p_curtick)) {
+ if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) {
counter_u64_add(hpts_loops, 1);
loop_cnt++;
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
+ pace->cts_last_ran[hpts->p_cpu] = cts;
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1494,25 +1471,36 @@ no_run:
*/
hpts->p_wheel_complete = 1;
/*
- * Now did we spend too long running input and need to run more ticks?
- * Note that if wrap_loop_cnt < 2 then we should have the conditions
- * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
- * is greater than 2, then the condtion most likely are *not* true.
- * Also if we are called not from the callout, we don't run the wheel
- * multiple times so the slots may not align either.
- */
- KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
- (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
- hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(((hpts->p_lasttick == hpts->p_curtick)
- || (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
- hpts->p_lasttick, hpts->p_curtick));
- if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
- hpts->p_curtick = tcp_gethptstick(&tv);
+ * If enough time has elapsed that we should be processing the next
+ * slot(s), then we should have kept running and not marked the wheel as
+ * complete.
+ *
+ * But there are several other conditions where we would have stopped
+ * processing, so the prev/cur slots and cts variables won't match.
+ * These conditions are:
+ *
+ * - Calls not from callouts don't run multiple times
+ * - The wheel is empty
+ * - We've processed more than max_pacer_loops times
+ * - We've wrapped more than 2 times
+ *
+ * This assert catches when the logic above has violated this design.
+ *
+ */
+ KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) ||
+ (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) ||
+ ((hpts->p_prev_slot == hpts->p_cur_slot) &&
+ !tcp_hpts_different_slots(cts, cts_last_run))),
+ ("H:%p Shouldn't be done! prev_slot:%u, cur_slot:%u, "
+ "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d",
+ hpts, hpts->p_prev_slot, hpts->p_cur_slot,
+ cts_last_run, cts, loop_cnt, wrap_loop_cnt));
+
+ if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+ hpts->p_cur_slot = cts_to_wheel(cts);
counter_u64_add(hpts_loops, 1);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
@@ -1526,16 +1514,16 @@ no_run:
}
void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
- tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+ tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
if (failed == 0)
tp->t_flags2 |= TF2_HPTS_CPU_SET;
}
@@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp)
}
static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
{
+ struct timeval tv;
int i, oldest_idx, start, end;
uint32_t cts, time_since_ran, calc;
- cts = tcp_get_usecs(NULL);
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
time_since_ran = 0;
/* Default is all one group */
start = 0;
- end = tcp_pace.rp_num_hptss;
+ end = pace->rp_num_hptss;
/*
* If we have more than one L3 group figure out which one
* this CPU is in.
*/
- if (tcp_pace.grp_cnt > 1) {
- for (i = 0; i < tcp_pace.grp_cnt; i++) {
- if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
- start = tcp_pace.grps[i]->cg_first;
- end = (tcp_pace.grps[i]->cg_last + 1);
+ if (pace->grp_cnt > 1) {
+ for (i = 0; i < pace->grp_cnt; i++) {
+ if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+ start = pace->grps[i]->cg_first;
+ end = (pace->grps[i]->cg_last + 1);
break;
}
}
}
oldest_idx = -1;
for (i = start; i < end; i++) {
- if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
- calc = cts - tcp_pace.cts_last_ran[i];
+ if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+ calc = cts - pace->cts_last_ran[i];
else
calc = 0;
if (calc > time_since_ran) {
@@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void)
}
}
if (oldest_idx >= 0)
- return(tcp_pace.rp_ent[oldest_idx]);
+ return(pace->rp_ent[oldest_idx]);
else
- return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
}
static void
@@ -1588,9 +1578,9 @@ __tcp_run_hpts(void)
{
struct epoch_tracker et;
struct tcp_hpts_entry *hpts;
- int ticks_ran;
+ int slots_ran;
- hpts = tcp_choose_hpts_to_run();
+ hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
if (hpts->p_hpts_active) {
/* Already active */
@@ -1606,12 +1596,11 @@ __tcp_run_hpts(void)
hpts->syscall_cnt++;
counter_u64_add(hpts_direct_call, 1);
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, false);
+ slots_ran = tcp_hptsi(hpts, false);
/* We may want to adjust the sleep values here */
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
- if (ticks_ran > slots_indicate_less_sleep) {
+ if (slots_ran > slots_indicate_less_sleep) {
struct timeval tv;
- sbintime_t sb;
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1635,13 +1624,8 @@ __tcp_run_hpts(void)
* the dynamic value and set the on_min_sleep
* flag so we will not be awoken.
*/
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else if (ticks_ran < slots_indicate_more_sleep) {
+ (void)tcp_hpts_sleep(hpts, &tv);
+ } else if (slots_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
@@ -1658,17 +1642,22 @@ out_with_mtx:
static void
tcp_hpts_thread(void *ctx)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
struct tcp_hpts_entry *hpts;
struct epoch_tracker et;
struct timeval tv;
- sbintime_t sb;
- int ticks_ran;
+ int slots_ran;
hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
HPTS_LOCK(hpts);
if (hpts->p_direct_wake) {
/* Signaled by input or output with low occupancy count. */
- callout_stop(&hpts->co);
+ _callout_stop_safe(&hpts->co, 0);
counter_u64_add(hpts_direct_awakening, 1);
} else {
/* Timed out, the normal case. */
@@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx)
}
hpts->sleeping = 0;
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, true);
+ slots_ran = tcp_hptsi(hpts, true);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) {
@@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx)
* Only adjust sleep time if we were
* called from the callout i.e. direct_wake == 0.
*/
- if (ticks_ran < slots_indicate_more_sleep) {
+ if (slots_ran < slots_indicate_more_sleep) {
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
- } else if (ticks_ran > slots_indicate_less_sleep) {
+ } else if (slots_ran > slots_indicate_less_sleep) {
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
@@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 0;
back_to_sleep:
hpts->p_direct_wake = 0;
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
NET_EPOCH_EXIT(et);
HPTS_UNLOCK(hpts);
}
-#undef timersub
-
static int32_t
hpts_count_level(struct cpu_group *cg)
{
@@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
}
}
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Create and initialize a tcp_hptsi structure. This performs the core
+ * initialization without starting any threads.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
{
+ struct tcp_hptsi *pace;
struct cpu_group *cpu_top;
- int32_t error __diagused;
- int32_t i, j, bound = 0, created = 0;
+ uint32_t i, j, cts;
+ int32_t count;
size_t sz, asz;
struct timeval tv;
- sbintime_t sb;
struct tcp_hpts_entry *hpts;
- struct pcpu *pc;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ KASSERT(funcs != NULL, ("funcs is NULL"));
+
+ /* Allocate the main structure */
+ pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+ if (pace == NULL)
+ return (NULL);
+
+ pace->funcs = funcs;
+
+ /* Setup CPU topology information */
#ifdef SMP
cpu_top = smp_topo();
#else
cpu_top = NULL;
#endif
- tcp_pace.rp_num_hptss = ncpus;
- hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
- hpts_loops = counter_u64_alloc(M_WAITOK);
- back_tosleep = counter_u64_alloc(M_WAITOK);
- combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
- wheel_wrap = counter_u64_alloc(M_WAITOK);
- hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
- hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
- hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
- hpts_direct_call = counter_u64_alloc(M_WAITOK);
- cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
- cpu_uses_random = counter_u64_alloc(M_WAITOK);
+ pace->rp_num_hptss = ncpus;
- sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
- tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
- tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
- tcp_pace.grp_cnt = 0;
+ /* Allocate hpts entry array */
+ sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+ /* Allocate timestamp tracking array */
+ sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+ pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+ /* Setup CPU groups */
if (cpu_top == NULL) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = 1;
} else {
/* Find out how many cache level 3 domains we have */
count = 0;
- tcp_pace.grp_cnt = hpts_count_level(cpu_top);
- if (tcp_pace.grp_cnt == 0) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = hpts_count_level(cpu_top);
+ if (pace->grp_cnt == 0) {
+ pace->grp_cnt = 1;
}
- sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
- tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+ sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+ pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
/* Now populate the groups */
- if (tcp_pace.grp_cnt == 1) {
+ if (pace->grp_cnt == 1) {
/*
* All we need is the top level all cpu's are in
* the same cache so when we use grp[0]->cg_mask
@@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void)
* all cpu's in it. The level here is probably
* zero which is ok.
*/
- tcp_pace.grps[0] = cpu_top;
+ pace->grps[0] = cpu_top;
} else {
/*
* Here we must find all the level three cache domains
* and setup our pointers to them.
*/
count = 0;
- hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+ hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
}
}
+
+ /* Cache the current time for initializing the hpts entries */
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+
+ /* Initialize each hpts entry */
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
M_TCPHPTS, M_WAITOK | M_ZERO);
- tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
- hpts = tcp_pace.rp_ent[i];
- /*
- * Init all the hpts structures that are not specifically
- * zero'd by the allocations. Also lets attach them to the
- * appropriate sysctl block as well.
- */
- mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
- "hpts", MTX_DEF | MTX_DUPOK);
- for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
- TAILQ_INIT(&hpts->p_hptss[j].head);
- hpts->p_hptss[j].count = 0;
- hpts->p_hptss[j].gencnt = 0;
- }
- sysctl_ctx_init(&hpts->hpts_ctx);
- sprintf(unit, "%d", i);
- hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
- SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
- OID_AUTO,
- unit,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "");
- SYSCTL_ADD_INT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "out_qcnt", CTLFLAG_RD,
- &hpts->p_on_queue_cnt, 0,
- "Count TCB's awaiting output processing");
- SYSCTL_ADD_U16(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "active", CTLFLAG_RD,
- &hpts->p_hpts_active, 0,
- "Is the hpts active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curslot", CTLFLAG_RD,
- &hpts->p_cur_slot, 0,
- "What the current running pacers goal");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningslot, 0,
- "What the running pacers current slot is");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curtick", CTLFLAG_RD,
- &hpts->p_curtick, 0,
- "What the running pacers last tick mapped to the wheel was");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "lastran", CTLFLAG_RD,
- &tcp_pace.cts_last_ran[i], 0,
- "The last usec tick that this hpts ran");
- SYSCTL_ADD_LONG(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
- &hpts->p_mysleep.tv_usec,
- "What the running pacers is using for p_mysleep.tv_usec");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "now_sleeping", CTLFLAG_RD,
- &hpts->sleeping, 0,
- "What the running pacers is actually sleeping for");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "syscall_cnt", CTLFLAG_RD,
- &hpts->syscall_cnt, 0,
- "How many times we had syscalls on this hpts");
+ pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+ M_WAITOK | M_ZERO);
+ hpts = pace->rp_ent[i];
+ /* Basic initialization */
hpts->p_hpts_sleep_time = hpts_sleep_max;
- hpts->p_num = i;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
- hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
- hpts->p_cpu = 0xffff;
+ hpts->p_cpu = i;
+ pace->cts_last_ran[i] = cts;
+ hpts->p_cur_slot = cts_to_wheel(cts);
+ hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
+ hpts->p_hptsi = pace;
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+ MTX_DEF | MTX_DUPOK);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j].head);
+ }
+
+ /* Setup SYSCTL if requested */
+ if (enable_sysctl) {
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current running pacers goal");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runslot", CTLFLAG_RD,
+ &hpts->p_runningslot, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &pace->cts_last_ran[i], 0,
+ "The last usec timestamp that this hpts ran");
+ SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+ }
}
- /* Don't try to bind to NUMA domains if we don't have any */
- if (vm_ndomains == 1 && tcp_bind_threads == 2)
- tcp_bind_threads = 0;
- /*
- * Now lets start ithreads to handle the hptss.
- */
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- hpts->p_cpu = i;
+ return (pace);
+}
+
+/*
+ * Create threads for a tcp_hptsi structure and start timers for the current
+ * (minimum) sleep interval.
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ struct pcpu *pc;
+ struct timeval tv;
+ uint32_t i, j;
+ int count, domain;
+ int error __diagused;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+ /* Start threads for each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+
+ KASSERT(hpts->ie_cookie == NULL,
+ ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
KASSERT(error == 0,
- ("Can't add hpts:%p i:%d err:%d",
- hpts, i, error));
- created++;
- hpts->p_mysleep.tv_sec = 0;
- hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
+ ("Can't add hpts:%p i:%d err:%d", hpts, i, error));
+
if (tcp_bind_threads == 1) {
- if (intr_event_bind(hpts->ie, i) == 0)
- bound++;
+ (void)intr_event_bind(hpts->ie, i);
} else if (tcp_bind_threads == 2) {
/* Find the group for this CPU (i) and bind into it */
- for (j = 0; j < tcp_pace.grp_cnt; j++) {
- if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
+ for (j = 0; j < pace->grp_cnt; j++) {
+ if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {
if (intr_event_bind_ithread_cpuset(hpts->ie,
- &tcp_pace.grps[j]->cg_mask) == 0) {
- bound++;
+ &pace->grps[j]->cg_mask) == 0) {
pc = pcpu_find(i);
domain = pc->pc_domain;
- count = hpts_domains[domain].count;
- hpts_domains[domain].cpu[count] = i;
- hpts_domains[domain].count++;
+ count = pace->domains[domain].count;
+ pace->domains[domain].cpu[count] = i;
+ pace->domains[domain].count++;
break;
}
}
}
}
+
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
- hpts->sleeping = tv.tv_usec;
- sb = tvtosbt(tv);
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- }
- /*
- * If we somehow have an empty domain, fall back to choosing
- * among all htps threads.
- */
- for (i = 0; i < vm_ndomains; i++) {
- if (hpts_domains[i].count == 0) {
- tcp_bind_threads = 0;
- break;
- }
+ (void)tcp_hpts_sleep(hpts, &tv);
}
- tcp_hpts_softclock = __tcp_run_hpts;
- tcp_lro_hpts_init();
- printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
- created, bound,
- tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}
-static void
-tcp_hpts_mod_unload(void)
+/*
+ * Stop all callouts/threads for a tcp_hptsi structure.
+ */
+void
+tcp_hptsi_stop(struct tcp_hptsi *pace)
{
+ struct tcp_hpts_entry *hpts;
int rv __diagused;
+ uint32_t i;
- tcp_lro_hpts_uninit();
- atomic_store_ptr(&tcp_hpts_softclock, NULL);
+ KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL"));
- for (int i = 0; i < tcp_pace.rp_num_hptss; i++) {
- struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i];
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i));
+ KASSERT(hpts->ie_cookie != NULL,
+ ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i));
- rv = callout_drain(&hpts->co);
+ rv = _callout_stop_safe(&hpts->co, CS_DRAIN);
MPASS(rv != 0);
rv = swi_remove(hpts->ie_cookie);
MPASS(rv == 0);
+ hpts->ie_cookie = NULL;
+ }
+}
- rv = sysctl_ctx_free(&hpts->hpts_ctx);
- MPASS(rv == 0);
+/*
+ * Destroy a tcp_hptsi structure created by tcp_hptsi_create().
+ */
+void
+tcp_hptsi_destroy(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t i;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL"));
+ KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL"));
+
+ /* Cleanup each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ if (hpts != NULL) {
+ /* Cleanup SYSCTL if it was initialized */
+ if (hpts->hpts_root != NULL) {
+ sysctl_ctx_free(&hpts->hpts_ctx);
+ }
- mtx_destroy(&hpts->p_mtx);
- free(hpts->p_hptss, M_TCPHPTS);
- free(hpts, M_TCPHPTS);
+ mtx_destroy(&hpts->p_mtx);
+ free(hpts->p_hptss, M_TCPHPTS);
+ free(hpts, M_TCPHPTS);
+ }
}
- free(tcp_pace.rp_ent, M_TCPHPTS);
- free(tcp_pace.cts_last_ran, M_TCPHPTS);
+ /* Cleanup main arrays */
+ free(pace->rp_ent, M_TCPHPTS);
+ free(pace->cts_last_ran, M_TCPHPTS);
#ifdef SMP
- free(tcp_pace.grps, M_TCPHPTS);
+ free(pace->grps, M_TCPHPTS);
#endif
+ /* Free the main structure */
+ free(pace, M_TCPHPTS);
+}
+
+static int
+tcp_hpts_mod_load(void)
+{
+ int i;
+
+ /* Don't try to bind to NUMA domains if we don't have any */
+ if (vm_ndomains == 1 && tcp_bind_threads == 2)
+ tcp_bind_threads = 0;
+
+ /* Create the tcp_hptsi structure */
+ tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true);
+ if (tcp_hptsi_pace == NULL)
+ return (ENOMEM);
+
+ /* Initialize global counters */
+ hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
+ hpts_loops = counter_u64_alloc(M_WAITOK);
+ back_tosleep = counter_u64_alloc(M_WAITOK);
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
+ hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+ hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+ hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+ hpts_direct_call = counter_u64_alloc(M_WAITOK);
+ cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+ cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+ /* Start the threads */
+ tcp_hptsi_start(tcp_hptsi_pace);
+
+ /* Enable the global HPTS softclock function */
+ tcp_hpts_softclock = __tcp_run_hpts;
+
+ /* Initialize LRO HPTS */
+ tcp_lro_hpts_init();
+
+ /*
+ * If we somehow have an empty domain, fall back to choosing among all
+ * HPTS threads.
+ */
+ for (i = 0; i < vm_ndomains; i++) {
+ if (tcp_hptsi_pace->domains[i].count == 0) {
+ tcp_bind_threads = 0;
+ break;
+ }
+ }
+
+ printf("TCP HPTS started %u (%s) swi interrupt threads\n",
+ tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ?
+ "(unbounded)" :
+ (tcp_bind_threads == 1 ? "per-cpu" : "per-NUMA-domain"));
+
+ return (0);
+}
+
+static void
+tcp_hpts_mod_unload(void)
+{
+ tcp_lro_hpts_uninit();
+
+ /* Disable the global HPTS softclock function */
+ atomic_store_ptr(&tcp_hpts_softclock, NULL);
+
+ tcp_hptsi_stop(tcp_hptsi_pace);
+ tcp_hptsi_destroy(tcp_hptsi_pace);
+ tcp_hptsi_pace = NULL;
+
+ /* Cleanup global counters */
counter_u64_free(hpts_hopelessly_behind);
counter_u64_free(hpts_loops);
counter_u64_free(back_tosleep);
@@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void)
}
static int
-tcp_hpts_modevent(module_t mod, int what, void *arg)
+tcp_hpts_mod_event(module_t mod, int what, void *arg)
{
-
switch (what) {
case MOD_LOAD:
- tcp_hpts_mod_load();
- return (0);
+ return (tcp_hpts_mod_load());
case MOD_QUIESCE:
/*
* Since we are a dependency of TCP stack modules, they should
@@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)
static moduledata_t tcp_hpts_module = {
.name = "tcphpts",
- .evhand = tcp_hpts_modevent,
+ .evhand = tcp_hpts_mod_event,
};
DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY);
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 6172baf2a062..6b05f9701ac2 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -28,19 +28,11 @@
/* Number of useconds represented by an hpts slot */
#define HPTS_USECS_PER_SLOT 10
-#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
-#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
static inline uint32_t
-tcp_tv_to_hpts_slot(const struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT));
-}
-
-static inline uint32_t
tcp_tv_to_usec(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
@@ -66,7 +58,7 @@ struct hpts_diag {
uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
- uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t time_remaining; /* bbr->flex5 x */
uint32_t have_slept; /* bbr->epoch x */
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
@@ -75,8 +67,6 @@ struct hpts_diag {
uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
- uint32_t p_curtick; /* upper bbr->cur_del_rate */
- uint32_t p_lasttick; /* lower bbr->cur_del_rate */
uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
@@ -92,13 +82,18 @@ struct hpts_diag {
#ifdef _KERNEL
+extern struct tcp_hptsi *tcp_hptsi_pace;
+
/*
* The following are the definitions for the kernel HPTS interface for managing
* the HPTS ring and the TCBs on it.
*/
-void tcp_hpts_init(struct tcpcb *);
-void tcp_hpts_remove(struct tcpcb *);
+void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp)
+
+void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp)
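+
+/*
+ * The __-prefixed variants take an explicit tcp_hptsi so that tests can
+ * target a private instance; the macros bind the global tcp_hptsi_pace
+ * for normal kernel callers.
+ */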
static inline bool
tcp_in_hpts(struct tcpcb *tp)
@@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp)
* that INP_WLOCK() or from destroying your TCB where again
* you should already have the INP_WLOCK().
*/
-uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
- struct hpts_diag *diag);
-#define tcp_hpts_insert(inp, slot) \
- tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
+void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs,
+ struct hpts_diag *diag);
+#define tcp_hpts_insert(tp, usecs, diag) \
+ __tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag))
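+
+/*
+ * For illustration (a sketch, not part of this change): callers hold the
+ * inpcb write lock around insertion, and "usecs" is the pacing delay from
+ * now, e.g. to pace a connection roughly 500 usec out:
+ *
+ *	INP_WLOCK(tptoinpcb(tp));
+ *	tcp_hpts_insert(tp, 500, NULL);
+ *	INP_WUNLOCK(tptoinpcb(tp));
+ */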
-void tcp_set_hpts(struct tcpcb *tp);
+void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp);
+#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp)
extern int32_t tcp_min_hptsi_time;
@@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void)
return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT);
}
-static inline uint32_t
-tcp_gethptstick(struct timeval *sv)
-{
- struct timeval tv;
-
- if (sv == NULL)
- sv = &tv;
- microuptime(sv);
- return (tcp_tv_to_hpts_slot(sv));
-}
-
static inline uint64_t
tcp_get_u64_usecs(struct timeval *tv)
{
@@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv)
return (tcp_tv_to_usec(tv));
}
-/*
- * LRO HPTS initialization and uninitialization, only for internal use by the
- * HPTS code.
- */
-void tcp_lro_hpts_init(void);
-void tcp_lro_hpts_uninit(void);
-
#endif /* _KERNEL */
#endif /* __tcp_hpts_h__ */
diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h
new file mode 100644
index 000000000000..8b33e03a6981
--- /dev/null
+++ b/sys/netinet/tcp_hpts_internal.h
@@ -0,0 +1,184 @@
+/*-
+ * Copyright (c) 2025 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __tcp_hpts_internal_h__
+#define __tcp_hpts_internal_h__
+
+/*
+ * TCP High Precision Timer System (HPTS) - Internal Definitions
+ *
+ * This header contains internal structures, constants, and interfaces that are
+ * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of
+ * the HPTS subsystem.
+ */
+
+#if defined(_KERNEL)
+
+/*
+ * The hpts uses a 102400-slot wheel. The wheel
+ * defines the time in 10 usec increments (102400 x 10).
+ * This gives a range of 10 usec - 1024 ms to place
+ * an entry within. If the user requests more than
+ * 1.024 seconds, a remainder is attached and the hpts,
+ * when seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
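+/*
+ * Worked example (illustration only): the wheel spans NUM_OF_HPTSI_SLOTS
+ * slots of 10 usec each, i.e. 1,024,000 usec. A request for 2,500,000
+ * usec cannot be placed directly; per the comment above, the insertion
+ * lands within the wheel's range and the remaining
+ * 2,500,000 - 1,024,000 = 1,476,000 usec is carried as a remainder that
+ * is consumed on later passes until it reaches zero.
+ */
+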
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
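+/*
+ * Sketch of the dynamic sleep tuning this threshold enables (mirroring
+ * the logic in __tcp_run_hpts() and tcp_hpts_thread(); names as used
+ * there):
+ *
+ *	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ *		if (slots_ran > slots_indicate_less_sleep)
+ *			hpts->p_mysleep.tv_usec /= 2;	(>= dynamic_min_sleep)
+ *		else if (slots_ran < slots_indicate_more_sleep)
+ *			hpts->p_mysleep.tv_usec *= 2;	(<= dynamic_max_sleep)
+ *	}
+ */
+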
+/* Convert microseconds to HPTS slots */
+#define HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10)
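+
+/*
+ * E.g., HPTS_USEC_TO_SLOTS(500) == 50, HPTS_USEC_TO_SLOTS(491) == 50 and
+ * HPTS_USEC_TO_SLOTS(1) == 1: partial slots always round up to a full
+ * 10 usec slot.
+ */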
+
+extern int tcp_bind_threads; /* Thread binding configuration
+ * (0=none, 1=cpu, 2=numa) */
+
+/*
+ * Abstraction layer controlling time, interrupts and callouts.
+ */
+struct tcp_hptsi_funcs {
+ void (*microuptime)(struct timeval *tv);
+ int (*swi_add)(struct intr_event **eventp, const char *name,
+ driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+ void **cookiep);
+ int (*swi_remove)(void *cookie);
+ void (*swi_sched)(void *cookie, int flags);
+ int (*intr_event_bind)(struct intr_event *ie, int cpu);
+ int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie,
+ struct _cpuset *mask);
+ void (*callout_init)(struct callout *c, int mpsafe);
+ int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt,
+ sbintime_t precision, void (*func)(void *), void *arg, int cpu,
+ int flags);
+ int (*_callout_stop_safe)(struct callout *c, int flags);
+};
+
+/* Default function table for system operation */
+extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs;
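+
+/*
+ * A minimal sketch of overriding just the time source (hypothetical
+ * names; the ktest code in this change instead replaces every member
+ * with counting stubs):
+ *
+ *	static void frozen_uptime(struct timeval *tv)
+ *	{
+ *		tv->tv_sec = 0;
+ *		tv->tv_usec = 0;
+ *	}
+ *
+ *	struct tcp_hptsi_funcs my_funcs = tcp_hptsi_default_funcs;
+ *	my_funcs.microuptime = frozen_uptime;
+ *	pace = tcp_hptsi_create(&my_funcs, false);
+ */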
+
+/* Each hpts has its own p_mtx which is used for locking */
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
+#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
+#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
+
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ struct timeval p_mysleep; /* Our min sleep time */
+ uint64_t syscall_cnt;
+ uint64_t sleeping; /* What the actual sleep was (if sleeping) */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_runningslot; /* Current slot we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range
+ * of slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint8_t p_direct_wake :1, /* boolean */
+ p_on_min_sleep:1, /* boolean */
+ p_hpts_wake_scheduled:1,/* boolean */
+ hit_callout_thresh:1,
+ p_avail:4;
+ uint8_t p_fill[3]; /* Fill to 32 bits */
+ /* Cache line 0x40 */
+ struct hptsh {
+ TAILQ_HEAD(, tcpcb) head;
+ uint32_t count;
+ uint32_t gencnt;
+ } *p_hptss; /* Hptsi wheel */
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+	uint32_t overidden_sleep;	/* what was overridden by min-sleep, for logging */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx;
+ struct sysctl_oid *hpts_root;
+ struct intr_event *ie;
+ void *ie_cookie;
+ uint16_t p_cpu; /* The hpts CPU */
+ struct tcp_hptsi *p_hptsi; /* Back pointer to parent hptsi structure */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+ struct cpu_group **grps;
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t *cts_last_ran;
+ uint32_t grp_cnt;
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+ struct hpts_domain_info {
+ int count;
+ int cpu[MAXCPU];
+ } domains[MAXMEMDOM]; /* Per-NUMA domain CPU assignments */
+ const struct tcp_hptsi_funcs *funcs; /* Function table for testability */
+};
+
+/*
+ * Core tcp_hptsi structure manipulation functions.
+ */
+struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs,
+ bool enable_sysctl);
+void tcp_hptsi_destroy(struct tcp_hptsi *pace);
+void tcp_hptsi_start(struct tcp_hptsi *pace);
+void tcp_hptsi_stop(struct tcp_hptsi *pace);
+uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace);
+int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
+
+void tcp_hpts_wake(struct tcp_hpts_entry *hpts);
+
+/*
+ * LRO HPTS initialization and uninitialization, only for internal use by the
+ * HPTS code.
+ */
+void tcp_lro_hpts_init(void);
+void tcp_lro_hpts_uninit(void);
+
+#endif /* defined(_KERNEL) */
+#endif /* __tcp_hpts_internal_h__ */
diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c
new file mode 100644
index 000000000000..bab5827e0572
--- /dev/null
+++ b/sys/netinet/tcp_hpts_test.c
@@ -0,0 +1,1662 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <tests/ktest.h>
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
+#include <dev/tcp_log/tcp_log_dev.h>
+#include <netinet/tcp_log_buf.h>
+
+#undef tcp_hpts_init
+#undef tcp_hpts_remove
+#undef tcp_hpts_insert
+#undef tcp_set_hpts
+
+/* Custom definitions that take the tcp_hptsi */
+#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp))
+#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp))
+#define tcp_hpts_insert(pace, tp, usecs, diag) \
+ __tcp_hpts_insert((pace), (tp), (usecs), (diag))
+#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp))
+
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test");
+
+static int test_exit_on_failure = true;
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP HPTS test controls");
+SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW,
+ &test_exit_on_failure, 0,
+ "Exit HPTS test immediately on first failure (1) or continue running all tests (0)");
+
+#define KTEST_VERIFY(x) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+#define KTEST_EQUAL(x, y) do { \
+ if ((x) != (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_NEQUAL(x, y) do { \
+ if ((x) == (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_GREATER_THAN(x, y) do { \
+ if ((x) <= (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_VERIFY_RET(x, y) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ if (test_exit_on_failure) \
+ return (y); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+static void
+dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts)
+{
+ KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts);
+ KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot);
+ KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot);
+ KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot);
+ KTEST_LOG(ctx, " p_runningslot: %u", hpts->p_runningslot);
+ KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt);
+ KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active);
+ KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete);
+ KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake);
+ KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep);
+ KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled);
+ KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh);
+ KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time);
+ KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by);
+ KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep);
+ KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot);
+ KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot);
+ KTEST_LOG(ctx, " syscall_cnt: %lu", hpts->syscall_cnt);
+ KTEST_LOG(ctx, " sleeping: %lu", hpts->sleeping);
+ KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu);
+ KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie);
+ KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi);
+ KTEST_LOG(ctx, " p_mysleep: %ld.%06ld", hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec);
+}
+
+static void
+dump_tcpcb(struct tcpcb *tp)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct inpcb *inp = &tp->t_inpcb;
+
+ KTEST_LOG(ctx, "tcp_control_block(%p)", tp);
+
+ /* HPTS-specific fields */
+ KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts);
+ KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu);
+ KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot);
+ KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt);
+ KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request);
+
+ /* LRO CPU field */
+ KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu);
+
+ /* TCP flags that affect HPTS */
+ KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2);
+ KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO");
+ KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO");
+ KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO");
+
+ /* Input PCB fields that HPTS uses */
+ KTEST_LOG(ctx, " inp_flags: 0x%x", inp->inp_flags);
+ KTEST_LOG(ctx, " INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO");
+ KTEST_LOG(ctx, " inp_flowid: 0x%x", inp->inp_flowid);
+ KTEST_LOG(ctx, " inp_flowtype: %u", inp->inp_flowtype);
+ KTEST_LOG(ctx, " inp_numa_domain: %d", inp->inp_numa_domain);
+}
+
+/* Enum for call counting indices */
+enum test_call_counts {
+ CCNT_MICROUPTIME = 0,
+ CCNT_SWI_ADD,
+ CCNT_SWI_REMOVE,
+ CCNT_SWI_SCHED,
+ CCNT_INTR_EVENT_BIND,
+ CCNT_INTR_EVENT_BIND_CPUSET,
+ CCNT_CALLOUT_INIT,
+ CCNT_CALLOUT_RESET_SBT_ON,
+ CCNT_CALLOUT_STOP_SAFE,
+ CCNT_TCP_OUTPUT,
+ CCNT_TCP_TFB_DO_QUEUED_SEGMENTS,
+ CCNT_MAX
+};
+
+static uint32_t call_counts[CCNT_MAX];
+
+static uint64_t test_time_usec = 0;
+
+/*
+ * Reset all test global variables to a clean state.
+ */
+static void
+test_hpts_init(void)
+{
+ memset(call_counts, 0, sizeof(call_counts));
+ test_time_usec = 0;
+}
+
+static void
+test_microuptime(struct timeval *tv)
+{
+ call_counts[CCNT_MICROUPTIME]++;
+ tv->tv_sec = test_time_usec / 1000000;
+ tv->tv_usec = test_time_usec % 1000000;
+}
+
+static int
+test_swi_add(struct intr_event **eventp, const char *name,
+ driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+ void **cookiep)
+{
+ call_counts[CCNT_SWI_ADD]++;
+ /* Simulate successful SWI creation */
+ *eventp = (struct intr_event *)0xfeedface; /* Mock event */
+ *cookiep = (void *)0xdeadbeef; /* Mock cookie */
+ return (0);
+}
+
+static int
+test_swi_remove(void *cookie)
+{
+ call_counts[CCNT_SWI_REMOVE]++;
+ /* Simulate successful removal */
+ return (0);
+}
+
+static void
+test_swi_sched(void *cookie, int flags)
+{
+ call_counts[CCNT_SWI_SCHED]++;
+ /* Simulate successful SWI scheduling */
+}
+
+static int
+test_intr_event_bind(struct intr_event *ie, int cpu)
+{
+ call_counts[CCNT_INTR_EVENT_BIND]++;
+ /* Simulate successful binding */
+ return (0);
+}
+
+static int
+test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask)
+{
+ call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++;
+ /* Simulate successful cpuset binding */
+ return (0);
+}
+
+static void
+test_callout_init(struct callout *c, int mpsafe)
+{
+ call_counts[CCNT_CALLOUT_INIT]++;
+ memset(c, 0, sizeof(*c));
+}
+
+static int
+test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+ void (*func)(void *), void *arg, int cpu, int flags)
+{
+ call_counts[CCNT_CALLOUT_RESET_SBT_ON]++;
+ /* Return 1 to simulate successful timer scheduling */
+ return (1);
+}
+
+static int
+test_callout_stop_safe(struct callout *c, int flags)
+{
+ call_counts[CCNT_CALLOUT_STOP_SAFE]++;
+ /* Return 1 to simulate successful timer stopping */
+ return (1);
+}
+
+static const struct tcp_hptsi_funcs test_funcs = {
+ .microuptime = test_microuptime,
+ .swi_add = test_swi_add,
+ .swi_remove = test_swi_remove,
+ .swi_sched = test_swi_sched,
+ .intr_event_bind = test_intr_event_bind,
+ .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset,
+ .callout_init = test_callout_init,
+ .callout_reset_sbt_on = test_callout_reset_sbt_on,
+ ._callout_stop_safe = test_callout_stop_safe,
+};
+
+#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare
+#define TP_LOG_TEST(tp) tp->t_log_state_set
+
+static int
+test_tcp_output(struct tcpcb *tp)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+ struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ call_counts[CCNT_TCP_OUTPUT]++;
+ if (TP_LOG_TEST(tp)) {
+ KTEST_LOG(ctx, "=> tcp_output(%p)", tp);
+ dump_tcpcb(tp);
+ dump_hpts_entry(ctx, hpts);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) {
+ if (TP_LOG_TEST(tp))
+ KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+ tcp_hpts_remove(pace, tp);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) {
+ INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */
+ return (-1); /* Simulate tcp_output error */
+ }
+
+ return (0);
+}
+
+static int
+test_tfb_do_queued_segments(struct tcpcb *tp, int flag)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+ struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++;
+ KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag);
+ dump_tcpcb(tp);
+ dump_hpts_entry(ctx, hpts);
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) {
+ if (TP_LOG_TEST(tp))
+ KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+ tcp_hpts_remove(pace, tp);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) {
+ INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */
+ return (-1); /* Simulate do_queued_segments error */
+ }
+
+ return (0);
+}
+
+static struct tcp_function_block test_tcp_fb = {
+ .tfb_tcp_block_name = "hpts_test_tcp",
+ .tfb_tcp_output = test_tcp_output,
+ .tfb_do_queued_segments = test_tfb_do_queued_segments,
+};
+
+/*
+ * Create a minimally initialized tcpcb that can be safely inserted into HPTS.
+ * This function allocates and initializes all the fields that HPTS code
+ * reads or writes.
+ */
+static struct tcpcb *
+test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace)
+{
+ struct tcpcb *tp;
+
+ tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO);
+ if (tp) {
+ rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp",
+ RW_RECURSE | RW_DUPOK);
+ refcount_init(&tp->t_inpcb.inp_refcount, 1);
+ tp->t_inpcb.inp_pcbinfo = &V_tcbinfo;
+ tp->t_fb = &test_tcp_fb;
+ tp->t_hpts_cpu = HPTS_CPU_NONE;
+ STAILQ_INIT(&tp->t_inqueue);
+ tcp_hpts_init(pace, tp);
+
+ /* Stuff some pointers in the tcb for test purposes. */
+ tp->t_fb_ptr = ctx;
+ tp->t_tfo_pending = (unsigned int*)pace;
+ }
+
+ return (tp);
+}
+
+/*
+ * Free a test tcpcb created by test_hpts_create_tcpcb()
+ */
+static void
+test_hpts_free_tcpcb(struct tcpcb *tp)
+{
+ if (tp == NULL)
+ return;
+
+ INP_LOCK_DESTROY(&tp->t_inpcb);
+ free(tp, M_TCPHPTS);
+}
+
+/*
+ * ***********************************************
+ * * KTEST functions for testing the HPTS module *
+ * ***********************************************
+ */
+
+/*
+ * Validates that the HPTS module is properly loaded and initialized by checking
+ * that the minimum HPTS time is configured.
+ */
+KTEST_FUNC(module_load)
+{
+ test_hpts_init();
+ KTEST_NEQUAL(tcp_min_hptsi_time, 0);
+ KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2);
+ KTEST_NEQUAL(tcp_hptsi_pace, NULL);
+ return (0);
+}
+
+/*
+ * Validates the creation and destruction of tcp_hptsi structures, ensuring
+ * proper initialization of internal fields and clean destruction.
+ */
+KTEST_FUNC(hptsi_create_destroy)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ KTEST_NEQUAL(pace->rp_ent, NULL);
+ KTEST_NEQUAL(pace->cts_last_ran, NULL);
+ KTEST_VERIFY(pace->rp_num_hptss > 0);
+ KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */
+ KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */
+ KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */
+
+ /* Verify individual HPTS entries are properly initialized */
+ for (uint32_t i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_NEQUAL(pace->rp_ent[i], NULL);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i);
+ KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace);
+ KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+ }
+
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcp_hptsi structures can be started and stopped properly,
+ * including verification that threads are created during start and cleaned up
+ * during stop operations.
+ */
+KTEST_FUNC(hptsi_start_stop)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+
+ tcp_hptsi_start(pace);
+
+ /* Verify that entries have threads started */
+ struct tcp_hpts_entry *hpts = pace->rp_ent[0];
+ KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */
+ KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */
+
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that multiple tcp_hptsi instances can coexist independently, with
+ * different configurations and CPU assignments without interfering with each
+ * other.
+ */
+KTEST_FUNC(hptsi_independence)
+{
+ struct tcp_hptsi *pace1, *pace2;
+ uint16_t cpu1, cpu2;
+
+ test_hpts_init();
+
+ pace1 = tcp_hptsi_create(&test_funcs, false);
+ pace2 = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace1, NULL);
+ KTEST_NEQUAL(pace2, NULL);
+ KTEST_NEQUAL(pace2->rp_ent, NULL);
+
+ cpu1 = tcp_hptsi_random_cpu(pace1);
+ cpu2 = tcp_hptsi_random_cpu(pace2);
+ KTEST_VERIFY(cpu1 < pace1->rp_num_hptss);
+ KTEST_VERIFY(cpu2 < pace2->rp_num_hptss);
+
+ /* Verify both instances have independent entry arrays */
+ KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent);
+	/* The two instances may have different CPU counts, but both must be reasonable */
+ KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU);
+ KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU);
+
+ tcp_hptsi_destroy(pace1);
+ tcp_hptsi_destroy(pace2);
+
+ return (0);
+}
+
+/*
+ * Validates that custom function injection works correctly, ensuring that
+ * test-specific implementations of microuptime and others are properly
+ * called by the HPTS system.
+ */
+KTEST_FUNC(function_injection)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ KTEST_EQUAL(pace->funcs, &test_funcs);
+ KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0);
+
+ tcp_hptsi_start(pace);
+ KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0);
+ KTEST_VERIFY(tcp_bind_threads == 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND] > 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0);
+
+ tcp_hptsi_stop(pace);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0);
+ KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0);
+
+ tcp_hptsi_destroy(pace);
+
+ /* Verify we have a reasonable balance of create/destroy calls */
+ KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]);
+
+ return (0);
+}
+
+/*
+ * Validates that a tcpcb can be properly initialized for HPTS compatibility,
+ * ensuring all required fields are set correctly and function pointers are
+ * valid for safe HPTS operations.
+ */
+KTEST_FUNC(tcpcb_initialization)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Verify the tcpcb is properly initialized for HPTS */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_NEQUAL(tp->t_fb, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0);
+
+ /* Verify that HPTS-specific fields are initialized */
+ KTEST_EQUAL(tp->t_hpts_gencnt, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, 0);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_lro_cpu, 0);
+ KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1);
+ KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED));
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcb structures can be successfully inserted into and removed
+ * from the HPTS wheel, with proper state tracking and slot assignment during
+ * the process.
+ */
+KTEST_FUNC(tcpcb_insertion)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ uint32_t timeout_usecs = 10;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0);
+ tcp_hpts_insert(pace, tp, timeout_usecs, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1);
+ KTEST_VERIFY(tcp_in_hpts(tp));
+ KTEST_VERIFY(tp->t_hpts_slot >= 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs));
+ //KTEST_EQUAL(tp->t_hpts_gencnt, 1);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_VERIFY(!tcp_in_hpts(tp));
+
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the core HPTS timer functionality by verifying that scheduled
+ * tcpcb entries trigger tcp_output calls at appropriate times, simulating
+ * real-world timer-driven TCP processing.
+ */
+KTEST_FUNC(timer_functionality)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+ int32_t slots_ran;
+ uint32_t i;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ dump_tcpcb(tp);
+ TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */
+
+ KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */
+ tcp_hpts_insert(pace, tp, 500, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ dump_tcpcb(tp);
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(hpts->p_prev_slot, 0);
+ KTEST_EQUAL(hpts->p_cur_slot, 0);
+ KTEST_EQUAL(hpts->p_runningslot, 0);
+ KTEST_EQUAL(hpts->p_nxt_slot, 1);
+ KTEST_EQUAL(hpts->p_hpts_active, 0);
+
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+
+ /* Set our test flag to indicate the tcpcb should be removed from the
+ * wheel when tcp_output is called. */
+ TP_REMOVE_FROM_HPTS(tp) = 1;
+
+ /* Test early exit condition: advance time by insufficient amount */
+ KTEST_LOG(ctx, "Testing early exit with insufficient time advancement");
+ test_time_usec += 1; /* Very small advancement - should cause early exit */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 slots due to insufficient time advancement */
+ KTEST_EQUAL(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */
+
+ /* Wait for 498 more usecs and trigger the HPTS workers and verify
+ * nothing happens yet (total 499 usec) */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 498;
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+ /* Wait for 1 more usec and trigger the HPTS workers and verify it
+ * triggers tcp_output this time */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 1;
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into
+ * the HPTS wheel, testing performance under high load conditions.
+ */
+KTEST_FUNC(scalability_tcpcbs)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ uint32_t i, num_tcpcbs = 100000, total_queued = 0;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Allocate array to hold pointers to all tcpcbs */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ /* Create a LOT of tcpcbs */
+ KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs);
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ if (tcpcbs[i] == NULL) {
+ KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL");
+ return (EINVAL);
+ }
+ }
+
+ /* Insert all created tcpcbs into HPTS */
+ KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS...");
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ /* Insert with varying future timeouts to distribute across slots */
+ tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
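+
+ /*
+  * The 100 + (i % 1000) usec timeouts span roughly 100..1099 usecs,
+  * i.e. about 100 distinct wheel slots at 10 usecs per slot, so the
+  * load is spread out rather than piled into a single slot.
+  */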
+
+ /* Verify total queue counts across all CPUs */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ total_queued += pace->rp_ent[i]->p_on_queue_cnt;
+ }
+ KTEST_EQUAL(total_queued, num_tcpcbs);
+
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ /* Remove all tcpcbs from HPTS */
+ KTEST_LOG(ctx, "Removing all tcpcbs from HPTS...");
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tcpcbs[i]);
+ }
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Verify all queues are now empty */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ if (pace->rp_ent[i]->p_on_queue_cnt != 0) {
+ KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0");
+ return (EINVAL);
+ }
+ }
+
+ for (i = 0; i < num_tcpcbs; i++) {
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates wheel wrap scenarios where the timer falls significantly behind
+ * and needs to process more than one full wheel revolution worth of slots.
+ */
+KTEST_FUNC(wheel_wrap_recovery)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ uint32_t i, timeout_usecs, num_tcpcbs = 500;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Allocate array to hold pointers to tcpcbs */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ /* Create tcpcbs and insert them across many slots */
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1;
+
+ timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */
+
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Fast forward time significantly to trigger wheel wrap */
+ test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT;
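+
+ /*
+  * A single tcp_hptsi() pass is expected to clamp its work to one
+  * full revolution (NUM_OF_HPTSI_SLOTS - 1 slots), which is what the
+  * assertion below checks even though more time than that elapsed.
+  */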
+
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS - 1); /* Should process all slots */
+ KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot,
+ pace->rp_ent[i]->p_prev_slot);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tcpcbs[i]);
+ }
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs
+ * when a tcpcb is being processed by the HPTS thread but gets removed.
+ */
+KTEST_FUNC(tcpcb_moving_state)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create two tcpcbs on the same CPU/slot */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force them to the same CPU for predictable testing */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert both into the same slot */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, 100, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, 100, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ hpts = pace->rp_ent[0];
+
+ /* Manually transition tp1 to the MOVING state to simulate a race */
+ HPTS_LOCK(hpts);
+ tp1->t_in_hpts = IHPTS_MOVING;
+ tp1->t_hpts_slot = -1; /* Mark for removal */
+ HPTS_UNLOCK(hpts);
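+
+ /*
+  * In the IHPTS_MOVING state the pacer still holds a reference to the
+  * tcpcb; t_hpts_slot == -1 signals that it was logically removed, so
+  * the pacer should release it without calling tcp_output on it.
+  */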
+
+ /* Set time and run HPTS to process the moving state */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Only tp2 should trigger output */
+
+ /* tp1 should be cleaned up and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ /* tp2 should have been processed normally */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are
+ * properly handled and re-inserted into appropriate future slots after
+ * the wheel processes enough slots to accommodate the original request.
+ */
+KTEST_FUNC(deferred_requests)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp, *tp2;
+ struct tcp_hpts_entry *hpts;
+ uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */
+ uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */
+ uint32_t initial_request;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Insert with a request that exceeds current wheel capacity */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Verify it was inserted with a deferred request */
+ dump_tcpcb(tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_request > 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
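+ /*
+  * A timeout beyond wheel capacity is expected to be split: the tcpcb
+  * is parked in a slot within the current wheel horizon while the
+  * overflow is kept in t_hpts_request, to be re-armed as the wheel
+  * advances.
+  */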
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Advance time to process deferred requests */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+
+ /* Process the wheel to handle deferred requests */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, hpts);
+ KTEST_GREATER_THAN(slots_ran, 0);
+ dump_tcpcb(tp);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+
+ /* Test incremental deferred request processing over multiple cycles */
+ KTEST_LOG(ctx, "Testing incremental deferred request processing");
+
+ /* Create a new connection with an even larger request */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify initial deferred request */
+ initial_request = tp2->t_hpts_request;
+ KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS);
+
+ /* Process one wheel cycle - should reduce but not eliminate request */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Request should be reduced but not zero */
+ KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */
+
+ /*
+  * For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT,
+  * we need ~3 cycles to complete.  Each cycle can reduce the request
+  * by at most NUM_OF_HPTSI_SLOTS.
+  */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */
+ KTEST_VERIFY(tp2->t_hpts_request < initial_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */
+
+ /* Clean up second connection */
+ INP_WLOCK(&tp2->t_inpcb);
+ if (tp2->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp2);
+ }
+ INP_WUNLOCK(&tp2->t_inpcb);
+ test_hpts_free_tcpcb(tp2);
+
+ /* Clean up */
+ INP_WLOCK(&tp->t_inpcb);
+ if (tp->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp);
+ }
+ INP_WUNLOCK(&tp->t_inpcb);
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates CPU assignment and affinity mechanisms, including flowid-based
+ * assignment, random fallback scenarios, and explicit CPU setting. Tests
+ * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts.
+ */
+KTEST_FUNC(cpu_assignment)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2, *tp2_dup, *tp3;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+
+ /* Test random CPU assignment (no flowid) */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE;
+ INP_WLOCK(&tp1->t_inpcb);
+ tcp_set_hpts(pace, tp1);
+ INP_WUNLOCK(&tp1->t_inpcb);
+ KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET);
+
+ /* Test flowid-based assignment */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+ tp2->t_inpcb.inp_flowid = 12345;
+ INP_WLOCK(&tp2->t_inpcb);
+ tcp_set_hpts(pace, tp2);
+ INP_WUNLOCK(&tp2->t_inpcb);
+ KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET);
+
+ /* With the same flowid, should get same CPU assignment */
+ tp2_dup = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2_dup, NULL);
+ tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+ tp2_dup->t_inpcb.inp_flowid = 12345;
+ INP_WLOCK(&tp2_dup->t_inpcb);
+ tcp_set_hpts(pace, tp2_dup);
+ INP_WUNLOCK(&tp2_dup->t_inpcb);
+ KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu);
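+ /*
+  * The CPU choice is expected to be a deterministic function of the
+  * flowid (e.g. a hash or modulo over the hpts count), hence
+  * identical flowids must map to identical CPUs.
+  */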
+
+ /* Test explicit CPU setting */
+ tp3 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp3, NULL);
+ tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */
+ tp3->t_flags2 |= TF2_HPTS_CPU_SET;
+ INP_WLOCK(&tp3->t_inpcb);
+ tcp_set_hpts(pace, tp3);
+ INP_WUNLOCK(&tp3->t_inpcb);
+ KTEST_EQUAL(tp3->t_hpts_cpu, 1);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ test_hpts_free_tcpcb(tp2_dup);
+ test_hpts_free_tcpcb(tp3);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates edge cases in slot calculation including boundary conditions
+ * around slot 0, maximum slots, and slot wrapping arithmetic.
+ */
+KTEST_FUNC(slot_boundary_conditions)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Test insertion at slot 0 */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test insertion at maximum slot value */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test very small timeout values */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 1, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */
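+ /*
+  * HPTS_USEC_TO_SLOTS() presumably rounds sub-slot timeouts up, so a
+  * 1 usec request still lands one whole slot in the future rather
+  * than in the current slot.
+  */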
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS dynamic sleep adjustment under high load: once the
+ * connection count exceeds the wakeup threshold, many connections must
+ * still be processed and the queue count tracked correctly.
+ */
+KTEST_FUNC(dynamic_sleep_adjustment)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ struct tcp_hpts_entry *hpts;
+ uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create many connections to exceed threshold */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */
+ tcp_hpts_insert(pace, tcpcbs[i], 100, NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ hpts = pace->rp_ent[0];
+ dump_hpts_entry(ctx, hpts);
+
+ /* Verify we're above threshold */
+ KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD);
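+ /*
+  * Above DEFAULT_CONNECTION_THRESHOLD the pacer is expected to adjust
+  * its sleep/wakeup behavior (e.g. suppressing direct wakes) instead
+  * of sleeping between individual wheel ticks.
+  */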
+
+ /* Run HPTS to process many connections */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify HPTS processed slots and connections correctly */
+ KTEST_GREATER_THAN(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs);
+
+ /* Verify all connections were removed from queue */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates handling of concurrent insert/remove operations and race conditions
+ * between HPTS processing and user operations.
+ */
+KTEST_FUNC(concurrent_operations)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force all to CPU 0 */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert tp1 */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, 100, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Insert tp2 into same slot */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, 100, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify both are inserted */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify they're both assigned to the same slot */
+ KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot);
+
+ /* Verify queue count reflects both connections */
+ KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */
+ hpts = pace->rp_ent[tp1->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 2);
+
+ /* Remove tp1 while tp2 is still there */
+ INP_WLOCK(&tp1->t_inpcb);
+ tcp_hpts_remove(pace, tp1);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify tp1 removed, tp2 still there */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify queue count decreased by one */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+ /* Remove tp2 */
+ INP_WLOCK(&tp2->t_inpcb);
+ tcp_hpts_remove(pace, tp2);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ /* Verify queue is now completely empty */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the queued segments processing path via tfb_do_queued_segments,
+ * which is an alternative to direct tcp_output calls.
+ */
+KTEST_FUNC(queued_segments_processing)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ struct mbuf *fake_mbuf;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Create a minimal fake mbuf that has valid STAILQ pointers */
+ fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_NEQUAL(fake_mbuf, NULL);
+
+ /* Set up for queued segments path */
+ tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ);
+ STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt);
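+ /*
+  * With TF2_SUPPORTS_MBUFQ set and a non-empty input queue, the pacer
+  * is expected to hand the tcpcb to tfb_do_queued_segments() instead
+  * of calling tcp_output() directly.
+  */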
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_insert(pace, tp, 100, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Run HPTS and verify queued segments path is taken */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1);
+
+ /* Connection should be removed from HPTS after processing */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+
+ /* Clean up the fake mbuf if it's still in the queue */
+ if (!STAILQ_EMPTY(&tp->t_inqueue)) {
+ struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue);
+ STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt);
+ free(m, M_TCPHPTS);
+ }
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the direct wake mechanism and wake inhibition logic when
+ * the connection count exceeds thresholds.
+ */
+KTEST_FUNC(direct_wake_mechanism)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Test direct wake when not over threshold */
+ HPTS_LOCK(hpts);
+ hpts->p_on_queue_cnt = 50; /* Below threshold */
+ hpts->p_hpts_wake_scheduled = 0;
+ tcp_hpts_wake(hpts);
+ KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1);
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1);
+ HPTS_UNLOCK(hpts);
+
+ /* Reset for next test */
+ hpts->p_hpts_wake_scheduled = 0;
+ call_counts[CCNT_SWI_SCHED] = 0;
+
+ /* Test wake inhibition when over threshold */
+ HPTS_LOCK(hpts);
+ hpts->p_on_queue_cnt = 200; /* Above threshold */
+ hpts->p_direct_wake = 1; /* Request direct wake */
+ tcp_hpts_wake(hpts);
+ KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */
+ KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */
+ HPTS_UNLOCK(hpts);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS collision detection when attempting to run HPTS while
+ * it's already active.
+ */
+KTEST_FUNC(hpts_collision_detection)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Mark HPTS as active */
+ HPTS_LOCK(hpts);
+ hpts->p_hpts_active = 1;
+ HPTS_UNLOCK(hpts);
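+
+ /*
+  * A second entry into tcp_hptsi() (here with from_callout = false)
+  * should notice that p_hpts_active is set and bail out early.
+  */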
+
+ /* Attempt to run HPTS again - should detect collision */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 indicating no work done due to collision */
+ KTEST_EQUAL(slots_ran, 0);
+
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates generation count handling for race condition detection between
+ * HPTS processing and connection insertion/removal operations.
+ */
+KTEST_FUNC(generation_count_validation)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp1, *tp2;
+ uint32_t initial_gencnt, slot_to_test = 10;
+ uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT;
+ uint32_t tp2_original_gencnt;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Record initial generation count for the test slot */
+ initial_gencnt = hpts->p_hptss[slot_to_test].gencnt;
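+ /*
+  * Each wheel slot carries a generation count that the pacer bumps
+  * when it takes over that slot's queue; insertions capture it in
+  * t_hpts_gencnt so that stale slot references can be detected later.
+  */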
+
+ /* Create and insert first connection */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ tp1->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, timeout_usecs, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify connection stored the generation count */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test);
+ KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt);
+
+ /* Create second connection but don't insert yet */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ /* Force generation count increment by processing the slot */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify processing occurred */
+ KTEST_VERIFY(slots_ran > 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+
+ /* Verify generation count was incremented */
+ KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1);
+
+ /* Verify first connection was processed and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+
+ /* Insert second connection and record its generation count */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, timeout_usecs, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify connection was inserted successfully */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Record the generation count that tp2 received */
+ tp2_original_gencnt = tp2->t_hpts_gencnt;
+
+ /*
+  * Test generation count mismatch detection during processing:
+  * manually set a stale generation count to simulate a race.
+  */
+ tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */
+
+ /* Process the slot to trigger generation count validation */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Connection should be processed despite generation count mismatch */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */
+
+ /* The key test: HPTS should handle mismatched generation counts gracefully */
+ KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+static const struct ktest_test_info tests[] = {
+ KTEST_INFO(module_load),
+ KTEST_INFO(hptsi_create_destroy),
+ KTEST_INFO(hptsi_start_stop),
+ KTEST_INFO(hptsi_independence),
+ KTEST_INFO(function_injection),
+ KTEST_INFO(tcpcb_initialization),
+ KTEST_INFO(tcpcb_insertion),
+ KTEST_INFO(timer_functionality),
+ KTEST_INFO(scalability_tcpcbs),
+ KTEST_INFO(wheel_wrap_recovery),
+ KTEST_INFO(tcpcb_moving_state),
+ KTEST_INFO(deferred_requests),
+ KTEST_INFO(cpu_assignment),
+ KTEST_INFO(slot_boundary_conditions),
+ KTEST_INFO(dynamic_sleep_adjustment),
+ KTEST_INFO(concurrent_operations),
+ KTEST_INFO(queued_segments_processing),
+ KTEST_INFO(direct_wake_mechanism),
+ KTEST_INFO(hpts_collision_detection),
+ KTEST_INFO(generation_count_validation),
+};
+
+KTEST_MODULE_DECLARE(ktest_tcphpts, tests);
+KTEST_MODULE_DEPEND(ktest_tcphpts, tcphpts);
diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c
index 43587285fe26..ac1a27a4290a 100644
--- a/sys/netinet/tcp_lro_hpts.c
+++ b/sys/netinet/tcp_lro_hpts.c
@@ -29,6 +29,8 @@
#include "opt_inet6.h"
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
@@ -62,6 +64,7 @@
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#ifdef TCP_BLACKBOX
#include <netinet/tcp_log_buf.h>
#endif
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index f2d7867df9b4..66983edcdd73 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -480,7 +480,7 @@ bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
-bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot,
+bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay,
uint8_t which);
static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
@@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
static void
-bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay,
uint32_t del_by, uint32_t cts, uint32_t sloton,
uint32_t prev_delay);
static void
@@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr)
}
static void
-bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
+bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len)
{
struct inpcb *inp = tptoinpcb(tp);
struct hpts_diag diag;
@@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_timer_exp = 0;
prev_delay = bbr->r_ctl.rc_last_delay_val;
if (bbr->r_ctl.rc_last_delay_val &&
- (slot == 0)) {
+ (pacing_delay == 0)) {
/*
* If a previous pacer delay was in place we
* are not coming from the output side (where
* we calculate a delay, more likely a timer).
*/
- slot = bbr->r_ctl.rc_last_delay_val;
+ pacing_delay = bbr->r_ctl.rc_last_delay_val;
if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
/* Compensate for time passed */
delay_calc = cts - bbr->rc_pacer_started;
- if (delay_calc <= slot)
- slot -= delay_calc;
+ if (delay_calc <= pacing_delay)
+ pacing_delay -= delay_calc;
}
}
/* Do we have early to make up for by pushing out the pacing time? */
if (bbr->r_agg_early_set) {
- bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
- slot += bbr->r_ctl.rc_agg_early;
+ bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2);
+ pacing_delay += bbr->r_ctl.rc_agg_early;
bbr->r_ctl.rc_agg_early = 0;
bbr->r_agg_early_set = 0;
}
/* Are we running a total debt that needs to be compensated for? */
if (bbr->r_ctl.rc_hptsi_agg_delay) {
- if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
+ if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) {
/* We nuke the delay */
- slot -= bbr->r_ctl.rc_hptsi_agg_delay;
+ pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay;
bbr->r_ctl.rc_hptsi_agg_delay = 0;
} else {
/* We nuke some of the delay, put in a minimal 100usecs */
- bbr->r_ctl.rc_hptsi_agg_delay -= slot;
- bbr->r_ctl.rc_last_delay_val = slot = 100;
+ bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay;
+ bbr->r_ctl.rc_last_delay_val = pacing_delay = 100;
}
}
- bbr->r_ctl.rc_last_delay_val = slot;
+ bbr->r_ctl.rc_last_delay_val = pacing_delay;
hpts_timeout = bbr_timer_start(tp, bbr, cts);
if (tp->t_flags & TF_DELACK) {
if (bbr->rc_in_persist == 0) {
@@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
hpts_timeout = delayed_ack;
}
- if (slot) {
+ if (pacing_delay) {
/* Mark that we have a pacing timer up */
BBR_STAT_INC(bbr_paced_segments);
bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
@@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* wheel, we resort to a keep-alive timer if its configured.
*/
if ((hpts_timeout == 0) &&
- (slot == 0)) {
+ (pacing_delay == 0)) {
if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) {
/*
@@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
if (left < hpts_timeout)
hpts_timeout = left;
}
- if (bbr->r_ctl.rc_incr_tmrs && slot &&
+ if (bbr->r_ctl.rc_incr_tmrs && pacing_delay &&
(bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
* If configured to do so, and the timer is either
@@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* this extra delay but this is easier and being more
* conservative is probably better.
*/
- hpts_timeout += slot;
+ hpts_timeout += pacing_delay;
}
if (hpts_timeout) {
/*
@@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
} else
bbr->r_ctl.rc_timer_exp = 0;
- if ((slot) &&
+ if ((pacing_delay) &&
(bbr->rc_use_google ||
bbr->output_error_seen ||
- (slot <= hpts_timeout)) ) {
+ (pacing_delay <= hpts_timeout)) ) {
/*
* Tell LRO that it can queue packets while
* we pace.
@@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
bbr->rc_pacer_started = cts;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, pacing_delay, &diag);
bbr->rc_timer_first = 0;
bbr->bbr_timer_src = frm;
- bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
+ bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1);
bbr_log_hpts_diag(bbr, cts, &diag);
} else if (hpts_timeout) {
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
/*
- * We add the flag here as well if the slot is set,
+ * We add the flag here as well if the pacing delay is set,
* since hpts will call in to clear the queue first before
* calling the output routine (which does our timers).
* We don't want to set the flag if its just a timer
@@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* on a keep-alive timer and a request comes in for
* more data.
*/
- if (slot)
+ if (pacing_delay)
bbr->rc_pacer_started = cts;
if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
(bbr->rc_cwnd_limited == 0)) {
@@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
TF2_DONT_SACK_QUEUE);
}
bbr->bbr_timer_src = frm;
- bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
+ bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0);
bbr_log_hpts_diag(bbr, cts, &diag);
bbr->rc_timer_first = 1;
}
bbr->rc_tmr_stopped = 0;
- bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
+ bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay);
}
static void
@@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock
}
/*
* Ok the timer originally started is not what we want now. We will
- * force the hpts to be stopped if any, and restart with the slot
- * set to what was in the saved slot.
+ * force the hpts to be stopped if any, and restart with the pacing
+ * delay set to what was in the saved delay.
*/
wrong_timer:
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
@@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
log.u_bbr.flex4 = diag->inp_hptsslot;
- log.u_bbr.flex5 = diag->slot_remaining;
+ log.u_bbr.flex5 = diag->time_remaining;
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
@@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.delRate = diag->maxslots;
- log.u_bbr.cur_del_rate = diag->p_curtick;
- log.u_bbr.cur_del_rate <<= 32;
- log.u_bbr.cur_del_rate |= diag->p_lasttick;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
@@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
}
static void
-bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
{
if (tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
@@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u
log.u_bbr.flex1 = bbr->bbr_timer_src;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
- log.u_bbr.flex4 = slot;
+ log.u_bbr.flex4 = pacing_delay;
log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.pkts_out = bbr->rc_tp->t_flags2;
@@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
}
static void
-bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
{
if (tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = del_by;
log.u_bbr.flex3 = prev_delay;
log.u_bbr.flex4 = line;
@@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t
left = bbr->r_ctl.rc_timer_exp - cts;
ret = -3;
bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
- tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left));
+ tcp_hpts_insert(tp, left, NULL);
return (1);
}
bbr->rc_tmr_stopped = 0;
@@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
else
time_since_send = 0;
if (bbr->r_ctl.rc_last_delay_val > time_since_send) {
- /* Cut down our slot time */
+ /* Cut down our pacing delay */
bbr->r_ctl.rc_last_delay_val -= time_since_send;
} else {
bbr->r_ctl.rc_last_delay_val = 0;
@@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t
* sequence 1 for 10 bytes. In such an example the r_start would be
* 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
* This means that r_end is actually the first sequence for the next
- * slot (11).
+ * block (11).
*
*/
INP_WLOCK_ASSERT(tptoinpcb(tp));
@@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
struct bbr_sendmap *rsm = NULL;
int32_t tso, mtu;
struct tcpopt to;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
struct inpcb *inp;
struct sockbuf *sb;
bool hpts_calling;
@@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
delay_calc -= bbr->r_ctl.rc_last_delay_val;
else {
/*
- * We are early setup to adjust
- * our slot time.
+ * We are early setup to adjust our pacing delay.
*/
uint64_t merged_val;
@@ -12104,7 +12098,7 @@ again:
#endif
error = 0;
tso = 0;
- slot = 0;
+ pacing_delay = 0;
mtu = 0;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
sb_offset = tp->snd_max - tp->snd_una;
@@ -12126,7 +12120,7 @@ recheck_resend:
tot_len = tp->t_maxseg;
if (hpts_calling)
/* Retry in a ms */
- slot = 1001;
+ pacing_delay = 1001;
goto just_return_nolock;
}
TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
@@ -12699,9 +12693,9 @@ just_return:
SOCK_SENDBUF_UNLOCK(so);
just_return_nolock:
if (tot_len)
- slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
if (bbr->rc_no_pacing)
- slot = 0;
+ pacing_delay = 0;
if (tot_len == 0) {
if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
tp->snd_wnd) {
@@ -12751,7 +12745,7 @@ just_return_nolock:
/* Dont update the time if we did not send */
bbr->r_ctl.rc_last_delay_val = 0;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
+ bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len);
bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
/* Make sure snd_nxt is drug up */
@@ -12787,7 +12781,7 @@ send:
flags &= ~TH_FIN;
if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
/* Lets not send this */
- slot = 0;
+ pacing_delay = 0;
goto just_return;
}
}
@@ -13053,7 +13047,7 @@ send:
/*
* We have outstanding data, don't send a fin by itself!.
*/
- slot = 0;
+ pacing_delay = 0;
goto just_return;
}
/*
@@ -13763,7 +13757,7 @@ nomore:
if (tp->snd_cwnd < maxseg)
tp->snd_cwnd = maxseg;
}
- slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
BBR_STAT_INC(bbr_saw_enobuf);
if (bbr->bbr_hdrw_pacing)
counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
@@ -13812,18 +13806,18 @@ nomore:
}
/*
* Nuke all other things that can interfere
- * with slot
+ * with pacing delay
*/
if ((tot_len + len) && (len >= tp->t_maxseg)) {
- slot = bbr_get_pacing_delay(bbr,
+ pacing_delay = bbr_get_pacing_delay(bbr,
bbr->r_ctl.rc_bbr_hptsi_gain,
(tot_len + len), cts, 0);
- if (slot < bbr_error_base_paceout)
- slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ if (pacing_delay < bbr_error_base_paceout)
+ pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
} else
- slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
+ bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay,
tot_len);
return (error);
}
@@ -13841,9 +13835,9 @@ nomore:
}
/* FALLTHROUGH */
default:
- slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
+ bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0);
return (error);
}
#ifdef STATS
@@ -13981,12 +13975,12 @@ skip_again:
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
/*
- * Calculate/Re-Calculate the hptsi slot in usecs based on
+ * Calculate/Re-Calculate the hptsi timeout in usecs based on
* what we have sent so far
*/
- slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
if (bbr->rc_no_pacing)
- slot = 0;
+ pacing_delay = 0;
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
enobufs:
@@ -13999,8 +13993,8 @@ enobufs:
(more_to_rxt ||
((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
/* Rack cheats and shotguns out all rxt's 1ms apart */
- if (slot > 1000)
- slot = 1000;
+ if (pacing_delay > 1000)
+ pacing_delay = 1000;
}
if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
/*
@@ -14014,7 +14008,7 @@ enobufs:
tcp_bbr_tso_size_check(bbr, cts);
}
}
- bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
+ bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len);
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
/* Make sure snd_nxt is drug up */
tp->snd_nxt = tp->snd_max;
@@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_USECS_PER_SLOT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, toval, &diag);
bbr_log_hpts_diag(bbr, cts, &diag);
}
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 11ef5ba706c5..c7962b57a69e 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -250,11 +250,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co
static int32_t rack_persist_min = 250000; /* 250usec */
static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */
-static uint32_t rack_max_reduce = 10; /* Percent we can reduce slot by */
+static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
-static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */
+static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */
static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
@@ -278,7 +278,7 @@ static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
-static int32_t rack_slot_reduction = 4;
+static int32_t rack_pacing_delay_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
@@ -478,7 +478,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack,
uint16_t flex7, uint8_t mod);
static void
-rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
@@ -1107,7 +1107,7 @@ rack_init_sysctls(void)
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "burst_reduces", CTLFLAG_RW,
- &rack_slot_reduction, 4,
+ &rack_pacing_delay_reduction, 4,
"When doing only burst mitigation what is the reduce divisor");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1399,7 +1399,7 @@ rack_init_sysctls(void)
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
&rack_max_reduce, 10,
- "Max percentage we will reduce slot by for pacing when we are behind");
+ "Max percentage we will reduce pacing delay by for pacing when we are behind");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
@@ -2700,7 +2700,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
}
static void
-rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
{
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
@@ -2710,7 +2710,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.flex1 = rack->rc_tp->t_srtt;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
- log.u_bbr.flex4 = slot;
+ log.u_bbr.flex4 = pacing_delay;
log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
log.u_bbr.flex7 = rack->rc_in_persist;
@@ -3034,14 +3034,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
}
static void
-rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line)
+rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line)
{
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
if (rack->rack_no_prr)
log.u_bbr.flex2 = 0;
else
@@ -3139,7 +3139,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg
}
static void
-rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
+rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay,
uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
{
if (tcp_bblogging_on(rack->rc_tp)) {
@@ -3148,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = reason;
if (rack->rack_no_prr)
@@ -6482,7 +6482,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
log.u_bbr.flex4 = diag->inp_hptsslot;
- log.u_bbr.flex5 = diag->slot_remaining;
+ log.u_bbr.flex5 = diag->time_remaining;
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
@@ -6497,9 +6497,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
log.u_bbr.delRate = diag->maxslots;
- log.u_bbr.cur_del_rate = diag->p_curtick;
- log.u_bbr.cur_del_rate <<= 32;
- log.u_bbr.cur_del_rate |= diag->p_lasttick;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -6532,14 +6529,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin
static void
rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
- int32_t slot, uint32_t tot_len_this_send, int sup_rack)
+ int32_t usecs, uint32_t tot_len_this_send, int sup_rack)
{
struct hpts_diag diag;
struct inpcb *inp = tptoinpcb(tp);
struct timeval tv;
uint32_t delayed_ack = 0;
uint32_t hpts_timeout;
- uint32_t entry_slot = slot;
+ uint32_t entry_usecs = usecs;
uint8_t stopped;
uint32_t left = 0;
uint32_t us_cts;
@@ -6560,7 +6557,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
rack->r_ctl.rc_hpts_flags = 0;
us_cts = tcp_get_usecs(&tv);
/* Now early/late accounting */
- rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0);
if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
/*
* We have a early carry over set,
@@ -6571,7 +6568,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* penalize the next timer for being awoke
* by an ack aka the rc_agg_early (non-paced mode).
*/
- slot += rack->r_ctl.rc_agg_early;
+ usecs += rack->r_ctl.rc_agg_early;
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
}
@@ -6583,29 +6580,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* really depends on what
* the current pacing time is.
*/
- if (rack->r_ctl.rc_agg_delayed >= slot) {
+ if (rack->r_ctl.rc_agg_delayed >= usecs) {
/*
* We can't compensate for it all.
* And we have to have some time
* on the clock. We always have a min
- * 10 slots (10 x 10 i.e. 100 usecs).
+ * 10 HPTS timer units (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_USECS_PER_SLOT) {
+ if (usecs <= HPTS_USECS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot);
- slot = HPTS_USECS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs);
+ usecs = HPTS_USECS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT);
- slot = HPTS_USECS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT);
+ usecs = HPTS_USECS_PER_SLOT;
}
} else {
- slot -= rack->r_ctl.rc_agg_delayed;
+ usecs -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_USECS_PER_SLOT) {
- rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot;
- slot = HPTS_USECS_PER_SLOT;
+ if (usecs < HPTS_USECS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs;
+ usecs = HPTS_USECS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
@@ -6614,17 +6611,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
/* r_use_hpts_min is on and so is DGP */
uint32_t max_red;
- max_red = (slot * rack->r_ctl.max_reduction) / 100;
+ max_red = (usecs * rack->r_ctl.max_reduction) / 100;
if (max_red >= rack->r_ctl.rc_agg_delayed) {
- slot -= rack->r_ctl.rc_agg_delayed;
+ usecs -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
} else {
- slot -= max_red;
+ usecs -= max_red;
rack->r_ctl.rc_agg_delayed -= max_red;
}
}
if ((rack->r_use_hpts_min == 1) &&
- (slot > 0) &&
+ (usecs > 0) &&
(rack->dgp_on == 1)) {
/*
* We are enforcing a min pacing timer
@@ -6633,8 +6630,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
uint32_t min;
min = get_hpts_min_sleep_time();
- if (min > slot) {
- slot = min;
+ if (min > usecs) {
+ usecs = min;
}
}
hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
@@ -6652,7 +6649,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* wheel, we resort to a keep-alive timer if its configured.
*/
if ((hpts_timeout == 0) &&
- (slot == 0)) {
+ (usecs == 0)) {
if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) {
/*
@@ -6709,10 +6706,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
hpts_timeout = 0x7ffffffe;
rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
}
- rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
if ((rack->gp_ready == 0) &&
(rack->use_fixed_rate == 0) &&
- (hpts_timeout < slot) &&
+ (hpts_timeout < usecs) &&
(rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
* We have no good estimate yet for the
@@ -6722,7 +6719,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* pace that long since we know the calculation
* so far is not accurate.
*/
- slot = hpts_timeout;
+ usecs = hpts_timeout;
}
/**
* Turn off all the flags for queuing by default. The
@@ -6754,11 +6751,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* so LRO can call into us.
*/
tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY);
- if (slot) {
+ if (usecs) {
rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
- rack->r_ctl.rc_last_output_to = us_cts + slot;
+ rack->r_ctl.rc_last_output_to = us_cts + usecs;
/*
- * A pacing timer (slot) is being set, in
+ * A pacing timer (of usecs microseconds) is being set, in
* such a case we cannot send (we are blocked by
* the timer). So lets tell LRO that it should not
* wake us unless there is a SACK. Note this only
@@ -6799,20 +6796,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
}
if ((rack->use_rack_rr) &&
(rack->r_rr_config < 2) &&
- ((hpts_timeout) && (hpts_timeout < slot))) {
+ ((hpts_timeout) && (hpts_timeout < usecs))) {
/*
* Arrange for the hpts to kick back in after the
* t-o if the t-o does not cause a send.
*/
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
} else {
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, usecs, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 1);
}
} else if (hpts_timeout) {
/*
@@ -6824,22 +6819,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* at the start of this block) are good enough.
*/
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
} else {
/* No timer starting */
#ifdef INVARIANTS
if (SEQ_GT(tp->snd_max, tp->snd_una)) {
- panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
- tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
+ panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?",
+ tp, rack, tot_len_this_send, cts, usecs, hpts_timeout);
}
#endif
}
rack->rc_tmr_stopped = 0;
- if (slot)
- rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
+ if (usecs)
+ rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__);
}
static void
@@ -8016,7 +8010,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8
rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
ret = -3;
left = rack->r_ctl.rc_timer_exp - cts;
- tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left));
+ tcp_hpts_insert(tp, left, NULL);
rack_log_to_processing(rack, cts, ret, left);
return (1);
}
@@ -14377,8 +14371,7 @@ rack_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_USECS_PER_SLOT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, toval, &diag);
rack_log_hpts_diag(rack, cts, &diag, &tv);
}
@@ -14973,8 +14966,7 @@ rack_init(struct tcpcb *tp, void **ptr)
if (tov) {
struct hpts_diag diag;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, tov, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
}
}
@@ -16367,7 +16359,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
struct rack_sendmap *rsm;
int32_t prev_state = 0;
int no_output = 0;
- int slot_remaining = 0;
+ int time_remaining = 0;
#ifdef TCP_ACCOUNTING
int ack_val_set = 0xf;
#endif
@@ -16416,7 +16408,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* could be, if a sack is present, we want to be awoken and
* so should process the packets.
*/
- slot_remaining = rack->r_ctl.rc_last_output_to - us_cts;
+ time_remaining = rack->r_ctl.rc_last_output_to - us_cts;
if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) {
no_output = 1;
} else {
@@ -16436,7 +16428,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
(*ts_ptr == TCP_LRO_TS_OPTION)))
no_output = 1;
}
- if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) {
+ if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) {
/*
* It is unrealistic to think we can pace in less than
* the minimum granularity of the pacer (def:250usec). So
@@ -16919,10 +16911,10 @@ do_output_now:
(tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
- * the remaining time (slot_remaining) to restart the timer.
+ * the remaining time (time_remaining) to restart the timer.
*/
- KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
- rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0);
+ KASSERT((time_remaining != 0), ("time remaining is zero for rack:%p tp:%p", rack, tp));
+ rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0);
rack_free_trim(rack);
}
/* Clear the flag, it may have been cleared by output but we may not have */
@@ -17102,7 +17094,7 @@ check_it:
}
static void
-rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
int line, struct rack_sendmap *rsm, uint8_t quality)
{
@@ -17125,7 +17117,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
}
}
memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
@@ -17284,25 +17276,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin
}
static int32_t
-pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
+pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
{
uint64_t lentim, fill_bw;
rack->r_via_fill_cw = 0;
if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
- return (slot);
+ return (pacing_delay);
if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
- return (slot);
+ return (pacing_delay);
if (rack->r_ctl.rc_last_us_rtt == 0)
- return (slot);
+ return (pacing_delay);
if (rack->rc_pace_fill_if_rttin_range &&
(rack->r_ctl.rc_last_us_rtt >=
(get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
/* The rtt is huge, N * smallest, let's not fill */
- return (slot);
+ return (pacing_delay);
}
if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
- return (slot);
+ return (pacing_delay);
/*
* first let's calculate the b/w based on the last us-rtt
* and the smallest send window.
@@ -17368,7 +17360,7 @@ at_lt_bw:
if (non_paced)
*rate_wanted = fill_bw;
if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
- return (slot);
+ return (pacing_delay);
rack->r_via_fill_cw = 1;
if (rack->r_rack_hw_rate_caps &&
(rack->r_ctl.crte != NULL)) {
@@ -17423,19 +17415,19 @@ at_lt_bw:
lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
lentim /= fill_bw;
*rate_wanted = fill_bw;
- if (non_paced || (lentim < slot)) {
- rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
+ if (non_paced || (lentim < pacing_delay)) {
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw,
0, lentim, 12, __LINE__, NULL, 0);
return ((int32_t)lentim);
} else
- return (slot);
+ return (pacing_delay);
}
static int32_t
rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
{
uint64_t srtt;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
int can_start_hw_pacing = 1;
int err;
int pace_one;
@@ -17483,25 +17475,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* cwnd. In that case we are just waiting for
* an ACK.
*/
- slot = len / tr_perms;
+ pacing_delay = len / tr_perms;
/* Now do we reduce the time so we don't run dry? */
- if (slot && rack_slot_reduction) {
- reduce = (slot / rack_slot_reduction);
- if (reduce < slot) {
- slot -= reduce;
+ if (pacing_delay && rack_pacing_delay_reduction) {
+ reduce = (pacing_delay / rack_pacing_delay_reduction);
+ if (reduce < pacing_delay) {
+ pacing_delay -= reduce;
} else
- slot = 0;
+ pacing_delay = 0;
} else
reduce = 0;
- slot *= HPTS_USEC_IN_MSEC;
+ pacing_delay *= HPTS_USEC_IN_MSEC;
if (rack->rc_pace_to_cwnd) {
uint64_t rate_wanted = 0;
- slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
+ pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1);
rack->rc_ack_can_sendout_data = 1;
- rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
} else
- rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
/*******************************************************/
/* RRS: We insert non-paced call to stats here for len */
/*******************************************************/
@@ -17575,7 +17567,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
segs *= oh;
lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
res = lentim / rate_wanted;
- slot = (uint32_t)res;
+ pacing_delay = (uint32_t)res;
if (rack_hw_rate_min &&
(rate_wanted < rack_hw_rate_min)) {
can_start_hw_pacing = 0;
@@ -17635,7 +17627,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* We want to pace at our rate *or* faster to
* fill the cwnd to the max if its not full.
*/
- slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
+ pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0);
/* Re-check to make sure we are not exceeding our max b/w */
if ((rack->r_ctl.crte != NULL) &&
(tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
@@ -17786,15 +17778,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
srtt = rack->rc_tp->t_srtt;
else
srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */
- if (srtt < (uint64_t)slot) {
- rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
- slot = srtt;
+ if (srtt < (uint64_t)pacing_delay) {
+ rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
+ pacing_delay = srtt;
}
}
/*******************************************************************/
/* RRS: We insert paced call to stats here for len and rate_wanted */
/*******************************************************************/
- rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
}
if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
/*
@@ -17811,9 +17803,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
hw_boost_delay = rack_enobuf_hw_max;
else if (hw_boost_delay < rack_enobuf_hw_min)
hw_boost_delay = rack_enobuf_hw_min;
- slot += hw_boost_delay;
+ pacing_delay += hw_boost_delay;
}
- return (slot);
+ return (pacing_delay);
}
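The length-over-rate computation above yields the pacing delay directly in microseconds. A minimal worked example of that calculation, using hypothetical numbers not taken from the source:

#include <stdint.h>

/* Hypothetical: 10 x 1448-byte segments at 1250000 bytes/sec (10 Mbit/s). */
static uint32_t
example_pacing_delay(uint32_t len, uint64_t rate_wanted)
{
	uint64_t lentim;

	lentim = (uint64_t)len * 1000000;	/* len * HPTS_USEC_IN_SEC */
	/* 14480 bytes / 1250000 B/s -> 11584 usec, ~11.6 ms between sends */
	return ((uint32_t)(lentim / rate_wanted));
}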
static void
@@ -18482,7 +18474,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
struct tcpopt to;
u_char opt[TCP_MAXOLEN];
uint32_t hdrlen, optlen;
- int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0;
+ int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0;
uint16_t flags;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
@@ -18688,9 +18680,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
if (rack->r_ctl.crte != NULL) {
/* See if we can send via the hw queue */
- slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
+ pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
/* If there is nothing in queue (no pacing time) we can send via the hw queue */
- if (slot == 0)
+ if (pacing_delay == 0)
ip_sendflag = 0;
}
tcp_set_flags(th, flags);
@@ -18955,20 +18947,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
rack_log_queue_level(tp, rack, len, tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
- slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
if (rack->rc_enobuf < 0x7f)
rack->rc_enobuf++;
- if (slot < (10 * HPTS_USEC_IN_MSEC))
- slot = 10 * HPTS_USEC_IN_MSEC;
+ if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
if (rack->r_ctl.crte != NULL) {
counter_u64_add(rack_saw_enobuf_hw, 1);
tcp_rl_log_enobuf(rack->r_ctl.crte);
}
counter_u64_add(rack_saw_enobuf, 1);
} else {
- slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
}
- rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -19071,7 +19063,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
#ifdef TCP_ACCOUNTING
int cnt_thru = 1;
#endif
- int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
+ int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
uint16_t flags;
uint32_t s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
@@ -19519,8 +19511,8 @@ again:
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
counter_u64_add(rack_fto_send, 1);
- slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
- rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0);
+ pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -19707,7 +19699,7 @@ rack_output(struct tcpcb *tp)
struct rack_sendmap *rsm = NULL;
int32_t tso, mtu;
struct tcpopt to;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
int32_t sup_rack = 0;
uint32_t cts, ms_cts, delayed, early;
uint32_t add_flag = RACK_SENT_SP;
@@ -20070,7 +20062,7 @@ again:
if (rsm == NULL) {
if (hpts_calling)
/* Retry in a ms */
- slot = (1 * HPTS_USEC_IN_MSEC);
+ pacing_delay = (1 * HPTS_USEC_IN_MSEC);
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
@@ -20877,7 +20869,7 @@ just_return_nolock:
}
if (tot_len_this_send > 0) {
rack->r_ctl.fsb.recwin = recwin;
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
if ((error == 0) &&
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
@@ -21060,8 +21052,8 @@ just_return_nolock:
/* Yes, let's make sure to move to persist before timer-start */
rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
}
- rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
- rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack);
+ rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use);
}
#ifdef NETFLIX_SHARED_CWND
if ((sbavail(sb) == 0) &&
@@ -21100,8 +21092,8 @@ send:
* we come around to again, the flag will be clear.
*/
check_done = 1;
- slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
- if (slot) {
+ pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
+ if (pacing_delay) {
rack->r_ctl.rc_agg_delayed = 0;
rack->r_ctl.rc_agg_early = 0;
rack->r_early = 0;
@@ -22358,11 +22350,11 @@ nomore:
rack_log_queue_level(tp, rack, len, &tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
- slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
if (rack->rc_enobuf < 0x7f)
rack->rc_enobuf++;
- if (slot < (10 * HPTS_USEC_IN_MSEC))
- slot = 10 * HPTS_USEC_IN_MSEC;
+ if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
if (rack->r_ctl.crte != NULL) {
counter_u64_add(rack_saw_enobuf_hw, 1);
tcp_rl_log_enobuf(rack->r_ctl.crte);
@@ -22389,8 +22381,8 @@ nomore:
goto again;
}
}
- slot = 10 * HPTS_USEC_IN_MSEC;
- rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -22412,8 +22404,8 @@ nomore:
}
/* FALLTHROUGH */
default:
- slot = 10 * HPTS_USEC_IN_MSEC;
- rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -22456,18 +22448,18 @@ enobufs:
/*
* We don't send again after sending a RST.
*/
- slot = 0;
+ pacing_delay = 0;
sendalot = 0;
if (error == 0)
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
- } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
+ } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) {
/*
* Get our pacing rate. If an error
* occurred in sending (ENOBUF) we would
* hit the else if with pacing_delay preset. Other
* errors return.
*/
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
}
/* We have sent clear the flag */
rack->r_ent_rec_ns = 0;
@@ -22499,7 +22491,7 @@ enobufs:
*/
tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
}
- if (slot) {
+ if (pacing_delay) {
/* set the rack tcb into slot N */
if ((error == 0) &&
rack_use_rfo &&
@@ -22564,7 +22556,7 @@ skip_all_send:
/* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
- rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount() - ts_val;
if (tot_len_this_send) {
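Across the rack.c hunks above, the hpts insertion API loses its slot conversion: tcp_hpts_insert() now takes the delay in microseconds plus an optional diagnostics pointer, replacing tcp_hpts_insert_diag(). A minimal sketch of the calling convention exactly as it appears in the hunks (identifiers from the surrounding code):

	struct hpts_diag diag;

	/* Before: delay converted to wheel slots, call-site line passed. */
	(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(usecs), __LINE__, &diag);

	/* After: delay given in usecs; pass NULL when no diagnostics are needed. */
	tcp_hpts_insert(tp, usecs, &diag);
	tcp_hpts_insert(tp, usecs, NULL);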
diff --git a/sys/netinet6/in6_fib_algo.c b/sys/netinet6/in6_fib_algo.c
index 10ffe7ab0265..ef5cfc6d5ef6 100644
--- a/sys/netinet6/in6_fib_algo.c
+++ b/sys/netinet6/in6_fib_algo.c
@@ -351,7 +351,7 @@ struct fib_lookup_module flm_radix6 = {
};
static void
-fib6_algo_init(void)
+fib6_algo_init(void *dummy __unused)
{
fib_module_register(&flm_radix6_lockless);
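This hunk, and the matching ones below for xform_ipcomp.c, ip_fw2.c, ip_fw_nat.c, pf_ioctl.c, nfs_diskless.c, mmu_oea64.c, pmcr.c, audit.c, mac_framework.c, vm_meter.c, vm_pageout.c, tsc.c and xen_apic.c, give SYSINIT handlers the argument-taking signature the sysinit function pointer type expects, which is also what lets sysent.h drop its (sysinit_cfunc_t) cast further down. A sketch of the resulting pattern, with hypothetical names:

	/* The handler accepts the SYSINIT user-data argument even when unused. */
	static void
	example_init(void *dummy __unused)
	{
		/* one-time initialization */
	}
	SYSINIT(example, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, example_init, NULL);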
diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c
index 737d4a50098a..05a01b75e0bb 100644
--- a/sys/netipsec/xform_ipcomp.c
+++ b/sys/netipsec/xform_ipcomp.c
@@ -750,7 +750,7 @@ static struct xformsw ipcomp_xformsw = {
};
static void
-ipcomp_attach(void)
+ipcomp_attach(void *dummy __unused)
{
#ifdef INET
@@ -763,7 +763,7 @@ ipcomp_attach(void)
}
static void
-ipcomp_detach(void)
+ipcomp_detach(void *dummy __unused)
{
#ifdef INET
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index b59d8d08bf80..d15d7760d7f1 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -3578,11 +3578,9 @@ sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
/*
* Stuff that must be initialised only on boot or module load
*/
-static int
-ipfw_init(void)
+static void
+ipfw_init(void *dummy __unused)
{
- int error = 0;
-
/*
* Only print out this stuff the first time around,
* when called from the sysinit code.
@@ -3627,14 +3625,13 @@ ipfw_init(void)
ipfw_init_sopt_handler();
ipfw_init_obj_rewriter();
ipfw_iface_init();
- return (error);
}
/*
* Called for the removal of the last instance only on module unload.
*/
static void
-ipfw_destroy(void)
+ipfw_destroy(void *dummy __unused)
{
ipfw_iface_destroy();
diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c
index 1e2ff1bca290..1cee7873de31 100644
--- a/sys/netpfil/ipfw/ip_fw_nat.c
+++ b/sys/netpfil/ipfw/ip_fw_nat.c
@@ -1166,7 +1166,7 @@ vnet_ipfw_nat_uninit(const void *arg __unused)
}
static void
-ipfw_nat_init(void)
+ipfw_nat_init(void *dummy __unused)
{
/* init ipfw hooks */
@@ -1183,7 +1183,7 @@ ipfw_nat_init(void)
}
static void
-ipfw_nat_destroy(void)
+ipfw_nat_destroy(void *dummy __unused)
{
EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index d58af6e5ec4d..a4557f139ae5 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -259,7 +259,7 @@ static void dehook_pf_eth(void);
static void dehook_pf(void);
static int shutdown_pf(void);
static int pf_load(void);
-static void pf_unload(void);
+static void pf_unload(void *);
static struct cdevsw pf_cdevsw = {
.d_ioctl = pfioctl,
@@ -7082,7 +7082,7 @@ pf_unload_vnet(void)
}
static void
-pf_unload(void)
+pf_unload(void *dummy __unused)
{
sx_xlock(&pf_end_lock);
diff --git a/sys/nfs/nfs_diskless.c b/sys/nfs/nfs_diskless.c
index 42cfee63d184..0f0cf80feeec 100644
--- a/sys/nfs/nfs_diskless.c
+++ b/sys/nfs/nfs_diskless.c
@@ -428,7 +428,7 @@ decode_nfshandle(char *ev, u_char *fh, int maxfh)
#if !defined(BOOTP_NFSROOT)
static void
-nfs_rootconf(void)
+nfs_rootconf(void *dummy __unused)
{
nfs_setup_diskless();
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 796b1719b8ba..01bf4c7e90a8 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -297,7 +297,7 @@ static u_int moea64_clear_bit(vm_page_t, uint64_t);
static void moea64_kremove(vm_offset_t);
static void moea64_syncicache(pmap_t pmap, vm_offset_t va,
vm_paddr_t pa, vm_size_t sz);
-static void moea64_pmap_init_qpages(void);
+static void moea64_pmap_init_qpages(void *);
static void moea64_remove_locked(pmap_t, vm_offset_t,
vm_offset_t, struct pvo_dlist *);
@@ -1284,7 +1284,7 @@ moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
}
static void
-moea64_pmap_init_qpages(void)
+moea64_pmap_init_qpages(void *dummy __unused)
{
struct pcpu *pc;
int i;
diff --git a/sys/powerpc/cpufreq/pmcr.c b/sys/powerpc/cpufreq/pmcr.c
index dd489b607606..6ae0777a8ac7 100644
--- a/sys/powerpc/cpufreq/pmcr.c
+++ b/sys/powerpc/cpufreq/pmcr.c
@@ -40,7 +40,8 @@ static int pstate_ids[256];
static int pstate_freqs[256];
static int npstates;
-static void parse_pstates(void)
+static void
+parse_pstates(void *dummy __unused)
{
phandle_t node;
diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c
index 7ec50d990d4e..876776e5f62e 100644
--- a/sys/security/audit/audit.c
+++ b/sys/security/audit/audit.c
@@ -329,7 +329,7 @@ audit_record_dtor(void *mem, int size, void *arg)
* call into the BSM assembly code to initialize it.
*/
static void
-audit_init(void)
+audit_init(void *dummy __unused)
{
audit_trail_enabled = 0;
diff --git a/sys/security/mac/mac_framework.c b/sys/security/mac/mac_framework.c
index d742b5dcbc3a..b0776160cc74 100644
--- a/sys/security/mac/mac_framework.c
+++ b/sys/security/mac/mac_framework.c
@@ -320,7 +320,7 @@ mac_policy_xlock_assert(void)
* Initialize the MAC subsystem, including appropriate SMP locks.
*/
static void
-mac_init(void)
+mac_init(void *dummy __unused)
{
LIST_INIT(&mac_static_policy_list);
@@ -340,7 +340,7 @@ mac_init(void)
* kernel, or loaded before the kernel startup.
*/
static void
-mac_late_init(void)
+mac_late_init(void *dummy __unused)
{
mac_late = 1;
diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h
index 1714fa5a7416..6de391dcc03e 100644
--- a/sys/sys/sysent.h
+++ b/sys/sys/sysent.h
@@ -343,8 +343,7 @@ void exec_free_abi_mappings(struct proc *p);
void exec_onexec_old(struct thread *td);
#define INIT_SYSENTVEC(name, sv) \
- SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \
- (sysinit_cfunc_t)exec_sysvec_init, sv);
+ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, exec_sysvec_init, sv)
#endif /* _KERNEL */
diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h
index c767aa31e8e5..75d7a75e2fff 100644
--- a/sys/tests/ktest.h
+++ b/sys/tests/ktest.h
@@ -57,6 +57,8 @@ struct ktest_test_info {
ktest_parse_t parse;
};
+#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx)
+
struct ktest_module_info {
const char *name;
const struct ktest_test_info *tests;
@@ -64,6 +66,8 @@ struct ktest_module_info {
void *module_ptr;
};
+#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL }
+
int ktest_default_modevent(module_t mod, int type, void *arg);
bool ktest_start_msg(struct ktest_test_context *ctx);
@@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx);
#define KTEST_LOG(_ctx, _fmt, ...) \
KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__)
+#define KTEST_ERR(_ctx, _fmt, ...) \
+ KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__)
+
#define KTEST_MAX_BUF 512
#define KTEST_MODULE_DECLARE(_n, _t) \
@@ -104,6 +111,9 @@ MODULE_VERSION(ktest_##_n, 1); \
MODULE_DEPEND(ktest_##_n, ktestmod, 1, 1, 1); \
MODULE_DEPEND(ktest_##_n, netlink, 1, 1, 1); \
+#define KTEST_MODULE_DEPEND(_n, _d) \
+MODULE_DEPEND(ktest_##_n, _d, 1, 1, 1); \
+
#endif /* _KERNEL */
/* genetlink definitions */
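Taken together, the new KTEST_FUNC, KTEST_INFO and KTEST_MODULE_DEPEND helpers suggest a declaration pattern along the following lines (a sketch with hypothetical test and module names, wired up through the existing KTEST_MODULE_DECLARE):

	/* Define a test body; expands to static int __ktest_mytest(ctx). */
	KTEST_FUNC(mytest)
	{
		KTEST_LOG(ctx, "running");
		return (0);
	}

	/* Register the test and declare the module. */
	static const struct ktest_test_info tests[] = {
		KTEST_INFO(mytest),
	};
	KTEST_MODULE_DECLARE(mymod, tests);
	KTEST_MODULE_DEPEND(mymod, tcphpts);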
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index fef28bb883e4..fee50f49c844 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -96,7 +96,7 @@ struct vmmeter __read_mostly vm_cnt = {
u_long __exclusive_cache_line vm_user_wire_count;
static void
-vmcounter_startup(void)
+vmcounter_startup(void *dummy __unused)
{
counter_u64_t *cnt = (counter_u64_t *)&vm_cnt;
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 3f1be78342c9..418a9cff8abf 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -120,7 +120,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
-static void vm_pageout_init(void);
+static void vm_pageout_init(void *);
static int vm_pageout_clean(vm_page_t m, int *numpagedout);
static int vm_pageout_cluster(vm_page_t m);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
@@ -2333,7 +2333,7 @@ vm_pageout_init_domain(int domain)
}
static void
-vm_pageout_init(void)
+vm_pageout_init(void *dummy __unused)
{
u_long freecount;
int i;
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index a1a5d8140b14..3b873d9dae73 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -650,7 +650,7 @@ retry:
#endif /* SMP */
static void
-init_TSC_tc(void)
+init_TSC_tc(void *dummy __unused)
{
uint64_t max_freq;
int shift;
diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c
index 994dc3e0804c..43a253cc2860 100644
--- a/sys/x86/xen/xen_apic.c
+++ b/sys/x86/xen/xen_apic.c
@@ -330,7 +330,7 @@ xen_cpu_ipi_init(int cpu)
}
static void
-xen_setup_cpus(void)
+xen_setup_cpus(void *dummy __unused)
{
uint32_t regs[4];
int i;
diff --git a/tests/atf_python/ktest.py b/tests/atf_python/ktest.py
index a18f47d1dd06..a671aaa1fd4c 100644
--- a/tests/atf_python/ktest.py
+++ b/tests/atf_python/ktest.py
@@ -67,6 +67,10 @@ class KtestLoader(object):
def __init__(self, module_name: str, autoload: bool):
self.module_name = module_name
self.autoload = autoload
+ # Ensure the base ktest.ko module is loaded
+ result = libc.kldload("ktest")
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ logger.debug(f"Failed to load base ktest module (error {result})")
self.helper = NlHelper()
self.nlsock = Nlsock(NlConst.NETLINK_GENERIC, self.helper)
self.family_id = self._get_family_id()
@@ -76,7 +80,9 @@ class KtestLoader(object):
family_id = self.nlsock.get_genl_family_id(NETLINK_FAMILY)
except ValueError:
if self.autoload:
- libc.kldload(self.module_name)
+ result = libc.kldload(self.module_name)
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ raise RuntimeError(f"Failed to load kernel module '{self.module_name}' (error {result})")
family_id = self.nlsock.get_genl_family_id(NETLINK_FAMILY)
else:
raise
@@ -103,7 +109,9 @@ class KtestLoader(object):
def load_ktests(self):
ret = self._load_ktests()
if not ret and self.autoload:
- libc.kldload(self.module_name)
+ result = libc.kldload(self.module_name)
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ raise RuntimeError(f"Failed to load kernel module '{self.module_name}' (error {result})")
ret = self._load_ktests()
return ret
diff --git a/tests/sys/netinet/Makefile b/tests/sys/netinet/Makefile
index b742342beecb..9739221676ce 100644
--- a/tests/sys/netinet/Makefile
+++ b/tests/sys/netinet/Makefile
@@ -30,6 +30,7 @@ ATF_TESTS_SH= arp \
ATF_TESTS_PYTEST+= carp.py
ATF_TESTS_PYTEST+= igmp.py
+ATF_TESTS_PYTEST+= tcp_hpts_test.py
LIBADD.so_reuseport_lb_test= pthread
LIBADD.udp_bindings= pthread
diff --git a/tests/sys/netinet/tcp_hpts_test.py b/tests/sys/netinet/tcp_hpts_test.py
new file mode 100644
index 000000000000..c56383fb310f
--- /dev/null
+++ b/tests/sys/netinet/tcp_hpts_test.py
@@ -0,0 +1,4 @@
+from atf_python.ktest import BaseKernelTest
+
+class TestTcpHpts(BaseKernelTest):
+ KTEST_MODULE_NAME = "ktest_tcphpts"
diff --git a/tests/sys/vm/mmap_test.c b/tests/sys/vm/mmap_test.c
index 6bc30f73ca95..27d02ae667fb 100644
--- a/tests/sys/vm/mmap_test.c
+++ b/tests/sys/vm/mmap_test.c
@@ -36,21 +36,6 @@
#include <stdio.h>
#include <stdlib.h>
-static const struct {
- void *addr;
- int ok[2]; /* Depending on security.bsd.map_at_zero {0, !=0}. */
-} map_at_zero_tests[] = {
- { (void *)0, { 0, 1 } }, /* Test sysctl. */
- { (void *)1, { 0, 0 } },
- { (void *)(PAGE_SIZE - 1), { 0, 0 } },
- { (void *)PAGE_SIZE, { 1, 1 } },
- { (void *)-1, { 0, 0 } },
- { (void *)(-PAGE_SIZE), { 0, 0 } },
- { (void *)(-1 - PAGE_SIZE), { 0, 0 } },
- { (void *)(-1 - PAGE_SIZE - 1), { 0, 0 } },
- { (void *)(0x1000 * PAGE_SIZE), { 1, 1 } },
-};
-
#define MAP_AT_ZERO "security.bsd.map_at_zero"
#ifdef __LP64__
@@ -68,6 +53,22 @@ ATF_TC_BODY(mmap__map_at_zero, tc)
int map_at_zero;
bool allow_wx;
int prot_flags;
+ size_t pgsz = getpagesize();
+
+ const struct {
+ void *addr;
+ int ok[2]; /* Depending on security.bsd.map_at_zero {0, !=0}. */
+ } map_at_zero_tests[] = {
+ { (void *)0, { 0, 1 } }, /* Test sysctl. */
+ { (void *)1, { 0, 0 } },
+ { (void *)(pgsz - 1), { 0, 0 } },
+ { (void *)pgsz, { 1, 1 } },
+ { (void *)-1, { 0, 0 } },
+ { (void *)(-pgsz), { 0, 0 } },
+ { (void *)(-1 - pgsz), { 0, 0 } },
+ { (void *)(-1 - pgsz - 1), { 0, 0 } },
+ { (void *)(0x1000 * pgsz), { 1, 1 } },
+ };
len = sizeof(map_at_zero);
if (sysctlbyname(MAP_AT_ZERO, &map_at_zero, &len, NULL, 0) == -1) {
diff --git a/usr.bin/login/login.conf b/usr.bin/login/login.conf
index 1069da17b4db..c65a83caa565 100644
--- a/usr.bin/login/login.conf
+++ b/usr.bin/login/login.conf
@@ -46,7 +46,6 @@ default:\
:umtxp=unlimited:\
:pipebuf=unlimited:\
:priority=0:\
- :ignoretime@:\
:umask=022:\
:charset=UTF-8:\
:lang=C.UTF-8:
@@ -149,7 +148,6 @@ russian|Russian Users Accounts:\
# :requirehome:\
# :passwordtime=90d:\
# :umask=002:\
-# :ignoretime@:\
# :tc=default:
#
#
@@ -174,7 +172,6 @@ russian|Russian Users Accounts:\
##
#staff:\
# :ignorenologin:\
-# :ignoretime:\
# :requirehome@:\
# :accounted@:\
# :path=~/bin /bin /sbin /usr/bin /usr/sbin /usr/local/bin /usr/local/sbin:\
@@ -265,7 +262,6 @@ russian|Russian Users Accounts:\
## - no time accounting, restricted to access via dialin lines
##
#site:\
-# :ignoretime:\
# :passwordtime@:\
# :refreshtime@:\
# :refreshperiod@:\
diff --git a/usr.sbin/blacklistctl/Makefile b/usr.sbin/blacklistctl/Makefile
index 8a01f52926a7..41c5f44b072b 100644
--- a/usr.sbin/blacklistctl/Makefile
+++ b/usr.sbin/blacklistctl/Makefile
@@ -6,8 +6,7 @@ PACKAGE= blocklist
PROG= blacklistctl
SRCS= blacklistctl.c conf.c state.c support.c old_internal.c \
sockaddr_snprintf.c pidfile.c strtoi.c popenve.c
-MAN= blocklistctl.8
-MLINKS= blocklistctl.8 blacklistctl.8
+MAN= blacklistctl.8
LDFLAGS+=-L${LIBBLACKLISTDIR}
LIBADD+= blocklist util
diff --git a/usr.sbin/blacklistd/Makefile b/usr.sbin/blacklistd/Makefile
index b4ba4ca2f9ad..490b12d46968 100644
--- a/usr.sbin/blacklistd/Makefile
+++ b/usr.sbin/blacklistd/Makefile
@@ -7,9 +7,7 @@ CONFS= blacklistd.conf
PROG= blacklistd
SRCS= blacklistd.c conf.c run.c state.c support.c old_internal.c \
sockaddr_snprintf.c pidfile.c strtoi.c popenve.c vsyslog_r.c
-MAN= blocklistd.8 blocklistd.conf.5
-MLINKS= blocklistd.8 blacklistd.8 \
- blocklistd.conf.5 blacklistd.conf.5
+MAN= blacklistd.8 blacklistd.conf.5
LDFLAGS+=-L${LIBBLACKLISTDIR}
LIBADD+= blocklist util
diff --git a/usr.sbin/fwget/pci/pci_network_mediatek b/usr.sbin/fwget/pci/pci_network_mediatek
index 653c87c410eb..e1e15dcfa2e5 100644
--- a/usr.sbin/fwget/pci/pci_network_mediatek
+++ b/usr.sbin/fwget/pci/pci_network_mediatek
@@ -38,24 +38,24 @@ pci_network_mediatek_mt76()
# { sys/contrib/dev/mediatek/mt76/zzz_fw_ports_fwget.sh }
### >>>
- 0x0608) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x0616) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x0717) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7611) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7615) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7663) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7906) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x790a) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7915) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7916) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7920) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7922) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7925) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7961) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7990) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x7991) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x7992) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x799a) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
+ 0x0608) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x0616) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x0717) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7611) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7615) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7663) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7906) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x790a) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7915) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7916) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7920) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7922) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7925) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7961) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7990) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x7991) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x7992) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x799a) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
### <<<
esac