aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.inc116
-rw-r--r--RELNOTES2
-rw-r--r--UPDATING8
-rw-r--r--lib/libc/stdlib/realpath.312
-rw-r--r--lib/libc/stdlib/realpath.c14
-rw-r--r--lib/libc/tests/gen/realpath2_test.c106
-rw-r--r--lib/libsys/socket.2302
-rw-r--r--sbin/fsck_ffs/setup.c47
-rw-r--r--sbin/geom/Makefile2
-rw-r--r--sbin/geom/core/geom.c232
-rw-r--r--sbin/ipfw/tables.c3
-rwxr-xr-xsbin/mdconfig/tests/mdconfig_test.sh15
-rw-r--r--sbin/ping/tests/Makefile3
-rw-r--r--share/man/man4/bridge.4210
-rw-r--r--sys/amd64/amd64/elf_machdep.c14
-rw-r--r--sys/amd64/linux/linux_sysvec.c12
-rw-r--r--sys/amd64/linux32/linux32_sysvec.c12
-rw-r--r--sys/arm/arm/pmap-v6.c2
-rw-r--r--sys/arm/arm/unwind.c4
-rw-r--r--sys/arm64/arm64/elf_machdep.c7
-rw-r--r--sys/arm64/coresight/coresight.c2
-rw-r--r--sys/arm64/linux/linux_sysvec.c10
-rw-r--r--sys/cam/scsi/scsi_all.c4
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris.c2
-rw-r--r--sys/compat/ia32/ia32_sysvec.c24
-rw-r--r--sys/compat/linux/linux_futex.c2
-rw-r--r--sys/conf/NOTES1
-rw-r--r--sys/conf/options1
-rw-r--r--sys/conf/std.debug1
-rw-r--r--sys/conf/std.nodebug1
-rw-r--r--sys/dev/fdt/fdt_slicer.c6
-rw-r--r--sys/dev/iommu/iommu_gas.c2
-rw-r--r--sys/dev/mii/mv88e151x.c8
-rw-r--r--sys/dev/nvme/nvme.c4
-rw-r--r--sys/dev/nvme/nvme.h4
-rw-r--r--sys/dev/nvme/nvme_sim.c4
-rw-r--r--sys/dev/xdma/xdma.c2
-rw-r--r--sys/dev/xen/bus/xen_intr.c6
-rw-r--r--sys/fs/fuse/fuse_vnops.c4
-rw-r--r--sys/fs/p9fs/p9_transport.c3
-rw-r--r--sys/fs/unionfs/union_subr.c2
-rw-r--r--sys/fs/unionfs/union_vnops.c2
-rw-r--r--sys/i386/i386/machdep.c2
-rw-r--r--sys/i386/i386/pmap.c2
-rw-r--r--sys/kern/imgact_elf.c28
-rw-r--r--sys/kern/kern_boottrace.c2
-rw-r--r--sys/kern/kern_devctl.c2
-rw-r--r--sys/kern/kern_event.c4
-rw-r--r--sys/kern/kern_exec.c4
-rw-r--r--sys/kern/kern_jailmeta.c8
-rw-r--r--sys/kern/kern_linker.c2
-rw-r--r--sys/kern/kern_malloc.c2
-rw-r--r--sys/kern/kern_racct.c4
-rw-r--r--sys/kern/kern_rangelock.c2
-rw-r--r--sys/kern/kern_rctl.c4
-rw-r--r--sys/kern/kern_sharedpage.c3
-rw-r--r--sys/kern/kern_sig.c4
-rw-r--r--sys/kern/kern_time.c4
-rw-r--r--sys/kern/subr_pcpu.c2
-rw-r--r--sys/kern/sys_socket.c2
-rw-r--r--sys/kern/uipc_usrreq.c20
-rw-r--r--sys/libkern/arc4random.c4
-rw-r--r--sys/libkern/x86/crc32_sse42.c4
-rw-r--r--sys/modules/ktest/Makefile3
-rw-r--r--sys/modules/ktest/ktest_tcphpts/Makefile13
-rw-r--r--sys/net/route.c2
-rw-r--r--sys/net/route/route_tables.c2
-rw-r--r--sys/net/rtsock.c2
-rw-r--r--sys/net80211/ieee80211_ht.c2
-rw-r--r--sys/net80211/ieee80211_hwmp.c2
-rw-r--r--sys/net80211/ieee80211_mesh.c2
-rw-r--r--sys/net80211/ieee80211_phy.c2
-rw-r--r--sys/net80211/ieee80211_proto.c2
-rw-r--r--sys/net80211/ieee80211_vht.c2
-rw-r--r--sys/netinet/cc/cc.c2
-rw-r--r--sys/netinet/in_fib_algo.c2
-rw-r--r--sys/netinet/tcp_hpts.c933
-rw-r--r--sys/netinet/tcp_hpts.h50
-rw-r--r--sys/netinet/tcp_hpts_internal.h184
-rw-r--r--sys/netinet/tcp_hpts_test.c1662
-rw-r--r--sys/netinet/tcp_lro_hpts.c3
-rw-r--r--sys/netinet/tcp_stacks/bbr.c131
-rw-r--r--sys/netinet/tcp_stacks/rack.c252
-rw-r--r--sys/netinet6/in6_fib_algo.c2
-rw-r--r--sys/netipsec/xform_ipcomp.c4
-rw-r--r--sys/netpfil/ipfw/ip_fw2.c9
-rw-r--r--sys/netpfil/ipfw/ip_fw_nat.c16
-rw-r--r--sys/netpfil/pf/pf_ioctl.c4
-rw-r--r--sys/nfs/nfs_diskless.c2
-rw-r--r--sys/powerpc/aim/mmu_oea64.c4
-rw-r--r--sys/powerpc/cpufreq/pmcr.c3
-rw-r--r--sys/rpc/auth.h4
-rw-r--r--sys/rpc/authunix_prot.c93
-rw-r--r--sys/rpc/svc_auth_unix.c94
-rw-r--r--sys/security/audit/audit.c2
-rw-r--r--sys/security/mac/mac_framework.c4
-rw-r--r--sys/sys/imgact_elf.h8
-rw-r--r--sys/sys/proc.h2
-rw-r--r--sys/sys/socket.h1
-rw-r--r--sys/sys/sysent.h3
-rw-r--r--sys/sys/tree.h57
-rw-r--r--sys/tests/ktest.h10
-rw-r--r--sys/ufs/ffs/ffs_inode.c4
-rw-r--r--sys/vm/vm_meter.c2
-rw-r--r--sys/vm/vm_pageout.c4
-rw-r--r--sys/x86/x86/tsc.c2
-rw-r--r--sys/x86/xen/xen_apic.c2
-rw-r--r--tests/atf_python/ktest.py12
-rw-r--r--tests/sys/fs/fusefs/bad_server.cc5
-rw-r--r--tests/sys/kern/unix_stream.c27
-rw-r--r--tests/sys/netinet/Makefile1
-rwxr-xr-xtests/sys/netinet/multicast.sh29
-rw-r--r--tests/sys/netinet/tcp_hpts_test.py4
-rw-r--r--tests/sys/vm/mmap_test.c31
-rw-r--r--usr.bin/login/login.conf4
-rw-r--r--usr.bin/sockstat/main.c8
-rw-r--r--usr.bin/sockstat/sockstat.15
-rw-r--r--usr.sbin/certctl/certctl.86
-rw-r--r--usr.sbin/fwget/pci/pci_network_mediatek36
119 files changed, 3687 insertions, 1297 deletions
diff --git a/Makefile.inc1 b/Makefile.inc1
index 74c4598dd092..a86dead09aa1 100644
--- a/Makefile.inc1
+++ b/Makefile.inc1
@@ -1964,6 +1964,7 @@ REPODIR?= ${OBJROOT}repo
PKG_FORMAT?= tzst
PKG_LEVEL?= -1
PKG_CLEVEL?= ${"${PKG_FORMAT:Mtar}" != "":?:-l ${PKG_LEVEL}}
+PKG_CTHREADS?= 0
PKG_REPO_SIGNING_KEY?= # empty
PKG_OUTPUT_DIR?= ${PKG_VERSION}
PKG_ABI_FILE?= ${WSTAGEDIR}/usr/bin/uname
@@ -2094,6 +2095,7 @@ create-packages-world: _pkgbootstrap _repodir .PHONY
.ORDER: create-packages-world create-packages-sets
.ORDER: create-packages-kernel create-packages-sets
+.ORDER: create-packages-source create-packages-sets
create-packages-sets: _pkgbootstrap _repodir .PHONY
${_+_}@cd ${.CURDIR}; \
${MAKE} -f Makefile.inc1 \
@@ -2143,7 +2145,7 @@ create-source-src-package: _pkgbootstrap .PHONY
${SSTAGEDIR}/src.ucl
${PKG_CMD} -o ABI=${PKG_ABI} \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${SSTAGEDIR}/src.ucl \
-p ${SSTAGEDIR}/src.plist \
-r ${SRCDIR} \
@@ -2169,7 +2171,7 @@ create-source-src-sys-package: _pkgbootstrap .PHONY
${SSTAGEDIR}/src-sys.ucl
${PKG_CMD} -o ABI=${PKG_ABI} \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${SSTAGEDIR}/src-sys.ucl \
-p ${SSTAGEDIR}/src-sys.plist \
-r ${SRCDIR} \
@@ -2209,7 +2211,7 @@ create-world-package-${pkgname}: .PHONY
' ${WSTAGEDIR}/${pkgname}.ucl
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${WSTAGEDIR}/${pkgname}.ucl \
-p ${WSTAGEDIR}/${pkgname}.plist \
-r ${WSTAGEDIR} \
@@ -2228,7 +2230,7 @@ create-sets-packages: .PHONY
@for manifest in ${WSTAGEDIR}/set-*.ucl; do \
echo "--> Processing manifest: $$manifest"; \
${PKG_CMD} -o ABI=${PKG_ABI} -o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M $$manifest \
-o "${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR}" \
|| exit 1; \
@@ -2258,7 +2260,7 @@ create-dtb-package: .PHONY
${KSTAGEDIR}/${DISTDIR}/dtb.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/${DISTDIR}/dtb.ucl \
-p ${KSTAGEDIR}/${DISTDIR}/dtb.plist \
-r ${KSTAGEDIR}/${DISTDIR} \
@@ -2295,7 +2297,7 @@ create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap
${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \
-p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \
-r ${KSTAGEDIR}/${DISTDIR} \
@@ -2338,7 +2340,7 @@ create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kerne
${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \
${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \
-o OSVERSION="${SRCRELDATE}" \
- create -f ${PKG_FORMAT} ${PKG_CLEVEL} \
+ create -f ${PKG_FORMAT} ${PKG_CLEVEL} -T${PKG_CTHREADS} \
-M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \
-p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \
-r ${KSTAGEDIR}/kernel.${_kernel} \
diff --git a/RELNOTES b/RELNOTES
index 174ce12e4148..e34a5b23a005 100644
--- a/RELNOTES
+++ b/RELNOTES
@@ -11,7 +11,7 @@ newline. Entries should be separated by a newline.
Changes to this file should not be MFCed.
5000d023a446, 03da141d59ae:
- Add a "-f" option to "kadmin -l dump" with can be used to
+ Add a "-f" option to "kadmin -l dump" which can be used to
dump the Heimdal KDC database in a format that can be loaded
into the MIT KDC.
See https://wiki.freebsd.org/Kerberos/Heimdal2MIT_KDC_Migration
diff --git a/UPDATING b/UPDATING
index 4460898fca2d..9c8bd3a9fd6b 100644
--- a/UPDATING
+++ b/UPDATING
@@ -27,6 +27,12 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 16.x IS SLOW:
world, or to merely disable the most expensive debugging functionality
at runtime, run "ln -s 'abort:false,junk:false' /etc/malloc.conf".)
+20251012:
+ Blacklist has been renamed upstream to Blocklist. If you have it
+ configured, rename all configuration files, firewall anchors or
+ sentinel files to reflect the new nomenclature. Old setups will
+ continue to work emitting a warning.
+
20251002:
Audio-related utilities including mixer(8) and virtual_oss(8) have
moved to the new FreeBSD-sound package. If you have set-optional or
@@ -700,7 +706,7 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 16.x IS SLOW:
Bump _FreeBSD_version to 1400078 to be able to detect this change.
20221212:
- llvm-objump is now always installed as objdump. Previously there was
+ llvm-objdump is now always installed as objdump. Previously there was
no /usr/bin/objdump unless the WITH_LLVM_BINUTILS knob was used.
Some LLVM objdump options have a different output format compared to
diff --git a/lib/libc/stdlib/realpath.3 b/lib/libc/stdlib/realpath.3
index 065ba312c2ef..76f40249963b 100644
--- a/lib/libc/stdlib/realpath.3
+++ b/lib/libc/stdlib/realpath.3
@@ -28,7 +28,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd May 11, 2012
+.Dd October 10, 2025
.Dt REALPATH 3
.Os
.Sh NAME
@@ -108,11 +108,11 @@ and
.Xr getcwd 3 .
.Sh SEE ALSO
.Xr getcwd 3
-.\" .Sh STANDARDS
-.\" The
-.\" .Fn realpath
-.\" function conforms to
-.\" .St -p1003.1-2001 .
+.Sh STANDARDS
+The
+.Fn realpath
+function conforms to
+.St -p1003.1-2001 .
.Sh HISTORY
The
.Fn realpath
diff --git a/lib/libc/stdlib/realpath.c b/lib/libc/stdlib/realpath.c
index 4c52b73319ab..18f29e95ee6b 100644
--- a/lib/libc/stdlib/realpath.c
+++ b/lib/libc/stdlib/realpath.c
@@ -49,7 +49,7 @@ realpath1(const char *path, char *resolved)
{
struct stat sb;
char *p, *q;
- size_t left_len, resolved_len, next_token_len;
+ size_t left_len, prev_len, resolved_len, next_token_len;
unsigned symlinks;
ssize_t slen;
char left[PATH_MAX], next_token[PATH_MAX], symlink[PATH_MAX];
@@ -98,6 +98,7 @@ realpath1(const char *path, char *resolved)
left_len = 0;
}
+ prev_len = resolved_len;
if (resolved[resolved_len - 1] != '/') {
if (resolved_len + 1 >= PATH_MAX) {
errno = ENAMETOOLONG;
@@ -133,8 +134,17 @@ realpath1(const char *path, char *resolved)
errno = ENAMETOOLONG;
return (NULL);
}
- if (lstat(resolved, &sb) != 0)
+ if (lstat(resolved, &sb) != 0) {
+ /*
+ * EACCES means the parent directory is not
+ * readable, while ENOTDIR means the parent
+ * directory is not a directory. Rewind the path
+ * to correctly indicate where the error lies.
+ */
+ if (errno == EACCES || errno == ENOTDIR)
+ resolved[prev_len] = '\0';
return (NULL);
+ }
if (S_ISLNK(sb.st_mode)) {
if (symlinks++ > MAXSYMLINKS) {
errno = ELOOP;
diff --git a/lib/libc/tests/gen/realpath2_test.c b/lib/libc/tests/gen/realpath2_test.c
index f89dd99cbb72..431df8721ae0 100644
--- a/lib/libc/tests/gen/realpath2_test.c
+++ b/lib/libc/tests/gen/realpath2_test.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2017 Jan Kokemüller
* All rights reserved.
+ * Copyright (c) 2025 Klara, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -25,6 +26,8 @@
*/
#include <sys/param.h>
+#include <sys/stat.h>
+
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -34,6 +37,31 @@
#include <atf-c.h>
+ATF_TC(realpath_null);
+ATF_TC_HEAD(realpath_null, tc)
+{
+ atf_tc_set_md_var(tc, "descr", "Test null input");
+}
+ATF_TC_BODY(realpath_null, tc)
+{
+ ATF_REQUIRE_ERRNO(EINVAL, realpath(NULL, NULL) == NULL);
+}
+
+ATF_TC(realpath_empty);
+ATF_TC_HEAD(realpath_empty, tc)
+{
+ atf_tc_set_md_var(tc, "descr", "Test empty input");
+}
+ATF_TC_BODY(realpath_empty, tc)
+{
+ char resb[PATH_MAX] = "";
+
+ ATF_REQUIRE_EQ(0, mkdir("foo", 0755));
+ ATF_REQUIRE_EQ(0, chdir("foo"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("", resb) == NULL);
+ ATF_REQUIRE_STREQ("", resb);
+}
+
ATF_TC(realpath_buffer_overflow);
ATF_TC_HEAD(realpath_buffer_overflow, tc)
{
@@ -44,16 +72,11 @@ ATF_TC_HEAD(realpath_buffer_overflow, tc)
ATF_TC_BODY(realpath_buffer_overflow, tc)
{
- char path[MAXPATHLEN] = { 0 };
- char resb[MAXPATHLEN] = { 0 };
- size_t i;
+ char path[PATH_MAX] = "";
+ char resb[PATH_MAX] = "";
- path[0] = 'a';
+ memset(path, 'a', sizeof(path) - 1);
path[1] = '/';
- for (i = 2; i < sizeof(path) - 1; ++i) {
- path[i] = 'a';
- }
-
ATF_REQUIRE(realpath(path, resb) == NULL);
}
@@ -66,9 +89,9 @@ ATF_TC_HEAD(realpath_empty_symlink, tc)
ATF_TC_BODY(realpath_empty_symlink, tc)
{
- char path[MAXPATHLEN] = { 0 };
- char slnk[MAXPATHLEN] = { 0 };
- char resb[MAXPATHLEN] = { 0 };
+ char path[PATH_MAX] = "";
+ char slnk[PATH_MAX] = "";
+ char resb[PATH_MAX] = "";
int fd;
(void)strlcat(slnk, "empty_symlink", sizeof(slnk));
@@ -89,11 +112,70 @@ ATF_TC_BODY(realpath_empty_symlink, tc)
ATF_REQUIRE(unlink(slnk) == 0);
}
-ATF_TP_ADD_TCS(tp)
+ATF_TC(realpath_partial);
+ATF_TC_HEAD(realpath_partial, tc)
+{
+ atf_tc_set_md_var(tc, "descr",
+ "Test that failure leaves a partial result");
+ atf_tc_set_md_var(tc, "require.user", "unprivileged");
+}
+
+ATF_TC_BODY(realpath_partial, tc)
{
+ char resb[PATH_MAX] = "";
+ size_t len;
+
+ /* scenario 1: missing directory */
+ ATF_REQUIRE_EQ(0, mkdir("foo", 0755));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 8 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar", resb + len - 8);
+
+ /* scenario 2: dead link 1 */
+ ATF_REQUIRE_EQ(0, symlink("nix", "foo/bar"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 8 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/nix", resb + len - 8);
+
+ /* scenario 3: missing file */
+ ATF_REQUIRE_EQ(0, unlink("foo/bar"));
+ ATF_REQUIRE_EQ(0, mkdir("foo/bar", 0755));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 12 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar/baz", resb + len - 12);
+
+ /* scenario 4: dead link 2 */
+ ATF_REQUIRE_EQ(0, symlink("nix", "foo/bar/baz"));
+ ATF_REQUIRE_ERRNO(ENOENT, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 12 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo/bar/nix", resb + len - 12);
+
+ /* scenario 5: unreadable directory */
+ ATF_REQUIRE_EQ(0, chmod("foo", 000));
+ ATF_REQUIRE_ERRNO(EACCES, realpath("foo/bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 4 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/foo", resb + len - 4);
+
+ /* scenario 6: not a directory */
+ ATF_REQUIRE_EQ(0, close(creat("bar", 0644)));
+ ATF_REQUIRE_ERRNO(ENOTDIR, realpath("bar/baz", resb) == NULL);
+ len = strnlen(resb, sizeof(resb));
+ ATF_REQUIRE(len > 4 && len < sizeof(resb));
+ ATF_REQUIRE_STREQ("/bar", resb + len - 4);
+}
+ATF_TP_ADD_TCS(tp)
+{
+ ATF_TP_ADD_TC(tp, realpath_null);
+ ATF_TP_ADD_TC(tp, realpath_empty);
ATF_TP_ADD_TC(tp, realpath_buffer_overflow);
ATF_TP_ADD_TC(tp, realpath_empty_symlink);
+ ATF_TP_ADD_TC(tp, realpath_partial);
return atf_no_error();
}
diff --git a/lib/libsys/socket.2 b/lib/libsys/socket.2
index b211611c6354..48b8f4e87489 100644
--- a/lib/libsys/socket.2
+++ b/lib/libsys/socket.2
@@ -25,7 +25,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd May 17, 2025
+.Dd September 28, 2025
.Dt SOCKET 2
.Os
.Sh NAME
@@ -64,7 +64,7 @@ PF_NETGRAPH Netgraph sockets,
PF_NETLINK Netlink protocols,
PF_BLUETOOTH Bluetooth protocols,
PF_INET_SDP OFED socket direct protocol (IPv4),
-AF_HYPERV HyperV sockets
+PF_HYPERV HyperV sockets
.Ed
.Pp
Each protocol family is connected to an address family, which has the
@@ -89,32 +89,6 @@ SOCK_RAW Raw-protocol interface,
SOCK_SEQPACKET Sequenced packet stream
.Ed
.Pp
-A
-.Dv SOCK_STREAM
-type provides sequenced, reliable,
-two-way connection based byte streams.
-An out-of-band data transmission mechanism may be supported.
-A
-.Dv SOCK_DGRAM
-socket supports
-datagrams (connectionless, unreliable messages of
-a fixed (typically small) maximum length).
-A
-.Dv SOCK_SEQPACKET
-socket may provide a sequenced, reliable,
-two-way connection-based data transmission path for datagrams
-of fixed maximum length; a consumer may be required to read
-an entire packet with each read system call.
-This facility may have protocol-specific properties.
-.Dv SOCK_RAW
-sockets provide access to internal network protocols and interfaces.
-The
-.Dv SOCK_RAW
-type is available only to the super-user and is described in
-.Xr ip 4
-and
-.Xr ip6 4 .
-.Pp
Additionally, the following flags are allowed in the
.Fa type
argument:
@@ -140,32 +114,23 @@ particular to the
in which communication
is to take place; see
.Xr protocols 5 .
-.Pp
The
.Fa protocol
argument may be set to zero (0) to request the default
implementation of a socket type for the protocol, if any.
-.Pp
-Sockets of type
+.Sh STREAM SOCKET TYPE
+The
+.Dv SOCK_STREAM
+socket type provides reliable, sequenced, full-duplex octet streams between
+the socket and a peer to which the socket is connected.
+A socket of type
.Dv SOCK_STREAM
-are full-duplex byte streams, similar
-to pipes.
-A stream socket must be in a
+needs to be in a
.Em connected
-state before any data may be sent or received
-on it.
+state before any data can be sent or received.
A connection to another socket is created with a
.Xr connect 2
system call.
-Once connected, data may be transferred using
-.Xr read 2
-and
-.Xr write 2
-calls or some variant of the
-.Xr send 2
-and
-.Xr recv 2
-functions.
(Some protocol families, such as the Internet family,
support the notion of an
.Dq implied connect ,
@@ -173,62 +138,210 @@ which permits data to be sent piggybacked onto a connect operation by
using the
.Xr sendto 2
system call.)
-When a session has been completed a
-.Xr close 2
-may be performed.
-Out-of-band data may also be transmitted as described in
+Once connected, data may be sent using
+.Xr send 2 ,
+.Xr sendto 2 ,
+.Xr sendmsg 2
+and
+.Xr write 2
+system calls.
+Data may be received using
+.Xr recv 2 ,
+.Xr recvfrom 2 ,
+.Xr recvmsg 2 ,
+and
+.Xr read 2
+system calls.
+Record boundaries are not maintained; data sent on a stream socket using output
+operations of one size can be received using input operations of smaller or
+larger sizes without loss of data.
+Data may be buffered; successful return from an output function does not imply
+that the data has been delivered to the peer or even transmitted from the local
+system.
+For certain protocols out-of-band data may also be transmitted as described in
.Xr send 2
and received as described in
.Xr recv 2 .
.Pp
-The communications protocols used to implement a
-.Dv SOCK_STREAM
-ensure that data
-is not lost or duplicated.
-If a piece of data for which the
-peer protocol has buffer space cannot be successfully transmitted
-within a reasonable length of time, then
-the connection is considered broken and calls
-will indicate an error with
--1 returns and with
-.Er ETIMEDOUT
-as the specific code
-in the global variable
-.Va errno .
-The protocols optionally keep sockets
-.Dq warm
-by forcing transmissions
-roughly every minute in the absence of other activity.
-An error is then indicated if no response can be
-elicited on an otherwise
-idle connection for an extended period (e.g.\& 5 minutes).
-By default, a
+If data cannot be successfully transmitted within a given time then the
+connection is considered broken, and subsequent operations shall fail with
+a protocol specific error code.
+A
.Dv SIGPIPE
-signal is raised if a process sends
-on a broken stream, but this behavior may be inhibited via
+signal is raised if a thread attempts to send data on a broken stream (one that
+is no longer connected).
+The signal can be suppressed by the
+.Dv MSG_NOSIGNAL
+flag with distinct
+.Xr send 2 ,
+.Xr sendto 2 ,
+and
+.Xr sendmsg 2
+system calls or by the
+.Dv SO_NOSIGPIPE
+socket option set on the socket with
.Xr setsockopt 2 .
.Pp
-.Dv SOCK_SEQPACKET
-sockets employ the same system calls
-as
+The
.Dv SOCK_STREAM
-sockets.
-The only difference
-is that
-.Xr read 2
-calls will return only the amount of data requested,
-and any remaining in the arriving packet will be discarded.
+socket is supported by the following protocol families:
+.Dv PF_INET ,
+.Dv PF_INET6 ,
+.Dv PF_UNIX ,
+.Dv PF_BLUETOOTH ,
+.Dv PF_HYPERV ,
+and
+.Dv PF_INET_SDP .
+An out-of-band data transmission mechanism is supported for stream sockets of
+.Dv PF_INET
+and
+.Dv PF_INET6
+protocol families.
+.Sh DATAGRAM SOCKET TYPE
+The
+.Dv SOCK_DGRAM
+socket type supports connectionless data transfer which is not necessarily
+acknowledged or reliable.
+Datagrams can be sent to the address specified (possibly multicast or
+broadcast) in each output operation, and incoming datagrams can be received
+from multiple sources.
+The source address of each datagram is available when receiving the datagram
+with
+.Xr recvfrom 2
+or
+.Xr recvmsg 2 .
+An application can also pre-specify a peer address with
+.Xr sendto 2
+or
+.Xr sendmsg 2 ,
+in which case calls to output functions that do not specify a peer address
+shall send to the pre-specified peer.
+If a peer has been specified, only datagrams from that peer shall be received.
+A datagram shall be sent in a single output operation, and needs to be received
+in a single input operation.
+The maximum size of a datagram is protocol-specific.
+Output datagrams may be buffered within the system; thus, a successful return
+from an output function does not guarantee that a datagram is actually sent or
+received.
.Pp
+The
.Dv SOCK_DGRAM
+socket is supported by the following protocol families:
+.Dv PF_INET ,
+.Dv PF_INET6 ,
+.Dv PF_UNIX ,
+.Dv PF_NETGRAPH ,
and
-.Dv SOCK_RAW
-sockets allow sending of datagrams to correspondents
-named in
+.Dv PF_NETLINK .
+.Sh SEQUENCED PACKET SOCKET TYPE
+The
+.Dv SOCK_SEQPACKET
+socket type is similar to the
+.Dv SOCK_STREAM
+type, and is also connection-oriented.
+The only difference between these types is that record boundaries are
+maintained using the
+.Dv SOCK_SEQPACKET
+type.
+A record can be sent using one or more output operations and received using one
+or more input operations, but a single operation never transfers parts of more
+than one record.
+Record boundaries are set by the sender with the
+.Dv MSG_EOR
+flag of
.Xr send 2
-calls.
-Datagrams are generally received with
+or
+.Xr sendmsg 2
+functions.
+There is no possibility to set a record boundary with
+.Xr write 2 .
+Record boundaries are visible to the receiver via the
+.Dv MSG_EOR
+flag in the received message flags returned by the
+.Xr recvmsg 2
+function.
+It is protocol-specific whether a maximum record size is imposed.
+.Pp
+The
+.Dv SOCK_SEQPACKET
+socket is supported by the following protocol families:
+.Dv PF_INET ,
+.Dv PF_INET6 ,
+and
+.Dv PF_UNIX .
+.Pp
+.Sh RAW SOCKET TYPE
+The
+.Dv SOCK_RAW
+socket type provides access to internal network protocols and interfaces.
+It is a datagram socket in its nature, thus has the same semantics of
+read and write operations.
+The
+.Dv SOCK_RAW
+type is available only to the super-user and is described in
+.Xr ip 4
+and
+.Xr ip6 4 .
+.Sh NON-BLOCKING MODE
+A socket can be created in
+.Em non-blocking mode
+with the help of the
+.Dv SOCK_NONBLOCK
+flag.
+Alternatively, the non-blocking mode on a socket can be turned on and off with
+the help of the
+.Dv O_NONBLOCK
+flag of the
+.Xr fcntl 2
+system call.
+.Pp
+When a non-blocking socket does not have enough data in its receive buffer to
+fill the application-supplied buffer, data receiving system calls like
+.Xr recv 2 ,
.Xr recvfrom 2 ,
-which returns the next datagram with its return address.
+.Xr recvmsg 2
+and
+.Xr read 2
+will not block waiting for the data but immediately return.
+The return value will indicate the number of bytes read into the supplied buffer.
+The
+.Va errno
+will be set to
+.Dv EAGAIN
+.Po
+has same value as
+.Dv EWOULDBLOCK
+.Pc .
+.Pp
+If an application tries to send more data on a non-blocking socket than the
+socket send buffer can accommodate with
+.Xr send 2 ,
+.Xr sendto 2 ,
+.Xr sendmsg 2
+or
+.Xr write 2
+system calls, partial data will be sent.
+The return value will indicate the number of bytes sent.
+The
+.Va errno
+will be set to
+.Dv EAGAIN .
+Note that sockets of
+.Dv SOCK_DGRAM
+type are unreliable, thus for these sockets sending operations will never fail
+with
+.Dv EAGAIN
+in non-blocking mode, nor will they block in blocking mode.
+.Sh OTHER OPERATIONS ON SOCKETS
+Since socket descriptors are file descriptors, many generic file operations
+performed by
+.Xr fcntl 2 ,
+apply.
+Socket descriptors can be used with all event engines, such as
+.Xr kevent 2 ,
+.Xr select 2
+and
+.Xr poll 2 .
.Pp
An
.Xr fcntl 2
@@ -250,6 +363,12 @@ The
and
.Xr getsockopt 2
system calls are used to set and get options, respectively.
+.Pp
+A connection associated with a socket can be terminated by the
+.Xr close 2
+system call.
+One direction of communication can be disabled with
+.Xr shutdown 2 .
.Sh RETURN VALUES
A -1 is returned if an error occurs, otherwise the return
value is a descriptor referencing the socket.
@@ -282,16 +401,23 @@ The socket type is not supported by the protocol.
.Sh SEE ALSO
.Xr accept 2 ,
.Xr bind 2 ,
+.Xr close 2 ,
.Xr connect 2 ,
+.Xr fcntl 2 ,
.Xr getpeername 2 ,
.Xr getsockname 2 ,
.Xr getsockopt 2 ,
.Xr ioctl 2 ,
+.Xr kevent 2 ,
.Xr listen 2 ,
+.Xr poll 2 ,
.Xr read 2 ,
.Xr recv 2 ,
.Xr select 2 ,
.Xr send 2 ,
+.Xr sendmsg 2 ,
+.Xr sendto 2 ,
+.Xr signal 3 ,
.Xr shutdown 2 ,
.Xr socketpair 2 ,
.Xr write 2 ,
diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c
index f10f02d159c3..41b4a5336350 100644
--- a/sbin/fsck_ffs/setup.c
+++ b/sbin/fsck_ffs/setup.c
@@ -58,7 +58,6 @@ char *copybuf; /* buffer to copy snapshot blocks */
static int sbhashfailed;
#define POWEROF2(num) (((num) & ((num) - 1)) == 0)
-static int calcsb(char *dev, int devfd, struct fs *fs);
static void saverecovery(int readfd, int writefd);
static int chkrecovery(int devfd);
static int getlbnblkno(struct inodesc *);
@@ -501,52 +500,6 @@ sblock_init(void)
}
/*
- * Calculate a prototype superblock based on information in the boot area.
- * When done the cgsblock macro can be calculated and the fs_ncg field
- * can be used. Do NOT attempt to use other macros without verifying that
- * their needed information is available!
- */
-static int
-calcsb(char *dev, int devfd, struct fs *fs)
-{
- struct fsrecovery *fsr;
- char *fsrbuf;
- u_int secsize;
-
- /*
- * We need fragments-per-group and the partition-size.
- *
- * Newfs stores these details at the end of the boot block area
- * at the start of the filesystem partition. If they have been
- * overwritten by a boot block, we fail. But usually they are
- * there and we can use them.
- */
- if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1)
- return (0);
- fsrbuf = Balloc(secsize);
- if (fsrbuf == NULL)
- errx(EEXIT, "calcsb: cannot allocate recovery buffer");
- if (blread(devfd, fsrbuf,
- (SBLOCK_UFS2 - secsize) / dev_bsize, secsize) != 0) {
- free(fsrbuf);
- return (0);
- }
- fsr = (struct fsrecovery *)&fsrbuf[secsize - sizeof *fsr];
- if (fsr->fsr_magic != FS_UFS2_MAGIC) {
- free(fsrbuf);
- return (0);
- }
- memset(fs, 0, sizeof(struct fs));
- fs->fs_fpg = fsr->fsr_fpg;
- fs->fs_fsbtodb = fsr->fsr_fsbtodb;
- fs->fs_sblkno = fsr->fsr_sblkno;
- fs->fs_magic = fsr->fsr_magic;
- fs->fs_ncg = fsr->fsr_ncg;
- free(fsrbuf);
- return (1);
-}
-
-/*
* Check to see if recovery information exists.
* Return 1 if it exists or cannot be created.
* Return 0 if it does not exist and can be created.
diff --git a/sbin/geom/Makefile b/sbin/geom/Makefile
index 078503d3ae67..61561ef1ff1b 100644
--- a/sbin/geom/Makefile
+++ b/sbin/geom/Makefile
@@ -9,7 +9,7 @@ MAN= geom.8
CFLAGS+= -I${.CURDIR} -I${.CURDIR}/core
CFLAGS+= -DGEOM_CLASS_DIR=\"${GEOM_CLASS_DIR}\"
-LIBADD= geom util
+LIBADD= geom util xo
.if defined(RESCUE)
.PATH: ${SRCTOP}/lib/geom/part \
diff --git a/sbin/geom/core/geom.c b/sbin/geom/core/geom.c
index b78021194ddd..496123f08274 100644
--- a/sbin/geom/core/geom.c
+++ b/sbin/geom/core/geom.c
@@ -49,9 +49,12 @@
#include <assert.h>
#include <libgeom.h>
#include <geom.h>
+#include <libxo/xo.h>
#include "misc/subr.h"
+#define GEOM_XO_VERSION "1"
+
#ifdef STATIC_GEOM_CLASSES
extern uint32_t gpart_version;
extern struct g_command gpart_class_commands[];
@@ -513,6 +516,7 @@ run_command(int argc, char *argv[])
gctl_free(req);
if (verbose)
printf("Done.\n");
+ xo_finish();
exit(EXIT_SUCCESS);
}
@@ -810,6 +814,10 @@ main(int argc, char *argv[])
provider_name = NULL;
tflag = false;
+ argc = xo_parse_args(argc, argv);
+ if (argc < 0)
+ return (argc);
+
if (strcmp(getprogname(), "geom") == 0) {
while ((ch = getopt(argc, argv, "hp:t")) != -1) {
switch (ch) {
@@ -831,6 +839,7 @@ main(int argc, char *argv[])
* Don't adjust argc and argv, it would break get_class().
*/
}
+ xo_set_version(GEOM_XO_VERSION);
if (tflag && provider_name != NULL) {
errx(EXIT_FAILURE,
@@ -839,6 +848,7 @@ main(int argc, char *argv[])
if (provider_name != NULL) {
list_one_geom_by_provider(provider_name);
+ xo_finish();
return (0);
}
@@ -882,29 +892,33 @@ find_geom(struct gclass *classp, const char *name)
}
static void
-list_one_provider(struct gprovider *pp, const char *prefix)
+list_one_provider(struct gprovider *pp, const char *padding)
{
struct gconfig *conf;
char buf[5];
- printf("Name: %s\n", pp->lg_name);
+ xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name);
humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "",
HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL);
- printf("%sMediasize: %jd (%s)\n", prefix, (intmax_t)pp->lg_mediasize,
- buf);
- printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize);
+ xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n",
+ padding, (intmax_t)pp->lg_mediasize, buf);
+ xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u}\n",
+ padding, pp->lg_sectorsize);
if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) {
- printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize);
- printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset);
+ xo_emit("{P:/%s}{Lcw:Stripesize}{:Stripesize/%ju}\n",
+ padding, pp->lg_stripesize);
+ xo_emit("{P:/%s}{Lcw:Stripeoffset}{:Stripeoffset/%ju}\n",
+ padding, pp->lg_stripeoffset);
}
- printf("%sMode: %s\n", prefix, pp->lg_mode);
+ xo_emit("{P:/%s}{Lcw:Mode}{:Mode}\n", padding, pp->lg_mode);
LIST_FOREACH(conf, &pp->lg_config, lg_config) {
- printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val);
+ xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name,
+ conf->lg_name, conf->lg_val);
}
}
static void
-list_one_consumer(struct gconsumer *cp, const char *prefix)
+list_one_consumer(struct gconsumer *cp, const char *padding)
{
struct gprovider *pp;
struct gconfig *conf;
@@ -915,20 +929,24 @@ list_one_consumer(struct gconsumer *cp, const char *prefix)
else {
char buf[5];
- printf("Name: %s\n", pp->lg_name);
+ xo_emit("{Lcw:Name}{:Name}\n", pp->lg_name);
humanize_number(buf, sizeof(buf), (int64_t)pp->lg_mediasize, "",
HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL);
- printf("%sMediasize: %jd (%s)\n", prefix,
- (intmax_t)pp->lg_mediasize, buf);
- printf("%sSectorsize: %u\n", prefix, pp->lg_sectorsize);
+ xo_emit("{P:/%s}{Lcw:Mediasize}{:Mediasize/%jd} ({N:/%s})\n",
+ padding, (intmax_t)pp->lg_mediasize, buf);
+ xo_emit("{P:/%s}{Lcw:Sectorsize}{:Sectorsize/%u}\n",
+ padding, pp->lg_sectorsize);
if (pp->lg_stripesize > 0 || pp->lg_stripeoffset > 0) {
- printf("%sStripesize: %ju\n", prefix, pp->lg_stripesize);
- printf("%sStripeoffset: %ju\n", prefix, pp->lg_stripeoffset);
+ xo_emit("{P:/%s}{Lcw:Stripesize}{:Stripesize/%ju}\n",
+ padding, pp->lg_stripesize);
+ xo_emit("{P:/%s}{Lcw:Stripeoffset}{:Stripeoffset/%ju}\n",
+ padding, pp->lg_stripeoffset);
}
- printf("%sMode: %s\n", prefix, cp->lg_mode);
+ xo_emit("{P:/%s}{Lcw:Mode}{:Mode}\n", padding, cp->lg_mode);
}
LIST_FOREACH(conf, &cp->lg_config, lg_config) {
- printf("%s%s: %s\n", prefix, conf->lg_name, conf->lg_val);
+ xo_emit("{P:/%s}{Lcwa:}{a:}\n", padding, conf->lg_name,
+ conf->lg_name, conf->lg_val);
}
}
@@ -940,27 +958,36 @@ list_one_geom(struct ggeom *gp)
struct gconfig *conf;
unsigned n;
- printf("Geom name: %s\n", gp->lg_name);
+ xo_emit("{Lcw:Geom name}{:Name}\n", gp->lg_name);
LIST_FOREACH(conf, &gp->lg_config, lg_config) {
- printf("%s: %s\n", conf->lg_name, conf->lg_val);
+ xo_emit("{Lcwa:}{a:}\n", conf->lg_name, conf->lg_name,
+ conf->lg_val);
}
if (!LIST_EMPTY(&gp->lg_provider)) {
- printf("Providers:\n");
+ xo_open_list("Providers");
+ xo_emit("{Tc:Providers}\n");
n = 1;
LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
- printf("%u. ", n++);
+ xo_emit("{T:/%u} ", n++);
+ xo_open_instance("provider");
list_one_provider(pp, " ");
+ xo_close_instance("provider");
}
+ xo_close_list("Providers");
}
if (!LIST_EMPTY(&gp->lg_consumer)) {
- printf("Consumers:\n");
+ xo_open_list("Consumers");
+ xo_emit("{Tc:Consumers}\n");
n = 1;
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- printf("%u. ", n++);
+ xo_emit("{T:/%u} ", n++);
+ xo_open_instance("consumer");
list_one_consumer(cp, " ");
+ xo_close_instance("consumer");
}
+ xo_close_list("Consumers");
}
- printf("\n");
+ xo_emit("\n");
}
static void
@@ -978,8 +1005,10 @@ list_one_geom_by_provider(const char *provider_name)
if (gp == NULL)
errx(EXIT_FAILURE, "Cannot find provider '%s'.", provider_name);
- printf("Geom class: %s\n", gp->lg_class->lg_name);
+ xo_open_container("Geom");
+ xo_emit("{Lwc:Geom class}{:Class}\n", gp->lg_class->lg_name);
list_one_geom(gp);
+ xo_close_container("Geom");
}
static void
@@ -1038,14 +1067,20 @@ std_list(struct gctl_req *req, unsigned flags __unused)
"an instance named '%s'.",
gclass_name, name);
}
+ xo_open_container("Geom");
list_one_geom(gp);
+ xo_close_container("Geom");
}
} else {
+ xo_open_list("Geoms");
LIST_FOREACH(gp, &classp->lg_geom, lg_geom) {
if (LIST_EMPTY(&gp->lg_provider) && !all)
continue;
+ xo_open_instance("geom");
list_one_geom(gp);
+ xo_close_instance("geom");
}
+ xo_close_list("Geoms");
}
geom_deletetree(&mesh);
}
@@ -1115,34 +1150,24 @@ status_update_len_prs(struct ggeom *gp, int *name_len, int *status_len)
}
static char *
-status_one_consumer(struct gconsumer *cp)
+status_one_consumer(struct gconsumer *cp, const char *value)
{
- static char buf[256];
struct gprovider *pp;
struct gconfig *conf;
- const char *state, *syncr;
+ char *ret;
pp = cp->lg_provider;
if (pp == NULL)
return (NULL);
- state = NULL;
- syncr = NULL;
+ ret = NULL;
LIST_FOREACH(conf, &cp->lg_config, lg_config) {
- if (strcasecmp(conf->lg_name, "state") == 0)
- state = conf->lg_val;
- if (strcasecmp(conf->lg_name, "synchronized") == 0)
- syncr = conf->lg_val;
- }
- if (state == NULL && syncr == NULL)
- snprintf(buf, sizeof(buf), "%s", pp->lg_name);
- else if (state != NULL && syncr != NULL) {
- snprintf(buf, sizeof(buf), "%s (%s, %s)", pp->lg_name,
- state, syncr);
- } else {
- snprintf(buf, sizeof(buf), "%s (%s)", pp->lg_name,
- state ? state : syncr);
+ if (strcasecmp(conf->lg_name, value) == 0)
+ ret = conf->lg_val;
}
- return (buf);
+
+ if (ret == NULL)
+ return (NULL);
+ return (ret);
}
static void
@@ -1150,8 +1175,8 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len)
{
struct gconsumer *cp;
struct gconfig *conf;
- const char *name, *status, *component;
- int gotone;
+ const char *name, *status, *cstate, *csyncr;
+ int gotone, len;
name = gp->lg_name;
status = "N/A";
@@ -1161,21 +1186,53 @@ status_one_geom(struct ggeom *gp, int script, int name_len, int status_len)
break;
}
}
- gotone = 0;
+ gotone = len = 0;
+ xo_open_instance("status");
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- component = status_one_consumer(cp);
- if (component == NULL)
+ cstate = status_one_consumer(cp, "state");
+ csyncr = status_one_consumer(cp, "synchronized");
+ if (cp->lg_provider == NULL)
continue;
+ if (!gotone || script) {
+ if (!gotone) {
+ xo_emit("{:name/%*s} {:status/%*s} ",
+ name_len, name, status_len, status);
+ } else {
+ xo_emit("{d:name/%*s} {d:status/%*s} ",
+ name_len, name, status_len, status);
+ }
+ xo_open_list("components");
+ }
+
+ xo_open_instance("components");
+ if (cstate != NULL && csyncr != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:state}, {:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, cstate, csyncr);
+ } else if (cstate != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:state})\n",
+ len, "", cp->lg_provider->lg_name, cstate);
+ } else if (csyncr != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, csyncr);
+ } else {
+ /* No state/synchronized config: print the bare component name. */
+ xo_emit("{P:/%*s}{:component}\n",
+ len, "", cp->lg_provider->lg_name);
+ }
+ xo_close_instance("components");
gotone = 1;
- printf("%*s %*s %s\n", name_len, name, status_len, status,
- component);
- if (!script)
- name = status = "";
+ if (!len && !script)
+ len = name_len + status_len + 4;
}
if (!gotone) {
- printf("%*s %*s %s\n", name_len, name, status_len, status,
- "N/A");
+ xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status);
+ xo_open_list("components");
+ xo_open_instance("components");
+ xo_emit("{P:/%*s}{d:component}\n", len, "", "N/A");
+ xo_close_instance("components");
}
+ xo_close_list("components");
+ xo_close_instance("status");
}
static void
@@ -1184,9 +1241,10 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len)
struct gprovider *pp;
struct gconsumer *cp;
struct gconfig *conf;
- const char *name, *status, *component;
- int gotone;
+ const char *name, *status, *cstate, *csyncr;
+ int gotone, len;
+ xo_open_instance("status");
LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
name = pp->lg_name;
status = "N/A";
@@ -1202,22 +1260,54 @@ status_one_geom_prs(struct ggeom *gp, int script, int name_len, int status_len)
break;
}
}
- gotone = 0;
+ gotone = len = 0;
LIST_FOREACH(cp, &gp->lg_consumer, lg_consumer) {
- component = status_one_consumer(cp);
- if (component == NULL)
+ cstate = status_one_consumer(cp, "state");
+ csyncr = status_one_consumer(cp, "synchronized");
+ if (cp->lg_provider == NULL)
continue;
+
+ if (!gotone || script) {
+ if (!gotone) {
+ xo_emit("{:name/%*s} {:status/%*s} ",
+ name_len, name, status_len, status);
+ } else {
+ xo_emit("{d:name/%*s} {d:status/%*s} ",
+ name_len, name, status_len, status);
+ }
+ xo_open_list("components");
+ }
+
+ xo_open_instance("components");
+ if (cstate != NULL && csyncr != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:state}, {:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, cstate, csyncr);
+ } else if (cstate != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:state})\n",
+ len, "", cp->lg_provider->lg_name, cstate);
+ } else if (csyncr != NULL) {
+ xo_emit("{P:/%*s}{:component} ({:synchronized})\n",
+ len, "", cp->lg_provider->lg_name, csyncr);
+ } else {
+ /* No state/synchronized config: print the bare component name. */
+ xo_emit("{P:/%*s}{:component}\n",
+ len, "", cp->lg_provider->lg_name);
+ }
+ xo_close_instance("components");
gotone = 1;
- printf("%*s %*s %s\n", name_len, name,
- status_len, status, component);
- if (!script)
- name = status = "";
+ if (!len && !script)
+ len = name_len + status_len + 4;
}
if (!gotone) {
- printf("%*s %*s %s\n", name_len, name,
- status_len, status, "N/A");
+ xo_emit("{:name/%*s} {:status/%*s} ", name_len, name, status_len, status);
+ xo_open_list("components");
+ xo_open_instance("components");
+ xo_emit("{P:/%*s}{d:component}\n", len, "", "N/A");
+ xo_close_instance("components");
}
+ xo_close_list("components");
}
+ xo_close_instance("status");
}
static void
@@ -1240,13 +1322,9 @@ std_status(struct gctl_req *req, unsigned flags __unused)
all = gctl_get_int(req, "all");
geoms = gctl_get_int(req, "geoms");
script = gctl_get_int(req, "script");
- if (script) {
- name_len = 0;
- status_len = 0;
- } else {
- name_len = strlen("Name");
- status_len = strlen("Status");
- }
+ name_len = strlen("Name");
+ status_len = strlen("Status");
+
if (nargs > 0) {
for (i = 0, n = 0; i < nargs; i++) {
name = gctl_get_ascii(req, "arg%d", i);
@@ -1282,9 +1360,10 @@ std_status(struct gctl_req *req, unsigned flags __unused)
goto end;
}
if (!script) {
- printf("%*s %*s %s\n", name_len, "Name", status_len, "Status",
- "Components");
+ xo_emit("{T:/%*s} {T:/%*s} {T:Components}\n",
+ name_len, "Name", status_len, "Status");
}
+ xo_open_list("status");
if (nargs > 0) {
for (i = 0; i < nargs; i++) {
name = gctl_get_ascii(req, "arg%d", i);
@@ -1312,6 +1391,7 @@ std_status(struct gctl_req *req, unsigned flags __unused)
}
}
}
+ xo_close_list("status");
end:
geom_deletetree(&mesh);
}
diff --git a/sbin/ipfw/tables.c b/sbin/ipfw/tables.c
index 7c3b1bb35a01..245c0c9e0399 100644
--- a/sbin/ipfw/tables.c
+++ b/sbin/ipfw/tables.c
@@ -1037,9 +1037,6 @@ table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add,
}
}
- /* Get real OS error */
- error = errno;
-
/* Report results back */
ptent = tent_buf;
for (i = 0; i < count; ptent++, i++) {
diff --git a/sbin/mdconfig/tests/mdconfig_test.sh b/sbin/mdconfig/tests/mdconfig_test.sh
index ea87ff5d542d..cc29c188cbd8 100755
--- a/sbin/mdconfig/tests/mdconfig_test.sh
+++ b/sbin/mdconfig/tests/mdconfig_test.sh
@@ -274,22 +274,23 @@ attach_size_rounddown()
attach_size_rounddown_body()
{
local md
- local ss=8192
- local ms=$(($ss + 4096))
- local ms2=$((2 * $ss + 4096))
+ local pgsz=$(pagesize)
+ local ss=$(($pgsz * 2))
+ local ms=$(($ss + $pgsz))
+ local ms2=$((2 * $ss + $pgsz))
- # Use a sector size that's a likely multiple of PAGE_SIZE, as md(4)
+ # Use a sector size that's a multiple of the kernel page size, as md(4)
# expects that for swap MDs.
atf_check -s exit:0 -o save:mdconfig.out -e empty \
-x "mdconfig -a -t swap -S $ss -s ${ms}b"
md=$(cat mdconfig.out)
- # 12288 bytes should be rounded down to one sector.
- check_diskinfo "$md" 8192 1 $ss
+ # one sector plus one page should be rounded down to one sector.
+ check_diskinfo "$md" $ss 1 $ss
# Resize and verify that the new size was also rounded down.
atf_check -s exit:0 -o empty -e empty \
-x "mdconfig -r -u ${md#md} -s ${ms2}b"
- check_diskinfo "$md" 16384 2 $ss
+ check_diskinfo "$md" $((2 * $ss)) 2 $ss
}
attach_size_rounddown_cleanup()
{
diff --git a/sbin/ping/tests/Makefile b/sbin/ping/tests/Makefile
index 0520b1d634cf..7d3ab02b9a86 100644
--- a/sbin/ping/tests/Makefile
+++ b/sbin/ping/tests/Makefile
@@ -1,5 +1,6 @@
ATF_TESTS_C+= in_cksum_test
-SRCS.in_cksum_test= in_cksum_test.c ../utils.c
+.PATH: ${.CURDIR:H}
+SRCS.in_cksum_test= in_cksum_test.c utils.c
PACKAGE= tests
diff --git a/share/man/man4/bridge.4 b/share/man/man4/bridge.4
index 7048df4593bf..3af952256d3a 100644
--- a/share/man/man4/bridge.4
+++ b/share/man/man4/bridge.4
@@ -36,7 +36,7 @@
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd July 28, 2025
+.Dd October 13, 2025
.Dt IF_BRIDGE 4
.Os
.Sh NAME
@@ -272,53 +272,149 @@ by setting the
node using
.Xr sysctl 8 .
.Sh VLAN SUPPORT
-The
+Virtual LANs (VLANs), defined in the IEEE 802.1Q standard, allow traffic
+on a bridge to be segregated into separate logical networks which cannot
+communicate with each other.
+For example, two interfaces in VLAN 10 would be able to communicate
+with each other, but not with another interface in VLAN 20.
+.Pp
+Each VLAN is identified by a number between 1 and 4094 inclusive.
+By default, all traffic on the bridge is assigned to "VLAN 0",
+a pseudo-VLAN used for historical compatibility.
+When VLANs are in use on a bridge, it is recommended to explicitly
+assign all traffic to a VLAN rather than using VLAN 0.
+.Pp
+The bridge implements Independent VLAN Learning (IVL), meaning that
+host addresses are learned separately for each VLAN, and the same host
+address may exist on several different ports in different VLANs.
+.Pp
+If a
+.Xr vlan 4
+interface is configured on an interface which is also an
.Nm
-driver has full support for virtual LANs (VLANs).
-The bridge implements independent VLAN learning, i.e. MAC addresses are
-learned on a per-VLAN basis, and the same MAC address may be learned on
-multiple interfaces on different VLANs.
-Incoming frames with an 802.1Q tag will be assigned to the appropriate
-VLAN.
-.Pp
-Traffic sent to or from the host is not assigned to a VLAN by default.
-To allow the host to communicate on a VLAN, configure a
+member interface, all tagged frames will be processed by the
.Xr vlan 4
-interface on the bridge and (if necessary) assign IP addresses there.
-.Pp
-By default no access control is enabled, so any interface may
-participate in any VLAN.
-.Pp
-VLAN filtering may be enabled on a bridge using the
+interface and will not be visible to the bridge.
+This configuration is not recommended and may be unsupported in a
+future release.
+.Ss Tagged and untagged traffic
+Incoming frames on a member interface may be either tagged or untagged.
+Tagged frames contain an 802.1Q header indicating which VLAN the
+frame belongs to, while untagged frames do not.
+When a tagged frame is received, the frame is automatically assigned to
+the VLAN in the tag (subject to any configured VLAN access list),
+while untagged frames are assigned to the interface's configured
+Port VLAN ID (PVID), or to VLAN 0 if no PVID is configured.
+.Ss Assigning interfaces to VLANs
+An interface's PVID may be configured using the
.Xr ifconfig 8
-.Cm vlanfilter
-option.
-When VLAN filtering is enabled, an interface may only send and receive
-frames based on its configured VLAN access list.
+.Cm ifuntagged
+command:
+.Bd -literal -offset indent
+ifconfig bridge0 ifuntagged ix0 10
+.Ed
.Pp
-The interface's untagged VLAN ID may be configured using the
-.Xr ifconfig 8
+Or by using the
.Cm untagged
-option.
-If an untagged VLAN ID is configured, incoming frames will be assigned
-to that VLAN, and the interface may receive outgoing untagged frames
-in that VLAN.
-.Pp
-The tagged VLAN access list may be configured using the
-.Cm tagged ,
-.Cm +tagged
-and
-.Cm -tagged
-options to
-.Xr ifconfig 8 .
-An interface may send and receive tagged frames for any VLAN in its
-access list.
+option to
+.Cm addm :
+.Bd -literal -offset indent
+ifconfig bridge0 addm ix0 untagged 10
+.Ed
.Pp
-The bridge will automatically insert or remove 802.1q tags as needed,
-based on the interface configuration, when forwarding frames between
-interfaces.
-This tag processing is only done for interfaces with VLAN filtering
-enabled.
+This will assign all untagged traffic received on the interface to the
+specified VLAN, and any traffic transmitted on the interface in this
+VLAN will have its VLAN tag (if present) removed.
+Conversely, any traffic transmitted on the interface in a different
+VLAN will have a tag added, to allow the remote system to assign the
+traffic to the appropriate VLAN.
+.Ss Host communication in a VLAN
+Sometimes it is useful to allow the host itself to communicate in a VLAN,
+for example to provide routing to other hosts in the VLAN.
+To do this, create a
+.Xr vlan 4
+interface on top of the
+.Nm
+interface with the appropriate VLAN tag.
+For example, to allow the host to communicate in VLAN 10:
+.Bd -literal -offset indent
+ifconfig bridge0.10 create inet6 2001:db8::1/64
+.Ed
+.Ss Configuring the VLAN access list (VLAN filtering)
+For historical reasons, the default
+.Nm
+configuration allows all interfaces to send tagged traffic for any VLAN,
+meaning that VLANs do not provide security separation.
+To restrict which interfaces may communicate in which VLANs,
+enable VLAN filtering on the bridge:
+.Bd -literal -offset indent
+ifconfig bridge0 vlanfilter
+.Ed
+.Pp
+This has the following effects on bridge members:
+.Bl -bullet -offset indent
+.It
+No untagged frames will be accepted from a member interface unless
+the interface has a PVID configured.
+.It
+No tagged frames will be accepted from a member interface unless
+the VLAN identifier is present in the interface's VLAN access list.
+.It
+Frames with stacked tags (Q-in-Q) will not be accepted from a
+member interface unless the
+.Cm qinq
+option (see below) has been configured for that member.
+.El
+.Pp
+To configure the VLAN access list, use the
+.Xr ifconfig 8
+.Cm iftagged ,
+.Cm +iftagged
+or
+.Cm -iftagged
+commands.
+For example, to allow an interface to communicate in VLANs 10, 20,
+and any VLAN from 100 to 199:
+.Bd -literal -offset indent
+ifconfig bridge0 iftagged ix0 10,20,100-199
+.Ed
+.Ss IEEE 802.1ad (Q-in-Q) configuration
+IEEE 802.1ad, also called Q-in-Q or
+.Dq tag stacking ,
+allows a single Ethernet frame to contain multiple tags.
+This allows one Ethernet network to transport traffic between endpoints
+using its own VLAN tags without interfering with any pre-existing tags,
+and is often used in service provider networks to provide
+.Dq virtual wire
+Ethernet services.
+.Pp
+When VLAN filtering is enabled,
+.Nm
+does not permit member interfaces to send Q-in-Q frames, because in
+certain configurations this allows
+.Dq VLAN-hopping
+attacks on the bridge.
+For example, consider a bridge with port ix0 configured as a tagged
+port in VLAN 10, and port ix1 configured as untagged in VLAN 10 and
+tagged in VLAN 20.
+If ix0 is allowed to send Q-in-Q frames, then it can send a frame with
+two tags: one for VLAN 10, followed by one for VLAN 20.
+When the bridge forwards the frame to ix1, it will strip the VLAN tag
+for VLAN 10, then forward the frame to ix1 with the tag for VLAN 20
+intact, effectively allowing ix0 to send traffic on VLAN 20 even
+though the bridge configuration should not permit that.
+.Pp
+To permit an interface to send Q-in-Q frames, set the
+.Xr ifconfig 8
+.Cm qinq
+flag on the interface.
+This is only required on the interface which will send Q-in-Q frames,
+not the interface receiving the frames.
+.Pp
+Alternatively, set the
+.Cm defqinq
+flag on the bridge itself to enable Q-in-Q for all newly-added
+interfaces by default.
.Sh PACKET FILTERING
Packet filtering can be used with any firewall package that hooks in via the
.Xr pfil 9
@@ -537,6 +633,36 @@ ifconfig_wlan0="up ssid my_ap mode 11g"
ifconfig_fxp0="up"
.Ed
.Pp
+The following will cause a bridge to be created with two VLANs,
+10 and 20, where the
+.Dq Li em
+interfaces can only communicate in their assigned VLANs,
+while
+.Dq Li ix0
+is a trunk port which can communicate in either VLAN:
+.Bd -literal -offset indent
+cloned_interfaces="bridge0"
+ifconfig_bridge0="vlanfilter \e
+ addm em0 untagged 10 \e
+ addm em1 untagged 10 \e
+ addm em2 untagged 20 \e
+ addm em3 untagged 20 \e
+ addm ix0 tagged 10,20"
+ifconfig_em0="up"
+ifconfig_em1="up"
+ifconfig_em2="up"
+ifconfig_em3="up"
+ifconfig_ix0="up"
+.Ed
+.Pp
+The previous example could be extended to allow the host to
+communicate in VLANs 10 and 20:
+.Bd -literal -offset indent
+vlans_bridge0="10 20"
+ifconfig_bridge0_10_ipv6="inet6 2001:db8:0:10::1/64"
+ifconfig_bridge0_20_ipv6="inet6 2001:db8:0:20::1/64"
+.Ed
+.Pp
Consider a system with two 4-port Ethernet boards.
The following will cause a bridge consisting of all 8 ports with
Rapid Spanning Tree enabled to be created:
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c
index 6cc2d58bbbcc..933f1ac0051f 100644
--- a/sys/amd64/amd64/elf_machdep.c
+++ b/sys/amd64/amd64/elf_machdep.c
@@ -179,7 +179,7 @@ freebsd_brand_info_la57_img_compat(const struct image_params *imgp,
return (!prefer_uva_la48);
}
-static Elf64_Brandinfo freebsd_brand_info_la48 = {
+static const Elf64_Brandinfo freebsd_brand_info_la48 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -190,7 +190,7 @@ static Elf64_Brandinfo freebsd_brand_info_la48 = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
};
-static Elf64_Brandinfo freebsd_brand_info_la57 = {
+static const Elf64_Brandinfo freebsd_brand_info_la57 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -216,7 +216,7 @@ sysinit_register_elf64_brand_entries(void *arg __unused)
SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
sysinit_register_elf64_brand_entries, NULL);
-static Elf64_Brandinfo freebsd_brand_oinfo = {
+static const Elf64_Brandinfo freebsd_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -226,11 +226,10 @@ static Elf64_Brandinfo freebsd_brand_oinfo = {
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-
-SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo);
-static Elf64_Brandinfo kfreebsd_brand_info = {
+static const Elf64_Brandinfo kfreebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
@@ -240,8 +239,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = {
.brand_note = &elf64_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
-
-SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+C_SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info);
void
diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c
index c8579c5da4ad..890cf01c46a0 100644
--- a/sys/amd64/linux/linux_sysvec.c
+++ b/sys/amd64/linux/linux_sysvec.c
@@ -857,7 +857,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset)
}
}
-static Elf_Brandnote linux64_brandnote = {
+static const Elf_Brandnote linux64_brandnote = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16,
.hdr.n_type = 1,
@@ -866,7 +866,7 @@ static Elf_Brandnote linux64_brandnote = {
.trans_osrel = linux_trans_osrel
};
-static Elf64_Brandinfo linux_glibc2brand = {
+static const Elf64_Brandinfo linux_glibc2brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -877,7 +877,7 @@ static Elf64_Brandinfo linux_glibc2brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf64_Brandinfo linux_glibc2brandshort = {
+static const Elf64_Brandinfo linux_glibc2brandshort = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -888,7 +888,7 @@ static Elf64_Brandinfo linux_glibc2brandshort = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf64_Brandinfo linux_muslbrand = {
+static const Elf64_Brandinfo linux_muslbrand = {
.brand = ELFOSABI_LINUX,
.machine = EM_X86_64,
.compat_3_brand = "Linux",
@@ -900,7 +900,7 @@ static Elf64_Brandinfo linux_muslbrand = {
LINUX_BI_FUTEX_REQUEUE
};
-static Elf64_Brandinfo *linux_brandlist[] = {
+static const Elf64_Brandinfo *linux_brandlist[] = {
&linux_glibc2brand,
&linux_glibc2brandshort,
&linux_muslbrand,
@@ -910,7 +910,7 @@ static Elf64_Brandinfo *linux_brandlist[] = {
static int
linux64_elf_modevent(module_t mod, int type, void *data)
{
- Elf64_Brandinfo **brandinfo;
+ const Elf64_Brandinfo **brandinfo;
int error;
struct linux_ioctl_handler **lihp;
diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c
index 8fac626f9053..735ebb151017 100644
--- a/sys/amd64/linux32/linux32_sysvec.c
+++ b/sys/amd64/linux32/linux32_sysvec.c
@@ -954,7 +954,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset)
}
}
-static Elf_Brandnote linux32_brandnote = {
+static const Elf_Brandnote linux32_brandnote = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16, /* XXX at least 16 */
.hdr.n_type = 1,
@@ -963,7 +963,7 @@ static Elf_Brandnote linux32_brandnote = {
.trans_osrel = linux_trans_osrel
};
-static Elf32_Brandinfo linux_brand = {
+static const Elf32_Brandinfo linux_brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -974,7 +974,7 @@ static Elf32_Brandinfo linux_brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf32_Brandinfo linux_glibc2brand = {
+static const Elf32_Brandinfo linux_glibc2brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -985,7 +985,7 @@ static Elf32_Brandinfo linux_glibc2brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-static Elf32_Brandinfo linux_muslbrand = {
+static const Elf32_Brandinfo linux_muslbrand = {
.brand = ELFOSABI_LINUX,
.machine = EM_386,
.compat_3_brand = "Linux",
@@ -997,7 +997,7 @@ static Elf32_Brandinfo linux_muslbrand = {
LINUX_BI_FUTEX_REQUEUE
};
-static Elf32_Brandinfo *linux_brandlist[] = {
+static const Elf32_Brandinfo *linux_brandlist[] = {
&linux_brand,
&linux_glibc2brand,
&linux_muslbrand,
@@ -1007,7 +1007,7 @@ static Elf32_Brandinfo *linux_brandlist[] = {
static int
linux_elf_modevent(module_t mod, int type, void *data)
{
- Elf32_Brandinfo **brandinfo;
+ const Elf32_Brandinfo **brandinfo;
int error;
struct linux_ioctl_handler **lihp;
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 78883296c5b7..6a0ece1e4d98 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -1246,7 +1246,7 @@ pmap_bootstrap(vm_offset_t firstaddr)
}
static void
-pmap_init_reserved_pages(void)
+pmap_init_reserved_pages(void *dummy __unused)
{
struct pcpu *pc;
vm_offset_t pages;
diff --git a/sys/arm/arm/unwind.c b/sys/arm/arm/unwind.c
index 7ad91a3e01a5..0d77074fae34 100644
--- a/sys/arm/arm/unwind.c
+++ b/sys/arm/arm/unwind.c
@@ -278,7 +278,7 @@ unwind_module_unloaded(struct linker_file *lf)
* the unwind tables might be stripped, so instead we have to use the
* _exidx_start/end symbols created by ldscript.arm.
*/
-static int
+static void
module_info_init(void *arg __unused)
{
struct linker_file thekernel;
@@ -291,8 +291,6 @@ module_info_init(void *arg __unused)
thekernel.exidx_addr = CADDR(&_exidx_start);
thekernel.exidx_size = UADDR(&_exidx_end) - UADDR(&_exidx_start);
populate_module_info(create_module_info(), &thekernel);
-
- return (0);
}
SYSINIT(unwind_init, SI_SUB_KMEM, SI_ORDER_ANY, module_info_init, NULL);
diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c
index 13af5c5065d6..207b37180a26 100644
--- a/sys/arm64/arm64/elf_machdep.c
+++ b/sys/arm64/arm64/elf_machdep.c
@@ -121,7 +121,7 @@ static struct sysentvec elf64_freebsd_sysvec = {
};
INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
-static Elf64_Brandinfo freebsd_brand_info = {
+static const Elf64_Brandinfo freebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_AARCH64,
.compat_3_brand = "FreeBSD",
@@ -131,8 +131,7 @@ static Elf64_Brandinfo freebsd_brand_info = {
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-
-SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
+C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
(sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info);
static bool
@@ -336,7 +335,7 @@ elf_cpu_parse_dynamic(caddr_t loadbase __unused, Elf_Dyn *dynamic __unused)
return (0);
}
-static Elf_Note gnu_property_note = {
+static const Elf_Note gnu_property_note = {
.n_namesz = sizeof(GNU_ABI_VENDOR),
.n_descsz = 16,
.n_type = NT_GNU_PROPERTY_TYPE_0,
diff --git a/sys/arm64/coresight/coresight.c b/sys/arm64/coresight/coresight.c
index 5928c153f4ae..9b9d3c65ecc9 100644
--- a/sys/arm64/coresight/coresight.c
+++ b/sys/arm64/coresight/coresight.c
@@ -113,7 +113,7 @@ coresight_get_output_device(struct endpoint *endp, struct endpoint **out_endp)
}
static void
-coresight_init(void)
+coresight_init(void *dummy __unused)
{
mtx_init(&cs_mtx, "ARM Coresight", NULL, MTX_DEF);
diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c
index 084b7a11b01f..ac05820f89bc 100644
--- a/sys/arm64/linux/linux_sysvec.c
+++ b/sys/arm64/linux/linux_sysvec.c
@@ -584,7 +584,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset)
}
}
-static Elf_Brandnote linux64_brandnote = {
+static const Elf_Brandnote linux64_brandnote = {
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
.hdr.n_descsz = 16,
.hdr.n_type = 1,
@@ -593,7 +593,7 @@ static Elf_Brandnote linux64_brandnote = {
.trans_osrel = linux_trans_osrel
};
-static Elf64_Brandinfo linux_glibc2brand = {
+static const Elf64_Brandinfo linux_glibc2brand = {
.brand = ELFOSABI_LINUX,
.machine = EM_AARCH64,
.compat_3_brand = "Linux",
@@ -604,7 +604,7 @@ static Elf64_Brandinfo linux_glibc2brand = {
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
-Elf64_Brandinfo *linux_brandlist[] = {
+const Elf64_Brandinfo *linux_brandlist[] = {
&linux_glibc2brand,
NULL
};
@@ -612,8 +612,8 @@ Elf64_Brandinfo *linux_brandlist[] = {
static int
linux64_elf_modevent(module_t mod, int type, void *data)
{
- Elf64_Brandinfo **brandinfo;
- struct linux_ioctl_handler**lihp;
+ const Elf64_Brandinfo **brandinfo;
+ struct linux_ioctl_handler **lihp;
int error;
error = 0;
diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c
index b518f84454ad..9c097f52d136 100644
--- a/sys/cam/scsi/scsi_all.c
+++ b/sys/cam/scsi/scsi_all.c
@@ -112,7 +112,7 @@ static void fetchtableentries(int sense_key, int asc, int ascq,
const struct asc_table_entry **);
#ifdef _KERNEL
-static void init_scsi_delay(void);
+static void init_scsi_delay(void *);
static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS);
static int set_scsi_delay(int delay);
#endif
@@ -9379,7 +9379,7 @@ scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id)
}
static void
-init_scsi_delay(void)
+init_scsi_delay(void *dummy __unused)
{
int delay;
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris.c b/sys/cddl/compat/opensolaris/kern/opensolaris.c
index 10924977c20d..898b2ea49f96 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris.c
@@ -67,7 +67,7 @@ opensolaris_load(void *dummy)
SYSINIT(opensolaris_register, SI_SUB_OPENSOLARIS, SI_ORDER_FIRST, opensolaris_load, NULL);
static void
-opensolaris_unload(void)
+opensolaris_unload(void *dummy __unused)
{
mutex_destroy(&cpu_lock);
}
diff --git a/sys/compat/ia32/ia32_sysvec.c b/sys/compat/ia32/ia32_sysvec.c
index 0ea7d072e911..b9dada4eee7b 100644
--- a/sys/compat/ia32/ia32_sysvec.c
+++ b/sys/compat/ia32/ia32_sysvec.c
@@ -145,7 +145,7 @@ struct sysentvec ia32_freebsd_sysvec = {
};
INIT_SYSENTVEC(elf_ia32_sysvec, &ia32_freebsd_sysvec);
-static Elf32_Brandinfo ia32_brand_info = {
+static const Elf32_Brandinfo ia32_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
@@ -155,12 +155,10 @@ static Elf32_Brandinfo ia32_brand_info = {
.brand_note = &elf32_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
+C_SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE,
+ (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_info);
-SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE,
- (sysinit_cfunc_t) elf32_insert_brand_entry,
- &ia32_brand_info);
-
-static Elf32_Brandinfo ia32_brand_oinfo = {
+static const Elf32_Brandinfo ia32_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
@@ -170,12 +168,10 @@ static Elf32_Brandinfo ia32_brand_oinfo = {
.brand_note = &elf32_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
+C_SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_oinfo);
-SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf32_insert_brand_entry,
- &ia32_brand_oinfo);
-
-static Elf32_Brandinfo kia32_brand_info = {
+static const Elf32_Brandinfo kia32_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
@@ -184,10 +180,8 @@ static Elf32_Brandinfo kia32_brand_info = {
.brand_note = &elf32_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
-
-SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf32_insert_brand_entry,
- &kia32_brand_info);
+C_SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t)elf32_insert_brand_entry, &kia32_brand_info);
void
elf32_dump_thread(struct thread *td, void *dst, size_t *off)
diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c
index 37d0142bae8b..0586eb55a8f3 100644
--- a/sys/compat/linux/linux_futex.c
+++ b/sys/compat/linux/linux_futex.c
@@ -251,7 +251,7 @@ linux_futex(struct thread *td, struct linux_futex_args *args)
* set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
*/
p = td->td_proc;
- Elf_Brandinfo *bi = p->p_elf_brandinfo;
+ const Elf_Brandinfo *bi = p->p_elf_brandinfo;
if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
return (EINVAL);
args->val3_compare = false;
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ea9b2667607e..a25ee8f6e1af 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support.
options TCP_RFC7413 # TCP Fast Open
options TCPHPTS
+#options TCP_HPTS_KTEST # Add KTEST support for HPTS
# In order to enable IPSEC you MUST also add device crypto to
# your kernel configuration
diff --git a/sys/conf/options b/sys/conf/options
index b48ad1cf42cf..0b795a8d28fb 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS
+TCP_HPTS_KTEST opt_inet.h
TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_global.h
TCP_BBR opt_inet.h
diff --git a/sys/conf/std.debug b/sys/conf/std.debug
index f5ed5582c78d..0149779b3e5c 100644
--- a/sys/conf/std.debug
+++ b/sys/conf/std.debug
@@ -16,3 +16,4 @@ options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones
options VERBOSE_SYSINIT=0 # Support debug.verbose_sysinit, off by default
options ALT_BREAK_TO_DEBUGGER # Enter debugger on keyboard escape sequence
options KDTRACE_MIB_SDT # Add SDT probes to network counters
+options TCP_HPTS_KTEST # Add KTEST support for HPTS
diff --git a/sys/conf/std.nodebug b/sys/conf/std.nodebug
index 4035e28d2a62..79676a1d618f 100644
--- a/sys/conf/std.nodebug
+++ b/sys/conf/std.nodebug
@@ -16,6 +16,7 @@ nooptions KCOV
nooptions MALLOC_DEBUG_MAXZONES
nooptions QUEUE_MACRO_DEBUG_TRASH
nooptions KDTRACE_MIB_SDT
+nooptions TCP_HPTS_KTEST
# Net80211 debugging
nooptions IEEE80211_DEBUG
diff --git a/sys/dev/fdt/fdt_slicer.c b/sys/dev/fdt/fdt_slicer.c
index 3ba4eddf8b61..50112db5cfae 100644
--- a/sys/dev/fdt/fdt_slicer.c
+++ b/sys/dev/fdt/fdt_slicer.c
@@ -45,7 +45,7 @@
static int fill_slices(device_t dev, const char *provider,
struct flash_slice *slices, int *slices_num);
-static void fdt_slicer_init(void);
+static void fdt_slicer_init(void *);
static int
fill_slices_from_node(phandle_t node, struct flash_slice *slices, int *count)
@@ -138,7 +138,7 @@ fill_slices(device_t dev, const char *provider __unused,
}
static void
-fdt_slicer_init(void)
+fdt_slicer_init(void *dummy __unused)
{
flash_register_slicer(fill_slices, FLASH_SLICES_TYPE_NAND, false);
@@ -147,7 +147,7 @@ fdt_slicer_init(void)
}
static void
-fdt_slicer_cleanup(void)
+fdt_slicer_cleanup(void *dummy __unused)
{
flash_register_slicer(NULL, FLASH_SLICES_TYPE_NAND, true);
diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c
index ffa8dc096adc..80e37341b3dc 100644
--- a/sys/dev/iommu/iommu_gas.c
+++ b/sys/dev/iommu/iommu_gas.c
@@ -77,7 +77,7 @@ static int iommu_check_free;
#endif
static void
-intel_gas_init(void)
+intel_gas_init(void *dummy __unused)
{
iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY",
diff --git a/sys/dev/mii/mv88e151x.c b/sys/dev/mii/mv88e151x.c
index 618ad81471c9..fb03b2a7a917 100644
--- a/sys/dev/mii/mv88e151x.c
+++ b/sys/dev/mii/mv88e151x.c
@@ -97,7 +97,7 @@ mv88e151x_attach(device_t dev)
{
const struct mii_attach_args *ma;
struct mii_softc *sc;
- uint32_t cop_cap, cop_extcap;
+ uint32_t cop_cap = 0, cop_extcap = 0;
sc = device_get_softc(dev);
ma = device_get_ivars(dev);
@@ -224,10 +224,12 @@ mv88e151x_fiber_status(struct mii_softc *phy)
else if (reg & MV88E151X_STATUS_LINK &&
reg & MV88E151X_STATUS_SYNC &&
(reg & MV88E151X_STATUS_ENERGY) == 0) {
- if ((reg & MV88E151X_STATUS_SPEED_MASK) ==
+ if (((reg & MV88E151X_STATUS_SPEED_MASK) >>
+ MV88E151X_STATUS_SPEED_SHIFT) ==
MV88E151X_STATUS_SPEED_1000)
mii->mii_media_active |= IFM_1000_SX;
- else if ((reg & MV88E151X_STATUS_SPEED_MASK) ==
+ else if (((reg & MV88E151X_STATUS_SPEED_MASK) >>
+ MV88E151X_STATUS_SPEED_SHIFT) ==
MV88E151X_STATUS_SPEED_100)
mii->mii_media_active |= IFM_100_FX;
else
diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c
index ead91f0d01fe..d119f9877aaa 100644
--- a/sys/dev/nvme/nvme.c
+++ b/sys/dev/nvme/nvme.c
@@ -51,7 +51,7 @@ int32_t nvme_retry_count;
MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations");
static void
-nvme_init(void)
+nvme_init(void *dummy __unused)
{
uint32_t i;
@@ -62,7 +62,7 @@ nvme_init(void)
SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL);
static void
-nvme_uninit(void)
+nvme_uninit(void *dummy __unused)
{
}
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index 557b4df4c328..f4ea08f129c0 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -2153,8 +2153,6 @@ static inline
void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused)
{
#if _BYTE_ORDER != _LITTLE_ENDIAN
- int i;
-
s->nsze = le64toh(s->nsze);
s->ncap = le64toh(s->ncap);
s->nuse = le64toh(s->nuse);
@@ -2173,7 +2171,7 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused)
s->anagrpid = le32toh(s->anagrpid);
s->nvmsetid = le16toh(s->nvmsetid);
s->endgid = le16toh(s->endgid);
- for (i = 0; i < nitems(s->lbaf); i++)
+ for (unsigned i = 0; i < nitems(s->lbaf); i++)
s->lbaf[i] = le32toh(s->lbaf[i]);
#endif
}
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index a06774a64761..7693aa6d54d3 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -391,7 +391,7 @@ nvme_sim_controller_fail(void *ctrlr_arg)
struct nvme_consumer *consumer_cookie;
static void
-nvme_sim_init(void)
+nvme_sim_init(void *dummy __unused)
{
if (nvme_use_nvd)
return;
@@ -404,7 +404,7 @@ SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY,
nvme_sim_init, NULL);
static void
-nvme_sim_uninit(void)
+nvme_sim_uninit(void *dummy __unused)
{
if (nvme_use_nvd)
return;
diff --git a/sys/dev/xdma/xdma.c b/sys/dev/xdma/xdma.c
index 62b781159d03..cdd9ad0b8f39 100644
--- a/sys/dev/xdma/xdma.c
+++ b/sys/dev/xdma/xdma.c
@@ -555,7 +555,7 @@ xdma_put(xdma_controller_t *xdma)
}
static void
-xdma_init(void)
+xdma_init(void *dummy __unused)
{
mtx_init(&xdma_mtx, "xDMA", NULL, MTX_DEF);
diff --git a/sys/dev/xen/bus/xen_intr.c b/sys/dev/xen/bus/xen_intr.c
index cb30b6efa484..2b5fa8fb7cd1 100644
--- a/sys/dev/xen/bus/xen_intr.c
+++ b/sys/dev/xen/bus/xen_intr.c
@@ -460,7 +460,7 @@ xen_intr_handle_upcall(void *unused __unused)
return (FILTER_HANDLED);
}
-static int
+static void
xen_intr_init(void *dummy __unused)
{
shared_info_t *s = HYPERVISOR_shared_info;
@@ -468,7 +468,7 @@ xen_intr_init(void *dummy __unused)
int i;
if (!xen_domain())
- return (0);
+ return;
_Static_assert(is_valid_evtchn(0),
"is_valid_evtchn(0) fails (unused by Xen, but valid by interface");
@@ -502,8 +502,6 @@ xen_intr_init(void *dummy __unused)
if (bootverbose)
printf("Xen interrupt system initialized\n");
-
- return (0);
}
SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index 5c28db29fc63..683ee2f7ad56 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -284,7 +284,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
struct mount *mp = vnode_mount(vp);
int err;
- if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH))
+ if (fsess_not_impl(mp, FUSE_FLUSH))
return 0;
err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
@@ -292,7 +292,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
return err;
if (fufh->fuse_open_flags & FOPEN_NOFLUSH &&
- (!fsess_opt_writeback(vnode_mount(vp))))
+ (!fsess_opt_writeback(mp)))
return (0);
fdisp_init(&fdi, sizeof(*ffi));
diff --git a/sys/fs/p9fs/p9_transport.c b/sys/fs/p9fs/p9_transport.c
index c82d81fedcd7..25eee984265c 100644
--- a/sys/fs/p9fs/p9_transport.c
+++ b/sys/fs/p9fs/p9_transport.c
@@ -34,9 +34,8 @@
TAILQ_HEAD(, p9_trans_module) transports;
static void
-p9_transport_init(void)
+p9_transport_init(void *dummy __unused)
{
-
TAILQ_INIT(&transports);
}
diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
index a14f9ca74305..b6d6db60ca3d 100644
--- a/sys/fs/unionfs/union_subr.c
+++ b/sys/fs/unionfs/union_subr.c
@@ -587,6 +587,7 @@ unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
struct unionfs_node_status *unsp;
pid_t pid;
+ MPASS(td != NULL);
pid = td->td_proc->p_pid;
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
@@ -612,6 +613,7 @@ unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
struct unionfs_node_status *unsp;
pid_t pid;
+ MPASS(td != NULL);
pid = td->td_proc->p_pid;
KASSERT(NULL != unspp, ("%s: NULL status", __func__));
diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c
index 627b2f6e9a1d..26fa14603c85 100644
--- a/sys/fs/unionfs/union_vnops.c
+++ b/sys/fs/unionfs/union_vnops.c
@@ -814,7 +814,7 @@ unionfs_close(struct vop_close_args *ap)
unp = VTOUNIONFS(vp);
lvp = unp->un_lowervp;
uvp = unp->un_uppervp;
- unsp = unionfs_find_node_status(unp, td);
+ unsp = (td != NULL) ? unionfs_find_node_status(unp, td) : NULL;
if (unsp == NULL ||
(unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) {
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index 6aac0e968362..3f659432552c 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -1605,7 +1605,7 @@ init386(int first)
}
static void
-machdep_init_trampoline(void)
+machdep_init_trampoline(void *dummy __unused)
{
struct region_descriptor r_gdt, r_idt;
struct i386tss *tss;
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index b44f5e08bbcf..1cf0867d57c3 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -720,7 +720,7 @@ __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr)
}
static void
-pmap_init_reserved_pages(void)
+pmap_init_reserved_pages(void *dummy __unused)
{
struct pcpu *pc;
vm_offset_t pages;
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 1bc2491a1a12..c53707a1286c 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -92,7 +92,7 @@
#define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE)
static int __elfN(check_header)(const Elf_Ehdr *hdr);
-static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
+static const Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
const char *interp, int32_t *osrel, uint32_t *fctl0);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry);
@@ -104,7 +104,7 @@ static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note,
int32_t *osrel);
static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static bool __elfN(check_note)(struct image_params *imgp,
- Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0,
+ const Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0,
uint32_t *fctl0);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);
@@ -227,7 +227,7 @@ SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx,
CTLFLAG_RWTUN, &__elfN(allow_wx), 0,
"Allow pages to be mapped simultaneously writable and executable");
-static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
+static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
#define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a))
@@ -286,7 +286,7 @@ kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
}
int
-__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
+__elfN(insert_brand_entry)(const Elf_Brandinfo *entry)
{
int i;
@@ -305,7 +305,7 @@ __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
}
int
-__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
+__elfN(remove_brand_entry)(const Elf_Brandinfo *entry)
{
int i;
@@ -321,7 +321,7 @@ __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
}
bool
-__elfN(brand_inuse)(Elf_Brandinfo *entry)
+__elfN(brand_inuse)(const Elf_Brandinfo *entry)
{
struct proc *p;
bool rval = false;
@@ -338,12 +338,12 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry)
return (rval);
}
-static Elf_Brandinfo *
+static const Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
int32_t *osrel, uint32_t *fctl0)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
- Elf_Brandinfo *bi, *bi_m;
+ const Elf_Brandinfo *bi, *bi_m;
bool ret, has_fctl0;
int i, interp_name_len;
@@ -492,7 +492,7 @@ __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr)
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
- Elf_Brandinfo *bi;
+ const Elf_Brandinfo *bi;
int i;
if (!IS_ELF(*hdr) ||
@@ -1109,7 +1109,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
struct vmspace *vmspace;
vm_map_t map;
char *interp;
- Elf_Brandinfo *brand_info;
+ const Elf_Brandinfo *brand_info;
struct sysentvec *sv;
u_long addr, baddr, entry, proghdr;
u_long maxalign, maxsalign, mapsz, maxv, maxv1, anon_loc;
@@ -1925,7 +1925,7 @@ __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
Elf_Phdr *phdr;
Elf_Shdr *shdr;
struct phdr_closure phc;
- Elf_Brandinfo *bi;
+ const Elf_Brandinfo *bi;
ehdr = (Elf_Ehdr *)hdr;
bi = td->td_proc->p_elf_brandinfo;
@@ -2861,7 +2861,7 @@ ret:
}
struct brandnote_cb_arg {
- Elf_Brandnote *brandnote;
+ const Elf_Brandnote *brandnote;
int32_t *osrel;
};
@@ -2883,7 +2883,7 @@ brandnote_cb(const Elf_Note *note, void *arg0, bool *res)
return (true);
}
-static Elf_Note fctl_note = {
+static const Elf_Note fctl_note = {
.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
.n_descsz = sizeof(uint32_t),
.n_type = NT_FREEBSD_FEATURE_CTL,
@@ -2918,7 +2918,7 @@ note_fctl_cb(const Elf_Note *note, void *arg0, bool *res)
* as for headers.
*/
static bool
-__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
+__elfN(check_note)(struct image_params *imgp, const Elf_Brandnote *brandnote,
int32_t *osrel, bool *has_fctl0, uint32_t *fctl0)
{
const Elf_Phdr *phdr;
diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c
index 1fa87955a299..c83255bc74ee 100644
--- a/sys/kern/kern_boottrace.c
+++ b/sys/kern/kern_boottrace.c
@@ -579,7 +579,7 @@ sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
}
static void
-boottrace_init(void)
+boottrace_init(void *dummy __unused)
{
if (!boottrace_enabled)
diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c
index 7a2818c29b1a..a1696225df32 100644
--- a/sys/kern/kern_devctl.c
+++ b/sys/kern/kern_devctl.c
@@ -140,7 +140,7 @@ static struct devctlbridge {
} devctl_notify_hook = { .send_f = NULL };
static void
-devctl_init(void)
+devctl_init(void *dummy __unused)
{
int reserve;
uma_zone_t z;
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 23d8dc9cf54a..a6333d8011b1 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -156,7 +156,7 @@ static void knote_drop(struct knote *kn, struct thread *td);
static void knote_drop_detached(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
-static void knote_init(void);
+static void knote_init(void *);
static struct knote *knote_alloc(int mflag);
static void knote_free(struct knote *kn);
@@ -2887,7 +2887,7 @@ knote_dequeue(struct knote *kn)
}
static void
-knote_init(void)
+knote_init(void *dummy __unused)
{
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 0fc2d0e7f1bc..2bdd6faa025a 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -418,7 +418,7 @@ do_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
#endif
int error, i, orig_osrel;
uint32_t orig_fctl0;
- Elf_Brandinfo *orig_brandinfo;
+ const Elf_Brandinfo *orig_brandinfo;
size_t freepath_size;
static const char fexecv_proc_title[] = "(fexecv)";
@@ -1314,7 +1314,7 @@ exec_map_stack(struct image_params *imgp)
MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
} else {
sharedpage_addr = sv->sv_shared_page_base;
- vm_map_fixed(map, obj, 0,
+ error = vm_map_fixed(map, obj, 0,
sharedpage_addr, sv->sv_shared_page_len,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_READ | VM_PROT_EXECUTE,
diff --git a/sys/kern/kern_jailmeta.c b/sys/kern/kern_jailmeta.c
index 4e37eccad03a..91bb7155820d 100644
--- a/sys/kern/kern_jailmeta.c
+++ b/sys/kern/kern_jailmeta.c
@@ -599,22 +599,18 @@ SYSCTL_PROC(_security_jail, OID_AUTO, env,
/* Setup and tear down. */
-static int
+static void
jm_sysinit(void *arg __unused)
{
meta.osd_slot = osd_jail_register(jm_osd_destructor, meta.methods);
env.osd_slot = osd_jail_register(jm_osd_destructor, env.methods);
-
- return (0);
}
-static int
+static void
jm_sysuninit(void *arg __unused)
{
osd_jail_deregister(meta.osd_slot);
osd_jail_deregister(env.osd_slot);
-
- return (0);
}
SYSINIT(jailmeta, SI_SUB_DRIVERS, SI_ORDER_ANY, jm_sysinit, NULL);
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
index d566bc01bc5e..e2f63cbc0c5a 100644
--- a/sys/kern/kern_linker.c
+++ b/sys/kern/kern_linker.c
@@ -435,7 +435,7 @@ linker_file_register_modules(linker_file_t lf)
}
static void
-linker_init_kernel_modules(void)
+linker_init_kernel_modules(void *dummy __unused)
{
sx_xlock(&kld_sx);
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 653ce1ee556b..e919b15543b2 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -303,7 +303,7 @@ sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS)
*/
#if MALLOC_DEBUG_MAXZONES > 1
static void
-tunable_set_numzones(void)
+tunable_set_numzones(void *dummy __unused)
{
TUNABLE_INT_FETCH("debug.malloc.numzones",
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7351e9cb6313..2aab151aba08 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -1312,7 +1312,7 @@ static struct kproc_desc racctd_kp = {
};
static void
-racctd_init(void)
+racctd_init(void *dummy __unused)
{
if (!racct_enable)
return;
@@ -1322,7 +1322,7 @@ racctd_init(void)
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
static void
-racct_init(void)
+racct_init(void *dummy __unused)
{
if (!racct_enable)
return;
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
index 3854ffbeec29..cd66bff62608 100644
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -300,7 +300,7 @@ static void rangelock_free_free(struct rl_q_entry *free);
static void rangelock_noncheating_destroy(struct rangelock *lock);
static void
-rangelock_sys_init(void)
+rangelock_sys_init(void *dummy __unused)
{
rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 4232c71f86fb..682ba86d23ff 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -209,7 +209,7 @@ static struct dict actionnames[] = {
{ "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
-static void rctl_init(void);
+static void rctl_init(void *);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
static uma_zone_t rctl_rule_zone;
@@ -2175,7 +2175,7 @@ rctl_racct_release(struct racct *racct)
}
static void
-rctl_init(void)
+rctl_init(void *dummy __unused)
{
if (!racct_enable)
diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c
index 5b8398caaca9..f48d0e3d616b 100644
--- a/sys/kern/kern_sharedpage.c
+++ b/sys/kern/kern_sharedpage.c
@@ -130,8 +130,7 @@ shared_page_init(void *dummy __unused)
shared_page_mapping = (char *)addr;
}
-SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
- NULL);
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, shared_page_init, NULL);
/*
* Push the timehands update to the shared page.
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 8efc0886988b..21f765b17f62 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -113,7 +113,7 @@ static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock);
-static void sigqueue_start(void);
+static void sigqueue_start(void *);
static void sigfastblock_setpend(struct thread *td, bool resched);
static void sig_handle_first_stop(struct thread *td, struct proc *p,
int sig);
@@ -344,7 +344,7 @@ ast_sigsuspend(struct thread *td, int tda __unused)
}
static void
-sigqueue_start(void)
+sigqueue_start(void *dummy __unused)
{
ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 2a6f0989f6aa..5b7485c25cd7 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -90,7 +90,7 @@ static int user_clock_nanosleep(struct thread *td, clockid_t clock_id,
int flags, const struct timespec *ua_rqtp,
struct timespec *ua_rmtp);
-static void itimer_start(void);
+static void itimer_start(void *);
static int itimer_init(void *, int, int);
static void itimer_fini(void *, int);
static void itimer_enter(struct itimer *);
@@ -1170,7 +1170,7 @@ eventratecheck(struct timeval *lasttime, int *cureps, int maxeps)
}
static void
-itimer_start(void)
+itimer_start(void *dummy __unused)
{
static const struct kclock rt_clock = {
.timer_create = realtimer_create,
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
index 5c14e15830f4..c9a387a5e87b 100644
--- a/sys/kern/subr_pcpu.c
+++ b/sys/kern/subr_pcpu.c
@@ -140,7 +140,7 @@ uma_zone_t pcpu_zone_32;
uma_zone_t pcpu_zone_64;
static void
-pcpu_zones_startup(void)
+pcpu_zones_startup(void *dummy __unused)
{
pcpu_zone_4 = uma_zcreate("pcpu-4", 4,
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index c221106ae067..bc0725230cca 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -586,7 +586,7 @@ soaio_enqueue(struct task *task)
}
static void
-soaio_init(void)
+soaio_init(void *dummy __unused)
{
soaio_lifetime = AIOD_LIFETIME_DEFAULT;
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 340d84666459..c5fc1e84ce3f 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1069,6 +1069,21 @@ uipc_stream_sbspace(struct sockbuf *sb)
return (min(space, mbspace));
}
+/*
+ * UNIX version of generic sbwait() for writes. We wait on the peer's receive
+ * buffer, using our own send timeout.
+ */
+static int
+uipc_stream_sbwait(struct socket *so, sbintime_t timeo)
+{
+ struct sockbuf *sb = &so->so_rcv;
+
+ SOCK_RECVBUF_LOCK_ASSERT(so);
+ sb->sb_flags |= SB_WAIT;
+ return (msleep_sbt(&sb->sb_acc, SOCK_RECVBUF_MTX(so), PSOCK | PCATCH,
+ "sbwait", timeo, 0, 0));
+}
+
static int
uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr,
struct uio *uio0, struct mbuf *m, struct mbuf *c, int flags,
@@ -1203,7 +1218,8 @@ restart:
error = EWOULDBLOCK;
goto out4;
}
- if ((error = sbwait(so2, SO_RCV)) != 0) {
+ if ((error = uipc_stream_sbwait(so2,
+ so->so_snd.sb_timeo)) != 0) {
SOCK_RECVBUF_UNLOCK(so2);
goto out4;
} else
@@ -2397,7 +2413,7 @@ uipc_sendfile_wait(struct socket *so, off_t need, int *space)
}
if (!sockref)
soref(so2);
- error = sbwait(so2, SO_RCV);
+ error = uipc_stream_sbwait(so2, so->so_snd.sb_timeo);
if (error == 0 &&
__predict_false(sb->sb_state & SBS_CANTRCVMORE))
error = EPIPE;
diff --git a/sys/libkern/arc4random.c b/sys/libkern/arc4random.c
index 016822e9f03c..6fca7c3c4e9d 100644
--- a/sys/libkern/arc4random.c
+++ b/sys/libkern/arc4random.c
@@ -156,7 +156,7 @@ chacha20_randomstir(struct chacha20_s *chacha20)
* Initialize the contexts.
*/
static void
-chacha20_init(void)
+chacha20_init(void *dummy __unused)
{
struct chacha20_s *chacha20;
@@ -176,7 +176,7 @@ SYSINIT(chacha20, SI_SUB_LOCK, SI_ORDER_ANY, chacha20_init, NULL);
static void
-chacha20_uninit(void)
+chacha20_uninit(void *dummy __unused)
{
struct chacha20_s *chacha20;
diff --git a/sys/libkern/x86/crc32_sse42.c b/sys/libkern/x86/crc32_sse42.c
index b79c7afbeeb1..94ffdc178910 100644
--- a/sys/libkern/x86/crc32_sse42.c
+++ b/sys/libkern/x86/crc32_sse42.c
@@ -199,8 +199,10 @@ crc32c_shift(uint32_t zeros[][256], uint32_t crc)
static void
#ifndef _KERNEL
__attribute__((__constructor__))
-#endif
crc32c_init_hw(void)
+#else
+crc32c_init_hw(void *dummy __unused)
+#endif
{
crc32c_zeros(crc32c_long, LONG);
crc32c_zeros(crc32c_2long, 2 * LONG);
diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile
index a3052efa9ed9..d5f15576f38b 100644
--- a/sys/modules/ktest/Makefile
+++ b/sys/modules/ktest/Makefile
@@ -1,5 +1,6 @@
SUBDIR= ktest \
ktest_example \
- ktest_netlink_message_writer
+ ktest_netlink_message_writer \
+ ktest_tcphpts
.include <bsd.subdir.mk>
diff --git a/sys/modules/ktest/ktest_tcphpts/Makefile b/sys/modules/ktest/ktest_tcphpts/Makefile
new file mode 100644
index 000000000000..b642c0cb4209
--- /dev/null
+++ b/sys/modules/ktest/ktest_tcphpts/Makefile
@@ -0,0 +1,13 @@
+PACKAGE= tests
+WARNS?= 6
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/netinet
+
+KMOD= ktest_tcphpts
+SRCS= tcp_hpts_test.c
+
+.include <bsd.kmod.mk>
+
diff --git a/sys/net/route.c b/sys/net/route.c
index 7a50bcc43e06..d2c9f3e39c17 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -89,7 +89,7 @@ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
* SI_ORDER_MIDDLE.
*/
static void
-route_init(void)
+route_init(void *dummy __unused)
{
nhops_init();
diff --git a/sys/net/route/route_tables.c b/sys/net/route/route_tables.c
index 176ca43fa1c5..3b7bb1385d0e 100644
--- a/sys/net/route/route_tables.c
+++ b/sys/net/route/route_tables.c
@@ -186,7 +186,7 @@ rtables_prison_destructor(void *data)
}
static void
-rtables_init(void)
+rtables_init(void *dummy __unused)
{
osd_method_t methods[PR_MAXMETHOD] = {
[PR_METHOD_ATTACH] = rtables_check_proc_fib,
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index f0dcc973ca7c..be858428bb3e 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -309,7 +309,7 @@ rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
}
static void
-rtsock_init(void)
+rtsock_init(void *dummy __unused)
{
rtsbridge_orig_p = rtsock_callback_p;
rtsock_callback_p = &rtsbridge;
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 3af56a228295..a8a767785fce 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -167,7 +167,7 @@ static ieee80211_send_action_func ht_send_action_ba_delba;
static ieee80211_send_action_func ht_send_action_ht_txchwidth;
static void
-ieee80211_ht_init(void)
+ieee80211_ht_init(void *dummy __unused)
{
/*
* Setup HT parameters that depends on the clock frequency.
diff --git a/sys/net80211/ieee80211_hwmp.c b/sys/net80211/ieee80211_hwmp.c
index b69210768c54..084e67da13db 100644
--- a/sys/net80211/ieee80211_hwmp.c
+++ b/sys/net80211/ieee80211_hwmp.c
@@ -212,7 +212,7 @@ SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact,
"mesh route inactivity timeout (ms)");
static void
-ieee80211_hwmp_init(void)
+ieee80211_hwmp_init(void *dummy __unused)
{
/* Default values as per amendment */
ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000);
diff --git a/sys/net80211/ieee80211_mesh.c b/sys/net80211/ieee80211_mesh.c
index 3f0410a69e3c..7f2e8bdcb963 100644
--- a/sys/net80211/ieee80211_mesh.c
+++ b/sys/net80211/ieee80211_mesh.c
@@ -548,7 +548,7 @@ mesh_gatemode_cb(void *arg)
}
static void
-ieee80211_mesh_init(void)
+ieee80211_mesh_init(void *dummy __unused)
{
memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths));
diff --git a/sys/net80211/ieee80211_phy.c b/sys/net80211/ieee80211_phy.c
index 7f53c717152b..b4d9b16907d2 100644
--- a/sys/net80211/ieee80211_phy.c
+++ b/sys/net80211/ieee80211_phy.c
@@ -348,7 +348,7 @@ ieee80211_setup_ratetable(struct ieee80211_rate_table *rt)
/* Setup all rate tables */
static void
-ieee80211_phy_init(void)
+ieee80211_phy_init(void *dummy __unused)
{
static struct ieee80211_rate_table * const ratetables[] = {
&ieee80211_half_table,
diff --git a/sys/net80211/ieee80211_proto.c b/sys/net80211/ieee80211_proto.c
index 0c161d98a55a..4918bf7d025f 100644
--- a/sys/net80211/ieee80211_proto.c
+++ b/sys/net80211/ieee80211_proto.c
@@ -459,7 +459,7 @@ static const struct ieee80211_authenticator auth_internal = {
* Setup internal authenticators once; they are never unregistered.
*/
static void
-ieee80211_auth_setup(void)
+ieee80211_auth_setup(void *dummy __unused)
{
ieee80211_authenticator_register(IEEE80211_AUTH_OPEN, &auth_internal);
ieee80211_authenticator_register(IEEE80211_AUTH_SHARED, &auth_internal);
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index 10a5fc7f08ab..095c4108c768 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -102,7 +102,7 @@ vht_send_action_placeholder(struct ieee80211_node *ni,
}
static void
-ieee80211_vht_init(void)
+ieee80211_vht_init(void *dummy __unused)
{
ieee80211_recv_action_register(IEEE80211_ACTION_CAT_VHT,
diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c
index c20a20cd983d..bc06616dbf93 100644
--- a/sys/netinet/cc/cc.c
+++ b/sys/netinet/cc/cc.c
@@ -271,7 +271,7 @@ cc_check_default(struct cc_algo *remove_cc)
* Initialise CC subsystem on system boot.
*/
static void
-cc_init(void)
+cc_init(void *dummy __unused)
{
CC_LIST_LOCK_INIT();
STAILQ_INIT(&cc_list);
diff --git a/sys/netinet/in_fib_algo.c b/sys/netinet/in_fib_algo.c
index 123dacb409e7..95621c300064 100644
--- a/sys/netinet/in_fib_algo.c
+++ b/sys/netinet/in_fib_algo.c
@@ -767,7 +767,7 @@ struct fib_lookup_module flm_radix4 = {
};
static void
-fib4_algo_init(void)
+fib4_algo_init(void *dummy __unused)
{
fib_module_register(&flm_bsearch4);
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 63bbe4bba11b..c54459bb5f01 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -39,15 +39,14 @@
* First, and probably the main thing its used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
- * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
- * slot is the time from now that the stack wants to be called but it
- * must be converted to tcp_hpts's notion of slot. This is done with
- * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The
+ * usecs argument is the delay from now until the stack wants to be called,
+ * given directly in microseconds. So a typical
* call from the tcp_output() routine might look like:
*
- * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ * tcp_hpts_insert(tp, 550, NULL);
*
- * The above would schedule tcp_output() to be called in 550 useconds.
+ * The above would schedule tcp_output() to be called in 550 microseconds.
* Note that if using this mechanism the stack will want to add near
* its top a check to prevent unwanted calls (from user land or the
* arrival of incoming ack's). So it would add something like:
@@ -149,27 +148,44 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#include <netinet/tcp_log_buf.h>
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remaineder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/*
+ * Default function table for production use. Every member is bound to the
+ * kernel routine of the same name. When TCP_HPTS_KTEST is defined, the
+ * macros that follow this table rewrite those names to pace->funcs-><name>,
+ * so calls made in this file are dispatched through a table like this one.
+ * The table has to be defined before those macros, otherwise its own
+ * initializers would be rewritten as well.
+ */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+ .microuptime = microuptime,
+ .swi_add = swi_add,
+ .swi_remove = swi_remove,
+ .swi_sched = swi_sched,
+ .intr_event_bind = intr_event_bind,
+ .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+ .callout_init = callout_init,
+ .callout_reset_sbt_on = callout_reset_sbt_on,
+ ._callout_stop_safe = _callout_stop_safe,
+};
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
/*
* When using the hpts, a TCP stack must make sure
@@ -204,87 +220,22 @@
*
* When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
* then we do a dynamic adjustment on the time we sleep.
- * Our threshold is if the lateness of the first client served (in ticks) is
+ * Our threshold is if the lateness of the first client served (in slots) is
* greater than or equal too slots_indicate_more_sleep (10ms
- * or 10000 ticks). If we were that late, the actual sleep time
- * is adjusted down by 50%. If the ticks_ran is less than
- * slots_indicate_more_sleep (100 ticks or 1000usecs).
+ * or 1000 slots). If we were that late, the actual sleep time
+ * is adjusted down by 50%. If the slots_ran is less than
+ * slots_indicate_more_sleep (100 slots or 1000usecs) it is doubled.
*
*/
-/* Each hpts has its own p_mtx which is used for locking */
-#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
-#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
-#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
- /* Cache line 0x00 */
- struct mtx p_mtx; /* Mutex for hpts */
- struct timeval p_mysleep; /* Our min sleep time */
- uint64_t syscall_cnt;
- uint64_t sleeping; /* What the actual sleep was (if sleeping) */
- uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
- uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningslot; /* Current tick we are at if we are running */
- uint32_t p_prev_slot; /* Previous slot we were on */
- uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
- uint32_t p_nxt_slot; /* The next slot outside the current range of
- * slots that the hpts is running on. */
- int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t p_lasttick; /* Last tick before the current one */
- uint8_t p_direct_wake :1, /* boolean */
- p_on_min_sleep:1, /* boolean */
- p_hpts_wake_scheduled:1, /* boolean */
- hit_callout_thresh:1,
- p_avail:4;
- uint8_t p_fill[3]; /* Fill to 32 bits */
- /* Cache line 0x40 */
- struct hptsh {
- TAILQ_HEAD(, tcpcb) head;
- uint32_t count;
- uint32_t gencnt;
- } *p_hptss; /* Hptsi wheel */
- uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
- * of 255ms */
- uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
- uint32_t saved_lasttick; /* for logging */
- uint32_t saved_curtick; /* for logging */
- uint32_t saved_curslot; /* for logging */
- uint32_t saved_prev_slot; /* for logging */
- uint32_t p_delayed_by; /* How much were we delayed by */
- /* Cache line 0x80 */
- struct sysctl_ctx_list hpts_ctx;
- struct sysctl_oid *hpts_root;
- struct intr_event *ie;
- void *ie_cookie;
- uint16_t p_num; /* The hpts number one per cpu */
- uint16_t p_cpu; /* The hpts CPU */
- /* There is extra space in here */
- /* Cache line 0x100 */
- struct callout co __aligned(CACHE_LINE_SIZE);
-} __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
- struct cpu_group **grps;
- struct tcp_hpts_entry **rp_ent; /* Array of hptss */
- uint32_t *cts_last_ran;
- uint32_t grp_cnt;
- uint32_t rp_num_hptss; /* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
#else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"TCP Hpts statistics");
-#define timersub(tvp, uvp, vvp) \
- do { \
- (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
- (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
- if ((vvp)->tv_usec < 0) { \
- (vvp)->tv_sec--; \
- (vvp)->tv_usec += 1000000; \
- } \
- } while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
- int count;
- int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
counter_u64_t hpts_hopelessly_behind;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
"When we are over the threshold on the pacer do we prohibit wakeups?");
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
{
uint16_t cpuid;
uint32_t ran;
ran = arc4random();
- cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+ cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
return (cpuid);
}
@@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
log.u_bbr.flex2 = hpts->p_cur_slot;
log.u_bbr.flex3 = hpts->p_prev_slot;
log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
log.u_bbr.flex7 = hpts->p_cpu;
log.u_bbr.flex8 = (uint8_t)from_callout;
log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usec(tv);
log.u_bbr.epoch = hpts->saved_curslot;
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
@@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
}
}
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ *
+ * arg is the struct tcp_hpts_entry that tcp_hpts_sleep() handed to
+ * callout_reset_sbt_on(). The local "pace" only exists under TCP_HPTS_KTEST,
+ * where swi_sched is a macro expanding to pace->funcs->swi_sched and so needs
+ * a variable of that name in scope.
+ */
static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the results
+ * of the callout_reset_sbt_on() function.
+ *
+ * The callout is armed on hpts->p_cpu with tcp_hpts_sleep_timeout() as the
+ * handler and hpts as its argument. Only tv->tv_usec is recorded in
+ * hpts->sleeping; a non-zero tv->tv_sec is converted into the timeout by
+ * tvtosbt() but is not reflected in that field. The local "pace" is set up
+ * under TCP_HPTS_KTEST because callout_reset_sbt_on is then a macro
+ * expanding to pace->funcs->callout_reset_sbt_on.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv->tv_usec;
+
+ sb = tvtosbt(*tv);
+ return (callout_reset_sbt_on(
+ &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HPTS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+
HPTS_MTX_ASSERT(hpts);
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
hpts->p_direct_wake = 0;
return;
@@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
}
static void
-hpts_timeout_swi(void *arg)
-{
- struct tcp_hpts_entry *hpts;
-
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
-}
-
-static void
tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
struct inpcb *inp = tptoinpcb(tp);
@@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
}
static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
INP_LOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
HPTS_LOCK(hpts);
return (hpts);
@@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp)
* and has never received a first packet.
*/
void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
{
-
if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
- tp->t_hpts_cpu = hpts_random_cpu();
+ tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
}
}
@@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp)
* INP lock and then get the hpts lock.
*/
void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_ONQUEUE) {
hptsh = &hpts->p_hptss[tp->t_hpts_slot];
tp->t_hpts_request = 0;
@@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
- * is that plus ticks out?
+ * is that plus slots out?
*/
- KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));
return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
-tick_to_wheel(uint32_t cts_in_wticks)
+cts_to_wheel(uint32_t cts)
{
/*
- * Given a timestamp in ticks (so by
- * default to get it to a real time one
- * would multiply by 10.. i.e the number
- * of ticks in a slot) map it to our limited
- * space wheel.
+ * Given a timestamp in useconds map it to our limited space wheel.
*/
- return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+ return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);
}
static inline int
@@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
end_slot = hpts->p_runningslot;
- /* Back up one tick */
+ /* Back up one slot */
if (end_slot == 0)
end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
@@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
* not active, or we have
* completed the pass over
* the wheel, we can use the
- * prev tick and subtract one from it. This puts us
+ * prev slot and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
end_slot = hpts->p_prev_slot;
@@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
- * that wheel_tick, passed in, should be the current time
+ * that wheel_slot, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
if (hpts->p_prev_slot != wheel_slot)
@@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
- uint32_t hptsslot, int line)
+ uint32_t hptsslot)
{
/*
* Sanity checks for the pacer with invariants
@@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
}
#endif
-uint32_t
-tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+void
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs,
+ struct hpts_diag *diag)
{
struct tcp_hpts_entry *hpts;
struct timeval tv;
- uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
+ uint32_t slot, wheel_cts, last_slot, need_new_to = 0;
int32_t wheel_slot, maxslots;
bool need_wakeup = false;
@@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));
/*
+ * Convert microseconds to slots for internal use.
* We now return the next-slot the hpts will be on, beyond its
* current run (if up) or where it was when it stopped if it is
* sleeping.
*/
- hpts = tcp_hpts_lock(tp);
+ slot = HPTS_USEC_TO_SLOTS(usecs);
+ hpts = tcp_hpts_lock(pace, tp);
microuptime(&tv);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
@@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
- diag->p_curtick = hpts->p_curtick;
- diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
@@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
* timeout is not 1.
*/
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
- return (slot_on);
+ return;
}
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hpts_slot(&tv);
- /* Map it onto the wheel */
- wheel_slot = tick_to_wheel(wheel_cts);
+ /* Get the current time stamp and map it onto the wheel */
+ wheel_cts = tcp_tv_to_usec(&tv);
+ wheel_slot = cts_to_wheel(wheel_cts);
/* Now what's the max we can place it at? */
maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
if (diag) {
@@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tp->t_hpts_slot = last_slot;
}
if (diag) {
- diag->slot_remaining = tp->t_hpts_request;
+ diag->time_remaining = tp->t_hpts_request;
diag->inp_hptsslot = tp->t_hpts_slot;
}
#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
+ check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot);
#endif
if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
tcp_hpts_insert_internal(tp, hpts);
@@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
}
/*
* Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
+ * up and running we do nothing, otherwise we may need to
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
if (diag) {
diag->need_new_to = 0;
diag->co_ret = 0xffff0000;
@@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
} else if (need_new_to) {
int32_t co_ret;
struct timeval tv;
- sbintime_t sb;
tv.tv_sec = 0;
tv.tv_usec = 0;
@@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tv.tv_sec++;
need_new_to -= HPTS_USEC_IN_SEC;
}
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+ co_ret = tcp_hpts_sleep(hpts, &tv);
if (diag) {
diag->need_new_to = need_new_to;
diag->co_ret = co_ret;
}
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
-
- return (slot_on);
}
static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
{
struct inpcb *inp = tptoinpcb(tp);
u_int cpuid;
@@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
if (cpuid == NETISR_CPUID_NONE)
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
else
return (cpuid);
#endif
@@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
*/
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
counter_u64_add(cpu_uses_random, 1);
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
}
/*
* Hash to a thread based on the flowid. If we are using numa,
@@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef NUMA
} else {
/* Hash into the cpu's that use that domain */
- di = &hpts_domains[inp->inp_numa_domain];
+ di = &pace->domains[inp->inp_numa_domain];
cpuid = di->cpu[inp->inp_flowid % di->count];
}
#endif
@@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
}
}
-static int32_t
+static bool
+tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
+{
+ return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
+}
+
+int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
{
+ struct tcp_hptsi *pace;
struct tcpcb *tp;
struct timeval tv;
int32_t slots_to_run, i, error;
@@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
int32_t wrap_loop_cnt = 0;
int32_t slot_pos_of_endpoint = 0;
int32_t orig_exit_slot;
+ uint32_t cts, cts_last_run;
bool completed_measure, seen_endpoint;
completed_measure = false;
@@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
+
+ pace = hpts->p_hptsi;
+ MPASS(pace != NULL);
+
/* record previous info for any logging */
- hpts->saved_lasttick = hpts->p_lasttick;
- hpts->saved_curtick = hpts->p_curtick;
hpts->saved_curslot = hpts->p_cur_slot;
hpts->saved_prev_slot = hpts->p_prev_slot;
- hpts->p_lasttick = hpts->p_curtick;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
- orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ microuptime(&tv);
+ cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+ pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
+
+ orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
if ((hpts->p_on_queue_cnt == 0) ||
- (hpts->p_lasttick == hpts->p_curtick)) {
+ !tcp_hpts_different_slots(cts, cts_last_run)) {
/*
- * No time has yet passed,
- * or nothing to do.
+ * Not enough time has yet passed or nothing to do.
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
goto no_run;
}
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) &&
- (hpts->p_on_queue_cnt != 0)) {
+ if ((hpts->p_on_queue_cnt != 0) &&
+ ((cts - cts_last_run) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) {
/*
* Wheel wrap is occuring, basically we
* are behind and the distance between
@@ -1238,7 +1214,7 @@ again:
uint32_t runningslot;
/*
- * Calculate our delay, if there are no extra ticks there
+ * Calculate our delay, if there are no extra slots there
* was not any (i.e. if slots_to_run == 1, no delay).
*/
hpts->p_delayed_by = (slots_to_run - (i + 1)) *
@@ -1391,7 +1367,7 @@ again:
* gets added to the hpts (not this one)
* :-)
*/
- tcp_set_hpts(tp);
+ __tcp_set_hpts(pace, tp);
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
@@ -1450,10 +1426,12 @@ no_one:
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
- * more ticks (if we did not hit eno-bufs).
+ * more slots (if we did not hit eno-bufs).
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
+ microuptime(&tv);
+ cts_last_run = cts;
+ cts = tcp_tv_to_usec(&tv);
if (!from_callout || (loop_cnt > max_pacer_loops)) {
/*
* Something is serious slow we have
@@ -1465,7 +1443,7 @@ no_one:
* can never catch up :(
*
* We will just lie to this thread
- * and let it thing p_curtick is
+ * and let it think p_cur_slot is
* correct. When it next awakens
* it will find itself further behind.
*/
@@ -1473,20 +1451,19 @@ no_one:
counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
- hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+
+ hpts->p_cur_slot = cts_to_wheel(cts);
if (!seen_endpoint) {
/* We saw no endpoint but we may be looping */
orig_exit_slot = hpts->p_cur_slot;
}
- if ((wrap_loop_cnt < 2) &&
- (hpts->p_lasttick != hpts->p_curtick)) {
+ if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) {
counter_u64_add(hpts_loops, 1);
loop_cnt++;
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
+ pace->cts_last_ran[hpts->p_cpu] = cts;
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1494,25 +1471,36 @@ no_run:
*/
hpts->p_wheel_complete = 1;
/*
- * Now did we spend too long running input and need to run more ticks?
- * Note that if wrap_loop_cnt < 2 then we should have the conditions
- * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
- * is greater than 2, then the condtion most likely are *not* true.
- * Also if we are called not from the callout, we don't run the wheel
- * multiple times so the slots may not align either.
- */
- KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
- (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
- hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(((hpts->p_lasttick == hpts->p_curtick)
- || (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
- hpts->p_lasttick, hpts->p_curtick));
- if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
- hpts->p_curtick = tcp_gethptstick(&tv);
+ * If enough time has elapsed that we should be processing the next
+ * slot(s), then we should have kept running and not marked the wheel as
+ * complete.
+ *
+ * But there are several other conditions where we would have stopped
+ * processing, so the prev/cur slots and cts variables won't match.
+ * These conditions are:
+ *
+ * - Calls not from callouts don't run multiple times
+ * - The wheel is empty
+ * - We've processed more than max_pacer_loops times
+ * - We've wrapped more than 2 times
+ *
+ * This assert catches when the logic above has violated this design.
+ *
+ */
+ KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) ||
+ (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) ||
+ ((hpts->p_prev_slot == hpts->p_cur_slot) &&
+ !tcp_hpts_different_slots(cts, cts_last_run))),
+ ("H:%p Shouldn't be done! prev_slot:%u, cur_slot:%u, "
+ "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d",
+ hpts, hpts->p_prev_slot, hpts->p_cur_slot,
+ cts_last_run, cts, loop_cnt, wrap_loop_cnt));
+
+ if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+ hpts->p_cur_slot = cts_to_wheel(cts);
counter_u64_add(hpts_loops, 1);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
@@ -1526,16 +1514,16 @@ no_run:
}
void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
- tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+ tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
if (failed == 0)
tp->t_flags2 |= TF2_HPTS_CPU_SET;
}
@@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp)
}
static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
{
+ struct timeval tv;
int i, oldest_idx, start, end;
uint32_t cts, time_since_ran, calc;
- cts = tcp_get_usecs(NULL);
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
time_since_ran = 0;
/* Default is all one group */
start = 0;
- end = tcp_pace.rp_num_hptss;
+ end = pace->rp_num_hptss;
/*
* If we have more than one L3 group figure out which one
* this CPU is in.
*/
- if (tcp_pace.grp_cnt > 1) {
- for (i = 0; i < tcp_pace.grp_cnt; i++) {
- if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
- start = tcp_pace.grps[i]->cg_first;
- end = (tcp_pace.grps[i]->cg_last + 1);
+ if (pace->grp_cnt > 1) {
+ for (i = 0; i < pace->grp_cnt; i++) {
+ if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+ start = pace->grps[i]->cg_first;
+ end = (pace->grps[i]->cg_last + 1);
break;
}
}
}
oldest_idx = -1;
for (i = start; i < end; i++) {
- if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
- calc = cts - tcp_pace.cts_last_ran[i];
+ if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+ calc = cts - pace->cts_last_ran[i];
else
calc = 0;
if (calc > time_since_ran) {
@@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void)
}
}
if (oldest_idx >= 0)
- return(tcp_pace.rp_ent[oldest_idx]);
+ return(pace->rp_ent[oldest_idx]);
else
- return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
}
static void
@@ -1588,9 +1578,9 @@ __tcp_run_hpts(void)
{
struct epoch_tracker et;
struct tcp_hpts_entry *hpts;
- int ticks_ran;
+ int slots_ran;
- hpts = tcp_choose_hpts_to_run();
+ hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
if (hpts->p_hpts_active) {
/* Already active */
@@ -1606,12 +1596,11 @@ __tcp_run_hpts(void)
hpts->syscall_cnt++;
counter_u64_add(hpts_direct_call, 1);
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, false);
+ slots_ran = tcp_hptsi(hpts, false);
/* We may want to adjust the sleep values here */
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
- if (ticks_ran > slots_indicate_less_sleep) {
+ if (slots_ran > slots_indicate_less_sleep) {
struct timeval tv;
- sbintime_t sb;
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1635,13 +1624,8 @@ __tcp_run_hpts(void)
* the dynamic value and set the on_min_sleep
* flag so we will not be awoken.
*/
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else if (ticks_ran < slots_indicate_more_sleep) {
+ (void)tcp_hpts_sleep(hpts, &tv);
+ } else if (slots_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
@@ -1658,17 +1642,22 @@ out_with_mtx:
static void
tcp_hpts_thread(void *ctx)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
struct tcp_hpts_entry *hpts;
struct epoch_tracker et;
struct timeval tv;
- sbintime_t sb;
- int ticks_ran;
+ int slots_ran;
hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
HPTS_LOCK(hpts);
if (hpts->p_direct_wake) {
/* Signaled by input or output with low occupancy count. */
- callout_stop(&hpts->co);
+ _callout_stop_safe(&hpts->co, 0);
counter_u64_add(hpts_direct_awakening, 1);
} else {
/* Timed out, the normal case. */
@@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx)
}
hpts->sleeping = 0;
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, true);
+ slots_ran = tcp_hptsi(hpts, true);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) {
@@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx)
* Only adjust sleep time if we were
* called from the callout i.e. direct_wake == 0.
*/
- if (ticks_ran < slots_indicate_more_sleep) {
+ if (slots_ran < slots_indicate_more_sleep) {
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
- } else if (ticks_ran > slots_indicate_less_sleep) {
+ } else if (slots_ran > slots_indicate_less_sleep) {
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
@@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 0;
back_to_sleep:
hpts->p_direct_wake = 0;
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
NET_EPOCH_EXIT(et);
HPTS_UNLOCK(hpts);
}
-#undef timersub
-
static int32_t
hpts_count_level(struct cpu_group *cg)
{
@@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
}
}
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Initialize a tcp_hptsi structure. This performs the core initialization
+ * without starting threads.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
{
+ struct tcp_hptsi *pace;
struct cpu_group *cpu_top;
- int32_t error __diagused;
- int32_t i, j, bound = 0, created = 0;
+ uint32_t i, j, cts;
+ int32_t count;
size_t sz, asz;
struct timeval tv;
- sbintime_t sb;
struct tcp_hpts_entry *hpts;
- struct pcpu *pc;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ KASSERT(funcs != NULL, ("funcs is NULL"));
+
+ /* Allocate the main structure */
+ pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+ if (pace == NULL)
+ return (NULL);
+
+ memset(pace, 0, sizeof(*pace));
+ pace->funcs = funcs;
+
+ /* Setup CPU topology information */
#ifdef SMP
cpu_top = smp_topo();
#else
cpu_top = NULL;
#endif
- tcp_pace.rp_num_hptss = ncpus;
- hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
- hpts_loops = counter_u64_alloc(M_WAITOK);
- back_tosleep = counter_u64_alloc(M_WAITOK);
- combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
- wheel_wrap = counter_u64_alloc(M_WAITOK);
- hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
- hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
- hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
- hpts_direct_call = counter_u64_alloc(M_WAITOK);
- cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
- cpu_uses_random = counter_u64_alloc(M_WAITOK);
+ pace->rp_num_hptss = ncpus;
- sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
- tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
- tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
- tcp_pace.grp_cnt = 0;
+ /* Allocate hpts entry array */
+ sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+ /* Allocate timestamp tracking array */
+ sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+ pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+ /* Setup CPU groups */
if (cpu_top == NULL) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = 1;
} else {
/* Find out how many cache level 3 domains we have */
count = 0;
- tcp_pace.grp_cnt = hpts_count_level(cpu_top);
- if (tcp_pace.grp_cnt == 0) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = hpts_count_level(cpu_top);
+ if (pace->grp_cnt == 0) {
+ pace->grp_cnt = 1;
}
- sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
- tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+ sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+ pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
/* Now populate the groups */
- if (tcp_pace.grp_cnt == 1) {
+ if (pace->grp_cnt == 1) {
/*
* All we need is the top level all cpu's are in
* the same cache so when we use grp[0]->cg_mask
@@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void)
* all cpu's in it. The level here is probably
* zero which is ok.
*/
- tcp_pace.grps[0] = cpu_top;
+ pace->grps[0] = cpu_top;
} else {
/*
* Here we must find all the level three cache domains
* and setup our pointers to them.
*/
count = 0;
- hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+ hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
}
}
+
+ /* Cache the current time for initializing the hpts entries */
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+
+ /* Initialize each hpts entry */
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
M_TCPHPTS, M_WAITOK | M_ZERO);
- tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
- hpts = tcp_pace.rp_ent[i];
- /*
- * Init all the hpts structures that are not specifically
- * zero'd by the allocations. Also lets attach them to the
- * appropriate sysctl block as well.
- */
- mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
- "hpts", MTX_DEF | MTX_DUPOK);
- for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
- TAILQ_INIT(&hpts->p_hptss[j].head);
- hpts->p_hptss[j].count = 0;
- hpts->p_hptss[j].gencnt = 0;
- }
- sysctl_ctx_init(&hpts->hpts_ctx);
- sprintf(unit, "%d", i);
- hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
- SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
- OID_AUTO,
- unit,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "");
- SYSCTL_ADD_INT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "out_qcnt", CTLFLAG_RD,
- &hpts->p_on_queue_cnt, 0,
- "Count TCB's awaiting output processing");
- SYSCTL_ADD_U16(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "active", CTLFLAG_RD,
- &hpts->p_hpts_active, 0,
- "Is the hpts active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curslot", CTLFLAG_RD,
- &hpts->p_cur_slot, 0,
- "What the current running pacers goal");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningslot, 0,
- "What the running pacers current slot is");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curtick", CTLFLAG_RD,
- &hpts->p_curtick, 0,
- "What the running pacers last tick mapped to the wheel was");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "lastran", CTLFLAG_RD,
- &tcp_pace.cts_last_ran[i], 0,
- "The last usec tick that this hpts ran");
- SYSCTL_ADD_LONG(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
- &hpts->p_mysleep.tv_usec,
- "What the running pacers is using for p_mysleep.tv_usec");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "now_sleeping", CTLFLAG_RD,
- &hpts->sleeping, 0,
- "What the running pacers is actually sleeping for");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "syscall_cnt", CTLFLAG_RD,
- &hpts->syscall_cnt, 0,
- "How many times we had syscalls on this hpts");
+ pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+ M_WAITOK | M_ZERO);
+ hpts = pace->rp_ent[i];
+ /* Basic initialization */
hpts->p_hpts_sleep_time = hpts_sleep_max;
- hpts->p_num = i;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
- hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
- hpts->p_cpu = 0xffff;
+ hpts->p_cpu = i;
+ pace->cts_last_ran[i] = cts;
+ hpts->p_cur_slot = cts_to_wheel(cts);
+ hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
+ hpts->p_hptsi = pace;
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+ MTX_DEF | MTX_DUPOK);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j].head);
+ }
+
+ /* Setup SYSCTL if requested */
+ if (enable_sysctl) {
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current running pacers goal");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runslot", CTLFLAG_RD,
+ &hpts->p_runningslot, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &pace->cts_last_ran[i], 0,
+ "The last usec timestamp that this hpts ran");
+ SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+ }
}
- /* Don't try to bind to NUMA domains if we don't have any */
- if (vm_ndomains == 1 && tcp_bind_threads == 2)
- tcp_bind_threads = 0;
- /*
- * Now lets start ithreads to handle the hptss.
- */
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- hpts->p_cpu = i;
+ return (pace);
+}
+
+/*
+ * Create the threads for a tcp_hptsi structure and start the timers for the
+ * current (minimum) sleep interval.
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ struct pcpu *pc;
+ struct timeval tv;
+ uint32_t i, j;
+ int count, domain;
+ int error __diagused;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+ /* Start threads for each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+
+ KASSERT(hpts->ie_cookie == NULL,
+ ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
KASSERT(error == 0,
- ("Can't add hpts:%p i:%d err:%d",
- hpts, i, error));
- created++;
- hpts->p_mysleep.tv_sec = 0;
- hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
+ ("Can't add hpts:%p i:%d err:%d", hpts, i, error));
+
if (tcp_bind_threads == 1) {
- if (intr_event_bind(hpts->ie, i) == 0)
- bound++;
+ (void)intr_event_bind(hpts->ie, i);
} else if (tcp_bind_threads == 2) {
/* Find the group for this CPU (i) and bind into it */
- for (j = 0; j < tcp_pace.grp_cnt; j++) {
- if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
+ for (j = 0; j < pace->grp_cnt; j++) {
+ if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {
if (intr_event_bind_ithread_cpuset(hpts->ie,
- &tcp_pace.grps[j]->cg_mask) == 0) {
- bound++;
+ &pace->grps[j]->cg_mask) == 0) {
pc = pcpu_find(i);
domain = pc->pc_domain;
- count = hpts_domains[domain].count;
- hpts_domains[domain].cpu[count] = i;
- hpts_domains[domain].count++;
+ count = pace->domains[domain].count;
+ pace->domains[domain].cpu[count] = i;
+ pace->domains[domain].count++;
break;
}
}
}
}
+
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
- hpts->sleeping = tv.tv_usec;
- sb = tvtosbt(tv);
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- }
- /*
- * If we somehow have an empty domain, fall back to choosing
- * among all htps threads.
- */
- for (i = 0; i < vm_ndomains; i++) {
- if (hpts_domains[i].count == 0) {
- tcp_bind_threads = 0;
- break;
- }
+ (void)tcp_hpts_sleep(hpts, &tv);
}
- tcp_hpts_softclock = __tcp_run_hpts;
- tcp_lro_hpts_init();
- printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
- created, bound,
- tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}
-static void
-tcp_hpts_mod_unload(void)
+/*
+ * Stop all callouts/threads for a tcp_hptsi structure.
+ */
+void
+tcp_hptsi_stop(struct tcp_hptsi *pace)
{
+ struct tcp_hpts_entry *hpts;
int rv __diagused;
+ uint32_t i;
- tcp_lro_hpts_uninit();
- atomic_store_ptr(&tcp_hpts_softclock, NULL);
+ KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL"));
- for (int i = 0; i < tcp_pace.rp_num_hptss; i++) {
- struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i];
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i));
+ KASSERT(hpts->ie_cookie != NULL,
+ ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i));
- rv = callout_drain(&hpts->co);
+ rv = _callout_stop_safe(&hpts->co, CS_DRAIN);
MPASS(rv != 0);
rv = swi_remove(hpts->ie_cookie);
MPASS(rv == 0);
+ hpts->ie_cookie = NULL;
+ }
+}
- rv = sysctl_ctx_free(&hpts->hpts_ctx);
- MPASS(rv == 0);
+/*
+ * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create.
+ */
+void
+tcp_hptsi_destroy(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t i;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL"));
+ KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL"));
+
+ /* Cleanup each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ if (hpts != NULL) {
+ /* Cleanup SYSCTL if it was initialized */
+ if (hpts->hpts_root != NULL) {
+ sysctl_ctx_free(&hpts->hpts_ctx);
+ }
- mtx_destroy(&hpts->p_mtx);
- free(hpts->p_hptss, M_TCPHPTS);
- free(hpts, M_TCPHPTS);
+ mtx_destroy(&hpts->p_mtx);
+ free(hpts->p_hptss, M_TCPHPTS);
+ free(hpts, M_TCPHPTS);
+ }
}
- free(tcp_pace.rp_ent, M_TCPHPTS);
- free(tcp_pace.cts_last_ran, M_TCPHPTS);
+ /* Cleanup main arrays */
+ free(pace->rp_ent, M_TCPHPTS);
+ free(pace->cts_last_ran, M_TCPHPTS);
#ifdef SMP
- free(tcp_pace.grps, M_TCPHPTS);
+ free(pace->grps, M_TCPHPTS);
#endif
+ /* Free the main structure */
+ free(pace, M_TCPHPTS);
+}
+
+/*
+ * Module load handler.
+ *
+ * Creates the global tcp_hptsi instance (with per-hpts sysctl nodes enabled),
+ * allocates the global counters, starts the HPTS threads, installs the
+ * softclock hook and initializes the LRO side of HPTS.
+ *
+ * Returns 0 on success, or ENOMEM if tcp_hptsi_create() returned NULL.
+ */
+static int
+tcp_hpts_mod_load(void)
+{
+	int i;
+
+	/* Don't try to bind to NUMA domains if we don't have any */
+	if (vm_ndomains == 1 && tcp_bind_threads == 2)
+		tcp_bind_threads = 0;
+
+	/* Create the tcp_hptsi structure */
+	tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true);
+	if (tcp_hptsi_pace == NULL)
+		return (ENOMEM);
+
+	/* Initialize global counters */
+	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
+	hpts_loops = counter_u64_alloc(M_WAITOK);
+	back_tosleep = counter_u64_alloc(M_WAITOK);
+	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+	wheel_wrap = counter_u64_alloc(M_WAITOK);
+	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+	hpts_direct_call = counter_u64_alloc(M_WAITOK);
+	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+	/* Start the threads */
+	tcp_hptsi_start(tcp_hptsi_pace);
+
+	/* Enable the global HPTS softclock function */
+	tcp_hpts_softclock = __tcp_run_hpts;
+
+	/* Initialize LRO HPTS */
+	tcp_lro_hpts_init();
+
+	/*
+	 * If we somehow have an empty domain, fall back to choosing among all
+	 * HPTS threads.
+	 */
+	for (i = 0; i < vm_ndomains; i++) {
+		if (tcp_hptsi_pace->domains[i].count == 0) {
+			tcp_bind_threads = 0;
+			break;
+		}
+	}
+
+	/*
+	 * The format string already wraps the binding mode in parentheses, so
+	 * the mode strings themselves must not carry their own.
+	 */
+	printf("TCP HPTS started %u (%s) swi interrupt threads\n",
+	    tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ?
+	    "unbound" :
+	    (tcp_bind_threads == 1 ? "per-cpu" : "per-NUMA-domain"));
+
+	return (0);
+}
+
+static void
+tcp_hpts_mod_unload(void)
+{
+ tcp_lro_hpts_uninit();
+
+ /* Disable the global HPTS softclock function */
+ atomic_store_ptr(&tcp_hpts_softclock, NULL);
+
+ tcp_hptsi_stop(tcp_hptsi_pace);
+ tcp_hptsi_destroy(tcp_hptsi_pace);
+ tcp_hptsi_pace = NULL;
+
+ /* Cleanup global counters */
counter_u64_free(hpts_hopelessly_behind);
counter_u64_free(hpts_loops);
counter_u64_free(back_tosleep);
@@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void)
}
static int
-tcp_hpts_modevent(module_t mod, int what, void *arg)
+tcp_hpts_mod_event(module_t mod, int what, void *arg)
{
-
switch (what) {
case MOD_LOAD:
- tcp_hpts_mod_load();
- return (0);
+ return (tcp_hpts_mod_load());
case MOD_QUIESCE:
/*
* Since we are a dependency of TCP stack modules, they should
@@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)
static moduledata_t tcp_hpts_module = {
.name = "tcphpts",
- .evhand = tcp_hpts_modevent,
+ .evhand = tcp_hpts_mod_event,
};
DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY);
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 6172baf2a062..6b05f9701ac2 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -28,19 +28,11 @@
/* Number of useconds represented by an hpts slot */
#define HPTS_USECS_PER_SLOT 10
-#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
-#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
static inline uint32_t
-tcp_tv_to_hpts_slot(const struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT));
-}
-
-static inline uint32_t
tcp_tv_to_usec(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
@@ -66,7 +58,7 @@ struct hpts_diag {
uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
- uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t time_remaining; /* bbr->flex5 x */
uint32_t have_slept; /* bbr->epoch x */
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
@@ -75,8 +67,6 @@ struct hpts_diag {
uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
- uint32_t p_curtick; /* upper bbr->cur_del_rate */
- uint32_t p_lasttick; /* lower bbr->cur_del_rate */
uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
@@ -92,13 +82,18 @@ struct hpts_diag {
#ifdef _KERNEL
+extern struct tcp_hptsi *tcp_hptsi_pace;
+
/*
* The following are the definitions for the kernel HPTS interface for managing
* the HPTS ring and the TCBs on it.
*/
-void tcp_hpts_init(struct tcpcb *);
-void tcp_hpts_remove(struct tcpcb *);
+void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp)
+
+void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp)
static inline bool
tcp_in_hpts(struct tcpcb *tp)
@@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp)
* that INP_WLOCK() or from destroying your TCB where again
* you should already have the INP_WLOCK().
*/
-uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
- struct hpts_diag *diag);
-#define tcp_hpts_insert(inp, slot) \
- tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
+void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs,
+ struct hpts_diag *diag);
+#define tcp_hpts_insert(tp, usecs, diag) \
+ __tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag))
-void tcp_set_hpts(struct tcpcb *tp);
+void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp);
+#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp)
extern int32_t tcp_min_hptsi_time;
@@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void)
return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT);
}
-static inline uint32_t
-tcp_gethptstick(struct timeval *sv)
-{
- struct timeval tv;
-
- if (sv == NULL)
- sv = &tv;
- microuptime(sv);
- return (tcp_tv_to_hpts_slot(sv));
-}
-
static inline uint64_t
tcp_get_u64_usecs(struct timeval *tv)
{
@@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv)
return (tcp_tv_to_usec(tv));
}
-/*
- * LRO HPTS initialization and uninitialization, only for internal use by the
- * HPTS code.
- */
-void tcp_lro_hpts_init(void);
-void tcp_lro_hpts_uninit(void);
-
#endif /* _KERNEL */
#endif /* __tcp_hpts_h__ */
diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h
new file mode 100644
index 000000000000..8b33e03a6981
--- /dev/null
+++ b/sys/netinet/tcp_hpts_internal.h
@@ -0,0 +1,184 @@
+/*-
+ * Copyright (c) 2025 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __tcp_hpts_internal_h__
+#define __tcp_hpts_internal_h__
+
+/*
+ * TCP High Precision Timer System (HPTS) - Internal Definitions
+ *
+ * This header contains internal structures, constants, and interfaces that are
+ * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of
+ * the HPTS subsystem.
+ */
+
+#if defined(_KERNEL)
+
+/*
+ * The hpts uses a wheel of 102400 slots. Each slot represents 10 usec, so
+ * the wheel covers 102400 x 10 usec = 1.024 seconds. This gives a range of
+ * 10 usec - 1024 ms in which to place an entry. If the user requests more
+ * than 1.024 seconds, a remainder is attached, and the hpts, on seeing the
+ * remainder, will re-insert the inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+#define	NUM_OF_HPTSI_SLOTS 102400
+
+/*
+ * Convert microseconds to HPTS slots, rounding up. The divisor is the slot
+ * width in microseconds (HPTS_USECS_PER_SLOT in tcp_hpts.h). The argument is
+ * parenthesized so that compound expressions expand correctly.
+ */
+#define	HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10)
+
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define	DEFAULT_CONNECTION_THRESHOLD 100
+
+extern int tcp_bind_threads; /* Thread binding configuration
+ * (0=none, 1=cpu, 2=numa) */
+
+/*
+ * Abstraction layer controlling time, interrupts and callouts.
+ *
+ * Every member is a function pointer carrying the name and signature of the
+ * kernel primitive it stands in for. A table of these is handed to
+ * tcp_hptsi_create(); tcp_hpts_test.c, for example, supplies a table of mock
+ * functions that only count how often they were called.
+ */
+struct tcp_hptsi_funcs {
+ /* Time source: fills in *tv */
+ void (*microuptime)(struct timeval *tv);
+ /* Software interrupt (swi) creation, removal and scheduling */
+ int (*swi_add)(struct intr_event **eventp, const char *name,
+ driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+ void **cookiep);
+ int (*swi_remove)(void *cookie);
+ void (*swi_sched)(void *cookie, int flags);
+ /* Binding of an interrupt event to a single CPU or to a CPU set */
+ int (*intr_event_bind)(struct intr_event *ie, int cpu);
+ int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie,
+ struct _cpuset *mask);
+ /* Callout (timer) initialization, arming and stopping */
+ void (*callout_init)(struct callout *c, int mpsafe);
+ int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt,
+ sbintime_t precision, void (*func)(void *), void *arg, int cpu,
+ int flags);
+ int (*_callout_stop_safe)(struct callout *c, int flags);
+};
+
+/* Default function table for system operation */
+extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs;
+
+/* Each hpts has its own p_mtx which is used for locking */
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
+#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
+#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
+
+/*
+ * State of a single hpts: its lock, its timing wheel, its sleep bookkeeping
+ * and the swi/callout used to run it. One entry is allocated for each slot of
+ * the rp_ent array of the parent struct tcp_hptsi.
+ */
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ struct timeval p_mysleep; /* Our min sleep time */
+ uint64_t syscall_cnt; /* How many times we had syscalls on this hpts */
+ uint64_t sleeping; /* What the actual sleep was (if sleeping) */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_runningslot; /* Current slot we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range
+ * of slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint8_t p_direct_wake :1, /* boolean */
+ p_on_min_sleep:1, /* boolean */
+ p_hpts_wake_scheduled:1,/* boolean */
+ hit_callout_thresh:1,
+ p_avail:4; /* presumably spare bits -- TODO confirm */
+ uint8_t p_fill[3]; /* Fill to 32 bits */
+ /* Cache line 0x40 */
+ struct hptsh {
+ TAILQ_HEAD(, tcpcb) head; /* tcpcbs queued on this slot */
+ uint32_t count;
+ uint32_t gencnt;
+ } *p_hptss; /* Hptsi wheel */
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+ uint32_t overidden_sleep; /* What was overridden by min-sleep (for logging) */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx; /* Context owning this hpts's sysctl nodes */
+ struct sysctl_oid *hpts_root; /* Root sysctl node; NULL if sysctl was not set up */
+ struct intr_event *ie; /* Interrupt event returned by swi_add */
+ void *ie_cookie; /* swi cookie; NULL while no swi is attached */
+ uint16_t p_cpu; /* The hpts CPU */
+ struct tcp_hptsi *p_hptsi; /* Back pointer to parent hptsi structure */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * A complete HPTS instance: the array of hpts entries together with the CPU
+ * group / NUMA domain information used when binding their threads, and the
+ * function table through which time, interrupts and callouts are accessed.
+ */
+struct tcp_hptsi {
+ struct cpu_group **grps; /* Array of grp_cnt CPU group pointers */
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t *cts_last_ran; /* Per-hpts usec timestamp of the last run */
+ uint32_t grp_cnt; /* Number of CPU groups */
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+ struct hpts_domain_info {
+ int count; /* Number of valid entries in cpu[] */
+ int cpu[MAXCPU]; /* CPUs whose hpts thread was bound in this domain */
+ } domains[MAXMEMDOM]; /* Per-NUMA domain CPU assignments */
+ const struct tcp_hptsi_funcs *funcs; /* Function table for testability */
+};
+
+/*
+ * Core tcp_hptsi structure manipulation functions.
+ *
+ * Lifecycle: tcp_hptsi_create() allocates and initializes an instance
+ * (enable_sysctl selects whether per-hpts sysctl nodes are registered),
+ * tcp_hptsi_start() creates its threads and starts the timers,
+ * tcp_hptsi_stop() stops all callouts/threads, and tcp_hptsi_destroy()
+ * releases an instance made by tcp_hptsi_create().
+ */
+struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs,
+ bool enable_sysctl);
+void tcp_hptsi_destroy(struct tcp_hptsi *pace);
+void tcp_hptsi_start(struct tcp_hptsi *pace);
+void tcp_hptsi_stop(struct tcp_hptsi *pace);
+uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace);
+int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
+
+void tcp_hpts_wake(struct tcp_hpts_entry *hpts);
+
+/*
+ * LRO HPTS initialization and uninitialization, only for internal use by the
+ * HPTS code.
+ */
+void tcp_lro_hpts_init(void);
+void tcp_lro_hpts_uninit(void);
+
+#endif /* defined(_KERNEL) */
+#endif /* __tcp_hpts_internal_h__ */
diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c
new file mode 100644
index 000000000000..bab5827e0572
--- /dev/null
+++ b/sys/netinet/tcp_hpts_test.c
@@ -0,0 +1,1662 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <tests/ktest.h>
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
+#include <dev/tcp_log/tcp_log_dev.h>
+#include <netinet/tcp_log_buf.h>
+
+#undef tcp_hpts_init
+#undef tcp_hpts_remove
+#undef tcp_hpts_insert
+#undef tcp_set_hpts
+
+/* Custom definitions that take the tcp_hptsi */
+#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp))
+#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp))
+#define tcp_hpts_insert(pace, tp, usecs, diag) \
+ __tcp_hpts_insert((pace), (tp), (usecs), (diag))
+#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp))
+
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test");
+
+static int test_exit_on_failure = true;
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP HPTS test controls");
+SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW,
+ &test_exit_on_failure, 0,
+ "Exit HPTS test immediately on first failure (1) or continue running all tests (0)");
+
+/*
+ * Test assertion helpers.
+ *
+ * Each macro expects a test context named `ctx' to be in scope, logs a PASS
+ * or FAIL line, and on failure returns from the enclosing function when
+ * test_exit_on_failure is set (EINVAL, or `y' for KTEST_VERIFY_RET).
+ * Arguments may be evaluated more than once, and the comparison macros print
+ * both operands with %d.
+ */
+#define KTEST_VERIFY(x) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+/* Pass when x == y */
+#define KTEST_EQUAL(x, y) do { \
+ if ((x) != (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \
+ } \
+} while (0)
+
+/* Pass when x != y */
+#define KTEST_NEQUAL(x, y) do { \
+ if ((x) == (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \
+ } \
+} while (0)
+
+/* Pass when x > y */
+#define KTEST_GREATER_THAN(x, y) do { \
+ if ((x) <= (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \
+ } \
+} while (0)
+
+/* Like KTEST_VERIFY, but returns `y' instead of EINVAL on failure */
+#define KTEST_VERIFY_RET(x, y) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ if (test_exit_on_failure) \
+ return (y); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+/*
+ * Log every field of an hpts entry to the test context, one per line.
+ *
+ * 64-bit counters and the time_t seconds value are cast to (u)intmax_t and
+ * printed with %ju/%jd so that the format matches the argument on both 32-bit
+ * and 64-bit platforms.
+ */
+static void
+dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts)
+{
+	KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts);
+	KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot);
+	KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot);
+	KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot);
+	KTEST_LOG(ctx, " p_runningslot: %u", hpts->p_runningslot);
+	KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt);
+	KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active);
+	KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete);
+	KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake);
+	KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep);
+	KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled);
+	KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh);
+	KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time);
+	KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by);
+	KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep);
+	KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot);
+	KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot);
+	KTEST_LOG(ctx, " syscall_cnt: %ju", (uintmax_t)hpts->syscall_cnt);
+	KTEST_LOG(ctx, " sleeping: %ju", (uintmax_t)hpts->sleeping);
+	KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu);
+	KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie);
+	KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi);
+	KTEST_LOG(ctx, " p_mysleep: %jd.%06ld",
+	    (intmax_t)hpts->p_mysleep.tv_sec, (long)hpts->p_mysleep.tv_usec);
+}
+
+/*
+ * Log the tcpcb and inpcb fields that matter to HPTS. The test context used
+ * for logging is read from tp->t_fb_ptr.
+ */
+static void
+dump_tcpcb(struct tcpcb *tp)
+{
+	struct inpcb *pcb = &tp->t_inpcb;
+	struct ktest_test_context *ctx = tp->t_fb_ptr;
+	const char *cpu_set, *hpts_calls, *mbufq, *dropped;
+
+	/* Resolve the flag bits to their printable form up front */
+	cpu_set = (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO";
+	hpts_calls = (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO";
+	mbufq = (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO";
+	dropped = (pcb->inp_flags & INP_DROPPED) ? "YES" : "NO";
+
+	KTEST_LOG(ctx, "tcp_control_block(%p)", tp);
+
+	/* Placement of the tcpcb on the HPTS wheel */
+	KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts);
+	KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu);
+	KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot);
+	KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt);
+	KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request);
+
+	/* CPU recorded by LRO */
+	KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu);
+
+	/* t_flags2 bits consulted by HPTS */
+	KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2);
+	KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", cpu_set);
+	KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", hpts_calls);
+	KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", mbufq);
+
+	/* inpcb state consulted by HPTS */
+	KTEST_LOG(ctx, " inp_flags: 0x%x", pcb->inp_flags);
+	KTEST_LOG(ctx, " INP_DROPPED: %s", dropped);
+	KTEST_LOG(ctx, " inp_flowid: 0x%x", pcb->inp_flowid);
+	KTEST_LOG(ctx, " inp_flowtype: %u", pcb->inp_flowtype);
+	KTEST_LOG(ctx, " inp_numa_domain: %d", pcb->inp_numa_domain);
+}
+
+/*
+ * Enum for call counting indices. Each value indexes call_counts[]; CCNT_MAX
+ * is the number of counters and must stay last.
+ */
+enum test_call_counts {
+ CCNT_MICROUPTIME = 0,
+ CCNT_SWI_ADD,
+ CCNT_SWI_REMOVE,
+ CCNT_SWI_SCHED,
+ CCNT_INTR_EVENT_BIND,
+ CCNT_INTR_EVENT_BIND_CPUSET,
+ CCNT_CALLOUT_INIT,
+ CCNT_CALLOUT_RESET_SBT_ON,
+ CCNT_CALLOUT_STOP_SAFE,
+ CCNT_TCP_OUTPUT,
+ CCNT_TCP_TFB_DO_QUEUED_SEGMENTS,
+ CCNT_MAX
+};
+
+/* Number of calls made to each mocked function since the last reset */
+static uint32_t call_counts[CCNT_MAX];
+
+/* Simulated uptime in microseconds, reported by test_microuptime() */
+static uint64_t test_time_usec = 0;
+
+/*
+ * Return the mock bookkeeping to its pristine state: the simulated clock goes
+ * back to zero and every call counter is cleared.
+ */
+static void
+test_hpts_init(void)
+{
+	test_time_usec = 0;
+	memset(call_counts, 0, sizeof(call_counts));
+}
+
+/*
+ * Mock time source: counts the call and reports the simulated uptime
+ * (test_time_usec) split into seconds and microseconds.
+ */
+static void
+test_microuptime(struct timeval *tv)
+{
+	const uint64_t now_usec = test_time_usec;
+
+	++call_counts[CCNT_MICROUPTIME];
+	tv->tv_usec = now_usec % 1000000;
+	tv->tv_sec = now_usec / 1000000;
+}
+
+/*
+ * Mock swi_add: counts the call and simulates successful SWI creation by
+ * handing back fixed placeholder values for the event and the cookie.
+ */
+static int
+test_swi_add(struct intr_event **eventp, const char *name,
+    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+    void **cookiep)
+{
+	struct intr_event *const mock_event = (struct intr_event *)0xfeedface;
+	void *const mock_cookie = (void *)0xdeadbeef;
+
+	++call_counts[CCNT_SWI_ADD];
+	*eventp = mock_event;
+	*cookiep = mock_cookie;
+	return (0);
+}
+
+/*
+ * Mock of swi_remove(): counts the call and reports success (0) without
+ * looking at the cookie.
+ */
+static int
+test_swi_remove(void *cookie)
+{
+	(void)cookie;
+	call_counts[CCNT_SWI_REMOVE] += 1;
+	return (0);
+}
+
+/*
+ * Mock of swi_sched(): only counts the request.  Nothing is scheduled and
+ * both arguments are ignored.
+ */
+static void
+test_swi_sched(void *cookie, int flags)
+{
+	(void)cookie;
+	(void)flags;
+	call_counts[CCNT_SWI_SCHED] += 1;
+}
+
+/*
+ * Mock of intr_event_bind(): counts the call and reports success (0).  No
+ * binding takes place; the event and CPU arguments are ignored.
+ */
+static int
+test_intr_event_bind(struct intr_event *ie, int cpu)
+{
+	(void)ie;
+	(void)cpu;
+	call_counts[CCNT_INTR_EVENT_BIND] += 1;
+	return (0);
+}
+
+/*
+ * Mock of intr_event_bind_ithread_cpuset(): counts the call and reports
+ * success (0).  The event and cpuset mask are ignored.
+ */
+static int
+test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask)
+{
+	(void)ie;
+	(void)mask;
+	call_counts[CCNT_INTR_EVENT_BIND_CPUSET] += 1;
+	return (0);
+}
+
+/*
+ * Mock of callout_init(): counts the call and zero-fills the callout
+ * structure.  The mpsafe argument is ignored.
+ */
+static void
+test_callout_init(struct callout *c, int mpsafe)
+{
+	(void)mpsafe;
+	call_counts[CCNT_CALLOUT_INIT] += 1;
+	memset(c, 0, sizeof(struct callout));
+}
+
+/*
+ * Mock of callout_reset_sbt_on(): counts the call and arms nothing, so the
+ * callback is never invoked by a timer.  All arguments are ignored.
+ * Always returns 1, which this test suite uses to mean the timer was
+ * scheduled.
+ */
+static int
+test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+    void (*func)(void *), void *arg, int cpu, int flags)
+{
+	const int scheduled = 1;
+
+	call_counts[CCNT_CALLOUT_RESET_SBT_ON] += 1;
+	return (scheduled);
+}
+
+/*
+ * Mock of _callout_stop_safe(): counts the call and stops nothing.  Both
+ * arguments are ignored.  Always returns 1, which this test suite uses to
+ * mean the timer was stopped.
+ */
+static int
+test_callout_stop_safe(struct callout *c, int flags)
+{
+	const int stopped = 1;
+
+	call_counts[CCNT_CALLOUT_STOP_SAFE] += 1;
+	return (stopped);
+}
+
+static const struct tcp_hptsi_funcs test_funcs = { /* mock function table handed to tcp_hptsi_create() by the tests */
+ .microuptime = test_microuptime, /* simulated clock, driven by test_time_usec */
+ .swi_add = test_swi_add,
+ .swi_remove = test_swi_remove,
+ .swi_sched = test_swi_sched,
+ .intr_event_bind = test_intr_event_bind,
+ .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset,
+ .callout_init = test_callout_init,
+ .callout_reset_sbt_on = test_callout_reset_sbt_on,
+ ._callout_stop_safe = test_callout_stop_safe, /* member name carries a leading underscore */
+};
+
+/*
+ * Test-only control knobs stored in tcpcb fields.  Both macros expand to an
+ * lvalue, so they can be read and assigned.
+ *
+ * TP_REMOVE_FROM_HPTS(tp): bit 0 makes the mock output routines call
+ * tcp_hpts_remove() on the tcpcb; bit 1 makes them unlock the inpcb and
+ * return -1 to simulate an output error.
+ * TP_LOG_TEST(tp): non-zero enables the extra logging in the mock output
+ * routines.
+ *
+ * The argument and the whole expansion are parenthesized so that any
+ * pointer expression can be passed safely.
+ */
+#define	TP_REMOVE_FROM_HPTS(tp)	((tp)->bits_spare)
+#define	TP_LOG_TEST(tp)		((tp)->t_log_state_set)
+
+/*
+ * Mock tfb_tcp_output handler.  Counts the call, optionally logs the tcpcb
+ * and its HPTS entry (when TP_LOG_TEST(tp) is set), and then acts on the
+ * TP_REMOVE_FROM_HPTS(tp) bits:
+ *   bit 0 - remove the tcpcb from the HPTS wheel;
+ *   bit 1 - unlock the inpcb and return -1 to simulate an output error.
+ * Returns 0 when bit 1 is clear.
+ *
+ * The test context and owning HPTS instance are recovered from t_fb_ptr and
+ * t_tfo_pending, where test_hpts_create_tcpcb() stored them.
+ */
+static int
+test_tcp_output(struct tcpcb *tp)
+{
+	struct ktest_test_context *ctx;
+	struct tcp_hptsi *pace;
+	struct tcp_hpts_entry *hpts;
+	int verbose;
+
+	ctx = tp->t_fb_ptr;
+	pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+	hpts = pace->rp_ent[tp->t_hpts_cpu];
+	verbose = (TP_LOG_TEST(tp) != 0);
+
+	call_counts[CCNT_TCP_OUTPUT] += 1;
+	if (verbose) {
+		KTEST_LOG(ctx, "=> tcp_output(%p)", tp);
+		dump_tcpcb(tp);
+		dump_hpts_entry(ctx, hpts);
+	}
+
+	if (TP_REMOVE_FROM_HPTS(tp) & 1) {
+		if (verbose)
+			KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+		tcp_hpts_remove(pace, tp);
+	}
+
+	if ((TP_REMOVE_FROM_HPTS(tp) & 2) == 0)
+		return (0);
+
+	/* Simulated tcp_output error: the inpcb lock is dropped before returning. */
+	INP_WUNLOCK(&tp->t_inpcb);
+	return (-1);
+}
+
+/*
+ * Mock tfb_do_queued_segments handler.  Counts the call and always logs the
+ * call, the tcpcb and its HPTS entry; only the "tcp_hpts_remove" log line
+ * is gated by TP_LOG_TEST(tp).  It then acts on the TP_REMOVE_FROM_HPTS(tp)
+ * bits:
+ *   bit 0 - remove the tcpcb from the HPTS wheel;
+ *   bit 1 - unlock the inpcb and return -1 to simulate an error.
+ * Returns 0 when bit 1 is clear.
+ */
+static int
+test_tfb_do_queued_segments(struct tcpcb *tp, int flag)
+{
+	struct ktest_test_context *ctx;
+	struct tcp_hptsi *pace;
+	struct tcp_hpts_entry *hpts;
+
+	ctx = tp->t_fb_ptr;
+	pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+	hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+	call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS] += 1;
+	KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag);
+	dump_tcpcb(tp);
+	dump_hpts_entry(ctx, hpts);
+
+	if (TP_REMOVE_FROM_HPTS(tp) & 1) {
+		if (TP_LOG_TEST(tp))
+			KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+		tcp_hpts_remove(pace, tp);
+	}
+
+	if ((TP_REMOVE_FROM_HPTS(tp) & 2) == 0)
+		return (0);
+
+	/* Simulated do_queued_segments error: drop the inpcb lock first. */
+	INP_WUNLOCK(&tp->t_inpcb);
+	return (-1);
+}
+
+static struct tcp_function_block test_tcp_fb = { /* installed as tp->t_fb by test_hpts_create_tcpcb() */
+ .tfb_tcp_block_name = "hpts_test_tcp",
+ .tfb_tcp_output = test_tcp_output, /* counts calls; steered by TP_REMOVE_FROM_HPTS() */
+ .tfb_do_queued_segments = test_tfb_do_queued_segments, /* counts calls; steered by TP_REMOVE_FROM_HPTS() */
+};
+
+/*
+ * Allocate a zero-filled tcpcb and set up just enough of it for the HPTS
+ * code to operate on: the embedded inpcb's lock, reference count and
+ * pcbinfo pointer, the mock function block, an unassigned HPTS CPU, and an
+ * empty input queue.  tcp_hpts_init() is then run on it.
+ *
+ * ctx and pace are stashed in t_fb_ptr and t_tfo_pending so the mock output
+ * handlers can find them again; those fields do not carry their usual
+ * meaning in these tests.
+ *
+ * Returns the new tcpcb, or NULL if the allocation yields NULL.  Release it
+ * with test_hpts_free_tcpcb().
+ */
+static struct tcpcb *
+test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace)
+{
+	struct tcpcb *tp;
+
+	tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO);
+	if (tp == NULL)
+		return (NULL);
+
+	/* Embedded inpcb: lock, single reference, owning pcbinfo. */
+	rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp",
+	    RW_RECURSE | RW_DUPOK);
+	refcount_init(&tp->t_inpcb.inp_refcount, 1);
+	tp->t_inpcb.inp_pcbinfo = &V_tcbinfo;
+
+	/* TCP-level fields consulted by HPTS. */
+	tp->t_fb = &test_tcp_fb;
+	tp->t_hpts_cpu = HPTS_CPU_NONE;
+	STAILQ_INIT(&tp->t_inqueue);
+	tcp_hpts_init(pace, tp);
+
+	/* Stuff some pointers in the tcb for test purposes. */
+	tp->t_fb_ptr = ctx;
+	tp->t_tfo_pending = (unsigned int*)pace;
+
+	return (tp);
+}
+
+/*
+ * Release a tcpcb obtained from test_hpts_create_tcpcb(): destroy the
+ * embedded inpcb lock, then free the memory.  A NULL pointer is accepted
+ * and ignored.
+ */
+static void
+test_hpts_free_tcpcb(struct tcpcb *tp)
+{
+	if (tp != NULL) {
+		INP_LOCK_DESTROY(&tp->t_inpcb);
+		free(tp, M_TCPHPTS);
+	}
+}
+
+/*
+ * ***********************************************
+ * * KTEST functions for testing the HPTS module *
+ * ***********************************************
+ */
+
+/*
+ * Validates that the HPTS module is properly loaded and initialized by checking
+ * that the minimum HPTS time is configured.
+ *
+ * Also checks that tcp_bind_threads lies in the range 0..2 and that the
+ * global tcp_hptsi_pace pointer is non-NULL.  No HPTS instance is created
+ * by this test; it only inspects existing globals.
+ */
+KTEST_FUNC(module_load)
+{
+ test_hpts_init();
+ KTEST_NEQUAL(tcp_min_hptsi_time, 0);
+ KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2);
+ KTEST_NEQUAL(tcp_hptsi_pace, NULL);
+ return (0);
+}
+
+/*
+ * Creates an HPTS instance backed by the mock function table, inspects it,
+ * and destroys it again without ever starting it.
+ *
+ * Instance-level checks: the entry array and cts_last_ran are allocated,
+ * the entry count is within 1..MAXCPU, at least one group exists, and the
+ * instance remembers the function table it was given.  Per-entry checks:
+ * each entry exists, records its own index as p_cpu, points back at the
+ * instance, and starts with an empty queue.
+ */
+KTEST_FUNC(hptsi_create_destroy)
+{
+	struct tcp_hptsi *pace;
+	uint32_t i;
+
+	test_hpts_init();
+
+	pace = tcp_hptsi_create(&test_funcs, false);
+	KTEST_NEQUAL(pace, NULL);
+	KTEST_NEQUAL(pace->rp_ent, NULL);
+	KTEST_NEQUAL(pace->cts_last_ran, NULL);
+	KTEST_VERIFY(pace->rp_num_hptss > 0);
+	KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */
+	KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */
+	KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */
+
+	/* Walk every per-CPU entry and confirm its initial state. */
+	i = 0;
+	while (i < pace->rp_num_hptss) {
+		KTEST_NEQUAL(pace->rp_ent[i], NULL);
+		KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i);
+		KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace);
+		KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+		i++;
+	}
+
+	tcp_hptsi_destroy(pace);
+
+	return (0);
+}
+
+/*
+ * Runs an HPTS instance through create -> start -> stop -> destroy.
+ *
+ * After start, entry 0 must carry a software-interrupt cookie (supplied
+ * here by the mock swi_add) and must still point back at the instance.
+ * Only entry 0 is inspected.
+ */
+KTEST_FUNC(hptsi_start_stop)
+{
+	struct tcp_hptsi *pace;
+	struct tcp_hpts_entry *hpts;
+
+	test_hpts_init();
+
+	pace = tcp_hptsi_create(&test_funcs, false);
+	KTEST_NEQUAL(pace, NULL);
+
+	tcp_hptsi_start(pace);
+
+	/* Verify that entries have threads started */
+	hpts = pace->rp_ent[0];
+	KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */
+	KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */
+
+	tcp_hptsi_stop(pace);
+	tcp_hptsi_destroy(pace);
+
+	return (0);
+}
+
+/*
+ * Validates that multiple tcp_hptsi instances can coexist independently, with
+ * different configurations and CPU assignments without interfering with each
+ * other.
+ *
+ * Two instances are created from the same mock function table.  Both must
+ * exist, both must own an entry array (checked for each instance before the
+ * arrays are compared), the arrays must be distinct, and a random CPU picked
+ * from either instance must fall inside that instance's entry count.
+ * Neither instance is started.
+ */
+KTEST_FUNC(hptsi_independence)
+{
+	struct tcp_hptsi *pace1, *pace2;
+	uint16_t cpu1, cpu2;
+
+	test_hpts_init();
+
+	pace1 = tcp_hptsi_create(&test_funcs, false);
+	pace2 = tcp_hptsi_create(&test_funcs, false);
+	KTEST_NEQUAL(pace1, NULL);
+	KTEST_NEQUAL(pace2, NULL);
+	/* Both arrays are dereferenced/compared below, so check both. */
+	KTEST_NEQUAL(pace1->rp_ent, NULL);
+	KTEST_NEQUAL(pace2->rp_ent, NULL);
+
+	cpu1 = tcp_hptsi_random_cpu(pace1);
+	cpu2 = tcp_hptsi_random_cpu(pace2);
+	KTEST_VERIFY(cpu1 < pace1->rp_num_hptss);
+	KTEST_VERIFY(cpu2 < pace2->rp_num_hptss);
+
+	/* Verify both instances have independent entry arrays */
+	KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent);
+	/* Verify they may have different CPU counts but both reasonable */
+	KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU);
+	KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU);
+
+	tcp_hptsi_destroy(pace1);
+	tcp_hptsi_destroy(pace2);
+
+	return (0);
+}
+
+/*
+ * Validates that custom function injection works correctly, ensuring that
+ * test-specific implementations of microuptime and others are properly
+ * called by the HPTS system.
+ *
+ * The mock call counters are inspected after each lifecycle stage:
+ *   create - microuptime and callout_init have been called;
+ *   start  - swi_add and callout_reset_sbt_on have been called, and one of
+ *            the two bind mocks too unless tcp_bind_threads is 0;
+ *   stop   - _callout_stop_safe and swi_remove have been called.
+ * Finally swi_add/swi_remove must balance and there must be at least as
+ * many callout stops as callout resets.
+ */
+KTEST_FUNC(function_injection)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ KTEST_EQUAL(pace->funcs, &test_funcs);
+ KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0);
+
+ tcp_hptsi_start(pace);
+ KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0);
+ KTEST_VERIFY(tcp_bind_threads == 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND] > 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0);
+
+ tcp_hptsi_stop(pace);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0);
+ KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0);
+
+ tcp_hptsi_destroy(pace);
+
+ /* Verify we have a reasonable balance of create/destroy calls */
+ KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]);
+
+ return (0);
+}
+
+/*
+ * Validates that a tcpcb can be properly initialized for HPTS compatibility,
+ * ensuring all required fields are set correctly and function pointers are
+ * valid for safe HPTS operations.
+ *
+ * The tcpcb comes from test_hpts_create_tcpcb(), which runs tcp_hpts_init()
+ * on it, so the checks describe the state right after that call and before
+ * any insertion into the wheel: not on a queue, neither TF2_HPTS_CPU_SET nor
+ * TF2_HPTS_CALLS set, HPTS bookkeeping fields zero, a CPU index within the
+ * instance's range, one inpcb reference, and INP_DROPPED clear.
+ */
+KTEST_FUNC(tcpcb_initialization)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Verify the tcpcb is properly initialized for HPTS */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_NEQUAL(tp->t_fb, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0);
+
+ /* Verify that HPTS-specific fields are initialized */
+ KTEST_EQUAL(tp->t_hpts_gencnt, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, 0);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_lro_cpu, 0);
+ KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1);
+ KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED));
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcb structures can be successfully inserted into and removed
+ * from the HPTS wheel, with proper state tracking and slot assignment during
+ * the process.
+ *
+ * Sequence: insert one tcpcb with a 10 usec timeout, confirm it is queued
+ * in the slot HPTS_USEC_TO_SLOTS(10) with no deferred request, that exactly
+ * one swi_sched call was made and that tcp_output was not called; then
+ * remove it and confirm the tcpcb and its HPTS entry are back to empty.
+ * The simulated clock is never advanced, and tcp_hptsi() is never run.
+ */
+KTEST_FUNC(tcpcb_insertion)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ uint32_t timeout_usecs = 10;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0);
+
+ INP_WLOCK(&tp->t_inpcb); /* insert/remove are called with the inpcb write-locked */
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0);
+ tcp_hpts_insert(pace, tp, timeout_usecs, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1);
+ KTEST_VERIFY(tcp_in_hpts(tp));
+ KTEST_VERIFY(tp->t_hpts_slot >= 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs));
+ /* NOTE(review): gencnt check left disabled; reason not visible here: KTEST_EQUAL(tp->t_hpts_gencnt, 1); */
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_VERIFY(!tcp_in_hpts(tp));
+
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the core HPTS timer functionality by verifying that scheduled
+ * tcpcb entries trigger tcp_output calls at appropriate times, simulating
+ * real-world timer-driven TCP processing.
+ *
+ * Time only moves when the test adds to test_time_usec, and the wheel only
+ * runs when the test calls tcp_hptsi() itself.  Timeline:
+ *   t = 0    insert one tcpcb with a 500 usec timeout;
+ *   t = 1    run the tcpcb's entry: 0 slots run, nothing processed;
+ *   t = 499  run every entry: slot 49 reached, tcp_output still not called;
+ *   t = 500  run every entry: slot 50 reached, tcp_output called once and
+ *            the mock removes the tcpcb (TP_REMOVE_FROM_HPTS bit 0 is set).
+ * The slot numbers 49 and 50 are the values the assertions expect for
+ * those times.
+ */
+KTEST_FUNC(timer_functionality)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+ int32_t slots_ran;
+ uint32_t i;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ dump_tcpcb(tp);
+ TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */
+
+ KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */
+ tcp_hpts_insert(pace, tp, 500, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ dump_tcpcb(tp);
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu]; /* the entry that owns this tcpcb */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(hpts->p_prev_slot, 0);
+ KTEST_EQUAL(hpts->p_cur_slot, 0);
+ KTEST_EQUAL(hpts->p_runningslot, 0);
+ KTEST_EQUAL(hpts->p_nxt_slot, 1);
+ KTEST_EQUAL(hpts->p_hpts_active, 0);
+
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+
+ /* Set our test flag to indicate the tcpcb should be removed from the
+ * wheel when tcp_output is called. */
+ TP_REMOVE_FROM_HPTS(tp) = 1;
+
+ /* Test early exit condition: advance time by insufficient amount */
+ KTEST_LOG(ctx, "Testing early exit with insufficient time advancement");
+ test_time_usec += 1; /* Very small advancement - should cause early exit */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 slots due to insufficient time advancement */
+ KTEST_EQUAL(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */
+
+ /* Wait for 498 more usecs and trigger the HPTS workers and verify
+ * nothing happens yet (total 499 usec) */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 498;
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+ /* Wait for 1 more usec and trigger the HPTS workers and verify it
+ * triggers tcp_output this time */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 1; /* simulated clock is now at 500 usec */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into
+ * the HPTS wheel, testing performance under high load conditions.
+ *
+ * 100000 tcpcbs are created, inserted with timeouts of 100..1099 usec,
+ * counted across all entries, removed again, and the queues are checked to
+ * be empty.  The wheel itself is never run.
+ *
+ * The checks inside the per-tcpcb and per-entry loops are written by hand
+ * rather than with the KTEST_* macros.  Their failure paths go through the
+ * common cleanup label so that the tcpcbs, the pointer array and the HPTS
+ * instance are released instead of being leaked by an early return.
+ * NOTE(review): the KTEST_* macro assertions outside the loops presumably
+ * still return directly on failure; confirm against the macro definitions.
+ */
+KTEST_FUNC(scalability_tcpcbs)
+{
+	struct tcp_hptsi *pace;
+	struct tcpcb **tcpcbs;
+	uint32_t i, num_tcpcbs = 100000, total_queued = 0;
+	int error = 0;
+
+	test_hpts_init();
+
+	pace = tcp_hptsi_create(&test_funcs, false);
+	KTEST_NEQUAL(pace, NULL);
+	tcp_hptsi_start(pace);
+
+	/* Allocate array to hold pointers to all tcpcbs */
+	tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+	KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+	/* Create a LOT of tcpcbs */
+	KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs);
+	for (i = 0; i < num_tcpcbs; i++) {
+		tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+		if (tcpcbs[i] == NULL) {
+			KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL");
+			error = EINVAL;
+			goto cleanup;
+		}
+	}
+
+	/* Insert all created tcpcbs into HPTS */
+	KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS...");
+	for (i = 0; i < num_tcpcbs; i++) {
+		INP_WLOCK(&tcpcbs[i]->t_inpcb);
+		tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+		/* Insert with varying future timeouts to distribute across slots */
+		tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL);
+		INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+	}
+
+	/* Verify total queue counts across all CPUs */
+	for (i = 0; i < pace->rp_num_hptss; i++) {
+		total_queued += pace->rp_ent[i]->p_on_queue_cnt;
+	}
+	KTEST_EQUAL(total_queued, num_tcpcbs);
+
+	for (i = 0; i < pace->rp_num_hptss; i++)
+		dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+	/* Remove all tcpcbs from HPTS */
+	KTEST_LOG(ctx, "Removing all tcpcbs from HPTS...");
+	for (i = 0; i < num_tcpcbs; i++) {
+		INP_WLOCK(&tcpcbs[i]->t_inpcb);
+		if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+			tcp_hpts_remove(pace, tcpcbs[i]);
+		}
+		INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+	}
+
+	/* Verify all queues are now empty */
+	for (i = 0; i < pace->rp_num_hptss; i++) {
+		if (pace->rp_ent[i]->p_on_queue_cnt != 0) {
+			KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0");
+			error = EINVAL;
+			break;
+		}
+	}
+
+cleanup:
+	/*
+	 * The array was allocated with M_ZERO, so slots that were never
+	 * filled are NULL and are skipped.  Anything still on the wheel is
+	 * taken off before its memory is released.
+	 */
+	for (i = 0; i < num_tcpcbs; i++) {
+		if (tcpcbs[i] == NULL)
+			continue;
+		INP_WLOCK(&tcpcbs[i]->t_inpcb);
+		if (tcpcbs[i]->t_in_hpts != IHPTS_NONE)
+			tcp_hpts_remove(pace, tcpcbs[i]);
+		INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+		test_hpts_free_tcpcb(tcpcbs[i]);
+	}
+	free(tcpcbs, M_TCPHPTS);
+	tcp_hptsi_stop(pace);
+	tcp_hptsi_destroy(pace);
+
+	return (error);
+}
+
+/*
+ * Validates wheel wrap scenarios where the timer falls significantly behind
+ * and needs to process more than one full wheel revolution worth of slots.
+ *
+ * 500 tcpcbs are inserted with timeouts spread evenly over the whole wheel,
+ * each flagged so the mock tcp_output removes it.  The simulated clock then
+ * jumps by NUM_OF_HPTSI_SLOTS + 5000 slots in one step and every entry is
+ * run once; each run is expected to report NUM_OF_HPTSI_SLOTS-1 slots and
+ * leave its queue empty.
+ *
+ * NOTE(review): the loop requires every entry to hold at least one tcpcb
+ * before it runs; whether 500 tcpcbs always reach every entry on machines
+ * with many CPUs depends on the CPU selection logic, which is not visible
+ * here - confirm.
+ */
+KTEST_FUNC(wheel_wrap_recovery)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ uint32_t i, timeout_usecs, num_tcpcbs = 500;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Allocate array to hold pointers to tcpcbs */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ /* Create tcpcbs and insert them across many slots */
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* mock tcp_output takes it off the wheel */
+
+ timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */
+
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Fast forward time significantly to trigger wheel wrap */
+ test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT;
+
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS-1); /* Should process all slots */
+ KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot,
+ pace->rp_ent[i]->p_prev_slot);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tcpcbs[i]);
+ }
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs
+ * when a tcpcb is being processed by the HPTS thread but gets removed.
+ *
+ * Two tcpcbs are pinned to CPU 0 and inserted with the same 100 usec
+ * timeout.  tp1 is then put into IHPTS_MOVING with t_hpts_slot = -1 by
+ * writing the fields directly under the HPTS lock (no HPTS API is used for
+ * that step).  After advancing the clock by 100 usec and running entry 0
+ * once, tcp_output must have been called exactly once and both tcpcbs must
+ * be off the wheel.
+ */
+KTEST_FUNC(tcpcb_moving_state)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create two tcpcbs on the same CPU/slot */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force them to the same CPU for predictable testing */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert both into the same slot */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, 100, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, 100, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ hpts = pace->rp_ent[0]; /* both tcpcbs were pinned to CPU 0 above */
+
+ /* Manually transition tp1 to MOVING state to simulate race condition */
+ HPTS_LOCK(hpts);
+ tp1->t_in_hpts = IHPTS_MOVING;
+ tp1->t_hpts_slot = -1; /* Mark for removal */
+ HPTS_UNLOCK(hpts);
+
+ /* Set time and run HPTS to process the moving state */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Shouldn't call on both */
+
+ /* tp1 should be cleaned up and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ /* tp2 should have been processed normally */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are
+ * properly handled and re-inserted into appropriate future slots after
+ * the wheel processes enough slots to accommodate the original request.
+ *
+ * Part 1: tp is inserted with a timeout of NUM_OF_HPTSI_SLOTS + 5000 slots.
+ * It must be queued with a non-zero t_hpts_request; after the clock moves
+ * forward by one full wheel and the entry is run, t_hpts_request must be 0.
+ *
+ * Part 2: tp2, pinned to the same CPU, is inserted with a timeout of three
+ * full wheels.  Its initial t_hpts_request must exceed NUM_OF_HPTSI_SLOTS.
+ * After each of two one-wheel clock steps plus a run, the request must be
+ * smaller than the initial value yet still non-zero.
+ */
+KTEST_FUNC(deferred_requests)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp, *tp2;
+ struct tcp_hpts_entry *hpts;
+ uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */
+ uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */
+ uint32_t initial_request;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Insert with a request that exceeds current wheel capacity */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Verify it was inserted with a deferred request */
+ dump_tcpcb(tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_request > 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Advance time to process deferred requests */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+
+ /* Process the wheel to handle deferred requests */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, hpts);
+ KTEST_GREATER_THAN(slots_ran, 0);
+ dump_tcpcb(tp);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+
+ /* Test incremental deferred request processing over multiple cycles */
+ KTEST_LOG(ctx, "Testing incremental deferred request processing");
+
+ /* Create a new connection with an even larger request */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify initial deferred request */
+ initial_request = tp2->t_hpts_request;
+ KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS);
+
+ /* Process one wheel cycle - should reduce but not eliminate request */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Request should be reduced but not zero */
+ KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */
+
+ /* For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT, we need ~3 cycles to complete.
+ * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS. */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true); /* result of this run is not checked */
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */
+ KTEST_VERIFY(tp2->t_hpts_request < initial_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */
+
+ /* Clean up second connection */
+ INP_WLOCK(&tp2->t_inpcb);
+ if (tp2->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp2);
+ }
+ INP_WUNLOCK(&tp2->t_inpcb);
+ test_hpts_free_tcpcb(tp2);
+
+ /* Clean up */
+ INP_WLOCK(&tp->t_inpcb);
+ if (tp->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp);
+ }
+ INP_WUNLOCK(&tp->t_inpcb);
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates CPU assignment and affinity mechanisms, including flowid-based
+ * assignment, random fallback scenarios, and explicit CPU setting. Tests
+ * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts.
+ *
+ * Four cases, each on its own tcpcb and each calling tcp_set_hpts() with
+ * the inpcb write-locked:
+ *   tp1     - flow type M_HASHTYPE_NONE: the CPU must be in range and
+ *             TF2_HPTS_CPU_SET must end up set;
+ *   tp2     - RSS TCP/IPv4 flow type with flowid 12345: same expectations;
+ *   tp2_dup - same flow type and flowid as tp2: must get tp2's CPU;
+ *   tp3     - CPU chosen by the test with TF2_HPTS_CPU_SET already set:
+ *             the CPU must be left unchanged.
+ *
+ * The CPU used for tp3 is 1 when the instance has more than one HPTS entry
+ * and 0 otherwise, so the index is always valid for the instance.
+ * The instance is created but never started.
+ */
+KTEST_FUNC(cpu_assignment)
+{
+	struct tcp_hptsi *pace;
+	struct tcpcb *tp1, *tp2, *tp2_dup, *tp3;
+	uint16_t explicit_cpu;
+
+	test_hpts_init();
+
+	pace = tcp_hptsi_create(&test_funcs, false);
+	KTEST_NEQUAL(pace, NULL);
+
+	/* Test random CPU assignment (no flowid) */
+	tp1 = test_hpts_create_tcpcb(ctx, pace);
+	KTEST_NEQUAL(tp1, NULL);
+	tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE;
+	INP_WLOCK(&tp1->t_inpcb);
+	tcp_set_hpts(pace, tp1);
+	INP_WUNLOCK(&tp1->t_inpcb);
+	KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss);
+	KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET);
+
+	/* Test flowid-based assignment */
+	tp2 = test_hpts_create_tcpcb(ctx, pace);
+	KTEST_NEQUAL(tp2, NULL);
+	tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+	tp2->t_inpcb.inp_flowid = 12345;
+	INP_WLOCK(&tp2->t_inpcb);
+	tcp_set_hpts(pace, tp2);
+	INP_WUNLOCK(&tp2->t_inpcb);
+	KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss);
+	KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET);
+
+	/* With the same flowid, should get same CPU assignment */
+	tp2_dup = test_hpts_create_tcpcb(ctx, pace);
+	KTEST_NEQUAL(tp2_dup, NULL);
+	tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+	tp2_dup->t_inpcb.inp_flowid = 12345;
+	INP_WLOCK(&tp2_dup->t_inpcb);
+	tcp_set_hpts(pace, tp2_dup);
+	INP_WUNLOCK(&tp2_dup->t_inpcb);
+	KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu);
+
+	/*
+	 * Test explicit CPU setting.  Use CPU 1 when it exists, otherwise
+	 * fall back to CPU 0 so the index stays inside rp_ent[].
+	 */
+	explicit_cpu = (pace->rp_num_hptss > 1) ? 1 : 0;
+	tp3 = test_hpts_create_tcpcb(ctx, pace);
+	KTEST_NEQUAL(tp3, NULL);
+	tp3->t_hpts_cpu = explicit_cpu;
+	tp3->t_flags2 |= TF2_HPTS_CPU_SET;
+	INP_WLOCK(&tp3->t_inpcb);
+	tcp_set_hpts(pace, tp3);
+	INP_WUNLOCK(&tp3->t_inpcb);
+	KTEST_EQUAL(tp3->t_hpts_cpu, explicit_cpu);
+
+	test_hpts_free_tcpcb(tp1);
+	test_hpts_free_tcpcb(tp2);
+	test_hpts_free_tcpcb(tp2_dup);
+	test_hpts_free_tcpcb(tp3);
+	tcp_hptsi_destroy(pace);
+
+	return (0);
+}
+
+/*
+ * Validates edge cases in slot calculation including boundary conditions
+ * around slot 0, maximum slots, and slot wrapping arithmetic.
+ *
+ * One tcpcb is inserted and removed three times, with timeouts of 0 usec,
+ * (NUM_OF_HPTSI_SLOTS - 1) slots, and 1 usec.  Each insertion must leave it
+ * IHPTS_ONQUEUE; the 0 usec case must also land in a valid slot and the
+ * 1 usec case in slot HPTS_USEC_TO_SLOTS(1).  The simulated clock is never
+ * advanced and the wheel is never run.
+ */
+KTEST_FUNC(slot_boundary_conditions)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Test insertion at slot 0 */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test insertion at maximum slot value */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* only the queue state is checked for this case */
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test very small timeout values */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 1, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS behavior under high load conditions, including proper
+ * processing of many connections and connection count tracking.
+ *
+ * DEFAULT_CONNECTION_THRESHOLD + 50 tcpcbs are pinned to CPU 0 and inserted
+ * with a 100 usec timeout, each flagged so the mock tcp_output removes it.
+ * After the clock advances 100 usec and entry 0 is run once, tcp_output
+ * must have been called once per tcpcb and the queue must be empty.
+ * The sleep interval itself is not inspected by this test.
+ */
+KTEST_FUNC(dynamic_sleep_adjustment)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ struct tcp_hpts_entry *hpts;
+ uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create many connections to exceed threshold */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */
+ tcp_hpts_insert(pace, tcpcbs[i], 100, NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ hpts = pace->rp_ent[0];
+ dump_hpts_entry(ctx, hpts);
+
+ /* Verify we're above threshold */
+ KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD);
+
+ /* Run HPTS to process many connections */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify HPTS processed slots and connections correctly */
+ KTEST_GREATER_THAN(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs);
+
+ /* Verify all connections were removed from queue */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates handling of concurrent insert/remove operations and race conditions
+ * between HPTS processing and user operations.
+ *
+ * As written, every step runs sequentially in the calling thread: two
+ * tcpcbs are pinned to CPU 0, inserted into the same slot, and removed one
+ * after the other.  The checks confirm that removing tp1 leaves tp2 queued
+ * and that the entry's queue count goes 2 -> 1 -> 0.  The wheel is never
+ * run and the simulated clock is never advanced.
+ */
+KTEST_FUNC(concurrent_operations)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force all to CPU 0 */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert tp1 */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, 100, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Insert tp2 into same slot */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, 100, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify both are inserted */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify they're both assigned to the same slot */
+ KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot);
+
+ /* Verify queue count reflects both connections */
+ KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */
+ hpts = pace->rp_ent[tp1->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 2);
+
+ /* Remove tp1 while tp2 is still there */
+ INP_WLOCK(&tp1->t_inpcb);
+ tcp_hpts_remove(pace, tp1);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify tp1 removed, tp2 still there */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify queue count decreased by one */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+ /* Remove tp2 */
+ INP_WLOCK(&tp2->t_inpcb);
+ tcp_hpts_remove(pace, tp2);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ /* Verify queue is now completely empty */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the queued segments processing path via tfb_do_queued_segments,
+ * which is an alternative to direct tcp_output calls.
+ *
+ * A zeroed placeholder mbuf is linked onto the connection's t_inqueue and
+ * TF2_SUPPORTS_MBUFQ is set before the connection is inserted; after one
+ * tcp_hptsi() pass the test expects exactly one call counted under
+ * CCNT_TCP_TFB_DO_QUEUED_SEGMENTS and the connection off the wheel.
+ *
+ * Returns 0 once all checks have passed.
+ */
+KTEST_FUNC(queued_segments_processing)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ struct mbuf *fake_mbuf;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Create a minimal fake mbuf that has valid STAILQ pointers */
+ /* Allocated from M_TCPHPTS (not the mbuf allocator), so it must be released with free() */
+ fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_NEQUAL(fake_mbuf, NULL);
+
+ /* Set up for queued segments path */
+ /* NOTE(review): flags and queue are modified before INP_WLOCK is taken, unlike sibling tests */
+ tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ);
+ STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_insert(pace, tp, 100, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Run HPTS and verify queued segments path is taken */
+ /* Advance the test clock by the same 100 usecs used as the insert timeout */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Only a non-negative slot count is required here, not forward progress */
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1);
+
+ /* Connection should be removed from HPTS after processing */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+
+ /* Clean up the fake mbuf if it's still in the queue */
+ /* Handles either outcome: the callback may or may not have dequeued it */
+ if (!STAILQ_EMPTY(&tp->t_inqueue)) {
+ struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue);
+ STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt);
+ free(m, M_TCPHPTS);
+ }
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the direct wake mechanism and wake inhibition logic when
+ * the connection count exceeds thresholds.
+ *
+ * The entry's p_on_queue_cnt is set by hand to simulate load; the connection
+ * created here is only used to select an HPTS entry and is never queued.
+ *
+ * All hpts fields are written and sampled while HPTS_LOCK is held, and the
+ * checks run only after the lock has been dropped, as the other tests in
+ * this file do, so that a failing check cannot leave the function with the
+ * entry lock still held.
+ *
+ * Returns 0 once all checks have passed.
+ */
+KTEST_FUNC(direct_wake_mechanism)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ int wake_scheduled, direct_wake, swi_calls;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Test direct wake when not over threshold */
+ HPTS_LOCK(hpts);
+ hpts->p_on_queue_cnt = 50; /* Below threshold */
+ hpts->p_hpts_wake_scheduled = 0;
+ tcp_hpts_wake(hpts);
+ /* Sample results under the lock; assert after releasing it */
+ wake_scheduled = hpts->p_hpts_wake_scheduled;
+ swi_calls = call_counts[CCNT_SWI_SCHED];
+ HPTS_UNLOCK(hpts);
+
+ KTEST_EQUAL(wake_scheduled, 1);
+ KTEST_EQUAL(swi_calls, 1);
+
+ /* Test wake inhibition when over threshold */
+ HPTS_LOCK(hpts);
+ /* Reset the state left behind by the first scenario */
+ hpts->p_hpts_wake_scheduled = 0;
+ call_counts[CCNT_SWI_SCHED] = 0;
+ hpts->p_on_queue_cnt = 200; /* Above threshold */
+ hpts->p_direct_wake = 1; /* Request direct wake */
+ tcp_hpts_wake(hpts);
+ wake_scheduled = hpts->p_hpts_wake_scheduled;
+ direct_wake = hpts->p_direct_wake;
+ swi_calls = call_counts[CCNT_SWI_SCHED];
+ HPTS_UNLOCK(hpts);
+
+ KTEST_EQUAL(wake_scheduled, 0); /* Should be inhibited */
+ KTEST_EQUAL(direct_wake, 0); /* Should be cleared */
+ KTEST_EQUAL(swi_calls, 0); /* No SWI scheduled */
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS collision detection when attempting to run HPTS while
+ * it's already active.
+ *
+ * No connection is created: the entry for CPU 0 is flagged active by hand
+ * and tcp_hptsi() is then called with from_callout == false; the test
+ * expects it to report zero slots run.
+ *
+ * Returns 0 once all checks have passed.
+ */
+KTEST_FUNC(hpts_collision_detection)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Mark HPTS as active */
+ HPTS_LOCK(hpts);
+ hpts->p_hpts_active = 1;
+ HPTS_UNLOCK(hpts);
+
+ /* Attempt to run HPTS again - should detect collision */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 indicating no work done due to collision */
+ KTEST_EQUAL(slots_ran, 0);
+
+ /*
+ * NOTE(review): p_hpts_active is not cleared by this test before the
+ * instance is stopped and destroyed - confirm that teardown tolerates it.
+ */
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates generation count handling for race condition detection between
+ * HPTS processing and connection insertion/removal operations.
+ *
+ * Two phases, both on the HPTS entry for CPU 0:
+ *  1. tp1 is inserted with a timeout of slot_to_test slots; the test checks
+ *     that tp1 records the slot's gencnt and that processing the wheel past
+ *     that slot bumps the slot's gencnt by one.
+ *  2. tp2 is inserted, its recorded gencnt is overwritten with a value that
+ *     differs from the one it was given, and the wheel is processed again;
+ *     the test expects tp2 to end up off the wheel regardless.
+ *
+ * Returns 0 once all checks have passed.
+ */
+KTEST_FUNC(generation_count_validation)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp1, *tp2;
+ uint32_t initial_gencnt, slot_to_test = 10;
+ /* Timeout chosen so the insert is expected to land in slot_to_test */
+ uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT;
+ uint32_t tp2_original_gencnt;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Record initial generation count for the test slot */
+ initial_gencnt = hpts->p_hptss[slot_to_test].gencnt;
+
+ /* Create and insert first connection */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ tp1->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, timeout_usecs, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify connection stored the generation count */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test);
+ KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt);
+
+ /* Create second connection but don't insert yet */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ /* Force generation count increment by processing the slot */
+ /* Advance the test clock one slot beyond slot_to_test */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify processing occurred */
+ KTEST_VERIFY(slots_ran > 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+
+ /* Verify generation count was incremented */
+ KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1);
+
+ /* Verify first connection was processed and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+
+ /* Insert second connection and record its generation count */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, timeout_usecs, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify connection was inserted successfully */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Record the generation count that tp2 received */
+ tp2_original_gencnt = tp2->t_hpts_gencnt;
+
+ /* Test generation count mismatch detection during processing */
+ /*
+ * Overwrite the stored generation count (original + 100) so that it no
+ * longer equals the value recorded at insertion, simulating a race.
+ */
+ tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */
+
+ /* Process the slot to trigger generation count validation */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Connection should be processed despite generation count mismatch */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */
+
+ /* The key test: HPTS should handle mismatched generation counts gracefully */
+ KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Table of the test cases defined in this file; it is handed to the ktest
+ * framework through KTEST_MODULE_DECLARE() below.  A KTEST_FUNC() that is
+ * not listed here is not part of the table.
+ */
+static const struct ktest_test_info tests[] = {
+ KTEST_INFO(module_load),
+ KTEST_INFO(hptsi_create_destroy),
+ KTEST_INFO(hptsi_start_stop),
+ KTEST_INFO(hptsi_independence),
+ KTEST_INFO(function_injection),
+ KTEST_INFO(tcpcb_initialization),
+ KTEST_INFO(tcpcb_insertion),
+ KTEST_INFO(timer_functionality),
+ KTEST_INFO(scalability_tcpcbs),
+ KTEST_INFO(wheel_wrap_recovery),
+ KTEST_INFO(tcpcb_moving_state),
+ KTEST_INFO(deferred_requests),
+ KTEST_INFO(cpu_assignment),
+ KTEST_INFO(slot_boundary_conditions),
+ KTEST_INFO(dynamic_sleep_adjustment),
+ KTEST_INFO(concurrent_operations),
+ KTEST_INFO(queued_segments_processing),
+ KTEST_INFO(direct_wake_mechanism),
+ KTEST_INFO(hpts_collision_detection),
+ KTEST_INFO(generation_count_validation),
+};
+
+/* Declare the ktest module; presumably the DEPEND line ties it to the tcphpts module - confirm */
+KTEST_MODULE_DECLARE(ktest_tcphpts, tests);
+KTEST_MODULE_DEPEND(ktest_tcphpts, tcphpts);
diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c
index 43587285fe26..ac1a27a4290a 100644
--- a/sys/netinet/tcp_lro_hpts.c
+++ b/sys/netinet/tcp_lro_hpts.c
@@ -29,6 +29,8 @@
#include "opt_inet6.h"
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
@@ -62,6 +64,7 @@
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#ifdef TCP_BLACKBOX
#include <netinet/tcp_log_buf.h>
#endif
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index f2d7867df9b4..66983edcdd73 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -480,7 +480,7 @@ bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
-bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot,
+bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay,
uint8_t which);
static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
@@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
static void
-bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay,
uint32_t del_by, uint32_t cts, uint32_t sloton,
uint32_t prev_delay);
static void
@@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr)
}
static void
-bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
+bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len)
{
struct inpcb *inp = tptoinpcb(tp);
struct hpts_diag diag;
@@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_timer_exp = 0;
prev_delay = bbr->r_ctl.rc_last_delay_val;
if (bbr->r_ctl.rc_last_delay_val &&
- (slot == 0)) {
+ (pacing_delay == 0)) {
/*
* If a previous pacer delay was in place we
* are not coming from the output side (where
* we calculate a delay, more likely a timer).
*/
- slot = bbr->r_ctl.rc_last_delay_val;
+ pacing_delay = bbr->r_ctl.rc_last_delay_val;
if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
/* Compensate for time passed */
delay_calc = cts - bbr->rc_pacer_started;
- if (delay_calc <= slot)
- slot -= delay_calc;
+ if (delay_calc <= pacing_delay)
+ pacing_delay -= delay_calc;
}
}
/* Do we have early to make up for by pushing out the pacing time? */
if (bbr->r_agg_early_set) {
- bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
- slot += bbr->r_ctl.rc_agg_early;
+ bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2);
+ pacing_delay += bbr->r_ctl.rc_agg_early;
bbr->r_ctl.rc_agg_early = 0;
bbr->r_agg_early_set = 0;
}
/* Are we running a total debt that needs to be compensated for? */
if (bbr->r_ctl.rc_hptsi_agg_delay) {
- if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
+ if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) {
/* We nuke the delay */
- slot -= bbr->r_ctl.rc_hptsi_agg_delay;
+ pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay;
bbr->r_ctl.rc_hptsi_agg_delay = 0;
} else {
/* We nuke some of the delay, put in a minimal 100usecs */
- bbr->r_ctl.rc_hptsi_agg_delay -= slot;
- bbr->r_ctl.rc_last_delay_val = slot = 100;
+ bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay;
+ bbr->r_ctl.rc_last_delay_val = pacing_delay = 100;
}
}
- bbr->r_ctl.rc_last_delay_val = slot;
+ bbr->r_ctl.rc_last_delay_val = pacing_delay;
hpts_timeout = bbr_timer_start(tp, bbr, cts);
if (tp->t_flags & TF_DELACK) {
if (bbr->rc_in_persist == 0) {
@@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
hpts_timeout = delayed_ack;
}
- if (slot) {
+ if (pacing_delay) {
/* Mark that we have a pacing timer up */
BBR_STAT_INC(bbr_paced_segments);
bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
@@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* wheel, we resort to a keep-alive timer if its configured.
*/
if ((hpts_timeout == 0) &&
- (slot == 0)) {
+ (pacing_delay == 0)) {
if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) {
/*
@@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
if (left < hpts_timeout)
hpts_timeout = left;
}
- if (bbr->r_ctl.rc_incr_tmrs && slot &&
+ if (bbr->r_ctl.rc_incr_tmrs && pacing_delay &&
(bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
* If configured to do so, and the timer is either
@@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* this extra delay but this is easier and being more
* conservative is probably better.
*/
- hpts_timeout += slot;
+ hpts_timeout += pacing_delay;
}
if (hpts_timeout) {
/*
@@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
} else
bbr->r_ctl.rc_timer_exp = 0;
- if ((slot) &&
+ if ((pacing_delay) &&
(bbr->rc_use_google ||
bbr->output_error_seen ||
- (slot <= hpts_timeout)) ) {
+ (pacing_delay <= hpts_timeout)) ) {
/*
* Tell LRO that it can queue packets while
* we pace.
@@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
bbr->rc_pacer_started = cts;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, pacing_delay, &diag);
bbr->rc_timer_first = 0;
bbr->bbr_timer_src = frm;
- bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
+ bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1);
bbr_log_hpts_diag(bbr, cts, &diag);
} else if (hpts_timeout) {
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
/*
- * We add the flag here as well if the slot is set,
+ * We add the flag here as well if the pacing delay is set,
* since hpts will call in to clear the queue first before
* calling the output routine (which does our timers).
* We don't want to set the flag if its just a timer
@@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
* on a keep-alive timer and a request comes in for
* more data.
*/
- if (slot)
+ if (pacing_delay)
bbr->rc_pacer_started = cts;
if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
(bbr->rc_cwnd_limited == 0)) {
@@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
TF2_DONT_SACK_QUEUE);
}
bbr->bbr_timer_src = frm;
- bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
+ bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0);
bbr_log_hpts_diag(bbr, cts, &diag);
bbr->rc_timer_first = 1;
}
bbr->rc_tmr_stopped = 0;
- bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
+ bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay);
}
static void
@@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock
}
/*
* Ok the timer originally started is not what we want now. We will
- * force the hpts to be stopped if any, and restart with the slot
- * set to what was in the saved slot.
+ * force the hpts to be stopped if any, and restart with the pacing
+ * delay set to what was in the saved delay.
*/
wrong_timer:
if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
@@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
log.u_bbr.flex4 = diag->inp_hptsslot;
- log.u_bbr.flex5 = diag->slot_remaining;
+ log.u_bbr.flex5 = diag->time_remaining;
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
@@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.delRate = diag->maxslots;
- log.u_bbr.cur_del_rate = diag->p_curtick;
- log.u_bbr.cur_del_rate <<= 32;
- log.u_bbr.cur_del_rate |= diag->p_lasttick;
TCP_LOG_EVENTP(bbr->rc_tp, NULL,
&bbr->rc_inp->inp_socket->so_rcv,
&bbr->rc_inp->inp_socket->so_snd,
@@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
}
static void
-bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
{
if (tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
@@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u
log.u_bbr.flex1 = bbr->bbr_timer_src;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
- log.u_bbr.flex4 = slot;
+ log.u_bbr.flex4 = pacing_delay;
log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
log.u_bbr.pkts_out = bbr->rc_tp->t_flags2;
@@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
}
static void
-bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
+bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
{
if (tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = del_by;
log.u_bbr.flex3 = prev_delay;
log.u_bbr.flex4 = line;
@@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t
left = bbr->r_ctl.rc_timer_exp - cts;
ret = -3;
bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
- tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left));
+ tcp_hpts_insert(tp, left, NULL);
return (1);
}
bbr->rc_tmr_stopped = 0;
@@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
else
time_since_send = 0;
if (bbr->r_ctl.rc_last_delay_val > time_since_send) {
- /* Cut down our slot time */
+ /* Cut down our pacing delay */
bbr->r_ctl.rc_last_delay_val -= time_since_send;
} else {
bbr->r_ctl.rc_last_delay_val = 0;
@@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t
* sequence 1 for 10 bytes. In such an example the r_start would be
* 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
* This means that r_end is actually the first sequence for the next
- * slot (11).
+ * segment (11).
*
*/
INP_WLOCK_ASSERT(tptoinpcb(tp));
@@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
struct bbr_sendmap *rsm = NULL;
int32_t tso, mtu;
struct tcpopt to;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
struct inpcb *inp;
struct sockbuf *sb;
bool hpts_calling;
@@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
delay_calc -= bbr->r_ctl.rc_last_delay_val;
else {
/*
- * We are early setup to adjust
- * our slot time.
+ * We are early setup to adjust our pacing delay.
*/
uint64_t merged_val;
@@ -12104,7 +12098,7 @@ again:
#endif
error = 0;
tso = 0;
- slot = 0;
+ pacing_delay = 0;
mtu = 0;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
sb_offset = tp->snd_max - tp->snd_una;
@@ -12126,7 +12120,7 @@ recheck_resend:
tot_len = tp->t_maxseg;
if (hpts_calling)
/* Retry in a ms */
- slot = 1001;
+ pacing_delay = 1001;
goto just_return_nolock;
}
TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
@@ -12699,9 +12693,9 @@ just_return:
SOCK_SENDBUF_UNLOCK(so);
just_return_nolock:
if (tot_len)
- slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
if (bbr->rc_no_pacing)
- slot = 0;
+ pacing_delay = 0;
if (tot_len == 0) {
if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
tp->snd_wnd) {
@@ -12751,7 +12745,7 @@ just_return_nolock:
/* Dont update the time if we did not send */
bbr->r_ctl.rc_last_delay_val = 0;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
+ bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len);
bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
/* Make sure snd_nxt is drug up */
@@ -12787,7 +12781,7 @@ send:
flags &= ~TH_FIN;
if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
/* Lets not send this */
- slot = 0;
+ pacing_delay = 0;
goto just_return;
}
}
@@ -13053,7 +13047,7 @@ send:
/*
* We have outstanding data, don't send a fin by itself!.
*/
- slot = 0;
+ pacing_delay = 0;
goto just_return;
}
/*
@@ -13763,7 +13757,7 @@ nomore:
if (tp->snd_cwnd < maxseg)
tp->snd_cwnd = maxseg;
}
- slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
BBR_STAT_INC(bbr_saw_enobuf);
if (bbr->bbr_hdrw_pacing)
counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
@@ -13812,18 +13806,18 @@ nomore:
}
/*
* Nuke all other things that can interfere
- * with slot
+ * with pacing delay
*/
if ((tot_len + len) && (len >= tp->t_maxseg)) {
- slot = bbr_get_pacing_delay(bbr,
+ pacing_delay = bbr_get_pacing_delay(bbr,
bbr->r_ctl.rc_bbr_hptsi_gain,
(tot_len + len), cts, 0);
- if (slot < bbr_error_base_paceout)
- slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ if (pacing_delay < bbr_error_base_paceout)
+ pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
} else
- slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
+ bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay,
tot_len);
return (error);
}
@@ -13841,9 +13835,9 @@ nomore:
}
/* FALLTHROUGH */
default:
- slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
+ pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
bbr->rc_output_starts_timer = 1;
- bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
+ bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0);
return (error);
}
#ifdef STATS
@@ -13981,12 +13975,12 @@ skip_again:
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
/*
- * Calculate/Re-Calculate the hptsi slot in usecs based on
+ * Calculate/Re-Calculate the hptsi timeout in usecs based on
* what we have sent so far
*/
- slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
+ pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
if (bbr->rc_no_pacing)
- slot = 0;
+ pacing_delay = 0;
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
enobufs:
@@ -13999,8 +13993,8 @@ enobufs:
(more_to_rxt ||
((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
/* Rack cheats and shotguns out all rxt's 1ms apart */
- if (slot > 1000)
- slot = 1000;
+ if (pacing_delay > 1000)
+ pacing_delay = 1000;
}
if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
/*
@@ -14014,7 +14008,7 @@ enobufs:
tcp_bbr_tso_size_check(bbr, cts);
}
}
- bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
+ bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len);
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
/* Make sure snd_nxt is drug up */
tp->snd_nxt = tp->snd_max;
@@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_USECS_PER_SLOT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, toval, &diag);
bbr_log_hpts_diag(bbr, cts, &diag);
}
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 11ef5ba706c5..c7962b57a69e 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -250,11 +250,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co
static int32_t rack_persist_min = 250000; /* 250usec */
static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */
-static uint32_t rack_max_reduce = 10; /* Percent we can reduce slot by */
+static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
-static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */
+static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */
static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
@@ -278,7 +278,7 @@ static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
-static int32_t rack_slot_reduction = 4;
+static int32_t rack_pacing_delay_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
@@ -478,7 +478,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack,
uint16_t flex7, uint8_t mod);
static void
-rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
@@ -1107,7 +1107,7 @@ rack_init_sysctls(void)
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "burst_reduces", CTLFLAG_RW,
- &rack_slot_reduction, 4,
+ &rack_pacing_delay_reduction, 4,
"When doing only burst mitigation what is the reduce divisor");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1399,7 +1399,7 @@ rack_init_sysctls(void)
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
&rack_max_reduce, 10,
- "Max percentage we will reduce slot by for pacing when we are behind");
+ "Max percentage we will reduce pacing delay by for pacing when we are behind");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
@@ -2700,7 +2700,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
}
static void
-rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
+rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
{
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
@@ -2710,7 +2710,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.flex1 = rack->rc_tp->t_srtt;
log.u_bbr.flex2 = to;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
- log.u_bbr.flex4 = slot;
+ log.u_bbr.flex4 = pacing_delay;
log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
log.u_bbr.flex7 = rack->rc_in_persist;
@@ -3034,14 +3034,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
}
static void
-rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line)
+rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line)
{
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
if (rack->rack_no_prr)
log.u_bbr.flex2 = 0;
else
@@ -3139,7 +3139,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg
}
static void
-rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
+rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay,
uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
{
if (tcp_bblogging_on(rack->rc_tp)) {
@@ -3148,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
memset(&log, 0, sizeof(log));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = reason;
if (rack->rack_no_prr)
@@ -6482,7 +6482,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.flex2 = diag->p_cur_slot;
log.u_bbr.flex3 = diag->slot_req;
log.u_bbr.flex4 = diag->inp_hptsslot;
- log.u_bbr.flex5 = diag->slot_remaining;
+ log.u_bbr.flex5 = diag->time_remaining;
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
@@ -6497,9 +6497,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
log.u_bbr.delRate = diag->maxslots;
- log.u_bbr.cur_del_rate = diag->p_curtick;
- log.u_bbr.cur_del_rate <<= 32;
- log.u_bbr.cur_del_rate |= diag->p_lasttick;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -6532,14 +6529,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin
static void
rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
- int32_t slot, uint32_t tot_len_this_send, int sup_rack)
+ int32_t usecs, uint32_t tot_len_this_send, int sup_rack)
{
struct hpts_diag diag;
struct inpcb *inp = tptoinpcb(tp);
struct timeval tv;
uint32_t delayed_ack = 0;
uint32_t hpts_timeout;
- uint32_t entry_slot = slot;
+ uint32_t entry_usecs = usecs;
uint8_t stopped;
uint32_t left = 0;
uint32_t us_cts;
@@ -6560,7 +6557,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
rack->r_ctl.rc_hpts_flags = 0;
us_cts = tcp_get_usecs(&tv);
/* Now early/late accounting */
- rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0);
if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
/*
* We have a early carry over set,
@@ -6571,7 +6568,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* penalize the next timer for being awoke
* by an ack aka the rc_agg_early (non-paced mode).
*/
- slot += rack->r_ctl.rc_agg_early;
+ usecs += rack->r_ctl.rc_agg_early;
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
}
@@ -6583,29 +6580,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* really depends on what
* the current pacing time is.
*/
- if (rack->r_ctl.rc_agg_delayed >= slot) {
+ if (rack->r_ctl.rc_agg_delayed >= usecs) {
/*
* We can't compensate for it all.
* And we have to have some time
* on the clock. We always have a min
- * 10 slots (10 x 10 i.e. 100 usecs).
+ * 10 HPTS timer units (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_USECS_PER_SLOT) {
+ if (usecs <= HPTS_USECS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot);
- slot = HPTS_USECS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs);
+ usecs = HPTS_USECS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT);
- slot = HPTS_USECS_PER_SLOT;
+ rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT);
+ usecs = HPTS_USECS_PER_SLOT;
}
} else {
- slot -= rack->r_ctl.rc_agg_delayed;
+ usecs -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_USECS_PER_SLOT) {
- rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot;
- slot = HPTS_USECS_PER_SLOT;
+ if (usecs < HPTS_USECS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs;
+ usecs = HPTS_USECS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
@@ -6614,17 +6611,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
/* r_use_hpts_min is on and so is DGP */
uint32_t max_red;
- max_red = (slot * rack->r_ctl.max_reduction) / 100;
+ max_red = (usecs * rack->r_ctl.max_reduction) / 100;
if (max_red >= rack->r_ctl.rc_agg_delayed) {
- slot -= rack->r_ctl.rc_agg_delayed;
+ usecs -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
} else {
- slot -= max_red;
+ usecs -= max_red;
rack->r_ctl.rc_agg_delayed -= max_red;
}
}
if ((rack->r_use_hpts_min == 1) &&
- (slot > 0) &&
+ (usecs > 0) &&
(rack->dgp_on == 1)) {
/*
* We are enforcing a min pacing timer
@@ -6633,8 +6630,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
uint32_t min;
min = get_hpts_min_sleep_time();
- if (min > slot) {
- slot = min;
+ if (min > usecs) {
+ usecs = min;
}
}
hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
@@ -6652,7 +6649,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* wheel, we resort to a keep-alive timer if its configured.
*/
if ((hpts_timeout == 0) &&
- (slot == 0)) {
+ (usecs == 0)) {
if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING)) {
/*
@@ -6709,10 +6706,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
hpts_timeout = 0x7ffffffe;
rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
}
- rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
if ((rack->gp_ready == 0) &&
(rack->use_fixed_rate == 0) &&
- (hpts_timeout < slot) &&
+ (hpts_timeout < usecs) &&
(rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
* We have no good estimate yet for the
@@ -6722,7 +6719,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* pace that long since we know the calculation
* so far is not accurate.
*/
- slot = hpts_timeout;
+ usecs = hpts_timeout;
}
/**
* Turn off all the flags for queuing by default. The
@@ -6754,11 +6751,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* so LRO can call into us.
*/
tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY);
- if (slot) {
+ if (usecs) {
rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
- rack->r_ctl.rc_last_output_to = us_cts + slot;
+ rack->r_ctl.rc_last_output_to = us_cts + usecs;
/*
- * A pacing timer (slot) is being set, in
+ * A pacing timer (usecs microseconds) is being set, in
* such a case we cannot send (we are blocked by
* the timer). So lets tell LRO that it should not
* wake us unless there is a SACK. Note this only
@@ -6799,20 +6796,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
}
if ((rack->use_rack_rr) &&
(rack->r_rr_config < 2) &&
- ((hpts_timeout) && (hpts_timeout < slot))) {
+ ((hpts_timeout) && (hpts_timeout < usecs))) {
/*
* Arrange for the hpts to kick back in after the
* t-o if the t-o does not cause a send.
*/
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
} else {
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, usecs, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 1);
}
} else if (hpts_timeout) {
/*
@@ -6824,22 +6819,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* at the start of this block) are good enough.
*/
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, hpts_timeout, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
- rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
+ rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
} else {
/* No timer starting */
#ifdef INVARIANTS
if (SEQ_GT(tp->snd_max, tp->snd_una)) {
- panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
- tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
+ panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?",
+ tp, rack, tot_len_this_send, cts, usecs, hpts_timeout);
}
#endif
}
rack->rc_tmr_stopped = 0;
- if (slot)
- rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
+ if (usecs)
+ rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__);
}
static void
@@ -8016,7 +8010,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8
rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
ret = -3;
left = rack->r_ctl.rc_timer_exp - cts;
- tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left));
+ tcp_hpts_insert(tp, left, NULL);
rack_log_to_processing(rack, cts, ret, left);
return (1);
}
@@ -14377,8 +14371,7 @@ rack_switch_failed(struct tcpcb *tp)
}
} else
toval = HPTS_USECS_PER_SLOT;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, toval, &diag);
rack_log_hpts_diag(rack, cts, &diag, &tv);
}
@@ -14973,8 +14966,7 @@ rack_init(struct tcpcb *tp, void **ptr)
if (tov) {
struct hpts_diag diag;
- (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov),
- __LINE__, &diag);
+ tcp_hpts_insert(tp, tov, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
}
}
@@ -16367,7 +16359,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
struct rack_sendmap *rsm;
int32_t prev_state = 0;
int no_output = 0;
- int slot_remaining = 0;
+ int time_remaining = 0;
#ifdef TCP_ACCOUNTING
int ack_val_set = 0xf;
#endif
@@ -16416,7 +16408,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* could be, if a sack is present, we want to be awoken and
* so should process the packets.
*/
- slot_remaining = rack->r_ctl.rc_last_output_to - us_cts;
+ time_remaining = rack->r_ctl.rc_last_output_to - us_cts;
if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) {
no_output = 1;
} else {
@@ -16436,7 +16428,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
(*ts_ptr == TCP_LRO_TS_OPTION)))
no_output = 1;
}
- if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) {
+ if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) {
/*
* It is unrealistic to think we can pace in less than
* the minimum granularity of the pacer (def:250usec). So
@@ -16919,10 +16911,10 @@ do_output_now:
(tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
- * the remaining time (slot_remaining) to restart the timer.
+ * the remaining time (time_remaining) to restart the timer.
*/
- KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
- rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0);
+ KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
+ rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0);
rack_free_trim(rack);
}
/* Clear the flag, it may have been cleared by output but we may not have */
@@ -17102,7 +17094,7 @@ check_it:
}
static void
-rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
int line, struct rack_sendmap *rsm, uint8_t quality)
{
@@ -17125,7 +17117,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
}
}
memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = slot;
+ log.u_bbr.flex1 = pacing_delay;
log.u_bbr.flex2 = len;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
@@ -17284,25 +17276,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin
}
static int32_t
-pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
+pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
{
uint64_t lentim, fill_bw;
rack->r_via_fill_cw = 0;
if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
- return (slot);
+ return (pacing_delay);
if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
- return (slot);
+ return (pacing_delay);
if (rack->r_ctl.rc_last_us_rtt == 0)
- return (slot);
+ return (pacing_delay);
if (rack->rc_pace_fill_if_rttin_range &&
(rack->r_ctl.rc_last_us_rtt >=
(get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
/* The rtt is huge, N * smallest, lets not fill */
- return (slot);
+ return (pacing_delay);
}
if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
- return (slot);
+ return (pacing_delay);
/*
* first lets calculate the b/w based on the last us-rtt
* and the the smallest send window.
@@ -17368,7 +17360,7 @@ at_lt_bw:
if (non_paced)
*rate_wanted = fill_bw;
if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
- return (slot);
+ return (pacing_delay);
rack->r_via_fill_cw = 1;
if (rack->r_rack_hw_rate_caps &&
(rack->r_ctl.crte != NULL)) {
@@ -17423,19 +17415,19 @@ at_lt_bw:
lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
lentim /= fill_bw;
*rate_wanted = fill_bw;
- if (non_paced || (lentim < slot)) {
- rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
+ if (non_paced || (lentim < pacing_delay)) {
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw,
0, lentim, 12, __LINE__, NULL, 0);
return ((int32_t)lentim);
} else
- return (slot);
+ return (pacing_delay);
}
static int32_t
rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
{
uint64_t srtt;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
int can_start_hw_pacing = 1;
int err;
int pace_one;
@@ -17483,25 +17475,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* cwnd. Which in that case we are just waiting for
* a ACK.
*/
- slot = len / tr_perms;
+ pacing_delay = len / tr_perms;
/* Now do we reduce the time so we don't run dry? */
- if (slot && rack_slot_reduction) {
- reduce = (slot / rack_slot_reduction);
- if (reduce < slot) {
- slot -= reduce;
+ if (pacing_delay && rack_pacing_delay_reduction) {
+ reduce = (pacing_delay / rack_pacing_delay_reduction);
+ if (reduce < pacing_delay) {
+ pacing_delay -= reduce;
} else
- slot = 0;
+ pacing_delay = 0;
} else
reduce = 0;
- slot *= HPTS_USEC_IN_MSEC;
+ pacing_delay *= HPTS_USEC_IN_MSEC;
if (rack->rc_pace_to_cwnd) {
uint64_t rate_wanted = 0;
- slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
+ pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1);
rack->rc_ack_can_sendout_data = 1;
- rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
} else
- rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
/*******************************************************/
/* RRS: We insert non-paced call to stats here for len */
/*******************************************************/
@@ -17575,7 +17567,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
segs *= oh;
lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
res = lentim / rate_wanted;
- slot = (uint32_t)res;
+ pacing_delay = (uint32_t)res;
if (rack_hw_rate_min &&
(rate_wanted < rack_hw_rate_min)) {
can_start_hw_pacing = 0;
@@ -17635,7 +17627,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* We want to pace at our rate *or* faster to
* fill the cwnd to the max if its not full.
*/
- slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
+ pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0);
/* Re-check to make sure we are not exceeding our max b/w */
if ((rack->r_ctl.crte != NULL) &&
(tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
@@ -17786,15 +17778,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
srtt = rack->rc_tp->t_srtt;
else
srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */
- if (srtt < (uint64_t)slot) {
- rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
- slot = srtt;
+ if (srtt < (uint64_t)pacing_delay) {
+ rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
+ pacing_delay = srtt;
}
}
/*******************************************************************/
/* RRS: We insert paced call to stats here for len and rate_wanted */
/*******************************************************************/
- rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
+ rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
}
if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
/*
@@ -17811,9 +17803,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
hw_boost_delay = rack_enobuf_hw_max;
else if (hw_boost_delay < rack_enobuf_hw_min)
hw_boost_delay = rack_enobuf_hw_min;
- slot += hw_boost_delay;
+ pacing_delay += hw_boost_delay;
}
- return (slot);
+ return (pacing_delay);
}
static void
@@ -18482,7 +18474,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
struct tcpopt to;
u_char opt[TCP_MAXOLEN];
uint32_t hdrlen, optlen;
- int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0;
+ int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0;
uint16_t flags;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
@@ -18688,9 +18680,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
if (rack->r_ctl.crte != NULL) {
/* See if we can send via the hw queue */
- slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
+ pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
/* If there is nothing in queue (no pacing time) we can send via the hw queue */
- if (slot == 0)
+ if (pacing_delay == 0)
ip_sendflag = 0;
}
tcp_set_flags(th, flags);
@@ -18955,20 +18947,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
rack_log_queue_level(tp, rack, len, tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
- slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
if (rack->rc_enobuf < 0x7f)
rack->rc_enobuf++;
- if (slot < (10 * HPTS_USEC_IN_MSEC))
- slot = 10 * HPTS_USEC_IN_MSEC;
+ if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
if (rack->r_ctl.crte != NULL) {
counter_u64_add(rack_saw_enobuf_hw, 1);
tcp_rl_log_enobuf(rack->r_ctl.crte);
}
counter_u64_add(rack_saw_enobuf, 1);
} else {
- slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
}
- rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -19071,7 +19063,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
#ifdef TCP_ACCOUNTING
int cnt_thru = 1;
#endif
- int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
+ int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
uint16_t flags;
uint32_t s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
@@ -19519,8 +19511,8 @@ again:
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
counter_u64_add(rack_fto_send, 1);
- slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
- rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0);
+ pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -19707,7 +19699,7 @@ rack_output(struct tcpcb *tp)
struct rack_sendmap *rsm = NULL;
int32_t tso, mtu;
struct tcpopt to;
- int32_t slot = 0;
+ int32_t pacing_delay = 0;
int32_t sup_rack = 0;
uint32_t cts, ms_cts, delayed, early;
uint32_t add_flag = RACK_SENT_SP;
@@ -20070,7 +20062,7 @@ again:
if (rsm == NULL) {
if (hpts_calling)
/* Retry in a ms */
- slot = (1 * HPTS_USEC_IN_MSEC);
+ pacing_delay = (1 * HPTS_USEC_IN_MSEC);
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
@@ -20877,7 +20869,7 @@ just_return_nolock:
}
if (tot_len_this_send > 0) {
rack->r_ctl.fsb.recwin = recwin;
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
if ((error == 0) &&
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
@@ -21060,8 +21052,8 @@ just_return_nolock:
/* Yes lets make sure to move to persist before timer-start */
rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
}
- rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
- rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack);
+ rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use);
}
#ifdef NETFLIX_SHARED_CWND
if ((sbavail(sb) == 0) &&
@@ -21100,8 +21092,8 @@ send:
* we come around to again, the flag will be clear.
*/
check_done = 1;
- slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
- if (slot) {
+ pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
+ if (pacing_delay) {
rack->r_ctl.rc_agg_delayed = 0;
rack->r_ctl.rc_agg_early = 0;
rack->r_early = 0;
@@ -22358,11 +22350,11 @@ nomore:
rack_log_queue_level(tp, rack, len, &tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
- slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
if (rack->rc_enobuf < 0x7f)
rack->rc_enobuf++;
- if (slot < (10 * HPTS_USEC_IN_MSEC))
- slot = 10 * HPTS_USEC_IN_MSEC;
+ if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
if (rack->r_ctl.crte != NULL) {
counter_u64_add(rack_saw_enobuf_hw, 1);
tcp_rl_log_enobuf(rack->r_ctl.crte);
@@ -22389,8 +22381,8 @@ nomore:
goto again;
}
}
- slot = 10 * HPTS_USEC_IN_MSEC;
- rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -22412,8 +22404,8 @@ nomore:
}
/* FALLTHROUGH */
default:
- slot = 10 * HPTS_USEC_IN_MSEC;
- rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ pacing_delay = 10 * HPTS_USEC_IN_MSEC;
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
@@ -22456,18 +22448,18 @@ enobufs:
/*
* We don't send again after sending a RST.
*/
- slot = 0;
+ pacing_delay = 0;
sendalot = 0;
if (error == 0)
tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
- } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
+ } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) {
/*
* Get our pacing rate, if an error
* occurred in sending (ENOBUF) we would
* hit the else if with slot preset. Other
* errors return.
*/
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
+ pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
}
/* We have sent clear the flag */
rack->r_ent_rec_ns = 0;
@@ -22499,7 +22491,7 @@ enobufs:
*/
tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
}
- if (slot) {
+ if (pacing_delay) {
/* set the rack tcb into the slot N */
if ((error == 0) &&
rack_use_rfo &&
@@ -22564,7 +22556,7 @@ skip_all_send:
/* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
- rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
+ rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount() - ts_val;
if (tot_len_this_send) {
diff --git a/sys/netinet6/in6_fib_algo.c b/sys/netinet6/in6_fib_algo.c
index 10ffe7ab0265..ef5cfc6d5ef6 100644
--- a/sys/netinet6/in6_fib_algo.c
+++ b/sys/netinet6/in6_fib_algo.c
@@ -351,7 +351,7 @@ struct fib_lookup_module flm_radix6 = {
};
static void
-fib6_algo_init(void)
+fib6_algo_init(void *dummy __unused)
{
fib_module_register(&flm_radix6_lockless);
diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c
index 737d4a50098a..05a01b75e0bb 100644
--- a/sys/netipsec/xform_ipcomp.c
+++ b/sys/netipsec/xform_ipcomp.c
@@ -750,7 +750,7 @@ static struct xformsw ipcomp_xformsw = {
};
static void
-ipcomp_attach(void)
+ipcomp_attach(void *dummy __unused)
{
#ifdef INET
@@ -763,7 +763,7 @@ ipcomp_attach(void)
}
static void
-ipcomp_detach(void)
+ipcomp_detach(void *dummy __unused)
{
#ifdef INET
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index b59d8d08bf80..d15d7760d7f1 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -3578,11 +3578,9 @@ sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
/*
* Stuff that must be initialised only on boot or module load
*/
-static int
-ipfw_init(void)
+static void
+ipfw_init(void *dummy __unused)
{
- int error = 0;
-
/*
* Only print out this stuff the first time around,
* when called from the sysinit code.
@@ -3627,14 +3625,13 @@ ipfw_init(void)
ipfw_init_sopt_handler();
ipfw_init_obj_rewriter();
ipfw_iface_init();
- return (error);
}
/*
* Called for the removal of the last instance only on module unload.
*/
static void
-ipfw_destroy(void)
+ipfw_destroy(void *dummy __unused)
{
ipfw_iface_destroy();
diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c
index 1e2ff1bca290..8bd27f6885ab 100644
--- a/sys/netpfil/ipfw/ip_fw_nat.c
+++ b/sys/netpfil/ipfw/ip_fw_nat.c
@@ -999,9 +999,11 @@ ipfw_nat_del(struct sockopt *sopt)
{
struct cfg_nat *ptr;
struct ip_fw_chain *chain = &V_layer3_chain;
- int i;
+ int error, i;
- sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (error != 0)
+ return (error);
/* XXX validate i */
IPFW_UH_WLOCK(chain);
ptr = lookup_nat(&chain->nat, i);
@@ -1104,7 +1106,7 @@ ipfw_nat_get_log(struct sockopt *sopt)
{
uint8_t *data;
struct cfg_nat *ptr;
- int i, size;
+ int error, i, size;
struct ip_fw_chain *chain;
IPFW_RLOCK_TRACKER;
@@ -1134,9 +1136,9 @@ ipfw_nat_get_log(struct sockopt *sopt)
i += LIBALIAS_BUF_SIZE;
}
IPFW_RUNLOCK(chain);
- sooptcopyout(sopt, data, size);
+ error = sooptcopyout(sopt, data, size);
free(data, M_IPFW);
- return(0);
+ return (error);
}
static int
@@ -1166,7 +1168,7 @@ vnet_ipfw_nat_uninit(const void *arg __unused)
}
static void
-ipfw_nat_init(void)
+ipfw_nat_init(void *dummy __unused)
{
/* init ipfw hooks */
@@ -1183,7 +1185,7 @@ ipfw_nat_init(void)
}
static void
-ipfw_nat_destroy(void)
+ipfw_nat_destroy(void *dummy __unused)
{
EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index d58af6e5ec4d..a4557f139ae5 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -259,7 +259,7 @@ static void dehook_pf_eth(void);
static void dehook_pf(void);
static int shutdown_pf(void);
static int pf_load(void);
-static void pf_unload(void);
+static void pf_unload(void *);
static struct cdevsw pf_cdevsw = {
.d_ioctl = pfioctl,
@@ -7082,7 +7082,7 @@ pf_unload_vnet(void)
}
static void
-pf_unload(void)
+pf_unload(void *dummy __unused)
{
sx_xlock(&pf_end_lock);
diff --git a/sys/nfs/nfs_diskless.c b/sys/nfs/nfs_diskless.c
index 42cfee63d184..0f0cf80feeec 100644
--- a/sys/nfs/nfs_diskless.c
+++ b/sys/nfs/nfs_diskless.c
@@ -428,7 +428,7 @@ decode_nfshandle(char *ev, u_char *fh, int maxfh)
#if !defined(BOOTP_NFSROOT)
static void
-nfs_rootconf(void)
+nfs_rootconf(void *dummy __unused)
{
nfs_setup_diskless();
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 796b1719b8ba..01bf4c7e90a8 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -297,7 +297,7 @@ static u_int moea64_clear_bit(vm_page_t, uint64_t);
static void moea64_kremove(vm_offset_t);
static void moea64_syncicache(pmap_t pmap, vm_offset_t va,
vm_paddr_t pa, vm_size_t sz);
-static void moea64_pmap_init_qpages(void);
+static void moea64_pmap_init_qpages(void *);
static void moea64_remove_locked(pmap_t, vm_offset_t,
vm_offset_t, struct pvo_dlist *);
@@ -1284,7 +1284,7 @@ moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
}
static void
-moea64_pmap_init_qpages(void)
+moea64_pmap_init_qpages(void *dummy __unused)
{
struct pcpu *pc;
int i;
diff --git a/sys/powerpc/cpufreq/pmcr.c b/sys/powerpc/cpufreq/pmcr.c
index dd489b607606..6ae0777a8ac7 100644
--- a/sys/powerpc/cpufreq/pmcr.c
+++ b/sys/powerpc/cpufreq/pmcr.c
@@ -40,7 +40,8 @@ static int pstate_ids[256];
static int pstate_freqs[256];
static int npstates;
-static void parse_pstates(void)
+static void
+parse_pstates(void *dummy __unused)
{
phandle_t node;
diff --git a/sys/rpc/auth.h b/sys/rpc/auth.h
index 33c33ffd594d..648fb99a3a27 100644
--- a/sys/rpc/auth.h
+++ b/sys/rpc/auth.h
@@ -354,6 +354,10 @@ __END_DECLS
#define RPCSEC_GSS 6 /* RPCSEC_GSS */
#define AUTH_TLS 7 /* Initiate RPC-over-TLS */
+/* RFC 5531's prescribed limits for variable-length arrays. */
+#define AUTH_SYS_MAX_HOSTNAME 255
+#define AUTH_SYS_MAX_GROUPS 16 /* Supplementary groups. */
+
/*
* Pseudo auth flavors for RPCSEC_GSS.
*/
diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c
index b107d5541c50..ff4c12c3f52e 100644
--- a/sys/rpc/authunix_prot.c
+++ b/sys/rpc/authunix_prot.c
@@ -30,7 +30,6 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
/*
* authunix_prot.c
* XDR for UNIX style authentication parameters for RPC
@@ -40,8 +39,7 @@
#include <sys/param.h>
#include <sys/jail.h>
-#include <sys/kernel.h>
-#include <sys/systm.h>
+#include <sys/libkern.h>
#include <sys/ucred.h>
#include <rpc/types.h>
@@ -50,9 +48,6 @@
#include <rpc/rpc_com.h>
-/* gids compose part of a credential; there may not be more than 16 of them */
-#define NGRPS 16
-
/*
* XDR for unix authentication parameters.
*/
@@ -60,25 +55,23 @@ bool_t
xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
{
uint32_t namelen;
- uint32_t ngroups, i;
+ uint32_t supp_ngroups, i;
uint32_t junk;
char hostbuf[MAXHOSTNAMELEN];
+ if (xdrs->x_op == XDR_FREE)
+ /* This function does not allocate auxiliary memory. */
+ return (TRUE);
+
if (xdrs->x_op == XDR_ENCODE) {
- /*
- * Restrict name length to 255 according to RFC 1057.
- */
getcredhostname(NULL, hostbuf, sizeof(hostbuf));
namelen = strlen(hostbuf);
- if (namelen > 255)
- namelen = 255;
- } else {
+ if (namelen > AUTH_SYS_MAX_HOSTNAME)
+ namelen = AUTH_SYS_MAX_HOSTNAME;
+ } else
namelen = 0;
- }
- junk = 0;
- if (!xdr_uint32_t(xdrs, time)
- || !xdr_uint32_t(xdrs, &namelen))
+ if (!xdr_uint32_t(xdrs, time) || !xdr_uint32_t(xdrs, &namelen))
return (FALSE);
/*
@@ -88,43 +81,65 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
if (!xdr_opaque(xdrs, hostbuf, namelen))
return (FALSE);
} else {
+ if (namelen > AUTH_SYS_MAX_HOSTNAME)
+ return (FALSE);
xdr_setpos(xdrs, xdr_getpos(xdrs) + RNDUP(namelen));
}
if (!xdr_uint32_t(xdrs, &cred->cr_uid))
return (FALSE);
+
+ /*
+ * Safety check: The protocol needs at least one group (access to
+ * 'cr_gid', decrementation of 'cr_ngroups' below).
+ */
+ if (xdrs->x_op == XDR_ENCODE && cred->cr_ngroups == 0)
+ return (FALSE);
if (!xdr_uint32_t(xdrs, &cred->cr_gid))
return (FALSE);
if (xdrs->x_op == XDR_ENCODE) {
/*
- * Note that this is a `struct xucred`, which maintains its
- * historical layout of preserving the egid in cr_ngroups and
- * cr_groups[0] == egid.
+ * Note that this is a 'struct xucred', which still has the
+ * historical layout where the effective GID is in cr_groups[0]
+ * and is accounted in 'cr_ngroups'. We subtract 1 to obtain
+ * the number of "supplementary" groups, passed in the AUTH_SYS
+ * credentials variable-length array called gids[] in RFC 5531.
*/
- ngroups = cred->cr_ngroups - 1;
- if (ngroups > NGRPS)
- ngroups = NGRPS;
+ MPASS(cred->cr_ngroups <= XU_NGROUPS);
+ supp_ngroups = cred->cr_ngroups - 1;
+ if (supp_ngroups > AUTH_SYS_MAX_GROUPS)
+ /* With current values, this should never execute. */
+ supp_ngroups = AUTH_SYS_MAX_GROUPS;
}
- if (!xdr_uint32_t(xdrs, &ngroups))
+ if (!xdr_uint32_t(xdrs, &supp_ngroups))
return (FALSE);
- for (i = 0; i < ngroups; i++) {
- if (i < ngroups_max) {
- if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1]))
- return (FALSE);
- } else {
- if (!xdr_uint32_t(xdrs, &junk))
- return (FALSE);
- }
- }
- if (xdrs->x_op == XDR_DECODE) {
- if (ngroups > ngroups_max)
- cred->cr_ngroups = ngroups_max + 1;
- else
- cred->cr_ngroups = ngroups + 1;
- }
+ /*
+ * Because we cannot store more than XU_NGROUPS in total (16 at time of
+ * this writing), for now we choose to be strict with respect to RFC
+ * 5531's maximum number of supplementary groups (AUTH_SYS_MAX_GROUPS).
+ * That would also be an accidental DoS prevention measure if the
+ * request handling code didn't try to reassemble it in full without any
+ * size limits. Although AUTH_SYS_MAX_GROUPS and XU_NGROUPS are equal,
+ * since the latter includes the "effective" GID, we cannot store the
+ * last group of a message with exactly AUTH_SYS_MAX_GROUPS
+ * supplementary groups. We accept such messages so as not to violate
+ * the protocol, silently dropping the last group on the floor.
+ */
+
+ if (xdrs->x_op != XDR_ENCODE && supp_ngroups > AUTH_SYS_MAX_GROUPS)
+ return (FALSE);
+
+ junk = 0;
+ for (i = 0; i < supp_ngroups; ++i)
+ if (!xdr_uint32_t(xdrs, i < XU_NGROUPS - 1 ?
+ &cred->cr_sgroups[i] : &junk))
+ return (FALSE);
+
+ if (xdrs->x_op != XDR_ENCODE)
+ cred->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS);
return (TRUE);
}
diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c
index 963f4f272964..aa0fc585865f 100644
--- a/sys/rpc/svc_auth_unix.c
+++ b/sys/rpc/svc_auth_unix.c
@@ -41,18 +41,12 @@
*/
#include <sys/param.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/systm.h>
#include <sys/ucred.h>
#include <rpc/rpc.h>
#include <rpc/rpc_com.h>
-#define MAX_MACHINE_NAME 255
-#define NGRPS 16
-
/*
* Unix longhand authenticator
*/
@@ -62,11 +56,8 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg)
enum auth_stat stat;
XDR xdrs;
int32_t *buf;
- uint32_t time;
struct xucred *xcr;
- u_int auth_len;
- size_t str_len, gid_len;
- u_int i;
+ uint32_t auth_len, time;
xcr = rqst->rq_clntcred;
auth_len = (u_int)msg->rm_call.cb_cred.oa_length;
@@ -74,51 +65,58 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg)
XDR_DECODE);
buf = XDR_INLINE(&xdrs, auth_len);
if (buf != NULL) {
+ /* 'time', 'str_len', UID, GID and 'supp_ngroups'. */
+ const uint32_t min_len = 5 * BYTES_PER_XDR_UNIT;
+ uint32_t str_len, supp_ngroups;
+
+ if (auth_len < min_len)
+ goto badcred;
time = IXDR_GET_UINT32(buf);
- str_len = (size_t)IXDR_GET_UINT32(buf);
- if (str_len > MAX_MACHINE_NAME) {
- stat = AUTH_BADCRED;
- goto done;
- }
+ str_len = IXDR_GET_UINT32(buf);
+ if (str_len > AUTH_SYS_MAX_HOSTNAME)
+ goto badcred;
str_len = RNDUP(str_len);
+ /*
+ * Recheck message length now that we know the value of
+ * 'str_len' (and that it won't cause an overflow in additions
+ * below) to protect access to the credentials part.
+ */
+ if (auth_len < min_len + str_len)
+ goto badcred;
buf += str_len / sizeof (int32_t);
xcr->cr_uid = IXDR_GET_UINT32(buf);
xcr->cr_gid = IXDR_GET_UINT32(buf);
- gid_len = (size_t)IXDR_GET_UINT32(buf);
- if (gid_len > NGRPS) {
- stat = AUTH_BADCRED;
- goto done;
- }
- for (i = 0; i < gid_len; i++) {
- /*
- * Note that this is a `struct xucred`, which maintains
- * its historical layout of preserving the egid in
- * cr_ngroups and cr_groups[0] == egid.
- */
- if (i + 1 < XU_NGROUPS)
- xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf);
- else
- buf++;
- }
- if (gid_len + 1 > XU_NGROUPS)
- xcr->cr_ngroups = XU_NGROUPS;
- else
- xcr->cr_ngroups = gid_len + 1;
+ supp_ngroups = IXDR_GET_UINT32(buf);
+ /*
+ * See the herald comment before a similar test at the end of
+ * xdr_authunix_parms() for why we strictly respect RFC 5531 and
+ * why we may have to drop the last supplementary group when
+ * there are AUTH_SYS_MAX_GROUPS of them.
+ */
+ if (supp_ngroups > AUTH_SYS_MAX_GROUPS)
+ goto badcred;
+ /*
+ * Final message length check, as we now know how much we will
+ * read in total.
+ */
+ if (auth_len < min_len + str_len +
+ supp_ngroups * BYTES_PER_XDR_UNIT)
+ goto badcred;
/*
- * five is the smallest unix credentials structure -
- * timestamp, hostname len (0), uid, gid, and gids len (0).
+ * Note that 'xcr' is a 'struct xucred', which still has the
+ * historical layout where the effective GID is in cr_groups[0]
+ * and is accounted in 'cr_ngroups'.
*/
- if ((5 + gid_len) * BYTES_PER_XDR_UNIT + str_len > auth_len) {
- (void) printf("bad auth_len gid %ld str %ld auth %u\n",
- (long)gid_len, (long)str_len, auth_len);
- stat = AUTH_BADCRED;
- goto done;
+ for (uint32_t i = 0; i < supp_ngroups; ++i) {
+ if (i < XU_NGROUPS - 1)
+ xcr->cr_sgroups[i] = IXDR_GET_INT32(buf);
+ else
+ buf++;
}
- } else if (! xdr_authunix_parms(&xdrs, &time, xcr)) {
- stat = AUTH_BADCRED;
- goto done;
- }
+ xcr->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS);
+ } else if (!xdr_authunix_parms(&xdrs, &time, xcr))
+ goto badcred;
rqst->rq_verf = _null_auth;
stat = AUTH_OK;
@@ -126,6 +124,10 @@ done:
XDR_DESTROY(&xdrs);
return (stat);
+
+badcred:
+ stat = AUTH_BADCRED;
+ goto done;
}
diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c
index 7ec50d990d4e..876776e5f62e 100644
--- a/sys/security/audit/audit.c
+++ b/sys/security/audit/audit.c
@@ -329,7 +329,7 @@ audit_record_dtor(void *mem, int size, void *arg)
* call into the BSM assembly code to initialize it.
*/
static void
-audit_init(void)
+audit_init(void *dummy __unused)
{
audit_trail_enabled = 0;
diff --git a/sys/security/mac/mac_framework.c b/sys/security/mac/mac_framework.c
index d742b5dcbc3a..b0776160cc74 100644
--- a/sys/security/mac/mac_framework.c
+++ b/sys/security/mac/mac_framework.c
@@ -320,7 +320,7 @@ mac_policy_xlock_assert(void)
* Initialize the MAC subsystem, including appropriate SMP locks.
*/
static void
-mac_init(void)
+mac_init(void *dummy __unused)
{
LIST_INIT(&mac_static_policy_list);
@@ -340,7 +340,7 @@ mac_init(void)
* kernel, or loaded before the kernel startup.
*/
static void
-mac_late_init(void)
+mac_late_init(void *dummy __unused)
{
mac_late = 1;
diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h
index 2845a9dbc1e2..9e2a233248b4 100644
--- a/sys/sys/imgact_elf.h
+++ b/sys/sys/imgact_elf.h
@@ -86,7 +86,7 @@ typedef struct {
struct sysentvec *sysvec;
const char *interp_newpath;
int flags;
- Elf_Brandnote *brand_note;
+ const Elf_Brandnote *brand_note;
bool (*header_supported)(const struct image_params *,
const int32_t *, const uint32_t *);
/* High 8 bits of flags is private to the ABI */
@@ -111,9 +111,9 @@ struct sseg_closure {
size_t size; /* Total size of all writable segments. */
};
-bool __elfN(brand_inuse)(Elf_Brandinfo *entry);
-int __elfN(insert_brand_entry)(Elf_Brandinfo *entry);
-int __elfN(remove_brand_entry)(Elf_Brandinfo *entry);
+bool __elfN(brand_inuse)(const Elf_Brandinfo *entry);
+int __elfN(insert_brand_entry)(const Elf_Brandinfo *entry);
+int __elfN(remove_brand_entry)(const Elf_Brandinfo *entry);
int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *);
int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int);
size_t __elfN(populate_note)(int, void *, void *, size_t, void **);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 9140cee56885..8c0729d3ec66 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -741,7 +741,7 @@ struct proc {
reaper which spawned
our subtree. */
uint64_t p_elf_flags; /* (x) ELF flags */
- void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for
+ const void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for
non ELF binaries. */
sbintime_t p_umtx_min_timeout;
/* End area that is copied on creation. */
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index cdd4fa3b4b89..cf1d95da6168 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -396,6 +396,7 @@ struct sockproto {
#define PF_NETLINK AF_NETLINK
#define PF_INET_SDP AF_INET_SDP
#define PF_INET6_SDP AF_INET6_SDP
+#define PF_HYPERV AF_HYPERV
#define PF_DIVERT AF_DIVERT
#define PF_IPFWLOG AF_IPFWLOG
diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h
index 1714fa5a7416..6de391dcc03e 100644
--- a/sys/sys/sysent.h
+++ b/sys/sys/sysent.h
@@ -343,8 +343,7 @@ void exec_free_abi_mappings(struct proc *p);
void exec_onexec_old(struct thread *td);
#define INIT_SYSENTVEC(name, sv) \
- SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \
- (sysinit_cfunc_t)exec_sysvec_init, sv);
+ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, exec_sysvec_init, sv)
#endif /* _KERNEL */
diff --git a/sys/sys/tree.h b/sys/sys/tree.h
index c11bccfb387c..194ad505b038 100644
--- a/sys/sys/tree.h
+++ b/sys/sys/tree.h
@@ -334,10 +334,13 @@ struct { \
#define _RB_L ((__uintptr_t)1)
#define _RB_R ((__uintptr_t)2)
#define _RB_LR ((__uintptr_t)3)
-#define _RB_BITS(elm) (*(__uintptr_t *)&elm)
+#define _RB_BITS(elm) ((__uintptr_t)elm)
#define _RB_BITSUP(elm, field) _RB_BITS(_RB_UP(elm, field))
-#define _RB_PTR(elm) (__typeof(elm)) \
- ((__uintptr_t)elm & ~_RB_LR)
+#define _RB_PTR_OP(elm, op, dir) ((__typeof(elm)) \
+ ((__uintptr_t)(elm) op (dir)))
+#define _RB_PTR(elm) _RB_PTR_OP((elm), &, ~_RB_LR)
+#define _RB_MOD_OR(elm, dir) ((elm) = _RB_PTR_OP((elm), |, (dir)))
+#define _RB_MOD_XOR(elm, dir) ((elm) = _RB_PTR_OP((elm), ^, (dir)))
#define RB_PARENT(elm, field) _RB_PTR(_RB_UP(elm, field))
#define RB_LEFT(elm, field) _RB_LINK(elm, _RB_L, field)
@@ -346,8 +349,8 @@ struct { \
#define RB_EMPTY(head) (RB_ROOT(head) == NULL)
#define RB_SET_PARENT(dst, src, field) do { \
- _RB_BITSUP(dst, field) = (__uintptr_t)src | \
- (_RB_BITSUP(dst, field) & _RB_LR); \
+ _RB_UP(dst, field) = (__typeof(src))((__uintptr_t)src | \
+ (_RB_BITSUP(dst, field) & _RB_LR)); \
} while (/*CONSTCOND*/ 0)
#define RB_SET(elm, parent, field) do { \
@@ -546,12 +549,12 @@ name##_RB_INSERT_COLOR(struct name *head, \
elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \
if (_RB_BITS(gpar) & elmdir) { \
/* shorten the parent-elm edge to rebalance */ \
- _RB_BITSUP(parent, field) ^= elmdir; \
+ _RB_MOD_XOR(_RB_UP(parent, field), elmdir); \
return (NULL); \
} \
sibdir = elmdir ^ _RB_LR; \
/* the other edge must change length */ \
- _RB_BITSUP(parent, field) ^= sibdir; \
+ _RB_MOD_XOR(_RB_UP(parent, field), sibdir); \
if ((_RB_BITS(gpar) & _RB_LR) == 0) { \
/* both edges now short, retry from parent */ \
child = elm; \
@@ -583,11 +586,14 @@ name##_RB_INSERT_COLOR(struct name *head, \
RB_ROTATE(elm, child, elmdir, field); \
child_up = _RB_UP(child, field); \
if (_RB_BITS(child_up) & sibdir) \
- _RB_BITSUP(parent, field) ^= elmdir; \
+ _RB_MOD_XOR(_RB_UP(parent, field), \
+ elmdir); \
if (_RB_BITS(child_up) & elmdir) \
- _RB_BITSUP(elm, field) ^= _RB_LR; \
+ _RB_MOD_XOR(_RB_UP(elm, field), \
+ _RB_LR); \
else \
- _RB_BITSUP(elm, field) ^= elmdir; \
+ _RB_MOD_XOR(_RB_UP(elm, field), \
+ elmdir); \
/* if child is a leaf, don't augment elm, \
* since it is restored to be a leaf again. */ \
if ((_RB_BITS(child_up) & _RB_LR) == 0) \
@@ -656,7 +662,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \
/* the rank of the tree rooted at elm shrank */ \
gpar = _RB_UP(parent, field); \
elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \
- _RB_BITS(gpar) ^= elmdir; \
+ _RB_MOD_XOR(gpar, elmdir); \
if (_RB_BITS(gpar) & elmdir) { \
/* lengthen the parent-elm edge to rebalance */ \
_RB_UP(parent, field) = gpar; \
@@ -664,7 +670,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \
} \
if (_RB_BITS(gpar) & _RB_LR) { \
/* shorten other edge, retry from parent */ \
- _RB_BITS(gpar) ^= _RB_LR; \
+ _RB_MOD_XOR(gpar, _RB_LR); \
_RB_UP(parent, field) = gpar; \
gpar = _RB_PTR(gpar); \
continue; \
@@ -672,7 +678,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \
sibdir = elmdir ^ _RB_LR; \
sib = _RB_LINK(parent, sibdir, field); \
up = _RB_UP(sib, field); \
- _RB_BITS(up) ^= _RB_LR; \
+ _RB_MOD_XOR(up, _RB_LR); \
if ((_RB_BITS(up) & _RB_LR) == 0) { \
/* shorten edges descending from sib, retry */ \
_RB_UP(sib, field) = up; \
@@ -703,24 +709,29 @@ name##_RB_REMOVE_COLOR(struct name *head, \
/* elm is a 1-child. First rotate at elm. */ \
RB_ROTATE(sib, elm, sibdir, field); \
up = _RB_UP(elm, field); \
- _RB_BITSUP(parent, field) ^= \
- (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir; \
- _RB_BITSUP(sib, field) ^= \
- (_RB_BITS(up) & sibdir) ? _RB_LR : sibdir; \
- _RB_BITSUP(elm, field) |= _RB_LR; \
+ _RB_MOD_XOR(_RB_UP(parent, field), \
+ (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir); \
+ _RB_MOD_XOR(_RB_UP(sib, field), \
+ (_RB_BITS(up) & sibdir) ? _RB_LR : sibdir); \
+ _RB_MOD_OR(_RB_UP(elm, field), _RB_LR); \
} else { \
if ((_RB_BITS(up) & elmdir) == 0 && \
RB_STRICT_HST && elm != NULL) { \
/* if parent does not become a leaf, \
do not demote parent yet. */ \
- _RB_BITSUP(parent, field) ^= sibdir; \
- _RB_BITSUP(sib, field) ^= _RB_LR; \
+ _RB_MOD_XOR(_RB_UP(parent, field), \
+ sibdir); \
+ _RB_MOD_XOR(_RB_UP(sib, field), \
+ _RB_LR); \
} else if ((_RB_BITS(up) & elmdir) == 0) { \
/* demote parent. */ \
- _RB_BITSUP(parent, field) ^= elmdir; \
- _RB_BITSUP(sib, field) ^= sibdir; \
+ _RB_MOD_XOR(_RB_UP(parent, field), \
+ elmdir); \
+ _RB_MOD_XOR(_RB_UP(sib, field), \
+ sibdir); \
} else \
- _RB_BITSUP(sib, field) ^= sibdir; \
+ _RB_MOD_XOR(_RB_UP(sib, field), \
+ sibdir); \
elm = sib; \
} \
\
diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h
index c767aa31e8e5..75d7a75e2fff 100644
--- a/sys/tests/ktest.h
+++ b/sys/tests/ktest.h
@@ -57,6 +57,8 @@ struct ktest_test_info {
ktest_parse_t parse;
};
+#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx)
+
struct ktest_module_info {
const char *name;
const struct ktest_test_info *tests;
@@ -64,6 +66,8 @@ struct ktest_module_info {
void *module_ptr;
};
+#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL }
+
int ktest_default_modevent(module_t mod, int type, void *arg);
bool ktest_start_msg(struct ktest_test_context *ctx);
@@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx);
#define KTEST_LOG(_ctx, _fmt, ...) \
KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__)
+#define KTEST_ERR(_ctx, _fmt, ...) \
+ KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__)
+
#define KTEST_MAX_BUF 512
#define KTEST_MODULE_DECLARE(_n, _t) \
@@ -104,6 +111,9 @@ MODULE_VERSION(ktest_##_n, 1); \
MODULE_DEPEND(ktest_##_n, ktestmod, 1, 1, 1); \
MODULE_DEPEND(ktest_##_n, netlink, 1, 1, 1); \
+#define KTEST_MODULE_DEPEND(_n, _d) \
+MODULE_DEPEND(ktest_##_n, _d, 1, 1, 1); \
+
#endif /* _KERNEL */
/* genetlink definitions */
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 970536a13aa5..f47cfd08f75a 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -653,8 +653,8 @@ done:
for (i = 0; i < UFS_NDADDR; i++)
if (newblks[i] != DIP(ip, i_db[i]))
panic("ffs_truncate2: blkno %d newblks %jd != i_db %jd",
- i, (intmax_t)newblks[UFS_NDADDR + level],
- (intmax_t)DIP(ip, i_ib[level]));
+ i, (intmax_t)newblks[i],
+ (intmax_t)DIP(ip, i_db[i]));
BO_LOCK(bo);
if (length == 0 &&
(fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) &&
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index fef28bb883e4..fee50f49c844 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -96,7 +96,7 @@ struct vmmeter __read_mostly vm_cnt = {
u_long __exclusive_cache_line vm_user_wire_count;
static void
-vmcounter_startup(void)
+vmcounter_startup(void *dummy __unused)
{
counter_u64_t *cnt = (counter_u64_t *)&vm_cnt;
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 3f1be78342c9..418a9cff8abf 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -120,7 +120,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
-static void vm_pageout_init(void);
+static void vm_pageout_init(void *);
static int vm_pageout_clean(vm_page_t m, int *numpagedout);
static int vm_pageout_cluster(vm_page_t m);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
@@ -2333,7 +2333,7 @@ vm_pageout_init_domain(int domain)
}
static void
-vm_pageout_init(void)
+vm_pageout_init(void *dummy __unused)
{
u_long freecount;
int i;
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index a1a5d8140b14..3b873d9dae73 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -650,7 +650,7 @@ retry:
#endif /* SMP */
static void
-init_TSC_tc(void)
+init_TSC_tc(void *dummy __unused)
{
uint64_t max_freq;
int shift;
diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c
index 994dc3e0804c..43a253cc2860 100644
--- a/sys/x86/xen/xen_apic.c
+++ b/sys/x86/xen/xen_apic.c
@@ -330,7 +330,7 @@ xen_cpu_ipi_init(int cpu)
}
static void
-xen_setup_cpus(void)
+xen_setup_cpus(void *dummy __unused)
{
uint32_t regs[4];
int i;
diff --git a/tests/atf_python/ktest.py b/tests/atf_python/ktest.py
index a18f47d1dd06..a671aaa1fd4c 100644
--- a/tests/atf_python/ktest.py
+++ b/tests/atf_python/ktest.py
@@ -67,6 +67,10 @@ class KtestLoader(object):
def __init__(self, module_name: str, autoload: bool):
self.module_name = module_name
self.autoload = autoload
+ # Ensure the base ktest.ko module is loaded
+ result = libc.kldload("ktest")
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ logger.debug(f"Failed to load base ktest module (error {result})")
self.helper = NlHelper()
self.nlsock = Nlsock(NlConst.NETLINK_GENERIC, self.helper)
self.family_id = self._get_family_id()
@@ -76,7 +80,9 @@ class KtestLoader(object):
family_id = self.nlsock.get_genl_family_id(NETLINK_FAMILY)
except ValueError:
if self.autoload:
- libc.kldload(self.module_name)
+ result = libc.kldload(self.module_name)
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ raise RuntimeError(f"Failed to load kernel module '{self.module_name}' (error {result})")
family_id = self.nlsock.get_genl_family_id(NETLINK_FAMILY)
else:
raise
@@ -103,7 +109,9 @@ class KtestLoader(object):
def load_ktests(self):
ret = self._load_ktests()
if not ret and self.autoload:
- libc.kldload(self.module_name)
+ result = libc.kldload(self.module_name)
+ if result != 0 and result != 17: # 17 is EEXIST (already loaded)
+ raise RuntimeError(f"Failed to load kernel module '{self.module_name}' (error {result})")
ret = self._load_ktests()
return ret
diff --git a/tests/sys/fs/fusefs/bad_server.cc b/tests/sys/fs/fusefs/bad_server.cc
index af2ca146e431..c3d195735446 100644
--- a/tests/sys/fs/fusefs/bad_server.cc
+++ b/tests/sys/fs/fusefs/bad_server.cc
@@ -65,6 +65,11 @@ TEST_F(BadServer, ShortWrite)
out.header.unique = 0; // Asynchronous notification
out.expected_errno = EINVAL;
m_mock->write_response(out);
+ /*
+ * Tell the event loop to quit. The kernel has already disconnected us
+ * because of the short write.
+ */
+ m_mock->m_quit = true;
}
/*
diff --git a/tests/sys/kern/unix_stream.c b/tests/sys/kern/unix_stream.c
index 49d621dc5b0a..442b766ac885 100644
--- a/tests/sys/kern/unix_stream.c
+++ b/tests/sys/kern/unix_stream.c
@@ -1,6 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
+ * Copyright (c) 2025 Gleb Smirnoff <glebius@FreeBSD.org>
* Copyright (c) 2018 Alan Somers
*
* Redistribution and use in source and binary forms, with or without
@@ -30,6 +31,7 @@
#include <sys/event.h>
#include <sys/select.h>
#include <sys/sysctl.h>
+#include <sys/time.h>
#include <sys/un.h>
#include <errno.h>
#include <fcntl.h>
@@ -490,6 +492,30 @@ ATF_TC_BODY(ourshutdown_kevent, tc)
close(sv[1]);
}
+ATF_TC_WITHOUT_HEAD(SO_SNDTIMEO);
+ATF_TC_BODY(SO_SNDTIMEO, tc)
+{
+ struct timespec tp1, tp2, rtp, sleep = { .tv_nsec = 100000000 };
+ int sv[2];
+ char buf[10];
+
+ full_socketpair(sv);
+ ATF_REQUIRE_EQ(0, setsockopt(sv[0], SOL_SOCKET, SO_SNDTIMEO,
+ &(struct timeval){ .tv_usec = sleep.tv_nsec / 1000 },
+ sizeof(struct timeval)));
+ ATF_REQUIRE_EQ(0, clock_gettime(CLOCK_MONOTONIC_PRECISE, &tp1));
+ ATF_REQUIRE_EQ(-1, send(sv[0], buf, sizeof(buf), 0));
+ ATF_REQUIRE(errno == EAGAIN);
+ ATF_REQUIRE_EQ(0, clock_gettime(CLOCK_MONOTONIC_PRECISE, &tp2));
+ timespecsub(&tp2, &tp1, &rtp);
+ ATF_REQUIRE(timespeccmp(&rtp, &sleep, >=));
+ ATF_REQUIRE_EQ(sizeof(buf), recv(sv[1], buf, sizeof(buf), 0));
+ ATF_REQUIRE_EQ(sizeof(buf), send(sv[0], buf, sizeof(buf), 0));
+
+ close(sv[0]);
+ close(sv[1]);
+}
+
ATF_TP_ADD_TCS(tp)
{
ATF_TP_ADD_TC(tp, getpeereid);
@@ -506,6 +532,7 @@ ATF_TP_ADD_TCS(tp)
ATF_TP_ADD_TC(tp, peershutdown_wakeup_poll);
ATF_TP_ADD_TC(tp, peershutdown_wakeup_kevent);
ATF_TP_ADD_TC(tp, ourshutdown_kevent);
+ ATF_TP_ADD_TC(tp, SO_SNDTIMEO);
return atf_no_error();
}
diff --git a/tests/sys/netinet/Makefile b/tests/sys/netinet/Makefile
index b742342beecb..9739221676ce 100644
--- a/tests/sys/netinet/Makefile
+++ b/tests/sys/netinet/Makefile
@@ -30,6 +30,7 @@ ATF_TESTS_SH= arp \
ATF_TESTS_PYTEST+= carp.py
ATF_TESTS_PYTEST+= igmp.py
+ATF_TESTS_PYTEST+= tcp_hpts_test.py
LIBADD.so_reuseport_lb_test= pthread
LIBADD.udp_bindings= pthread
diff --git a/tests/sys/netinet/multicast.sh b/tests/sys/netinet/multicast.sh
index a3854fd2fd20..273970d0f7ea 100755
--- a/tests/sys/netinet/multicast.sh
+++ b/tests/sys/netinet/multicast.sh
@@ -45,6 +45,15 @@ multicast_vnet_init()
jexec mjail2 ifconfig ${epair2}b 192.0.3.2/24
}
+multicast_join()	# $1: join method for multicast-receive, $2: its last argument
+{
+	jexec mjail2 $(atf_get_srcdir)/multicast-receive \
+	    $1 233.252.0.1 6676 $2 > out & pid=$!
+	while ! jexec mjail2 ifmcstat | grep -qF 233.252.0.1; do # -F: literal dots
+		sleep 0.01	# poll until ifmcstat lists the group in mjail2
+	done
+}
+
atf_test_case "IP_ADD_MEMBERSHIP_ip_mreq" "cleanup"
IP_ADD_MEMBERSHIP_ip_mreq_head()
{
@@ -56,8 +65,7 @@ IP_ADD_MEMBERSHIP_ip_mreq_body()
multicast_vnet_init
# join group on interface with IP address 192.0.2.2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- ip_mreq 233.252.0.1 6676 192.0.2.2 > out & pid=$!
+ multicast_join ip_mreq 192.0.2.2
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 192.0.2.1 hello
@@ -65,8 +73,7 @@ IP_ADD_MEMBERSHIP_ip_mreq_body()
atf_check -s exit:0 -o inline:"192.0.2.1:6676 hello\n" cat out
# join group on interface with IP address 192.0.3.2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- ip_mreq 233.252.0.1 6676 192.0.3.2 > out & pid=$!
+ multicast_join ip_mreq 192.0.3.2
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 192.0.3.1 hello
@@ -90,8 +97,7 @@ IP_ADD_MEMBERSHIP_ip_mreqn_body()
multicast_vnet_init
# join group on interface epair2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- ip_mreqn 233.252.0.1 6676 ${epair1}b > out & pid=$!
+ multicast_join ip_mreqn ${epair1}b
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 ${epair1}a hello
@@ -99,8 +105,7 @@ IP_ADD_MEMBERSHIP_ip_mreqn_body()
atf_check -s exit:0 -o inline:"192.0.2.1:6676 hello\n" cat out
# join group on interface epair2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- ip_mreqn 233.252.0.1 6676 ${epair2}b > out & pid=$!
+ multicast_join ip_mreqn ${epair2}b
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 ${epair2}a hello
@@ -123,9 +128,8 @@ MCAST_JOIN_GROUP_body()
{
multicast_vnet_init
- # join group on interface epair2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- group_req 233.252.0.1 6676 ${epair1}b > out & pid=$!
+ # join group on interface epair1
+ multicast_join group_req ${epair1}b
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 ${epair1}a hello
@@ -133,8 +137,7 @@ MCAST_JOIN_GROUP_body()
atf_check -s exit:0 -o inline:"192.0.2.1:6676 hello\n" cat out
# join group on interface epair2
- jexec mjail2 $(atf_get_srcdir)/multicast-receive \
- group_req 233.252.0.1 6676 ${epair2}b > out & pid=$!
+ multicast_join group_req ${epair2}b
atf_check -s exit:0 -o empty \
jexec mjail1 $(atf_get_srcdir)/multicast-send \
0.0.0.0 6676 233.252.0.1 6676 ${epair2}a hello
diff --git a/tests/sys/netinet/tcp_hpts_test.py b/tests/sys/netinet/tcp_hpts_test.py
new file mode 100644
index 000000000000..c56383fb310f
--- /dev/null
+++ b/tests/sys/netinet/tcp_hpts_test.py
@@ -0,0 +1,4 @@
+from atf_python.ktest import BaseKernelTest
+
+class TestTcpHpts(BaseKernelTest):
+ KTEST_MODULE_NAME = "ktest_tcphpts"
diff --git a/tests/sys/vm/mmap_test.c b/tests/sys/vm/mmap_test.c
index 6bc30f73ca95..27d02ae667fb 100644
--- a/tests/sys/vm/mmap_test.c
+++ b/tests/sys/vm/mmap_test.c
@@ -36,21 +36,6 @@
#include <stdio.h>
#include <stdlib.h>
-static const struct {
- void *addr;
- int ok[2]; /* Depending on security.bsd.map_at_zero {0, !=0}. */
-} map_at_zero_tests[] = {
- { (void *)0, { 0, 1 } }, /* Test sysctl. */
- { (void *)1, { 0, 0 } },
- { (void *)(PAGE_SIZE - 1), { 0, 0 } },
- { (void *)PAGE_SIZE, { 1, 1 } },
- { (void *)-1, { 0, 0 } },
- { (void *)(-PAGE_SIZE), { 0, 0 } },
- { (void *)(-1 - PAGE_SIZE), { 0, 0 } },
- { (void *)(-1 - PAGE_SIZE - 1), { 0, 0 } },
- { (void *)(0x1000 * PAGE_SIZE), { 1, 1 } },
-};
-
#define MAP_AT_ZERO "security.bsd.map_at_zero"
#ifdef __LP64__
@@ -68,6 +53,22 @@ ATF_TC_BODY(mmap__map_at_zero, tc)
int map_at_zero;
bool allow_wx;
int prot_flags;
+ size_t pgsz = getpagesize();
+
+ const struct {
+ void *addr;
+ int ok[2]; /* Depending on security.bsd.map_at_zero {0, !=0}. */
+ } map_at_zero_tests[] = {
+ { (void *)0, { 0, 1 } }, /* Test sysctl. */
+ { (void *)1, { 0, 0 } },
+ { (void *)(pgsz - 1), { 0, 0 } },
+ { (void *)pgsz, { 1, 1 } },
+ { (void *)-1, { 0, 0 } },
+ { (void *)(-pgsz), { 0, 0 } },
+ { (void *)(-1 - pgsz), { 0, 0 } },
+ { (void *)(-1 - pgsz - 1), { 0, 0 } },
+ { (void *)(0x1000 * pgsz), { 1, 1 } },
+ };
len = sizeof(map_at_zero);
if (sysctlbyname(MAP_AT_ZERO, &map_at_zero, &len, NULL, 0) == -1) {
diff --git a/usr.bin/login/login.conf b/usr.bin/login/login.conf
index 1069da17b4db..c65a83caa565 100644
--- a/usr.bin/login/login.conf
+++ b/usr.bin/login/login.conf
@@ -46,7 +46,6 @@ default:\
:umtxp=unlimited:\
:pipebuf=unlimited:\
:priority=0:\
- :ignoretime@:\
:umask=022:\
:charset=UTF-8:\
:lang=C.UTF-8:
@@ -149,7 +148,6 @@ russian|Russian Users Accounts:\
# :requirehome:\
# :passwordtime=90d:\
# :umask=002:\
-# :ignoretime@:\
# :tc=default:
#
#
@@ -174,7 +172,6 @@ russian|Russian Users Accounts:\
##
#staff:\
# :ignorenologin:\
-# :ignoretime:\
# :requirehome@:\
# :accounted@:\
# :path=~/bin /bin /sbin /usr/bin /usr/sbin /usr/local/bin /usr/local/sbin:\
@@ -265,7 +262,6 @@ russian|Russian Users Accounts:\
## - no time accounting, restricted to access via dialin lines
##
#site:\
-# :ignoretime:\
# :passwordtime@:\
# :refreshtime@:\
# :refreshperiod@:\
diff --git a/usr.bin/sockstat/main.c b/usr.bin/sockstat/main.c
index 7fedfd5b8724..d1ea6b1bc958 100644
--- a/usr.bin/sockstat/main.c
+++ b/usr.bin/sockstat/main.c
@@ -1789,9 +1789,11 @@ main(int argc, char *argv[])
argc = xo_parse_args(argc, argv);
if (argc < 0)
exit(1);
- if (xo_get_style(NULL) != XO_STYLE_TEXT &&
- xo_get_style(NULL) != XO_STYLE_HTML)
- is_xo_style_encoding = true;
+ if (xo_get_style(NULL) != XO_STYLE_TEXT) {
+ show_path_state = true;
+ if (xo_get_style(NULL) != XO_STYLE_HTML)
+ is_xo_style_encoding = true;
+ }
opt_j = -1;
while ((o = getopt(argc, argv, "46AbCcfIij:Llnp:P:qSsUuvw")) != -1)
switch (o) {
diff --git a/usr.bin/sockstat/sockstat.1 b/usr.bin/sockstat/sockstat.1
index d14eb967ad0f..1498fb1d88f7 100644
--- a/usr.bin/sockstat/sockstat.1
+++ b/usr.bin/sockstat/sockstat.1
@@ -25,7 +25,7 @@
.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd October 9, 2025
+.Dd October 14, 2025
.Dt SOCKSTAT 1
.Os
.Sh NAME
@@ -205,7 +205,8 @@ is specified (only for SCTP or TCP).
The path state if
.Fl s
is specified (only for SCTP).
-This column is only shown when there is at least one path state shown.
+When using traditional text output, this column is only shown when there is at
+least one path state to show.
.It Li CONN STATE
The connection state if
.Fl s
diff --git a/usr.sbin/certctl/certctl.8 b/usr.sbin/certctl/certctl.8
index edf993e1361a..e58da8e7ff84 100644
--- a/usr.sbin/certctl/certctl.8
+++ b/usr.sbin/certctl/certctl.8
@@ -24,7 +24,7 @@
.\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd August 18, 2025
+.Dd October 9, 2025
.Dt CERTCTL 8
.Os
.Sh NAME
@@ -110,7 +110,7 @@ A copy of each trusted certificate is placed in
and each untrusted certificate in
.Ev UNTRUSTDESTDIR .
In addition, a bundle containing the trusted certificates is placed in
-.Ev BUNDLEFILE .
+.Ev BUNDLE .
.It Ic untrust
Add the specified file to the untrusted list.
.It Ic trust
@@ -151,6 +151,8 @@ Default:
.Pa ${DESTDIR}${DISTBASE}/etc/ssl/untrusted
.It Ev BUNDLE
File name of bundle to produce.
+Default:
+.Pa ${DESTDIR}${DISTBASE}/etc/ssl/cert.pem
.El
.Sh SEE ALSO
.Xr openssl 1
diff --git a/usr.sbin/fwget/pci/pci_network_mediatek b/usr.sbin/fwget/pci/pci_network_mediatek
index 653c87c410eb..e1e15dcfa2e5 100644
--- a/usr.sbin/fwget/pci/pci_network_mediatek
+++ b/usr.sbin/fwget/pci/pci_network_mediatek
@@ -38,24 +38,24 @@ pci_network_mediatek_mt76()
# { sys/contrib/dev/mediatek/mt76/zzz_fw_ports_fwget.sh }
### >>>
- 0x0608) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x0616) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x0717) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7611) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7615) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7663) addpkg "wifi-firmware-mediatek-kmod-mt7615"; return 1 ;;
- 0x7906) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x790a) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7915) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7916) addpkg "wifi-firmware-mediatek-kmod-mt7915"; return 1 ;;
- 0x7920) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7922) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7925) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7961) addpkg "wifi-firmware-mediatek-kmod-mt792x"; return 1 ;;
- 0x7990) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x7991) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x7992) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
- 0x799a) addpkg "wifi-firmware-mediatek-kmod-mt7996"; return 1 ;;
+ 0x0608) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x0616) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x0717) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7611) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7615) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7663) addpkg "wifi-firmware-mt76-kmod-mt7615"; return 1 ;;
+ 0x7906) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x790a) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7915) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7916) addpkg "wifi-firmware-mt76-kmod-mt7915"; return 1 ;;
+ 0x7920) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7922) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7925) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7961) addpkg "wifi-firmware-mt76-kmod-mt792x"; return 1 ;;
+ 0x7990) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x7991) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x7992) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
+ 0x799a) addpkg "wifi-firmware-mt76-kmod-mt7996"; return 1 ;;
### <<<
esac