summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--share/man/man4/tcp.417
-rw-r--r--sys/netinet/in_pcb.c112
-rw-r--r--sys/netinet/in_pcb.h3
-rw-r--r--sys/netinet/tcp.h4
-rw-r--r--sys/netinet/tcp_usrreq.c10
-rw-r--r--sys/netinet6/in6_pcb.c37
-rw-r--r--sys/netinet6/in6_pcb.h2
7 files changed, 150 insertions, 35 deletions
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index b046c56ef4e6..24b2e2d24d19 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd November 25, 2020
+.Dd December 19, 2020
.Dt TCP 4
.Os
.Sh NAME
@@ -314,6 +314,21 @@ Enable in-kernel TLS for data read from this socket.
See
.Xr ktls 4
for more details.
+.It Dv TCP_REUSPORT_LB_NUMA
+Changes NUMA affinity filtering for an established TCP listen
+socket.
+This option takes a single integer argument which specifies
+the NUMA domain to filter on for this listen socket.
+The argument can also have the following special values:
+.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
+.It Dv TCP_REUSPORT_LB_NUMA_NODOM
+Remove NUMA filtering for this listen socket.
+.It Dv TCP_REUSPORT_LB_NUMA_CURDOM
+Filter traffic associated with the domain where the calling thread is
+currently executing.
+This is typically used after a process or thread inherits a listen
+socket from its parent, and sets its CPU affinity to a particular core.
+.El
.El
.Pp
The option level for the
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 03cd09fb448d..5adac0fddddf 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$");
#endif
#include <vm/uma.h>
+#include <vm/vm.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -150,7 +151,8 @@ static void in_pcbremlists(struct inpcb *inp);
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
- int lookupflags, struct ifnet *ifp);
+ int lookupflags, struct ifnet *ifp,
+ uint8_t numa_domain);
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
@@ -248,7 +250,8 @@ SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
- uint16_t port, const union in_dependaddr *addr, int size)
+ uint16_t port, const union in_dependaddr *addr, int size,
+ uint8_t numa_domain)
{
struct inpcblbgroup *grp;
size_t bytes;
@@ -259,6 +262,7 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
return (NULL);
grp->il_vflag = vflag;
grp->il_lport = port;
+ grp->il_numa_domain = numa_domain;
grp->il_dependladdr = *addr;
grp->il_inpsiz = size;
CK_LIST_INSERT_HEAD(hdr, grp, il_list);
@@ -290,7 +294,8 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
int i;
grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
- old_grp->il_lport, &old_grp->il_dependladdr, size);
+ old_grp->il_lport, &old_grp->il_dependladdr, size,
+ old_grp->il_numa_domain);
if (grp == NULL)
return (NULL);
@@ -333,7 +338,7 @@ in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
* Add PCB to load balance group for SO_REUSEPORT_LB option.
*/
static int
-in_pcbinslbgrouphash(struct inpcb *inp)
+in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
const static struct timeval interval = { 60, 0 };
static struct timeval lastprint;
@@ -369,6 +374,7 @@ in_pcbinslbgrouphash(struct inpcb *inp)
CK_LIST_FOREACH(grp, hdr, il_list) {
if (grp->il_vflag == inp->inp_vflag &&
grp->il_lport == inp->inp_lport &&
+ grp->il_numa_domain == numa_domain &&
memcmp(&grp->il_dependladdr,
&inp->inp_inc.inc_ie.ie_dependladdr,
sizeof(grp->il_dependladdr)) == 0)
@@ -378,7 +384,7 @@ in_pcbinslbgrouphash(struct inpcb *inp)
/* Create new load balance group. */
grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
- INPCBLBGROUP_SIZMIN);
+ INPCBLBGROUP_SIZMIN, numa_domain);
if (grp == NULL)
return (ENOBUFS);
} else if (grp->il_inpcnt == grp->il_inpsiz) {
@@ -439,6 +445,57 @@ in_pcbremlbgrouphash(struct inpcb *inp)
}
}
+int
+in_pcblbgroup_numa(struct inpcb *inp, int arg)
+{
+ struct inpcbinfo *pcbinfo;
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ int err, i;
+ uint8_t numa_domain;
+
+ switch (arg) {
+ case TCP_REUSPORT_LB_NUMA_NODOM:
+ numa_domain = M_NODOM;
+ break;
+ case TCP_REUSPORT_LB_NUMA_CURDOM:
+ numa_domain = PCPU_GET(domain);
+ break;
+ default:
+ if (arg < 0 || arg >= vm_ndomains)
+ return (EINVAL);
+ numa_domain = arg;
+ }
+
+ err = 0;
+ pcbinfo = inp->inp_pcbinfo;
+ INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK(pcbinfo);
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+ CK_LIST_FOREACH(grp, hdr, il_list) {
+ for (i = 0; i < grp->il_inpcnt; ++i) {
+ if (grp->il_inp[i] != inp)
+ continue;
+
+ if (grp->il_numa_domain == numa_domain) {
+ goto abort_with_hash_wlock;
+ }
+
+ /* Remove it from the old group. */
+ in_pcbremlbgrouphash(inp);
+
+ /* Add it to the new group based on numa domain. */
+ in_pcbinslbgrouphash(inp, numa_domain);
+ goto abort_with_hash_wlock;
+ }
+ }
+ err = ENOENT;
+abort_with_hash_wlock:
+ INP_HASH_WUNLOCK(pcbinfo);
+ return (err);
+}
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -731,14 +788,14 @@ in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
if (lsa->sa_family == AF_INET) {
tmpinp = in_pcblookup_hash_locked(pcbinfo,
faddr, fport, laddr, lport, lookupflags,
- NULL);
+ NULL, M_NODOM);
}
#endif
#ifdef INET6
if (lsa->sa_family == AF_INET6) {
tmpinp = in6_pcblookup_hash_locked(pcbinfo,
faddr6, fport, laddr6, lport, lookupflags,
- NULL);
+ NULL, M_NODOM);
}
#endif
} else {
@@ -1399,9 +1456,10 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
if (error)
return (error);
}
+
if (lport != 0) {
oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
- fport, laddr, lport, 0, NULL);
+ fport, laddr, lport, 0, NULL, M_NODOM);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
@@ -2019,9 +2077,9 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
- uint16_t fport, int lookupflags)
+ uint16_t fport, int lookupflags, int numa_domain)
{
- struct inpcb *local_wild;
+ struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
@@ -2041,6 +2099,7 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
* - Load balanced group does not contain IPv4 mapped INET6 wild sockets
*/
local_wild = NULL;
+ numa_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET6
if (!(grp->il_vflag & INP_IPV4))
@@ -2051,12 +2110,24 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
grp->il_inpcnt;
- if (grp->il_laddr.s_addr == laddr->s_addr)
- return (grp->il_inp[idx]);
+ if (grp->il_laddr.s_addr == laddr->s_addr) {
+ if (numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain) {
+ return (grp->il_inp[idx]);
+ } else {
+ numa_wild = grp->il_inp[idx];
+ }
+ }
if (grp->il_laddr.s_addr == INADDR_ANY &&
- (lookupflags & INPLOOKUP_WILDCARD) != 0)
+ (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
+ (local_wild == NULL || numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
+ }
}
+ if (numa_wild != NULL)
+ return (numa_wild);
+
return (local_wild);
}
@@ -2303,7 +2374,7 @@ found:
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
@@ -2348,7 +2419,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
- fport, lookupflags);
+ fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
@@ -2435,12 +2506,13 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcb *inp;
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
- (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
+ (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
+ numa_domain);
if (inp != NULL) {
if (lookupflags & INPLOOKUP_WLOCKPCB) {
INP_WLOCK(inp);
@@ -2507,7 +2579,7 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, M_NODOM));
}
struct inpcb *
@@ -2549,7 +2621,7 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, m->m_pkthdr.numa_domain));
}
#endif /* INET */
@@ -2591,7 +2663,7 @@ in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
*/
so_options = inp_so_options(inp);
if (so_options & SO_REUSEPORT_LB) {
- int ret = in_pcbinslbgrouphash(inp);
+ int ret = in_pcbinslbgrouphash(inp, M_NODOM);
if (ret) {
/* pcb lb group malloc fail (ret=ENOBUFS). */
return (ret);
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 56e2204f9054..080d07cc7218 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -565,7 +565,7 @@ struct inpcblbgroup {
struct epoch_context il_epoch_ctx;
uint16_t il_lport; /* (c) */
u_char il_vflag; /* (c) */
- u_char il_pad;
+ u_int8_t il_numa_domain;
uint32_t il_pad2;
union in_dependaddr il_dependladdr; /* (c) */
#define il_laddr il_dependladdr.id46_addr.ia46_addr4
@@ -852,6 +852,7 @@ int in_pcbinshash(struct inpcb *);
int in_pcbinshash_mbuf(struct inpcb *, struct mbuf *);
int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
struct ucred *);
+int in_pcblbgroup_numa(struct inpcb *, int arg);
struct inpcb *
in_pcblookup_local(struct inpcbinfo *,
struct in_addr, u_short, int, struct ucred *);
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index faf142959375..0b71bd4658f8 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -196,6 +196,7 @@ struct tcphdr {
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Options for Rack and BBR */
+#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
@@ -406,4 +407,7 @@ struct tcp_function_set {
#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
+#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */
+#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */
+
#endif /* !_NETINET_TCP_H_ */
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index ffee58b9043f..bfa96ce093b5 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2143,6 +2143,16 @@ unlock_and_done:
INP_WUNLOCK(inp);
break;
+ case TCP_REUSPORT_LB_NUMA:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ INP_WLOCK_RECHECK(inp);
+ if (!error)
+ error = in_pcblbgroup_numa(inp, optval);
+ INP_WUNLOCK(inp);
+ break;
+
#ifdef KERN_TLS
case TCP_TXTLS_ENABLE:
INP_WUNLOCK(inp);
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 1040a3741999..567a7918f159 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -446,7 +446,7 @@ in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
? &laddr6.sin6_addr : &inp->in6p_laddr,
- inp->inp_lport, 0, NULL) != NULL) {
+ inp->inp_lport, 0, NULL, M_NODOM) != NULL) {
return (EADDRINUSE);
}
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
@@ -903,9 +903,9 @@ in6_rtchange(struct inpcb *inp, int errno __unused)
static struct inpcb *
in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
- uint16_t fport, int lookupflags)
+ uint16_t fport, int lookupflags, uint8_t numa_domain)
{
- struct inpcb *local_wild;
+ struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
@@ -925,6 +925,7 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
* - Load balanced does not contain IPv4 mapped INET6 wild sockets.
*/
local_wild = NULL;
+ numa_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET
if (!(grp->il_vflag & INP_IPV6))
@@ -935,12 +936,23 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport,
fport) % grp->il_inpcnt;
- if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr))
- return (grp->il_inp[idx]);
+ if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
+ if (numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain) {
+ return (grp->il_inp[idx]);
+ }
+ else
+ numa_wild = grp->il_inp[idx];
+ }
if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
- (lookupflags & INPLOOKUP_WILDCARD) != 0)
+ (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
+ (local_wild == NULL || numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
+ }
}
+ if (numa_wild != NULL)
+ return (numa_wild);
return (local_wild);
}
@@ -1151,7 +1163,7 @@ found:
struct inpcb *
in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
- int lookupflags, struct ifnet *ifp)
+ int lookupflags, struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
@@ -1195,7 +1207,7 @@ in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
- fport, lookupflags);
+ fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
@@ -1273,12 +1285,13 @@ in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
static struct inpcb *
in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcb *inp;
inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
- (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
+ (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
+ numa_domain);
if (inp != NULL) {
if (lookupflags & INPLOOKUP_WLOCKPCB) {
INP_WLOCK(inp);
@@ -1344,7 +1357,7 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
}
#endif
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, M_NODOM));
}
struct inpcb *
@@ -1386,7 +1399,7 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
}
#endif
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, m->m_pkthdr.numa_domain));
}
void
diff --git a/sys/netinet6/in6_pcb.h b/sys/netinet6/in6_pcb.h
index d2df04402b2f..06df113c2325 100644
--- a/sys/netinet6/in6_pcb.h
+++ b/sys/netinet6/in6_pcb.h
@@ -95,7 +95,7 @@ struct inpcb *
struct inpcb *
in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr,
- u_int lport_arg, int lookupflags, struct ifnet *ifp);
+ u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t);
struct inpcb *
in6_pcblookup(struct inpcbinfo *, struct in6_addr *,
u_int, struct in6_addr *, u_int, int,