Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/cc/cc.c                |    2
-rw-r--r--  sys/netinet/in_fib_algo.c          |    2
-rw-r--r--  sys/netinet/in_mcast.c             |  113
-rw-r--r--  sys/netinet/in_pcb.c               |   13
-rw-r--r--  sys/netinet/in_proto.c             |    2
-rw-r--r--  sys/netinet/libalias/alias_db.c    |    2
-rw-r--r--  sys/netinet/raw_ip.c               |    4
-rw-r--r--  sys/netinet/siftr.c                |    2
-rw-r--r--  sys/netinet/tcp_hpts.c             |  933
-rw-r--r--  sys/netinet/tcp_hpts.h             |   50
-rw-r--r--  sys/netinet/tcp_hpts_internal.h    |  184
-rw-r--r--  sys/netinet/tcp_hpts_test.c        | 1682
-rw-r--r--  sys/netinet/tcp_input.c            |    2
-rw-r--r--  sys/netinet/tcp_lro.c              |    9
-rw-r--r--  sys/netinet/tcp_lro_hpts.c         |    3
-rw-r--r--  sys/netinet/tcp_output.c           |    2
-rw-r--r--  sys/netinet/tcp_stacks/bbr.c       |  139
-rw-r--r--  sys/netinet/tcp_stacks/rack.c      |  536
-rw-r--r--  sys/netinet/tcp_stacks/tcp_rack.h  |    1
-rw-r--r--  sys/netinet/tcp_subr.c             |    2
-rw-r--r--  sys/netinet/tcp_syncache.c         |   52
-rw-r--r--  sys/netinet/tcp_timer.c            |    7
-rw-r--r--  sys/netinet/udp_usrreq.c           |   49
-rw-r--r--  sys/netinet/udp_var.h              |    1
24 files changed, 2767 insertions, 1025 deletions
| diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index c20a20cd983d..bc06616dbf93 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -271,7 +271,7 @@ cc_check_default(struct cc_algo *remove_cc)   * Initialise CC subsystem on system boot.   */  static void -cc_init(void) +cc_init(void *dummy __unused)  {  	CC_LIST_LOCK_INIT();  	STAILQ_INIT(&cc_list); diff --git a/sys/netinet/in_fib_algo.c b/sys/netinet/in_fib_algo.c index 123dacb409e7..95621c300064 100644 --- a/sys/netinet/in_fib_algo.c +++ b/sys/netinet/in_fib_algo.c @@ -767,7 +767,7 @@ struct fib_lookup_module flm_radix4 = {  };  static void -fib4_algo_init(void) +fib4_algo_init(void *dummy __unused)  {  	fib_module_register(&flm_bsearch4); diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index f5b20c49ffd2..ba112afbf002 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -159,9 +159,6 @@ static struct ip_moptions *  static int	inp_get_source_filters(struct inpcb *, struct sockopt *);  static int	inp_join_group(struct inpcb *, struct sockopt *);  static int	inp_leave_group(struct inpcb *, struct sockopt *); -static struct ifnet * -		inp_lookup_mcast_ifp(const struct inpcb *, -		    const struct sockaddr_in *, const struct in_addr);  static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);  static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);  static int	inp_set_source_filters(struct inpcb *, struct sockopt *); @@ -1832,69 +1829,55 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)  }  /* - * Look up the ifnet to use for a multicast group membership, - * given the IPv4 address of an interface, and the IPv4 group address. - * - * This routine exists to support legacy multicast applications - * which do not understand that multicast memberships are scoped to - * specific physical links in the networking stack, or which need - * to join link-scope groups before IPv4 addresses are configured. - * - * Use this socket's current FIB number for any required FIB lookup. - * If ina is INADDR_ANY, look up the group address in the unicast FIB, - * and use its ifp; usually, this points to the default next-hop. - * - * If the FIB lookup fails, attempt to use the first non-loopback - * interface with multicast capability in the system as a - * last resort. The legacy IPv4 ASM API requires that we do - * this in order to allow groups to be joined when the routing - * table has not yet been populated during boot. - * - * Returns NULL if no ifp could be found, otherwise return referenced ifp. + * Look up the ifnet to join a multicast group membership via legacy + * IP_ADD_MEMBERSHIP or via more modern MCAST_JOIN_GROUP.   * - * FUTURE: Implement IPv4 source-address selection. + * If the interface index was specified explicitly, just use it.  If the + * address was specified (legacy), try to find matching interface.  Else + * (index == 0 && no address) do a route lookup.  If that fails for a modern + * MCAST_JOIN_GROUP return failure, for legacy IP_ADD_MEMBERSHIP find first + * multicast capable interface.   
*/  static struct ifnet * -inp_lookup_mcast_ifp(const struct inpcb *inp, -    const struct sockaddr_in *gsin, const struct in_addr ina) +inp_lookup_mcast_ifp(const struct inpcb *inp, const struct in_addr maddr, +const struct in_addr *ina, const u_int index)  {  	struct ifnet *ifp;  	struct nhop_object *nh;  	NET_EPOCH_ASSERT(); -	KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__)); -	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); -	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), -	    ("%s: not multicast", __func__)); -	ifp = NULL; -	if (!in_nullhost(ina)) { -		INADDR_TO_IFP(ina, ifp); +	if (index != 0) +		return (ifnet_byindex_ref(index)); + +	if (ina != NULL && !in_nullhost(*ina)) { +		INADDR_TO_IFP(*ina, ifp);  		if (ifp != NULL)  			if_ref(ifp); -	} else { -		nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0); -		if (nh != NULL) { -			ifp = nh->nh_ifp; -			if_ref(ifp); -		} else { -			struct in_ifaddr *ia; -			struct ifnet *mifp; - -			mifp = NULL; -			CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { -				mifp = ia->ia_ifp; -				if (!(mifp->if_flags & IFF_LOOPBACK) && -				     (mifp->if_flags & IFF_MULTICAST)) { -					ifp = mifp; -					if_ref(ifp); -					break; -				} +		return (ifp); +	} + +	nh = fib4_lookup(inp->inp_inc.inc_fibnum, maddr, 0, NHR_NONE, 0); +	if (nh != NULL) { +		ifp = nh->nh_ifp; +		if_ref(ifp); +		return (ifp); +	} + +	if (ina != NULL) { +		struct in_ifaddr *ia; + +		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { +			if (!(ia->ia_ifp->if_flags & IFF_LOOPBACK) && +			     (ia->ia_ifp->if_flags & IFF_MULTICAST)) { +				ifp = ia->ia_ifp; +				if_ref(ifp); +				return (ifp);  			}  		}  	} -	return (ifp); +	return (NULL);  }  /* @@ -1926,13 +1909,13 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)  	switch (sopt->sopt_name) {  	case IP_ADD_MEMBERSHIP: {  		struct ip_mreqn mreqn; +		bool mreq; -		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) -			error = sooptcopyin(sopt, &mreqn, -			    sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); -		else -			error = sooptcopyin(sopt, &mreqn, -			    sizeof(struct ip_mreq), sizeof(struct ip_mreq)); +		mreq = (sopt->sopt_valsize != sizeof(struct ip_mreqn)); + +		error = sooptcopyin(sopt, &mreqn, +		    mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn), +		    mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn));  		if (error)  			return (error); @@ -1943,12 +1926,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)  			return (EINVAL);  		NET_EPOCH_ENTER(et); -		if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && -		    mreqn.imr_ifindex != 0) -			ifp = ifnet_byindex_ref(mreqn.imr_ifindex); -		else -			ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, -			    mreqn.imr_address); +		ifp = inp_lookup_mcast_ifp(inp, mreqn.imr_multiaddr, +		    mreq ? &mreqn.imr_address : NULL, +		    mreq ? 
0 : mreqn.imr_ifindex);  		NET_EPOCH_EXIT(et);  		break;  	} @@ -1971,8 +1951,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)  		ssa->sin.sin_addr = mreqs.imr_sourceaddr;  		NET_EPOCH_ENTER(et); -		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, -		    mreqs.imr_interface); +		ifp = inp_lookup_mcast_ifp(inp, mreqs.imr_multiaddr, +		    &mreqs.imr_interface, 0);  		NET_EPOCH_EXIT(et);  		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",  		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp); @@ -2013,7 +1993,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)  			return (EINVAL);  		NET_EPOCH_ENTER(et); -		ifp = ifnet_byindex_ref(gsr.gsr_interface); +		ifp = inp_lookup_mcast_ifp(inp, gsa->sin.sin_addr, NULL, +		    gsr.gsr_interface);  		NET_EPOCH_EXIT(et);  		if (ifp == NULL)  			return (EADDRNOTAVAIL); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index dbe48242381d..712ff28768dc 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -2665,10 +2665,13 @@ in_pcbinshash(struct inpcb *inp)  	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];  	/* -	 * Add entry to load balance group. -	 * Only do this if SO_REUSEPORT_LB is set. +	 * Ignore SO_REUSEPORT_LB if the socket is connected.  Really this case +	 * should be an error, but for UDP sockets it is not, and some +	 * applications erroneously set it on connected UDP sockets, so we can't +	 * change this without breaking compatibility.  	 */ -	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { +	if (!connected && +	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {  		int error = in_pcbinslbgrouphash(inp, M_NODOM);  		if (error != 0)  			return (error); @@ -2770,6 +2773,10 @@ in_pcbrehash(struct inpcb *inp)  		connected = !in_nullhost(inp->inp_faddr);  	} +	/* See the comment in in_pcbinshash(). */ +	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0) +		in_pcbremlbgrouphash(inp); +  	/*  	 * When rehashing, the caller must ensure that either the new or the old  	 * foreign address was unspecified. 
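As context for the in_mcast.c change above: inp_lookup_mcast_ifp() now serves both join styles with a single precedence (explicit interface index, then interface address, then a route lookup on the group, then, for the legacy API only, the first multicast-capable interface). A minimal userland sketch of the two setsockopt() forms that reach this path follows; the function names, group address handling, and error handling are illustrative and are not part of the commit.

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/*
 * Legacy join (IP_ADD_MEMBERSHIP): the kernel picks the interface from
 * imr_ifindex if set, else imr_address, else a route lookup on the group,
 * else the first multicast-capable interface in the system.
 */
static int
join_legacy(int fd, const char *group, unsigned int ifindex)
{
	struct ip_mreqn mreqn;

	memset(&mreqn, 0, sizeof(mreqn));
	if (inet_pton(AF_INET, group, &mreqn.imr_multiaddr) != 1)
		return (-1);
	mreqn.imr_ifindex = ifindex;	/* 0 lets the kernel choose */
	return (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreqn,
	    sizeof(mreqn)));
}

/*
 * Modern join (MCAST_JOIN_GROUP): with gr_interface == 0 the kernel falls
 * back to the route lookup only; there is no "first capable interface"
 * fallback, so the join fails if no route to the group exists.
 */
static int
join_modern(int fd, const char *group, uint32_t ifindex)
{
	struct group_req greq;
	struct sockaddr_in *sin = (struct sockaddr_in *)&greq.gr_group;

	memset(&greq, 0, sizeof(greq));
	greq.gr_interface = ifindex;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	if (inet_pton(AF_INET, group, &sin->sin_addr) != 1)
		return (-1);
	return (setsockopt(fd, IPPROTO_IP, MCAST_JOIN_GROUP, &greq,
	    sizeof(greq)));
}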
diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index db46da6022c5..42a6cf0b5810 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -108,6 +108,8 @@ SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,      "ICMP");  SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,      "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDPLITE, udplite, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +    "UDP-Lite");  SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,      "TCP");  #if defined(SCTP) || defined(SCTP_SUPPORT) diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index c143d74a2f45..41f0a328daec 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -2181,7 +2181,7 @@ LibAliasInit(struct libalias *la)  #undef malloc	/* XXX: ugly */  		la = malloc(sizeof *la, M_ALIAS, M_WAITOK | M_ZERO);  #else -		la = calloc(sizeof *la, 1); +		la = calloc(1, sizeof *la);  		if (la == NULL)  			return (la);  #endif diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 66070faf97e9..bfe608be6b36 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -680,7 +680,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt)  			break;  		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */ -		case IP_DUMMYNET_GET:  			if (ip_dn_ctl_ptr != NULL)  				error = ip_dn_ctl_ptr(sopt);  			else @@ -747,9 +746,6 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt)  			break;  		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */ -		case IP_DUMMYNET_CONFIGURE: -		case IP_DUMMYNET_DEL: -		case IP_DUMMYNET_FLUSH:  			if (ip_dn_ctl_ptr != NULL)  				error = ip_dn_ctl_ptr(sopt);  			else diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 374b5595fcbc..5b89ca026e85 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -519,7 +519,7 @@ siftr_pkt_manager_thread(void *arg)  			if (log_buf != NULL) {  				alq_post_flags(siftr_alq, log_buf, 0);  			} -			for (;cnt > 0; cnt--) { +			for (; cnt > 0; cnt--) {  				pkt_node = STAILQ_FIRST(&tmp_pkt_queue);  				STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes);  				free(pkt_node, M_SIFTR_PKTNODE); diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 63bbe4bba11b..c54459bb5f01 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -39,15 +39,14 @@   * First, and probably the main thing its used by Rack and BBR, it can   * be used to call tcp_output() of a transport stack at some time in the future.   * The normal way this is done is that tcp_output() of the stack schedules - * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The - * slot is the time from now that the stack wants to be called but it - * must be converted to tcp_hpts's notion of slot. This is done with - * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical + * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The + * usecs is the time from now that the stack wants to be called and is + * passing time directly in microseconds. So a typical   * call from the tcp_output() routine might look like:   * - * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); + * tcp_hpts_insert(tp, 550, NULL);   * - * The above would schedule tcp_output() to be called in 550 useconds. + * The above would schedule tcp_output() to be called in 550 microseconds.   
* Note that if using this mechanism the stack will want to add near   * its top a check to prevent unwanted calls (from user land or the   * arrival of incoming ack's). So it would add something like: @@ -149,27 +148,44 @@  #include <netinet/tcpip.h>  #include <netinet/cc/cc.h>  #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h>  #include <netinet/tcp_log_buf.h>  #ifdef tcp_offload  #include <netinet/tcp_offload.h>  #endif -/* - * The hpts uses a 102400 wheel. The wheel - * defines the time in 10 usec increments (102400 x 10). - * This gives a range of 10usec - 1024ms to place - * an entry within. If the user requests more than - * 1.024 second, a remaineder is attached and the hpts - * when seeing the remainder will re-insert the - * inpcb forward in time from where it is until - * the remainder is zero. - */ +/* Global instance for TCP HPTS */ +struct tcp_hptsi *tcp_hptsi_pace; + +/* Default function table for production use. */ +const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = { +	.microuptime = microuptime, +	.swi_add = swi_add, +	.swi_remove = swi_remove, +	.swi_sched = swi_sched, +	.intr_event_bind = intr_event_bind, +	.intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset, +	.callout_init = callout_init, +	.callout_reset_sbt_on = callout_reset_sbt_on, +	._callout_stop_safe = _callout_stop_safe, +}; -#define NUM_OF_HPTSI_SLOTS 102400 +#ifdef TCP_HPTS_KTEST +#define microuptime pace->funcs->microuptime +#define swi_add pace->funcs->swi_add +#define swi_remove pace->funcs->swi_remove +#define swi_sched pace->funcs->swi_sched +#define intr_event_bind pace->funcs->intr_event_bind +#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset +#define callout_init pace->funcs->callout_init +#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on +#define _callout_stop_safe pace->funcs->_callout_stop_safe +#endif -/* The number of connections after which the dynamic sleep logic kicks in. */ -#define DEFAULT_CONNECTION_THRESHOLD 100 +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); + +static void tcp_hpts_thread(void *ctx);  /*   * When using the hpts, a TCP stack must make sure @@ -204,87 +220,22 @@   *   * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh   * then we do a dynamic adjustment on the time we sleep. - * Our threshold is if the lateness of the first client served (in ticks) is + * Our threshold is if the lateness of the first client served (in slots) is   * greater than or equal too slots_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * slots_indicate_more_sleep (100 ticks or 1000usecs). + * or 10000 slots). If we were that late, the actual sleep time + * is adjusted down by 50%. If the slots_ran is less than + * slots_indicate_more_sleep (100 slots or 1000usecs).   
*   */ -/* Each hpts has its own p_mtx which is used for locking */ -#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED) -#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx) -#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx) -#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx) -struct tcp_hpts_entry { -	/* Cache line 0x00 */ -	struct mtx p_mtx;	/* Mutex for hpts */ -	struct timeval p_mysleep;	/* Our min sleep time */ -	uint64_t syscall_cnt; -	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */ -	uint16_t p_hpts_active; /* Flag that says hpts is awake  */ -	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ -	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */ -	uint32_t p_runningslot; /* Current tick we are at if we are running */ -	uint32_t p_prev_slot;	/* Previous slot we were on */ -	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */ -	uint32_t p_nxt_slot;	/* The next slot outside the current range of -				 * slots that the hpts is running on. */ -	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */ -	uint32_t p_lasttick;	/* Last tick before the current one */ -	uint8_t p_direct_wake :1, /* boolean */ -		p_on_min_sleep:1, /* boolean */ -		p_hpts_wake_scheduled:1, /* boolean */ -		hit_callout_thresh:1, -		p_avail:4; -	uint8_t p_fill[3];	  /* Fill to 32 bits */ -	/* Cache line 0x40 */ -	struct hptsh { -		TAILQ_HEAD(, tcpcb)	head; -		uint32_t		count; -		uint32_t		gencnt; -	} *p_hptss;			/* Hptsi wheel */ -	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max -					 * of 255ms */ -	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */ -	uint32_t saved_lasttick;	/* for logging */ -	uint32_t saved_curtick;		/* for logging */ -	uint32_t saved_curslot;		/* for logging */ -	uint32_t saved_prev_slot;       /* for logging */ -	uint32_t p_delayed_by;	/* How much were we delayed by */ -	/* Cache line 0x80 */ -	struct sysctl_ctx_list hpts_ctx; -	struct sysctl_oid *hpts_root; -	struct intr_event *ie; -	void *ie_cookie; -	uint16_t p_num;		/* The hpts number one per cpu */ -	uint16_t p_cpu;		/* The hpts CPU */ -	/* There is extra space in here */ -	/* Cache line 0x100 */ -	struct callout co __aligned(CACHE_LINE_SIZE); -}               __aligned(CACHE_LINE_SIZE); - -static struct tcp_hptsi { -	struct cpu_group **grps; -	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */ -	uint32_t *cts_last_ran; -	uint32_t grp_cnt; -	uint32_t rp_num_hptss;	/* Number of hpts threads */ -} tcp_pace; - -static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");  #ifdef RSS -static int tcp_bind_threads = 1; +int tcp_bind_threads = 1;  #else -static int tcp_bind_threads = 2; +int tcp_bind_threads = 2;  #endif  static int tcp_use_irq_cpu = 0;  static int hpts_does_tp_logging = 0; - -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); -static void tcp_hpts_thread(void *ctx); - +static int32_t tcp_hpts_precision = 120;  int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;  static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;  static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; @@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,  SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,      "TCP Hpts statistics"); -#define	timersub(tvp, uvp, vvp)						\ -	do {								\ -		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\ -		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\ -		if ((vvp)->tv_usec < 0) {				\ -			
(vvp)->tv_sec--;				\ -			(vvp)->tv_usec += 1000000;			\ -		}							\ -	} while (0) - -static int32_t tcp_hpts_precision = 120; - -static struct hpts_domain_info { -	int count; -	int cpu[MAXCPU]; -} hpts_domains[MAXMEMDOM]; -  counter_u64_t hpts_hopelessly_behind;  SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,      &tcp_hpts_no_wake_over_thresh, 0,      "When we are over the threshold on the pacer do we prohibit wakeups?"); -static uint16_t -hpts_random_cpu(void) +uint16_t +tcp_hptsi_random_cpu(struct tcp_hptsi *pace)  {  	uint16_t cpuid;  	uint32_t ran;  	ran = arc4random(); -	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); +	cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);  	return (cpuid);  } @@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,  		log.u_bbr.flex2 = hpts->p_cur_slot;  		log.u_bbr.flex3 = hpts->p_prev_slot;  		log.u_bbr.flex4 = idx; -		log.u_bbr.flex5 = hpts->p_curtick;  		log.u_bbr.flex6 = hpts->p_on_queue_cnt;  		log.u_bbr.flex7 = hpts->p_cpu;  		log.u_bbr.flex8 = (uint8_t)from_callout;  		log.u_bbr.inflight = slots_to_run;  		log.u_bbr.applimited = hpts->overidden_sleep; -		log.u_bbr.delivered = hpts->saved_curtick;  		log.u_bbr.timeStamp = tcp_tv_to_usec(tv);  		log.u_bbr.epoch = hpts->saved_curslot;  		log.u_bbr.lt_epoch = hpts->saved_prev_slot; @@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,  	}  } +/* + * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI + * for the HPTS entry to run. + */  static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) +tcp_hpts_sleep_timeout(void *arg)  { +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +	struct tcp_hpts_entry *hpts; + +	hpts = (struct tcp_hpts_entry *)arg; +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif +	swi_sched(hpts->ie_cookie, 0); +} + +/* + * Reset the HPTS callout timer with the provided timeval. Returns the results + * of the callout_reset_sbt_on() function. + */ +static int +tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv) +{ +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +	sbintime_t sb; + +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif + +	/* Store off to make visible the actual sleep time */ +	hpts->sleeping = tv->tv_usec; + +	sb = tvtosbt(*tv); +	return (callout_reset_sbt_on( +		    &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu, +		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)))); +} + +/* + * Schedules the SWI for the HTPS entry to run, if not already scheduled or + * running. 
+ */ +void +tcp_hpts_wake(struct tcp_hpts_entry *hpts) +{ +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif +  	HPTS_MTX_ASSERT(hpts); +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif +  	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {  		hpts->p_direct_wake = 0;  		return; @@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)  }  static void -hpts_timeout_swi(void *arg) -{ -	struct tcp_hpts_entry *hpts; - -	hpts = (struct tcp_hpts_entry *)arg; -	swi_sched(hpts->ie_cookie, 0); -} - -static void  tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)  {  	struct inpcb *inp = tptoinpcb(tp); @@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)  }  static struct tcp_hpts_entry * -tcp_hpts_lock(struct tcpcb *tp) +tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	INP_LOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; +	hpts = pace->rp_ent[tp->t_hpts_cpu];  	HPTS_LOCK(hpts);  	return (hpts); @@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp)   * and has never received a first packet.   */  void -tcp_hpts_init(struct tcpcb *tp) +__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)  { -  	if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { -		tp->t_hpts_cpu = hpts_random_cpu(); +		tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);  		MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));  	}  } @@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp)   * INP lock and then get the hpts lock.   */  void -tcp_hpts_remove(struct tcpcb *tp) +__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	struct hptsh *hptsh;  	INP_WLOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_hpts_lock(tp); +	hpts = tcp_hpts_lock(pace, tp);  	if (tp->t_in_hpts == IHPTS_ONQUEUE) {  		hptsh = &hpts->p_hptss[tp->t_hpts_slot];  		tp->t_hpts_request = 0; @@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus)  {  	/*  	 * Given a slot on the wheel, what slot -	 * is that plus ticks out? +	 * is that plus slots out?  	 */ -	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); +	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));  	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);  }  static inline int -tick_to_wheel(uint32_t cts_in_wticks) +cts_to_wheel(uint32_t cts)  {  	/* -	 * Given a timestamp in ticks (so by -	 * default to get it to a real time one -	 * would multiply by 10.. i.e the number -	 * of ticks in a slot) map it to our limited -	 * space wheel. +	 * Given a timestamp in useconds map it to our limited space wheel.  	 */ -	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +	return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);  }  static inline int @@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  	if ((hpts->p_hpts_active == 1) &&  	    (hpts->p_wheel_complete == 0)) {  		end_slot = hpts->p_runningslot; -		/* Back up one tick */ +		/* Back up one slot */  		if (end_slot == 0)  			end_slot = NUM_OF_HPTSI_SLOTS - 1;  		else @@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  		 * not active, or we have  		 * completed the pass over  		 * the wheel, we can use the -		 * prev tick and subtract one from it. This puts us +		 * prev slot and subtract one from it. This puts us  		 * as far out as possible on the wheel.  		 
*/  		end_slot = hpts->p_prev_slot; @@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  		/*  		 * Now we have close to the full wheel left minus the  		 * time it has been since the pacer went to sleep. Note -		 * that wheel_tick, passed in, should be the current time +		 * that wheel_slot, passed in, should be the current time  		 * from the perspective of the caller, mapped to the wheel.  		 */  		if (hpts->p_prev_slot != wheel_slot) @@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *  #ifdef INVARIANTS  static void  check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, -    uint32_t hptsslot, int line) +    uint32_t hptsslot)  {  	/*  	 * Sanity checks for the pacer with invariants @@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,  }  #endif -uint32_t -tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) +void +__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, +	struct hpts_diag *diag)  {  	struct tcp_hpts_entry *hpts;  	struct timeval tv; -	uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; +	uint32_t slot, wheel_cts, last_slot, need_new_to = 0;  	int32_t wheel_slot, maxslots;  	bool need_wakeup = false; @@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));  	/* +	 * Convert microseconds to slots for internal use.  	 * We now return the next-slot the hpts will be on, beyond its  	 * current run (if up) or where it was when it stopped if it is  	 * sleeping.  	 */ -	hpts = tcp_hpts_lock(tp); +	slot = HPTS_USEC_TO_SLOTS(usecs); +	hpts = tcp_hpts_lock(pace, tp);  	microuptime(&tv);  	if (diag) {  		memset(diag, 0, sizeof(struct hpts_diag)); @@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  		diag->p_runningslot = hpts->p_runningslot;  		diag->p_nxt_slot = hpts->p_nxt_slot;  		diag->p_cur_slot = hpts->p_cur_slot; -		diag->p_curtick = hpts->p_curtick; -		diag->p_lasttick = hpts->p_lasttick;  		diag->slot_req = slot;  		diag->p_on_min_sleep = hpts->p_on_min_sleep;  		diag->hpts_sleep_time = hpts->p_hpts_sleep_time; @@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  			 * timeout is not 1.  			 */  			hpts->p_direct_wake = 1; -			tcp_wakehpts(hpts); +			tcp_hpts_wake(hpts);  		} -		slot_on = hpts->p_nxt_slot;  		HPTS_UNLOCK(hpts); -		return (slot_on); +		return;  	} -	/* Get the current time relative to the wheel */ -	wheel_cts = tcp_tv_to_hpts_slot(&tv); -	/* Map it onto the wheel */ -	wheel_slot = tick_to_wheel(wheel_cts); +	/* Get the current time stamp and map it onto the wheel */ +	wheel_cts = tcp_tv_to_usec(&tv); +	wheel_slot = cts_to_wheel(wheel_cts);  	/* Now what's the max we can place it at? 
*/  	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);  	if (diag) { @@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  		tp->t_hpts_slot = last_slot;  	}  	if (diag) { -		diag->slot_remaining = tp->t_hpts_request; +		diag->time_remaining = tp->t_hpts_request;  		diag->inp_hptsslot = tp->t_hpts_slot;  	}  #ifdef INVARIANTS -	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); +	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot);  #endif  	if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))  		tcp_hpts_insert_internal(tp, hpts); @@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	}  	/*  	 * Now how far is the hpts sleeping to? if active is 1, its -	 * up and ticking we do nothing, otherwise we may need to +	 * up and running we do nothing, otherwise we may need to  	 * reschedule its callout if need_new_to is set from above.  	 */  	if (need_wakeup) {  		hpts->p_direct_wake = 1; -		tcp_wakehpts(hpts); +		tcp_hpts_wake(hpts);  		if (diag) {  			diag->need_new_to = 0;  			diag->co_ret = 0xffff0000; @@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  	} else if (need_new_to) {  		int32_t co_ret;  		struct timeval tv; -		sbintime_t sb;  		tv.tv_sec = 0;  		tv.tv_usec = 0; @@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_  			tv.tv_sec++;  			need_new_to -= HPTS_USEC_IN_SEC;  		} -		tv.tv_usec = need_new_to; -		sb = tvtosbt(tv); -		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, -					      hpts_timeout_swi, hpts, hpts->p_cpu, -					      (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); +		tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */ +		co_ret = tcp_hpts_sleep(hpts, &tv);  		if (diag) {  			diag->need_new_to = need_new_to;  			diag->co_ret = co_ret;  		}  	} -	slot_on = hpts->p_nxt_slot;  	HPTS_UNLOCK(hpts); - -	return (slot_on);  }  static uint16_t -hpts_cpuid(struct tcpcb *tp, int *failed) +hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)  {  	struct inpcb *inp = tptoinpcb(tp);  	u_int cpuid; @@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  #ifdef RSS  	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);  	if (cpuid == NETISR_CPUID_NONE) -		return (hpts_random_cpu()); +		return (tcp_hptsi_random_cpu(pace));  	else  		return (cpuid);  #endif @@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  	 */  	if (inp->inp_flowtype == M_HASHTYPE_NONE) {  		counter_u64_add(cpu_uses_random, 1); -		return (hpts_random_cpu()); +		return (tcp_hptsi_random_cpu(pace));  	}  	/*  	 * Hash to a thread based on the flowid.  
If we are using numa, @@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)  #ifdef NUMA  	} else {  		/* Hash into the cpu's that use that domain */ -		di = &hpts_domains[inp->inp_numa_domain]; +		di = &pace->domains[inp->inp_numa_domain];  		cpuid = di->cpu[inp->inp_flowid % di->count];  	}  #endif @@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)  	}  } -static int32_t +static bool +tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run) +{ +	return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT)); +} + +int32_t  tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  { +	struct tcp_hptsi *pace;  	struct tcpcb *tp;  	struct timeval tv;  	int32_t slots_to_run, i, error; @@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  	int32_t wrap_loop_cnt = 0;  	int32_t slot_pos_of_endpoint = 0;  	int32_t orig_exit_slot; +	uint32_t cts, cts_last_run;  	bool completed_measure, seen_endpoint;  	completed_measure = false; @@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)  	HPTS_MTX_ASSERT(hpts);  	NET_EPOCH_ASSERT(); + +	pace = hpts->p_hptsi; +	MPASS(pace != NULL); +  	/* record previous info for any logging */ -	hpts->saved_lasttick = hpts->p_lasttick; -	hpts->saved_curtick = hpts->p_curtick;  	hpts->saved_curslot = hpts->p_cur_slot;  	hpts->saved_prev_slot = hpts->p_prev_slot; -	hpts->p_lasttick = hpts->p_curtick; -	hpts->p_curtick = tcp_gethptstick(&tv); -	tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); -	orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); +	microuptime(&tv); +	cts_last_run = pace->cts_last_ran[hpts->p_cpu]; +	pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv); + +	orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);  	if ((hpts->p_on_queue_cnt == 0) || -	    (hpts->p_lasttick == hpts->p_curtick)) { +	    !tcp_hpts_different_slots(cts, cts_last_run)) {  		/* -		 * No time has yet passed, -		 * or nothing to do. +		 * Not enough time has yet passed or nothing to do.  		 */  		hpts->p_prev_slot = hpts->p_cur_slot; -		hpts->p_lasttick = hpts->p_curtick;  		goto no_run;  	}  again:  	hpts->p_wheel_complete = 0;  	HPTS_MTX_ASSERT(hpts);  	slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); -	if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && -	    (hpts->p_on_queue_cnt != 0)) { +	if ((hpts->p_on_queue_cnt != 0) && +	    ((cts - cts_last_run) > +	     ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) {  		/*  		 * Wheel wrap is occuring, basically we  		 * are behind and the distance between @@ -1238,7 +1214,7 @@ again:  		uint32_t runningslot;  		/* -		 * Calculate our delay, if there are no extra ticks there +		 * Calculate our delay, if there are no extra slots there  		 * was not any (i.e. if slots_to_run == 1, no delay).  		 */  		hpts->p_delayed_by = (slots_to_run - (i + 1)) * @@ -1391,7 +1367,7 @@ again:  				 * gets added to the hpts (not this one)  				 * :-)  				 */ -				tcp_set_hpts(tp); +				__tcp_set_hpts(pace, tp);  			}  			CURVNET_SET(inp->inp_vnet);  			/* Lets do any logging that we might want to */ @@ -1450,10 +1426,12 @@ no_one:  	hpts->p_delayed_by = 0;  	/*  	 * Check to see if we took an excess amount of time and need to run -	 * more ticks (if we did not hit eno-bufs). +	 * more slots (if we did not hit eno-bufs).  	 
*/  	hpts->p_prev_slot = hpts->p_cur_slot; -	hpts->p_lasttick = hpts->p_curtick; +	microuptime(&tv); +	cts_last_run = cts; +	cts = tcp_tv_to_usec(&tv);  	if (!from_callout || (loop_cnt > max_pacer_loops)) {  		/*  		 * Something is serious slow we have @@ -1465,7 +1443,7 @@ no_one:  		 * can never catch up :(  		 *  		 * We will just lie to this thread -		 * and let it thing p_curtick is +		 * and let it think p_curslot is  		 * correct. When it next awakens  		 * it will find itself further behind.  		 */ @@ -1473,20 +1451,19 @@ no_one:  			counter_u64_add(hpts_hopelessly_behind, 1);  		goto no_run;  	} -	hpts->p_curtick = tcp_gethptstick(&tv); -	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + +	hpts->p_cur_slot = cts_to_wheel(cts);  	if (!seen_endpoint) {  		/* We saw no endpoint but we may be looping */  		orig_exit_slot = hpts->p_cur_slot;  	} -	if ((wrap_loop_cnt < 2) && -	    (hpts->p_lasttick != hpts->p_curtick)) { +	if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) {  		counter_u64_add(hpts_loops, 1);  		loop_cnt++;  		goto again;  	}  no_run: -	tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); +	pace->cts_last_ran[hpts->p_cpu] = cts;  	/*  	 * Set flag to tell that we are done for  	 * any slot input that happens during @@ -1494,25 +1471,36 @@ no_run:  	 */  	hpts->p_wheel_complete = 1;  	/* -	 * Now did we spend too long running input and need to run more ticks? -	 * Note that if wrap_loop_cnt < 2 then we should have the conditions -	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt -	 * is greater than 2, then the condtion most likely are *not* true. -	 * Also if we are called not from the callout, we don't run the wheel -	 * multiple times so the slots may not align either. -	 */ -	KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || -		 (wrap_loop_cnt >= 2) || !from_callout), -		("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, -		 hpts->p_prev_slot, hpts->p_cur_slot)); -	KASSERT(((hpts->p_lasttick == hpts->p_curtick) -		 || (wrap_loop_cnt >= 2) || !from_callout), -		("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, -		 hpts->p_lasttick, hpts->p_curtick)); -	if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { -		hpts->p_curtick = tcp_gethptstick(&tv); +	* If enough time has elapsed that we should be processing the next +	* slot(s), then we should have kept running and not marked the wheel as +	* complete. +	* +	* But there are several other conditions where we would have stopped +	* processing, so the prev/cur slots and cts variables won't match. +	* These conditions are: +	* +	* - Calls not from callouts don't run multiple times +	* - The wheel is empty +	* - We've processed more than max_pacer_loops times +	* - We've wrapped more than 2 times +	* +	* This assert catches when the logic above has violated this design. +	* +	*/ +	KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) || +		 (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) || +		 ((hpts->p_prev_slot == hpts->p_cur_slot) && +		  !tcp_hpts_different_slots(cts, cts_last_run))), +		("H:%p Shouldn't be done! 
prev_slot:%u, cur_slot:%u, " +		 "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d", +		 hpts, hpts->p_prev_slot, hpts->p_cur_slot, +		 cts_last_run, cts, loop_cnt, wrap_loop_cnt)); + +	if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) { +		microuptime(&tv); +		cts = tcp_tv_to_usec(&tv); +		hpts->p_cur_slot = cts_to_wheel(cts);  		counter_u64_add(hpts_loops, 1); -		hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);  		goto again;  	} @@ -1526,16 +1514,16 @@ no_run:  }  void -tcp_set_hpts(struct tcpcb *tp) +__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)  {  	struct tcp_hpts_entry *hpts;  	int failed;  	INP_WLOCK_ASSERT(tptoinpcb(tp)); -	hpts = tcp_hpts_lock(tp); +	hpts = tcp_hpts_lock(pace, tp);  	if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { -		tp->t_hpts_cpu = hpts_cpuid(tp, &failed); +		tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);  		if (failed == 0)  			tp->t_flags2 |= TF2_HPTS_CPU_SET;  	} @@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp)  }  static struct tcp_hpts_entry * -tcp_choose_hpts_to_run(void) +tcp_choose_hpts_to_run(struct tcp_hptsi *pace)  { +	struct timeval tv;  	int i, oldest_idx, start, end;  	uint32_t cts, time_since_ran, calc; -	cts = tcp_get_usecs(NULL); +	microuptime(&tv); +	cts = tcp_tv_to_usec(&tv);  	time_since_ran = 0;  	/* Default is all one group */  	start = 0; -	end = tcp_pace.rp_num_hptss; +	end = pace->rp_num_hptss;  	/*  	 * If we have more than one L3 group figure out which one  	 * this CPU is in.  	 */ -	if (tcp_pace.grp_cnt > 1) { -		for (i = 0; i < tcp_pace.grp_cnt; i++) { -			if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { -				start = tcp_pace.grps[i]->cg_first; -				end = (tcp_pace.grps[i]->cg_last + 1); +	if (pace->grp_cnt > 1) { +		for (i = 0; i < pace->grp_cnt; i++) { +			if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) { +				start = pace->grps[i]->cg_first; +				end = (pace->grps[i]->cg_last + 1);  				break;  			}  		}  	}  	oldest_idx = -1;  	for (i = start; i < end; i++) { -		if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) -			calc = cts - tcp_pace.cts_last_ran[i]; +		if (TSTMP_GT(cts, pace->cts_last_ran[i])) +			calc = cts - pace->cts_last_ran[i];  		else  			calc = 0;  		if (calc > time_since_ran) { @@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void)  		}  	}  	if (oldest_idx >= 0) -		return(tcp_pace.rp_ent[oldest_idx]); +		return(pace->rp_ent[oldest_idx]);  	else -		return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); +		return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);  }  static void @@ -1588,9 +1578,9 @@ __tcp_run_hpts(void)  {  	struct epoch_tracker et;  	struct tcp_hpts_entry *hpts; -	int ticks_ran; +	int slots_ran; -	hpts = tcp_choose_hpts_to_run(); +	hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);  	if (hpts->p_hpts_active) {  		/* Already active */ @@ -1606,12 +1596,11 @@ __tcp_run_hpts(void)  	hpts->syscall_cnt++;  	counter_u64_add(hpts_direct_call, 1);  	hpts->p_hpts_active = 1; -	ticks_ran = tcp_hptsi(hpts, false); +	slots_ran = tcp_hptsi(hpts, false);  	/* We may want to adjust the sleep values here */  	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { -		if (ticks_ran > slots_indicate_less_sleep) { +		if (slots_ran > slots_indicate_less_sleep) {  			struct timeval tv; -			sbintime_t sb;  			hpts->p_mysleep.tv_usec /= 2;  			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) @@ -1635,13 +1624,8 @@ __tcp_run_hpts(void)  			 * the dynamic value and set the on_min_sleep  			 * flag so we will not be awoken.  			 
*/ -			sb = tvtosbt(tv); -			/* Store off to make visible the actual sleep time */ -			hpts->sleeping = tv.tv_usec; -			callout_reset_sbt_on(&hpts->co, sb, 0, -					     hpts_timeout_swi, hpts, hpts->p_cpu, -					     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); -		} else if (ticks_ran < slots_indicate_more_sleep) { +			(void)tcp_hpts_sleep(hpts, &tv); +		} else if (slots_ran < slots_indicate_more_sleep) {  			/* For the further sleep, don't reschedule  hpts */  			hpts->p_mysleep.tv_usec *= 2;  			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1658,17 +1642,22 @@ out_with_mtx:  static void  tcp_hpts_thread(void *ctx)  { +#ifdef TCP_HPTS_KTEST +	struct tcp_hptsi *pace; +#endif  	struct tcp_hpts_entry *hpts;  	struct epoch_tracker et;  	struct timeval tv; -	sbintime_t sb; -	int ticks_ran; +	int slots_ran;  	hpts = (struct tcp_hpts_entry *)ctx; +#ifdef TCP_HPTS_KTEST +	pace = hpts->p_hptsi; +#endif  	HPTS_LOCK(hpts);  	if (hpts->p_direct_wake) {  		/* Signaled by input or output with low occupancy count. */ -		callout_stop(&hpts->co); +		_callout_stop_safe(&hpts->co, 0);  		counter_u64_add(hpts_direct_awakening, 1);  	} else {  		/* Timed out, the normal case. */ @@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx)  	}  	hpts->sleeping = 0;  	hpts->p_hpts_active = 1; -	ticks_ran = tcp_hptsi(hpts, true); +	slots_ran = tcp_hptsi(hpts, true);  	tv.tv_sec = 0;  	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;  	if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx)  			 * Only adjust sleep time if we were  			 * called from the callout i.e. direct_wake == 0.  			 */ -			if (ticks_ran < slots_indicate_more_sleep) { +			if (slots_ran < slots_indicate_more_sleep) {  				hpts->p_mysleep.tv_usec *= 2;  				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)  					hpts->p_mysleep.tv_usec = dynamic_max_sleep; -			} else if (ticks_ran > slots_indicate_less_sleep) { +			} else if (slots_ran > slots_indicate_less_sleep) {  				hpts->p_mysleep.tv_usec /= 2;  				if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)  					hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx)  	hpts->p_hpts_active = 0;  back_to_sleep:  	hpts->p_direct_wake = 0; -	sb = tvtosbt(tv); -	/* Store off to make visible the actual sleep time */ -	hpts->sleeping = tv.tv_usec; -	callout_reset_sbt_on(&hpts->co, sb, 0, -			     hpts_timeout_swi, hpts, hpts->p_cpu, -			     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); +	(void)tcp_hpts_sleep(hpts, &tv);  	NET_EPOCH_EXIT(et);  	HPTS_UNLOCK(hpts);  } -#undef	timersub -  static int32_t  hpts_count_level(struct cpu_group *cg)  { @@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g  	}  } -static void -tcp_hpts_mod_load(void) +/* + * Initialize a tcp_hptsi structure. This performs the core initialization + * without starting threads. + */ +struct tcp_hptsi* +tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)  { +	struct tcp_hptsi *pace;  	struct cpu_group *cpu_top; -	int32_t error __diagused; -	int32_t i, j, bound = 0, created = 0; +	uint32_t i, j, cts; +	int32_t count;  	size_t sz, asz;  	struct timeval tv; -	sbintime_t sb;  	struct tcp_hpts_entry *hpts; -	struct pcpu *pc;  	char unit[16];  	uint32_t ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; -	int count, domain; +	KASSERT(funcs != NULL, ("funcs is NULL")); + +	/* Allocate the main structure */ +	pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO); +	if (pace == NULL) +		return (NULL); + +	memset(pace, 0, sizeof(*pace)); +	pace->funcs = funcs; + +	/* Setup CPU topology information */  #ifdef SMP  	cpu_top = smp_topo();  #else  	cpu_top = NULL;  #endif -	tcp_pace.rp_num_hptss = ncpus; -	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); -	hpts_loops = counter_u64_alloc(M_WAITOK); -	back_tosleep = counter_u64_alloc(M_WAITOK); -	combined_wheel_wrap = counter_u64_alloc(M_WAITOK); -	wheel_wrap = counter_u64_alloc(M_WAITOK); -	hpts_wake_timeout = counter_u64_alloc(M_WAITOK); -	hpts_direct_awakening = counter_u64_alloc(M_WAITOK); -	hpts_back_tosleep = counter_u64_alloc(M_WAITOK); -	hpts_direct_call = counter_u64_alloc(M_WAITOK); -	cpu_uses_flowid = counter_u64_alloc(M_WAITOK); -	cpu_uses_random = counter_u64_alloc(M_WAITOK); +	pace->rp_num_hptss = ncpus; -	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); -	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); -	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); -	tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); -	tcp_pace.grp_cnt = 0; +	/* Allocate hpts entry array */ +	sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *)); +	pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); + +	/* Allocate timestamp tracking array */ +	sz = (sizeof(uint32_t) * pace->rp_num_hptss); +	pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); + +	/* Setup CPU groups */  	if (cpu_top == NULL) { -		tcp_pace.grp_cnt = 1; +		pace->grp_cnt = 1;  	} else {  		/* Find out how many cache level 3 domains we have */  		count = 0; -		tcp_pace.grp_cnt = hpts_count_level(cpu_top); -		if (tcp_pace.grp_cnt == 0) { -			tcp_pace.grp_cnt = 1; +		pace->grp_cnt = hpts_count_level(cpu_top); +		if (pace->grp_cnt == 0) { +			pace->grp_cnt = 1;  		} -		sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); -		tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); +		sz = (pace->grp_cnt * sizeof(struct cpu_group *)); +		pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);  		/* Now populate the groups */ -		if (tcp_pace.grp_cnt == 1) { +		if (pace->grp_cnt == 1) {  			/*  			 * All we need is the top level all cpu's are in  			 * the same cache so when we use grp[0]->cg_mask @@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void)  			 * all cpu's in it. The level here is probably  			 * zero which is ok.  			 */ -			tcp_pace.grps[0] = cpu_top; +			pace->grps[0] = cpu_top;  		} else {  			/*  			 * Here we must find all the level three cache domains  			 * and setup our pointers to them.  			 */  			count = 0; -			hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); +			hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);  		}  	} + +	/* Cache the current time for initializing the hpts entries */ +	microuptime(&tv); +	cts = tcp_tv_to_usec(&tv); + +	/* Initialize each hpts entry */  	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; -	for (i = 0; i < tcp_pace.rp_num_hptss; i++) { -		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), +	for (i = 0; i < pace->rp_num_hptss; i++) { +		pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),  		    M_TCPHPTS, M_WAITOK | M_ZERO); -		tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); -		hpts = tcp_pace.rp_ent[i]; -		/* -		 * Init all the hpts structures that are not specifically -		 * zero'd by the allocations. 
Also lets attach them to the -		 * appropriate sysctl block as well. -		 */ -		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", -		    "hpts", MTX_DEF | MTX_DUPOK); -		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { -			TAILQ_INIT(&hpts->p_hptss[j].head); -			hpts->p_hptss[j].count = 0; -			hpts->p_hptss[j].gencnt = 0; -		} -		sysctl_ctx_init(&hpts->hpts_ctx); -		sprintf(unit, "%d", i); -		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, -		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), -		    OID_AUTO, -		    unit, -		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, -		    ""); -		SYSCTL_ADD_INT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "out_qcnt", CTLFLAG_RD, -		    &hpts->p_on_queue_cnt, 0, -		    "Count TCB's awaiting output processing"); -		SYSCTL_ADD_U16(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "active", CTLFLAG_RD, -		    &hpts->p_hpts_active, 0, -		    "Is the hpts active"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "curslot", CTLFLAG_RD, -		    &hpts->p_cur_slot, 0, -		    "What the current running pacers goal"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "runtick", CTLFLAG_RD, -		    &hpts->p_runningslot, 0, -		    "What the running pacers current slot is"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "curtick", CTLFLAG_RD, -		    &hpts->p_curtick, 0, -		    "What the running pacers last tick mapped to the wheel was"); -		SYSCTL_ADD_UINT(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "lastran", CTLFLAG_RD, -		    &tcp_pace.cts_last_ran[i], 0, -		    "The last usec tick that this hpts ran"); -		SYSCTL_ADD_LONG(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD, -		    &hpts->p_mysleep.tv_usec, -		    "What the running pacers is using for p_mysleep.tv_usec"); -		SYSCTL_ADD_U64(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "now_sleeping", CTLFLAG_RD, -		    &hpts->sleeping, 0, -		    "What the running pacers is actually sleeping for"); -		SYSCTL_ADD_U64(&hpts->hpts_ctx, -		    SYSCTL_CHILDREN(hpts->hpts_root), -		    OID_AUTO, "syscall_cnt", CTLFLAG_RD, -		    &hpts->syscall_cnt, 0, -		    "How many times we had syscalls on this hpts"); +		pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, +		    M_WAITOK | M_ZERO); +		hpts = pace->rp_ent[i]; +		/* Basic initialization */  		hpts->p_hpts_sleep_time = hpts_sleep_max; -		hpts->p_num = i; -		hpts->p_curtick = tcp_gethptstick(&tv); -		tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv); -		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); -		hpts->p_cpu = 0xffff; +		hpts->p_cpu = i; +		pace->cts_last_ran[i] = cts; +		hpts->p_cur_slot = cts_to_wheel(cts); +		hpts->p_prev_slot = hpts->p_cur_slot;  		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);  		callout_init(&hpts->co, 1); +		hpts->p_hptsi = pace; +		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", +		    MTX_DEF | MTX_DUPOK); +		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { +			TAILQ_INIT(&hpts->p_hptss[j].head); +		} + +		/* Setup SYSCTL if requested */ +		if (enable_sysctl) { +			sysctl_ctx_init(&hpts->hpts_ctx); +			sprintf(unit, "%d", i); +			hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, +			    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), +			    OID_AUTO, +			    unit, +			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +			    ""); +			SYSCTL_ADD_INT(&hpts->hpts_ctx, +			   
 SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "out_qcnt", CTLFLAG_RD, +			    &hpts->p_on_queue_cnt, 0, +			    "Count TCB's awaiting output processing"); +			SYSCTL_ADD_U16(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "active", CTLFLAG_RD, +			    &hpts->p_hpts_active, 0, +			    "Is the hpts active"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "curslot", CTLFLAG_RD, +			    &hpts->p_cur_slot, 0, +			    "What the current running pacers goal"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "runslot", CTLFLAG_RD, +			    &hpts->p_runningslot, 0, +			    "What the running pacers current slot is"); +			SYSCTL_ADD_UINT(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "lastran", CTLFLAG_RD, +			    &pace->cts_last_ran[i], 0, +			    "The last usec timestamp that this hpts ran"); +			SYSCTL_ADD_LONG(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "cur_min_sleep", CTLFLAG_RD, +			    &hpts->p_mysleep.tv_usec, +			    "What the running pacers is using for p_mysleep.tv_usec"); +			SYSCTL_ADD_U64(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "now_sleeping", CTLFLAG_RD, +			    &hpts->sleeping, 0, +			    "What the running pacers is actually sleeping for"); +			SYSCTL_ADD_U64(&hpts->hpts_ctx, +			    SYSCTL_CHILDREN(hpts->hpts_root), +			    OID_AUTO, "syscall_cnt", CTLFLAG_RD, +			    &hpts->syscall_cnt, 0, +			    "How many times we had syscalls on this hpts"); +		}  	} -	/* Don't try to bind to NUMA domains if we don't have any */ -	if (vm_ndomains == 1 && tcp_bind_threads == 2) -		tcp_bind_threads = 0; -	/* -	 * Now lets start ithreads to handle the hptss. -	 */ -	for (i = 0; i < tcp_pace.rp_num_hptss; i++) { -		hpts = tcp_pace.rp_ent[i]; -		hpts->p_cpu = i; +	return (pace); +} + +/* + * Create threads for a tcp_hptsi structure and starts timers for the current + * (minimum) sleep interval. 
+ */ +void +tcp_hptsi_start(struct tcp_hptsi *pace) +{ +	struct tcp_hpts_entry *hpts; +	struct pcpu *pc; +	struct timeval tv; +	uint32_t i, j; +	int count, domain; +	int error __diagused; + +	KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL")); + +	/* Start threads for each hpts entry */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; + +		KASSERT(hpts->ie_cookie == NULL, +		    ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));  		error = swi_add(&hpts->ie, "hpts",  		    tcp_hpts_thread, (void *)hpts,  		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);  		KASSERT(error == 0, -			("Can't add hpts:%p i:%d err:%d", -			 hpts, i, error)); -		created++; -		hpts->p_mysleep.tv_sec = 0; -		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; +		    ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); +  		if (tcp_bind_threads == 1) { -			if (intr_event_bind(hpts->ie, i) == 0) -				bound++; +			(void)intr_event_bind(hpts->ie, i);  		} else if (tcp_bind_threads == 2) {  			/* Find the group for this CPU (i) and bind into it */ -			for (j = 0; j < tcp_pace.grp_cnt; j++) { -				if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { +			for (j = 0; j < pace->grp_cnt; j++) { +				if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {  					if (intr_event_bind_ithread_cpuset(hpts->ie, -						&tcp_pace.grps[j]->cg_mask) == 0) { -						bound++; +					    &pace->grps[j]->cg_mask) == 0) {  						pc = pcpu_find(i);  						domain = pc->pc_domain; -						count = hpts_domains[domain].count; -						hpts_domains[domain].cpu[count] = i; -						hpts_domains[domain].count++; +						count = pace->domains[domain].count; +						pace->domains[domain].cpu[count] = i; +						pace->domains[domain].count++;  						break;  					}  				}  			}  		} + +		hpts->p_mysleep.tv_sec = 0; +		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;  		tv.tv_sec = 0;  		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; -		hpts->sleeping = tv.tv_usec; -		sb = tvtosbt(tv); -		callout_reset_sbt_on(&hpts->co, sb, 0, -				     hpts_timeout_swi, hpts, hpts->p_cpu, -				     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); -	} -	/* -	 * If we somehow have an empty domain, fall back to choosing -	 * among all htps threads. -	 */ -	for (i = 0; i < vm_ndomains; i++) { -		if (hpts_domains[i].count == 0) { -			tcp_bind_threads = 0; -			break; -		} +		(void)tcp_hpts_sleep(hpts, &tv);  	} -	tcp_hpts_softclock = __tcp_run_hpts; -	tcp_lro_hpts_init(); -	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", -	    created, bound, -	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");  } -static void -tcp_hpts_mod_unload(void) +/* + * Stop all callouts/threads for a tcp_hptsi structure. 
+ */ +void +tcp_hptsi_stop(struct tcp_hptsi *pace)  { +	struct tcp_hpts_entry *hpts;  	int rv __diagused; +	uint32_t i; -	tcp_lro_hpts_uninit(); -	atomic_store_ptr(&tcp_hpts_softclock, NULL); +	KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL")); -	for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { -		struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; +		KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i)); +		KASSERT(hpts->ie_cookie != NULL, +		    ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i)); -		rv = callout_drain(&hpts->co); +		rv = _callout_stop_safe(&hpts->co, CS_DRAIN);  		MPASS(rv != 0);  		rv = swi_remove(hpts->ie_cookie);  		MPASS(rv == 0); +		hpts->ie_cookie = NULL; +	} +} -		rv = sysctl_ctx_free(&hpts->hpts_ctx); -		MPASS(rv == 0); +/* + * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create. + */ +void +tcp_hptsi_destroy(struct tcp_hptsi *pace) +{ +	struct tcp_hpts_entry *hpts; +	uint32_t i; + +	KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL")); +	KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL")); + +	/* Cleanup each hpts entry */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		hpts = pace->rp_ent[i]; +		if (hpts != NULL) { +			/* Cleanup SYSCTL if it was initialized */ +			if (hpts->hpts_root != NULL) { +				sysctl_ctx_free(&hpts->hpts_ctx); +			} -		mtx_destroy(&hpts->p_mtx); -		free(hpts->p_hptss, M_TCPHPTS); -		free(hpts, M_TCPHPTS); +			mtx_destroy(&hpts->p_mtx); +			free(hpts->p_hptss, M_TCPHPTS); +			free(hpts, M_TCPHPTS); +		}  	} -	free(tcp_pace.rp_ent, M_TCPHPTS); -	free(tcp_pace.cts_last_ran, M_TCPHPTS); +	/* Cleanup main arrays */ +	free(pace->rp_ent, M_TCPHPTS); +	free(pace->cts_last_ran, M_TCPHPTS);  #ifdef SMP -	free(tcp_pace.grps, M_TCPHPTS); +	free(pace->grps, M_TCPHPTS);  #endif +	/* Free the main structure */ +	free(pace, M_TCPHPTS); +} + +static int +tcp_hpts_mod_load(void) +{ +	int i; + +	/* Don't try to bind to NUMA domains if we don't have any */ +	if (vm_ndomains == 1 && tcp_bind_threads == 2) +		tcp_bind_threads = 0; + +	/* Create the tcp_hptsi structure */ +	tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true); +	if (tcp_hptsi_pace == NULL) +		return (ENOMEM); + +	/* Initialize global counters */ +	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); +	hpts_loops = counter_u64_alloc(M_WAITOK); +	back_tosleep = counter_u64_alloc(M_WAITOK); +	combined_wheel_wrap = counter_u64_alloc(M_WAITOK); +	wheel_wrap = counter_u64_alloc(M_WAITOK); +	hpts_wake_timeout = counter_u64_alloc(M_WAITOK); +	hpts_direct_awakening = counter_u64_alloc(M_WAITOK); +	hpts_back_tosleep = counter_u64_alloc(M_WAITOK); +	hpts_direct_call = counter_u64_alloc(M_WAITOK); +	cpu_uses_flowid = counter_u64_alloc(M_WAITOK); +	cpu_uses_random = counter_u64_alloc(M_WAITOK); + +	/* Start the threads */ +	tcp_hptsi_start(tcp_hptsi_pace); + +	/* Enable the global HPTS softclock function */ +	tcp_hpts_softclock = __tcp_run_hpts; + +	/* Initialize LRO HPTS */ +	tcp_lro_hpts_init(); + +	/* +	 * If we somehow have an empty domain, fall back to choosing among all +	 * HPTS threads. +	 */ +	for (i = 0; i < vm_ndomains; i++) { +		if (tcp_hptsi_pace->domains[i].count == 0) { +			tcp_bind_threads = 0; +			break; +		} +	} + +	printf("TCP HPTS started %u (%s) swi interrupt threads\n", +		tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ? +		 "(unbounded)" : +		 (tcp_bind_threads == 1 ? 
"per-cpu" : "per-NUMA-domain")); + +	return (0); +} + +static void +tcp_hpts_mod_unload(void) +{ +	tcp_lro_hpts_uninit(); + +	/* Disable the global HPTS softclock function */ +	atomic_store_ptr(&tcp_hpts_softclock, NULL); + +	tcp_hptsi_stop(tcp_hptsi_pace); +	tcp_hptsi_destroy(tcp_hptsi_pace); +	tcp_hptsi_pace = NULL; + +	/* Cleanup global counters */  	counter_u64_free(hpts_hopelessly_behind);  	counter_u64_free(hpts_loops);  	counter_u64_free(back_tosleep); @@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void)  }  static int -tcp_hpts_modevent(module_t mod, int what, void *arg) +tcp_hpts_mod_event(module_t mod, int what, void *arg)  { -  	switch (what) {  	case MOD_LOAD: -		tcp_hpts_mod_load(); -		return (0); +		return (tcp_hpts_mod_load());  	case MOD_QUIESCE:  		/*  		 * Since we are a dependency of TCP stack modules, they should @@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)  static moduledata_t tcp_hpts_module = {  	.name = "tcphpts", -	.evhand = tcp_hpts_modevent, +	.evhand = tcp_hpts_mod_event,  };  DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index 6172baf2a062..6b05f9701ac2 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -28,19 +28,11 @@  /* Number of useconds represented by an hpts slot */  #define HPTS_USECS_PER_SLOT 10 -#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) -#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)  #define HPTS_USEC_IN_SEC 1000000  #define HPTS_MSEC_IN_SEC 1000  #define HPTS_USEC_IN_MSEC 1000  static inline uint32_t -tcp_tv_to_hpts_slot(const struct timeval *sv) -{ -	return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT)); -} - -static inline uint32_t  tcp_tv_to_usec(const struct timeval *sv)  {  	return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -66,7 +58,7 @@ struct hpts_diag {  	uint32_t p_runningslot;		/* bbr->inflight */  	uint32_t slot_req;		/* bbr->flex3 x */  	uint32_t inp_hptsslot;		/* bbr->flex4 x */ -	uint32_t slot_remaining;	/* bbr->flex5 x */ +	uint32_t time_remaining;	/* bbr->flex5 x */  	uint32_t have_slept;		/* bbr->epoch x */  	uint32_t hpts_sleep_time;	/* bbr->applimited x */  	uint32_t yet_to_sleep;		/* bbr->lt_epoch x */ @@ -75,8 +67,6 @@ struct hpts_diag {  	uint32_t maxslots;		/* bbr->delRate x */  	uint32_t wheel_cts;		/* bbr->rttProp x */  	int32_t co_ret; 		/* bbr->pkts_out x */ -	uint32_t p_curtick;		/* upper bbr->cur_del_rate */ -	uint32_t p_lasttick;		/* lower bbr->cur_del_rate */  	uint8_t p_on_min_sleep; 	/* bbr->flex8 x */  }; @@ -92,13 +82,18 @@ struct hpts_diag {  #ifdef _KERNEL +extern struct tcp_hptsi *tcp_hptsi_pace; +  /*   * The following are the definitions for the kernel HPTS interface for managing   * the HPTS ring and the TCBs on it.  */ -void tcp_hpts_init(struct tcpcb *); -void tcp_hpts_remove(struct tcpcb *); +void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp) + +void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp)  static inline bool  tcp_in_hpts(struct tcpcb *tp) @@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp)   * that INP_WLOCK() or from destroying your TCB where again   * you should already have the INP_WLOCK().   
*/ -uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, -    struct hpts_diag *diag); -#define	tcp_hpts_insert(inp, slot)	\ -	tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) +void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, +	struct hpts_diag *diag); +#define	tcp_hpts_insert(tp, usecs, diag)	\ +	__tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag)) -void tcp_set_hpts(struct tcpcb *tp); +void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp); +#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp)  extern int32_t tcp_min_hptsi_time; @@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void)  	return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT);  } -static inline uint32_t -tcp_gethptstick(struct timeval *sv) -{ -	struct timeval tv; - -	if (sv == NULL) -		sv = &tv; -	microuptime(sv); -	return (tcp_tv_to_hpts_slot(sv)); -} -  static inline uint64_t  tcp_get_u64_usecs(struct timeval *tv)  { @@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv)  	return (tcp_tv_to_usec(tv));  } -/* - * LRO HPTS initialization and uninitialization, only for internal use by the - * HPTS code. - */ -void tcp_lro_hpts_init(void); -void tcp_lro_hpts_uninit(void); -  #endif /* _KERNEL */  #endif /* __tcp_hpts_h__ */ diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h new file mode 100644 index 000000000000..8b33e03a6981 --- /dev/null +++ b/sys/netinet/tcp_hpts_internal.h @@ -0,0 +1,184 @@ +/*- + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __tcp_hpts_internal_h__ +#define __tcp_hpts_internal_h__ + +/* + * TCP High Precision Timer System (HPTS) - Internal Definitions + * + * This header contains internal structures, constants, and interfaces that are + * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of + * the HPTS subsystem. + */ + +#if defined(_KERNEL) + +/* + * The hpts uses a 102400 wheel. The wheel + * defines the time in 10 usec increments (102400 x 10). + * This gives a range of 10usec - 1024ms to place + * an entry within. 
If the user requests more than
+ * 1.024 second, a remainder is attached and the hpts
+ * when seeing the remainder will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
+/*
+ * The hpts uses a 102400 wheel. The wheel
+ * defines the time in 10 usec increments (102400 x 10).
+ * This gives a range of 10usec - 1024ms to place
+ * an entry within. If the user requests more than
+ * 1.024 second, a remainder is attached and the hpts
+ * when seeing the remainder will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+/* Convert microseconds to HPTS slots */
+#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
+
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
+extern int tcp_bind_threads; 		/* Thread binding configuration
+					 * (0=none, 1=cpu, 2=numa) */
+
+/*
+ * Abstraction layer controlling time, interrupts and callouts.
+ */
+struct tcp_hptsi_funcs {
+	void (*microuptime)(struct timeval *tv);
+	int (*swi_add)(struct intr_event **eventp, const char *name,
+		driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+		void **cookiep);
+	int (*swi_remove)(void *cookie);
+	void (*swi_sched)(void *cookie, int flags);
+	int (*intr_event_bind)(struct intr_event *ie, int cpu);
+	int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie,
+		struct _cpuset *mask);
+	void (*callout_init)(struct callout *c, int mpsafe);
+	int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt,
+		sbintime_t precision, void (*func)(void *), void *arg, int cpu,
+		int flags);
+	int (*_callout_stop_safe)(struct callout *c, int flags);
+};
+
+/* Default function table for system operation */
+extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs;
+
+/* Each hpts has its own p_mtx which is used for locking */
+#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
+#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx)
+#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
+
+struct tcp_hpts_entry {
+	/* Cache line 0x00 */
+	struct mtx p_mtx;		/* Mutex for hpts */
+	struct timeval p_mysleep;	/* Our min sleep time */
+	uint64_t syscall_cnt;
+	uint64_t sleeping;		/* What the actual sleep was (if sleeping) */
+	uint16_t p_hpts_active; 	/* Flag that says hpts is awake  */
+	uint8_t p_wheel_complete; 	/* have we completed the wheel arc walk? */
+	uint32_t p_runningslot; 	/* Current slot we are at if we are running */
+	uint32_t p_prev_slot;		/* Previous slot we were on */
+	uint32_t p_cur_slot;		/* Current slot in wheel hpts is draining */
+	uint32_t p_nxt_slot;		/* The next slot outside the current range
+					 * of slots that the hpts is running on. 
*/ +	int32_t p_on_queue_cnt;		/* Count on queue in this hpts */ +	uint8_t p_direct_wake :1, 	/* boolean */ +		p_on_min_sleep:1, 	/* boolean */ +		p_hpts_wake_scheduled:1,/* boolean */ +		hit_callout_thresh:1, +		p_avail:4; +	uint8_t p_fill[3];		/* Fill to 32 bits */ +	/* Cache line 0x40 */ +	struct hptsh { +		TAILQ_HEAD(, tcpcb)	head; +		uint32_t		count; +		uint32_t		gencnt; +	} *p_hptss;			/* Hptsi wheel */ +	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max +					 * of 255ms */ +	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */ +	uint32_t saved_curslot;		/* for logging */ +	uint32_t saved_prev_slot;	/* for logging */ +	uint32_t p_delayed_by;		/* How much were we delayed by */ +	/* Cache line 0x80 */ +	struct sysctl_ctx_list hpts_ctx; +	struct sysctl_oid *hpts_root; +	struct intr_event *ie; +	void *ie_cookie; +	uint16_t p_cpu;			/* The hpts CPU */ +	struct tcp_hptsi *p_hptsi;	/* Back pointer to parent hptsi structure */ +	/* There is extra space in here */ +	/* Cache line 0x100 */ +	struct callout co __aligned(CACHE_LINE_SIZE); +}               __aligned(CACHE_LINE_SIZE); + +struct tcp_hptsi { +	struct cpu_group **grps; +	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */ +	uint32_t *cts_last_ran; +	uint32_t grp_cnt; +	uint32_t rp_num_hptss;		/* Number of hpts threads */ +	struct hpts_domain_info { +		int count; +		int cpu[MAXCPU]; +	} domains[MAXMEMDOM];		/* Per-NUMA domain CPU assignments */ +	const struct tcp_hptsi_funcs *funcs;	/* Function table for testability */ +}; + +/* + * Core tcp_hptsi structure manipulation functions. + */ +struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, +	bool enable_sysctl); +void tcp_hptsi_destroy(struct tcp_hptsi *pace); +void tcp_hptsi_start(struct tcp_hptsi *pace); +void tcp_hptsi_stop(struct tcp_hptsi *pace); +uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace); +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); + +void tcp_hpts_wake(struct tcp_hpts_entry *hpts); + +/* + * LRO HPTS initialization and uninitialization, only for internal use by the + * HPTS code. + */ +void tcp_lro_hpts_init(void); +void tcp_lro_hpts_uninit(void); + +#endif /* defined(_KERNEL) */ +#endif /* __tcp_hpts_internal_h__ */ diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c new file mode 100644 index 000000000000..c5dc9cb5b03b --- /dev/null +++ b/sys/netinet/tcp_hpts_test.c @@ -0,0 +1,1682 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <tests/ktest.h> +#include <sys/cdefs.h> +#include "opt_inet.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> +#include <dev/tcp_log/tcp_log_dev.h> +#include <netinet/tcp_log_buf.h> + +#undef tcp_hpts_init +#undef tcp_hpts_remove +#undef tcp_hpts_insert +#undef tcp_set_hpts + +/* Custom definitions that take the tcp_hptsi */ +#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp)) +#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp)) +#define	tcp_hpts_insert(pace, tp, usecs, diag)	\ +	__tcp_hpts_insert((pace), (tp), (usecs), (diag)) +#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp)) + +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test"); + +static int test_exit_on_failure = true; +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, +    "TCP HPTS test controls"); +SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW, +    &test_exit_on_failure, 0, +    "Exit HPTS test immediately on first failure (1) or continue running all tests (0)"); + +#define KTEST_VERIFY(x) do { \ +	if (!(x)) { \ +		KTEST_ERR(ctx, "FAIL: %s", #x); \ +		if (test_exit_on_failure) \ +			return (EINVAL); \ +	} else { \ +		KTEST_LOG(ctx, "PASS: %s", #x); \ +	} \ +} while (0) + +#define KTEST_EQUAL(x, y) do { \ +	if ((x) != (y)) { \ +		KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \ +		if (test_exit_on_failure) \ +			return (EINVAL); \ +	} else { \ +		KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \ +	} \ +} while (0) + +#define KTEST_NEQUAL(x, y) do { \ +	if ((x) == (y)) { \ +		KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \ +		if (test_exit_on_failure) \ +			return (EINVAL); \ +	} else { \ +		KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \ +	} \ +} while (0) + +#define KTEST_GREATER_THAN(x, y) do { \ +	if ((x) <= (y)) { \ +		KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \ +		if (test_exit_on_failure) \ +			return (EINVAL); \ +	} else { \ +		KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \ +	} \ +} while (0) + +#define KTEST_VERIFY_RET(x, y) do { \ +	if (!(x)) { \ +		KTEST_ERR(ctx, "FAIL: %s", #x); \ +		if (test_exit_on_failure) \ +			return (y); \ +	} else { \ +		KTEST_LOG(ctx, "PASS: %s", #x); \ +	} \ +} while (0) + +#ifdef TCP_HPTS_KTEST + +static void +dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts) +{ +	KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts); +	KTEST_LOG(ctx, "  p_cur_slot: %u", hpts->p_cur_slot); +	KTEST_LOG(ctx, "  p_prev_slot: %u", hpts->p_prev_slot); +	KTEST_LOG(ctx, "  p_nxt_slot: %u", hpts->p_nxt_slot); +	
KTEST_LOG(ctx, "  p_runningslot: %u", hpts->p_runningslot); +	KTEST_LOG(ctx, "  p_on_queue_cnt: %d", hpts->p_on_queue_cnt); +	KTEST_LOG(ctx, "  p_hpts_active: %u", hpts->p_hpts_active); +	KTEST_LOG(ctx, "  p_wheel_complete: %u", hpts->p_wheel_complete); +	KTEST_LOG(ctx, "  p_direct_wake: %u", hpts->p_direct_wake); +	KTEST_LOG(ctx, "  p_on_min_sleep: %u", hpts->p_on_min_sleep); +	KTEST_LOG(ctx, "  p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled); +	KTEST_LOG(ctx, "  hit_callout_thresh: %u", hpts->hit_callout_thresh); +	KTEST_LOG(ctx, "  p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time); +	KTEST_LOG(ctx, "  p_delayed_by: %u", hpts->p_delayed_by); +	KTEST_LOG(ctx, "  overidden_sleep: %u", hpts->overidden_sleep); +	KTEST_LOG(ctx, "  saved_curslot: %u", hpts->saved_curslot); +	KTEST_LOG(ctx, "  saved_prev_slot: %u", hpts->saved_prev_slot); +	KTEST_LOG(ctx, "  syscall_cnt: %lu", hpts->syscall_cnt); +	KTEST_LOG(ctx, "  sleeping: %lu", hpts->sleeping); +	KTEST_LOG(ctx, "  p_cpu: %u", hpts->p_cpu); +	KTEST_LOG(ctx, "  ie_cookie: %p", hpts->ie_cookie); +	KTEST_LOG(ctx, "  p_hptsi: %p", hpts->p_hptsi); +	KTEST_LOG(ctx, "  p_mysleep: %ld.%06ld", hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec); +} + +static void +dump_tcpcb(struct tcpcb *tp) +{ +	struct ktest_test_context *ctx = tp->t_fb_ptr; +	struct inpcb *inp = &tp->t_inpcb; + +	KTEST_LOG(ctx, "tcp_control_block(%p)", tp); + +	/* HPTS-specific fields */ +	KTEST_LOG(ctx, "  t_in_hpts: %d", tp->t_in_hpts); +	KTEST_LOG(ctx, "  t_hpts_cpu: %u", tp->t_hpts_cpu); +	KTEST_LOG(ctx, "  t_hpts_slot: %d", tp->t_hpts_slot); +	KTEST_LOG(ctx, "  t_hpts_gencnt: %u", tp->t_hpts_gencnt); +	KTEST_LOG(ctx, "  t_hpts_request: %u", tp->t_hpts_request); + +	/* LRO CPU field */ +	KTEST_LOG(ctx, "  t_lro_cpu: %u", tp->t_lro_cpu); + +	/* TCP flags that affect HPTS */ +	KTEST_LOG(ctx, "  t_flags2: 0x%x", tp->t_flags2); +	KTEST_LOG(ctx, "    TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO"); +	KTEST_LOG(ctx, "    TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO"); +	KTEST_LOG(ctx, "    TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO"); + +	/* Input PCB fields that HPTS uses */ +	KTEST_LOG(ctx, "  inp_flags: 0x%x", inp->inp_flags); +	KTEST_LOG(ctx, "    INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO"); +	KTEST_LOG(ctx, "  inp_flowid: 0x%x", inp->inp_flowid); +	KTEST_LOG(ctx, "  inp_flowtype: %u", inp->inp_flowtype); +	KTEST_LOG(ctx, "  inp_numa_domain: %d", inp->inp_numa_domain); +} + +/* Enum for call counting indices */ +enum test_call_counts { +	CCNT_MICROUPTIME = 0, +	CCNT_SWI_ADD, +	CCNT_SWI_REMOVE, +	CCNT_SWI_SCHED, +	CCNT_INTR_EVENT_BIND, +	CCNT_INTR_EVENT_BIND_CPUSET, +	CCNT_CALLOUT_INIT, +	CCNT_CALLOUT_RESET_SBT_ON, +	CCNT_CALLOUT_STOP_SAFE, +	CCNT_TCP_OUTPUT, +	CCNT_TCP_TFB_DO_QUEUED_SEGMENTS, +	CCNT_MAX +}; + +static uint32_t call_counts[CCNT_MAX]; + +static uint64_t test_time_usec = 0; + +/* + * Reset all test global variables to a clean state. 
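+ * Clears the per-hook call counters and rewinds the simulated clock
+ * (test_time_usec) back to zero before each test case runs.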
+ */ +static void +test_hpts_init(void) +{ +	memset(call_counts, 0, sizeof(call_counts)); +	test_time_usec = 0; +} + +static void +test_microuptime(struct timeval *tv) +{ +	call_counts[CCNT_MICROUPTIME]++; +	tv->tv_sec = test_time_usec / 1000000; +	tv->tv_usec = test_time_usec % 1000000; +} + +static int +test_swi_add(struct intr_event **eventp, const char *name, +    driver_intr_t handler, void *arg, int pri, enum intr_type flags, +    void **cookiep) +{ +	call_counts[CCNT_SWI_ADD]++; +	/* Simulate successful SWI creation */ +	*eventp = (struct intr_event *)0xfeedface; /* Mock event */ +	*cookiep = (void *)0xdeadbeef; /* Mock cookie */ +	return (0); +} + +static int +test_swi_remove(void *cookie) +{ +	call_counts[CCNT_SWI_REMOVE]++; +	/* Simulate successful removal */ +	return (0); +} + +static void +test_swi_sched(void *cookie, int flags) +{ +	call_counts[CCNT_SWI_SCHED]++; +	/* Simulate successful SWI scheduling */ +} + +static int +test_intr_event_bind(struct intr_event *ie, int cpu) +{ +	call_counts[CCNT_INTR_EVENT_BIND]++; +	/* Simulate successful binding */ +	return (0); +} + +static int +test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask) +{ +	call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++; +	/* Simulate successful cpuset binding */ +	return (0); +} + +static void +test_callout_init(struct callout *c, int mpsafe) +{ +	call_counts[CCNT_CALLOUT_INIT]++; +	memset(c, 0, sizeof(*c)); +} + +static int +test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, +    void (*func)(void *), void *arg, int cpu, int flags) +{ +	call_counts[CCNT_CALLOUT_RESET_SBT_ON]++; +	/* Return 1 to simulate successful timer scheduling */ +	return (1); +} + +static int +test_callout_stop_safe(struct callout *c, int flags) +{ +	call_counts[CCNT_CALLOUT_STOP_SAFE]++; +	/* Return 1 to simulate successful timer stopping */ +	return (1); +} + +static const struct tcp_hptsi_funcs test_funcs = { +	.microuptime = test_microuptime, +	.swi_add = test_swi_add, +	.swi_remove = test_swi_remove, +	.swi_sched = test_swi_sched, +	.intr_event_bind = test_intr_event_bind, +	.intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset, +	.callout_init = test_callout_init, +	.callout_reset_sbt_on = test_callout_reset_sbt_on, +	._callout_stop_safe = test_callout_stop_safe, +}; + +#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare +#define TP_LOG_TEST(tp) tp->t_log_state_set + +static int +test_tcp_output(struct tcpcb *tp) +{ +	struct ktest_test_context *ctx = tp->t_fb_ptr; +	struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; +	struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + +	call_counts[CCNT_TCP_OUTPUT]++; +	if (TP_LOG_TEST(tp)) { +		KTEST_LOG(ctx, "=> tcp_output(%p)", tp); +		dump_tcpcb(tp); +		dump_hpts_entry(ctx, hpts); +	} + +	if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { +		if (TP_LOG_TEST(tp)) +			KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); +		tcp_hpts_remove(pace, tp); +	} + +	if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { +		INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */ +		return (-1); /* Simulate tcp_output error */ +	} + +	return (0); +} + +static int +test_tfb_do_queued_segments(struct tcpcb *tp, int flag) +{ +	struct ktest_test_context *ctx = tp->t_fb_ptr; +	struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; +	struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + +	call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++; +	KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag); +	dump_tcpcb(tp); +	
dump_hpts_entry(ctx, hpts); + +	if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { +		if (TP_LOG_TEST(tp)) +			KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); +		tcp_hpts_remove(pace, tp); +	} + +	if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { +		INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */ +		return (-1); /* Simulate do_queued_segments error */ +	} + +	return (0); +} + +static struct tcp_function_block test_tcp_fb = { +	.tfb_tcp_block_name = "hpts_test_tcp", +	.tfb_tcp_output = test_tcp_output, +	.tfb_do_queued_segments = test_tfb_do_queued_segments, +}; + +/* + * Create a minimally initialized tcpcb that can be safely inserted into HPTS. + * This function allocates and initializes all the fields that HPTS code + * reads or writes. + */ +static struct tcpcb * +test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace) +{ +	struct tcpcb *tp; + +	tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO); +	if (tp) { +		rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp", +			RW_RECURSE | RW_DUPOK); +		refcount_init(&tp->t_inpcb.inp_refcount, 1); +		tp->t_inpcb.inp_pcbinfo = &V_tcbinfo; +		tp->t_fb = &test_tcp_fb; +		tp->t_hpts_cpu = HPTS_CPU_NONE; +		STAILQ_INIT(&tp->t_inqueue); +		tcp_hpts_init(pace, tp); + +		/* Stuff some pointers in the tcb for test purposes. */ +		tp->t_fb_ptr = ctx; +		tp->t_tfo_pending = (unsigned int*)pace; +	} + +	return (tp); +} + +/* + * Free a test tcpcb created by test_hpts_create_tcpcb() + */ +static void +test_hpts_free_tcpcb(struct tcpcb *tp) +{ +	if (tp == NULL) +		return; + +	INP_LOCK_DESTROY(&tp->t_inpcb); +	free(tp, M_TCPHPTS); +} + +/* + * *********************************************** + * * KTEST functions for testing the HPTS module * + * *********************************************** + */ + +/* + * Validates that the HPTS module is properly loaded and initialized by checking + * that the minimum HPTS time is configured. + */ +KTEST_FUNC(module_load) +{ +	test_hpts_init(); +	KTEST_NEQUAL(tcp_min_hptsi_time, 0); +	KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2); +	KTEST_NEQUAL(tcp_hptsi_pace, NULL); +	return (0); +} + +/* + * Validates the creation and destruction of tcp_hptsi structures, ensuring + * proper initialization of internal fields and clean destruction. + */ +KTEST_FUNC(hptsi_create_destroy) +{ +	struct tcp_hptsi *pace; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	KTEST_NEQUAL(pace->rp_ent, NULL); +	KTEST_NEQUAL(pace->cts_last_ran, NULL); +	KTEST_VERIFY(pace->rp_num_hptss > 0); +	KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */ +	KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */ +	KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */ + +	/* Verify individual HPTS entries are properly initialized */ +	for (uint32_t i = 0; i < pace->rp_num_hptss; i++) { +		KTEST_NEQUAL(pace->rp_ent[i], NULL); +		KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i); +		KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace); +		KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); +	} + +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates that tcp_hptsi structures can be started and stopped properly, + * including verification that threads are created during start and cleaned up + * during stop operations. 
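+ *
+ * After tcp_hptsi_start() an entry has its SWI handler installed (a non-NULL
+ * ie_cookie); tcp_hptsi_stop() removes the handler again before destroy.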
+ */ +KTEST_FUNC(hptsi_start_stop) +{ +	struct tcp_hptsi *pace; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); + +	tcp_hptsi_start(pace); + +	/* Verify that entries have threads started */ +	struct tcp_hpts_entry *hpts = pace->rp_ent[0]; +	KTEST_NEQUAL(hpts->ie_cookie, NULL);  /* Should have SWI handler */ +	KTEST_EQUAL(hpts->p_hptsi, pace);     /* Should point to our pace */ + +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates that multiple tcp_hptsi instances can coexist independently, with + * different configurations and CPU assignments without interfering with each + * other. + */ +KTEST_FUNC(hptsi_independence) +{ +	struct tcp_hptsi *pace1, *pace2; +	uint16_t cpu1, cpu2; + +	test_hpts_init(); + +	pace1 = tcp_hptsi_create(&test_funcs, false); +	pace2 = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace1, NULL); +	KTEST_NEQUAL(pace2, NULL); +	KTEST_NEQUAL(pace2->rp_ent, NULL); + +	cpu1 = tcp_hptsi_random_cpu(pace1); +	cpu2 = tcp_hptsi_random_cpu(pace2); +	KTEST_VERIFY(cpu1 < pace1->rp_num_hptss); +	KTEST_VERIFY(cpu2 < pace2->rp_num_hptss); + +	/* Verify both instances have independent entry arrays */ +	KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent); +	/* Verify they may have different CPU counts but both reasonable */ +	KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU); +	KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU); + +	tcp_hptsi_destroy(pace1); +	tcp_hptsi_destroy(pace2); + +	return (0); +} + +/* + * Validates that custom function injection works correctly, ensuring that + * test-specific implementations of microuptime and others are properly + * called by the HPTS system. + */ +KTEST_FUNC(function_injection) +{ +	struct tcp_hptsi *pace; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	KTEST_EQUAL(pace->funcs, &test_funcs); +	KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0); +	KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0); + +	tcp_hptsi_start(pace); +	KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0); +	KTEST_VERIFY(tcp_bind_threads == 0 || +	    call_counts[CCNT_INTR_EVENT_BIND] > 0 || +	    call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0); +	KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0); + +	tcp_hptsi_stop(pace); +	KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0); +	KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0); + +	tcp_hptsi_destroy(pace); + +	/* Verify we have a reasonable balance of create/destroy calls */ +	KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]); +	KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]); + +	return (0); +} + +/* + * Validates that a tcpcb can be properly initialized for HPTS compatibility, + * ensuring all required fields are set correctly and function pointers are + * valid for safe HPTS operations. 
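+ *
+ * A freshly created test tcpcb is expected to sit off the wheel (IHPTS_NONE)
+ * with no HPTS flags, slot, or deferred request carried over.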
+ */ +KTEST_FUNC(tcpcb_initialization) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Verify the tcpcb is properly initialized for HPTS */ +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); +	KTEST_NEQUAL(tp->t_fb, NULL); +	KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL); +	KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); +	KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0); + +	/* Verify that HPTS-specific fields are initialized */ +	KTEST_EQUAL(tp->t_hpts_gencnt, 0); +	KTEST_EQUAL(tp->t_hpts_slot, 0); +	KTEST_EQUAL(tp->t_hpts_request, 0); +	KTEST_EQUAL(tp->t_lro_cpu, 0); +	KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss); +	KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1); +	KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED)); + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates that tcpcb structures can be successfully inserted into and removed + * from the HPTS wheel, with proper state tracking and slot assignment during + * the process. + */ +KTEST_FUNC(tcpcb_insertion) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp; +	struct tcp_hpts_entry *hpts; +	uint32_t timeout_usecs = 10; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); +	KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0); + +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; +	KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); +	tcp_hpts_insert(pace, tp, timeout_usecs, NULL); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	INP_WUNLOCK(&tp->t_inpcb); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); +	KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); +	KTEST_VERIFY(tcp_in_hpts(tp)); +	KTEST_VERIFY(tp->t_hpts_slot >= 0); +	KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + +	hpts = pace->rp_ent[tp->t_hpts_cpu]; +	KTEST_EQUAL(hpts->p_on_queue_cnt, 1); +	KTEST_EQUAL(tp->t_hpts_request, 0); +	KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs)); +	//KTEST_EQUAL(tp->t_hpts_gencnt, 1); + +	INP_WLOCK(&tp->t_inpcb); +	tcp_hpts_remove(pace, tp); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); +	INP_WUNLOCK(&tp->t_inpcb); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); +	KTEST_VERIFY(!tcp_in_hpts(tp)); + +	KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates the core HPTS timer functionality by verifying that scheduled + * tcpcb entries trigger tcp_output calls at appropriate times, simulating + * real-world timer-driven TCP processing. 
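+ *
+ * Example of the slot arithmetic exercised here: with HPTS_USECS_PER_SLOT of
+ * 10, a 500 usec request maps to slot 50, so advancing the fake clock by only
+ * 499 usec (slot 49) must not fire tcp_output(), while one more usec must.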
+ */ +KTEST_FUNC(timer_functionality) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcp_hpts_entry *hpts; +	struct tcpcb *tp; +	int32_t slots_ran; +	uint32_t i; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	for (i = 0; i < pace->rp_num_hptss; i++) +		dump_hpts_entry(ctx, pace->rp_ent[i]); + +	/* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */ +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); +	dump_tcpcb(tp); +	TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */ + +	KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp); +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */ +	tcp_hpts_insert(pace, tp, 500, NULL); +	INP_WUNLOCK(&tp->t_inpcb); + +	dump_tcpcb(tp); +	for (i = 0; i < pace->rp_num_hptss; i++) +		dump_hpts_entry(ctx, pace->rp_ent[i]); + +	hpts = pace->rp_ent[tp->t_hpts_cpu]; +	KTEST_EQUAL(hpts->p_on_queue_cnt, 1); +	KTEST_EQUAL(hpts->p_prev_slot, 0); +	KTEST_EQUAL(hpts->p_cur_slot, 0); +	KTEST_EQUAL(hpts->p_runningslot, 0); +	KTEST_EQUAL(hpts->p_nxt_slot, 1); +	KTEST_EQUAL(hpts->p_hpts_active, 0); + +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_EQUAL(tp->t_hpts_request, 0); +	KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + +	/* Set our test flag to indicate the tcpcb should be removed from the +	 * wheel when tcp_output is called. */ +	TP_REMOVE_FROM_HPTS(tp) = 1; + +	/* Test early exit condition: advance time by insufficient amount */ +	KTEST_LOG(ctx, "Testing early exit with insufficient time advancement"); +	test_time_usec += 1; /* Very small advancement - should cause early exit */ +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Should return 0 slots due to insufficient time advancement */ +	KTEST_EQUAL(slots_ran, 0); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */ +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */ + +	/* Wait for 498 more usecs and trigger the HPTS workers and verify +	 * nothing happens yet (total 499 usec) */ +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); +	test_time_usec += 498; +	for (i = 0; i < pace->rp_num_hptss; i++) { +		KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); +		HPTS_LOCK(pace->rp_ent[i]); +		NET_EPOCH_ENTER(et); +		slots_ran = tcp_hptsi(pace->rp_ent[i], true); +		HPTS_UNLOCK(pace->rp_ent[i]); +		NET_EPOCH_EXIT(et); + +		dump_hpts_entry(ctx, pace->rp_ent[i]); +		KTEST_VERIFY(slots_ran >= 0); +		KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49); +		KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49); +	} + +	dump_tcpcb(tp); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_EQUAL(tp->t_hpts_request, 0); +	KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); +	KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + +	/* Wait for 1 more usec and trigger the HPTS workers and verify it +	 * triggers tcp_output this time */ +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); +	test_time_usec += 1; +	for (i = 0; i < pace->rp_num_hptss; i++) { +		KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); +		HPTS_LOCK(pace->rp_ent[i]); +		NET_EPOCH_ENTER(et); +		slots_ran = tcp_hptsi(pace->rp_ent[i], true); +		HPTS_UNLOCK(pace->rp_ent[i]); +		NET_EPOCH_EXIT(et); + +		dump_hpts_entry(ctx, pace->rp_ent[i]); +		KTEST_VERIFY(slots_ran >= 0); +		KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50); +		
KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50); +	} + +	dump_tcpcb(tp); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); +	KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into + * the HPTS wheel, testing performance under high load conditions. + */ +KTEST_FUNC(scalability_tcpcbs) +{ +	struct tcp_hptsi *pace; +	struct tcpcb **tcpcbs; +	uint32_t i, num_tcpcbs = 100000, total_queued = 0; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Allocate array to hold pointers to all tcpcbs */ +	tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); +	KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + +	/* Create a LOT of tcpcbs */ +	KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs); +	for (i = 0; i < num_tcpcbs; i++) { +		tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); +		if (tcpcbs[i] == NULL) { +			KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL"); +			return (EINVAL); +		} +	} + +	/* Insert all created tcpcbs into HPTS */ +	KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS..."); +	for (i = 0; i < num_tcpcbs; i++) { +		INP_WLOCK(&tcpcbs[i]->t_inpcb); +		tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; +		/* Insert with varying future timeouts to distribute across slots */ +		tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL); +		INP_WUNLOCK(&tcpcbs[i]->t_inpcb); +	} + +	/* Verify total queue counts across all CPUs */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		total_queued += pace->rp_ent[i]->p_on_queue_cnt; +	} +	KTEST_EQUAL(total_queued, num_tcpcbs); + +	for (i = 0; i < pace->rp_num_hptss; i++) +		dump_hpts_entry(ctx, pace->rp_ent[i]); + +	/* Remove all tcpcbs from HPTS */ +	KTEST_LOG(ctx, "Removing all tcpcbs from HPTS..."); +	for (i = 0; i < num_tcpcbs; i++) { +		INP_WLOCK(&tcpcbs[i]->t_inpcb); +		if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { +			tcp_hpts_remove(pace, tcpcbs[i]); +		} +		INP_WUNLOCK(&tcpcbs[i]->t_inpcb); +	} + +	/* Verify all queues are now empty */ +	for (i = 0; i < pace->rp_num_hptss; i++) { +		if (pace->rp_ent[i]->p_on_queue_cnt != 0) { +			KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0"); +			return (EINVAL); +		} +	} + +	for (i = 0; i < num_tcpcbs; i++) { +		test_hpts_free_tcpcb(tcpcbs[i]); +	} +	free(tcpcbs, M_TCPHPTS); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates wheel wrap scenarios where the timer falls significantly behind + * and needs to process more than one full wheel revolution worth of slots. 
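+ *
+ * The fake clock is advanced by more than a full wheel's worth of slots in a
+ * single step, so one tcp_hptsi() call has to walk the entire wheel and drain
+ * every queued connection.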
+ */ +KTEST_FUNC(wheel_wrap_recovery) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcpcb **tcpcbs; +	uint32_t i, timeout_usecs, num_tcpcbs = 500; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Allocate array to hold pointers to tcpcbs */ +	tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); +	KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + +	/* Create tcpcbs and insert them across many slots */ +	for (i = 0; i < num_tcpcbs; i++) { +		tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); +		KTEST_NEQUAL(tcpcbs[i], NULL); +		TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; + +		timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */ + +		INP_WLOCK(&tcpcbs[i]->t_inpcb); +		tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; +		tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL); +		INP_WUNLOCK(&tcpcbs[i]->t_inpcb); +	} + +	/* Fast forward time significantly to trigger wheel wrap */ +	test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; + +	for (i = 0; i < pace->rp_num_hptss; i++) { +		KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i); +		KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + +		HPTS_LOCK(pace->rp_ent[i]); +		NET_EPOCH_ENTER(et); +		slots_ran = tcp_hptsi(pace->rp_ent[i], true); +		HPTS_UNLOCK(pace->rp_ent[i]); +		NET_EPOCH_EXIT(et); + +		KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS-1); /* Should process all slots */ +		KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); +		KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot, +			pace->rp_ent[i]->p_prev_slot); +	} + +	/* Cleanup */ +	for (i = 0; i < num_tcpcbs; i++) { +		INP_WLOCK(&tcpcbs[i]->t_inpcb); +		if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { +			tcp_hpts_remove(pace, tcpcbs[i]); +		} +		INP_WUNLOCK(&tcpcbs[i]->t_inpcb); +		test_hpts_free_tcpcb(tcpcbs[i]); +	} +	free(tcpcbs, M_TCPHPTS); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs + * when a tcpcb is being processed by the HPTS thread but gets removed. 
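+ *
+ * A tcpcb left in IHPTS_MOVING with t_hpts_slot set to -1 should be cleaned
+ * up by the wheel walk itself rather than generating a tcp_output() call.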
+ */ +KTEST_FUNC(tcpcb_moving_state) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcpcb *tp1, *tp2; +	struct tcp_hpts_entry *hpts; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Create two tcpcbs on the same CPU/slot */ +	tp1 = test_hpts_create_tcpcb(ctx, pace); +	tp2 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp1, NULL); +	KTEST_NEQUAL(tp2, NULL); + +	/* Force them to the same CPU for predictable testing */ +	tp1->t_hpts_cpu = 0; +	tp2->t_hpts_cpu = 0; + +	/* Insert both into the same slot */ +	INP_WLOCK(&tp1->t_inpcb); +	tp1->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp1, 100, NULL); +	INP_WUNLOCK(&tp1->t_inpcb); + +	INP_WLOCK(&tp2->t_inpcb); +	tp2->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp2, 100, NULL); +	INP_WUNLOCK(&tp2->t_inpcb); + +	hpts = pace->rp_ent[0]; + +	/* Manually transition tp1 to MOVING state to simulate race condition */ +	HPTS_LOCK(hpts); +	tp1->t_in_hpts = IHPTS_MOVING; +	tp1->t_hpts_slot = -1; /* Mark for removal */ +	HPTS_UNLOCK(hpts); + +	/* Set time and run HPTS to process the moving state */ +	test_time_usec += 100; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	KTEST_VERIFY(slots_ran >= 0); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Shouldn't call on both */ + +	/* tp1 should be cleaned up and removed */ +	KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); +	/* tp2 should have been processed normally */ +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + +	test_hpts_free_tcpcb(tp1); +	test_hpts_free_tcpcb(tp2); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are + * properly handled and re-inserted into appropriate future slots after + * the wheel processes enough slots to accommodate the original request. 
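+ *
+ * Timeouts longer than the wheel leave the overflow in t_hpts_request; each
+ * full pass of the wheel can retire at most NUM_OF_HPTSI_SLOTS slots of that
+ * overflow, so very large requests take several passes to drain.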
+ */ +KTEST_FUNC(deferred_requests) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcpcb *tp, *tp2; +	struct tcp_hpts_entry *hpts; +	uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */ +	uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */ +	uint32_t initial_request; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); + +	/* Insert with a request that exceeds current wheel capacity */ +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL); +	INP_WUNLOCK(&tp->t_inpcb); + +	/* Verify it was inserted with a deferred request */ +	dump_tcpcb(tp); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_VERIFY(tp->t_hpts_request > 0); +	KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + +	hpts = pace->rp_ent[tp->t_hpts_cpu]; + +	/* Advance time to process deferred requests */ +	test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + +	/* Process the wheel to handle deferred requests */ +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	dump_hpts_entry(ctx, hpts); +	KTEST_GREATER_THAN(slots_ran, 0); +	dump_tcpcb(tp); +	KTEST_EQUAL(tp->t_hpts_request, 0); + +	/* Test incremental deferred request processing over multiple cycles */ +	KTEST_LOG(ctx, "Testing incremental deferred request processing"); + +	/* Create a new connection with an even larger request */ +	tp2 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp2, NULL); +	tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */ + +	INP_WLOCK(&tp2->t_inpcb); +	tp2->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL); +	INP_WUNLOCK(&tp2->t_inpcb); + +	/* Verify initial deferred request */ +	initial_request = tp2->t_hpts_request; +	KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS); + +	/* Process one wheel cycle - should reduce but not eliminate request */ +	test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Request should be reduced but not zero */ +	KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request); +	KTEST_VERIFY(tp2->t_hpts_request > 0); +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */ + +	/* For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT, we need ~3 cycles to complete. +	 * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS. 
*/ +	test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */ +	KTEST_VERIFY(tp2->t_hpts_request < initial_request); +	KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */ + +	/* Clean up second connection */ +	INP_WLOCK(&tp2->t_inpcb); +	if (tp2->t_in_hpts != IHPTS_NONE) { +		tcp_hpts_remove(pace, tp2); +	} +	INP_WUNLOCK(&tp2->t_inpcb); +	test_hpts_free_tcpcb(tp2); + +	/* Clean up */ +	INP_WLOCK(&tp->t_inpcb); +	if (tp->t_in_hpts != IHPTS_NONE) { +		tcp_hpts_remove(pace, tp); +	} +	INP_WUNLOCK(&tp->t_inpcb); +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates CPU assignment and affinity mechanisms, including flowid-based + * assignment, random fallback scenarios, and explicit CPU setting. Tests + * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts. + */ +KTEST_FUNC(cpu_assignment) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp1, *tp2, *tp2_dup, *tp3; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); + +	/* Test random CPU assignment (no flowid) */ +	tp1 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp1, NULL); +	tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE; +	INP_WLOCK(&tp1->t_inpcb); +	tcp_set_hpts(pace, tp1); +	INP_WUNLOCK(&tp1->t_inpcb); +	KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss); +	KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET); + +	/* Test flowid-based assignment */ +	tp2 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp2, NULL); +	tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; +	tp2->t_inpcb.inp_flowid = 12345; +	INP_WLOCK(&tp2->t_inpcb); +	tcp_set_hpts(pace, tp2); +	INP_WUNLOCK(&tp2->t_inpcb); +	KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss); +	KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET); + +	/* With the same flowid, should get same CPU assignment */ +	tp2_dup = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp2_dup, NULL); +	tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; +	tp2_dup->t_inpcb.inp_flowid = 12345; +	INP_WLOCK(&tp2_dup->t_inpcb); +	tcp_set_hpts(pace, tp2_dup); +	INP_WUNLOCK(&tp2_dup->t_inpcb); +	KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu); + +	/* Test explicit CPU setting */ +	tp3 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp3, NULL); +	tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */ +	tp3->t_flags2 |= TF2_HPTS_CPU_SET; +	INP_WLOCK(&tp3->t_inpcb); +	tcp_set_hpts(pace, tp3); +	INP_WUNLOCK(&tp3->t_inpcb); +	KTEST_EQUAL(tp3->t_hpts_cpu, 1); + +	test_hpts_free_tcpcb(tp1); +	test_hpts_free_tcpcb(tp2); +	test_hpts_free_tcpcb(tp2_dup); +	test_hpts_free_tcpcb(tp3); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates edge cases in slot calculation including boundary conditions + * around slot 0, maximum slots, and slot wrapping arithmetic. 
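+ *
+ * HPTS_USEC_TO_SLOTS() rounds up, so a 1 usec timeout already lands one slot
+ * ahead, while (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT exercises the
+ * largest timeout that can be represented without wrapping the wheel.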
+ */ +KTEST_FUNC(slot_boundary_conditions) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Test insertion at slot 0 */ +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */ +	INP_WUNLOCK(&tp->t_inpcb); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + +	INP_WLOCK(&tp->t_inpcb); +	tcp_hpts_remove(pace, tp); +	INP_WUNLOCK(&tp->t_inpcb); + +	/* Test insertion at maximum slot value */ +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL); +	INP_WUNLOCK(&tp->t_inpcb); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + +	INP_WLOCK(&tp->t_inpcb); +	tcp_hpts_remove(pace, tp); +	INP_WUNLOCK(&tp->t_inpcb); + +	/* Test very small timeout values */ +	INP_WLOCK(&tp->t_inpcb); +	tp->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp, 1, NULL); +	INP_WUNLOCK(&tp->t_inpcb); +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */ + +	INP_WLOCK(&tp->t_inpcb); +	tcp_hpts_remove(pace, tp); +	INP_WUNLOCK(&tp->t_inpcb); + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates HPTS behavior under high load conditions, including proper + * processing of many connections and connection count tracking. + */ +KTEST_FUNC(dynamic_sleep_adjustment) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcpcb **tcpcbs; +	struct tcp_hpts_entry *hpts; +	uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	/* Create many connections to exceed threshold */ +	tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); +	KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + +	for (i = 0; i < num_tcpcbs; i++) { +		tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); +		KTEST_NEQUAL(tcpcbs[i], NULL); +		tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */ +		INP_WLOCK(&tcpcbs[i]->t_inpcb); +		tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; +		TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */ +		tcp_hpts_insert(pace, tcpcbs[i], 100, NULL); +		INP_WUNLOCK(&tcpcbs[i]->t_inpcb); +	} + +	hpts = pace->rp_ent[0]; +	dump_hpts_entry(ctx, hpts); + +	/* Verify we're above threshold */ +	KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD); + +	/* Run HPTS to process many connections */ +	test_time_usec += 100; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Verify HPTS processed slots and connections correctly */ +	KTEST_GREATER_THAN(slots_ran, 0); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs); + +	/* Verify all connections were removed from queue */ +	KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + +	/* Cleanup */ +	for (i = 0; i < num_tcpcbs; i++) { +		test_hpts_free_tcpcb(tcpcbs[i]); +	} +	free(tcpcbs, M_TCPHPTS); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates handling of concurrent insert/remove operations and race conditions + * between HPTS 
processing and user operations. + */ +KTEST_FUNC(concurrent_operations) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp1, *tp2; +	struct tcp_hpts_entry *hpts; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	tp1 = test_hpts_create_tcpcb(ctx, pace); +	tp2 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp1, NULL); +	KTEST_NEQUAL(tp2, NULL); + +	/* Force all to CPU 0 */ +	tp1->t_hpts_cpu = 0; +	tp2->t_hpts_cpu = 0; + +	/* Insert tp1 */ +	INP_WLOCK(&tp1->t_inpcb); +	tp1->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp1, 100, NULL); +	INP_WUNLOCK(&tp1->t_inpcb); + +	/* Insert tp2 into same slot */ +	INP_WLOCK(&tp2->t_inpcb); +	tp2->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp2, 100, NULL); +	INP_WUNLOCK(&tp2->t_inpcb); + +	/* Verify both are inserted */ +	KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + +	/* Verify they're both assigned to the same slot */ +	KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot); + +	/* Verify queue count reflects both connections */ +	KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */ +	hpts = pace->rp_ent[tp1->t_hpts_cpu]; +	KTEST_EQUAL(hpts->p_on_queue_cnt, 2); + +	/* Remove tp1 while tp2 is still there */ +	INP_WLOCK(&tp1->t_inpcb); +	tcp_hpts_remove(pace, tp1); +	INP_WUNLOCK(&tp1->t_inpcb); + +	/* Verify tp1 removed, tp2 still there */ +	KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + +	/* Verify queue count decreased by one */ +	KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + +	/* Remove tp2 */ +	INP_WLOCK(&tp2->t_inpcb); +	tcp_hpts_remove(pace, tp2); +	INP_WUNLOCK(&tp2->t_inpcb); + +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + +	/* Verify queue is now completely empty */ +	KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + +	test_hpts_free_tcpcb(tp1); +	test_hpts_free_tcpcb(tp2); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates the queued segments processing path via tfb_do_queued_segments, + * which is an alternative to direct tcp_output calls. 
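+ *
+ * A connection flagged with TF2_SUPPORTS_MBUFQ and holding packets on its
+ * t_inqueue is expected to be handed to tfb_do_queued_segments() when its
+ * slot fires, rather than to the stack's tfb_tcp_output().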
+ */ +KTEST_FUNC(queued_segments_processing) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcpcb *tp; +	struct tcp_hpts_entry *hpts; +	struct mbuf *fake_mbuf; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); + +	/* Create a minimal fake mbuf that has valid STAILQ pointers */ +	fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO); +	KTEST_NEQUAL(fake_mbuf, NULL); + +	/* Set up for queued segments path */ +	tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ); +	STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt); + +	INP_WLOCK(&tp->t_inpcb); +	tcp_hpts_insert(pace, tp, 100, NULL); +	INP_WUNLOCK(&tp->t_inpcb); + +	hpts = pace->rp_ent[tp->t_hpts_cpu]; + +	/* Run HPTS and verify queued segments path is taken */ +	test_time_usec += 100; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	KTEST_VERIFY(slots_ran >= 0); +	KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1); + +	/* Connection should be removed from HPTS after processing */ +	KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + +	/* Clean up the fake mbuf if it's still in the queue */ +	if (!STAILQ_EMPTY(&tp->t_inqueue)) { +		struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue); +		STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); +		free(m, M_TCPHPTS); +	} + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates the direct wake mechanism and wake inhibition logic when + * the connection count exceeds thresholds. + */ +KTEST_FUNC(direct_wake_mechanism) +{ +	struct tcp_hptsi *pace; +	struct tcpcb *tp; +	struct tcp_hpts_entry *hpts; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	tp = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp, NULL); +	hpts = pace->rp_ent[tp->t_hpts_cpu]; + +	/* Test direct wake when not over threshold */ +	HPTS_LOCK(hpts); +	hpts->p_on_queue_cnt = 50; /* Below threshold */ +	hpts->p_hpts_wake_scheduled = 0; +	tcp_hpts_wake(hpts); +	KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1); +	KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); +	HPTS_UNLOCK(hpts); + +	/* Reset for next test */ +	hpts->p_hpts_wake_scheduled = 0; +	call_counts[CCNT_SWI_SCHED] = 0; + +	/* Test wake inhibition when over threshold */ +	HPTS_LOCK(hpts); +	hpts->p_on_queue_cnt = 200; /* Above threshold */ +	hpts->p_direct_wake = 1; /* Request direct wake */ +	tcp_hpts_wake(hpts); +	KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */ +	KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */ +	KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */ +	HPTS_UNLOCK(hpts); + +	test_hpts_free_tcpcb(tp); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates HPTS collision detection when attempting to run HPTS while + * it's already active. 
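+ *
+ * With p_hpts_active already set, a non-callout invocation of tcp_hptsi()
+ * should back off and report zero slots processed.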
+ */ +KTEST_FUNC(hpts_collision_detection) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcp_hpts_entry *hpts; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	hpts = pace->rp_ent[0]; + +	/* Mark HPTS as active */ +	HPTS_LOCK(hpts); +	hpts->p_hpts_active = 1; +	HPTS_UNLOCK(hpts); + +	/* Attempt to run HPTS again - should detect collision */ +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */ +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Should return 0 indicating no work done due to collision */ +	KTEST_EQUAL(slots_ran, 0); + +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +/* + * Validates generation count handling for race condition detection between + * HPTS processing and connection insertion/removal operations. + */ +KTEST_FUNC(generation_count_validation) +{ +	struct epoch_tracker et; +	struct tcp_hptsi *pace; +	struct tcp_hpts_entry *hpts; +	struct tcpcb *tp1, *tp2; +	uint32_t initial_gencnt, slot_to_test = 10; +	uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT; +	uint32_t tp2_original_gencnt; +	int32_t slots_ran; + +	test_hpts_init(); + +	pace = tcp_hptsi_create(&test_funcs, false); +	KTEST_NEQUAL(pace, NULL); +	tcp_hptsi_start(pace); + +	hpts = pace->rp_ent[0]; + +	/* Record initial generation count for the test slot */ +	initial_gencnt = hpts->p_hptss[slot_to_test].gencnt; + +	/* Create and insert first connection */ +	tp1 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp1, NULL); +	tp1->t_hpts_cpu = 0; /* Force to CPU 0 */ + +	INP_WLOCK(&tp1->t_inpcb); +	tp1->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp1, timeout_usecs, NULL); +	INP_WUNLOCK(&tp1->t_inpcb); + +	/* Verify connection stored the generation count */ +	KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); +	KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test); +	KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt); + +	/* Create second connection but don't insert yet */ +	tp2 = test_hpts_create_tcpcb(ctx, pace); +	KTEST_NEQUAL(tp2, NULL); +	tp2->t_hpts_cpu = 0; /* Force to CPU 0 */ + +	/* Force generation count increment by processing the slot */ +	test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; +	HPTS_LOCK(hpts); +	NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Verify processing occurred */ +	KTEST_VERIFY(slots_ran > 0); +	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + +	/* Verify generation count was incremented */ +	KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1); + +	/* Verify first connection was processed and removed */ +	KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + +	/* Insert second connection and record its generation count */ +	INP_WLOCK(&tp2->t_inpcb); +	tp2->t_flags2 |= TF2_HPTS_CALLS; +	tcp_hpts_insert(pace, tp2, timeout_usecs, NULL); +	INP_WUNLOCK(&tp2->t_inpcb); + +	/* Verify connection was inserted successfully */ +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + +	/* Record the generation count that tp2 received */ +	tp2_original_gencnt = tp2->t_hpts_gencnt; + +	/* Test generation count mismatch detection during processing */ +	/* Manually set stale generation count to simulate race condition */ +	tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */ + +	/* Process the slot to trigger generation count validation */ +	test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; +	HPTS_LOCK(hpts); +	
NET_EPOCH_ENTER(et); +	slots_ran = tcp_hptsi(hpts, true); +	HPTS_UNLOCK(hpts); +	NET_EPOCH_EXIT(et); + +	/* Connection should be processed despite generation count mismatch */ +	KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */ + +	/* The key test: HPTS should handle mismatched generation counts gracefully */ +	KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */ + +	test_hpts_free_tcpcb(tp1); +	test_hpts_free_tcpcb(tp2); +	tcp_hptsi_stop(pace); +	tcp_hptsi_destroy(pace); + +	return (0); +} + +static const struct ktest_test_info tests[] = { +	KTEST_INFO(module_load), +	KTEST_INFO(hptsi_create_destroy), +	KTEST_INFO(hptsi_start_stop), +	KTEST_INFO(hptsi_independence), +	KTEST_INFO(function_injection), +	KTEST_INFO(tcpcb_initialization), +	KTEST_INFO(tcpcb_insertion), +	KTEST_INFO(timer_functionality), +	KTEST_INFO(scalability_tcpcbs), +	KTEST_INFO(wheel_wrap_recovery), +	KTEST_INFO(tcpcb_moving_state), +	KTEST_INFO(deferred_requests), +	KTEST_INFO(cpu_assignment), +	KTEST_INFO(slot_boundary_conditions), +	KTEST_INFO(dynamic_sleep_adjustment), +	KTEST_INFO(concurrent_operations), +	KTEST_INFO(queued_segments_processing), +	KTEST_INFO(direct_wake_mechanism), +	KTEST_INFO(hpts_collision_detection), +	KTEST_INFO(generation_count_validation), +}; + +#else /* TCP_HPTS_KTEST */ + +/* + * Stub to indicate that the TCP HPTS ktest is not enabled. + */ +KTEST_FUNC(module_load_without_tests) +{ +	KTEST_LOG(ctx, "Warning: TCP HPTS ktest is not enabled"); +	return (0); +} + +static const struct ktest_test_info tests[] = { +	KTEST_INFO(module_load_without_tests), +}; + +#endif + +KTEST_MODULE_DECLARE(ktest_tcphpts, tests); +KTEST_MODULE_DEPEND(ktest_tcphpts, tcphpts); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index dd27ec77c1af..2146b0cac48f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -219,7 +219,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,      &VNET_NAME(tcp_do_autorcvbuf), 0,      "Enable automatic receive buffer sizing"); -VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024;  SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,      &VNET_NAME(tcp_autorcvbuf_max), 0,      "Max size of automatic receive buffer"); diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 64efa4bf060f..9b5baf115855 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1475,10 +1475,11 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)   	}  	/* create sequence number */ -	lc->lro_mbuf_data[lc->lro_mbuf_count].seq = -	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | -	    (((uint64_t)mb->m_pkthdr.flowid) << 24) | -	    ((uint64_t)lc->lro_mbuf_count); +	lc->lro_mbuf_data[lc->lro_mbuf_count].seq = lc->lro_mbuf_count; +	if (M_HASHTYPE_ISHASH(mb)) +		lc->lro_mbuf_data[lc->lro_mbuf_count].seq |= +		    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | +		    (((uint64_t)mb->m_pkthdr.flowid) << 24);  	/* enter mbuf */  	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index 43587285fe26..ac1a27a4290a 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -29,6 +29,8 @@  #include "opt_inet6.h"  #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h>  #include <sys/systm.h>  #include <sys/kernel.h>  #include <sys/malloc.h> @@ -62,6 +64,7 @@  #include <netinet/tcp_lro.h>  #include <netinet/tcp_var.h>  #include 
<netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h>  #ifdef TCP_BLACKBOX  #include <netinet/tcp_log_buf.h>  #endif diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 2dfb7faf56e3..208f72c4661c 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -123,7 +123,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,  	&VNET_NAME(tcp_autosndbuf_inc), 0,  	"Incrementor step size of automatic send buffer"); -VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autosndbuf_max) = 8*1024*1024;  SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,  	&VNET_NAME(tcp_autosndbuf_max), 0,  	"Max size of automatic send buffer"); diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f2d7867df9b4..10383bc0801e 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -477,10 +477,10 @@ bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,  		    uint16_t set);  static struct bbr_sendmap *  bbr_find_lowest_rsm(struct tcp_bbr *bbr); -static __inline uint32_t +static inline uint32_t  bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);  static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay,  		 uint8_t which);  static void  bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, @@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,  static void  bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);  static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay,  		    uint32_t del_by, uint32_t cts, uint32_t sloton,  		    uint32_t prev_delay);  static void @@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr)  }  static void -bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) +bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len)  {  	struct inpcb *inp = tptoinpcb(tp);  	struct hpts_diag diag; @@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  	bbr->r_ctl.rc_timer_exp = 0;  	prev_delay = bbr->r_ctl.rc_last_delay_val;  	if (bbr->r_ctl.rc_last_delay_val && -	    (slot == 0)) { +	    (pacing_delay == 0)) {  		/*  		 * If a previous pacer delay was in place we  		 * are not coming from the output side (where  		 * we calculate a delay, more likely a timer).  		 */ -		slot = bbr->r_ctl.rc_last_delay_val; +		pacing_delay = bbr->r_ctl.rc_last_delay_val;  		if (TSTMP_GT(cts, bbr->rc_pacer_started)) {  			/* Compensate for time passed  */  			delay_calc = cts - bbr->rc_pacer_started; -			if (delay_calc <= slot) -				slot -= delay_calc; +			if (delay_calc <= pacing_delay) +				pacing_delay -= delay_calc;  		}  	}  	/* Do we have early to make up for by pushing out the pacing time? 
*/  	if (bbr->r_agg_early_set) { -		bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); -		slot += bbr->r_ctl.rc_agg_early; +		bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2); +		pacing_delay += bbr->r_ctl.rc_agg_early;  		bbr->r_ctl.rc_agg_early = 0;  		bbr->r_agg_early_set = 0;  	}  	/* Are we running a total debt that needs to be compensated for? */  	if (bbr->r_ctl.rc_hptsi_agg_delay) { -		if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { +		if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) {  			/* We nuke the delay */ -			slot -= bbr->r_ctl.rc_hptsi_agg_delay; +			pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay;  			bbr->r_ctl.rc_hptsi_agg_delay = 0;  		} else {  			/* We nuke some of the delay, put in a minimal 100usecs  */ -			bbr->r_ctl.rc_hptsi_agg_delay -= slot; -			bbr->r_ctl.rc_last_delay_val = slot = 100; +			bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay; +			bbr->r_ctl.rc_last_delay_val = pacing_delay = 100;  		}  	} -	bbr->r_ctl.rc_last_delay_val = slot; +	bbr->r_ctl.rc_last_delay_val = pacing_delay;  	hpts_timeout = bbr_timer_start(tp, bbr, cts);  	if (tp->t_flags & TF_DELACK) {  		if (bbr->rc_in_persist == 0) { @@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  		bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;  		hpts_timeout = delayed_ack;  	} -	if (slot) { +	if (pacing_delay) {  		/* Mark that we have a pacing timer up */  		BBR_STAT_INC(bbr_paced_segments);  		bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; @@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  	 * wheel, we resort to a keep-alive timer if its configured.  	 */  	if ((hpts_timeout == 0) && -	    (slot == 0)) { +	    (pacing_delay == 0)) {  		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&  		    (tp->t_state <= TCPS_CLOSING)) {  			/* @@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  		if (left < hpts_timeout)  			hpts_timeout = left;  	} -	if (bbr->r_ctl.rc_incr_tmrs && slot && +	if (bbr->r_ctl.rc_incr_tmrs && pacing_delay &&  	    (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {  		/*  		 * If configured to do so, and the timer is either @@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  		 * this extra delay but this is easier and being more  		 * conservative is probably better.  		 */ -		hpts_timeout += slot; +		hpts_timeout += pacing_delay;  	}  	if (hpts_timeout) {  		/* @@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  		bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;  	} else  		bbr->r_ctl.rc_timer_exp = 0; -	if ((slot) && +	if ((pacing_delay) &&  	    (bbr->rc_use_google ||  	     bbr->output_error_seen || -	     (slot <= hpts_timeout))  ) { +	     (pacing_delay <= hpts_timeout))  ) {  		/*  		 * Tell LRO that it can queue packets while  		 * we pace. 
@@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  			tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;  		bbr->rc_pacer_started = cts; -		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), -					   __LINE__, &diag); +		tcp_hpts_insert(tp, pacing_delay, &diag);  		bbr->rc_timer_first = 0;  		bbr->bbr_timer_src = frm; -		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); +		bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1);  		bbr_log_hpts_diag(bbr, cts, &diag);  	} else if (hpts_timeout) { -		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), -					   __LINE__, &diag); +		tcp_hpts_insert(tp, hpts_timeout, &diag);  		/* -		 * We add the flag here as well if the slot is set, +		 * We add the flag here as well if the pacing delay is set,  		 * since hpts will call in to clear the queue first before  		 * calling the output routine (which does our timers).  		 * We don't want to set the flag if its just a timer @@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  		 * on a keep-alive timer and a request comes in for  		 * more data.  		 */ -		if (slot) +		if (pacing_delay)  			bbr->rc_pacer_started = cts;  		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&  		    (bbr->rc_cwnd_limited == 0)) { @@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_  			    TF2_DONT_SACK_QUEUE);  		}  		bbr->bbr_timer_src = frm; -		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); +		bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0);  		bbr_log_hpts_diag(bbr, cts, &diag);  		bbr->rc_timer_first = 1;  	}  	bbr->rc_tmr_stopped = 0; -	bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); +	bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay);  }  static void @@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock  	}  	/*  	 * Ok the timer originally started is not what we want now. We will -	 * force the hpts to be stopped if any, and restart with the slot -	 * set to what was in the saved slot. +	 * force the hpts to be stopped if any, and restart with the pacing +	 * delay set to what was in the saved delay.  	 
*/  wrong_timer:  	if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { @@ -1843,7 +1841,7 @@ bbr_counter_destroy(void)  } -static __inline void +static inline void  bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)  {  	memset(l, 0, sizeof(union tcp_log_stackspecific)); @@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)  		log.u_bbr.flex2 = diag->p_cur_slot;  		log.u_bbr.flex3 = diag->slot_req;  		log.u_bbr.flex4 = diag->inp_hptsslot; -		log.u_bbr.flex5 = diag->slot_remaining; +		log.u_bbr.flex5 = diag->time_remaining;  		log.u_bbr.flex6 = diag->need_new_to;  		log.u_bbr.flex7 = diag->p_hpts_active;  		log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)  		log.u_bbr.bw_inuse = diag->wheel_slot;  		log.u_bbr.rttProp = diag->wheel_cts;  		log.u_bbr.delRate = diag->maxslots; -		log.u_bbr.cur_del_rate = diag->p_curtick; -		log.u_bbr.cur_del_rate <<= 32; -		log.u_bbr.cur_del_rate |= diag->p_lasttick;  		TCP_LOG_EVENTP(bbr->rc_tp, NULL,  		    &bbr->rc_inp->inp_socket->so_rcv,  		    &bbr->rc_inp->inp_socket->so_snd, @@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,  }  static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)  {  	if (tcp_bblogging_on(bbr->rc_tp)) {  		union tcp_log_stackspecific log; @@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u  		log.u_bbr.flex1 = bbr->bbr_timer_src;  		log.u_bbr.flex2 = to;  		log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; -		log.u_bbr.flex4 = slot; +		log.u_bbr.flex4 = pacing_delay;  		log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot;  		log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);  		log.u_bbr.pkts_out = bbr->rc_tp->t_flags2; @@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,  }  static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)  {  	if (tcp_bblogging_on(bbr->rc_tp)) {  		union tcp_log_stackspecific log;  		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); -		log.u_bbr.flex1 = slot; +		log.u_bbr.flex1 = pacing_delay;  		log.u_bbr.flex2 = del_by;  		log.u_bbr.flex3 = prev_delay;  		log.u_bbr.flex4 = line; @@ -4211,7 +4206,7 @@ bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,  /*   * Return one of three RTTs to use (in microseconds).   
*/ -static __inline uint32_t +static inline uint32_t  bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type)  {  	uint32_t f_rtt; @@ -4375,7 +4370,7 @@ bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)  	return (0);  } -static __inline void +static inline void  bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start)  {  	int idx; @@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t  		left = bbr->r_ctl.rc_timer_exp - cts;  		ret = -3;  		bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); -		tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left)); +		tcp_hpts_insert(tp, left, NULL);  		return (1);  	}  	bbr->rc_tmr_stopped = 0; @@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)  				else  					time_since_send = 0;  				if (bbr->r_ctl.rc_last_delay_val > time_since_send) { -					/* Cut down our slot time */ +					/* Cut down our pacing_delay time */  					bbr->r_ctl.rc_last_delay_val -= time_since_send;  				} else {  					bbr->r_ctl.rc_last_delay_val = 0; @@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t  	 * sequence 1 for 10 bytes. In such an example the r_start would be  	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.  	 * This means that r_end is actually the first sequence for the next -	 * slot (11). +	 * pacing delay (11).  	 *  	 */  	INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)  	struct bbr_sendmap *rsm = NULL;  	int32_t tso, mtu;  	struct tcpopt to; -	int32_t slot = 0; +	int32_t pacing_delay = 0;  	struct inpcb *inp;  	struct sockbuf *sb;  	bool hpts_calling; @@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)  			delay_calc -= bbr->r_ctl.rc_last_delay_val;  		else {  			/* -			 * We are early setup to adjust -			 * our slot time. +			 * We are early setup to adjust out pacing delay.  			 
*/  			uint64_t merged_val; @@ -12104,7 +12098,7 @@ again:  #endif  	error = 0;  	tso = 0; -	slot = 0; +	pacing_delay = 0;  	mtu = 0;  	sendwin = min(tp->snd_wnd, tp->snd_cwnd);  	sb_offset = tp->snd_max - tp->snd_una; @@ -12126,7 +12120,7 @@ recheck_resend:  			tot_len = tp->t_maxseg;  			if (hpts_calling)  				/* Retry in a ms */ -				slot = 1001; +				pacing_delay = 1001;  			goto just_return_nolock;  		}  		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); @@ -12699,9 +12693,9 @@ just_return:  	SOCK_SENDBUF_UNLOCK(so);  just_return_nolock:  	if (tot_len) -		slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); +		pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);  	if (bbr->rc_no_pacing) -		slot = 0; +		pacing_delay = 0;  	if (tot_len == 0) {  		if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=  		    tp->snd_wnd) { @@ -12751,7 +12745,7 @@ just_return_nolock:  	/* Dont update the time if we did not send */  	bbr->r_ctl.rc_last_delay_val = 0;  	bbr->rc_output_starts_timer = 1; -	bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); +	bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len);  	bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);  	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {  		/* Make sure snd_nxt is drug up */ @@ -12787,7 +12781,7 @@ send:  				flags &= ~TH_FIN;  				if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {  					/* Lets not send this */ -					slot = 0; +					pacing_delay = 0;  					goto just_return;  				}  			} @@ -13053,7 +13047,7 @@ send:  		/*  		 * We have outstanding data, don't send a fin by itself!.  		 */ -		slot = 0; +		pacing_delay = 0;  		goto just_return;  	}  	/* @@ -13763,7 +13757,7 @@ nomore:  				if (tp->snd_cwnd < maxseg)  					tp->snd_cwnd = maxseg;  			} -			slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; +			pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;  			BBR_STAT_INC(bbr_saw_enobuf);  			if (bbr->bbr_hdrw_pacing)  				counter_u64_add(bbr_hdwr_pacing_enobuf, 1); @@ -13812,18 +13806,18 @@ nomore:  				}  				/*  				 * Nuke all other things that can interfere -				 * with slot +				 * with pacing delay  				 */  				if ((tot_len + len) && (len >= tp->t_maxseg)) { -					slot = bbr_get_pacing_delay(bbr, +					pacing_delay = bbr_get_pacing_delay(bbr,  					    bbr->r_ctl.rc_bbr_hptsi_gain,  					    (tot_len + len), cts, 0); -					if (slot < bbr_error_base_paceout) -						slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; +					if (pacing_delay < bbr_error_base_paceout) +						pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;  				} else -					slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; +					pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;  				bbr->rc_output_starts_timer = 1; -				bbr_start_hpts_timer(bbr, tp, cts, 10, slot, +				bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay,  				    tot_len);  				return (error);  			} @@ -13841,9 +13835,9 @@ nomore:  			}  			/* FALLTHROUGH */  		default: -			slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; +			pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;  			bbr->rc_output_starts_timer = 1; -			bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); +			bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0);  			return (error);  		}  #ifdef STATS @@ -13981,12 +13975,12 @@ skip_again:  		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);  	if (((flags & (TH_RST | TH_SYN | 
TH_FIN)) == 0) && tot_len) {  		/* -		 * Calculate/Re-Calculate the hptsi slot in usecs based on +		 * Calculate/Re-Calculate the hptsi timeout in usecs based on  		 * what we have sent so far  		 */ -		slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); +		pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);  		if (bbr->rc_no_pacing) -			slot = 0; +			pacing_delay = 0;  	}  	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);  enobufs: @@ -13999,8 +13993,8 @@ enobufs:  	    (more_to_rxt ||  	     ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {  		/* Rack cheats and shotguns out all rxt's 1ms apart */ -		if (slot > 1000) -			slot = 1000; +		if (pacing_delay > 1000) +			pacing_delay = 1000;  	}  	if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {  		/* @@ -14014,7 +14008,7 @@ enobufs:  			tcp_bbr_tso_size_check(bbr, cts);  		}  	} -	bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); +	bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len);  	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {  		/* Make sure snd_nxt is drug up */  		tp->snd_nxt = tp->snd_max; @@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp)  		}  	} else  		toval = HPTS_USECS_PER_SLOT; -	(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), -				   __LINE__, &diag); +	tcp_hpts_insert(tp, toval, &diag);  	bbr_log_hpts_diag(bbr, cts, &diag);  } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 11ef5ba706c5..9ed26d5a617b 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -204,10 +204,6 @@ static int32_t rack_dnd_default = 0;		/* For rr_conf = 3, what is the default fo  static int32_t rack_rxt_controls = 0;  static int32_t rack_fill_cw_state = 0;  static uint8_t rack_req_measurements = 1; -/* Attack threshold detections */ -static uint32_t rack_highest_sack_thresh_seen = 0; -static uint32_t rack_highest_move_thresh_seen = 0; -static uint32_t rack_merge_out_sacks_on_attack = 0;  static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */  static int32_t rack_hw_rate_caps = 0; /* 1; */  static int32_t rack_hw_rate_cap_per = 0;	/* 0 -- off  */ @@ -223,7 +219,6 @@ static int32_t rack_default_pacing_divisor = 250;  static uint16_t rack_pacing_min_seg = 0;  static int32_t rack_timely_off = 0; -static uint32_t sad_seg_size_per = 800;	/* 80.0 % */  static int32_t rack_pkt_delay = 1000;  static int32_t rack_send_a_lot_in_prr = 1;  static int32_t rack_min_to = 1000;	/* Number of microsecond  min timeout */ @@ -250,11 +245,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co  static int32_t rack_persist_min = 250000;	/* 250usec */  static int32_t rack_persist_max = 2000000;	/* 2 Second in usec's */  static int32_t rack_honors_hpts_min_to =  1;	/* Do we honor the hpts minimum time out for pacing timers */ -static uint32_t rack_max_reduce = 10;		/* Percent we can reduce slot by */ +static uint32_t rack_max_reduce = 10;		/* Percent we can reduce pacing delay by */  static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */  static int32_t rack_limit_time_with_srtt = 0;  static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 0;	/* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0;	/* How many times the hw rate we boost pacing delay using time_between */  static int32_t 
rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */  static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */  static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */ @@ -278,7 +273,7 @@ static int32_t rack_hptsi_segments = 40;  static int32_t rack_rate_sample_method = USE_RTT_LOW;  static int32_t rack_pace_every_seg = 0;  static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */ -static int32_t rack_slot_reduction = 4; +static int32_t rack_pacing_delay_reduction = 4;  static int32_t rack_wma_divisor = 8;		/* For WMA calculation */  static int32_t rack_cwnd_block_ends_measure = 0;  static int32_t rack_rwnd_block_ends_measure = 0; @@ -399,18 +394,6 @@ counter_u64_t rack_extended_rfo;  counter_u64_t rack_sack_proc_all;  counter_u64_t rack_sack_proc_short;  counter_u64_t rack_sack_proc_restart; -counter_u64_t rack_sack_attacks_detected; -counter_u64_t rack_sack_attacks_reversed; -counter_u64_t rack_sack_attacks_suspect; -counter_u64_t rack_sack_used_next_merge; -counter_u64_t rack_sack_splits; -counter_u64_t rack_sack_used_prev_merge; -counter_u64_t rack_sack_skipped_acked; -counter_u64_t rack_ack_total; -counter_u64_t rack_express_sack; -counter_u64_t rack_sack_total; -counter_u64_t rack_move_none; -counter_u64_t rack_move_some;  counter_u64_t rack_input_idle_reduces;  counter_u64_t rack_collapsed_win; @@ -478,7 +461,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack,      uint16_t flex7, uint8_t mod);  static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,     uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,     struct rack_sendmap *rsm, uint8_t quality);  static struct rack_sendmap * @@ -834,18 +817,6 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)  		counter_u64_zero(rack_rxt_clamps_cwnd_uniq);  		counter_u64_zero(rack_multi_single_eq);  		counter_u64_zero(rack_proc_non_comp_ack); -		counter_u64_zero(rack_sack_attacks_detected); -		counter_u64_zero(rack_sack_attacks_reversed); -		counter_u64_zero(rack_sack_attacks_suspect); -		counter_u64_zero(rack_sack_used_next_merge); -		counter_u64_zero(rack_sack_used_prev_merge); -		counter_u64_zero(rack_sack_splits); -		counter_u64_zero(rack_sack_skipped_acked); -		counter_u64_zero(rack_ack_total); -		counter_u64_zero(rack_express_sack); -		counter_u64_zero(rack_sack_total); -		counter_u64_zero(rack_move_none); -		counter_u64_zero(rack_move_some);  		counter_u64_zero(rack_try_scwnd);  		counter_u64_zero(rack_collapsed_win);  		counter_u64_zero(rack_collapsed_win_rxt); @@ -872,7 +843,6 @@ static void  rack_init_sysctls(void)  {  	struct sysctl_oid *rack_counters; -	struct sysctl_oid *rack_attack;  	struct sysctl_oid *rack_pacing;  	struct sysctl_oid *rack_timely;  	struct sysctl_oid *rack_timers; @@ -883,12 +853,6 @@ rack_init_sysctls(void)  	struct sysctl_oid *rack_probertt;  	struct sysctl_oid *rack_hw_pacing; -	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_sysctl_root), -	    OID_AUTO, -	    "sack_attack", -	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, -	    "Rack Sack Attack Counters and Controls");  	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_sysctl_root),  	    OID_AUTO, @@ -1107,7 +1071,7 @@ rack_init_sysctls(void)  	SYSCTL_ADD_S32(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_pacing),  	    OID_AUTO, "burst_reduces", CTLFLAG_RW, -	    &rack_slot_reduction, 4, +	    
&rack_pacing_delay_reduction, 4,  	    "When doing only burst mitigation what is the reduce divisor");  	SYSCTL_ADD_S32(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_sysctl_root), @@ -1399,7 +1363,7 @@ rack_init_sysctls(void)  	    SYSCTL_CHILDREN(rack_timers),  	    OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,  	    &rack_max_reduce, 10, -	    "Max percentage we will reduce slot by for pacing when we are behind"); +	    "Max percentage we will reduce pacing delay by for pacing when we are behind");  	SYSCTL_ADD_U32(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_timers),  	    OID_AUTO, "persmin", CTLFLAG_RW, @@ -1535,11 +1499,6 @@ rack_init_sysctls(void)  	    "Do not disturb default for rack_rrr = 3");  	SYSCTL_ADD_S32(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_misc), -	    OID_AUTO, "sad_seg_per", CTLFLAG_RW, -	    &sad_seg_size_per, 800, -	    "Percentage of segment size needed in a sack 800 = 80.0?"); -	SYSCTL_ADD_S32(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_misc),  	    OID_AUTO, "rxt_controls", CTLFLAG_RW,  	    &rack_rxt_controls, 0,  	    "Retransmit sending size controls (valid  values 0, 1, 2 default=1)?"); @@ -1619,85 +1578,6 @@ rack_init_sysctls(void)  	    &rack_autosndbuf_inc, 20,  	    "What percentage should rack scale up its snd buffer by?"); - -	/* Sack Attacker detection stuff */ -	SYSCTL_ADD_U32(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "merge_out", CTLFLAG_RW, -	    &rack_merge_out_sacks_on_attack, 0, -	    "Do we merge the sendmap when we decide we are being attacked?"); - -	SYSCTL_ADD_U32(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "detect_highsackratio", CTLFLAG_RW, -	    &rack_highest_sack_thresh_seen, 0, -	    "Highest sack to ack ratio seen"); -	SYSCTL_ADD_U32(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, -	    &rack_highest_move_thresh_seen, 0, -	    "Highest move to non-move ratio seen"); -	rack_ack_total = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "acktotal", CTLFLAG_RD, -	    &rack_ack_total, -	    "Total number of Ack's"); -	rack_express_sack = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "exp_sacktotal", CTLFLAG_RD, -	    &rack_express_sack, -	    "Total expresss number of Sack's"); -	rack_sack_total = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "sacktotal", CTLFLAG_RD, -	    &rack_sack_total, -	    "Total number of SACKs"); -	rack_move_none = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "move_none", CTLFLAG_RD, -	    &rack_move_none, -	    "Total number of SACK index reuse of positions under threshold"); -	rack_move_some = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "move_some", CTLFLAG_RD, -	    &rack_move_some, -	    "Total number of SACK index reuse of positions over threshold"); -	rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "attacks", CTLFLAG_RD, -	    &rack_sack_attacks_detected, -	    "Total number of SACK attackers that had sack disabled"); -	rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); -	
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "reversed", CTLFLAG_RD, -	    &rack_sack_attacks_reversed, -	    "Total number of SACK attackers that were later determined false positive"); -	rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "suspect", CTLFLAG_RD, -	    &rack_sack_attacks_suspect, -	    "Total number of SACKs that triggered early detection"); - -	rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "nextmerge", CTLFLAG_RD, -	    &rack_sack_used_next_merge, -	    "Total number of times we used the next merge"); -	rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "prevmerge", CTLFLAG_RD, -	    &rack_sack_used_prev_merge, -	    "Total number of times we used the prev merge");  	/* Counters */  	rack_total_bytes = counter_u64_alloc(M_WAITOK);  	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, @@ -1908,18 +1788,6 @@ rack_init_sysctls(void)  	    OID_AUTO, "sack_short", CTLFLAG_RD,  	    &rack_sack_proc_short,  	    "Total times we took shortcut for sack processing"); -	rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "skipacked", CTLFLAG_RD, -	    &rack_sack_skipped_acked, -	    "Total number of times we skipped previously sacked"); -	rack_sack_splits = counter_u64_alloc(M_WAITOK); -	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, -	    SYSCTL_CHILDREN(rack_attack), -	    OID_AUTO, "ofsplit", CTLFLAG_RD, -	    &rack_sack_splits, -	    "Total number of times we did the old fashion tree split");  	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);  	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,  	    SYSCTL_CHILDREN(rack_counters), @@ -2700,7 +2568,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t  }  static void -rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)  {  	if (tcp_bblogging_on(rack->rc_tp)) {  		union tcp_log_stackspecific log; @@ -2710,7 +2578,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot  		log.u_bbr.flex1 = rack->rc_tp->t_srtt;  		log.u_bbr.flex2 = to;  		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; -		log.u_bbr.flex4 = slot; +		log.u_bbr.flex4 = pacing_delay;  		log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;  		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;  		log.u_bbr.flex7 = rack->rc_in_persist; @@ -3034,14 +2902,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  }  static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line)  {  	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {  		union tcp_log_stackspecific log;  		memset(&log, 0, sizeof(log));  		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); -		log.u_bbr.flex1 = slot; +		log.u_bbr.flex1 = pacing_delay;  		if (rack->rack_no_prr)  			log.u_bbr.flex2 = 0;  		else @@ -3139,7 +3007,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, 
struct tcp_rack *rack, uint32_t arg  }  static void -rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay,  			  uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)  {  	if (tcp_bblogging_on(rack->rc_tp)) { @@ -3148,7 +3016,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui  		memset(&log, 0, sizeof(log));  		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); -		log.u_bbr.flex1 = slot; +		log.u_bbr.flex1 = pacing_delay;  		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;  		log.u_bbr.flex4 = reason;  		if (rack->rack_no_prr) @@ -3319,16 +3187,6 @@ rack_counter_destroy(void)  	counter_u64_free(rack_hw_pace_lost);  	counter_u64_free(rack_non_fto_send);  	counter_u64_free(rack_extended_rfo); -	counter_u64_free(rack_ack_total); -	counter_u64_free(rack_express_sack); -	counter_u64_free(rack_sack_total); -	counter_u64_free(rack_move_none); -	counter_u64_free(rack_move_some); -	counter_u64_free(rack_sack_attacks_detected); -	counter_u64_free(rack_sack_attacks_reversed); -	counter_u64_free(rack_sack_attacks_suspect); -	counter_u64_free(rack_sack_used_next_merge); -	counter_u64_free(rack_sack_used_prev_merge);  	counter_u64_free(rack_tlp_tot);  	counter_u64_free(rack_tlp_newdata);  	counter_u64_free(rack_tlp_retran); @@ -3351,8 +3209,6 @@ rack_counter_destroy(void)  	counter_u64_free(rack_sack_proc_all);  	counter_u64_free(rack_sack_proc_restart);  	counter_u64_free(rack_sack_proc_short); -	counter_u64_free(rack_sack_skipped_acked); -	counter_u64_free(rack_sack_splits);  	counter_u64_free(rack_input_idle_reduces);  	counter_u64_free(rack_collapsed_win);  	counter_u64_free(rack_collapsed_win_rxt); @@ -4730,7 +4586,7 @@ rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff  	return (timely_says);  } -static __inline int +static inline int  rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)  {  	if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && @@ -4767,7 +4623,7 @@ rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)  	return (0);  } -static __inline void +static inline void  rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)  { @@ -4784,7 +4640,7 @@ rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)  		rsm->r_flags &= ~RACK_IN_GP_WIN;  } -static __inline void +static inline void  rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)  {  	/* A GP measurement is ending, clear all marks on the send map*/ @@ -4802,7 +4658,7 @@ rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)  } -static __inline void +static inline void  rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)  {  	struct rack_sendmap *rsm = NULL; @@ -6482,7 +6338,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,  		log.u_bbr.flex2 = diag->p_cur_slot;  		log.u_bbr.flex3 = diag->slot_req;  		log.u_bbr.flex4 = diag->inp_hptsslot; -		log.u_bbr.flex5 = diag->slot_remaining; +		log.u_bbr.flex5 = diag->time_remaining;  		log.u_bbr.flex6 = diag->need_new_to;  		log.u_bbr.flex7 = diag->p_hpts_active;  		log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -6497,9 +6353,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,  		log.u_bbr.rttProp = diag->wheel_cts;  		log.u_bbr.timeStamp = cts;  		log.u_bbr.delRate = diag->maxslots; -		log.u_bbr.cur_del_rate = diag->p_curtick; -		log.u_bbr.cur_del_rate <<= 32; -		log.u_bbr.cur_del_rate |= diag->p_lasttick;  		TCP_LOG_EVENTP(rack->rc_tp, 
NULL,  		    &rack->rc_inp->inp_socket->so_rcv,  		    &rack->rc_inp->inp_socket->so_snd, @@ -6532,14 +6385,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin  static void  rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, -      int32_t slot, uint32_t tot_len_this_send, int sup_rack) +      int32_t usecs, uint32_t tot_len_this_send, int sup_rack)  {  	struct hpts_diag diag;  	struct inpcb *inp = tptoinpcb(tp);  	struct timeval tv;  	uint32_t delayed_ack = 0;  	uint32_t hpts_timeout; -	uint32_t entry_slot = slot; +	uint32_t entry_usecs = usecs;  	uint8_t stopped;  	uint32_t left = 0;  	uint32_t us_cts; @@ -6560,7 +6413,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  	rack->r_ctl.rc_hpts_flags = 0;  	us_cts = tcp_get_usecs(&tv);  	/* Now early/late accounting */ -	rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); +	rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0);  	if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {  		/*  		 * We have a early carry over set, @@ -6571,7 +6424,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		 * penalize the next timer for being awoke  		 * by an ack aka the rc_agg_early (non-paced mode).  		 */ -		slot += rack->r_ctl.rc_agg_early; +		usecs += rack->r_ctl.rc_agg_early;  		rack->r_early = 0;  		rack->r_ctl.rc_agg_early = 0;  	} @@ -6583,29 +6436,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		 * really depends on what  		 * the current pacing time is.  		 */ -		if (rack->r_ctl.rc_agg_delayed >= slot) { +		if (rack->r_ctl.rc_agg_delayed >= usecs) {  			/*  			 * We can't compensate for it all.  			 * And we have to have some time  			 * on the clock. We always have a min -			 * 10 slots (10 x 10 i.e. 100 usecs). +			 * 10 HPTS timer units (10 x 10 i.e. 100 usecs).  			 
*/ -			if (slot <= HPTS_USECS_PER_SLOT) { +			if (usecs <= HPTS_USECS_PER_SLOT) {  				/* We gain delay */ -				rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); -				slot = HPTS_USECS_PER_SLOT; +				rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs); +				usecs = HPTS_USECS_PER_SLOT;  			} else {  				/* We take off some */ -				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); -				slot = HPTS_USECS_PER_SLOT; +				rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT); +				usecs = HPTS_USECS_PER_SLOT;  			}  		} else { -			slot -= rack->r_ctl.rc_agg_delayed; +			usecs -= rack->r_ctl.rc_agg_delayed;  			rack->r_ctl.rc_agg_delayed = 0;  			/* Make sure we have 100 useconds at minimum */ -			if (slot < HPTS_USECS_PER_SLOT) { -				rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; -				slot = HPTS_USECS_PER_SLOT; +			if (usecs < HPTS_USECS_PER_SLOT) { +				rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs; +				usecs = HPTS_USECS_PER_SLOT;  			}  			if (rack->r_ctl.rc_agg_delayed == 0)  				rack->r_late = 0; @@ -6614,17 +6467,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		/* r_use_hpts_min is on and so is DGP */  		uint32_t max_red; -		max_red = (slot * rack->r_ctl.max_reduction) / 100; +		max_red = (usecs * rack->r_ctl.max_reduction) / 100;  		if (max_red >= rack->r_ctl.rc_agg_delayed) { -			slot -= rack->r_ctl.rc_agg_delayed; +			usecs -= rack->r_ctl.rc_agg_delayed;  			rack->r_ctl.rc_agg_delayed = 0;  		} else { -			slot -= max_red; +			usecs -= max_red;  			rack->r_ctl.rc_agg_delayed -= max_red;  		}  	}  	if ((rack->r_use_hpts_min == 1) && -	    (slot > 0) && +	    (usecs > 0) &&  	    (rack->dgp_on == 1)) {  		/*  		 * We are enforcing a min pacing timer @@ -6633,8 +6486,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		uint32_t min;  		min = get_hpts_min_sleep_time(); -		if (min > slot) { -			slot = min; +		if (min > usecs) { +			usecs = min;  		}  	}  	hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); @@ -6652,7 +6505,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  	 * wheel, we resort to a keep-alive timer if its configured.  	 */  	if ((hpts_timeout == 0) && -	    (slot == 0)) { +	    (usecs == 0)) {  		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&  		    (tp->t_state <= TCPS_CLOSING)) {  			/* @@ -6709,10 +6562,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  			hpts_timeout = 0x7ffffffe;  		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;  	} -	rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); +	rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);  	if ((rack->gp_ready == 0) &&  	    (rack->use_fixed_rate == 0) && -	    (hpts_timeout < slot) && +	    (hpts_timeout < usecs) &&  	    (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {  		/*  		 * We have no good estimate yet for the @@ -6722,7 +6575,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		 * pace that long since we know the calculation  		 * so far is not accurate.  		 */ -		slot = hpts_timeout; +		usecs = hpts_timeout;  	}  	/**  	 * Turn off all the flags for queuing by default. The @@ -6754,11 +6607,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  	 * so LRO can call into us.  	 
*/  	tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); -	if (slot) { +	if (usecs) {  		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; -		rack->r_ctl.rc_last_output_to = us_cts + slot; +		rack->r_ctl.rc_last_output_to = us_cts + usecs;  		/* -		 * A pacing timer (slot) is being set, in +		 * A pacing timer (usecs microseconds) is being set, in  		 * such a case we cannot send (we are blocked by  		 * the timer). So lets tell LRO that it should not  		 * wake us unless there is a SACK. Note this only @@ -6799,20 +6652,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		}  		if ((rack->use_rack_rr) &&  		    (rack->r_rr_config < 2) && -		    ((hpts_timeout) && (hpts_timeout < slot))) { +		    ((hpts_timeout) && (hpts_timeout < usecs))) {  			/*  			 * Arrange for the hpts to kick back in after the  			 * t-o if the t-o does not cause a send.  			 */ -			(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), -						   __LINE__, &diag); +			tcp_hpts_insert(tp, hpts_timeout, &diag);  			rack_log_hpts_diag(rack, us_cts, &diag, &tv); -			rack_log_to_start(rack, cts, hpts_timeout, slot, 0); +			rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);  		} else { -			(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), -						   __LINE__, &diag); +			tcp_hpts_insert(tp, usecs, &diag);  			rack_log_hpts_diag(rack, us_cts, &diag, &tv); -			rack_log_to_start(rack, cts, hpts_timeout, slot, 1); +			rack_log_to_start(rack, cts, hpts_timeout, usecs, 1);  		}  	} else if (hpts_timeout) {  		/* @@ -6824,22 +6675,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,  		 * at the start of this block) are good enough.  		 */  		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; -		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), -					   __LINE__, &diag); +		tcp_hpts_insert(tp, hpts_timeout, &diag);  		rack_log_hpts_diag(rack, us_cts, &diag, &tv); -		rack_log_to_start(rack, cts, hpts_timeout, slot, 0); +		rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);  	} else {  		/* No timer starting */  #ifdef INVARIANTS  		if (SEQ_GT(tp->snd_max, tp->snd_una)) { -			panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", -			    tp, rack, tot_len_this_send, cts, slot, hpts_timeout); +			panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?", +			    tp, rack, tot_len_this_send, cts, usecs, hpts_timeout);  		}  #endif  	}  	rack->rc_tmr_stopped = 0; -	if (slot) -		rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); +	if (usecs) +		rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__);  }  static void @@ -6870,6 +6720,18 @@ rack_mark_lost(struct tcpcb *tp,  	}  } +static inline void +rack_mark_nolonger_lost(struct tcp_rack *rack, struct rack_sendmap *rsm) +{ +	KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), +		("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); +	rsm->r_flags &= ~RACK_WAS_LOST; +	if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) +		rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; +	else +		rack->r_ctl.rc_considered_lost = 0; +} +  /*   * RACK Timer, here we simply do logging and house keeping.   
* the normal rack_output() function will call the @@ -7011,7 +6873,7 @@ rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, s  	rsm->orig_t_space = M_TRAILINGROOM(rsm->m);  } -static __inline void +static inline void  rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,  	       struct rack_sendmap *rsm, uint32_t start)  { @@ -8016,7 +7878,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8  		rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;  		ret = -3;  		left = rack->r_ctl.rc_timer_exp - cts; -		tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); +		tcp_hpts_insert(tp, left, NULL);  		rack_log_to_processing(rack, cts, ret, left);  		return (1);  	} @@ -8136,13 +7998,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,  		 * remove the lost desgination and reduce the  		 * bytes considered lost.  		 */ -		rsm->r_flags &= ~RACK_WAS_LOST; -		KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), -			("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); -		if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) -			rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; -		else -			rack->r_ctl.rc_considered_lost = 0; +		rack_mark_nolonger_lost(rack, rsm);  	}  	idx = rsm->r_rtr_cnt - 1;  	rsm->r_tim_lastsent[idx] = ts; @@ -9498,6 +9354,11 @@ do_rest_ofb:  				if (rsm->r_flags & RACK_WAS_LOST) {  					int my_chg; +					/* +					 * Note here we do not use our rack_mark_nolonger_lost() function +					 * since we are moving our data pointer around and the +					 * ack'ed side is already not considered lost. +					 */  					my_chg = (nrsm->r_end - nrsm->r_start);  					KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),  						("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); @@ -9537,7 +9398,6 @@ do_rest_ofb:  					goto out;  				}  				rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); -				counter_u64_add(rack_sack_used_next_merge, 1);  				/* Postion for the next block */  				start = next->r_end;  				rsm = tqhash_next(rack->r_ctl.tqh, next); @@ -9569,7 +9429,6 @@ do_rest_ofb:  					 */  					goto out;  				} -				counter_u64_add(rack_sack_splits, 1);  				rack_clone_rsm(rack, nrsm, rsm, start);  				rsm->r_just_ret = 0;  #ifndef INVARIANTS @@ -9591,7 +9450,6 @@ do_rest_ofb:  			}  		} else {  			/* Already sacked this piece */ -			counter_u64_add(rack_sack_skipped_acked, 1);  			if (end == rsm->r_end) {  				/* Done with block */  				rsm = tqhash_next(rack->r_ctl.tqh, rsm); @@ -9665,16 +9523,11 @@ do_rest_ofb:  			changed += (rsm->r_end - rsm->r_start);  			/* You get a count for acking a whole segment or more */  			if (rsm->r_flags & RACK_WAS_LOST) { -				int my_chg; - -				my_chg = (rsm->r_end - rsm->r_start); -				rsm->r_flags &= ~RACK_WAS_LOST; -				KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), -					("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); -				if (my_chg <= rack->r_ctl.rc_considered_lost) -					rack->r_ctl.rc_considered_lost -= my_chg; -				else -					rack->r_ctl.rc_considered_lost = 0; +				/* +				 * Here we can use the inline function since +				 * the rsm is truly marked lost and now no longer lost. 
+				 */ +				rack_mark_nolonger_lost(rack, rsm);  			}  			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);  			if (rsm->r_in_tmap) /* should be true */ @@ -9696,8 +9549,6 @@ do_rest_ofb:  				rsm->r_in_tmap = 0;  			}  			rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); -		} else { -			counter_u64_add(rack_sack_skipped_acked, 1);  		}  		if (end == rsm->r_end) {  			/* This block only - done, setup for next */ @@ -9857,6 +9708,10 @@ do_rest_ofb:  			if (rsm->r_flags & RACK_WAS_LOST) {  				int my_chg; +				/* +				 * Note here we are using hookery again so we can't +				 * use our rack_mark_nolonger_lost() function. +				 */  				my_chg = (nrsm->r_end - nrsm->r_start);  				KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),  					("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); @@ -9872,7 +9727,6 @@ do_rest_ofb:  			}  			rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);  			rsm = prev; -			counter_u64_add(rack_sack_used_prev_merge, 1);  		} else {  			/**  			 * This is the case where our previous @@ -9937,7 +9791,6 @@ do_rest_ofb:  			 * rsm      |---|         (acked)  			 * nrsm         |------|  (not acked)  			 */ -			counter_u64_add(rack_sack_splits, 1);  			rack_clone_rsm(rack, nrsm, rsm, end);  			rsm->r_flags &= (~RACK_HAS_FIN);  			rsm->r_just_ret = 0; @@ -9958,16 +9811,10 @@ do_rest_ofb:  			rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);  			changed += (rsm->r_end - rsm->r_start);  			if (rsm->r_flags & RACK_WAS_LOST) { -				int my_chg; - -				my_chg = (rsm->r_end - rsm->r_start); -				rsm->r_flags &= ~RACK_WAS_LOST; -				KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), -					("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); -				if (my_chg <= rack->r_ctl.rc_considered_lost) -					rack->r_ctl.rc_considered_lost -= my_chg; -				else -					rack->r_ctl.rc_considered_lost = 0; +				/* +				 * Here it is safe to use our function. +				 */ +				rack_mark_nolonger_lost(rack, rsm);  			}  			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); @@ -9991,11 +9838,6 @@ do_rest_ofb:  				rsm->r_in_tmap = 0;  			}  		} -	} else if (start != end){ -		/* -		 * The block was already acked. -		 */ -		counter_u64_add(rack_sack_skipped_acked, 1);  	}  out:  	if (rsm && @@ -10368,13 +10210,7 @@ more:  			 * and yet before retransmitting we get an ack  			 * which can happen due to reordering.  			 */ -			rsm->r_flags &= ~RACK_WAS_LOST; -			KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), -				("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack)); -			if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) -				rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; -			else -				rack->r_ctl.rc_considered_lost = 0; +			rack_mark_nolonger_lost(rack, rsm);  		}  		rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);  		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; @@ -10482,12 +10318,7 @@ more:  		 * which can happen due to reordering. In this  		 * case its only a partial ack of the send.  		 
*/ -		KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)), -			("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm,  rack, th_ack)); -		if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)) -			rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start; -		else -			rack->r_ctl.rc_considered_lost = 0; +		rack_mark_nolonger_lost(rack, rsm);  	}  	/*  	 * Clear the dup ack count for @@ -10799,17 +10630,6 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered  	changed = 0;  	th_ack = th->th_ack;  	segsiz = ctf_fixed_maxseg(rack->rc_tp); -	if (BYTES_THIS_ACK(tp, th) >=  segsiz) { -		/* -		 * You only get credit for -		 * MSS and greater (and you get extra -		 * credit for larger cum-ack moves). -		 */ -		int ac; - -		ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); -		counter_u64_add(rack_ack_total, ac); -	}  	if (SEQ_GT(th_ack, tp->snd_una)) {  		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);  		tp->t_acktime = ticks; @@ -10881,8 +10701,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered  	if (sacks_seen != NULL)  		*sacks_seen = num_sack_blks;  	if (num_sack_blks == 0) { -		/* Nothing to sack, but we need to update counts */ -		goto out_with_totals; +		/* Nothing to sack */ +		goto out;  	}  	/* Its a sack of some sort */  	if (num_sack_blks < 2) { @@ -10905,7 +10725,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered  	 */  again:  	if (num_sack_blks == 0) -		goto out_with_totals; +		goto out;  	if (num_sack_blks > 1) {  		for (i = 0; i < num_sack_blks; i++) {  			for (j = i + 1; j < num_sack_blks; j++) { @@ -10958,19 +10778,7 @@ do_sack_work:  			changed += acked;  		}  		if (num_sack_blks == 1) { -			/* -			 * This is what we would expect from -			 * a normal implementation to happen -			 * after we have retransmitted the FR, -			 * i.e the sack-filter pushes down -			 * to 1 block and the next to be retransmitted -			 * is the sequence in the sack block (has more -			 * are acked). Count this as ACK'd data to boost -			 * up the chances of recovering any false positives. -			 */ -			counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); -			counter_u64_add(rack_express_sack, 1); -			goto out_with_totals; +			goto out;  		} else {  			/*  			 * Start the loop through the @@ -10979,7 +10787,6 @@ do_sack_work:  			loop_start = 1;  		}  	} -	counter_u64_add(rack_sack_total, 1);  	rsm = rack->r_ctl.rc_sacklast;  	for (i = loop_start; i < num_sack_blks; i++) {  		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts,  segsiz); @@ -10988,18 +10795,6 @@ do_sack_work:  			changed += acked;  		}  	} -out_with_totals: -	if (num_sack_blks > 1) { -		/* -		 * You get an extra stroke if -		 * you have more than one sack-blk, this -		 * could be where we are skipping forward -		 * and the sack-filter is still working, or -		 * it could be an attacker constantly -		 * moving us. 
-		 */ -		counter_u64_add(rack_move_some, 1); -	}  out:  	if (changed) {  		/* Something changed cancel the rack timer */ @@ -14377,8 +14172,7 @@ rack_switch_failed(struct tcpcb *tp)  		}  	} else  		toval = HPTS_USECS_PER_SLOT; -	(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), -				   __LINE__, &diag); +	tcp_hpts_insert(tp, toval, &diag);  	rack_log_hpts_diag(rack, cts, &diag, &tv);  } @@ -14720,7 +14514,6 @@ rack_init(struct tcpcb *tp, void **ptr)  	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;  	rack->r_ctl.rc_min_to = rack_min_to;  	microuptime(&rack->r_ctl.act_rcv_time); -	rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;  	rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;  	if (rack_hw_up_only)  		rack->r_up_only = 1; @@ -14973,8 +14766,7 @@ rack_init(struct tcpcb *tp, void **ptr)  				if (tov) {  					struct hpts_diag diag; -					(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), -								   __LINE__, &diag); +					tcp_hpts_insert(tp, tov, &diag);  					rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);  				}  			} @@ -16367,7 +16159,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,  	struct rack_sendmap *rsm;  	int32_t prev_state = 0;  	int no_output = 0; -	int slot_remaining = 0; +	int time_remaining = 0;  #ifdef TCP_ACCOUNTING  	int ack_val_set = 0xf;  #endif @@ -16416,7 +16208,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,  		 * could be, if a sack is present, we want to be awoken and  		 * so should process the packets.  		 */ -		slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; +		time_remaining = rack->r_ctl.rc_last_output_to - us_cts;  		if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) {  			no_output = 1;  		} else { @@ -16436,7 +16228,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,  			     (*ts_ptr == TCP_LRO_TS_OPTION)))  				no_output = 1;  		} -		if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { +		if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) {  			/*  			 * It is unrealistic to think we can pace in less than  			 * the minimum granularity of the pacer (def:250usec). So @@ -16919,10 +16711,10 @@ do_output_now:  			   (tcp_in_hpts(rack->rc_tp) == 0)) {  			/*  			 * We are not in hpts and we had a pacing timer up. Use -			 * the remaining time (slot_remaining) to restart the timer. +			 * the remaining time (time_remaining) to restart the timer.  			 
*/ -			KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); -			rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); +			KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); +			rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0);  			rack_free_trim(rack);  		}  		/* Clear the flag, it may have been cleared by output but we may not have  */ @@ -17102,7 +16894,7 @@ check_it:  }  static void -rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,  			   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,  			   int line, struct rack_sendmap *rsm, uint8_t quality)  { @@ -17125,7 +16917,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,  			}  		}  		memset(&log, 0, sizeof(log)); -		log.u_bbr.flex1 = slot; +		log.u_bbr.flex1 = pacing_delay;  		log.u_bbr.flex2 = len;  		log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;  		log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; @@ -17284,25 +17076,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin  }  static int32_t -pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)  {  	uint64_t lentim, fill_bw;  	rack->r_via_fill_cw = 0;  	if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) -		return (slot); +		return (pacing_delay);  	if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) -		return (slot); +		return (pacing_delay);  	if (rack->r_ctl.rc_last_us_rtt == 0) -		return (slot); +		return (pacing_delay);  	if (rack->rc_pace_fill_if_rttin_range &&  	    (rack->r_ctl.rc_last_us_rtt >=  	     (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {  		/* The rtt is huge, N * smallest, lets not fill */ -		return (slot); +		return (pacing_delay);  	}  	if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) -		return (slot); +		return (pacing_delay);  	/*  	 * first lets calculate the b/w based on the last us-rtt  	 * and the the smallest send window. @@ -17368,7 +17160,7 @@ at_lt_bw:  	if (non_paced)  		*rate_wanted = fill_bw;  	if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) -		return (slot); +		return (pacing_delay);  	rack->r_via_fill_cw = 1;  	if (rack->r_rack_hw_rate_caps &&  	    (rack->r_ctl.crte != NULL)) { @@ -17423,19 +17215,19 @@ at_lt_bw:  	lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;  	lentim /= fill_bw;  	*rate_wanted = fill_bw; -	if (non_paced || (lentim < slot)) { -		rack_log_pacing_delay_calc(rack, len, slot, fill_bw, +	if (non_paced || (lentim < pacing_delay)) { +		rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw,  					   0, lentim, 12, __LINE__, NULL, 0);  		return ((int32_t)lentim);  	} else -		return (slot); +		return (pacing_delay);  }  static int32_t  rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)  {  	uint64_t srtt; -	int32_t slot = 0; +	int32_t pacing_delay = 0;  	int can_start_hw_pacing = 1;  	int err;  	int pace_one; @@ -17483,25 +17275,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str  		 * cwnd. 
Which in that case we are just waiting for  		 * a ACK.  		 */ -		slot = len / tr_perms; +		pacing_delay = len / tr_perms;  		/* Now do we reduce the time so we don't run dry? */ -		if (slot && rack_slot_reduction) { -			reduce = (slot / rack_slot_reduction); -			if (reduce < slot) { -				slot -= reduce; +		if (pacing_delay && rack_pacing_delay_reduction) { +			reduce = (pacing_delay / rack_pacing_delay_reduction); +			if (reduce < pacing_delay) { +				pacing_delay -= reduce;  			} else -				slot = 0; +				pacing_delay = 0;  		} else  			reduce = 0; -		slot *= HPTS_USEC_IN_MSEC; +		pacing_delay *= HPTS_USEC_IN_MSEC;  		if (rack->rc_pace_to_cwnd) {  			uint64_t rate_wanted = 0; -			slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); +			pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1);  			rack->rc_ack_can_sendout_data = 1; -			rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); +			rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);  		} else -			rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); +			rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);  		/*******************************************************/  		/* RRS: We insert non-paced call to stats here for len */  		/*******************************************************/ @@ -17575,7 +17367,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str  		segs *= oh;  		lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;  		res = lentim / rate_wanted; -		slot = (uint32_t)res; +		pacing_delay = (uint32_t)res;  		if (rack_hw_rate_min &&  		    (rate_wanted < rack_hw_rate_min)) {  			can_start_hw_pacing = 0; @@ -17635,7 +17427,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str  			 * We want to pace at our rate *or* faster to  			 * fill the cwnd to the max if its not full.  			 
*/ -			slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); +			pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0);  			/* Re-check to make sure we are not exceeding our max b/w */  			if ((rack->r_ctl.crte != NULL) &&  			    (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { @@ -17786,15 +17578,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str  				srtt = rack->rc_tp->t_srtt;  			else  				srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;	/* its in ms convert */ -			if (srtt < (uint64_t)slot) { -				rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); -				slot = srtt; +			if (srtt < (uint64_t)pacing_delay) { +				rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); +				pacing_delay = srtt;  			}  		}  		/*******************************************************************/  		/* RRS: We insert paced call to stats here for len and rate_wanted */  		/*******************************************************************/ -		rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); +		rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);  	}  	if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {  		/* @@ -17811,9 +17603,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str  			hw_boost_delay = rack_enobuf_hw_max;  		else if (hw_boost_delay < rack_enobuf_hw_min)  			hw_boost_delay = rack_enobuf_hw_min; -		slot += hw_boost_delay; +		pacing_delay += hw_boost_delay;  	} -	return (slot); +	return (pacing_delay);  }  static void @@ -18482,7 +18274,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma  	struct tcpopt to;  	u_char opt[TCP_MAXOLEN];  	uint32_t hdrlen, optlen; -	int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; +	int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0;  	uint16_t flags;  	uint32_t if_hw_tsomaxsegcount = 0, startseq;  	uint32_t if_hw_tsomaxsegsize; @@ -18688,9 +18480,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma  	}  	if (rack->r_ctl.crte != NULL) {  		/* See if we can send via the hw queue */ -		slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); +		pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);  		/* If there is nothing in queue (no pacing time) we can send via the hw queue */ -		if (slot == 0) +		if (pacing_delay == 0)  			ip_sendflag = 0;  	}  	tcp_set_flags(th, flags); @@ -18955,20 +18747,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma  				rack_log_queue_level(tp, rack, len, tv, cts);  		} else  			tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); -		slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); +		pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);  		if (rack->rc_enobuf < 0x7f)  			rack->rc_enobuf++; -		if (slot < (10 * HPTS_USEC_IN_MSEC)) -			slot = 10 * HPTS_USEC_IN_MSEC; +		if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) +			pacing_delay = 10 * HPTS_USEC_IN_MSEC;  		if (rack->r_ctl.crte != NULL) {  			counter_u64_add(rack_saw_enobuf_hw, 1);  			tcp_rl_log_enobuf(rack->r_ctl.crte);  		}  		counter_u64_add(rack_saw_enobuf, 1);  	} else { -		slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); +		pacing_delay = 
rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);  	} -	rack_start_hpts_timer(rack, tp, cts, slot, len, 0); +	rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0);  #ifdef TCP_ACCOUNTING  	crtsc = get_cyclecount();  	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19071,7 +18863,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,  #ifdef TCP_ACCOUNTING  	int cnt_thru = 1;  #endif -	int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; +	int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;  	uint16_t flags;  	uint32_t s_soff;  	uint32_t if_hw_tsomaxsegcount = 0, startseq; @@ -19519,8 +19311,8 @@ again:  	}  	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);  	counter_u64_add(rack_fto_send, 1); -	slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); -	rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); +	pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); +	rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0);  #ifdef TCP_ACCOUNTING  	crtsc = get_cyclecount();  	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19707,7 +19499,7 @@ rack_output(struct tcpcb *tp)  	struct rack_sendmap *rsm = NULL;  	int32_t tso, mtu;  	struct tcpopt to; -	int32_t slot = 0; +	int32_t pacing_delay = 0;  	int32_t sup_rack = 0;  	uint32_t cts, ms_cts, delayed, early;  	uint32_t add_flag = RACK_SENT_SP; @@ -20070,7 +19862,7 @@ again:  		if (rsm == NULL) {  			if (hpts_calling)  				/* Retry in a ms */ -				slot = (1 * HPTS_USEC_IN_MSEC); +				pacing_delay = (1 * HPTS_USEC_IN_MSEC);  			so = inp->inp_socket;  			sb = &so->so_snd;  			goto just_return_nolock; @@ -20877,7 +20669,7 @@ just_return_nolock:  		}  		if (tot_len_this_send > 0) {  			rack->r_ctl.fsb.recwin = recwin; -			slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); +			pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);  			if ((error == 0) &&  			    rack_use_rfo &&  			    ((flags & (TH_SYN|TH_FIN)) == 0) && @@ -21060,8 +20852,8 @@ just_return_nolock:  			/* Yes lets make sure to move to persist before timer-start */  			rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);  		} -		rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); -		rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); +		rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack); +		rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use);  	}  #ifdef NETFLIX_SHARED_CWND  	if ((sbavail(sb) == 0) && @@ -21100,8 +20892,8 @@ send:  		 * we come around to again, the flag will be clear.  		 
*/  		check_done = 1; -		slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); -		if (slot) { +		pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); +		if (pacing_delay) {  			rack->r_ctl.rc_agg_delayed = 0;  			rack->r_ctl.rc_agg_early = 0;  			rack->r_early = 0; @@ -22358,11 +22150,11 @@ nomore:  					rack_log_queue_level(tp, rack, len, &tv, cts);  			} else  				tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); -			slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); +			pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);  			if (rack->rc_enobuf < 0x7f)  				rack->rc_enobuf++; -			if (slot < (10 * HPTS_USEC_IN_MSEC)) -				slot = 10 * HPTS_USEC_IN_MSEC; +			if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) +				pacing_delay = 10 * HPTS_USEC_IN_MSEC;  			if (rack->r_ctl.crte != NULL) {  				counter_u64_add(rack_saw_enobuf_hw, 1);  				tcp_rl_log_enobuf(rack->r_ctl.crte); @@ -22389,8 +22181,8 @@ nomore:  					goto again;  				}  			} -			slot = 10 * HPTS_USEC_IN_MSEC; -			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); +			pacing_delay = 10 * HPTS_USEC_IN_MSEC; +			rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);  #ifdef TCP_ACCOUNTING  			crtsc = get_cyclecount();  			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22412,8 +22204,8 @@ nomore:  			}  			/* FALLTHROUGH */  		default: -			slot = 10 * HPTS_USEC_IN_MSEC; -			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); +			pacing_delay = 10 * HPTS_USEC_IN_MSEC; +			rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);  #ifdef TCP_ACCOUNTING  			crtsc = get_cyclecount();  			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22456,18 +22248,18 @@ enobufs:  		/*  		 * We don't send again after sending a RST.  		 */ -		slot = 0; +		pacing_delay = 0;  		sendalot = 0;  		if (error == 0)  			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); -	} else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { +	} else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) {  		/*  		 * Get our pacing rate, if an error  		 * occurred in sending (ENOBUF) we would  		 * hit the else if with slot preset. Other  		 * errors return.  		 
*/ -		slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); +		pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);  	}  	/* We have sent clear the flag */  	rack->r_ent_rec_ns = 0; @@ -22499,7 +22291,7 @@ enobufs:  		 */  		tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);  	} -	if (slot) { +	if (pacing_delay) {  		/* set the rack tcb into the slot N */  		if ((error == 0) &&  		    rack_use_rfo && @@ -22564,7 +22356,7 @@ skip_all_send:  	/* Assure when we leave that snd_nxt will point to top */  	if (SEQ_GT(tp->snd_max, tp->snd_nxt))  		tp->snd_nxt = tp->snd_max; -	rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); +	rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0);  #ifdef TCP_ACCOUNTING  	crtsc = get_cyclecount() - ts_val;  	if (tot_len_this_send) { diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index 144b4fabf7eb..cac17d9aeb50 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -462,7 +462,6 @@ struct rack_control {  	uint64_t rc_gp_output_ts; /* chg*/  	uint64_t rc_gp_cumack_ts; /* chg*/  	struct timeval act_rcv_time; -	struct timeval rc_last_time_decay;	/* SAD time decay happened here */  	uint64_t gp_bw;  	uint64_t init_rate;  #ifdef NETFLIX_SHARED_CWND diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index c817c79881d6..b6f428b279b3 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -607,7 +607,7 @@ tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,  		}  	}  	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; -	bcopy(th, uh, m->m_len - off); +	bcopy(th, uh, m->m_len - off - sizeof(struct udphdr));  	m->m_len -= sizeof(struct udphdr);  	m->m_pkthdr.len -= sizeof(struct udphdr);  	/* diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 2bb99596f965..3a7755e9f09e 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1046,6 +1046,8 @@ abort:   *   * On syncache_socket() success the newly created socket   * has its underlying inp locked. + * + * *lsop is updated, if and only if 1 is returned.   
*/  int  syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -1094,12 +1096,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  				 */  				SCH_UNLOCK(sch);  				TCPSTAT_INC(tcps_sc_spurcookie); -				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) +				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {  					log(LOG_DEBUG, "%s; %s: Spurious ACK, "  					    "segment rejected "  					    "(syncookies disabled)\n",  					    s, __func__); -				goto failed; +					free(s, M_TCPLOG); +				} +				return (0);  			}  			if (sch->sch_last_overflow <  			    time_uptime - SYNCOOKIE_LIFETIME) { @@ -1109,12 +1113,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  				 */  				SCH_UNLOCK(sch);  				TCPSTAT_INC(tcps_sc_spurcookie); -				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) +				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {  					log(LOG_DEBUG, "%s; %s: Spurious ACK, "  					    "segment rejected "  					    "(no syncache entry)\n",  					    s, __func__); -				goto failed; +					free(s, M_TCPLOG); +				} +				return (0);  			}  			SCH_UNLOCK(sch);  		} @@ -1128,11 +1134,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  			TCPSTAT_INC(tcps_sc_recvcookie);  		} else {  			TCPSTAT_INC(tcps_sc_failcookie); -			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) +			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {  				log(LOG_DEBUG, "%s; %s: Segment failed "  				    "SYNCOOKIE authentication, segment rejected "  				    "(probably spoofed)\n", s, __func__); -			goto failed; +				free(s, M_TCPLOG); +			} +			return (0);  		}  #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)  		/* If received ACK has MD5 signature, check it. */ @@ -1160,7 +1168,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  		/*  		 * If listening socket requested TCP digests, check that  		 * received ACK has signature and it is correct. -		 * If not, drop the ACK and leave sc entry in th cache, +		 * If not, drop the ACK and leave sc entry in the cache,  		 * because SYN was received with correct signature.  		 */  		if (sc->sc_flags & SCF_SIGNATURE) { @@ -1206,9 +1214,9 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  				    "%s; %s: SEG.TSval %u < TS.Recent %u, "  				    "segment dropped\n", s, __func__,  				    to->to_tsval, sc->sc_tsreflect); -				free(s, M_TCPLOG);  			}  			SCH_UNLOCK(sch); +			free(s, M_TCPLOG);  			return (-1);  /* Do not send RST */  		} @@ -1225,7 +1233,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  				    "expected, segment processed normally\n",  				    s, __func__);  				free(s, M_TCPLOG); -				s = NULL;  			}  		} @@ -1285,7 +1292,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  				    "segment rejected\n",  				    s, __func__, th->th_ack, sc->sc_iss + 1);  			SCH_UNLOCK(sch); -			goto failed; +			free(s, M_TCPLOG); +			return (0);  /* Do send RST, do not free sc. 
*/  		}  		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); @@ -1311,16 +1319,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  	if (sc != &scs)  		syncache_free(sc);  	return (1); -failed: -	if (sc != NULL) { -		TCPSTATES_DEC(TCPS_SYN_RECEIVED); -		if (sc != &scs) -			syncache_free(sc); -	} -	if (s != NULL) -		free(s, M_TCPLOG); -	*lsop = NULL; -	return (0);  }  static struct socket * @@ -1382,6 +1380,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  	struct tcpcb *tp;  	struct socket *rv = NULL;  	struct syncache *sc = NULL; +	struct ucred *cred;  	struct syncache_head *sch;  	struct mbuf *ipopts = NULL;  	u_int ltflags; @@ -1410,6 +1409,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  	 */  	KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so));  	tp = sototcpcb(so); +	cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred);  #ifdef INET6  	if (inc->inc_flags & INC_ISIPV6) { @@ -1638,16 +1638,16 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,  	/*  	 * sc_cred is only used in syncache_pcblist() to list TCP endpoints in  	 * TCPS_SYN_RECEIVED state when V_tcp_syncache.see_other is false. -	 * Therefore, store the credentials and take a reference count only -	 * when needed: +	 * Therefore, store the credentials only when needed:  	 * - sc is allocated from the zone and not using the on stack instance.  	 * - the sysctl variable net.inet.tcp.syncache.see_other is false.  	 * The reference count is decremented when a zone allocated sc is  	 * freed in syncache_free().  	 */ -	if (sc != &scs && !V_tcp_syncache.see_other) -		sc->sc_cred = crhold(so->so_cred); -	else +	if (sc != &scs && !V_tcp_syncache.see_other) { +		sc->sc_cred = cred; +		cred = NULL; +	} else  		sc->sc_cred = NULL;  	sc->sc_port = port;  	sc->sc_ipopts = ipopts; @@ -1785,6 +1785,8 @@ donenoprobe:  		tcp_fastopen_decrement_counter(tfo_pending);  tfo_expanded: +	if (cred != NULL) +		crfree(cred);  	if (sc == NULL || sc == &scs) {  #ifdef MAC  		mac_syncache_destroy(&maclabel); diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 3b9fe7a317b0..57c57666fa3a 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -513,9 +513,12 @@ tcp_timer_persist(struct tcpcb *tp)  	if (progdrop || (tp->t_rxtshift >= V_tcp_retries &&  	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||  	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { -		if (!progdrop) +		if (progdrop) { +			tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS); +		} else {  			TCPSTAT_INC(tcps_persistdrop); -		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); +			tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); +		}  		goto dropit;  	}  	/* diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index cea8a916679b..f1d952037d5a 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -787,7 +787,8 @@ udplite_ctlinput(struct icmp *icmp)  static int  udp_pcblist(SYSCTL_HANDLER_ARGS)  { -	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo, +	struct inpcbinfo *pcbinfo = udp_get_inpcbinfo(arg2); +	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,  	    INPLOOKUP_RLOCKPCB);  	struct xinpgen xig;  	struct inpcb *inp; @@ -799,7 +800,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)  	if (req->oldptr == 0) {  		int n; -		n = V_udbinfo.ipi_count; +		n = pcbinfo->ipi_count;  		n += imax(n / 8, 10);  		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);  		return (0); 
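udp_pcblist() is now driven by the sysctl node's arg2 instead of hard-coding V_udbinfo, so a single handler can walk either the UDP or the UDP-Lite pcb database; the hunks below finish that conversion and register a second node for UDP-Lite. A minimal kernel-side sketch of the arg2-to-pcbinfo selection this relies on, assuming udp_get_inpcbinfo() simply maps the protocol number onto the two VNET pcbinfo instances declared in udp_var.h (the function name below is hypothetical and illustrative only):

/*
 * Illustrative only: mirrors what udp_get_inpcbinfo() is assumed to do
 * with the IPPROTO_UDP / IPPROTO_UDPLITE value passed via arg2.
 */
static __inline struct inpcbinfo *
example_udp_pcbinfo(int protocol)
{
	/* UDP-Lite endpoints live in ulitecbinfo, plain UDP in udbinfo. */
	return ((protocol == IPPROTO_UDPLITE) ?
	    &V_ulitecbinfo : &V_udbinfo);
}

With the lookup keyed off arg2, the existing net.inet.udp.pcblist node passes IPPROTO_UDP, the new net.inet.udplite.pcblist node passes IPPROTO_UDPLITE, and both share udp_pcblist() otherwise unchanged.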
@@ -810,8 +811,8 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)  	bzero(&xig, sizeof(xig));  	xig.xig_len = sizeof xig; -	xig.xig_count = V_udbinfo.ipi_count; -	xig.xig_gen = V_udbinfo.ipi_gencnt; +	xig.xig_count = pcbinfo->ipi_count; +	xig.xig_gen = pcbinfo->ipi_gencnt;  	xig.xig_sogen = so_gencnt;  	error = SYSCTL_OUT(req, &xig, sizeof xig);  	if (error) @@ -838,9 +839,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)  		 * that something happened while we were processing this  		 * request, and it might be necessary to retry.  		 */ -		xig.xig_gen = V_udbinfo.ipi_gencnt; +		xig.xig_gen = pcbinfo->ipi_gencnt;  		xig.xig_sogen = so_gencnt; -		xig.xig_count = V_udbinfo.ipi_count; +		xig.xig_count = pcbinfo->ipi_count;  		error = SYSCTL_OUT(req, &xig, sizeof xig);  	} @@ -848,10 +849,15 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)  }  SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, -    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, +    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, IPPROTO_UDP,      udp_pcblist, "S,xinpcb",      "List of active UDP sockets"); +SYSCTL_PROC(_net_inet_udplite, OID_AUTO, pcblist, +    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, IPPROTO_UDPLITE, +    udp_pcblist, "S,xinpcb", +    "List of active UDP-Lite sockets"); +  #ifdef INET  static int  udp_getcred(SYSCTL_HANDLER_ARGS) @@ -1166,7 +1172,19 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,  	else  		INP_RLOCK(inp);  	NET_EPOCH_ENTER(et); +#ifdef INET6 +	if ((flags & PRUS_IPV6) != 0) { +		if ((inp->in6p_outputopts != NULL) && +		    (inp->in6p_outputopts->ip6po_tclass != -1)) +			tos = (u_char)inp->in6p_outputopts->ip6po_tclass; +		else +			tos = 0; +	} else { +		tos = inp->inp_ip_tos; +	} +#else  	tos = inp->inp_ip_tos; +#endif  	if (control != NULL) {  		/*  		 * XXX: Currently, we assume all the optional information is @@ -1190,6 +1208,23 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,  			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);  			if (error != 0)  				break; +			if (((flags & PRUS_IPV6) != 0) && +			    (cm->cmsg_level == IPPROTO_IPV6) && +			    (cm->cmsg_type == IPV6_TCLASS)) { +				int tclass; + +				if (cm->cmsg_len != CMSG_LEN(sizeof(int))) { +					error = EINVAL; +					break; +				} +				tclass = *(int *)CMSG_DATA(cm); +				if (tclass < -1 || tclass > 255) { +					error = EINVAL; +					break; +				} +				if (tclass != -1) +					tos = (u_char)tclass; +			}  #endif  			if (cm->cmsg_level != IPPROTO_IP)  				continue; diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index 3895f365db3c..3ae08fc0b8f0 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -147,6 +147,7 @@ void	kmod_udpstat_inc(int statnum);  	} while (0)  SYSCTL_DECL(_net_inet_udp); +SYSCTL_DECL(_net_inet_udplite);  VNET_DECLARE(struct inpcbinfo, udbinfo);  VNET_DECLARE(struct inpcbinfo, ulitecbinfo); | 
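The udp_send() changes above let an AF_INET6 UDP socket sending to a v4-mapped destination control the IPv4 TOS byte, either from the sticky IPV6_TCLASS option (ip6po_tclass) or from a per-packet IPV6_TCLASS ancillary object; a cmsg with a bad length or a value outside -1..255 is rejected with EINVAL, and a value of -1 leaves the socket default in place. A hedged userland sketch of the kind of sender this now honors; the ::ffff:192.0.2.1 destination and the discard port are placeholders:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int s, off = 0, tclass = 0x20;		/* example traffic class */
	struct sockaddr_in6 dst;
	struct iovec iov;
	struct msghdr msg;
	struct cmsghdr *cm;
	char payload[] = "hello";
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} cbuf;

	if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	/* Allow v4-mapped destinations on this AF_INET6 socket. */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off)) == -1)
		err(1, "setsockopt");

	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	dst.sin6_len = sizeof(dst);
	dst.sin6_port = htons(9);		/* placeholder: discard */
	if (inet_pton(AF_INET6, "::ffff:192.0.2.1", &dst.sin6_addr) != 1)
		errx(1, "inet_pton");

	iov.iov_base = payload;
	iov.iov_len = sizeof(payload);
	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &dst;
	msg.msg_namelen = sizeof(dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf.buf;
	msg.msg_controllen = sizeof(cbuf.buf);

	/* Per-packet traffic class; the kernel maps it onto the IPv4 TOS. */
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IPV6;
	cm->cmsg_type = IPV6_TCLASS;
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cm), &tclass, sizeof(tclass));

	if (sendmsg(s, &msg, 0) == -1)
		err(1, "sendmsg");
	close(s);
	return (0);
}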

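Returning to the HPTS churn in the rack.c hunks earlier in this diff: tcp_hpts_insert_diag() is gone and callers no longer convert their delay with HPTS_USEC_TO_SLOTS()/HPTS_MS_TO_SLOTS(); tcp_hpts_insert() now takes the pacing delay directly in microseconds plus an optional struct hpts_diag pointer, matching the slot-to-pacing_delay renaming throughout rack. A kernel-side sketch of the new calling convention, assuming a (struct tcpcb *, uint32_t usecs, struct hpts_diag *) prototype from tcp_hpts.h; the helper name is hypothetical:

/*
 * Hypothetical helper: queue a connection on HPTS using the
 * microsecond-based interface.  Very small requests are rounded up to
 * one slot's worth of time before handing them to the pacer.
 */
static void
example_queue_on_hpts(struct tcpcb *tp, uint32_t delay_usecs,
    struct hpts_diag *diag)
{
	if (delay_usecs < HPTS_USECS_PER_SLOT)
		delay_usecs = HPTS_USECS_PER_SLOT;
	/* diag may be NULL when the caller has no use for diagnostics. */
	tcp_hpts_insert(tp, delay_usecs, diag);
}

Callers that want the diagnostics, such as rack_switch_failed() above, pass a stack struct hpts_diag and feed it to rack_log_hpts_diag(); others, like rack_process_timers(), simply pass NULL.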