-rw-r--r--  sys/conf/NOTES                   1
-rw-r--r--  sys/conf/options                 1
-rw-r--r--  sys/netinet/tcp_hpts.c         758
-rw-r--r--  sys/netinet/tcp_hpts.h          30
-rw-r--r--  sys/netinet/tcp_hpts_test.c   1616
-rw-r--r--  sys/netinet/tcp_lro_hpts.c       3
-rw-r--r--  sys/tests/ktest.h                7
7 files changed, 2055 insertions, 361 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ea9b2667607e..a25ee8f6e1af 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support.
options TCP_RFC7413 # TCP Fast Open
options TCPHPTS
+#options TCP_HPTS_KTEST # Add KTEST support for HPTS
# In order to enable IPSEC you MUST also add device crypto to
# your kernel configuration
diff --git a/sys/conf/options b/sys/conf/options
index b48ad1cf42cf..0b795a8d28fb 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS
+TCP_HPTS_KTEST opt_inet.h
TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_global.h
TCP_BBR opt_inet.h
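
With both hooks in place, a kernel configuration that wants the HPTS self-tests enables the new option next to TCPHPTS. A minimal sketch of the relevant config lines (the surrounding kernel configuration and the ktest framework itself are assumed to be available and are not part of this change):

options 	TCPHPTS
options 	TCP_HPTS_KTEST		# build the HPTS ktest cases into the kernel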
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 133703a5ede1..2631e79ab034 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -149,27 +149,44 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#include <netinet/tcp_log_buf.h>
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remaineder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/* Default function table for production use. */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+ .microuptime = microuptime,
+ .swi_add = swi_add,
+ .swi_remove = swi_remove,
+ .swi_sched = swi_sched,
+ .intr_event_bind = intr_event_bind,
+ .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+ .callout_init = callout_init,
+ .callout_reset_sbt_on = callout_reset_sbt_on,
+ ._callout_stop_safe = _callout_stop_safe,
+};
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
/*
* When using the hpts, a TCP stack must make sure
@@ -212,75 +229,14 @@
*
*/
-/* Each hpts has its own p_mtx which is used for locking */
-#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
-#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
-#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
- /* Cache line 0x00 */
- struct mtx p_mtx; /* Mutex for hpts */
- struct timeval p_mysleep; /* Our min sleep time */
- uint64_t syscall_cnt;
- uint64_t sleeping; /* What the actual sleep was (if sleeping) */
- uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
- uint32_t p_runningslot; /* Current slot we are at if we are running */
- uint32_t p_prev_slot; /* Previous slot we were on */
- uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
- uint32_t p_nxt_slot; /* The next slot outside the current range of
- * slots that the hpts is running on. */
- int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint8_t p_direct_wake :1, /* boolean */
- p_on_min_sleep:1, /* boolean */
- p_hpts_wake_scheduled:1, /* boolean */
- hit_callout_thresh:1,
- p_avail:4;
- uint8_t p_fill[3]; /* Fill to 32 bits */
- /* Cache line 0x40 */
- struct hptsh {
- TAILQ_HEAD(, tcpcb) head;
- uint32_t count;
- uint32_t gencnt;
- } *p_hptss; /* Hptsi wheel */
- uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
- * of 255ms */
- uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
- uint32_t saved_curslot; /* for logging */
- uint32_t saved_prev_slot; /* for logging */
- uint32_t p_delayed_by; /* How much were we delayed by */
- /* Cache line 0x80 */
- struct sysctl_ctx_list hpts_ctx;
- struct sysctl_oid *hpts_root;
- struct intr_event *ie;
- void *ie_cookie;
- uint16_t p_num; /* The hpts number one per cpu */
- uint16_t p_cpu; /* The hpts CPU */
- /* There is extra space in here */
- /* Cache line 0x100 */
- struct callout co __aligned(CACHE_LINE_SIZE);
-} __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
- struct cpu_group **grps;
- struct tcp_hpts_entry **rp_ent; /* Array of hptss */
- uint32_t *cts_last_ran;
- uint32_t grp_cnt;
- uint32_t rp_num_hptss; /* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
#else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -291,23 +247,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"TCP Hpts statistics");
-#define timersub(tvp, uvp, vvp) \
- do { \
- (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
- (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
- if ((vvp)->tv_usec < 0) { \
- (vvp)->tv_sec--; \
- (vvp)->tv_usec += 1000000; \
- } \
- } while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
- int count;
- int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
counter_u64_t hpts_hopelessly_behind;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -455,14 +394,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
"When we are over the threshold on the pacer do we prohibit wakeups?");
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
{
uint16_t cpuid;
uint32_t ran;
ran = arc4random();
- cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+ cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
return (cpuid);
}
@@ -504,11 +443,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
}
}
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ */
static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the results
+ * of the callout_reset_sbt_on() function.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv->tv_usec;
+
+ sb = tvtosbt(*tv);
+ return (callout_reset_sbt_on(
+ &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HPTS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+
HPTS_MTX_ASSERT(hpts);
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
hpts->p_direct_wake = 0;
return;
@@ -520,15 +515,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
}
static void
-hpts_timeout_swi(void *arg)
-{
- struct tcp_hpts_entry *hpts;
-
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
-}
-
-static void
tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
struct inpcb *inp = tptoinpcb(tp);
@@ -556,13 +542,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
}
static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
INP_LOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
HPTS_LOCK(hpts);
return (hpts);
@@ -589,11 +575,10 @@ tcp_hpts_release(struct tcpcb *tp)
* and has never received a first packet.
*/
void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
{
-
if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
- tp->t_hpts_cpu = hpts_random_cpu();
+ tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
}
}
@@ -605,14 +590,14 @@ tcp_hpts_init(struct tcpcb *tp)
* INP lock and then get the hpts lock.
*/
void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_ONQUEUE) {
hptsh = &hpts->p_hptss[tp->t_hpts_slot];
tp->t_hpts_request = 0;
@@ -847,10 +832,11 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
void
#ifdef INVARIANTS
-__tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, int32_t line,
- struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+ int32_t line, struct hpts_diag *diag)
#else
-tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+ struct hpts_diag *diag)
#endif
{
struct tcp_hpts_entry *hpts;
@@ -868,7 +854,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
* current run (if up) or where it was when it stopped if it is
* sleeping.
*/
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
microuptime(&tv);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
@@ -903,7 +889,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
* timeout is not 1.
*/
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
}
HPTS_UNLOCK(hpts);
@@ -991,7 +977,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
*/
if (need_wakeup) {
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
if (diag) {
diag->need_new_to = 0;
diag->co_ret = 0xffff0000;
@@ -999,7 +985,6 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
} else if (need_new_to) {
int32_t co_ret;
struct timeval tv;
- sbintime_t sb;
tv.tv_sec = 0;
tv.tv_usec = 0;
@@ -1007,11 +992,8 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
tv.tv_sec++;
need_new_to -= HPTS_USEC_IN_SEC;
}
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+ co_ret = tcp_hpts_sleep(hpts, &tv);
if (diag) {
diag->need_new_to = need_new_to;
diag->co_ret = co_ret;
@@ -1021,7 +1003,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
}
static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
{
struct inpcb *inp = tptoinpcb(tp);
u_int cpuid;
@@ -1048,7 +1030,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
if (cpuid == NETISR_CPUID_NONE)
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
else
return (cpuid);
#endif
@@ -1059,7 +1041,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
*/
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
counter_u64_add(cpu_uses_random, 1);
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
}
/*
* Hash to a thread based on the flowid. If we are using numa,
@@ -1074,7 +1056,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef NUMA
} else {
/* Hash into the cpu's that use that domain */
- di = &hpts_domains[inp->inp_numa_domain];
+ di = &pace->domains[inp->inp_numa_domain];
cpuid = di->cpu[inp->inp_flowid % di->count];
}
#endif
@@ -1112,9 +1094,10 @@ tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
}
-static int32_t
+int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
{
+ struct tcp_hptsi *pace;
struct tcpcb *tp;
struct timeval tv;
int32_t slots_to_run, i, error;
@@ -1132,12 +1115,17 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
+
+ pace = hpts->p_hptsi;
+ MPASS(pace != NULL);
+
/* record previous info for any logging */
hpts->saved_curslot = hpts->p_cur_slot;
hpts->saved_prev_slot = hpts->p_prev_slot;
- cts_last_run = tcp_pace.cts_last_ran[hpts->p_num];
- tcp_pace.cts_last_ran[hpts->p_num] = cts = tcp_get_usecs(&tv);
+ microuptime(&tv);
+ cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+ pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
if ((hpts->p_on_queue_cnt == 0) ||
@@ -1383,7 +1371,7 @@ again:
* gets added to the hpts (not this one)
* :-)
*/
- tcp_set_hpts(tp);
+ __tcp_set_hpts(pace, tp);
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
@@ -1445,8 +1433,9 @@ no_one:
* more slots (if we did not hit eno-bufs).
*/
hpts->p_prev_slot = hpts->p_cur_slot;
+ microuptime(&tv);
cts_last_run = cts;
- cts = tcp_get_usecs(&tv);
+ cts = tcp_tv_to_usec(&tv);
if (!from_callout || (loop_cnt > max_pacer_loops)) {
/*
* Something is serious slow we have
@@ -1478,7 +1467,7 @@ no_one:
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = cts;
+ pace->cts_last_ran[hpts->p_cpu] = cts;
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1512,7 +1501,8 @@ no_run:
cts_last_run, cts, loop_cnt, wrap_loop_cnt));
if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
- cts = tcp_get_usecs(&tv);
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
hpts->p_cur_slot = cts_to_wheel(cts);
counter_u64_add(hpts_loops, 1);
goto again;
@@ -1528,16 +1518,16 @@ no_run:
}
void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
- tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+ tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
if (failed == 0)
tp->t_flags2 |= TF2_HPTS_CPU_SET;
}
@@ -1545,33 +1535,35 @@ tcp_set_hpts(struct tcpcb *tp)
}
static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
{
+ struct timeval tv;
int i, oldest_idx, start, end;
uint32_t cts, time_since_ran, calc;
- cts = tcp_get_usecs(NULL);
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
time_since_ran = 0;
/* Default is all one group */
start = 0;
- end = tcp_pace.rp_num_hptss;
+ end = pace->rp_num_hptss;
/*
* If we have more than one L3 group figure out which one
* this CPU is in.
*/
- if (tcp_pace.grp_cnt > 1) {
- for (i = 0; i < tcp_pace.grp_cnt; i++) {
- if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
- start = tcp_pace.grps[i]->cg_first;
- end = (tcp_pace.grps[i]->cg_last + 1);
+ if (pace->grp_cnt > 1) {
+ for (i = 0; i < pace->grp_cnt; i++) {
+ if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+ start = pace->grps[i]->cg_first;
+ end = (pace->grps[i]->cg_last + 1);
break;
}
}
}
oldest_idx = -1;
for (i = start; i < end; i++) {
- if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
- calc = cts - tcp_pace.cts_last_ran[i];
+ if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+ calc = cts - pace->cts_last_ran[i];
else
calc = 0;
if (calc > time_since_ran) {
@@ -1580,9 +1572,9 @@ tcp_choose_hpts_to_run(void)
}
}
if (oldest_idx >= 0)
- return(tcp_pace.rp_ent[oldest_idx]);
+ return(pace->rp_ent[oldest_idx]);
else
- return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
}
static void
@@ -1592,7 +1584,7 @@ __tcp_run_hpts(void)
struct tcp_hpts_entry *hpts;
int slots_ran;
- hpts = tcp_choose_hpts_to_run();
+ hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
if (hpts->p_hpts_active) {
/* Already active */
@@ -1613,7 +1605,6 @@ __tcp_run_hpts(void)
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
if (slots_ran > slots_indicate_less_sleep) {
struct timeval tv;
- sbintime_t sb;
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1637,12 +1628,7 @@ __tcp_run_hpts(void)
* the dynamic value and set the on_min_sleep
* flag so we will not be awoken.
*/
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
} else if (slots_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
@@ -1660,17 +1646,22 @@ out_with_mtx:
static void
tcp_hpts_thread(void *ctx)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
struct tcp_hpts_entry *hpts;
struct epoch_tracker et;
struct timeval tv;
- sbintime_t sb;
int slots_ran;
hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
HPTS_LOCK(hpts);
if (hpts->p_direct_wake) {
/* Signaled by input or output with low occupancy count. */
- callout_stop(&hpts->co);
+ _callout_stop_safe(&hpts->co, 0);
counter_u64_add(hpts_direct_awakening, 1);
} else {
/* Timed out, the normal case. */
@@ -1799,18 +1790,11 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 0;
back_to_sleep:
hpts->p_direct_wake = 0;
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
NET_EPOCH_EXIT(et);
HPTS_UNLOCK(hpts);
}
-#undef timersub
-
static int32_t
hpts_count_level(struct cpu_group *cg)
{
@@ -1847,57 +1831,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
}
}
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Initialize a tcp_hptsi structure. This performs the core initialization
+ * without starting threads.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
{
+ struct tcp_hptsi *pace;
struct cpu_group *cpu_top;
- int32_t error __diagused;
- int32_t i, j, bound = 0, created = 0;
+ uint32_t i, j, cts;
+ int32_t count;
size_t sz, asz;
struct timeval tv;
- sbintime_t sb;
struct tcp_hpts_entry *hpts;
- struct pcpu *pc;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ KASSERT(funcs != NULL, ("funcs is NULL"));
+
+ /* Allocate the main structure */
+ pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+ if (pace == NULL)
+ return (NULL);
+
+ memset(pace, 0, sizeof(*pace));
+ pace->funcs = funcs;
+
+ /* Setup CPU topology information */
#ifdef SMP
cpu_top = smp_topo();
#else
cpu_top = NULL;
#endif
- tcp_pace.rp_num_hptss = ncpus;
- hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
- hpts_loops = counter_u64_alloc(M_WAITOK);
- back_tosleep = counter_u64_alloc(M_WAITOK);
- combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
- wheel_wrap = counter_u64_alloc(M_WAITOK);
- hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
- hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
- hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
- hpts_direct_call = counter_u64_alloc(M_WAITOK);
- cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
- cpu_uses_random = counter_u64_alloc(M_WAITOK);
+ pace->rp_num_hptss = ncpus;
- sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
- tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
- tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
- tcp_pace.grp_cnt = 0;
+ /* Allocate hpts entry array */
+ sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+ /* Allocate timestamp tracking array */
+ sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+ pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+ /* Setup CPU groups */
if (cpu_top == NULL) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = 1;
} else {
/* Find out how many cache level 3 domains we have */
count = 0;
- tcp_pace.grp_cnt = hpts_count_level(cpu_top);
- if (tcp_pace.grp_cnt == 0) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = hpts_count_level(cpu_top);
+ if (pace->grp_cnt == 0) {
+ pace->grp_cnt = 1;
}
- sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
- tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+ sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+ pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
/* Now populate the groups */
- if (tcp_pace.grp_cnt == 1) {
+ if (pace->grp_cnt == 1) {
/*
* All we need is the top level all cpu's are in
* the same cache so when we use grp[0]->cg_mask
@@ -1905,188 +1895,290 @@ tcp_hpts_mod_load(void)
* all cpu's in it. The level here is probably
* zero which is ok.
*/
- tcp_pace.grps[0] = cpu_top;
+ pace->grps[0] = cpu_top;
} else {
/*
* Here we must find all the level three cache domains
* and setup our pointers to them.
*/
count = 0;
- hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+ hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
}
}
+
+ /* Cache the current time for initializing the hpts entries */
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+
+ /* Initialize each hpts entry */
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
M_TCPHPTS, M_WAITOK | M_ZERO);
- tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
- hpts = tcp_pace.rp_ent[i];
- /*
- * Init all the hpts structures that are not specifically
- * zero'd by the allocations. Also lets attach them to the
- * appropriate sysctl block as well.
- */
- mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
- "hpts", MTX_DEF | MTX_DUPOK);
- for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
- TAILQ_INIT(&hpts->p_hptss[j].head);
- hpts->p_hptss[j].count = 0;
- hpts->p_hptss[j].gencnt = 0;
- }
- sysctl_ctx_init(&hpts->hpts_ctx);
- sprintf(unit, "%d", i);
- hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
- SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
- OID_AUTO,
- unit,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "");
- SYSCTL_ADD_INT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "out_qcnt", CTLFLAG_RD,
- &hpts->p_on_queue_cnt, 0,
- "Count TCB's awaiting output processing");
- SYSCTL_ADD_U16(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "active", CTLFLAG_RD,
- &hpts->p_hpts_active, 0,
- "Is the hpts active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curslot", CTLFLAG_RD,
- &hpts->p_cur_slot, 0,
- "What the current running pacers goal");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "runslot", CTLFLAG_RD,
- &hpts->p_runningslot, 0,
- "What the running pacers current slot is");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "lastran", CTLFLAG_RD,
- &tcp_pace.cts_last_ran[i], 0,
- "The last usec timestamp that this hpts ran");
- SYSCTL_ADD_LONG(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
- &hpts->p_mysleep.tv_usec,
- "What the running pacers is using for p_mysleep.tv_usec");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "now_sleeping", CTLFLAG_RD,
- &hpts->sleeping, 0,
- "What the running pacers is actually sleeping for");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "syscall_cnt", CTLFLAG_RD,
- &hpts->syscall_cnt, 0,
- "How many times we had syscalls on this hpts");
+ pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+ M_WAITOK | M_ZERO);
+ hpts = pace->rp_ent[i];
+ /* Basic initialization */
hpts->p_hpts_sleep_time = hpts_sleep_max;
- hpts->p_num = i;
- tcp_pace.cts_last_ran[i] = tcp_get_u64_usecs(&tv);
- hpts->p_cur_slot = cts_to_wheel(tcp_pace.cts_last_ran[i]);
+ hpts->p_cpu = i;
+ pace->cts_last_ran[i] = cts;
+ hpts->p_cur_slot = cts_to_wheel(cts);
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_cpu = 0xffff;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
+ hpts->p_hptsi = pace;
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+ MTX_DEF | MTX_DUPOK);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j].head);
+ }
+
+ /* Setup SYSCTL if requested */
+ if (enable_sysctl) {
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current running pacers goal");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runslot", CTLFLAG_RD,
+ &hpts->p_runningslot, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &pace->cts_last_ran[i], 0,
+ "The last usec timestamp that this hpts ran");
+ SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+ }
}
- /* Don't try to bind to NUMA domains if we don't have any */
- if (vm_ndomains == 1 && tcp_bind_threads == 2)
- tcp_bind_threads = 0;
- /*
- * Now lets start ithreads to handle the hptss.
- */
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- hpts->p_cpu = i;
+ return (pace);
+}
+
+/*
+ * Create threads for a tcp_hptsi structure and start timers for the current
+ * (minimum) sleep interval.
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ struct pcpu *pc;
+ struct timeval tv;
+ uint32_t i, j;
+ int count, domain;
+ int error __diagused;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+ /* Start threads for each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+
+ KASSERT(hpts->ie_cookie == NULL,
+ ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
KASSERT(error == 0,
- ("Can't add hpts:%p i:%d err:%d",
- hpts, i, error));
- created++;
- hpts->p_mysleep.tv_sec = 0;
- hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
+ ("Can't add hpts:%p i:%d err:%d", hpts, i, error));
+
if (tcp_bind_threads == 1) {
- if (intr_event_bind(hpts->ie, i) == 0)
- bound++;
+ (void)intr_event_bind(hpts->ie, i);
} else if (tcp_bind_threads == 2) {
/* Find the group for this CPU (i) and bind into it */
- for (j = 0; j < tcp_pace.grp_cnt; j++) {
- if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
+ for (j = 0; j < pace->grp_cnt; j++) {
+ if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {
if (intr_event_bind_ithread_cpuset(hpts->ie,
- &tcp_pace.grps[j]->cg_mask) == 0) {
- bound++;
+ &pace->grps[j]->cg_mask) == 0) {
pc = pcpu_find(i);
domain = pc->pc_domain;
- count = hpts_domains[domain].count;
- hpts_domains[domain].cpu[count] = i;
- hpts_domains[domain].count++;
+ count = pace->domains[domain].count;
+ pace->domains[domain].cpu[count] = i;
+ pace->domains[domain].count++;
break;
}
}
}
}
+
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
- hpts->sleeping = tv.tv_usec;
- sb = tvtosbt(tv);
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
}
- /*
- * If we somehow have an empty domain, fall back to choosing
- * among all htps threads.
- */
- for (i = 0; i < vm_ndomains; i++) {
- if (hpts_domains[i].count == 0) {
- tcp_bind_threads = 0;
- break;
- }
- }
- tcp_hpts_softclock = __tcp_run_hpts;
- tcp_lro_hpts_init();
- printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
- created, bound,
- tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}
-static void
-tcp_hpts_mod_unload(void)
+/*
+ * Stop all callouts/threads for a tcp_hptsi structure.
+ */
+void
+tcp_hptsi_stop(struct tcp_hptsi *pace)
{
+ struct tcp_hpts_entry *hpts;
int rv __diagused;
+ uint32_t i;
- tcp_lro_hpts_uninit();
- atomic_store_ptr(&tcp_hpts_softclock, NULL);
+ KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL"));
- for (int i = 0; i < tcp_pace.rp_num_hptss; i++) {
- struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i];
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i));
+ KASSERT(hpts->ie_cookie != NULL,
+ ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i));
- rv = callout_drain(&hpts->co);
+ rv = _callout_stop_safe(&hpts->co, CS_DRAIN);
MPASS(rv != 0);
rv = swi_remove(hpts->ie_cookie);
MPASS(rv == 0);
+ hpts->ie_cookie = NULL;
+ }
+}
- rv = sysctl_ctx_free(&hpts->hpts_ctx);
- MPASS(rv == 0);
+/*
+ * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create.
+ */
+void
+tcp_hptsi_destroy(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t i;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL"));
+ KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL"));
+
+ /* Cleanup each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ if (hpts != NULL) {
+ /* Cleanup SYSCTL if it was initialized */
+ if (hpts->hpts_root != NULL) {
+ sysctl_ctx_free(&hpts->hpts_ctx);
+ }
- mtx_destroy(&hpts->p_mtx);
- free(hpts->p_hptss, M_TCPHPTS);
- free(hpts, M_TCPHPTS);
+ mtx_destroy(&hpts->p_mtx);
+ free(hpts->p_hptss, M_TCPHPTS);
+ free(hpts, M_TCPHPTS);
+ }
}
- free(tcp_pace.rp_ent, M_TCPHPTS);
- free(tcp_pace.cts_last_ran, M_TCPHPTS);
+ /* Cleanup main arrays */
+ free(pace->rp_ent, M_TCPHPTS);
+ free(pace->cts_last_ran, M_TCPHPTS);
#ifdef SMP
- free(tcp_pace.grps, M_TCPHPTS);
+ free(pace->grps, M_TCPHPTS);
#endif
+ /* Free the main structure */
+ free(pace, M_TCPHPTS);
+}
+
+static int
+tcp_hpts_mod_load(void)
+{
+ int i;
+
+ /* Don't try to bind to NUMA domains if we don't have any */
+ if (vm_ndomains == 1 && tcp_bind_threads == 2)
+ tcp_bind_threads = 0;
+
+ /* Create the tcp_hptsi structure */
+ tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true);
+ if (tcp_hptsi_pace == NULL)
+ return (ENOMEM);
+
+ /* Initialize global counters */
+ hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
+ hpts_loops = counter_u64_alloc(M_WAITOK);
+ back_tosleep = counter_u64_alloc(M_WAITOK);
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
+ hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+ hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+ hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+ hpts_direct_call = counter_u64_alloc(M_WAITOK);
+ cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+ cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+ /* Start the threads */
+ tcp_hptsi_start(tcp_hptsi_pace);
+
+ /* Enable the global HPTS softclock function */
+ tcp_hpts_softclock = __tcp_run_hpts;
+
+ /* Initialize LRO HPTS */
+ tcp_lro_hpts_init();
+
+ /*
+ * If we somehow have an empty domain, fall back to choosing among all
+ * HPTS threads.
+ */
+ for (i = 0; i < vm_ndomains; i++) {
+ if (tcp_hptsi_pace->domains[i].count == 0) {
+ tcp_bind_threads = 0;
+ break;
+ }
+ }
+
+ printf("TCP HPTS started %u (%s) swi interrupt threads\n",
+ tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ?
+ "(unbounded)" :
+ (tcp_bind_threads == 1 ? "per-cpu" : "per-NUMA-domain"));
+
+ return (0);
+}
+
+static void
+tcp_hpts_mod_unload(void)
+{
+ tcp_lro_hpts_uninit();
+
+ /* Disable the global HPTS softclock function */
+ atomic_store_ptr(&tcp_hpts_softclock, NULL);
+
+ tcp_hptsi_stop(tcp_hptsi_pace);
+ tcp_hptsi_destroy(tcp_hptsi_pace);
+ tcp_hptsi_pace = NULL;
+
+ /* Cleanup global counters */
counter_u64_free(hpts_hopelessly_behind);
counter_u64_free(hpts_loops);
counter_u64_free(back_tosleep);
@@ -2101,13 +2193,11 @@ tcp_hpts_mod_unload(void)
}
static int
-tcp_hpts_modevent(module_t mod, int what, void *arg)
+tcp_hpts_mod_event(module_t mod, int what, void *arg)
{
-
switch (what) {
case MOD_LOAD:
- tcp_hpts_mod_load();
- return (0);
+ return (tcp_hpts_mod_load());
case MOD_QUIESCE:
/*
* Since we are a dependency of TCP stack modules, they should
@@ -2127,7 +2217,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)
static moduledata_t tcp_hpts_module = {
.name = "tcphpts",
- .evhand = tcp_hpts_modevent,
+ .evhand = tcp_hpts_mod_event,
};
DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY);
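
The key mechanism above is the per-instance function table: production code uses tcp_hptsi_default_funcs, while a TCP_HPTS_KTEST build redirects the bare kernel primitives through pace->funcs so tests can substitute mocks. The following is a simplified, stand-alone sketch of that interposition pattern; the struct, field, and function names here are invented for illustration and are not the kernel's declarations.

#include <stdio.h>

/* Table of redirectable primitives, analogous to struct tcp_hptsi_funcs. */
struct pacer_funcs {
	long (*gettime)(void);         /* stands in for microuptime() */
	int  (*arm_timer)(long usec);  /* stands in for callout_reset_sbt_on() */
};

static long real_gettime(void)      { return (42); }
static int  real_arm_timer(long us) { return (printf("arm %ld\n", us)); }

/* Production table, analogous to tcp_hptsi_default_funcs. */
static const struct pacer_funcs default_funcs = {
	.gettime   = real_gettime,
	.arm_timer = real_arm_timer,
};

struct pacer {
	const struct pacer_funcs *funcs;
};

/*
 * In a test build the bare names are redirected through the per-instance
 * table, just like the "#define microuptime pace->funcs->microuptime"
 * block in tcp_hpts.c; a production build keeps calling the real symbols.
 */
#define gettime   pace->funcs->gettime
#define arm_timer pace->funcs->arm_timer

static void
pacer_tick(struct pacer *pace)
{
	/* Expands to pace->funcs->gettime() / pace->funcs->arm_timer(). */
	arm_timer(gettime() + 10);
}

int
main(void)
{
	struct pacer p = { .funcs = &default_funcs };

	pacer_tick(&p);	/* a ktest would pass a table of mocks instead */
	return (0);
}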
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 0bbf677c25aa..6183e6646133 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -84,13 +84,18 @@ struct hpts_diag {
#ifdef _KERNEL
+extern struct tcp_hptsi *tcp_hptsi_pace;
+
/*
* The following are the definitions for the kernel HPTS interface for managing
* the HPTS ring and the TCBs on it.
*/
-void tcp_hpts_init(struct tcpcb *);
-void tcp_hpts_remove(struct tcpcb *);
+void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp)
+
+void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *);
+#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp)
static inline bool
tcp_in_hpts(struct tcpcb *tp)
@@ -125,15 +130,19 @@ tcp_in_hpts(struct tcpcb *tp)
* you should already have the INP_WLOCK().
*/
#ifdef INVARIANTS
-void __tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, int32_t line,
- struct hpts_diag *diag);
+void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+ int32_t line, struct hpts_diag *diag);
#define tcp_hpts_insert(tp, slot, diag) \
- __tcp_hpts_insert((tp), (slot), __LINE__, (diag))
+ __tcp_hpts_insert(tcp_hptsi_pace, (tp), (slot), __LINE__, (diag))
#else
-void tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag);
+void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+ struct hpts_diag *diag);
+#define tcp_hpts_insert(tp, slot, diag) \
+ __tcp_hpts_insert(tcp_hptsi_pace, (tp), (slot), (diag))
#endif
-void tcp_set_hpts(struct tcpcb *tp);
+void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp);
+#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp)
extern int32_t tcp_min_hptsi_time;
@@ -165,12 +174,5 @@ tcp_get_usecs(struct timeval *tv)
return (tcp_tv_to_usec(tv));
}
-/*
- * LRO HPTS initialization and uninitialization, only for internal use by the
- * HPTS code.
- */
-void tcp_lro_hpts_init(void);
-void tcp_lro_hpts_uninit(void);
-
#endif /* _KERNEL */
#endif /* __tcp_hpts_h__ */
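
The effect of these wrapper macros is that existing TCP stacks stay source-compatible: they keep calling the old names and implicitly operate on the global tcp_hptsi_pace, while the ktest code (next file) #undefs the wrappers and passes an explicit instance. A rough sketch of the two calling styles, assuming a tcpcb tp, a slot count, and a diag structure the caller already has:

	/* Stack code: the global instance is supplied by the header macro. */
	tcp_hpts_insert(tp, slot, &diag);

	/* Test code (after #undef, using the test file's pace-taking macro). */
	tcp_hpts_insert(pace, tp, slot, &diag);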
diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c
index 085da37e44c8..fb2d9c0c7024 100644
--- a/sys/netinet/tcp_hpts_test.c
+++ b/sys/netinet/tcp_hpts_test.c
@@ -28,48 +28,1638 @@
#include <tests/ktest.h>
#include <sys/cdefs.h>
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/errno.h>
#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
#include <sys/socket.h>
+#include <sys/sysctl.h>
#include <sys/systm.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
-#define _WANT_INPCB
#include <netinet/in_pcb.h>
#include <netinet/tcp_seq.h>
-#define _WANT_TCPCB
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#include <dev/tcp_log/tcp_log_dev.h>
#include <netinet/tcp_log_buf.h>
-#define KTEST_ERR(_ctx, _fmt, ...) \
- KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__)
+#undef tcp_hpts_init
+#undef tcp_hpts_remove
+#undef tcp_hpts_insert
+#undef tcp_set_hpts
+
+/* Custom definitions that take the tcp_hptsi */
+#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp))
+#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp))
+#ifdef INVARIANTS
+#define tcp_hpts_insert(pace, tp, slot, diag) \
+ __tcp_hpts_insert((pace), (tp), (slot), __LINE__, (diag))
+#else
+#define tcp_hpts_insert(pace, tp, slot, diag) \
+ __tcp_hpts_insert((pace), (tp), (slot), (diag))
+#endif
+#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp))
+
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test");
+
+static int test_exit_on_failure = true;
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP HPTS test controls");
+SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW,
+ &test_exit_on_failure, 0,
+ "Exit HPTS test immediately on first failure (1) or continue running all tests (0)");
#define KTEST_VERIFY(x) do { \
if (!(x)) { \
KTEST_ERR(ctx, "FAIL: %s", #x); \
- return (EINVAL); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
} else { \
KTEST_LOG(ctx, "PASS: %s", #x); \
} \
} while (0)
+#define KTEST_EQUAL(x, y) do { \
+ if ((x) != (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_NEQUAL(x, y) do { \
+ if ((x) == (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_GREATER_THAN(x, y) do { \
+ if ((x) <= (y)) { \
+ KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \
+ if (test_exit_on_failure) \
+ return (EINVAL); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \
+ } \
+} while (0)
+
+#define KTEST_VERIFY_RET(x, y) do { \
+ if (!(x)) { \
+ KTEST_ERR(ctx, "FAIL: %s", #x); \
+ if (test_exit_on_failure) \
+ return (y); \
+ } else { \
+ KTEST_LOG(ctx, "PASS: %s", #x); \
+ } \
+} while (0)
+
+static void
+dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts)
+{
+ KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts);
+ KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot);
+ KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot);
+ KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot);
+ KTEST_LOG(ctx, " p_runningslot: %u", hpts->p_runningslot);
+ KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt);
+ KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active);
+ KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete);
+ KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake);
+ KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep);
+ KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled);
+ KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh);
+ KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time);
+ KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by);
+ KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep);
+ KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot);
+ KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot);
+ KTEST_LOG(ctx, " syscall_cnt: %lu", hpts->syscall_cnt);
+ KTEST_LOG(ctx, " sleeping: %lu", hpts->sleeping);
+ KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu);
+ KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie);
+ KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi);
+ KTEST_LOG(ctx, " p_mysleep: %ld.%06ld", hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec);
+}
+
+static void
+dump_tcpcb(struct tcpcb *tp)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct inpcb *inp = &tp->t_inpcb;
+
+ KTEST_LOG(ctx, "tcp_control_block(%p)", tp);
+
+ /* HPTS-specific fields */
+ KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts);
+ KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu);
+ KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot);
+ KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt);
+ KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request);
+
+ /* LRO CPU field */
+ KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu);
+
+ /* TCP flags that affect HPTS */
+ KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2);
+ KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO");
+ KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO");
+ KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO");
+
+ /* Input PCB fields that HPTS uses */
+ KTEST_LOG(ctx, " inp_flags: 0x%x", inp->inp_flags);
+ KTEST_LOG(ctx, " INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO");
+ KTEST_LOG(ctx, " inp_flowid: 0x%x", inp->inp_flowid);
+ KTEST_LOG(ctx, " inp_flowtype: %u", inp->inp_flowtype);
+ KTEST_LOG(ctx, " inp_numa_domain: %d", inp->inp_numa_domain);
+}
+
+/* Enum for call counting indices */
+enum test_call_counts {
+ CCNT_MICROUPTIME = 0,
+ CCNT_SWI_ADD,
+ CCNT_SWI_REMOVE,
+ CCNT_SWI_SCHED,
+ CCNT_INTR_EVENT_BIND,
+ CCNT_INTR_EVENT_BIND_CPUSET,
+ CCNT_CALLOUT_INIT,
+ CCNT_CALLOUT_RESET_SBT_ON,
+ CCNT_CALLOUT_STOP_SAFE,
+ CCNT_TCP_OUTPUT,
+ CCNT_TCP_TFB_DO_QUEUED_SEGMENTS,
+ CCNT_MAX
+};
+
+static uint32_t call_counts[CCNT_MAX];
+
+static uint64_t test_time_usec = 0;
+
+/*
+ * Reset all test global variables to a clean state.
+ */
+static void
+test_hpts_init(void)
+{
+ memset(call_counts, 0, sizeof(call_counts));
+ test_time_usec = 0;
+}
+
+static void
+test_microuptime(struct timeval *tv)
+{
+ call_counts[CCNT_MICROUPTIME]++;
+ tv->tv_sec = test_time_usec / 1000000;
+ tv->tv_usec = test_time_usec % 1000000;
+}
+
static int
-test_hpts_init(struct ktest_test_context *ctx)
+test_swi_add(struct intr_event **eventp, const char *name,
+ driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+ void **cookiep)
+{
+ call_counts[CCNT_SWI_ADD]++;
+ /* Simulate successful SWI creation */
+ *eventp = (struct intr_event *)0xfeedface; /* Mock event */
+ *cookiep = (void *)0xdeadbeef; /* Mock cookie */
+ return (0);
+}
+
+static int
+test_swi_remove(void *cookie)
+{
+ call_counts[CCNT_SWI_REMOVE]++;
+ /* Simulate successful removal */
+ return (0);
+}
+
+static void
+test_swi_sched(void *cookie, int flags)
+{
+ call_counts[CCNT_SWI_SCHED]++;
+ /* Simulate successful SWI scheduling */
+}
+
+static int
+test_intr_event_bind(struct intr_event *ie, int cpu)
+{
+ call_counts[CCNT_INTR_EVENT_BIND]++;
+ /* Simulate successful binding */
+ return (0);
+}
+
+static int
+test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask)
+{
+ call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++;
+ /* Simulate successful cpuset binding */
+ return (0);
+}
+
+static void
+test_callout_init(struct callout *c, int mpsafe)
+{
+ call_counts[CCNT_CALLOUT_INIT]++;
+ memset(c, 0, sizeof(*c));
+}
+
+static int
+test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+ void (*func)(void *), void *arg, int cpu, int flags)
+{
+ call_counts[CCNT_CALLOUT_RESET_SBT_ON]++;
+ /* Return 1 to simulate successful timer scheduling */
+ return (1);
+}
+
+static int
+test_callout_stop_safe(struct callout *c, int flags)
+{
+ call_counts[CCNT_CALLOUT_STOP_SAFE]++;
+ /* Return 1 to simulate successful timer stopping */
+ return (1);
+}
+
+static const struct tcp_hptsi_funcs test_funcs = {
+ .microuptime = test_microuptime,
+ .swi_add = test_swi_add,
+ .swi_remove = test_swi_remove,
+ .swi_sched = test_swi_sched,
+ .intr_event_bind = test_intr_event_bind,
+ .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset,
+ .callout_init = test_callout_init,
+ .callout_reset_sbt_on = test_callout_reset_sbt_on,
+ ._callout_stop_safe = test_callout_stop_safe,
+};
+
+#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare
+#define TP_LOG_TEST(tp) tp->t_log_state_set
+
+static int
+test_tcp_output(struct tcpcb *tp)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+ struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ call_counts[CCNT_TCP_OUTPUT]++;
+ if (TP_LOG_TEST(tp)) {
+ KTEST_LOG(ctx, "=> tcp_output(%p)", tp);
+ dump_tcpcb(tp);
+ dump_hpts_entry(ctx, hpts);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) {
+ if (TP_LOG_TEST(tp))
+ KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+ tcp_hpts_remove(pace, tp);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) {
+ INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */
+ return (-1); /* Simulate tcp_output error */
+ }
+
+ return (0);
+}
+
+static int
+test_tfb_do_queued_segments(struct tcpcb *tp, int flag)
+{
+ struct ktest_test_context *ctx = tp->t_fb_ptr;
+ struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending;
+ struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++;
+ KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag);
+ dump_tcpcb(tp);
+ dump_hpts_entry(ctx, hpts);
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) {
+ if (TP_LOG_TEST(tp))
+ KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp);
+ tcp_hpts_remove(pace, tp);
+ }
+
+ if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) {
+ INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */
+ return (-1); /* Simulate do_queued_segments error */
+ }
+
+ return (0);
+}
+
+static struct tcp_function_block test_tcp_fb = {
+ .tfb_tcp_block_name = "hpts_test_tcp",
+ .tfb_tcp_output = test_tcp_output,
+ .tfb_do_queued_segments = test_tfb_do_queued_segments,
+};
+
+/*
+ * Create a minimally initialized tcpcb that can be safely inserted into HPTS.
+ * This function allocates and initializes all the fields that HPTS code
+ * reads or writes.
+ */
+static struct tcpcb *
+test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace)
+{
+ struct tcpcb *tp;
+
+ tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO);
+ if (tp) {
+ rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp",
+ RW_RECURSE | RW_DUPOK);
+ refcount_init(&tp->t_inpcb.inp_refcount, 1);
+ tp->t_inpcb.inp_pcbinfo = &V_tcbinfo;
+ tp->t_fb = &test_tcp_fb;
+ tp->t_hpts_cpu = HPTS_CPU_NONE;
+ STAILQ_INIT(&tp->t_inqueue);
+ tcp_hpts_init(pace, tp);
+
+ /* Stuff some pointers in the tcb for test purposes. */
+ tp->t_fb_ptr = ctx;
+ tp->t_tfo_pending = (unsigned int*)pace;
+ }
+
+ return (tp);
+}
+
+/*
+ * Free a test tcpcb created by test_hpts_create_tcpcb()
+ */
+static void
+test_hpts_free_tcpcb(struct tcpcb *tp)
+{
+ if (tp == NULL)
+ return;
+
+ INP_LOCK_DESTROY(&tp->t_inpcb);
+ free(tp, M_TCPHPTS);
+}
+
+/*
+ * ***********************************************
+ * * KTEST functions for testing the HPTS module *
+ * ***********************************************
+ */
+
+/*
+ * Validates that the HPTS module is properly loaded and initialized by checking
+ * that the minimum HPTS time is configured.
+ */
+KTEST_FUNC(module_load)
+{
+ test_hpts_init();
+ KTEST_NEQUAL(tcp_min_hptsi_time, 0);
+ KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2);
+ KTEST_NEQUAL(tcp_hptsi_pace, NULL);
+ return (0);
+}
+
+/*
+ * Validates the creation and destruction of tcp_hptsi structures, ensuring
+ * proper initialization of internal fields and clean destruction.
+ */
+KTEST_FUNC(hptsi_create_destroy)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ KTEST_NEQUAL(pace->rp_ent, NULL);
+ KTEST_NEQUAL(pace->cts_last_ran, NULL);
+ KTEST_VERIFY(pace->rp_num_hptss > 0);
+ KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */
+ KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */
+ KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */
+
+ /* Verify individual HPTS entries are properly initialized */
+ for (uint32_t i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_NEQUAL(pace->rp_ent[i], NULL);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i);
+ KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace);
+ KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+ }
+
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcp_hptsi structures can be started and stopped properly,
+ * including verification that threads are created during start and cleaned up
+ * during stop operations.
+ */
+KTEST_FUNC(hptsi_start_stop)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+
+ tcp_hptsi_start(pace);
+
+ /* Verify that entries have threads started */
+ struct tcp_hpts_entry *hpts = pace->rp_ent[0];
+ KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */
+ KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */
+
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that multiple tcp_hptsi instances can coexist independently, with
+ * different configurations and CPU assignments without interfering with each
+ * other.
+ */
+KTEST_FUNC(hptsi_independence)
+{
+ struct tcp_hptsi *pace1, *pace2;
+ uint16_t cpu1, cpu2;
+
+ test_hpts_init();
+
+ pace1 = tcp_hptsi_create(&test_funcs, false);
+ pace2 = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace1, NULL);
+ KTEST_NEQUAL(pace2, NULL);
+ KTEST_NEQUAL(pace2->rp_ent, NULL);
+
+ cpu1 = tcp_hptsi_random_cpu(pace1);
+ cpu2 = tcp_hptsi_random_cpu(pace2);
+ KTEST_VERIFY(cpu1 < pace1->rp_num_hptss);
+ KTEST_VERIFY(cpu2 < pace2->rp_num_hptss);
+
+ /* Verify both instances have independent entry arrays */
+ KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent);
+ /* Verify they may have different CPU counts but both reasonable */
+ KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU);
+ KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU);
+
+ tcp_hptsi_destroy(pace1);
+ tcp_hptsi_destroy(pace2);
+
+ return (0);
+}
+
+/*
+ * Validates that custom function injection works correctly, ensuring that
+ * test-specific implementations of microuptime and others are properly
+ * called by the HPTS system.
+ */
+KTEST_FUNC(function_injection)
+{
+ struct tcp_hptsi *pace;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ KTEST_EQUAL(pace->funcs, &test_funcs);
+ KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0);
+
+ tcp_hptsi_start(pace);
+ KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0);
+ KTEST_VERIFY(tcp_bind_threads == 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND] > 0 ||
+ call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0);
+
+ tcp_hptsi_stop(pace);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0);
+ KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0);
+
+ tcp_hptsi_destroy(pace);
+
+ /* Verify we have a reasonable balance of create/destroy calls */
+ KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]);
+ KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]);
+
+ return (0);
+}
+
+/*
+ * Validates that a tcpcb can be properly initialized for HPTS compatibility,
+ * ensuring all required fields are set correctly and function pointers are
+ * valid for safe HPTS operations.
+ */
+KTEST_FUNC(tcpcb_initialization)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Verify the tcpcb is properly initialized for HPTS */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_NEQUAL(tp->t_fb, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL);
+ KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0);
+
+ /* Verify that HPTS-specific fields are initialized */
+ KTEST_EQUAL(tp->t_hpts_gencnt, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, 0);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_lro_cpu, 0);
+ KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1);
+ KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED));
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcb structures can be successfully inserted into and removed
+ * from the HPTS wheel, with proper state tracking and slot assignment during
+ * the process.
+ */
+KTEST_FUNC(tcpcb_insertion)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ uint32_t slot = 10;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0);
+ tcp_hpts_insert(pace, tp, slot, NULL);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1);
+ KTEST_VERIFY(tcp_in_hpts(tp));
+ KTEST_VERIFY(tp->t_hpts_slot >= 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, slot);
+ //KTEST_EQUAL(tp->t_hpts_gencnt, 1);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_VERIFY(!tcp_in_hpts(tp));
+
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the core HPTS timer functionality by verifying that scheduled
+ * tcpcb entries trigger tcp_output calls at appropriate times, simulating
+ * real-world timer-driven TCP processing.
+ */
+KTEST_FUNC(timer_functionality)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+ int32_t slots_ran;
+ uint32_t i;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ dump_tcpcb(tp);
+ TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */
+
+ KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */
+ tcp_hpts_insert(pace, tp, HPTS_USEC_TO_SLOTS(500), NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ dump_tcpcb(tp);
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
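+	/*
+	 * The wheel has not run yet: the slot cursors should still be at their
+	 * initial position and the worker idle, with exactly one connection
+	 * queued.
+	 */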
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+ KTEST_EQUAL(hpts->p_prev_slot, 0);
+ KTEST_EQUAL(hpts->p_cur_slot, 0);
+ KTEST_EQUAL(hpts->p_runningslot, 0);
+ KTEST_EQUAL(hpts->p_nxt_slot, 1);
+ KTEST_EQUAL(hpts->p_hpts_active, 0);
+
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+
+	/*
+	 * Set our test flag to indicate the tcpcb should be removed from the
+	 * wheel when tcp_output is called.
+	 */
+ TP_REMOVE_FROM_HPTS(tp) = 1;
+
+ /* Test early exit condition: advance time by insufficient amount */
+ KTEST_LOG(ctx, "Testing early exit with insufficient time advancement");
+ test_time_usec += 1; /* Very small advancement - should cause early exit */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 slots due to insufficient time advancement */
+ KTEST_EQUAL(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */
+
+	/*
+	 * Wait 498 more usec, trigger the HPTS workers, and verify that nothing
+	 * happens yet (499 usec total).
+	 */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 498;
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+ KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500));
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+	/*
+	 * Wait 1 more usec, trigger the HPTS workers, and verify that tcp_output
+	 * is called this time.
+	 */
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0);
+ test_time_usec += 1;
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]);
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50);
+ KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50);
+ }
+
+ dump_tcpcb(tp);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS scalability by creating and inserting a large number of
+ * tcpcbs into the HPTS wheel, verifying correct queueing behavior under
+ * high load.
+ */
+KTEST_FUNC(scalability_tcpcbs)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ uint32_t i, num_tcpcbs = 100000, total_queued = 0;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Allocate array to hold pointers to all tcpcbs */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ /* Create a LOT of tcpcbs */
+ KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs);
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ if (tcpcbs[i] == NULL) {
+ KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL");
+ return (EINVAL);
+ }
+ }
+
+ /* Insert all created tcpcbs into HPTS */
+ KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS...");
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ /* Insert with varying future timeouts to distribute across slots */
+ tcp_hpts_insert(pace, tcpcbs[i], HPTS_USEC_TO_SLOTS(100 + (i % 1000)), NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Verify total queue counts across all CPUs */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ total_queued += pace->rp_ent[i]->p_on_queue_cnt;
+ }
+ KTEST_EQUAL(total_queued, num_tcpcbs);
+
+ for (i = 0; i < pace->rp_num_hptss; i++)
+ dump_hpts_entry(ctx, pace->rp_ent[i]);
+
+ /* Remove all tcpcbs from HPTS */
+ KTEST_LOG(ctx, "Removing all tcpcbs from HPTS...");
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tcpcbs[i]);
+ }
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Verify all queues are now empty */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ if (pace->rp_ent[i]->p_on_queue_cnt != 0) {
+ KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0");
+ return (EINVAL);
+ }
+ }
+
+ for (i = 0; i < num_tcpcbs; i++) {
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates wheel wrap scenarios where the timer falls significantly behind
+ * and must process more than one full wheel revolution's worth of slots.
+ */
+KTEST_FUNC(wheel_wrap_recovery)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ uint32_t i, slot, num_tcpcbs = 500;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Allocate array to hold pointers to tcpcbs */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ /* Create tcpcbs and insert them across many slots */
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1;
+
+ slot = (i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs; /* Spread across slots */
+
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tcpcbs[i], slot, NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ /* Fast forward time significantly to trigger wheel wrap */
+ test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT;
+
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+
+ HPTS_LOCK(pace->rp_ent[i]);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(pace->rp_ent[i], true);
+ HPTS_UNLOCK(pace->rp_ent[i]);
+ NET_EPOCH_EXIT(et);
+
+		KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS - 1); /* Should process all slots */
+ KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0);
+ KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot,
+ pace->rp_ent[i]->p_prev_slot);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tcpcbs[i]);
+ }
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs
+ * when a tcpcb is being processed by the HPTS thread but gets removed.
+ */
+KTEST_FUNC(tcpcb_moving_state)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create two tcpcbs on the same CPU/slot */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force them to the same CPU for predictable testing */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert both into the same slot */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ hpts = pace->rp_ent[0];
+
+ /* Manually transition tp1 to MOVING state to simulate race condition */
+ HPTS_LOCK(hpts);
+ tp1->t_in_hpts = IHPTS_MOVING;
+ tp1->t_hpts_slot = -1; /* Mark for removal */
+ HPTS_UNLOCK(hpts);
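+	/*
+	 * In the MOVING state with t_hpts_slot == -1, the HPTS loop is expected
+	 * to release tp1 without calling tcp_output; only tp2 is processed.
+	 */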
+
+ /* Set time and run HPTS to process the moving state */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_VERIFY(slots_ran >= 0);
+	KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Only tp2 should trigger tcp_output */
+
+ /* tp1 should be cleaned up and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ /* tp2 should have been processed normally */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are
+ * properly handled and re-inserted into appropriate future slots after
+ * the wheel processes enough slots to accommodate the original request.
+ */
+KTEST_FUNC(deferred_requests)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp, *tp2;
+ struct tcp_hpts_entry *hpts;
+ uint32_t large_slot = NUM_OF_HPTSI_SLOTS + 5000; /* Beyond wheel capacity */
+ uint32_t huge_slot = NUM_OF_HPTSI_SLOTS * 3; /* 3x wheel capacity */
+ uint32_t initial_request;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Insert with a request that exceeds current wheel capacity */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, large_slot, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Verify it was inserted with a deferred request */
+ dump_tcpcb(tp);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_request > 0);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Advance time to process deferred requests */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+
+ /* Process the wheel to handle deferred requests */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ dump_hpts_entry(ctx, hpts);
+ KTEST_GREATER_THAN(slots_ran, 0);
+ dump_tcpcb(tp);
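+	/* After a full wheel revolution the deferred request should be consumed. */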
+ KTEST_EQUAL(tp->t_hpts_request, 0);
+
+ /* Test incremental deferred request processing over multiple cycles */
+ KTEST_LOG(ctx, "Testing incremental deferred request processing");
+
+ /* Create a new connection with an even larger request */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */
+
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, huge_slot, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify initial deferred request */
+ initial_request = tp2->t_hpts_request;
+ KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS);
+
+ /* Process one wheel cycle - should reduce but not eliminate request */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Request should be reduced but not zero */
+ KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */
+
+	/*
+	 * For huge_slot = NUM_OF_HPTSI_SLOTS * 3, we need ~3 cycles to complete.
+	 * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS.
+	 */
+ test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+	/*
+	 * After the second cycle, the request should be reduced significantly
+	 * (likely by ~NUM_OF_HPTSI_SLOTS).
+	 */
+ KTEST_VERIFY(tp2->t_hpts_request < initial_request);
+ KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */
+
+ /* Clean up second connection */
+ INP_WLOCK(&tp2->t_inpcb);
+ if (tp2->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp2);
+ }
+ INP_WUNLOCK(&tp2->t_inpcb);
+ test_hpts_free_tcpcb(tp2);
+
+ /* Clean up */
+ INP_WLOCK(&tp->t_inpcb);
+ if (tp->t_in_hpts != IHPTS_NONE) {
+ tcp_hpts_remove(pace, tp);
+ }
+ INP_WUNLOCK(&tp->t_inpcb);
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates CPU assignment and affinity mechanisms, including flowid-based
+ * assignment, random fallback scenarios, and explicit CPU setting. Tests
+ * the actual CPU assignment logic in hpts_cpuid via tcp_set_hpts.
+ */
+KTEST_FUNC(cpu_assignment)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2, *tp2_dup, *tp3;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+
+ /* Test random CPU assignment (no flowid) */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE;
+ INP_WLOCK(&tp1->t_inpcb);
+ tcp_set_hpts(pace, tp1);
+ INP_WUNLOCK(&tp1->t_inpcb);
+ KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET);
+
+ /* Test flowid-based assignment */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+ tp2->t_inpcb.inp_flowid = 12345;
+ INP_WLOCK(&tp2->t_inpcb);
+ tcp_set_hpts(pace, tp2);
+ INP_WUNLOCK(&tp2->t_inpcb);
+ KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss);
+ KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET);
+
+ /* With the same flowid, should get same CPU assignment */
+ tp2_dup = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2_dup, NULL);
+ tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4;
+ tp2_dup->t_inpcb.inp_flowid = 12345;
+ INP_WLOCK(&tp2_dup->t_inpcb);
+ tcp_set_hpts(pace, tp2_dup);
+ INP_WUNLOCK(&tp2_dup->t_inpcb);
+ KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu);
+
+ /* Test explicit CPU setting */
+ tp3 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp3, NULL);
+ tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */
+ tp3->t_flags2 |= TF2_HPTS_CPU_SET;
+ INP_WLOCK(&tp3->t_inpcb);
+ tcp_set_hpts(pace, tp3);
+ INP_WUNLOCK(&tp3->t_inpcb);
+ KTEST_EQUAL(tp3->t_hpts_cpu, 1);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ test_hpts_free_tcpcb(tp2_dup);
+ test_hpts_free_tcpcb(tp3);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates edge cases in slot calculation including boundary conditions
+ * around slot 0, maximum slots, and slot wrapping arithmetic.
+ */
+KTEST_FUNC(slot_boundary_conditions)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Test insertion at slot 0 */
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert in next available slot */
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test insertion at maximum slot value */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, NUM_OF_HPTSI_SLOTS - 1, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ /* Test very small slot values */
+ INP_WLOCK(&tp->t_inpcb);
+ tp->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp, 1, NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp->t_hpts_slot, 1); /* Should be exactly slot 1 */
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_remove(pace, tp);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS behavior when the connection count exceeds
+ * DEFAULT_CONNECTION_THRESHOLD (where dynamic sleep adjustment engages),
+ * including proper processing of many connections and queue count tracking.
+ */
+KTEST_FUNC(dynamic_sleep_adjustment)
{
- /* TODO: Refactor HPTS code so that it may be tested. */
- KTEST_VERIFY(tcp_min_hptsi_time != 0);
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb **tcpcbs;
+ struct tcp_hpts_entry *hpts;
+ uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ /* Create many connections to exceed threshold */
+ tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM);
+
+ for (i = 0; i < num_tcpcbs; i++) {
+ tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tcpcbs[i], NULL);
+ tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */
+ INP_WLOCK(&tcpcbs[i]->t_inpcb);
+ tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS;
+ TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */
+ tcp_hpts_insert(pace, tcpcbs[i], HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tcpcbs[i]->t_inpcb);
+ }
+
+ hpts = pace->rp_ent[0];
+ dump_hpts_entry(ctx, hpts);
+
+ /* Verify we're above threshold */
+ KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD);
+
+ /* Run HPTS to process many connections */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify HPTS processed slots and connections correctly */
+ KTEST_GREATER_THAN(slots_ran, 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs);
+
+ /* Verify all connections were removed from queue */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ /* Cleanup */
+ for (i = 0; i < num_tcpcbs; i++) {
+ test_hpts_free_tcpcb(tcpcbs[i]);
+ }
+ free(tcpcbs, M_TCPHPTS);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates handling of concurrent insert/remove operations and race conditions
+ * between HPTS processing and user operations.
+ */
+KTEST_FUNC(concurrent_operations)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp1, *tp2;
+ struct tcp_hpts_entry *hpts;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ KTEST_NEQUAL(tp2, NULL);
+
+ /* Force all to CPU 0 */
+ tp1->t_hpts_cpu = 0;
+ tp2->t_hpts_cpu = 0;
+
+ /* Insert tp1 */
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Insert tp2 into same slot */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify both are inserted */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify they're both assigned to the same slot */
+ KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot);
+
+ /* Verify queue count reflects both connections */
+ KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */
+ hpts = pace->rp_ent[tp1->t_hpts_cpu];
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 2);
+
+ /* Remove tp1 while tp2 is still there */
+ INP_WLOCK(&tp1->t_inpcb);
+ tcp_hpts_remove(pace, tp1);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify tp1 removed, tp2 still there */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Verify queue count decreased by one */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 1);
+
+ /* Remove tp2 */
+ INP_WLOCK(&tp2->t_inpcb);
+ tcp_hpts_remove(pace, tp2);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE);
+
+ /* Verify queue is now completely empty */
+ KTEST_EQUAL(hpts->p_on_queue_cnt, 0);
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the queued segments processing path via tfb_do_queued_segments,
+ * which is an alternative to direct tcp_output calls.
+ */
+KTEST_FUNC(queued_segments_processing)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+ struct mbuf *fake_mbuf;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+
+ /* Create a minimal fake mbuf that has valid STAILQ pointers */
+ fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO);
+ KTEST_NEQUAL(fake_mbuf, NULL);
+
+ /* Set up for queued segments path */
+ tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ);
+ STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt);
+
+ INP_WLOCK(&tp->t_inpcb);
+ tcp_hpts_insert(pace, tp, HPTS_USEC_TO_SLOTS(100), NULL);
+ INP_WUNLOCK(&tp->t_inpcb);
+
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Run HPTS and verify queued segments path is taken */
+ test_time_usec += 100;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ KTEST_VERIFY(slots_ran >= 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1);
+
+ /* Connection should be removed from HPTS after processing */
+ KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE);
+
+ /* Clean up the fake mbuf if it's still in the queue */
+ if (!STAILQ_EMPTY(&tp->t_inqueue)) {
+ struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue);
+ STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt);
+ free(m, M_TCPHPTS);
+ }
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates the direct wake mechanism and wake inhibition logic when
+ * the connection count exceeds thresholds.
+ */
+KTEST_FUNC(direct_wake_mechanism)
+{
+ struct tcp_hptsi *pace;
+ struct tcpcb *tp;
+ struct tcp_hpts_entry *hpts;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ tp = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp, NULL);
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
+
+ /* Test direct wake when not over threshold */
+ HPTS_LOCK(hpts);
+ hpts->p_on_queue_cnt = 50; /* Below threshold */
+ hpts->p_hpts_wake_scheduled = 0;
+ tcp_hpts_wake(hpts);
+ KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1);
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1);
+ HPTS_UNLOCK(hpts);
+
+ /* Reset for next test */
+ hpts->p_hpts_wake_scheduled = 0;
+ call_counts[CCNT_SWI_SCHED] = 0;
+
+ /* Test wake inhibition when over threshold */
+ HPTS_LOCK(hpts);
+ hpts->p_on_queue_cnt = 200; /* Above threshold */
+ hpts->p_direct_wake = 1; /* Request direct wake */
+ tcp_hpts_wake(hpts);
+ KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */
+ KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */
+ KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */
+ HPTS_UNLOCK(hpts);
+
+ test_hpts_free_tcpcb(tp);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates HPTS collision detection when attempting to run HPTS while
+ * it's already active.
+ */
+KTEST_FUNC(hpts_collision_detection)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Mark HPTS as active */
+ HPTS_LOCK(hpts);
+ hpts->p_hpts_active = 1;
+ HPTS_UNLOCK(hpts);
+
+ /* Attempt to run HPTS again - should detect collision */
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Should return 0 indicating no work done due to collision */
+ KTEST_EQUAL(slots_ran, 0);
+
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
+ return (0);
+}
+
+/*
+ * Validates generation count handling for race condition detection between
+ * HPTS processing and connection insertion/removal operations.
+ */
+KTEST_FUNC(generation_count_validation)
+{
+ struct epoch_tracker et;
+ struct tcp_hptsi *pace;
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp1, *tp2;
+ uint32_t initial_gencnt, slot_to_test = 10;
+ uint32_t tp2_original_gencnt;
+ int32_t slots_ran;
+
+ test_hpts_init();
+
+ pace = tcp_hptsi_create(&test_funcs, false);
+ KTEST_NEQUAL(pace, NULL);
+ tcp_hptsi_start(pace);
+
+ hpts = pace->rp_ent[0];
+
+ /* Record initial generation count for the test slot */
+ initial_gencnt = hpts->p_hptss[slot_to_test].gencnt;
+
+ /* Create and insert first connection */
+ tp1 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp1, NULL);
+ tp1->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ INP_WLOCK(&tp1->t_inpcb);
+ tp1->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp1, slot_to_test, NULL);
+ INP_WUNLOCK(&tp1->t_inpcb);
+
+ /* Verify connection stored the generation count */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE);
+ KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test);
+ KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt);
+
+ /* Create second connection but don't insert yet */
+ tp2 = test_hpts_create_tcpcb(ctx, pace);
+ KTEST_NEQUAL(tp2, NULL);
+ tp2->t_hpts_cpu = 0; /* Force to CPU 0 */
+
+ /* Force generation count increment by processing the slot */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Verify processing occurred */
+ KTEST_VERIFY(slots_ran > 0);
+ KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1);
+
+ /* Verify generation count was incremented */
+ KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1);
+
+ /* Verify first connection was processed and removed */
+ KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE);
+
+ /* Insert second connection and record its generation count */
+ INP_WLOCK(&tp2->t_inpcb);
+ tp2->t_flags2 |= TF2_HPTS_CALLS;
+ tcp_hpts_insert(pace, tp2, slot_to_test, NULL);
+ INP_WUNLOCK(&tp2->t_inpcb);
+
+ /* Verify connection was inserted successfully */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE);
+
+ /* Record the generation count that tp2 received */
+ tp2_original_gencnt = tp2->t_hpts_gencnt;
+
+ /* Test generation count mismatch detection during processing */
+ /* Manually set stale generation count to simulate race condition */
+ tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */
+
+ /* Process the slot to trigger generation count validation */
+ test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT;
+ HPTS_LOCK(hpts);
+ NET_EPOCH_ENTER(et);
+ slots_ran = tcp_hptsi(hpts, true);
+ HPTS_UNLOCK(hpts);
+ NET_EPOCH_EXIT(et);
+
+ /* Connection should be processed despite generation count mismatch */
+ KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */
+
+ /* The key test: HPTS should handle mismatched generation counts gracefully */
+ KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */
+
+ test_hpts_free_tcpcb(tp1);
+ test_hpts_free_tcpcb(tp2);
+ tcp_hptsi_stop(pace);
+ tcp_hptsi_destroy(pace);
+
return (0);
}
static const struct ktest_test_info tests[] = {
- {
- .name = "test_hpts_init",
- .desc = "Tests HPTS initialization and cleanup",
- .func = &test_hpts_init,
- },
+ KTEST_INFO(module_load),
+ KTEST_INFO(hptsi_create_destroy),
+ KTEST_INFO(hptsi_start_stop),
+ KTEST_INFO(hptsi_independence),
+ KTEST_INFO(function_injection),
+ KTEST_INFO(tcpcb_initialization),
+ KTEST_INFO(tcpcb_insertion),
+ KTEST_INFO(timer_functionality),
+ KTEST_INFO(scalability_tcpcbs),
+ KTEST_INFO(wheel_wrap_recovery),
+ KTEST_INFO(tcpcb_moving_state),
+ KTEST_INFO(deferred_requests),
+ KTEST_INFO(cpu_assignment),
+ KTEST_INFO(slot_boundary_conditions),
+ KTEST_INFO(dynamic_sleep_adjustment),
+ KTEST_INFO(concurrent_operations),
+ KTEST_INFO(queued_segments_processing),
+ KTEST_INFO(direct_wake_mechanism),
+ KTEST_INFO(hpts_collision_detection),
+ KTEST_INFO(generation_count_validation),
};
KTEST_MODULE_DECLARE(ktest_tcphpts, tests);
diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c
index 43587285fe26..ac1a27a4290a 100644
--- a/sys/netinet/tcp_lro_hpts.c
+++ b/sys/netinet/tcp_lro_hpts.c
@@ -29,6 +29,8 @@
#include "opt_inet6.h"
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
@@ -62,6 +64,7 @@
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#ifdef TCP_BLACKBOX
#include <netinet/tcp_log_buf.h>
#endif
diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h
index 8af92f1e0c18..75d7a75e2fff 100644
--- a/sys/tests/ktest.h
+++ b/sys/tests/ktest.h
@@ -57,6 +57,8 @@ struct ktest_test_info {
ktest_parse_t parse;
};
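+/* Declares the body of a ktest test function with the standard signature. */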
+#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx)
+
struct ktest_module_info {
const char *name;
const struct ktest_test_info *tests;
@@ -64,6 +66,8 @@ struct ktest_module_info {
void *module_ptr;
};
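+/* Defines a ktest_test_info table entry for a KTEST_FUNC()-declared test. */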
+#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL }
+
int ktest_default_modevent(module_t mod, int type, void *arg);
bool ktest_start_msg(struct ktest_test_context *ctx);
@@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx);
#define KTEST_LOG(_ctx, _fmt, ...) \
KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__)
+#define KTEST_ERR(_ctx, _fmt, ...) \
+ KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__)
+
#define KTEST_MAX_BUF 512
#define KTEST_MODULE_DECLARE(_n, _t) \