aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet/tcp_hpts.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/tcp_hpts.c')
-rw-r--r--sys/netinet/tcp_hpts.c933
1 files changed, 508 insertions, 425 deletions
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 63bbe4bba11b..c54459bb5f01 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -39,15 +39,14 @@
* First, and probably the main thing its used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
- * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
- * slot is the time from now that the stack wants to be called but it
- * must be converted to tcp_hpts's notion of slot. This is done with
- * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs, diag).
+ * usecs is the time from now at which the stack wants to be called, passed
+ * directly in microseconds; diag may be NULL. So a typical
* call from the tcp_output() routine might look like:
*
- * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ * tcp_hpts_insert(tp, 550, NULL);
*
- * The above would schedule tcp_output() to be called in 550 useconds.
+ * The above would schedule tcp_output() to be called in 550 microseconds.
* Note that if using this mechanism the stack will want to add near
* its top a check to prevent unwanted calls (from user land or the
* arrival of incoming ack's). So it would add something like:
@@ -149,27 +148,44 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
#include <netinet/tcp_log_buf.h>
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remaineder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/*
+ * Default function table for production use. Each member is initialized
+ * with the identically named function. This table is defined ahead of the
+ * TCP_HPTS_KTEST macro redirections that follow it, so the names used in the
+ * initializers here are the plain identifiers rather than the
+ * pace->funcs->... macro expansions.
+ */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+ .microuptime = microuptime,
+ .swi_add = swi_add,
+ .swi_remove = swi_remove,
+ .swi_sched = swi_sched,
+ .intr_event_bind = intr_event_bind,
+ .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+ .callout_init = callout_init,
+ .callout_reset_sbt_on = callout_reset_sbt_on,
+ ._callout_stop_safe = _callout_stop_safe,
+};
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
/*
* When using the hpts, a TCP stack must make sure
@@ -204,87 +220,22 @@
*
* When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
* then we do a dynamic adjustment on the time we sleep.
- * Our threshold is if the lateness of the first client served (in ticks) is
+ * Our threshold is if the lateness of the first client served (in slots) is
* greater than or equal too slots_indicate_more_sleep (10ms
- * or 10000 ticks). If we were that late, the actual sleep time
- * is adjusted down by 50%. If the ticks_ran is less than
- * slots_indicate_more_sleep (100 ticks or 1000usecs).
+ * or 1000 slots). If we were that late, the actual sleep time
+ * is adjusted down by 50%. If the slots_ran is less than
+ * slots_indicate_more_sleep (100 slots or 1000usecs), the sleep time is doubled.
*
*/
-/* Each hpts has its own p_mtx which is used for locking */
-#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
-#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx)
-#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
- /* Cache line 0x00 */
- struct mtx p_mtx; /* Mutex for hpts */
- struct timeval p_mysleep; /* Our min sleep time */
- uint64_t syscall_cnt;
- uint64_t sleeping; /* What the actual sleep was (if sleeping) */
- uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
- uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningslot; /* Current tick we are at if we are running */
- uint32_t p_prev_slot; /* Previous slot we were on */
- uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
- uint32_t p_nxt_slot; /* The next slot outside the current range of
- * slots that the hpts is running on. */
- int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t p_lasttick; /* Last tick before the current one */
- uint8_t p_direct_wake :1, /* boolean */
- p_on_min_sleep:1, /* boolean */
- p_hpts_wake_scheduled:1, /* boolean */
- hit_callout_thresh:1,
- p_avail:4;
- uint8_t p_fill[3]; /* Fill to 32 bits */
- /* Cache line 0x40 */
- struct hptsh {
- TAILQ_HEAD(, tcpcb) head;
- uint32_t count;
- uint32_t gencnt;
- } *p_hptss; /* Hptsi wheel */
- uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
- * of 255ms */
- uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
- uint32_t saved_lasttick; /* for logging */
- uint32_t saved_curtick; /* for logging */
- uint32_t saved_curslot; /* for logging */
- uint32_t saved_prev_slot; /* for logging */
- uint32_t p_delayed_by; /* How much were we delayed by */
- /* Cache line 0x80 */
- struct sysctl_ctx_list hpts_ctx;
- struct sysctl_oid *hpts_root;
- struct intr_event *ie;
- void *ie_cookie;
- uint16_t p_num; /* The hpts number one per cpu */
- uint16_t p_cpu; /* The hpts CPU */
- /* There is extra space in here */
- /* Cache line 0x100 */
- struct callout co __aligned(CACHE_LINE_SIZE);
-} __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
- struct cpu_group **grps;
- struct tcp_hpts_entry **rp_ent; /* Array of hptss */
- uint32_t *cts_last_ran;
- uint32_t grp_cnt;
- uint32_t rp_num_hptss; /* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
#else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"TCP Hpts statistics");
-#define timersub(tvp, uvp, vvp) \
- do { \
- (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
- (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
- if ((vvp)->tv_usec < 0) { \
- (vvp)->tv_sec--; \
- (vvp)->tv_usec += 1000000; \
- } \
- } while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
- int count;
- int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
counter_u64_t hpts_hopelessly_behind;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
&tcp_hpts_no_wake_over_thresh, 0,
"When we are over the threshold on the pacer do we prohibit wakeups?");
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
{
uint16_t cpuid;
uint32_t ran;
ran = arc4random();
- cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+ cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
return (cpuid);
}
@@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
log.u_bbr.flex2 = hpts->p_cur_slot;
log.u_bbr.flex3 = hpts->p_prev_slot;
log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
log.u_bbr.flex7 = hpts->p_cpu;
log.u_bbr.flex8 = (uint8_t)from_callout;
log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usec(tv);
log.u_bbr.epoch = hpts->saved_curslot;
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
@@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
}
}
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ *
+ * arg is the struct tcp_hpts_entry the callout was armed with. When
+ * TCP_HPTS_KTEST is defined, swi_sched is a macro that dispatches through
+ * pace->funcs, so a local "pace" is loaded from the entry before the call.
+ */
static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the result
+ * of the callout_reset_sbt_on() function.
+ *
+ * The callout is armed on hpts->p_cpu with tcp_hpts_sleep_timeout() as the
+ * handler and the entry itself as its argument, using the
+ * tcp_hpts_precision setting for C_PREL(). Only tv->tv_usec is recorded in
+ * hpts->sleeping; tv->tv_sec is not reflected there.
+ *
+ * When TCP_HPTS_KTEST is defined, callout_reset_sbt_on is a macro that
+ * dispatches through pace->funcs, which is why a local "pace" is set up
+ * from the entry first.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+ sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv->tv_usec;
+
+ sb = tvtosbt(*tv);
+ return (callout_reset_sbt_on(
+ &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HPTS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
+
HPTS_MTX_ASSERT(hpts);
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
+
if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
hpts->p_direct_wake = 0;
return;
@@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
}
static void
-hpts_timeout_swi(void *arg)
-{
- struct tcp_hpts_entry *hpts;
-
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
-}
-
-static void
tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
struct inpcb *inp = tptoinpcb(tp);
@@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
}
static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
INP_LOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+ hpts = pace->rp_ent[tp->t_hpts_cpu];
HPTS_LOCK(hpts);
return (hpts);
@@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp)
* and has never received a first packet.
*/
void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
{
-
if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
- tp->t_hpts_cpu = hpts_random_cpu();
+ tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
}
}
@@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp)
* INP lock and then get the hpts lock.
*/
void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
struct hptsh *hptsh;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_ONQUEUE) {
hptsh = &hpts->p_hptss[tp->t_hpts_slot];
tp->t_hpts_request = 0;
@@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
- * is that plus ticks out?
+ * is that plus slots out?
*/
- KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));
return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
-tick_to_wheel(uint32_t cts_in_wticks)
+cts_to_wheel(uint32_t cts)
{
/*
- * Given a timestamp in ticks (so by
- * default to get it to a real time one
- * would multiply by 10.. i.e the number
- * of ticks in a slot) map it to our limited
- * space wheel.
+ * Given a timestamp in microseconds, map it onto our limited-space wheel.
*/
- return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+ return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);
}
static inline int
@@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
end_slot = hpts->p_runningslot;
- /* Back up one tick */
+ /* Back up one slot */
if (end_slot == 0)
end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
@@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
* not active, or we have
* completed the pass over
* the wheel, we can use the
- * prev tick and subtract one from it. This puts us
+ * prev slot and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
end_slot = hpts->p_prev_slot;
@@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
- * that wheel_tick, passed in, should be the current time
+ * that wheel_slot, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
if (hpts->p_prev_slot != wheel_slot)
@@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
- uint32_t hptsslot, int line)
+ uint32_t hptsslot)
{
/*
* Sanity checks for the pacer with invariants
@@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
}
#endif
-uint32_t
-tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+void
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs,
+ struct hpts_diag *diag)
{
struct tcp_hpts_entry *hpts;
struct timeval tv;
- uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
+ uint32_t slot, wheel_cts, last_slot, need_new_to = 0;
int32_t wheel_slot, maxslots;
bool need_wakeup = false;
@@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));
/*
+ * Convert microseconds to slots for internal use.
* We now return the next-slot the hpts will be on, beyond its
* current run (if up) or where it was when it stopped if it is
* sleeping.
*/
- hpts = tcp_hpts_lock(tp);
+ slot = HPTS_USEC_TO_SLOTS(usecs);
+ hpts = tcp_hpts_lock(pace, tp);
microuptime(&tv);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
@@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
- diag->p_curtick = hpts->p_curtick;
- diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
@@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
* timeout is not 1.
*/
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
- return (slot_on);
+ return;
}
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hpts_slot(&tv);
- /* Map it onto the wheel */
- wheel_slot = tick_to_wheel(wheel_cts);
+ /* Get the current time stamp and map it onto the wheel */
+ wheel_cts = tcp_tv_to_usec(&tv);
+ wheel_slot = cts_to_wheel(wheel_cts);
/* Now what's the max we can place it at? */
maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
if (diag) {
@@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tp->t_hpts_slot = last_slot;
}
if (diag) {
- diag->slot_remaining = tp->t_hpts_request;
+ diag->time_remaining = tp->t_hpts_request;
diag->inp_hptsslot = tp->t_hpts_slot;
}
#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
+ check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot);
#endif
if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
tcp_hpts_insert_internal(tp, hpts);
@@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
}
/*
* Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
+ * up and running we do nothing, otherwise we may need to
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
+ tcp_hpts_wake(hpts);
if (diag) {
diag->need_new_to = 0;
diag->co_ret = 0xffff0000;
@@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
} else if (need_new_to) {
int32_t co_ret;
struct timeval tv;
- sbintime_t sb;
tv.tv_sec = 0;
tv.tv_usec = 0;
@@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_
tv.tv_sec++;
need_new_to -= HPTS_USEC_IN_SEC;
}
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+ co_ret = tcp_hpts_sleep(hpts, &tv);
if (diag) {
diag->need_new_to = need_new_to;
diag->co_ret = co_ret;
}
}
- slot_on = hpts->p_nxt_slot;
HPTS_UNLOCK(hpts);
-
- return (slot_on);
}
static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
{
struct inpcb *inp = tptoinpcb(tp);
u_int cpuid;
@@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
if (cpuid == NETISR_CPUID_NONE)
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
else
return (cpuid);
#endif
@@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
*/
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
counter_u64_add(cpu_uses_random, 1);
- return (hpts_random_cpu());
+ return (tcp_hptsi_random_cpu(pace));
}
/*
* Hash to a thread based on the flowid. If we are using numa,
@@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
#ifdef NUMA
} else {
/* Hash into the cpu's that use that domain */
- di = &hpts_domains[inp->inp_numa_domain];
+ di = &pace->domains[inp->inp_numa_domain];
cpuid = di->cpu[inp->inp_flowid % di->count];
}
#endif
@@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
}
}
-static int32_t
+static bool
+tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
+{
+ return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
+}
+
+int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
{
+ struct tcp_hptsi *pace;
struct tcpcb *tp;
struct timeval tv;
int32_t slots_to_run, i, error;
@@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
int32_t wrap_loop_cnt = 0;
int32_t slot_pos_of_endpoint = 0;
int32_t orig_exit_slot;
+ uint32_t cts, cts_last_run;
bool completed_measure, seen_endpoint;
completed_measure = false;
@@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
+
+ pace = hpts->p_hptsi;
+ MPASS(pace != NULL);
+
/* record previous info for any logging */
- hpts->saved_lasttick = hpts->p_lasttick;
- hpts->saved_curtick = hpts->p_curtick;
hpts->saved_curslot = hpts->p_cur_slot;
hpts->saved_prev_slot = hpts->p_prev_slot;
- hpts->p_lasttick = hpts->p_curtick;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
- orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ microuptime(&tv);
+ cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+ pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
+
+ orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
if ((hpts->p_on_queue_cnt == 0) ||
- (hpts->p_lasttick == hpts->p_curtick)) {
+ !tcp_hpts_different_slots(cts, cts_last_run)) {
/*
- * No time has yet passed,
- * or nothing to do.
+ * Not enough time has yet passed or nothing to do.
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
goto no_run;
}
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) &&
- (hpts->p_on_queue_cnt != 0)) {
+ if ((hpts->p_on_queue_cnt != 0) &&
+ ((cts - cts_last_run) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) {
/*
* Wheel wrap is occuring, basically we
* are behind and the distance between
@@ -1238,7 +1214,7 @@ again:
uint32_t runningslot;
/*
- * Calculate our delay, if there are no extra ticks there
+ * Calculate our delay, if there are no extra slots there
* was not any (i.e. if slots_to_run == 1, no delay).
*/
hpts->p_delayed_by = (slots_to_run - (i + 1)) *
@@ -1391,7 +1367,7 @@ again:
* gets added to the hpts (not this one)
* :-)
*/
- tcp_set_hpts(tp);
+ __tcp_set_hpts(pace, tp);
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
@@ -1450,10 +1426,12 @@ no_one:
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
- * more ticks (if we did not hit eno-bufs).
+ * more slots (if we did not hit eno-bufs).
*/
hpts->p_prev_slot = hpts->p_cur_slot;
- hpts->p_lasttick = hpts->p_curtick;
+ microuptime(&tv);
+ cts_last_run = cts;
+ cts = tcp_tv_to_usec(&tv);
if (!from_callout || (loop_cnt > max_pacer_loops)) {
/*
* Something is serious slow we have
@@ -1465,7 +1443,7 @@ no_one:
* can never catch up :(
*
* We will just lie to this thread
- * and let it thing p_curtick is
+ * and let it think p_cur_slot is
* correct. When it next awakens
* it will find itself further behind.
*/
@@ -1473,20 +1451,19 @@ no_one:
counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
- hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+
+ hpts->p_cur_slot = cts_to_wheel(cts);
if (!seen_endpoint) {
/* We saw no endpoint but we may be looping */
orig_exit_slot = hpts->p_cur_slot;
}
- if ((wrap_loop_cnt < 2) &&
- (hpts->p_lasttick != hpts->p_curtick)) {
+ if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) {
counter_u64_add(hpts_loops, 1);
loop_cnt++;
goto again;
}
no_run:
- tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv);
+ pace->cts_last_ran[hpts->p_cpu] = cts;
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1494,25 +1471,36 @@ no_run:
*/
hpts->p_wheel_complete = 1;
/*
- * Now did we spend too long running input and need to run more ticks?
- * Note that if wrap_loop_cnt < 2 then we should have the conditions
- * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
- * is greater than 2, then the condtion most likely are *not* true.
- * Also if we are called not from the callout, we don't run the wheel
- * multiple times so the slots may not align either.
- */
- KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
- (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
- hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(((hpts->p_lasttick == hpts->p_curtick)
- || (wrap_loop_cnt >= 2) || !from_callout),
- ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
- hpts->p_lasttick, hpts->p_curtick));
- if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
- hpts->p_curtick = tcp_gethptstick(&tv);
+ * If enough time has elapsed that we should be processing the next
+ * slot(s), then we should have kept running and not marked the wheel as
+ * complete.
+ *
+ * But there are several other conditions where we would have stopped
+ * processing, so the prev/cur slots and cts variables won't match.
+ * These conditions are:
+ *
+ * - Calls not from callouts don't run multiple times
+ * - The wheel is empty
+ * - We've processed more than max_pacer_loops times
+ * - We've wrapped 2 or more times
+ *
+ * This assert catches when the logic above has violated this design.
+ *
+ */
+ KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) ||
+ (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) ||
+ ((hpts->p_prev_slot == hpts->p_cur_slot) &&
+ !tcp_hpts_different_slots(cts, cts_last_run))),
+ ("H:%p Shouldn't be done! prev_slot:%u, cur_slot:%u, "
+ "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d",
+ hpts, hpts->p_prev_slot, hpts->p_cur_slot,
+ cts_last_run, cts, loop_cnt, wrap_loop_cnt));
+
+ if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+ hpts->p_cur_slot = cts_to_wheel(cts);
counter_u64_add(hpts_loops, 1);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
@@ -1526,16 +1514,16 @@ no_run:
}
void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
INP_WLOCK_ASSERT(tptoinpcb(tp));
- hpts = tcp_hpts_lock(tp);
+ hpts = tcp_hpts_lock(pace, tp);
if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
- tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+ tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
if (failed == 0)
tp->t_flags2 |= TF2_HPTS_CPU_SET;
}
@@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp)
}
static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
{
+ struct timeval tv;
int i, oldest_idx, start, end;
uint32_t cts, time_since_ran, calc;
- cts = tcp_get_usecs(NULL);
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
time_since_ran = 0;
/* Default is all one group */
start = 0;
- end = tcp_pace.rp_num_hptss;
+ end = pace->rp_num_hptss;
/*
* If we have more than one L3 group figure out which one
* this CPU is in.
*/
- if (tcp_pace.grp_cnt > 1) {
- for (i = 0; i < tcp_pace.grp_cnt; i++) {
- if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
- start = tcp_pace.grps[i]->cg_first;
- end = (tcp_pace.grps[i]->cg_last + 1);
+ if (pace->grp_cnt > 1) {
+ for (i = 0; i < pace->grp_cnt; i++) {
+ if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+ start = pace->grps[i]->cg_first;
+ end = (pace->grps[i]->cg_last + 1);
break;
}
}
}
oldest_idx = -1;
for (i = start; i < end; i++) {
- if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
- calc = cts - tcp_pace.cts_last_ran[i];
+ if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+ calc = cts - pace->cts_last_ran[i];
else
calc = 0;
if (calc > time_since_ran) {
@@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void)
}
}
if (oldest_idx >= 0)
- return(tcp_pace.rp_ent[oldest_idx]);
+ return(pace->rp_ent[oldest_idx]);
else
- return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
}
static void
@@ -1588,9 +1578,9 @@ __tcp_run_hpts(void)
{
struct epoch_tracker et;
struct tcp_hpts_entry *hpts;
- int ticks_ran;
+ int slots_ran;
- hpts = tcp_choose_hpts_to_run();
+ hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
if (hpts->p_hpts_active) {
/* Already active */
@@ -1606,12 +1596,11 @@ __tcp_run_hpts(void)
hpts->syscall_cnt++;
counter_u64_add(hpts_direct_call, 1);
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, false);
+ slots_ran = tcp_hptsi(hpts, false);
/* We may want to adjust the sleep values here */
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
- if (ticks_ran > slots_indicate_less_sleep) {
+ if (slots_ran > slots_indicate_less_sleep) {
struct timeval tv;
- sbintime_t sb;
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1635,13 +1624,8 @@ __tcp_run_hpts(void)
* the dynamic value and set the on_min_sleep
* flag so we will not be awoken.
*/
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else if (ticks_ran < slots_indicate_more_sleep) {
+ (void)tcp_hpts_sleep(hpts, &tv);
+ } else if (slots_ran < slots_indicate_more_sleep) {
/* For the further sleep, don't reschedule hpts */
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
@@ -1658,17 +1642,22 @@ out_with_mtx:
static void
tcp_hpts_thread(void *ctx)
{
+#ifdef TCP_HPTS_KTEST
+ struct tcp_hptsi *pace;
+#endif
struct tcp_hpts_entry *hpts;
struct epoch_tracker et;
struct timeval tv;
- sbintime_t sb;
- int ticks_ran;
+ int slots_ran;
hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+ pace = hpts->p_hptsi;
+#endif
HPTS_LOCK(hpts);
if (hpts->p_direct_wake) {
/* Signaled by input or output with low occupancy count. */
- callout_stop(&hpts->co);
+ _callout_stop_safe(&hpts->co, 0);
counter_u64_add(hpts_direct_awakening, 1);
} else {
/* Timed out, the normal case. */
@@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx)
}
hpts->sleeping = 0;
hpts->p_hpts_active = 1;
- ticks_ran = tcp_hptsi(hpts, true);
+ slots_ran = tcp_hptsi(hpts, true);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) {
@@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx)
* Only adjust sleep time if we were
* called from the callout i.e. direct_wake == 0.
*/
- if (ticks_ran < slots_indicate_more_sleep) {
+ if (slots_ran < slots_indicate_more_sleep) {
hpts->p_mysleep.tv_usec *= 2;
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
- } else if (ticks_ran > slots_indicate_less_sleep) {
+ } else if (slots_ran > slots_indicate_less_sleep) {
hpts->p_mysleep.tv_usec /= 2;
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
@@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx)
hpts->p_hpts_active = 0;
back_to_sleep:
hpts->p_direct_wake = 0;
- sb = tvtosbt(tv);
- /* Store off to make visible the actual sleep time */
- hpts->sleeping = tv.tv_usec;
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ (void)tcp_hpts_sleep(hpts, &tv);
NET_EPOCH_EXIT(et);
HPTS_UNLOCK(hpts);
}
-#undef timersub
-
static int32_t
hpts_count_level(struct cpu_group *cg)
{
@@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
}
}
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Allocate and initialize a tcp_hptsi structure. This performs the core
+ * initialization without starting threads.
+ *
+ * funcs is stored in the new structure and must not be NULL. When
+ * enable_sysctl is true, a per-entry sysctl node (named after the entry
+ * index) is created under the _net_inet_tcp_hpts node.
+ *
+ * One tcp_hpts_entry is allocated per CPU (mp_ncpus, or MAXCPU when
+ * mp_ncpus is zero). Returns the new structure.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
{
+ struct tcp_hptsi *pace;
struct cpu_group *cpu_top;
- int32_t error __diagused;
- int32_t i, j, bound = 0, created = 0;
+ uint32_t i, j, cts;
+ int32_t count;
size_t sz, asz;
struct timeval tv;
- sbintime_t sb;
struct tcp_hpts_entry *hpts;
- struct pcpu *pc;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ KASSERT(funcs != NULL, ("funcs is NULL"));
+
+ /*
+ * Allocate the main structure. Like the other M_WAITOK allocations
+ * below, the result is not checked for NULL; M_ZERO already clears
+ * the memory, so no explicit memset() is needed.
+ */
+ pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+ pace->funcs = funcs;
+
+ /* Setup CPU topology information */
#ifdef SMP
cpu_top = smp_topo();
#else
cpu_top = NULL;
#endif
- tcp_pace.rp_num_hptss = ncpus;
- hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
- hpts_loops = counter_u64_alloc(M_WAITOK);
- back_tosleep = counter_u64_alloc(M_WAITOK);
- combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
- wheel_wrap = counter_u64_alloc(M_WAITOK);
- hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
- hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
- hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
- hpts_direct_call = counter_u64_alloc(M_WAITOK);
- cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
- cpu_uses_random = counter_u64_alloc(M_WAITOK);
+ pace->rp_num_hptss = ncpus;
- sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
- tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
- tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
- tcp_pace.grp_cnt = 0;
+ /* Allocate hpts entry array */
+ sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+ /* Allocate timestamp tracking array */
+ sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+ pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+ /* Setup CPU groups */
if (cpu_top == NULL) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = 1;
} else {
/* Find out how many cache level 3 domains we have */
count = 0;
- tcp_pace.grp_cnt = hpts_count_level(cpu_top);
- if (tcp_pace.grp_cnt == 0) {
- tcp_pace.grp_cnt = 1;
+ pace->grp_cnt = hpts_count_level(cpu_top);
+ if (pace->grp_cnt == 0) {
+ pace->grp_cnt = 1;
}
- sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
- tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+ sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+ pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
/* Now populate the groups */
- if (tcp_pace.grp_cnt == 1) {
+ if (pace->grp_cnt == 1) {
/*
* All we need is the top level all cpu's are in
* the same cache so when we use grp[0]->cg_mask
@@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void)
* all cpu's in it. The level here is probably
* zero which is ok.
*/
- tcp_pace.grps[0] = cpu_top;
+ pace->grps[0] = cpu_top;
} else {
/*
* Here we must find all the level three cache domains
* and setup our pointers to them.
*/
count = 0;
- hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+ hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
}
}
+
+ /* Cache the current time for initializing the hpts entries */
+ microuptime(&tv);
+ cts = tcp_tv_to_usec(&tv);
+
+ /* Initialize each hpts entry */
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
M_TCPHPTS, M_WAITOK | M_ZERO);
- tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
- hpts = tcp_pace.rp_ent[i];
- /*
- * Init all the hpts structures that are not specifically
- * zero'd by the allocations. Also lets attach them to the
- * appropriate sysctl block as well.
- */
- mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
- "hpts", MTX_DEF | MTX_DUPOK);
- for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
- TAILQ_INIT(&hpts->p_hptss[j].head);
- hpts->p_hptss[j].count = 0;
- hpts->p_hptss[j].gencnt = 0;
- }
- sysctl_ctx_init(&hpts->hpts_ctx);
- sprintf(unit, "%d", i);
- hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
- SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
- OID_AUTO,
- unit,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "");
- SYSCTL_ADD_INT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "out_qcnt", CTLFLAG_RD,
- &hpts->p_on_queue_cnt, 0,
- "Count TCB's awaiting output processing");
- SYSCTL_ADD_U16(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "active", CTLFLAG_RD,
- &hpts->p_hpts_active, 0,
- "Is the hpts active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curslot", CTLFLAG_RD,
- &hpts->p_cur_slot, 0,
- "What the current running pacers goal");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningslot, 0,
- "What the running pacers current slot is");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curtick", CTLFLAG_RD,
- &hpts->p_curtick, 0,
- "What the running pacers last tick mapped to the wheel was");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "lastran", CTLFLAG_RD,
- &tcp_pace.cts_last_ran[i], 0,
- "The last usec tick that this hpts ran");
- SYSCTL_ADD_LONG(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
- &hpts->p_mysleep.tv_usec,
- "What the running pacers is using for p_mysleep.tv_usec");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "now_sleeping", CTLFLAG_RD,
- &hpts->sleeping, 0,
- "What the running pacers is actually sleeping for");
- SYSCTL_ADD_U64(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "syscall_cnt", CTLFLAG_RD,
- &hpts->syscall_cnt, 0,
- "How many times we had syscalls on this hpts");
+ pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+ M_WAITOK | M_ZERO);
+ hpts = pace->rp_ent[i];
+ /* Basic initialization */
hpts->p_hpts_sleep_time = hpts_sleep_max;
- hpts->p_num = i;
- hpts->p_curtick = tcp_gethptstick(&tv);
- tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
- hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
- hpts->p_cpu = 0xffff;
+ hpts->p_cpu = i;
+ pace->cts_last_ran[i] = cts;
+ hpts->p_cur_slot = cts_to_wheel(cts);
+ hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
+ hpts->p_hptsi = pace;
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+ MTX_DEF | MTX_DUPOK);
+ /* The wheel is zeroed by M_ZERO; only the list heads need init */
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j].head);
+ }
+
+ /* Setup SYSCTL if requested */
+ if (enable_sysctl) {
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ /* i is unsigned; bound the write to the unit buffer */
+ snprintf(unit, sizeof(unit), "%u", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current running pacers goal");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runslot", CTLFLAG_RD,
+ &hpts->p_runningslot, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &pace->cts_last_ran[i], 0,
+ "The last usec timestamp that this hpts ran");
+ SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+ }
}
- /* Don't try to bind to NUMA domains if we don't have any */
- if (vm_ndomains == 1 && tcp_bind_threads == 2)
- tcp_bind_threads = 0;
- /*
- * Now lets start ithreads to handle the hptss.
- */
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- hpts->p_cpu = i;
+ return (pace);
+}
+
+/*
+ * Create the SWI threads for a tcp_hptsi structure, optionally bind them
+ * according to tcp_bind_threads (1: to the entry's CPU, 2: to the CPU group
+ * containing that CPU), and arm each entry's sleep timer for
+ * p_hpts_sleep_time slots.
+ *
+ * Each entry must not already have a handler (ie_cookie must be NULL).
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ struct pcpu *pc;
+ struct timeval tv;
+ uint32_t i, j;
+ int count, domain;
+ int error __diagused;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+ /* Start threads for each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+
+ KASSERT(hpts->ie_cookie == NULL,
+ ("tcp_hptsi_start: hpts[%u]->ie_cookie is not NULL", i));
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
KASSERT(error == 0,
- ("Can't add hpts:%p i:%d err:%d",
- hpts, i, error));
- created++;
- hpts->p_mysleep.tv_sec = 0;
- hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
+ ("Can't add hpts:%p i:%u err:%d", hpts, i, error));
+
if (tcp_bind_threads == 1) {
- if (intr_event_bind(hpts->ie, i) == 0)
- bound++;
+ /* Best effort: a failed bind leaves the thread unbound */
+ (void)intr_event_bind(hpts->ie, i);
} else if (tcp_bind_threads == 2) {
/* Find the group for this CPU (i) and bind into it */
- for (j = 0; j < tcp_pace.grp_cnt; j++) {
- if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
+ for (j = 0; j < pace->grp_cnt; j++) {
+ if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) {
if (intr_event_bind_ithread_cpuset(hpts->ie,
- &tcp_pace.grps[j]->cg_mask) == 0) {
- bound++;
+ &pace->grps[j]->cg_mask) == 0) {
+ /* Record this CPU in its domain's table */
pc = pcpu_find(i);
domain = pc->pc_domain;
- count = hpts_domains[domain].count;
- hpts_domains[domain].cpu[count] = i;
- hpts_domains[domain].count++;
+ count = pace->domains[domain].count;
+ pace->domains[domain].cpu[count] = i;
+ pace->domains[domain].count++;
break;
}
}
}
}
+
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT;
- hpts->sleeping = tv.tv_usec;
- sb = tvtosbt(tv);
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- }
- /*
- * If we somehow have an empty domain, fall back to choosing
- * among all htps threads.
- */
- for (i = 0; i < vm_ndomains; i++) {
- if (hpts_domains[i].count == 0) {
- tcp_bind_threads = 0;
- break;
- }
+ (void)tcp_hpts_sleep(hpts, &tv);
}
- tcp_hpts_softclock = __tcp_run_hpts;
- tcp_lro_hpts_init();
- printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
- created, bound,
- tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}
-static void
-tcp_hpts_mod_unload(void)
+/*
+ * Stop all callouts/threads for a tcp_hptsi structure.
+ *
+ * For every entry the callout is drained, the SWI handler is removed and
+ * ie_cookie is reset to NULL. Every entry must currently have a handler
+ * installed (non-NULL ie_cookie). Nothing is freed here.
+ */
+void
+tcp_hptsi_stop(struct tcp_hptsi *pace)
{
+ struct tcp_hpts_entry *hpts;
int rv __diagused;
+ uint32_t i;
- tcp_lro_hpts_uninit();
- atomic_store_ptr(&tcp_hpts_softclock, NULL);
+ KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL"));
- for (int i = 0; i < tcp_pace.rp_num_hptss; i++) {
- struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i];
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%u] is NULL", i));
+ KASSERT(hpts->ie_cookie != NULL,
+ ("tcp_hptsi_stop: hpts[%u]->ie_cookie is NULL", i));
- rv = callout_drain(&hpts->co);
+ rv = _callout_stop_safe(&hpts->co, CS_DRAIN);
MPASS(rv != 0);
rv = swi_remove(hpts->ie_cookie);
MPASS(rv == 0);
+ /* Mark the entry as having no handler installed */
+ hpts->ie_cookie = NULL;
+ }
+}
- rv = sysctl_ctx_free(&hpts->hpts_ctx);
- MPASS(rv == 0);
+/*
+ * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create.
+ *
+ * Releases each entry (sysctl context if one was created, mutex, wheel
+ * array, entry itself), then the entry and timestamp arrays, the CPU group
+ * array (SMP only) and finally the structure itself. This function does
+ * not stop callouts or threads.
+ * NOTE(review): presumably tcp_hptsi_stop() must run first if the threads
+ * were started, as the module unload path does -- confirm.
+ */
+void
+tcp_hptsi_destroy(struct tcp_hptsi *pace)
+{
+ struct tcp_hpts_entry *hpts;
+ uint32_t i;
+
+ KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL"));
+ KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL"));
+
+ /* Cleanup each hpts entry */
+ for (i = 0; i < pace->rp_num_hptss; i++) {
+ hpts = pace->rp_ent[i];
+ if (hpts != NULL) {
+ /*
+ * Cleanup SYSCTL if it was initialized.
+ * NOTE(review): the removed code asserted that
+ * sysctl_ctx_free() returned 0; the result is now ignored.
+ */
+ if (hpts->hpts_root != NULL) {
+ sysctl_ctx_free(&hpts->hpts_ctx);
+ }
- mtx_destroy(&hpts->p_mtx);
- free(hpts->p_hptss, M_TCPHPTS);
- free(hpts, M_TCPHPTS);
+ mtx_destroy(&hpts->p_mtx);
+ free(hpts->p_hptss, M_TCPHPTS);
+ free(hpts, M_TCPHPTS);
+ }
}
- free(tcp_pace.rp_ent, M_TCPHPTS);
- free(tcp_pace.cts_last_ran, M_TCPHPTS);
+ /* Cleanup main arrays */
+ free(pace->rp_ent, M_TCPHPTS);
+ free(pace->cts_last_ran, M_TCPHPTS);
#ifdef SMP
- free(tcp_pace.grps, M_TCPHPTS);
+ free(pace->grps, M_TCPHPTS);
#endif
+ /* Free the main structure */
+ free(pace, M_TCPHPTS);
+}
+
+/*
+ * Module load handler: create the global HPTS instance, allocate the global
+ * counters, start the threads, settle the thread binding mode and only then
+ * publish the softclock hook and initialize LRO HPTS support.
+ *
+ * Returns 0 on success, or ENOMEM if the instance could not be created.
+ */
+static int
+tcp_hpts_mod_load(void)
+{
+	int i;
+
+	/* Don't try to bind to NUMA domains if we don't have any */
+	if (vm_ndomains == 1 && tcp_bind_threads == 2)
+		tcp_bind_threads = 0;
+
+	/* Create the tcp_hptsi structure */
+	tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true);
+	if (tcp_hptsi_pace == NULL)
+		return (ENOMEM);
+
+	/* Initialize global counters */
+	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
+	hpts_loops = counter_u64_alloc(M_WAITOK);
+	back_tosleep = counter_u64_alloc(M_WAITOK);
+	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+	wheel_wrap = counter_u64_alloc(M_WAITOK);
+	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+	hpts_direct_call = counter_u64_alloc(M_WAITOK);
+	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+	/* Start the threads */
+	tcp_hptsi_start(tcp_hptsi_pace);
+
+	/*
+	 * If we somehow have an empty domain, fall back to choosing among all
+	 * HPTS threads.  The per-domain table is only filled in when binding
+	 * to NUMA domains (tcp_bind_threads == 2), so the check is limited to
+	 * that mode; otherwise every domain looks empty and per-cpu binding
+	 * would be reported as unbound.  The mode is settled before the
+	 * softclock hook is published below.
+	 */
+	if (tcp_bind_threads == 2) {
+		for (i = 0; i < vm_ndomains; i++) {
+			if (tcp_hptsi_pace->domains[i].count == 0) {
+				tcp_bind_threads = 0;
+				break;
+			}
+		}
+	}
+
+	/* Enable the global HPTS softclock function */
+	tcp_hpts_softclock = __tcp_run_hpts;
+
+	/* Initialize LRO HPTS */
+	tcp_lro_hpts_init();
+
+	/* The format already parenthesizes the mode string */
+	printf("TCP HPTS started %u (%s) swi interrupt threads\n",
+	    tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ?
+	    "unbound" :
+	    (tcp_bind_threads == 1 ? "per-cpu" : "per-NUMA-domain"));
+
+	return (0);
+}
+
+static void
+tcp_hpts_mod_unload(void)
+{
+ tcp_lro_hpts_uninit();
+
+ /* Disable the global HPTS softclock function */
+ atomic_store_ptr(&tcp_hpts_softclock, NULL);
+
+ tcp_hptsi_stop(tcp_hptsi_pace);
+ tcp_hptsi_destroy(tcp_hptsi_pace);
+ tcp_hptsi_pace = NULL;
+
+ /* Cleanup global counters */
counter_u64_free(hpts_hopelessly_behind);
counter_u64_free(hpts_loops);
counter_u64_free(back_tosleep);
@@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void)
}
static int
-tcp_hpts_modevent(module_t mod, int what, void *arg)
+tcp_hpts_mod_event(module_t mod, int what, void *arg)
{
-
switch (what) {
case MOD_LOAD:
- tcp_hpts_mod_load();
- return (0);
+ return (tcp_hpts_mod_load());
case MOD_QUIESCE:
/*
* Since we are a dependency of TCP stack modules, they should
@@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg)
+/*
+ * Module descriptor for "tcphpts": module events are delivered to
+ * tcp_hpts_mod_event(). Registered at SI_SUB_SOFTINTR, SI_ORDER_ANY.
+ */
static moduledata_t tcp_hpts_module = {
.name = "tcphpts",
- .evhand = tcp_hpts_modevent,
+ .evhand = tcp_hpts_mod_event,
};
DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY);