Diffstat (limited to 'sys/netinet')
113 files changed, 6221 insertions, 6355 deletions
diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index 9a8ca760fa28..d85ad4e9f4fd 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -392,6 +392,7 @@ void newreno_cc_post_recovery(struct cc_var *ccv) { int pipe; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* @@ -400,20 +401,14 @@ newreno_cc_post_recovery(struct cc_var *ccv) * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. - * - * XXXLAS: Find a way to do this without needing curack */ - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - else - pipe = CCV(ccv, snd_max) - ccv->curack; + pipe = tcp_compute_pipe(ccv->tp); if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } @@ -440,7 +435,7 @@ newreno_cc_after_idle(struct cc_var *ccv) * maximum of the former ssthresh or 3/4 of the old cwnd, to * not exit slow-start prematurely. */ - rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->ccvc.tcp)); + rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp)); CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); @@ -449,15 +444,14 @@ newreno_cc_after_idle(struct cc_var *ccv) } /* - * Perform any necessary tasks before we enter congestion recovery. - */ -void -newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) + * Get a new congestion window size on a multiplicative decrease event. + * */ +u_int +newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss) { - uint32_t cwin, factor, mss, pipe; + uint32_t cwin, factor; cwin = CCV(ccv, snd_cwnd); - mss = tcp_fixed_maxseg(ccv->ccvc.tcp); /* * Other TCP congestion controls use newreno_cong_signal(), but * with their own private cc_data. Make sure the cc_data is used @@ -465,12 +459,24 @@ newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) */ factor = V_newreno_beta; + return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; +} + +/* + * Perform any necessary tasks before we enter congestion recovery. + */ +void +newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) +{ + uint32_t cwin, mss, pipe; + + mss = tcp_fixed_maxseg(ccv->tp); + /* Catch algos which mistakenly leak private signal types. 
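[Note, not part of the patch] The helper factored out above computes the multiplicative decrease as cwnd scaled by the beta percentage (V_newreno_beta, 50 by default), clamped to at least two segments and rounded down to whole segments. A minimal stand-alone sketch of that arithmetic, with illustrative names and example values:

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of newreno_cc_cwnd_on_multiplicative_decrease(): scale cwnd by
 * beta/100, clamp to a minimum of 2 segments, round down to whole segments.
 */
static uint32_t
decrease_cwnd(uint32_t cwnd, uint32_t beta, uint32_t mss)
{
	uint64_t segs;

	segs = ((uint64_t)cwnd * beta) / (100ULL * mss);
	if (segs < 2)
		segs = 2;
	return ((uint32_t)(segs * mss));
}

int
main(void)
{
	/* cwnd = 100000 bytes, beta = 50, MSS = 1460: 34 segments = 49640 bytes. */
	printf("%u\n", (unsigned)decrease_cwnd(100000, 50, 1460));
	return (0);
}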
*/ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); - cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), - 2) * mss; + cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss); switch (type) { case CC_NDUPACK: @@ -489,13 +495,7 @@ newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { - if (V_tcp_do_newsack) { - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - } else { - pipe = CCV(ccv, snd_max) - - CCV(ccv, snd_fack) + - CCV(ccv, sackhint.sack_bytes_rexmit); - } + pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } @@ -506,78 +506,110 @@ newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) } } -void -newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) +u_int +newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv) { - if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && - (ccv->flags & CCF_CWND_LIMITED)) { - u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = tcp_fixed_maxseg(ccv->tp); + + KASSERT(cw > CCV(ccv, snd_ssthresh), + ("congestion control state not in congestion avoidance\n")); + /* + * Regular in-order ACK, open the congestion window. + * The congestion control state we're in is congestion avoidance. + * + * Check if ABC (RFC 3465) is enabled. + * cong avoid: cwnd > ssthresh + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 5681): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. + */ + if (V_tcp_do_rfc3465) { + if (ccv->flags & CCF_ABC_SENTAWND) + ccv->flags &= ~CCF_ABC_SENTAWND; + else + incr = 0; + } else + incr = max((incr * incr / cw), 1); + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); + else + return cw; +} + +u_int +newreno_cc_cwnd_in_slow_start(struct cc_var *ccv) +{ + u_int cw = CCV(ccv, snd_cwnd); + u_int mss = tcp_fixed_maxseg(ccv->tp); + u_int incr = mss; + + KASSERT(cw <= CCV(ccv, snd_ssthresh), + ("congestion control state not in slow start\n")); + + /* + * Regular in-order ACK, open the congestion window. + * The congestion control state we're in is slow start. + * + * slow start: cwnd <= ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 5681): + * Grow cwnd exponentially by maxseg per ACK. + */ + if (V_tcp_do_rfc3465) { /* - * Regular in-order ACK, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 5681): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. + * In slow-start with ABC enabled and no RTO in sight? 
+ * (Must not use abc_l_var > 1 if slow starting after + * an RTO. On RTO, snd_nxt = snd_una, so the + * snd_nxt == snd_max check is sufficient to + * handle this). * - * cong avoid without ABC (RFC 5681): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. + * XXXLAS: Find a way to signal SS after RTO that + * doesn't rely on tcpcb vars. */ - if (cw > CCV(ccv, snd_ssthresh)) { - if (V_tcp_do_rfc3465) { - if (ccv->flags & CCF_ABC_SENTAWND) - ccv->flags &= ~CCF_ABC_SENTAWND; - else - incr = 0; - } else - incr = max((incr * incr / cw), 1); - } else if (V_tcp_do_rfc3465) { - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after - * an RTO. On RTO, snd_nxt = snd_una, so the - * snd_nxt == snd_max check is sufficient to - * handle this). - * - * XXXLAS: Find a way to signal SS after RTO that - * doesn't rely on tcpcb vars. - */ - uint16_t abc_val; - - if (ccv->flags & CCF_USE_LOCAL_ABC) - abc_val = ccv->labc; - else - abc_val = V_tcp_abc_l_var; - if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) - incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); - else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + uint16_t abc_val; + if (ccv->flags & CCF_USE_LOCAL_ABC) + abc_val = ccv->labc; + else + abc_val = V_tcp_abc_l_var; + if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) + incr = min(ccv->bytes_this_ack, + ccv->nsegs * abc_val * mss); + else + incr = min(ccv->bytes_this_ack, mss); + } + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); + else + return cw; +} + +void +newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) +{ + if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && + (ccv->flags & CCF_CWND_LIMITED)) { + if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) { + CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv); + } else { + CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv); } - /* ABC is on by default, so incr equals 0 frequently. */ - if (incr > 0) - CCV(ccv, snd_cwnd) = min(cw + incr, - TCP_MAXWIN << CCV(ccv, snd_scale)); } } diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h index aac0825e5fe1..890bea69a14b 100644 --- a/sys/netinet/cc/cc.h +++ b/sys/netinet/cc/cc.h @@ -87,21 +87,12 @@ int cc_deregister_algo(struct cc_algo *remove_cc); #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_TCPCB) -/* - * Wrapper around transport structs that contain same-named congestion - * control variables. Allows algos to be shared amongst multiple CC aware - * transprots. - */ struct cc_var { void *cc_data; /* Per-connection private CC algorithm data. */ int bytes_this_ack; /* # bytes acked by the current ACK. */ tcp_seq curack; /* Most recent ACK. */ uint32_t flags; /* Flags for cc_var (see below) */ - int type; /* Indicates which ptr is valid in ccvc. */ - union ccv_container { - struct tcpcb *tcp; - struct sctp_nets *sctp; - } ccvc; + struct tcpcb *tp; /* Pointer to tcpcb */ uint16_t nsegs; /* # segments coalesced into current chain. */ uint8_t labc; /* Dont use system abc use passed in */ }; @@ -113,10 +104,10 @@ struct cc_var { #define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */ #define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */ #define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? 
*/ -#define CCF_MAX_CWND 0x0040 /* Have we reached maximum cwnd? */ -#define CCF_CHG_MAX_CWND 0x0080 /* CUBIC max_cwnd changed, for K */ -#define CCF_USR_IWND 0x0100 /* User specified initial window */ -#define CCF_USR_IWND_INIT_NSEG 0x0200 /* Convert segs to bytes on conn init */ +#define CCF_UNUSED1 0x0040 +#define CCF_UNUSED2 0x0080 +#define CCF_UNUSED3 0x0100 +#define CCF_UNUSED4 0x0200 #define CCF_HYSTART_ALLOWED 0x0400 /* If the CC supports it Hystart is allowed */ #define CCF_HYSTART_CAN_SH_CWND 0x0800 /* Can hystart when going CSS -> CA slam the cwnd */ #define CCF_HYSTART_CONS_SSTH 0x1000 /* Should hystart use the more conservative ssthresh */ @@ -240,6 +231,9 @@ void newreno_cc_post_recovery(struct cc_var *); void newreno_cc_after_idle(struct cc_var *); void newreno_cc_cong_signal(struct cc_var *, ccsignal_t); void newreno_cc_ack_received(struct cc_var *, ccsignal_t); +u_int newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss); +u_int newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv); +u_int newreno_cc_cwnd_in_slow_start(struct cc_var *ccv); /* Called to temporarily keep an algo from going away during change */ void cc_refer(struct cc_algo *algo); diff --git a/sys/netinet/cc/cc_cdg.c b/sys/netinet/cc/cc_cdg.c index 1e9236f878d4..5b1df76e71a2 100644 --- a/sys/netinet/cc/cc_cdg.c +++ b/sys/netinet/cc/cc_cdg.c @@ -57,6 +57,7 @@ #include <sys/malloc.h> #include <sys/module.h> #include <sys/queue.h> +#include <sys/prng.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> @@ -294,7 +295,7 @@ cdg_cb_init(struct cc_var *ccv, void *ptr) { struct cdg *cdg_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT); if (cdg_data == NULL) @@ -415,27 +416,28 @@ cdg_window_increase(struct cc_var *ccv, int new_measurement) { struct cdg *cdg_data; int incr, s_w_incr; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cdg_data = ccv->cc_data; incr = s_w_incr = 0; if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { /* Slow start. */ - incr = CCV(ccv, t_maxseg); + incr = mss; s_w_incr = incr; cdg_data->window_incr = cdg_data->rtt_count = 0; } else { /* Congestion avoidance. */ if (new_measurement) { - s_w_incr = CCV(ccv, t_maxseg); + s_w_incr = mss; if (V_cdg_alpha_inc == 0) { - incr = CCV(ccv, t_maxseg); + incr = mss; } else { if (++cdg_data->rtt_count >= V_cdg_alpha_inc) { cdg_data->window_incr++; cdg_data->rtt_count = 0; } - incr = CCV(ccv, t_maxseg) * + incr = mss * cdg_data->window_incr; } } @@ -507,7 +509,8 @@ cdg_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) static inline int prob_backoff(long qtrend) { - int backoff, idx, p; + int backoff, idx; + uint32_t p; backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E)); @@ -519,8 +522,8 @@ prob_backoff(long qtrend) idx = qtrend; /* Backoff probability proportional to rate of queue growth. 
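[Note, not part of the patch] The probabilistic-backoff hunks above replace random(), whose range tops out at INT_MAX, with the kernel's prng32(), so the threshold is now scaled against UINT32_MAX before the comparison. A userland sketch of the same pattern, with arc4random() standing in for the kernel-only prng32():

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * Back off with probability prob (0.0 to 1.0) by comparing a uniform
 * 32-bit random value against a threshold scaled to the full 32-bit
 * range, mirroring the "prng32() < p" test in the patch. Edge cases
 * near prob == 1.0 are ignored; this is illustrative only.
 */
static bool
should_backoff(double prob)
{
	uint32_t threshold;

	threshold = (uint32_t)(prob * (double)UINT32_MAX);
	return (arc4random() < threshold);
}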
*/ - p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx]; - backoff = (random() < p); + p = (UINT32_MAX / (1 << EXP_PREC)) * probexp[idx]; + backoff = (prng32() < p); } return (backoff); diff --git a/sys/netinet/cc/cc_chd.c b/sys/netinet/cc/cc_chd.c index 52048a7c05ae..1d440f43578f 100644 --- a/sys/netinet/cc/cc_chd.c +++ b/sys/netinet/cc/cc_chd.c @@ -58,6 +58,7 @@ #include <sys/limits.h> #include <sys/malloc.h> #include <sys/module.h> +#include <sys/prng.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -85,8 +86,8 @@ */ #define CC_CHD_DELAY 0x02000000 -/* Largest possible number returned by random(). */ -#define RANDOM_MAX INT_MAX +/* Largest possible number returned by prng32(). */ +#define RANDOM_MAX UINT32_MAX static void chd_ack_received(struct cc_var *ccv, ccsignal_t ack_type); static void chd_cb_destroy(struct cc_var *ccv); @@ -146,10 +147,11 @@ static __inline void chd_window_decrease(struct cc_var *ccv) { unsigned long win; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); - win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg); + win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / mss; win -= max((win / 2), 1); - CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg); + CCV(ccv, snd_ssthresh) = max(win, 2) * mss; } /* @@ -159,9 +161,9 @@ chd_window_decrease(struct cc_var *ccv) static __inline int should_backoff(int qdly, int maxqdly, struct chd *chd_data) { - unsigned long p, rand; + uint32_t rand, p; - rand = random(); + rand = prng32(); if (qdly < V_chd_qthresh) { chd_data->loss_compete = 0; @@ -189,6 +191,7 @@ chd_window_increase(struct cc_var *ccv, int new_measurement) { struct chd *chd_data; int incr; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); chd_data = ccv->cc_data; incr = 0; @@ -200,23 +203,22 @@ chd_window_increase(struct cc_var *ccv, int new_measurement) if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) { /* Not due to RTO. */ incr = min(ccv->bytes_this_ack, - V_tcp_abc_l_var * CCV(ccv, t_maxseg)); + V_tcp_abc_l_var * mss); } else { /* Due to RTO. */ - incr = min(ccv->bytes_this_ack, - CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); } } else - incr = CCV(ccv, t_maxseg); + incr = mss; } else { /* Congestion avoidance. */ if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) { ccv->flags &= ~CCF_ABC_SENTAWND; - incr = CCV(ccv, t_maxseg); + incr = mss; } } else if (new_measurement) - incr = CCV(ccv, t_maxseg); + incr = mss; } if (chd_data->shadow_w > 0) { @@ -321,7 +323,7 @@ chd_cb_init(struct cc_var *ccv, void *ptr) { struct chd *chd_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { chd_data = malloc(sizeof(struct chd), M_CC_MEM, M_NOWAIT); if (chd_data == NULL) @@ -379,8 +381,9 @@ chd_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) } if (chd_data->shadow_w > 0) { + uint32_t mss = tcp_fixed_maxseg(ccv->tp); chd_data->shadow_w = max(chd_data->shadow_w / - CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg); + mss / 2, 2) * mss; } ENTER_FASTRECOVERY(CCV(ccv, t_flags)); break; diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c index b4050326ae31..b3e15009244d 100644 --- a/sys/netinet/cc/cc_cubic.c +++ b/sys/netinet/cc/cc_cubic.c @@ -38,7 +38,7 @@ /* * An implementation of the CUBIC congestion control algorithm for FreeBSD, - * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. + * based on the Internet RFC9438 by Xu, Ha, Rhee, Goel, and Eggert. 
* Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the @@ -81,7 +81,7 @@ static void cubic_conn_init(struct cc_var *ccv); static int cubic_mod_init(void); static void cubic_post_recovery(struct cc_var *ccv); static void cubic_record_rtt(struct cc_var *ccv); -static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg); +static uint32_t cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg); static void cubic_after_idle(struct cc_var *ccv); static size_t cubic_data_sz(void); static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt); @@ -125,7 +125,7 @@ cubic_log_hystart_event(struct cc_var *ccv, struct cubic *cubicd, uint8_t mod, u if (hystart_bblogs == 0) return; - tp = ccv->ccvc.tcp; + tp = ccv->tp; if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; @@ -168,7 +168,8 @@ cubic_does_slow_start(struct cc_var *ccv, struct cubic *cubicd) * doesn't rely on tcpcb vars. */ u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + uint32_t mss = tcp_fixed_maxseg(ccv->tp); + u_int incr = mss; uint16_t abc_val; cubicd->flags |= CUBICFLAG_IN_SLOWSTART; @@ -216,10 +217,9 @@ cubic_does_slow_start(struct cc_var *ccv, struct cubic *cubicd) } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); + ccv->nsegs * abc_val * mss); else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); /* Only if Hystart is enabled will the flag get set */ if (cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) { @@ -236,9 +236,11 @@ static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; - unsigned long W_est, W_cubic; + uint32_t W_est, W_cubic, cwin, target, incr; int usecs_since_epoch; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); + cwin = CCV(ccv, snd_cwnd); cubic_data = ccv->cc_data; cubic_record_rtt(ccv); @@ -249,7 +251,7 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { /* Use the logic in NewReno ack_received() for slow start. 
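[Note, not part of the patch] The slow-start paths above (NewReno and CUBIC alike) cap the per-ACK growth under ABC (RFC 3465) at abc_l_var segments per coalesced ACK chain. A minimal sketch of that cap, with illustrative names:

#include <stdint.h>

/*
 * Grow by the bytes newly acknowledged, but by no more than
 * nsegs * abc_l_var segments, as in the ABC-limited slow start above.
 */
static uint32_t
ss_increment(uint32_t bytes_this_ack, uint16_t nsegs, uint16_t abc_l_var,
    uint32_t mss)
{
	uint32_t cap = (uint32_t)nsegs * abc_l_var * mss;

	return (bytes_this_ack < cap ? bytes_this_ack : cap);
}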
*/ - if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || + if (cwin <= CCV(ccv, snd_ssthresh) || cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) { cubic_does_slow_start(ccv, cubic_data); } else { @@ -264,21 +266,32 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh)); } - if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) && - (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) { - /* RFC8312 Section 4.7 */ - cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT | - CUBICFLAG_IN_SLOWSTART); - cubic_data->W_max = CCV(ccv, snd_cwnd); - cubic_data->t_epoch = ticks; - cubic_data->K = 0; - } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | + if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | + CUBICFLAG_CONG_EVENT | CUBICFLAG_IN_APPLIMIT)) { + /* + * At the beginning of the current congestion + * avoidance stage, The epoch variables + * (t_epoch, cwnd_epoch, K) are updated in the + * following three cases: + * 1) just exited the slow start + * 2) after a congestion event + * 3) application-limited + */ + cubic_data->t_epoch = ticks; + cubic_data->cwnd_epoch = cwin; + cubic_data->K = cubic_k(cubic_data->W_max / mss, + cubic_data->cwnd_epoch / mss); cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART | + CUBICFLAG_CONG_EVENT | CUBICFLAG_IN_APPLIMIT); - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / - CCV(ccv, t_maxseg)); + + if (cubic_data->flags & CUBICFLAG_RTO_EVENT) { + /* RFC9438 Section 4.8: Timeout */ + cubic_data->flags &= ~CUBICFLAG_RTO_EVENT; + cubic_data->W_max = cwin; + cubic_data->K = 0; + } } usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick; if (usecs_since_epoch < 0) { @@ -288,52 +301,35 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) usecs_since_epoch = INT_MAX; cubic_data->t_epoch = ticks - INT_MAX; } + W_est = tf_cwnd(ccv); /* - * The mean RTT is used to best reflect the equations in - * the I-D. Using min_rtt in the tf_cwnd calculation - * causes W_est to grow much faster than it should if the - * RTT is dominated by network buffering rather than - * propagation delay. + * The mean RTT is used to best reflect the equations. */ - W_est = tf_cwnd(usecs_since_epoch, cubic_data->mean_rtt_usecs, - cubic_data->W_max, CCV(ccv, t_maxseg)); - W_cubic = cubic_cwnd(usecs_since_epoch + cubic_data->mean_rtt_usecs, cubic_data->W_max, - CCV(ccv, t_maxseg), + mss, cubic_data->K); - ccv->flags &= ~CCF_ABC_SENTAWND; - if (W_cubic < W_est) { + /* RFC9438 Section 4.3: Reno-friendly region */ + CCV(ccv, snd_cwnd) = W_est; + cubic_data->flags |= CUBICFLAG_IN_TF; + } else { /* - * TCP-friendly region, follow tf - * cwnd growth. - */ - if (CCV(ccv, snd_cwnd) < W_est) - CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX); - } else if (CCV(ccv, snd_cwnd) < W_cubic) { - /* - * Concave or convex region, follow CUBIC - * cwnd growth. - * Only update snd_cwnd, if it doesn't shrink. + * RFC9438 Section 4.4 or 4.5: + * Concave or Convex Region */ - CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX); - } - - /* - * If we're not in slow start and we're probing for a - * new cwnd limit at the start of a connection - * (happens when hostcache has a relevant entry), - * keep updating our current estimate of the - * W_max. 
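[Note, not part of the patch] The congestion-avoidance path above follows RFC 9438: K is derived from the gap between W_max and the cwnd recorded at the start of the epoch, and the cubic function is evaluated at the elapsed epoch time plus one mean RTT. A floating-point sketch of the two equations in units of segments (the kernel uses fixed point with CUBIC_SHIFT precision; C is the constant 0.4):

#include <math.h>

/* Figure 2 of RFC 9438: K = cbrt((W_max - cwnd_epoch) / C); zero if cwnd_epoch >= W_max. */
static double
cubic_K(double wmax_segs, double cwnd_epoch_segs)
{
	if (wmax_segs <= cwnd_epoch_segs)
		return (0.0);
	return (cbrt((wmax_segs - cwnd_epoch_segs) / 0.4));
}

/* Figure 1 of RFC 9438: W_cubic(t) = C * (t - K)^3 + W_max, t in seconds. */
static double
cubic_W(double t_secs, double wmax_segs, double cwnd_epoch_segs)
{
	double K = cubic_K(wmax_segs, cwnd_epoch_segs);

	return (0.4 * pow(t_secs - K, 3.0) + wmax_segs);
}

As in the hunk above, the per-ACK target derived from W_cubic is clamped to at most 1.5 times the current cwnd before the increment is applied.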
- */ - if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) && - cubic_data->W_max < CCV(ccv, snd_cwnd)) { - cubic_data->W_max = CCV(ccv, snd_cwnd); - cubic_data->K = cubic_k(cubic_data->W_max / - CCV(ccv, t_maxseg)); + if (W_cubic < cwin) { + target = cwin; + } else if (W_cubic > ((cwin * 3) >> 1)) { + target = (cwin * 3) >> 1; + } else { + target = W_cubic; + } + incr = (((target - cwin) << CUBIC_SHIFT) / + cwin * mss) >> CUBIC_SHIFT; + CCV(ccv, snd_cwnd) = cwin + incr; } } } else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && @@ -350,12 +346,11 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type) static void cubic_after_idle(struct cc_var *ccv) { - struct cubic *cubic_data; - - cubic_data = ccv->cc_data; + struct cubic *cubic_data = ccv->cc_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd)); - cubic_data->K = cubic_k(cubic_data->W_max / CCV(ccv, t_maxseg)); + cubic_data->K = cubic_k(cubic_data->W_max / mss, cubic_data->cwnd_epoch / mss); if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. @@ -385,7 +380,7 @@ cubic_cb_init(struct cc_var *ccv, void *ptr) { struct cubic *cubic_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO); if (cubic_data == NULL) @@ -394,7 +389,9 @@ cubic_cb_init(struct cc_var *ccv, void *ptr) cubic_data = ptr; /* Init some key variables with sensible defaults. */ - cubic_data->t_epoch = ticks; + cubic_data->t_epoch = 0; + cubic_data->cwnd_epoch = 0; + cubic_data->K = 0; cubic_data->min_rtt_usecs = TCPTV_SRTTBASE; cubic_data->mean_rtt_usecs = 1; @@ -421,10 +418,10 @@ static void cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; - uint32_t mss, pipe; + uint32_t mss, pipe, ssthresh; cubic_data = ccv->cc_data; - mss = tcp_fixed_maxseg(ccv->ccvc.tcp); + mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: @@ -436,10 +433,13 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) } if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - cubic_ssthresh_update(ccv, mss); + ssthresh = cubic_get_ssthresh(ccv, mss); + CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss); + /* + * The congestion flag will recalculate K at the + * beginning of the congestion avoidance stage. + */ cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / mss); } ENTER_RECOVERY(CCV(ccv, t_flags)); } @@ -453,17 +453,20 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) cubic_log_hystart_event(ccv, cubic_data, 9, CCV(ccv, snd_ssthresh)); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - cubic_ssthresh_update(ccv, mss); + ssthresh = cubic_get_ssthresh(ccv, mss); + CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss); + CCV(ccv, snd_cwnd) = max(ssthresh, mss); + /* + * The congestion flag will recalculate K at the + * beginning of the congestion avoidance stage. + */ cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / mss); - CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: - /* RFC8312 Section 4.7 */ + /* RFC9438 Section 4.8: Timeout */ if (CCV(ccv, t_rxtshift) == 1) { /* * Remember the state only for the first RTO event. 
This @@ -473,34 +476,25 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) */ cubic_data->undo_t_epoch = cubic_data->t_epoch; cubic_data->undo_cwnd_epoch = cubic_data->cwnd_epoch; - cubic_data->undo_W_est = cubic_data->W_est; - cubic_data->undo_cwnd_prior = cubic_data->cwnd_prior; cubic_data->undo_W_max = cubic_data->W_max; cubic_data->undo_K = cubic_data->K; - if (V_tcp_do_newsack) { - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - } else { - pipe = CCV(ccv, snd_max) - - CCV(ccv, snd_fack) + - CCV(ccv, sackhint.sack_bytes_rexmit); - } + pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, (((uint64_t)min(CCV(ccv, snd_wnd), pipe) * CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss; } - cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT; - cubic_data->undo_W_max = cubic_data->W_max; - cubic_data->num_cong_events++; + /* + * The RTO flag will recalculate K at the + * beginning of the congestion avoidance stage. + */ + cubic_data->flags |= CUBICFLAG_RTO_EVENT; CCV(ccv, snd_cwnd) = mss; break; case CC_RTO_ERR: - cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT); - cubic_data->num_cong_events--; + cubic_data->flags &= ~CUBICFLAG_RTO_EVENT; cubic_data->K = cubic_data->undo_K; - cubic_data->cwnd_prior = cubic_data->undo_cwnd_prior; cubic_data->W_max = cubic_data->undo_W_max; - cubic_data->W_est = cubic_data->undo_W_est; cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch; cubic_data->t_epoch = cubic_data->undo_t_epoch; break; @@ -521,7 +515,7 @@ cubic_conn_init(struct cc_var *ccv) * this here bad things happen when entries from the TCP hostcache * get used. */ - cubic_data->W_max = CCV(ccv, snd_cwnd); + cubic_data->W_max = UINT_MAX; } static int @@ -538,6 +532,7 @@ cubic_post_recovery(struct cc_var *ccv) { struct cubic *cubic_data; int pipe; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data = ccv->cc_data; pipe = 0; @@ -547,26 +542,19 @@ cubic_post_recovery(struct cc_var *ccv) * If inflight data is less than ssthresh, set cwnd * conservatively to avoid a burst of data, as suggested in * the NewReno RFC. Otherwise, use the CUBIC method. - * - * XXXLAS: Find a way to do this without needing curack */ - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - else - pipe = CCV(ccv, snd_max) - ccv->curack; - + pipe = tcp_compute_pipe(ccv->tp); if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else /* Update cwnd based on beta and adjusted W_max. */ CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->W_max * CUBIC_BETA) >> CUBIC_SHIFT, - 2 * CCV(ccv, t_maxseg)); + 2 * mss); } /* Calculate the average RTT between congestion epochs. */ @@ -592,7 +580,7 @@ cubic_record_rtt(struct cc_var *ccv) /* Ignore srtt until a min number of samples have been taken. */ if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) { cubic_data = ccv->cc_data; - t_srtt_usecs = tcp_get_srtt(ccv->ccvc.tcp, + t_srtt_usecs = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_USEC); /* * Record the current SRTT as our minrtt if it's the smallest @@ -627,40 +615,36 @@ cubic_record_rtt(struct cc_var *ccv) } /* - * Update the ssthresh in the event of congestion. + * Return the new value for ssthresh in the event of a congestion. 
*/ -static void -cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg) +static uint32_t +cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg) { struct cubic *cubic_data; - uint32_t ssthresh; - uint32_t cwnd; + uint32_t cwnd, pipe; cubic_data = ccv->cc_data; cwnd = CCV(ccv, snd_cwnd); - /* Fast convergence heuristic. */ + /* RFC9438 Section 4.7: Fast convergence */ if (cwnd < cubic_data->W_max) { cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT; } - cubic_data->undo_W_max = cubic_data->W_max; cubic_data->W_max = cwnd; - /* - * On the first congestion event, set ssthresh to cwnd * 0.5 - * and reduce W_max to cwnd * beta. This aligns the cubic concave - * region appropriately. On subsequent congestion events, set - * ssthresh to cwnd * beta. - */ - if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) { - ssthresh = cwnd >> 1; - cubic_data->W_max = ((uint64_t)cwnd * - CUBIC_BETA) >> CUBIC_SHIFT; + if (cubic_data->flags & CUBICFLAG_IN_TF) { + /* If in the TCP friendly region, follow what newreno does. */ + return (newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg)); + } else { - ssthresh = ((uint64_t)cwnd * - CUBIC_BETA) >> CUBIC_SHIFT; + /* + * RFC9438 Section 4.6: Multiplicative Decrease + * Outside the TCP friendly region, set ssthresh to the size of + * inflight_size * beta. + */ + pipe = tcp_compute_pipe(ccv->tp); + return ((pipe * CUBIC_BETA) >> CUBIC_SHIFT); } - CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg); } static void diff --git a/sys/netinet/cc/cc_cubic.h b/sys/netinet/cc/cc_cubic.h index ce6c2a6633d7..c31506d26b00 100644 --- a/sys/netinet/cc/cc_cubic.h +++ b/sys/netinet/cc/cc_cubic.h @@ -83,26 +83,28 @@ #define CUBICFLAG_RTO_EVENT 0x00000008 /* RTO experienced */ #define CUBICFLAG_HYSTART_ENABLED 0x00000010 /* Hystart++ is enabled */ #define CUBICFLAG_HYSTART_IN_CSS 0x00000020 /* We are in Hystart++ CSS */ +#define CUBICFLAG_IN_TF 0x00000040 /* We are in TCP friendly region */ /* Kernel only bits */ #ifdef _KERNEL struct cubic { - /* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */ + /* + * CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. + * Also means the time period in seconds it takes to increase the + * congestion window size at the beginning of the current congestion + * avoidance stage to W_max. + */ int64_t K; /* Sum of RTT samples across an epoch in usecs. */ int64_t sum_rtt_usecs; - /* Size of cwnd just before cwnd was reduced in the last congestion event */ - uint64_t W_max; - /* An estimate for the congestion window in the Reno-friendly region */ - uint64_t W_est; - /* The cwnd at the beginning of the current congestion avoidance stage */ - uint64_t cwnd_epoch; - /* - * Size of cwnd at the time of setting ssthresh most recently, - * either upon exiting the first slow start, or just before cwnd - * was reduced in the last congestion event - */ - uint64_t cwnd_prior; + /* Size of cwnd (in bytes) just before cwnd was reduced in the last congestion event. */ + uint32_t W_max; + /* An estimate (in bytes) for the congestion window in the Reno-friendly region */ + uint32_t W_est; + /* An estimate (in bytes) for the congestion window in the CUBIC region */ + uint32_t W_cubic; + /* The cwnd (in bytes) at the beginning of the current congestion avoidance stage. */ + uint32_t cwnd_epoch; /* various flags */ uint32_t flags; /* Minimum observed rtt in usecs. 
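[Note, not part of the patch] Two constants drive the new cubic_get_ssthresh() above: outside the Reno-friendly region ssthresh becomes beta_cubic (0.7, CUBIC_BETA in fixed point) of the data currently in flight, and fast convergence first shrinks the recorded W_max by (1 + beta_cubic)/2 = 0.85 when the window stopped growing before reaching the previous W_max. A floating-point sketch of both steps, illustrative only:

/* ssthresh on multiplicative decrease outside the Reno-friendly region. */
static double
md_ssthresh(double pipe_bytes)
{
	return (0.7 * pipe_bytes);
}

/*
 * RFC 9438 Section 4.7 fast convergence: remember a reduced W_max if
 * the last epoch never reached the previous W_max.
 */
static double
fast_convergence(double cwnd, double prev_wmax)
{
	if (cwnd < prev_wmax)
		return (cwnd * (1.0 + 0.7) / 2.0);
	return (cwnd);
}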
*/ @@ -117,12 +119,8 @@ struct cubic { int undo_t_epoch; /* Few variables to restore the state after RTO_ERR */ int64_t undo_K; - uint64_t undo_cwnd_prior; - uint64_t undo_W_max; - uint64_t undo_W_est; - uint64_t undo_cwnd_epoch; - /* Number of congestion events experienced */ - uint64_t num_cong_events; + uint32_t undo_W_max; + uint32_t undo_cwnd_epoch; uint32_t css_baseline_minrtt; uint32_t css_current_round_minrtt; uint32_t css_lastround_minrtt; @@ -141,60 +139,103 @@ struct cubic { extern int hz; /* - * Implementation based on the formulae found in the CUBIC Internet Draft - * "draft-ietf-tcpm-cubic-04". + * Implementation based on the formulas in RFC9438. * */ -static __inline float -theoretical_cubic_k(double wmax_pkts) + +/* + * Returns K, the time period in seconds it takes to increase the congestion + * window size at the beginning of the current congestion avoidance stage to + * W_max. + */ +static inline float +theoretical_cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { double C; C = 0.4; + if (wmax_segs <= cwnd_epoch_segs) + return 0.0; - return (pow((wmax_pkts * 0.3) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT)); + /* + * Figure 2: K = ((W_max - cwnd_epoch) / C)^(1/3) + */ + return (pow((wmax_segs - cwnd_epoch_segs) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT)); } -static __inline unsigned long -theoretical_cubic_cwnd(int ticks_since_epoch, unsigned long wmax, uint32_t smss) +/* + * Returns the congestion window in segments at time t in seconds based on the + * cubic increase function, where t is the elapsed time in seconds from the + * beginning of the current congestion avoidance stage, as described in RFC9438 + * Section 4.2. + */ +static inline unsigned long +theoretical_cubic_cwnd(int ticks_elapsed, uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { - double C, wmax_pkts; + double C, t; + float K; C = 0.4; - wmax_pkts = wmax / (double)smss; + t = ticks_elapsed / (double)hz; + K = theoretical_cubic_k(wmax_segs, cwnd_epoch_segs); - return (smss * (wmax_pkts + - (C * pow(ticks_since_epoch / (double)hz - - theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0)))); + /* + * Figure 1: W_cubic(t) = C * (t - K)^3 + W_max + */ + return (C * pow(t - K / pow(2, CUBIC_SHIFT), 3.0) + wmax_segs); } -static __inline unsigned long -theoretical_reno_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, - uint32_t smss) +/* + * Returns estimated Reno congestion window in segments. + */ +static inline unsigned long +theoretical_reno_cwnd(int ticks_elapsed, int rtt_ticks, uint32_t wmax_segs) { - return ((wmax * 0.5) + ((ticks_since_epoch / (float)rtt_ticks) * smss)); + return (wmax_segs * 0.5 + ticks_elapsed / (float)rtt_ticks); } -static __inline unsigned long -theoretical_tf_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, - uint32_t smss) +/* + * Returns an estimate for the congestion window in segments in the + * Reno-friendly region -- that is, an estimate for the congestion window of + * Reno, as described in RFC9438 Section 4.3, where: + * cwnd: Current congestion window in segments. + * cwnd_prior: Size of cwnd in segments at the time of setting ssthresh most + * recently, either upon exiting the first slow start or just before + * cwnd was reduced in the last congestion event. + * W_est: An estimate for the congestion window in segments in the Reno-friendly + * region -- that is, an estimate for the congestion window of Reno. 
+ */ +static inline unsigned long +theoretical_tf_cwnd(unsigned long W_est, unsigned long segs_acked, unsigned long cwnd, + unsigned long cwnd_prior) { + float cubic_alpha, cubic_beta; - return ((wmax * 0.7) + ((3 * 0.3) / (2 - 0.3) * - (ticks_since_epoch / (float)rtt_ticks) * smss)); + /* RFC9438 Section 4.6: The parameter β_cubic SHOULD be set to 0.7. */ + cubic_beta = 0.7; + + if (W_est >= cwnd_prior) + cubic_alpha = 1.0; + else + cubic_alpha = (3.0 * (1.0 - cubic_beta)) / (1.0 + cubic_beta); + + /* + * Figure 4: W_est = W_est + α_cubic * segments_acked / cwnd + */ + return (W_est + cubic_alpha * segs_acked / cwnd); } #endif /* !_KERNEL */ /* * Compute the CUBIC K value used in the cwnd calculation, using an - * implementation of eqn 2 in the I-D. The method used - * here is adapted from Apple Computer Technical Report #KT-32. + * implementation mentioned in Figure. 2 of RFC9438. + * The method used here is adapted from Apple Computer Technical Report #KT-32. */ -static __inline int64_t -cubic_k(unsigned long wmax_pkts) +static inline int64_t +cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs) { int64_t s, K; uint16_t p; @@ -202,8 +243,13 @@ cubic_k(unsigned long wmax_pkts) K = s = 0; p = 0; - /* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */ - s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR; + /* Handle the corner case where W_max <= cwnd_epoch */ + if (wmax_segs <= cwnd_epoch_segs) { + return 0; + } + + /* (wmax - cwnd_epoch) / C with CUBIC_SHIFT worth of precision. */ + s = ((wmax_segs - cwnd_epoch_segs) << (2 * CUBIC_SHIFT)) / CUBIC_C_FACTOR; /* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. */ while (s >= 256) { @@ -224,13 +270,14 @@ cubic_k(unsigned long wmax_pkts) } /* - * Compute the new cwnd value using an implementation of eqn 1 from the I-D. + * Compute and return the new cwnd value in bytes using an implementation + * mentioned in Figure. 1 of RFC9438. * Thanks to Kip Macy for help debugging this function. * * XXXLAS: Characterise bounds for overflow. */ -static __inline unsigned long -cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) +static inline uint32_t +cubic_cwnd(int usecs_since_epoch, uint32_t wmax, uint32_t smss, int64_t K) { int64_t cwnd; @@ -249,7 +296,7 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) cwnd *= (cwnd * cwnd); /* - * C(t - K)^3 + wmax + * Figure 1: C * (t - K)^3 + wmax * The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of * CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above, * and an extra from multiplying through by CUBIC_C_FACTOR. @@ -264,46 +311,13 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) } /* - * Compute an approximation of the NewReno cwnd some number of usecs after a - * congestion event. RTT should be the average RTT estimate for the path - * measured over the previous congestion epoch and wmax is the value of cwnd at - * the last congestion event. The "TCP friendly" concept in the CUBIC I-D is - * rather tricky to understand and it turns out this function is not required. - * It is left here for reference. - * - * XXX: Not used + * Compute the "TCP friendly" cwnd by newreno in congestion avoidance state. 
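[Note, not part of the patch] A quick numeric check of cubic_k() with illustrative values: with W_max = 100 segments, cwnd_epoch = 60 segments and C = 0.4, K = ((100 - 60) / 0.4)^(1/3) = 100^(1/3), roughly 4.64 seconds; the function returns that value carried in its fixed-point representation with CUBIC_SHIFT bits of precision.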
*/ -static __inline unsigned long -reno_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax, - uint32_t smss) +static inline uint32_t +tf_cwnd(struct cc_var *ccv) { - - /* - * For NewReno, beta = 0.5, therefore: W_tcp(t) = wmax*0.5 + t/RTT - * W_tcp(t) deals with cwnd/wmax in pkts, so because our cwnd is in - * bytes, we have to multiply by smss. - */ - return (((wmax * RENO_BETA) + (((usecs_since_epoch * smss) - << CUBIC_SHIFT) / rtt_usecs)) >> CUBIC_SHIFT); -} - -/* - * Compute an approximation of the "TCP friendly" cwnd some number of usecs - * after a congestion event that is designed to yield the same average cwnd as - * NewReno while using CUBIC's beta of 0.7. RTT should be the average RTT - * estimate for the path measured over the previous congestion epoch and wmax is - * the value of cwnd at the last congestion event. - */ -static __inline unsigned long -tf_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax, - uint32_t smss) -{ - - /* Equation 4 of I-D. */ - return (((wmax * CUBIC_BETA) + - (((THREE_X_PT3 * (unsigned long)usecs_since_epoch * - (unsigned long)smss) << CUBIC_SHIFT) / (TWO_SUB_PT3 * rtt_usecs))) - >> CUBIC_SHIFT); + /* newreno is "TCP friendly" */ + return newreno_cc_cwnd_in_cong_avoid(ccv); } #endif /* _NETINET_CC_CUBIC_H_ */ diff --git a/sys/netinet/cc/cc_dctcp.c b/sys/netinet/cc/cc_dctcp.c index 374db98c5e60..757bc005edb4 100644 --- a/sys/netinet/cc/cc_dctcp.c +++ b/sys/netinet/cc/cc_dctcp.c @@ -108,6 +108,7 @@ dctcp_ack_received(struct cc_var *ccv, ccsignal_t type) { struct dctcp *dctcp_data; int bytes_acked = 0; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); dctcp_data = ccv->cc_data; @@ -125,7 +126,7 @@ dctcp_ack_received(struct cc_var *ccv, ccsignal_t type) newreno_cc_ack_received(ccv, type); if (type == CC_DUPACK) - bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + bytes_acked = min(ccv->bytes_this_ack, mss); if (type == CC_ACK) bytes_acked = ccv->bytes_this_ack; @@ -138,16 +139,16 @@ dctcp_ack_received(struct cc_var *ccv, ccsignal_t type) //XXRMS: For fluid-model DCTCP, update //cwnd here during for RTT fairness if (!dctcp_data->ece_prev - && bytes_acked > CCV(ccv, t_maxseg)) { + && bytes_acked > mss) { dctcp_data->bytes_ecn += - (bytes_acked - CCV(ccv, t_maxseg)); + (bytes_acked - mss); } else dctcp_data->bytes_ecn += bytes_acked; dctcp_data->ece_prev = 1; } else { if (dctcp_data->ece_prev - && bytes_acked > CCV(ccv, t_maxseg)) - dctcp_data->bytes_ecn += CCV(ccv, t_maxseg); + && bytes_acked > mss) + dctcp_data->bytes_ecn += mss; dctcp_data->ece_prev = 0; } dctcp_data->ece_curr = 0; @@ -201,7 +202,7 @@ dctcp_cb_init(struct cc_var *ccv, void *ptr) { struct dctcp *dctcp_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { dctcp_data = malloc(sizeof(struct dctcp), M_CC_MEM, M_NOWAIT|M_ZERO); if (dctcp_data == NULL) @@ -245,7 +246,7 @@ dctcp_cong_signal(struct cc_var *ccv, ccsignal_t type) if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) { dctcp_data = ccv->cc_data; cwin = CCV(ccv, snd_cwnd); - mss = tcp_fixed_maxseg(ccv->ccvc.tcp); + mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: @@ -293,19 +294,13 @@ dctcp_cong_signal(struct cc_var *ccv, ccsignal_t type) break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { - if (V_tcp_do_newsack) { - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - } else { - pipe = CCV(ccv, snd_max) - - CCV(ccv, snd_fack) + - CCV(ccv, sackhint.sack_bytes_rexmit); - } + pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, 
snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; dctcp_update_alpha(ccv); - dctcp_data->save_sndnxt += CCV(ccv, t_maxseg); + dctcp_data->save_sndnxt += mss; dctcp_data->num_cong_events++; break; default: diff --git a/sys/netinet/cc/cc_hd.c b/sys/netinet/cc/cc_hd.c index 82486563f97e..def1580d8ffb 100644 --- a/sys/netinet/cc/cc_hd.c +++ b/sys/netinet/cc/cc_hd.c @@ -59,6 +59,7 @@ #include <sys/limits.h> #include <sys/malloc.h> #include <sys/module.h> +#include <sys/prng.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -77,8 +78,8 @@ #include <netinet/khelp/h_ertt.h> -/* Largest possible number returned by random(). */ -#define RANDOM_MAX INT_MAX +/* Largest possible number returned by prng32(). */ +#define RANDOM_MAX UINT32_MAX static void hd_ack_received(struct cc_var *ccv, ccsignal_t ack_type); static int hd_mod_init(void); @@ -128,7 +129,7 @@ should_backoff(int qdly, int maxqdly) p = (RANDOM_MAX / 100) * V_hd_pmax; } - return (random() < p); + return (prng32() < p); } /* diff --git a/sys/netinet/cc/cc_htcp.c b/sys/netinet/cc/cc_htcp.c index 41c552a3bfa0..569495144d50 100644 --- a/sys/netinet/cc/cc_htcp.c +++ b/sys/netinet/cc/cc_htcp.c @@ -193,6 +193,7 @@ static void htcp_ack_received(struct cc_var *ccv, ccsignal_t type) { struct htcp *htcp_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); htcp_data = ccv->cc_data; htcp_record_rtt(ccv); @@ -220,7 +221,7 @@ htcp_ack_received(struct cc_var *ccv, ccsignal_t type) if (V_tcp_do_rfc3465) { /* Increment cwnd by alpha segments. */ CCV(ccv, snd_cwnd) += htcp_data->alpha * - CCV(ccv, t_maxseg); + mss; ccv->flags &= ~CCF_ABC_SENTAWND; } else /* @@ -230,8 +231,8 @@ htcp_ack_received(struct cc_var *ccv, ccsignal_t type) */ CCV(ccv, snd_cwnd) += (((htcp_data->alpha << HTCP_SHIFT) / (max(1, - CCV(ccv, snd_cwnd) / CCV(ccv, t_maxseg)))) * - CCV(ccv, t_maxseg)) >> HTCP_SHIFT; + CCV(ccv, snd_cwnd) / mss))) * + mss) >> HTCP_SHIFT; } } } @@ -253,7 +254,7 @@ htcp_cb_init(struct cc_var *ccv, void *ptr) { struct htcp *htcp_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { htcp_data = malloc(sizeof(struct htcp), M_CC_MEM, M_NOWAIT); if (htcp_data == NULL) @@ -284,7 +285,7 @@ htcp_cong_signal(struct cc_var *ccv, ccsignal_t type) uint32_t mss, pipe; htcp_data = ccv->cc_data; - mss = tcp_fixed_maxseg(ccv->ccvc.tcp); + mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: @@ -324,13 +325,7 @@ htcp_cong_signal(struct cc_var *ccv, ccsignal_t type) case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { - if (V_tcp_do_newsack) { - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - } else { - pipe = CCV(ccv, snd_max) - - CCV(ccv, snd_fack) + - CCV(ccv, sackhint.sack_bytes_rexmit); - } + pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } @@ -370,6 +365,7 @@ htcp_post_recovery(struct cc_var *ccv) { int pipe; struct htcp *htcp_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); pipe = 0; htcp_data = ccv->cc_data; @@ -379,25 +375,18 @@ htcp_post_recovery(struct cc_var *ccv) * If inflight data is less than ssthresh, set cwnd * conservatively to avoid a burst of data, as suggested in the * NewReno RFC. Otherwise, use the HTCP method. 
- * - * XXXLAS: Find a way to do this without needing curack */ - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - else - pipe = CCV(ccv, snd_max) - ccv->curack; - + pipe = tcp_compute_pipe(ccv->tp); if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd down not collape to 1 MSS under * adverse conditions. Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta * - htcp_data->prev_cwnd / CCV(ccv, t_maxseg)) - >> HTCP_SHIFT)) * CCV(ccv, t_maxseg); + htcp_data->prev_cwnd / mss) + >> HTCP_SHIFT)) * mss; } } @@ -451,7 +440,7 @@ htcp_recalc_alpha(struct cc_var *ccv) */ if (V_htcp_rtt_scaling) alpha = max(1, (min(max(HTCP_MINROWE, - (tcp_get_srtt(ccv->ccvc.tcp, TCP_TMR_GRANULARITY_TICKS) << HTCP_SHIFT) / + (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) << HTCP_SHIFT) / htcp_rtt_ref), HTCP_MAXROWE) * alpha) >> HTCP_SHIFT); @@ -502,18 +491,18 @@ htcp_record_rtt(struct cc_var *ccv) * or minrtt is currently equal to its initialised value. Ignore SRTT * until a min number of samples have been taken. */ - if ((tcp_get_srtt(ccv->ccvc.tcp, TCP_TMR_GRANULARITY_TICKS) < htcp_data->minrtt || + if ((tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) < htcp_data->minrtt || htcp_data->minrtt == TCPTV_SRTTBASE) && (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)) - htcp_data->minrtt = tcp_get_srtt(ccv->ccvc.tcp, TCP_TMR_GRANULARITY_TICKS); + htcp_data->minrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS); /* * Record the current SRTT as our maxrtt if it's the largest we've * seen. Ignore SRTT until a min number of samples have been taken. */ - if (tcp_get_srtt(ccv->ccvc.tcp, TCP_TMR_GRANULARITY_TICKS) > htcp_data->maxrtt + if (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) > htcp_data->maxrtt && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES) - htcp_data->maxrtt = tcp_get_srtt(ccv->ccvc.tcp, TCP_TMR_GRANULARITY_TICKS); + htcp_data->maxrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS); } /* diff --git a/sys/netinet/cc/cc_module.h b/sys/netinet/cc/cc_module.h index e01d7a124ad4..606b2b66e7e5 100644 --- a/sys/netinet/cc/cc_module.h +++ b/sys/netinet/cc/cc_module.h @@ -43,18 +43,7 @@ #ifndef _NETINET_CC_MODULE_H_ #define _NETINET_CC_MODULE_H_ -/* - * Allows a CC algorithm to manipulate a commonly named CC variable regardless - * of the transport protocol and associated C struct. - * XXXLAS: Out of action until the work to support SCTP is done. - * -#define CCV(ccv, what) \ -(*( \ - (ccv)->type == IPPROTO_TCP ? 
&(ccv)->ccvc.tcp->what : \ - &(ccv)->ccvc.sctp->what \ -)) - */ -#define CCV(ccv, what) (ccv)->ccvc.tcp->what +#define CCV(ccv, what) (ccv)->tp->what #define DECLARE_CC_MODULE(ccname, ccalgo) \ static moduledata_t cc_##ccname = { \ diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c index aa20e2c64f7d..de7b878152b0 100644 --- a/sys/netinet/cc/cc_newreno.c +++ b/sys/netinet/cc/cc_newreno.c @@ -135,7 +135,7 @@ newreno_log_hystart_event(struct cc_var *ccv, struct newreno *nreno, uint8_t mod if (hystart_bblogs == 0) return; - tp = ccv->ccvc.tcp; + tp = ccv->tp; if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; @@ -175,7 +175,7 @@ newreno_cb_init(struct cc_var *ccv, void *ptr) { struct newreno *nreno; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { ccv->cc_data = malloc(sizeof(struct newreno), M_CC_MEM, M_NOWAIT); if (ccv->cc_data == NULL) @@ -215,12 +215,13 @@ static void newreno_ack_received(struct cc_var *ccv, ccsignal_t type) { struct newreno *nreno; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); nreno = ccv->cc_data; if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int incr = mss; /* * Regular in-order ACK, open the congestion window. @@ -324,10 +325,9 @@ newreno_ack_received(struct cc_var *ccv, ccsignal_t type) } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); + ccv->nsegs * abc_val * mss); else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); /* Only if Hystart is enabled will the flag get set */ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { @@ -369,9 +369,9 @@ newreno_cong_signal(struct cc_var *ccv, ccsignal_t type) uint32_t beta, beta_ecn, cwin, factor, mss, pipe; cwin = CCV(ccv, snd_cwnd); - mss = tcp_fixed_maxseg(ccv->ccvc.tcp); + mss = tcp_fixed_maxseg(ccv->tp); nreno = ccv->cc_data; - beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;; + beta = (nreno == NULL) ? V_newreno_beta : nreno->beta; beta_ecn = (nreno == NULL) ? 
V_newreno_beta_ecn : nreno->beta_ecn; /* * Note that we only change the backoff for ECN if the @@ -428,13 +428,7 @@ newreno_cong_signal(struct cc_var *ccv, ccsignal_t type) break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { - if (V_tcp_do_newsack) { - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - } else { - pipe = CCV(ccv, snd_max) - - CCV(ccv, snd_fack) + - CCV(ccv, sackhint.sack_bytes_rexmit); - } + pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, ((uint64_t)min(CCV(ccv, snd_wnd), pipe) * (uint64_t)factor) / @@ -456,7 +450,7 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) if (sopt->sopt_valsize != sizeof(struct cc_newreno_opts)) return (EMSGSIZE); - if (CC_ALGO(ccv->ccvc.tcp) != &newreno_cc_algo) + if (CC_ALGO(ccv->tp) != &newreno_cc_algo) return (ENOPROTOOPT); nreno = (struct newreno *)ccv->cc_data; diff --git a/sys/netinet/cc/cc_vegas.c b/sys/netinet/cc/cc_vegas.c index ecd42c1a0f53..2e24a717f869 100644 --- a/sys/netinet/cc/cc_vegas.c +++ b/sys/netinet/cc/cc_vegas.c @@ -129,6 +129,7 @@ vegas_ack_received(struct cc_var *ccv, ccsignal_t ack_type) struct ertt *e_t; struct vegas *vegas_data; long actual_tx_rate, expected_tx_rate, ndiff; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id); vegas_data = ccv->cc_data; @@ -139,7 +140,7 @@ vegas_ack_received(struct cc_var *ccv, ccsignal_t ack_type) actual_tx_rate = e_t->bytes_tx_in_marked_rtt / e_t->markedpkt_rtt; ndiff = (expected_tx_rate - actual_tx_rate) * - e_t->minrtt / CCV(ccv, t_maxseg); + e_t->minrtt / mss; if (ndiff < V_vegas_alpha) { if (CCV(ccv, snd_cwnd) <= @@ -150,8 +151,7 @@ vegas_ack_received(struct cc_var *ccv, ccsignal_t ack_type) } else { vegas_data->slow_start_toggle = 0; CCV(ccv, snd_cwnd) = - min(CCV(ccv, snd_cwnd) + - CCV(ccv, t_maxseg), + min(CCV(ccv, snd_cwnd) + mss, TCP_MAXWIN << CCV(ccv, snd_scale)); } } else if (ndiff > V_vegas_beta) { @@ -184,7 +184,7 @@ vegas_cb_init(struct cc_var *ccv, void *ptr) { struct vegas *vegas_data; - INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); + INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { vegas_data = malloc(sizeof(struct vegas), M_CC_MEM, M_NOWAIT); if (vegas_data == NULL) @@ -207,6 +207,7 @@ vegas_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) { struct vegas *vegas_data; int presignalrecov; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); vegas_data = ccv->cc_data; @@ -218,8 +219,8 @@ vegas_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) switch((int)signal_type) { case CC_VEGAS_RATE: if (!IN_RECOVERY(CCV(ccv, t_flags))) { - CCV(ccv, snd_cwnd) = max(2 * CCV(ccv, t_maxseg), - CCV(ccv, snd_cwnd) - CCV(ccv, t_maxseg)); + CCV(ccv, snd_cwnd) = max(2 * mss, + CCV(ccv, snd_cwnd) - mss); if (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) /* Exit slow start. 
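[Note, not part of the patch] The Vegas decision visible above converts the gap between the expected and actual transmit rates into whole segments before comparing it with the alpha and beta thresholds. A minimal sketch of that decision, with hypothetical parameter names:

/*
 * Rates are in bytes per time unit and base_rtt is the minimum RTT in
 * the same unit. Returns +1 to grow cwnd by one MSS, -1 to shrink it,
 * 0 to leave it alone.
 */
static int
vegas_adjust(long expected_rate, long actual_rate, long base_rtt,
    long mss, long alpha, long beta)
{
	long ndiff = (expected_rate - actual_rate) * base_rtt / mss;

	if (ndiff < alpha)
		return (1);
	if (ndiff > beta)
		return (-1);
	return (0);
}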
*/ CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); diff --git a/sys/netinet/dccp.h b/sys/netinet/dccp.h index 4fb6a0d2ab3e..da83a1b06861 100644 --- a/sys/netinet/dccp.h +++ b/sys/netinet/dccp.h @@ -64,7 +64,7 @@ struct dccphdr { uint8_t seq[6]; } longseq; } d_seqno; -}; +} __packed; #define d_seqno_short d_seqno.shortseq; #define d_seqno_long d_seqno.longseq.seq; diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index 4368fd2a0fcf..7845b682f3e4 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -63,6 +63,8 @@ #ifndef _NETINET_ICMP6_H_ #define _NETINET_ICMP6_H_ +#include <sys/stdint.h> + #define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct icmp6_hdr) */ @@ -307,7 +309,8 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ #define ND_OPT_RDNSS 25 /* RFC 6106 */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ -#define ND_OPT_MAX 31 +#define ND_OPT_PREF64 38 /* RFC 8781 */ +#define ND_OPT_MAX 38 struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; @@ -373,6 +376,14 @@ struct nd_opt_dnssl { /* DNSSL option (RFC 6106) */ /* followed by list of DNS search domains */ } __packed; +struct nd_opt_pref64 { /* PREF64 option (RFC 8781) */ + uint8_t nd_opt_pref64_type; + uint8_t nd_opt_pref64_len; + /* bits 0-12 are the SL, bits 13-15 are the PLC */ + uint16_t nd_opt_pref64_sl_plc; + char nd_opt_prefix[12]; +} __packed; + /* * icmp6 namelookup */ @@ -641,7 +652,7 @@ VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat); #define ICMP6STAT_INC2(name, type) \ do { \ MIB_SDT_PROBE2(icmp6, count, name, 1, type); \ - VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, 1); \ + VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name[type], 1); \ } while (0) /* diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index b1f2b0ebf911..d6b75e482e35 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -104,11 +104,10 @@ extern int badport_bandlim(int); #define BANDLIM_ICMP_UNREACH 0 #define BANDLIM_ICMP_ECHO 1 #define BANDLIM_ICMP_TSTAMP 2 -#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ -#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ -#define BANDLIM_ICMP6_UNREACH 5 -#define BANDLIM_SCTP_OOTB 6 -#define BANDLIM_MAX 7 +#define BANDLIM_TCP_RST 3 +#define BANDLIM_ICMP6_UNREACH 4 +#define BANDLIM_SCTP_OOTB 5 +#define BANDLIM_MAX 6 #endif #endif diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 543e6a3922fb..dc6ef343662d 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -56,6 +56,7 @@ #include <net/if_dl.h> #include <net/if_private.h> #include <net/if_types.h> +#include <net/if_bridgevar.h> #include <net/netisr.h> #include <net/ethernet.h> #include <net/route.h> @@ -155,11 +156,12 @@ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, */ #define MAX_GARP_RETRANSMITS 16 static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS); -static int garp_rexmit_count = 0; /* GARP retransmission setting. */ +VNET_DEFINE_STATIC(int, garp_rexmit_count) = 0; /* GARP retransmission setting. 
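[Note, not part of the patch] The new nd_opt_pref64 structure above packs, per RFC 8781, a 13-bit Scaled Lifetime (the valid lifetime divided by 8 seconds) and a 3-bit Prefix Length Code into one 16-bit field. A sketch of unpacking it on receipt, with illustrative names:

#include <stdint.h>
#include <arpa/inet.h>

static void
pref64_unpack(uint16_t sl_plc_net, uint32_t *lifetime_secs, uint8_t *plc)
{
	uint16_t v = ntohs(sl_plc_net);

	*lifetime_secs = (uint32_t)(v >> 3) * 8;	/* scaled lifetime, units of 8 s */
	*plc = v & 0x07;				/* prefix length code */
}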
*/ +#define V_garp_rexmit_count VNET(garp_rexmit_count) SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, - &garp_rexmit_count, 0, sysctl_garp_rexmit, "I", + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE|CTLFLAG_VNET, + &VNET_NAME(garp_rexmit_count), 0, sysctl_garp_rexmit, "I", "Number of times to retransmit GARP packets;" " 0 to disable, maximum of 16"); @@ -831,7 +833,7 @@ in_arpinput(struct mbuf *m) * when we have clusters of interfaces). */ CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { - if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || + if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || @@ -841,7 +843,7 @@ in_arpinput(struct mbuf *m) } } CK_LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) - if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || + if (((bridged && bridge_same_p(ia->ia_ifp->if_bridge, ifp->if_bridge)) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); @@ -849,7 +851,7 @@ in_arpinput(struct mbuf *m) } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ - (ia->ia_ifp->if_bridge == ifp->if_softc && \ + (bridge_get_softc_p(ia->ia_ifp) == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* @@ -1352,6 +1354,7 @@ sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS) static void garp_rexmit(void *arg) { + struct epoch_tracker et; struct in_ifaddr *ia = arg; if (callout_pending(&ia->ia_garp_timer) || @@ -1361,6 +1364,7 @@ garp_rexmit(void *arg) return; } + NET_EPOCH_ENTER(et); CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet); /* @@ -1377,7 +1381,7 @@ garp_rexmit(void *arg) * the callout to retransmit another GARP packet. 
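[Note, not part of the patch] garp_rexmit_count is converted above into a per-VNET variable so each network stack instance (jail/vnet) carries its own setting: the storage is declared with VNET_DEFINE_STATIC, always accessed through a V_ accessor macro, and exported through a sysctl flagged CTLFLAG_VNET. The general pattern looks like the sketch below, an illustrative knob rather than anything in the commit; the usual kernel headers (sys/param.h, sys/kernel.h, sys/sysctl.h, net/vnet.h) are assumed:

/* Per-vnet storage plus the conventional V_ accessor. */
VNET_DEFINE_STATIC(int, example_knob) = 0;
#define	V_example_knob	VNET(example_knob)

/* CTLFLAG_VNET makes the sysctl resolve to the caller's vnet instance. */
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, example_knob,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(example_knob), 0,
    "Illustrative per-vnet tunable");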
*/ ++ia->ia_garp_count; - if (ia->ia_garp_count >= garp_rexmit_count) { + if (ia->ia_garp_count >= V_garp_rexmit_count) { ifa_free(&ia->ia_ifa); } else { int rescheduled; @@ -1392,6 +1396,7 @@ garp_rexmit(void *arg) } CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); } /* @@ -1444,7 +1449,7 @@ arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) NET_EPOCH_ENTER(et); arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); NET_EPOCH_EXIT(et); - if (garp_rexmit_count > 0) { + if (V_garp_rexmit_count > 0) { garp_timer_start(ifa); } @@ -1506,7 +1511,7 @@ vnet_arp_init(void) #endif } VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, - vnet_arp_init, 0); + vnet_arp_init, NULL); #ifdef VIMAGE /* diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index 454668c2aadc..299f3c2e02bb 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -402,32 +402,43 @@ out: static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { + struct epoch_tracker et; int error; int new; + struct igmp_ifsoftc *igi; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); - IGMP_LOCK(); - new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) - goto out_locked; + return (error); - if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { - error = EINVAL; - goto out_locked; - } + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) + return (EINVAL); + + IN_MULTI_LIST_LOCK(); + IGMP_LOCK(); + NET_EPOCH_ENTER(et); - CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", - V_igmp_default_version, new); + if (V_igmp_default_version != new) { + CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", + V_igmp_default_version, new); - V_igmp_default_version = new; + V_igmp_default_version = new; -out_locked: + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (igi->igi_version > V_igmp_default_version){ + igmp_set_version(igi, V_igmp_default_version); + } + } + } + + NET_EPOCH_EXIT(et); + IN_MULTI_LIST_UNLOCK(); IGMP_UNLOCK(); return (error); } @@ -1471,6 +1482,7 @@ igmp_input(struct mbuf **mp, int *offp, int proto) m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; + M_ASSERTMAPPED(m); IGMPSTAT_INC(igps_rcv_total); diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 940b197d9e95..963449d4b4b1 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -57,6 +57,7 @@ #include <net/if_llatbl.h> #include <net/if_private.h> #include <net/if_types.h> +#include <net/if_bridgevar.h> #include <net/route.h> #include <net/route/nhop.h> #include <net/route/route_ctl.h> @@ -102,13 +103,13 @@ VNET_DEFINE(bool, ip_allow_net240) = false; #define V_ip_allow_net240 VNET(ip_allow_net240) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net240, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net240), 0, - "Allow use of Experimental addresses, aka Class E (240/4)"); + "Allow forwarding of and ICMP response to Experimental addresses, aka Class E (240/4)"); /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-240 */ VNET_DEFINE(bool, ip_allow_net0) = false; SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net0, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net0), 0, - "Allow use of addresses in network 0/8"); + "Allow forwarding of and ICMP response to addresses in network 0/8"); /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-0 */ VNET_DEFINE(uint32_t, in_loopback_mask) = IN_LOOPBACK_MASK_DFLT; @@ -127,10 +128,10 @@ static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* - * Return 1 if an internet address is for 
a ``local'' host + * Return true if an internet address is for a ``local'' host * (one to which we have a connection). */ -int +bool in_localaddr(struct in_addr in) { u_long i = ntohl(in.s_addr); @@ -140,14 +141,14 @@ in_localaddr(struct in_addr in) CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) - return (1); + return (true); } - return (0); + return (false); } /* - * Return 1 if an internet address is for the local host and configured + * Return true if an internet address is for the local host and configured * on one of its interfaces. */ bool @@ -185,9 +186,9 @@ in_localip_fib(struct in_addr in, uint16_t fib) } /* - * Return 1 if an internet address is configured on an interface. + * Return true if an internet address is configured on an interface. */ -int +bool in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; @@ -200,10 +201,10 @@ in_ifhasaddr(struct ifnet *ifp, struct in_addr in) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) - return (1); + return (true); } - return (0); + return (false); } /* @@ -271,18 +272,19 @@ in_findlocal(uint32_t fibnum, bool loopback_ok) * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ -int +bool in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); - if (IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_LOOPBACK(i)) - return (0); + if (IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_LOOPBACK(i) || + in_nullhost(in)) + return (false); if (IN_EXPERIMENTAL(i) && !V_ip_allow_net240) - return (0); + return (false); if (IN_ZERONET(i) && !V_ip_allow_net0) - return (0); - return (1); + return (false); + return (true); } /* @@ -442,6 +444,27 @@ in_control_ioctl(u_long cmd, void *data, struct ifnet *ifp, } int +in_mask2len(struct in_addr *mask) +{ + int x, y; + u_char *p; + + p = (u_char *)mask; + for (x = 0; x < sizeof(*mask); x++) { + if (p[x] != 0xff) + break; + } + y = 0; + if (x < sizeof(*mask)) { + for (y = 0; y < 8; y++) { + if ((p[x] & (0x80 >> y)) == 0) + break; + } + } + return (x * 8 + y); +} + +int in_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp, struct thread *td) { @@ -497,6 +520,13 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred #endif /* + * Check if bridge wants to allow adding addrs to member interfaces. + */ + if (ifp->if_bridge && bridge_member_ifaddrs_p && + !bridge_member_ifaddrs_p()) + return (EINVAL); + + /* * See whether address already exist. */ iaIsFirst = true; @@ -1222,7 +1252,7 @@ in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; - struct ifaliasreq ifr; + struct ifreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { @@ -1237,9 +1267,7 @@ in_ifscrub_all(void) * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); - ifr.ifra_addr = *ifa->ifa_addr; - if (ifa->ifa_dstaddr) - ifr.ifra_broadaddr = *ifa->ifa_dstaddr; + ifr.ifr_addr = *ifa->ifa_addr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } @@ -1250,7 +1278,7 @@ in_ifscrub_all(void) IFNET_RUNLOCK(); } -int +bool in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { @@ -1259,7 +1287,8 @@ in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) * Optionally check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. 
*/ - (V_broadcast_lowest && ia->ia_subnetmask != IN_RFC3021_MASK && + __predict_false(V_broadcast_lowest && + ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These @@ -1270,33 +1299,28 @@ in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) } /* - * Return 1 if the address might be a local broadcast address. + * Return true if the address might be a local broadcast address. */ -int -in_broadcast(struct in_addr in, struct ifnet *ifp) +bool +in_ifnet_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; - int found; NET_EPOCH_ASSERT(); - if (in.s_addr == INADDR_BROADCAST || - in.s_addr == INADDR_ANY) - return (1); + if (in_broadcast(in)) + return (true); if ((ifp->if_flags & IFF_BROADCAST) == 0) - return (0); - found = 0; + return (false); /* * Look through the list of addresses for a match * with a broadcast address. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && - in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { - found = 1; - break; - } - return (found); + in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) + return (true); + return (false); } /* @@ -1473,9 +1497,6 @@ in_lltable_new(struct in_addr addr4, u_int flags) return (&lle->base); } -#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ - ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) - static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) diff --git a/sys/netinet/in.h b/sys/netinet/in.h index f4fc41178399..3f2c388548ec 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -672,13 +672,13 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, struct ifnet; struct mbuf; /* forward declarations for Standard C */ struct in_ifaddr; -int in_broadcast(struct in_addr, struct ifnet *); -int in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *); -int in_canforward(struct in_addr); -int in_localaddr(struct in_addr); +bool in_ifnet_broadcast(struct in_addr, struct ifnet *); +bool in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *); +bool in_canforward(struct in_addr); +bool in_localaddr(struct in_addr); bool in_localip(struct in_addr); bool in_localip_fib(struct in_addr, uint16_t); -int in_ifhasaddr(struct ifnet *, struct in_addr); +bool in_ifhasaddr(struct ifnet *, struct in_addr); struct in_ifaddr *in_findlocal(uint32_t, bool); int inet_aton(const char *, struct in_addr *); /* in libkern */ char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ @@ -686,6 +686,13 @@ char *inet_ntop(int, const void *, char *, socklen_t); /* in libkern */ int inet_pton(int af, const char *, void *); /* in libkern */ void in_ifdetach(struct ifnet *); +static inline bool +in_broadcast(struct in_addr in) +{ + return (in.s_addr == __htonl(INADDR_BROADCAST) || + in.s_addr == __htonl(INADDR_ANY)); +} + #define in_hosteq(s, t) ((s).s_addr == (t).s_addr) #define in_nullhost(x) ((x).s_addr == INADDR_ANY) #define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP)) diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c index e7eede53ea51..b889131b544b 100644 --- a/sys/netinet/in_fib_dxr.c +++ b/sys/netinet/in_fib_dxr.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2012-2022 Marko Zec + * Copyright (c) 2012-2024 Marko Zec * Copyright (c) 2005, 2018 University of Zagreb * Copyright (c) 2005 International Computer Science Institute * @@ -68,9 +68,6 @@ CTASSERT(DXR_TRIE_BITS >= 16 
&& DXR_TRIE_BITS <= 24); -/* DXR2: two-stage primary trie, instead of a single direct lookup table */ -#define DXR2 - #if DXR_TRIE_BITS > 16 #define DXR_D 16 #else @@ -317,7 +314,6 @@ range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst) ntohl(key.addr4.s_addr))]); \ } -#ifdef DXR2 #if DXR_TRIE_BITS > 16 DXR_LOOKUP_DEFINE(16) #endif @@ -328,23 +324,16 @@ DXR_LOOKUP_DEFINE(12) DXR_LOOKUP_DEFINE(11) DXR_LOOKUP_DEFINE(10) DXR_LOOKUP_DEFINE(9) -#endif /* DXR2 */ static int inline dxr_lookup(struct dxr *dxr, uint32_t dst) { struct direct_entry de; -#ifdef DXR2 uint16_t *dt = dxr->d; struct direct_entry *xt = dxr->x; de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) + ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)]; -#else /* !DXR2 */ - struct direct_entry *dt = dxr->d; - - de = dt[dst >> DXR_RANGE_SHIFT]; -#endif /* !DXR2 */ if (__predict_true(de.fragments == FRAGS_MARK_HIT)) return (de.base); return (range_lookup(dxr->r, de, dst)); @@ -474,8 +463,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) cdp->cd_max_size = size; cdp->cd_base = fdesc->base; LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le); - KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top, - ("dxr: %s %d", __FUNCTION__, __LINE__)); + MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top); } cdp->cd_hash = hash; @@ -497,8 +485,11 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) da->range_tbl = realloc(da->range_tbl, sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); - if (da->range_tbl == NULL) + if (da->range_tbl == NULL) { + FIB_PRINTF(LOG_NOTICE, da->fd, + "Unable to allocate DXR range table"); return (1); + } } return (0); @@ -522,7 +513,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) sizeof(struct range_entry_long) * size) == 0) break; - KASSERT(cdp != NULL, ("dxr: dangling chunk")); + MPASS(cdp != NULL); if (--cdp->cd_refcnt > 0) return; @@ -533,8 +524,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) /* Attempt to merge with the preceding chunk, if empty */ cdp2 = LIST_NEXT(cdp, cd_all_le); if (cdp2 != NULL && cdp2->cd_cur_size == 0) { - KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base, - ("dxr: %s %d", __FUNCTION__, __LINE__)); + MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base); LIST_REMOVE(cdp, cd_all_le); LIST_REMOVE(cdp2, cd_hash_le); cdp2->cd_max_size += cdp->cd_max_size; @@ -545,8 +535,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) /* Attempt to merge with the subsequent chunk, if empty */ cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le); if (cdp2 != NULL && cdp2->cd_cur_size == 0) { - KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base, - ("dxr: %s %d", __FUNCTION__, __LINE__)); + MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base); LIST_REMOVE(cdp, cd_all_le); LIST_REMOVE(cdp2, cd_hash_le); cdp2->cd_max_size += cdp->cd_max_size; @@ -557,8 +546,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) { /* Free the chunk on the top of the range heap, trim the heap */ - KASSERT(cdp == LIST_FIRST(&da->all_chunks), - ("dxr: %s %d", __FUNCTION__, __LINE__)); + MPASS(cdp == LIST_FIRST(&da->all_chunks)); da->rtbl_top -= cdp->cd_max_size; da->unused_chunks_size -= cdp->cd_max_size; LIST_REMOVE(cdp, cd_all_le); @@ -572,7 +560,6 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le); } -#ifdef DXR2 static uint32_t trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index) { @@ -632,8 +619,11 @@ trie_ref(struct dxr_aux 
*da, uint32_t index) da->xtbl_size += XTBL_SIZE_INCR; da->x_tbl = realloc(da->x_tbl, sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); - if (da->x_tbl == NULL) + if (da->x_tbl == NULL) { + FIB_PRINTF(LOG_NOTICE, da->fd, + "Unable to allocate DXR extension table"); return (-1); + } } return(tp->td_index); } @@ -668,7 +658,6 @@ trie_unref(struct dxr_aux *da, uint32_t index) } } while (tp != NULL); } -#endif static void heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen, @@ -862,21 +851,23 @@ dxr_build(struct dxr *dxr) uint32_t r_size, dxr_tot_size; uint32_t i, m, range_rebuild = 0; uint32_t range_frag; -#ifdef DXR2 struct trie_desc *tp; uint32_t d_tbl_size, dxr_x, d_size, x_size; uint32_t ti, trie_rebuild = 0, prev_size = 0; uint32_t trie_frag; -#endif - KASSERT(dxr->d == NULL, ("dxr: d not free")); + MPASS(dxr->d == NULL); if (da == NULL) { da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT); - if (da == NULL) + if (da == NULL) { + FIB_PRINTF(LOG_NOTICE, dxr->fd, + "Unable to allocate DXR aux struct"); return; + } dxr->aux = da; da->fibnum = dxr->fibnum; + da->fd = dxr->fd; da->refcnt = 1; LIST_INIT(&da->all_chunks); LIST_INIT(&da->all_trie); @@ -894,20 +885,23 @@ dxr_build(struct dxr *dxr) if (da->range_tbl == NULL) { da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); - if (da->range_tbl == NULL) + if (da->range_tbl == NULL) { + FIB_PRINTF(LOG_NOTICE, da->fd, + "Unable to allocate DXR range table"); return; + } range_rebuild = 1; } -#ifdef DXR2 if (da->x_tbl == NULL) { da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); - if (da->x_tbl == NULL) + if (da->x_tbl == NULL) { + FIB_PRINTF(LOG_NOTICE, da->fd, + "Unable to allocate DXR extension table"); return; + } trie_rebuild = 1; } -#endif - da->fd = dxr->fd; microuptime(&t0); @@ -959,7 +953,6 @@ range_build: r_size = sizeof(*da->range_tbl) * da->rtbl_top; microuptime(&t1); -#ifdef DXR2 if (range_rebuild || abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) trie_rebuild = 1; @@ -1033,15 +1026,13 @@ dxr2_try_squeeze: goto dxr2_try_squeeze; } microuptime(&t2); -#else /* !DXR2 */ - dxr_tot_size = sizeof(da->direct_tbl) + r_size; - t2 = t1; -#endif dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT); - if (dxr->d == NULL) + if (dxr->d == NULL) { + FIB_PRINTF(LOG_NOTICE, da->fd, + "Unable to allocate DXR lookup table"); return; -#ifdef DXR2 + } memcpy(dxr->d, da->d_tbl, d_size); dxr->x = ((char *) dxr->d) + d_size; memcpy(dxr->x, da->x_tbl, x_size); @@ -1049,10 +1040,6 @@ dxr2_try_squeeze: dxr->d_shift = 32 - da->d_bits; dxr->x_shift = dxr_x; dxr->x_mask = 0xffffffffU >> (32 - dxr_x); -#else /* !DXR2 */ - memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl)); - dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl); -#endif memcpy(dxr->r, da->range_tbl, r_size); if (da->updates_low <= da->updates_high) @@ -1062,43 +1049,31 @@ dxr2_try_squeeze: da->updates_high = 0; microuptime(&t3); -#ifdef DXR2 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)", da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops); -#else - FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)", - DXR_D, rinfo.num_prefixes, rinfo.num_nhops); -#endif i = dxr_tot_size * 100; if (rinfo.num_prefixes) i /= rinfo.num_prefixes; FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, i / 100, i % 100); -#ifdef DXR2 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% trie, %d.%02d%% range 
fragmentation", trie_frag / 100, trie_frag % 100, range_frag / 100, range_frag % 100); -#else - FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation", - range_frag / 100, range_frag % 100); -#endif i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); -#ifdef DXR2 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms", trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); -#endif i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", i / 1000, i % 1000); } /* - * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure. + * Glue functions for attaching to the FIB_ALGO infrastructure. */ static struct nhop_object * @@ -1118,11 +1093,15 @@ dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data) struct dxr *dxr; dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT); - if (dxr == NULL) + if (dxr == NULL) { + FIB_PRINTF(LOG_NOTICE, fd, + "Unable to allocate DXR container struct"); return (FLM_REBUILD); + } /* Check whether we may reuse the old auxiliary structures */ - if (old_dxr != NULL && old_dxr->aux != NULL) { + if (old_dxr != NULL && old_dxr->aux != NULL && + old_dxr->aux->fd == fd) { da = old_dxr->aux; atomic_add_int(&da->refcnt, 1); } @@ -1140,14 +1119,11 @@ static void dxr_destroy(void *data) { struct dxr *dxr = data; - struct dxr_aux *da; + struct dxr_aux *da = dxr->aux; struct chunk_desc *cdp; struct trie_desc *tp; - if (dxr->d != NULL) - free(dxr->d, M_DXRLPM); - - da = dxr->aux; + free(dxr->d, M_DXRLPM); free(dxr, M_DXRAUX); if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1) @@ -1179,7 +1155,6 @@ static void * choose_lookup_fn(struct dxr_aux *da) { -#ifdef DXR2 switch (da->d_bits) { #if DXR_TRIE_BITS > 16 case 16: @@ -1200,7 +1175,6 @@ choose_lookup_fn(struct dxr_aux *da) case 9: return (dxr_fib_lookup_9); } -#endif /* DXR2 */ return (dxr_fib_lookup); } @@ -1213,17 +1187,12 @@ dxr_dump_end(void *data, struct fib_dp *dp) dxr_build(dxr); da = dxr->aux; - if (da == NULL) + if (da == NULL || dxr->d == NULL) return (FLM_REBUILD); - /* Structural limit exceeded, hard error */ if (da->rtbl_top >= BASE_MAX) return (FLM_ERROR); - /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ - if (dxr->d == NULL) - return (FLM_REBUILD); - dp->f = choose_lookup_fn(da); dp->arg = dxr; @@ -1260,13 +1229,14 @@ dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, int update_delta = 0; #endif - KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__)); - KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__)); - KASSERT(q->count < q->size, ("%s: q->count %d q->size %d", - __FUNCTION__, q->count, q->size)); + MPASS(data != NULL); + MPASS(q != NULL); + MPASS(q->count < q->size); da = dxr->aux; - KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__)); + MPASS(da != NULL); + MPASS(da->fd == dxr->fd); + MPASS(da->refcnt > 0); FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count); for (ui = 0; ui < q->count; ui++) { @@ -1299,8 +1269,7 @@ dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, #ifdef INVARIANTS fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); - KASSERT(da->prefixes + update_delta == rinfo.num_prefixes, - ("%s: update count mismatch", __FUNCTION__)); + MPASS(da->prefixes + update_delta == rinfo.num_prefixes); #endif res = dxr_init(0, dxr->fd, data, 
(void **) &new_dxr); @@ -1315,7 +1284,6 @@ dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, return (FLM_ERROR); } - /* A malloc(,, M_NOWAIT) failed somewhere, retry later */ if (new_dxr->d == NULL) { dxr_destroy(new_dxr); return (FLM_REBUILD); @@ -1329,6 +1297,7 @@ dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, return (FLM_SUCCESS); } + FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed"); dxr_destroy(new_dxr); return (FLM_REBUILD); } diff --git a/sys/netinet/in_jail.c b/sys/netinet/in_jail.c index 9b6b8f670df1..7b2120f575ca 100644 --- a/sys/netinet/in_jail.c +++ b/sys/netinet/in_jail.c @@ -97,13 +97,7 @@ prison_qcmp_v4(const void *ip1, const void *ip2) bool prison_valid_v4(const void *ip) { - in_addr_t ia = ((const struct in_addr *)ip)->s_addr; - - /* - * We do not have to care about byte order for these - * checks so we will do them in NBO. - */ - return (ia != INADDR_ANY && ia != INADDR_BROADCAST); + return (!in_broadcast(*(const struct in_addr *)ip)); } /* diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c index 2a53b11c3be2..de2a98ce541c 100644 --- a/sys/netinet/in_kdtrace.c +++ b/sys/netinet/in_kdtrace.c @@ -36,7 +36,7 @@ SDT_PROVIDER_DEFINE(tcp); SDT_PROVIDER_DEFINE(udp); SDT_PROVIDER_DEFINE(udplite); -#ifndef KDTRACE_NO_MIB_SDT +#ifdef KDTRACE_MIB_SDT #define MIB_PROBE_IP(name) \ SDT_PROBE_DEFINE1(mib, ip, count, name, \ "int") @@ -286,6 +286,8 @@ MIB_PROBE_TCP(tcps_sc_unreach); MIB_PROBE_TCP(tcps_sc_zonefail); MIB_PROBE_TCP(tcps_sc_sendcookie); MIB_PROBE_TCP(tcps_sc_recvcookie); +MIB_PROBE_TCP(tcps_sc_spurcookie); +MIB_PROBE_TCP(tcps_sc_failcookie); MIB_PROBE_TCP(tcps_hc_added); MIB_PROBE_TCP(tcps_hc_bucketoverflow); @@ -294,6 +296,7 @@ MIB_PROBE_TCP(tcps_finwait2_drops); MIB_PROBE_TCP(tcps_sack_recovery_episode); MIB_PROBE_TCP(tcps_sack_rexmits); +MIB_PROBE_TCP(tcps_sack_rexmits_tso); MIB_PROBE_TCP(tcps_sack_rexmit_bytes); MIB_PROBE_TCP(tcps_sack_rcv_blocks); MIB_PROBE_TCP(tcps_sack_send_blocks); @@ -338,6 +341,101 @@ MIB_PROBE_TCP(tcps_ecn_sndect1); MIB_PROBE_TCP(tcps_tlpresends); MIB_PROBE_TCP(tcps_tlpresend_bytes); +MIB_PROBE_TCP(tcps_rcvghostack); +MIB_PROBE_TCP(tcps_rcvacktooold); + +#define MIB_PROBE_IPSEC(name) SDT_PROBE_DEFINE1(mib, ipsec, count, name, "int") + +MIB_PROBE_IPSEC(ips_in_polvio); +MIB_PROBE_IPSEC(ips_in_nomem); +MIB_PROBE_IPSEC(ips_in_inval); +MIB_PROBE_IPSEC(ips_out_polvio); +MIB_PROBE_IPSEC(ips_out_nosa); +MIB_PROBE_IPSEC(ips_out_nomem); +MIB_PROBE_IPSEC(ips_out_noroute); +MIB_PROBE_IPSEC(ips_out_inval); +MIB_PROBE_IPSEC(ips_out_bundlesa); + +MIB_PROBE_IPSEC(ips_spdcache_hits); +MIB_PROBE_IPSEC(ips_spdcache_misses); + +MIB_PROBE_IPSEC(ips_clcopied); +MIB_PROBE_IPSEC(ips_mbinserted); +MIB_PROBE_IPSEC(ips_input_front); +MIB_PROBE_IPSEC(ips_input_middle); +MIB_PROBE_IPSEC(ips_input_end); + +#define MIB_PROBE_ESP(name) SDT_PROBE_DEFINE1(mib, esp, count, name, "int") +#define MIB_PROBE2_ESP(name) SDT_PROBE_DEFINE2(mib, esp, count, name, "int", "int") + +MIB_PROBE_ESP(esps_hdrops); +MIB_PROBE_ESP(esps_nopf); +MIB_PROBE_ESP(esps_notdb); +MIB_PROBE_ESP(esps_badkcr); +MIB_PROBE_ESP(esps_qfull); +MIB_PROBE_ESP(esps_noxform); +MIB_PROBE_ESP(esps_badilen); +MIB_PROBE_ESP(esps_wrap); +MIB_PROBE_ESP(esps_badenc); +MIB_PROBE_ESP(esps_badauth); +MIB_PROBE_ESP(esps_replay); +MIB_PROBE_ESP(esps_input); +MIB_PROBE_ESP(esps_output); +MIB_PROBE_ESP(esps_invalid); +MIB_PROBE_ESP(esps_ibytes); +MIB_PROBE_ESP(esps_obytes); +MIB_PROBE_ESP(esps_toobig); +MIB_PROBE_ESP(esps_pdrops); +MIB_PROBE_ESP(esps_crypto); 
+MIB_PROBE_ESP(esps_tunnel); +MIB_PROBE2_ESP(esps_hist); + +#define MIB_PROBE_AH(name) SDT_PROBE_DEFINE1(mib, ah, count, name, "int") +#define MIB_PROBE_AH2(name) SDT_PROBE_DEFINE2(mib, ah, count, name, "int", "int") + +MIB_PROBE_AH(ahs_hdrops); +MIB_PROBE_AH(ahs_nopf); +MIB_PROBE_AH(ahs_notdb); +MIB_PROBE_AH(ahs_badkcr); +MIB_PROBE_AH(ahs_badauth); +MIB_PROBE_AH(ahs_noxform); +MIB_PROBE_AH(ahs_qfull); +MIB_PROBE_AH(ahs_wrap); +MIB_PROBE_AH(ahs_replay); +MIB_PROBE_AH(ahs_badauthl); +MIB_PROBE_AH(ahs_input); +MIB_PROBE_AH(ahs_output); +MIB_PROBE_AH(ahs_invalid); +MIB_PROBE_AH(ahs_ibytes); +MIB_PROBE_AH(ahs_obytes); +MIB_PROBE_AH(ahs_toobig); +MIB_PROBE_AH(ahs_pdrops); +MIB_PROBE_AH(ahs_crypto); +MIB_PROBE_AH(ahs_tunnel); +MIB_PROBE_AH2(ahs_hist); + +#define MIB_PROBE_IPCOMP(name) SDT_PROBE_DEFINE1(mib, ipcomp, count, name, "int") +#define MIB_PROBE_IPCOMP2(name) SDT_PROBE_DEFINE2(mib, ipcomp, count, name, "int", "int") + +MIB_PROBE_IPCOMP(ipcomps_hdrops); +MIB_PROBE_IPCOMP(ipcomps_nopf); +MIB_PROBE_IPCOMP(ipcomps_notdb); +MIB_PROBE_IPCOMP(ipcomps_badkcr); +MIB_PROBE_IPCOMP(ipcomps_qfull); +MIB_PROBE_IPCOMP(ipcomps_noxform); +MIB_PROBE_IPCOMP(ipcomps_wrap); +MIB_PROBE_IPCOMP(ipcomps_input); +MIB_PROBE_IPCOMP(ipcomps_output); +MIB_PROBE_IPCOMP(ipcomps_invalid); +MIB_PROBE_IPCOMP(ipcomps_ibytes); +MIB_PROBE_IPCOMP(ipcomps_obytes); +MIB_PROBE_IPCOMP(ipcomps_toobig); +MIB_PROBE_IPCOMP(ipcomps_pdrops); +MIB_PROBE_IPCOMP(ipcomps_crypto); +MIB_PROBE_IPCOMP2(ipcomps_hist); +MIB_PROBE_IPCOMP(ipcomps_threshold); +MIB_PROBE_IPCOMP(ipcomps_uncompr); + #endif SDT_PROBE_DEFINE6_XLATE(ip, , , receive, diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h index 780839299993..a203b660d777 100644 --- a/sys/netinet/in_kdtrace.h +++ b/sys/netinet/in_kdtrace.h @@ -54,7 +54,7 @@ SDT_PROVIDER_DECLARE(tcp); SDT_PROVIDER_DECLARE(udp); SDT_PROVIDER_DECLARE(udplite); -#ifndef KDTRACE_NO_MIB_SDT +#ifdef KDTRACE_MIB_SDT SDT_PROVIDER_DECLARE(mib); SDT_PROBE_DECLARE(mib, ip, count, ips_total); @@ -278,6 +278,8 @@ SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_unreach); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_zonefail); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_sendcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_recvcookie); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_spurcookie); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_failcookie); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_added); SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_bucketoverflow); @@ -286,6 +288,7 @@ SDT_PROBE_DECLARE(mib, tcp, count, tcps_finwait2_drops); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_recovery_episode); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmits); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmits_tso); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmit_bytes); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rcv_blocks); SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_send_blocks); @@ -329,6 +332,91 @@ SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_sndect1); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresends); SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresend_bytes); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvghostack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvacktooold); + +SDT_PROBE_DECLARE(mib, ipsec, count, ips_in_polvio); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_in_nomem); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_in_inval); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_out_polvio); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_out_nosa); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_out_nomem); +SDT_PROBE_DECLARE(mib, 
ipsec, count, ips_out_noroute); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_out_inval); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_out_bundlesa); + +SDT_PROBE_DECLARE(mib, ipsec, count, ips_spdcache_hits); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_spdcache_misses); + +SDT_PROBE_DECLARE(mib, ipsec, count, ips_clcopied); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_mbinserted); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_input_front); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_input_middle); +SDT_PROBE_DECLARE(mib, ipsec, count, ips_input_end); + +SDT_PROBE_DECLARE(mib, esp, count, esps_hdrops); +SDT_PROBE_DECLARE(mib, esp, count, esps_nopf); +SDT_PROBE_DECLARE(mib, esp, count, esps_notdb); +SDT_PROBE_DECLARE(mib, esp, count, esps_badkcr); +SDT_PROBE_DECLARE(mib, esp, count, esps_qfull); +SDT_PROBE_DECLARE(mib, esp, count, esps_noxform); +SDT_PROBE_DECLARE(mib, esp, count, esps_badilen); +SDT_PROBE_DECLARE(mib, esp, count, esps_wrap); +SDT_PROBE_DECLARE(mib, esp, count, esps_badenc); +SDT_PROBE_DECLARE(mib, esp, count, esps_badauth); +SDT_PROBE_DECLARE(mib, esp, count, esps_replay); +SDT_PROBE_DECLARE(mib, esp, count, esps_input); +SDT_PROBE_DECLARE(mib, esp, count, esps_output); +SDT_PROBE_DECLARE(mib, esp, count, esps_invalid); +SDT_PROBE_DECLARE(mib, esp, count, esps_ibytes); +SDT_PROBE_DECLARE(mib, esp, count, esps_obytes); +SDT_PROBE_DECLARE(mib, esp, count, esps_toobig); +SDT_PROBE_DECLARE(mib, esp, count, esps_pdrops); +SDT_PROBE_DECLARE(mib, esp, count, esps_crypto); +SDT_PROBE_DECLARE(mib, esp, count, esps_tunnel); +SDT_PROBE_DECLARE(mib, esp, count, esps_hist); + +SDT_PROBE_DECLARE(mib, ah, count, ahs_hdrops); +SDT_PROBE_DECLARE(mib, ah, count, ahs_nopf); +SDT_PROBE_DECLARE(mib, ah, count, ahs_notdb); +SDT_PROBE_DECLARE(mib, ah, count, ahs_badkcr); +SDT_PROBE_DECLARE(mib, ah, count, ahs_badauth); +SDT_PROBE_DECLARE(mib, ah, count, ahs_noxform); +SDT_PROBE_DECLARE(mib, ah, count, ahs_qfull); +SDT_PROBE_DECLARE(mib, ah, count, ahs_wrap); +SDT_PROBE_DECLARE(mib, ah, count, ahs_replay); +SDT_PROBE_DECLARE(mib, ah, count, ahs_badauthl); +SDT_PROBE_DECLARE(mib, ah, count, ahs_input); +SDT_PROBE_DECLARE(mib, ah, count, ahs_output); +SDT_PROBE_DECLARE(mib, ah, count, ahs_invalid); +SDT_PROBE_DECLARE(mib, ah, count, ahs_ibytes); +SDT_PROBE_DECLARE(mib, ah, count, ahs_obytes); +SDT_PROBE_DECLARE(mib, ah, count, ahs_toobig); +SDT_PROBE_DECLARE(mib, ah, count, ahs_pdrops); +SDT_PROBE_DECLARE(mib, ah, count, ahs_crypto); +SDT_PROBE_DECLARE(mib, ah, count, ahs_tunnel); +SDT_PROBE_DECLARE(mib, ah, count, ahs_hist); + +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_hdrops); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_nopf); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_notdb); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_badkcr); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_qfull); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_noxform); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_wrap); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_input); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_output); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_invalid); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_ibytes); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_obytes); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_toobig); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_pdrops); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_crypto); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_hist); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_threshold); +SDT_PROBE_DECLARE(mib, ipcomp, count, ipcomps_uncompr); + 
#endif SDT_PROBE_DECLARE(ip, , , receive); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 1a341d421f31..dbe48242381d 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -139,7 +139,7 @@ VNET_DEFINE(int, ipport_randomized) = 1; static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, - int lookupflags, uint8_t numa_domain); + int lookupflags, uint8_t numa_domain, int fib); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ @@ -232,7 +232,15 @@ in_pcbhashseed_init(void) V_in_pcbhashseed = arc4random(); } VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, - in_pcbhashseed_init, 0); + in_pcbhashseed_init, NULL); + +#ifdef INET +VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0; +#define V_connect_inaddr_wild VNET(connect_inaddr_wild) +SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, + "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); +#endif static void in_pcbremhash(struct inpcb *); @@ -245,9 +253,8 @@ static void in_pcbremhash(struct inpcb *); */ static struct inpcblbgroup * -in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, - u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, - uint8_t numa_domain) +in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, + const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib) { struct inpcblbgroup *grp; size_t bytes; @@ -256,13 +263,14 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (grp == NULL) return (NULL); + LIST_INIT(&grp->il_pending); grp->il_cred = crhold(cred); grp->il_vflag = vflag; grp->il_lport = port; grp->il_numa_domain = numa_domain; + grp->il_fibnum = fib; grp->il_dependladdr = *addr; grp->il_inpsiz = size; - CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } @@ -279,21 +287,82 @@ in_pcblbgroup_free_deferred(epoch_context_t ctx) static void in_pcblbgroup_free(struct inpcblbgroup *grp) { + KASSERT(LIST_EMPTY(&grp->il_pending), + ("local group %p still has pending inps", grp)); CK_LIST_REMOVE(grp, il_list); NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * +in_pcblbgroup_find(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgroup *grp; + struct inpcblbgrouphead *hdr; + + INP_LOCK_ASSERT(inp); + + pcbinfo = inp->inp_pcbinfo; + INP_HASH_LOCK_ASSERT(pcbinfo); + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + CK_LIST_FOREACH(grp, hdr, il_list) { + struct inpcb *inp1; + + for (unsigned int i = 0; i < grp->il_inpcnt; i++) { + if (inp == grp->il_inp[i]) + goto found; + } + LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { + if (inp == inp1) + goto found; + } + } +found: + return (grp); +} + +static void +in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) +{ + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", grp->il_inpsiz, + grp->il_inpcnt)); + INP_WLOCK_ASSERT(inp); + + if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp && + !SOLISTENING(inp->inp_socket)) { + /* + * If this is a TCP socket, it should not be visible to lbgroup + * lookups until listen() has been called. 
+ */ + LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); + grp->il_pendcnt++; + } else { + grp->il_inp[grp->il_inpcnt] = inp; + + /* + * Synchronize with in_pcblookup_lbgroup(): make sure that we + * don't expose a null slot to the lookup path. + */ + atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); + } + + inp->inp_flags |= INP_INLBGROUP; +} + +static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; - grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, + grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size, - old_grp->il_numa_domain); + old_grp->il_numa_domain, old_grp->il_fibnum); if (grp == NULL) return (NULL); @@ -304,35 +373,16 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; + CK_LIST_INSERT_HEAD(hdr, grp, il_list); + LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, + inp_lbgroup_list); + grp->il_pendcnt = old_grp->il_pendcnt; + old_grp->il_pendcnt = 0; in_pcblbgroup_free(old_grp); return (grp); } /* - * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] - * and shrink group if possible. - */ -static void -in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, - int i) -{ - struct inpcblbgroup *grp, *new_grp; - - grp = *grpp; - for (; i + 1 < grp->il_inpcnt; ++i) - grp->il_inp[i] = grp->il_inp[i + 1]; - grp->il_inpcnt--; - - if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && - grp->il_inpcnt <= grp->il_inpsiz / 4) { - /* Shrink this group. */ - new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); - if (new_grp != NULL) - *grpp = new_grp; - } -} - -/* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int @@ -344,12 +394,16 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; + int fib; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); + fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ? + inp->inp_inc.inc_fibnum : RT_ALL_FIBS; + #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. @@ -368,6 +422,7 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && grp->il_numa_domain == numa_domain && + grp->il_fibnum == fib && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) { @@ -376,12 +431,14 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) } if (grp == NULL) { /* Create new load balance group. */ - grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, + grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, - INPCBLBGROUP_SIZMIN, numa_domain); + INPCBLBGROUP_SIZMIN, numa_domain, fib); if (grp == NULL) - return (ENOBUFS); - } else if (grp->il_inpcnt == grp->il_inpsiz) { + return (ENOMEM); + in_pcblbgroup_insert(grp, inp); + CK_LIST_INSERT_HEAD(hdr, grp, il_list); + } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", @@ -392,16 +449,11 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) /* Expand this local group. 
*/ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) - return (ENOBUFS); + return (ENOMEM); + in_pcblbgroup_insert(grp, inp); + } else { + in_pcblbgroup_insert(grp, inp); } - - KASSERT(grp->il_inpcnt < grp->il_inpsiz, - ("invalid local group size %d and count %d", grp->il_inpsiz, - grp->il_inpcnt)); - - grp->il_inp[grp->il_inpcnt] = inp; - grp->il_inpcnt++; - inp->inp_flags |= INP_INLBGROUP; return (0); } @@ -414,6 +466,7 @@ in_pcbremlbgrouphash(struct inpcb *inp) struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; + struct inpcb *inp1; int i; pcbinfo = inp->inp_pcbinfo; @@ -429,27 +482,40 @@ in_pcbremlbgrouphash(struct inpcb *inp) if (grp->il_inp[i] != inp) continue; - if (grp->il_inpcnt == 1) { + if (grp->il_inpcnt == 1 && + LIST_EMPTY(&grp->il_pending)) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { - /* Pull up inpcbs, shrink group if possible. */ - in_pcblbgroup_reorder(hdr, &grp, i); + grp->il_inp[i] = + grp->il_inp[grp->il_inpcnt - 1]; + + /* + * Synchronize with in_pcblookup_lbgroup(). + */ + atomic_store_rel_int(&grp->il_inpcnt, + grp->il_inpcnt - 1); } inp->inp_flags &= ~INP_INLBGROUP; return; } + LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { + if (inp == inp1) { + LIST_REMOVE(inp, inp_lbgroup_list); + grp->il_pendcnt--; + inp->inp_flags &= ~INP_INLBGROUP; + return; + } + } } - KASSERT(0, ("%s: did not find %p", __func__, inp)); + __assert_unreachable(); } int in_pcblbgroup_numa(struct inpcb *inp, int arg) { struct inpcbinfo *pcbinfo; - struct inpcblbgrouphead *hdr; - struct inpcblbgroup *grp; - int err, i; + int error; uint8_t numa_domain; switch (arg) { @@ -465,33 +531,20 @@ in_pcblbgroup_numa(struct inpcb *inp, int arg) numa_domain = arg; } - err = 0; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(pcbinfo); - hdr = &pcbinfo->ipi_lbgrouphashbase[ - INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; - CK_LIST_FOREACH(grp, hdr, il_list) { - for (i = 0; i < grp->il_inpcnt; ++i) { - if (grp->il_inp[i] != inp) - continue; - - if (grp->il_numa_domain == numa_domain) { - goto abort_with_hash_wlock; - } - - /* Remove it from the old group. */ - in_pcbremlbgrouphash(inp); - - /* Add it to the new group based on numa domain. */ - in_pcbinslbgrouphash(inp, numa_domain); - goto abort_with_hash_wlock; - } + if (in_pcblbgroup_find(inp) != NULL) { + /* Remove it from the old group. */ + in_pcbremlbgrouphash(inp); + /* Add it to the new group based on numa domain. */ + in_pcbinslbgrouphash(inp, numa_domain); + error = 0; + } else { + error = ENOENT; } - err = ENOENT; -abort_with_hash_wlock: INP_HASH_WUNLOCK(pcbinfo); - return (err); + return (error); } /* Make sure it is safe to use hashinit(9) on CK_LIST. 
*/ @@ -523,7 +576,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = pcbstor->ips_zone; - pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); } @@ -559,10 +611,6 @@ in_pcbstorage_init(void *arg) pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); - pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, - sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_smr(pcbstor->ips_portzone, - uma_zone_get_smr(pcbstor->ips_zone)); } /* @@ -574,7 +622,6 @@ in_pcbstorage_destroy(void *arg) struct inpcbstorage *pcbstor = arg; uma_zdestroy(pcbstor->ips_zone); - uma_zdestroy(pcbstor->ips_portzone); } /* @@ -666,7 +713,8 @@ out: #ifdef INET int -in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) +in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags, + struct ucred *cred) { int anonport, error; @@ -681,13 +729,15 @@ in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) return (EINVAL); anonport = sin == NULL || sin->sin_port == 0; error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, - &inp->inp_lport, cred); + &inp->inp_lport, flags, cred); if (error) return (error); - if (in_pcbinshash(inp) != 0) { + if (__predict_false((error = in_pcbinshash(inp)) != 0)) { + MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB); inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; - return (EAGAIN); + inp->inp_flags &= ~INP_BOUNDFIB; + return (error); } if (anonport) inp->inp_flags |= INP_ANONPORT; @@ -703,8 +753,9 @@ in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) * lsa can be NULL for IPv6. 
*/ int -in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, - struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) +in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, + u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, + int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; @@ -775,7 +826,6 @@ in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, #endif tmpinp = NULL; - lport = *lportp; if (V_ipport_randomized) *lastport = first + (arc4random() % (last - first)); @@ -795,26 +845,28 @@ in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, - M_NODOM); + M_NODOM, RT_ALL_FIBS); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, - M_NODOM); + M_NODOM, RT_ALL_FIBS); } #endif } else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { tmpinp = in6_pcblookup_local(pcbinfo, - &inp->in6p_laddr, lport, lookupflags, cred); + &inp->in6p_laddr, lport, RT_ALL_FIBS, + lookupflags, cred); #ifdef INET if (tmpinp == NULL && (inp->inp_vflag & INP_IPV4)) tmpinp = in_pcblookup_local(pcbinfo, - laddr, lport, lookupflags, cred); + laddr, lport, RT_ALL_FIBS, + lookupflags, cred); #endif } #endif @@ -823,7 +875,7 @@ in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, - lport, lookupflags, cred); + lport, RT_ALL_FIBS, lookupflags, cred); #endif } } while (tmpinp != NULL); @@ -854,6 +906,99 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, #ifdef INET /* + * Determine whether the inpcb can be bound to the specified address/port tuple. + */ +static int +in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, + const u_short lport, const int fib, int sooptions, int lookupflags, + struct ucred *cred) +{ + int reuseport, reuseport_lb; + + INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); + + reuseport = (sooptions & SO_REUSEPORT); + reuseport_lb = (sooptions & SO_REUSEPORT_LB); + + if (IN_MULTICAST(ntohl(laddr.s_addr))) { + /* + * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; + * allow complete duplication of binding if + * SO_REUSEPORT is set, or if SO_REUSEADDR is set + * and a multicast address is bound on both + * new and duplicated sockets. + */ + if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) + reuseport = SO_REUSEADDR | SO_REUSEPORT; + /* + * XXX: How to deal with SO_REUSEPORT_LB here? + * Treat same as SO_REUSEPORT for now. + */ + if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; + } else if (!in_nullhost(laddr)) { + struct sockaddr_in sin; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_addr = laddr; + + /* + * Is the address a local IP address? + * If INP_BINDANY is set, then the socket may be bound + * to any endpoint address, local or not. 
+ */ + if ((inp->inp_flags & INP_BINDANY) == 0 && + ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) + return (EADDRNOTAVAIL); + } + + if (lport != 0) { + struct inpcb *t; + + if (ntohs(lport) <= V_ipport_reservedhigh && + ntohs(lport) >= V_ipport_reservedlow && + priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) + return (EACCES); + + if (!IN_MULTICAST(ntohl(laddr.s_addr)) && + priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { + /* + * If a socket owned by a different user is already + * bound to this port, fail. In particular, SO_REUSE* + * can only be used to share a port among sockets owned + * by the same user. + * + * However, we can share a port with a connected socket + * which has a unique 4-tuple. + */ + t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, + RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); + if (t != NULL && + (inp->inp_socket->so_type != SOCK_STREAM || + in_nullhost(t->inp_faddr)) && + (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) + return (EADDRINUSE); + } + t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, + lookupflags, cred); + if (t != NULL && ((reuseport | reuseport_lb) & + t->inp_socket->so_options) == 0) { +#ifdef INET6 + if (!in_nullhost(laddr) || + !in_nullhost(t->inp_laddr) || + (inp->inp_vflag & INP_IPV6PROTO) == 0 || + (t->inp_vflag & INP_IPV6PROTO) == 0) +#endif + return (EADDRINUSE); + } + } + return (0); +} + +/* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and @@ -864,31 +1009,26 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, - u_short *lportp, struct ucred *cred) + u_short *lportp, int flags, struct ucred *cred) { struct socket *so = inp->inp_socket; - struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; - int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); - int error; - - /* - * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here - * so that we don't have to add to the (already messy) code below. - */ - int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + int error, fib, lookupflags, sooptions; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); - INP_HASH_LOCK_ASSERT(pcbinfo); + INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); laddr.s_addr = *laddrp; if (sin != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) + + lookupflags = 0; + sooptions = atomic_load_int(&so->so_options); + if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (sin == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) @@ -908,79 +1048,16 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, return (EINVAL); lport = sin->sin_port; } - /* NB: lport is left as 0 if the port isn't being changed. */ - if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { - /* - * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; - * allow complete duplication of binding if - * SO_REUSEPORT is set, or if SO_REUSEADDR is set - * and a multicast address is bound on both - * new and duplicated sockets. - */ - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) - reuseport = SO_REUSEADDR|SO_REUSEPORT; - /* - * XXX: How to deal with SO_REUSEPORT_LB here? 
- * Treat same as SO_REUSEPORT for now. - */ - if ((so->so_options & - (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) - reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; - } else if (sin->sin_addr.s_addr != INADDR_ANY) { - sin->sin_port = 0; /* yech... */ - bzero(&sin->sin_zero, sizeof(sin->sin_zero)); - /* - * Is the address a local IP address? - * If INP_BINDANY is set, then the socket may be bound - * to any endpoint address, local or not. - */ - if ((inp->inp_flags & INP_BINDANY) == 0 && - ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) - return (EADDRNOTAVAIL); - } laddr = sin->sin_addr; - if (lport) { - struct inpcb *t; - - /* GROSS */ - if (ntohs(lport) <= V_ipport_reservedhigh && - ntohs(lport) >= V_ipport_reservedlow && - priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) - return (EACCES); - if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && - priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { - t = in_pcblookup_local(pcbinfo, sin->sin_addr, - lport, INPLOOKUP_WILDCARD, cred); - /* - * XXX - * This entire block sorely needs a rewrite. - */ - if (t != NULL && - (so->so_type != SOCK_STREAM || - ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && - (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || - ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_socket->so_options & SO_REUSEPORT) || - (t->inp_socket->so_options & SO_REUSEPORT_LB) == 0) && - (inp->inp_cred->cr_uid != - t->inp_cred->cr_uid)) - return (EADDRINUSE); - } - t = in_pcblookup_local(pcbinfo, sin->sin_addr, - lport, lookupflags, cred); - if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 && - (reuseport_lb & t->inp_socket->so_options) == 0) { -#ifdef INET6 - if (ntohl(sin->sin_addr.s_addr) != - INADDR_ANY || - ntohl(t->inp_laddr.s_addr) != - INADDR_ANY || - (inp->inp_vflag & INP_IPV6PROTO) == 0 || - (t->inp_vflag & INP_IPV6PROTO) == 0) -#endif - return (EADDRINUSE); - } - } + + fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum : + RT_ALL_FIBS; + + /* See if this address/port combo is available. */ + error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions, + lookupflags, cred); + if (error != 0) + return (error); } if (*lportp != 0) lport = *lportp; @@ -991,6 +1068,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, } *laddrp = laddr.s_addr; *lportp = lport; + if ((flags & INPBIND_FIB) != 0) + inp->inp_flags |= INP_BOUNDFIB; return (0); } @@ -1001,48 +1080,105 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, * then pick one. 
*/ int -in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred, - bool rehash __unused) +in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) { - u_short lport, fport; - in_addr_t laddr, faddr; - int anonport, error; + struct in_addr laddr, faddr; + u_short lport; + int error; + bool anonport; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); KASSERT(in_nullhost(inp->inp_faddr), ("%s: inp is already connected", __func__)); + KASSERT(sin->sin_family == AF_INET, + ("%s: invalid address family for %p", __func__, sin)); + KASSERT(sin->sin_len == sizeof(*sin), + ("%s: invalid address length for %p", __func__, sin)); - lport = inp->inp_lport; - laddr = inp->inp_laddr.s_addr; - anonport = (lport == 0); - error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport, - cred); - if (error) - return (error); + if (sin->sin_port == 0) + return (EADDRNOTAVAIL); - inp->inp_faddr.s_addr = faddr; - inp->inp_fport = fport; - - /* Do the initial binding of the local address if required. */ - if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { - inp->inp_lport = lport; - inp->inp_laddr.s_addr = laddr; - if (in_pcbinshash(inp) != 0) { - inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = - INADDR_ANY; - inp->inp_lport = inp->inp_fport = 0; - return (EAGAIN); - } - } else { - inp->inp_lport = lport; - inp->inp_laddr.s_addr = laddr; - if ((inp->inp_flags & INP_INHASHLIST) != 0) - in_pcbrehash(inp); - else - in_pcbinshash(inp); - } + anonport = (inp->inp_lport == 0); + if (__predict_false(in_broadcast(sin->sin_addr))) { + if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead)) + return (ENETUNREACH); + /* + * If the destination address is INADDR_ANY, use the primary + * local address. If the supplied address is INADDR_BROADCAST, + * and the primary interface supports broadcast, choose the + * broadcast address for that interface. 
+ */ + if (in_nullhost(sin->sin_addr)) { + faddr = + IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; + if ((error = prison_get_ip4(cred, &faddr)) != 0) + return (error); + } else if (sin->sin_addr.s_addr == INADDR_BROADCAST && + CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags + & IFF_BROADCAST) { + faddr = satosin(&CK_STAILQ_FIRST( + &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; + } else + faddr = sin->sin_addr; + } else + faddr = sin->sin_addr; + + if (in_nullhost(inp->inp_laddr)) { + error = in_pcbladdr(inp, &faddr, &laddr, cred); + if (error) + return (error); + } else + laddr = inp->inp_laddr; + + if (anonport) { + struct sockaddr_in lsin = { + .sin_family = AF_INET, + .sin_addr = laddr, + }; + struct sockaddr_in fsin = { + .sin_family = AF_INET, + .sin_addr = faddr, + }; + + error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, + &lport, (struct sockaddr *)&fsin, sin->sin_port, cred, + INPLOOKUP_WILDCARD); + if (error) + return (error); + } else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, + sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) != + NULL) + return (EADDRINUSE); + else + lport = inp->inp_lport; + + MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 || + !(inp->inp_flags & INP_INHASHLIST)); + + inp->inp_faddr = faddr; + inp->inp_fport = sin->sin_port; + inp->inp_laddr = laddr; + inp->inp_lport = lport; + + if ((inp->inp_flags & INP_INHASHLIST) == 0) { + error = in_pcbinshash(inp); + MPASS(error == 0); + } else + in_pcbrehash(inp); +#ifdef ROUTE_MPATH + if (CALC_FLOWID_OUTBOUND) { + uint32_t hash_val, hash_type; + + hash_val = fib4_calc_software_hash(inp->inp_laddr, + inp->inp_faddr, 0, sin->sin_port, + inp->inp_socket->so_proto->pr_protocol, &hash_type); + + inp->inp_flowid = hash_val; + inp->inp_flowtype = hash_type; + } +#endif if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); @@ -1053,8 +1189,8 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred, * of connect. Take jails into account as well. */ int -in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, - struct ucred *cred) +in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, + struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; @@ -1072,6 +1208,27 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, if (!prison_saddrsel_ip4(cred, laddr)) return (0); + /* + * If the destination address is multicast and an outgoing + * interface has been set as a multicast option, prefer the + * address of that interface as our source address. + */ + if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL && + inp->inp_moptions->imo_multicast_ifp != NULL) { + struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp; + struct in_ifaddr *ia; + + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if (ia->ia_ifp == ifp && + prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) + break; + } + if (ia == NULL) + return (EADDRNOTAVAIL); + *laddr = ia->ia_addr.sin_addr; + return (0); + } + error = 0; nh = NULL; @@ -1259,135 +1416,6 @@ done: return (error); } -/* - * Set up for a connect from a socket to the specified address. - * On entry, *laddrp and *lportp should contain the current local - * address and port for the PCB; these are updated to the values - * that should be placed in inp_laddr and inp_lport to complete - * the connect. - * - * On success, *faddrp and *fportp will be set to the remote address - * and port. These are not updated in the error case. 
- */ -int -in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin, - in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, - struct ucred *cred) -{ - struct in_ifaddr *ia; - struct in_addr laddr, faddr; - u_short lport, fport; - int error; - - KASSERT(sin->sin_family == AF_INET, - ("%s: invalid address family for %p", __func__, sin)); - KASSERT(sin->sin_len == sizeof(*sin), - ("%s: invalid address length for %p", __func__, sin)); - - /* - * Because a global state change doesn't actually occur here, a read - * lock is sufficient. - */ - NET_EPOCH_ASSERT(); - INP_LOCK_ASSERT(inp); - INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); - - if (sin->sin_port == 0) - return (EADDRNOTAVAIL); - laddr.s_addr = *laddrp; - lport = *lportp; - faddr = sin->sin_addr; - fport = sin->sin_port; -#ifdef ROUTE_MPATH - if (CALC_FLOWID_OUTBOUND) { - uint32_t hash_val, hash_type; - - hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, - inp->inp_socket->so_proto->pr_protocol, &hash_type); - - inp->inp_flowid = hash_val; - inp->inp_flowtype = hash_type; - } -#endif - if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { - /* - * If the destination address is INADDR_ANY, - * use the primary local address. - * If the supplied address is INADDR_BROADCAST, - * and the primary interface supports broadcast, - * choose the broadcast address for that interface. - */ - if (faddr.s_addr == INADDR_ANY) { - faddr = - IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; - if ((error = prison_get_ip4(cred, &faddr)) != 0) - return (error); - } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { - if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & - IFF_BROADCAST) - faddr = satosin(&CK_STAILQ_FIRST( - &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; - } - } - if (laddr.s_addr == INADDR_ANY) { - error = in_pcbladdr(inp, &faddr, &laddr, cred); - /* - * If the destination address is multicast and an outgoing - * interface has been set as a multicast option, prefer the - * address of that interface as our source address. 
- */ - if (IN_MULTICAST(ntohl(faddr.s_addr)) && - inp->inp_moptions != NULL) { - struct ip_moptions *imo; - struct ifnet *ifp; - - imo = inp->inp_moptions; - if (imo->imo_multicast_ifp != NULL) { - ifp = imo->imo_multicast_ifp; - CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - if (ia->ia_ifp == ifp && - prison_check_ip4(cred, - &ia->ia_addr.sin_addr) == 0) - break; - } - if (ia == NULL) - error = EADDRNOTAVAIL; - else { - laddr = ia->ia_addr.sin_addr; - error = 0; - } - } - } - if (error) - return (error); - } - - if (lport != 0) { - if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, - fport, laddr, lport, 0, M_NODOM) != NULL) - return (EADDRINUSE); - } else { - struct sockaddr_in lsin, fsin; - - bzero(&lsin, sizeof(lsin)); - bzero(&fsin, sizeof(fsin)); - lsin.sin_family = AF_INET; - lsin.sin_addr = laddr; - fsin.sin_family = AF_INET; - fsin.sin_addr = faddr; - error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, - &lport, (struct sockaddr *)& fsin, fport, cred, - INPLOOKUP_WILDCARD); - if (error) - return (error); - } - *laddrp = laddr.s_addr; - *lportp = lport; - *faddrp = faddr.s_addr; - *fportp = fport; - return (0); -} - void in_pcbdisconnect(struct inpcb *inp) { @@ -1407,6 +1435,26 @@ in_pcbdisconnect(struct inpcb *inp) } #endif /* INET */ +void +in_pcblisten(struct inpcb *inp) +{ + struct inpcblbgroup *grp; + + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_flags & INP_INLBGROUP) != 0) { + struct inpcbinfo *pcbinfo; + + pcbinfo = inp->inp_pcbinfo; + INP_HASH_WLOCK(pcbinfo); + grp = in_pcblbgroup_find(inp); + LIST_REMOVE(inp, inp_lbgroup_list); + grp->il_pendcnt--; + in_pcblbgroup_insert(grp, inp); + INP_HASH_WUNLOCK(pcbinfo); + } +} + /* * inpcb hash lookups are protected by SMR section. * @@ -1697,6 +1745,23 @@ in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) } /* + * Dereference and rlock inp, for which the caller must own the + * reference. Returns true if inp no longer usable, false otherwise. + */ +bool +in_pcbrele_rlock(struct inpcb *inp) +{ + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (true); + if ((inp->inp_flags & INP_FREED) != 0) { + INP_RUNLOCK(inp); + return (true); + } + return (false); +} + +/* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired @@ -1915,7 +1980,7 @@ restart: #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, int lookupflags, struct ucred *cred) + u_short lport, int fib, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 @@ -1927,6 +1992,9 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs), + ("%s: invalid fib %d", __func__, fib)); + INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { @@ -1945,7 +2013,8 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && - inp->inp_lport == lport) { + inp->inp_lport == lport && (fib == RT_ALL_FIBS || + inp->inp_inc.inc_fibnum == fib)) { /* * Found? 
*/ @@ -1959,68 +2028,58 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ return (NULL); } else { - struct inpcbporthead *porthash; - struct inpcbport *phd; + struct inpcbhead *porthash; struct inpcb *match = NULL; + /* - * Best fit PCB lookup. - * - * First see if this local port is in use by looking on the - * port hash list. + * Port is in use by one or more PCBs. Look for best + * fit. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; - CK_LIST_FOREACH(phd, porthash, phd_hash) { - if (phd->phd_port == lport) - break; - } - if (phd != NULL) { + CK_LIST_FOREACH(inp, porthash, inp_portlist) { + if (inp->inp_lport != lport) + continue; + if (!prison_equal_ip4(inp->inp_cred->cr_prison, + cred->cr_prison)) + continue; + if (fib != RT_ALL_FIBS && + inp->inp_inc.inc_fibnum != fib) + continue; + wildcard = 0; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; /* - * Port is in use by one or more PCBs. Look for best - * fit. + * We never select the PCB that has INP_IPV6 flag and + * is bound to :: if we have another PCB which is bound + * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we + * set its cost higher than IPv4 only PCBs. + * + * Note that the case only happens when a socket is + * bound to ::, under the condition that the use of the + * mapped address is allowed. */ - CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { - wildcard = 0; - if (!prison_equal_ip4(inp->inp_cred->cr_prison, - cred->cr_prison)) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; - /* - * We never select the PCB that has - * INP_IPV6 flag and is bound to :: if - * we have another PCB which is bound - * to 0.0.0.0. If a PCB has the - * INP_IPV6 flag, then we set its cost - * higher than IPv4 only PCBs. - * - * Note that the case only happens - * when a socket is bound to ::, under - * the condition that the use of the - * mapped address is allowed. 
- */ - if ((inp->inp_vflag & INP_IPV6) != 0) - wildcard += INP_LOOKUP_MAPPED_PCB_COST; + if ((inp->inp_vflag & INP_IPV6) != 0) + wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif - if (inp->inp_faddr.s_addr != INADDR_ANY) + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) wildcard++; - if (inp->inp_laddr.s_addr != INADDR_ANY) { - if (laddr.s_addr == INADDR_ANY) - wildcard++; - else if (inp->inp_laddr.s_addr != laddr.s_addr) - continue; - } else { - if (laddr.s_addr != INADDR_ANY) - wildcard++; - } - if (wildcard < matchwild) { - match = inp; - matchwild = wildcard; - if (matchwild == 0) - break; - } + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) + break; } } return (match); @@ -2029,21 +2088,25 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, #undef INP_LOOKUP_MAPPED_PCB_COST static bool -in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) +in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib) { - return (domain == M_NODOM || domain == grp->il_numa_domain); + return ((domain == M_NODOM || domain == grp->il_numa_domain) && + (fib == RT_ALL_FIBS || fib == grp->il_fibnum)); } static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, - uint16_t lport, int domain) + uint16_t lport, int domain, int fib) { const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; + struct inpcb *inp; + u_int count; INP_HASH_LOCK_ASSERT(pcbinfo); + NET_EPOCH_ASSERT(); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; @@ -2073,20 +2136,20 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, if (grp->il_laddr.s_addr == laddr->s_addr) { if (injail) { jail_exact = grp; - if (in_pcblookup_lb_numa_match(grp, domain)) + if (in_pcblookup_lb_match(grp, domain, fib)) /* This is a perfect match. */ goto out; } else if (local_exact == NULL || - in_pcblookup_lb_numa_match(grp, domain)) { + in_pcblookup_lb_match(grp, domain, fib)) { local_exact = grp; } } else if (grp->il_laddr.s_addr == INADDR_ANY) { if (injail) { if (jail_wild == NULL || - in_pcblookup_lb_numa_match(grp, domain)) + in_pcblookup_lb_match(grp, domain, fib)) jail_wild = grp; } else if (local_wild == NULL || - in_pcblookup_lb_numa_match(grp, domain)) { + in_pcblookup_lb_match(grp, domain, fib)) { local_wild = grp; } } @@ -2102,9 +2165,17 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, grp = local_wild; if (grp == NULL) return (NULL); + out: - return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % - grp->il_inpcnt]); + /* + * Synchronize with in_pcblbgroup_insert(). 
+ */ + count = atomic_load_acq_int(&grp->il_inpcnt); + if (count == 0) + return (NULL); + inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + return (inp); } static bool @@ -2150,7 +2221,7 @@ typedef enum { static inp_lookup_match_t in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, - u_short lport) + u_short lport, int fib) { #ifdef INET6 /* XXX inp locking */ @@ -2159,6 +2230,8 @@ in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) return (INPLOOKUP_MATCH_NONE); + if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib) + return (INPLOOKUP_MATCH_NONE); if (inp->inp_laddr.s_addr == INADDR_ANY) return (INPLOOKUP_MATCH_WILD); if (inp->inp_laddr.s_addr == laddr.s_addr) @@ -2169,9 +2242,8 @@ in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) static struct inpcb * -in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_short fport, struct in_addr laddr, u_short lport, - const inp_lookup_t lockflags) +in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, int fib, const inp_lookup_t lockflags) { struct inpcbhead *head; struct inpcb *inp; @@ -2184,12 +2256,12 @@ in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, CK_LIST_FOREACH(inp, head, inp_hash_wild) { inp_lookup_match_t match; - match = in_pcblookup_wild_match(inp, laddr, lport); + match = in_pcblookup_wild_match(inp, laddr, lport, fib); if (match == INPLOOKUP_MATCH_NONE) continue; if (__predict_true(inp_smr_lock(inp, lockflags))) { - match = in_pcblookup_wild_match(inp, laddr, lport); + match = in_pcblookup_wild_match(inp, laddr, lport, fib); if (match != INPLOOKUP_MATCH_NONE && prison_check_ip4_locked(inp->inp_cred->cr_prison, &laddr) == 0) @@ -2207,8 +2279,8 @@ in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, } static struct inpcb * -in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_short fport, struct in_addr laddr, u_short lport) +in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, int fib) { struct inpcbhead *head; struct inpcb *inp, *local_wild, *local_exact, *jail_wild; @@ -2235,7 +2307,7 @@ in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, inp_lookup_match_t match; bool injail; - match = in_pcblookup_wild_match(inp, laddr, lport); + match = in_pcblookup_wild_match(inp, laddr, lport, fib); if (match == INPLOOKUP_MATCH_NONE) continue; @@ -2288,12 +2360,12 @@ in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, - uint8_t numa_domain) + uint8_t numa_domain, int fib) { struct inpcb *inp; const u_short fport = fport_arg, lport = lport_arg; - KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0, + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT(faddr.s_addr != INADDR_ANY, ("%s: invalid foreign address", __func__)); @@ -2307,10 +2379,10 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = 
in_pcblookup_lbgroup(pcbinfo, &faddr, fport, - &laddr, lport, numa_domain); + &laddr, lport, numa_domain, fib); if (inp == NULL) { - inp = in_pcblookup_hash_wild_locked(pcbinfo, faddr, - fport, laddr, lport); + inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr, + lport, fib); } } @@ -2320,7 +2392,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, - uint8_t numa_domain) + uint8_t numa_domain, int fib) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; @@ -2330,7 +2402,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, INP_HASH_WLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain); + lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib); if (inp != NULL && !inp_trylock(inp, lockflags)) { in_pcbref(inp); INP_HASH_WUNLOCK(pcbinfo); @@ -2347,7 +2419,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, static struct inpcb * in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, - uint8_t numa_domain) + uint8_t numa_domain, int fib) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; @@ -2377,27 +2449,27 @@ in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, * out from under us. Fall back to a precise search. */ return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, numa_domain)); + lookupflags, numa_domain, fib)); } if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, - &laddr, lport, numa_domain); + &laddr, lport, numa_domain, fib); if (inp != NULL) { if (__predict_true(inp_smr_lock(inp, lockflags))) { if (__predict_true(in_pcblookup_wild_match(inp, - laddr, lport) != INPLOOKUP_MATCH_NONE)) + laddr, lport, fib) != INPLOOKUP_MATCH_NONE)) return (inp); inp_unlock(inp, lockflags); } inp = INP_LOOKUP_AGAIN; } else { - inp = in_pcblookup_hash_wild_smr(pcbinfo, faddr, fport, - laddr, lport, lockflags); + inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport, + fib, lockflags); } if (inp == INP_LOOKUP_AGAIN) { return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, - lport, lookupflags, numa_domain)); + lport, lookupflags, numa_domain, fib)); } } @@ -2414,10 +2486,13 @@ in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, - struct ifnet *ifp __unused) + struct ifnet *ifp) { + int fib; + + fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS; return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, - lookupflags, M_NODOM)); + lookupflags, M_NODOM, fib)); } struct inpcb * @@ -2425,8 +2500,12 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp __unused, struct mbuf *m) { + int fib; + + M_ASSERTPKTHDR(m); + fib = (lookupflags & INPLOOKUP_FIB) ? 
M_GETFIB(m) : RT_ALL_FIBS; return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, - lookupflags, m->m_pkthdr.numa_domain)); + lookupflags, m->m_pkthdr.numa_domain, fib)); } #endif /* INET */ @@ -2546,14 +2625,16 @@ _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) /* * Insert PCB onto various hash lists. + * + * With normal sockets this function shall not fail, so it could return void. + * But for SO_REUSEPORT_LB it may need to allocate memory with locks held, + * that's the only condition when it can fail. */ int in_pcbinshash(struct inpcb *inp) { - struct inpcbhead *pcbhash; - struct inpcbporthead *pcbporthash; + struct inpcbhead *pcbhash, *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - struct inpcbport *phd; uint32_t hash; bool connected; @@ -2594,31 +2675,6 @@ in_pcbinshash(struct inpcb *inp) } /* - * Go through port list and look for a head for this lport. - */ - CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { - if (phd->phd_port == inp->inp_lport) - break; - } - - /* - * If none exists, malloc one and tack it on. - */ - if (phd == NULL) { - phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); - if (phd == NULL) { - if ((inp->inp_flags & INP_INLBGROUP) != 0) - in_pcbremlbgrouphash(inp); - return (ENOMEM); - } - phd->phd_port = inp->inp_lport; - CK_LIST_INIT(&phd->phd_pcblist); - CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); - } - inp->inp_phd = phd; - CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); - - /* * The PCB may have been disconnected in the past. Before we can safely * make it visible in the hash table, we must wait for all readers which * may be traversing this PCB to finish. @@ -2638,6 +2694,7 @@ in_pcbinshash(struct inpcb *inp) #endif _in_pcbinshash_wild(pcbhash, inp); } + CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); inp->inp_flags |= INP_INHASHLIST; return (0); @@ -2646,7 +2703,6 @@ in_pcbinshash(struct inpcb *inp) void in_pcbremhash_locked(struct inpcb *inp) { - struct inpcbport *phd = inp->inp_phd; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); @@ -2669,10 +2725,6 @@ in_pcbremhash_locked(struct inpcb *inp) CK_LIST_REMOVE(inp, inp_hash_exact); } CK_LIST_REMOVE(inp, inp_portlist); - if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { - CK_LIST_REMOVE(phd, phd_hash); - uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); - } inp->inp_flags &= ~INP_INHASHLIST; } @@ -3183,8 +3235,7 @@ db_print_inpcb(struct inpcb *inp, const char *name, int indent) } db_print_indent(indent); - db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, - (uintmax_t)inp->inp_gencnt); + db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index a4b4075b3501..9e0618e87601 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -64,7 +64,6 @@ * protocol-specific control block) are stored here. 
*/ CK_LIST_HEAD(inpcbhead, inpcb); -CK_LIST_HEAD(inpcbporthead, inpcbport); CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup); typedef uint64_t inp_gen_t; @@ -167,7 +166,10 @@ struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */ + union { + CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */ + LIST_ENTRY(inpcb) inp_lbgroup_list; /* lb group list */ + }; CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ @@ -218,7 +220,6 @@ struct inpcb { short in6p_hops; }; CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */ - struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ void *spare_ptr; /* Spare pointer. */ rt_gen_t inp_rt_cookie; /* generation for route entry */ @@ -302,6 +303,30 @@ struct sockopt_parameters { char sop_optval[]; }; +#ifdef _SYS_KTLS_H_ +struct xktls_session { + uint32_t tsz; /* total sz of elm, next elm is at this+tsz */ + uint32_t fsz; /* size of the struct up to keys */ + uint64_t inp_gencnt; + kvaddr_t so_pcb; + struct in_conninfo coninf; + u_short rx_vlan_id; + struct xktls_session_onedir rcv; + struct xktls_session_onedir snd; +/* + * Next are + * - keydata for rcv, first cipher of length rcv.cipher_key_len, then + * authentication of length rcv.auth_key_len; + * - driver data (string) of length rcv.drv_st_len, if the rcv session is + * offloaded to ifnet rcv.ifnet; + * - keydata for snd, first cipher of length snd.cipher_key_len, then + * authentication of length snd.auth_key_len; + * - driver data (string) of length snd.drv_st_len, if the snd session is + * offloaded to ifnet snd.ifnet; + */ +}; +#endif /* _SYS_KTLS_H_ */ + #ifdef _KERNEL int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)); @@ -367,7 +392,7 @@ struct inpcbinfo { /* * Global hash of inpcbs, hashed by only local port number. */ - struct inpcbporthead *ipi_porthashbase; /* (h) */ + struct inpcbhead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* @@ -389,11 +414,9 @@ struct inpcbinfo { */ struct inpcbstorage { uma_zone_t ips_zone; - uma_zone_t ips_portzone; uma_init ips_pcbinit; size_t ips_size; const char * ips_zone_name; - const char * ips_portzone_name; const char * ips_infolock_name; const char * ips_hashlock_name; }; @@ -411,7 +434,6 @@ static struct inpcbstorage prot = { \ .ips_size = sizeof(struct ppcb), \ .ips_pcbinit = prot##_inpcb_init, \ .ips_zone_name = zname, \ - .ips_portzone_name = zname " ports", \ .ips_infolock_name = iname, \ .ips_hashlock_name = hname, \ }; \ @@ -420,28 +442,6 @@ SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \ SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_SECOND, in_pcbstorage_destroy, &prot) -/* - * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group - * (or unique address:port combination) can be re-used at most - * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which - * is dynamically resized as processes bind/unbind to that specific group. 
- */ -struct inpcblbgroup { - CK_LIST_ENTRY(inpcblbgroup) il_list; - struct epoch_context il_epoch_ctx; - struct ucred *il_cred; - uint16_t il_lport; /* (c) */ - u_char il_vflag; /* (c) */ - uint8_t il_numa_domain; - uint32_t il_pad2; - union in_dependaddr il_dependladdr; /* (c) */ -#define il_laddr il_dependladdr.id46_addr.ia46_addr4 -#define il6_laddr il_dependladdr.id6_addr - uint32_t il_inpsiz; /* max count in il_inp[] (h) */ - uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ - struct inpcb *il_inp[]; /* (h) */ -}; - #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) @@ -571,7 +571,7 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ #define INP_RESERVED_0 0x10000000 /* reserved field */ -#define INP_RESERVED_1 0x20000000 /* reserved field */ +#define INP_BOUNDFIB 0x20000000 /* Bound to a specific FIB. */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ @@ -617,10 +617,11 @@ typedef enum { INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */ INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */ INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */ + INPLOOKUP_FIB = 0x00000008, /* inp must be from same FIB. */ } inp_lookup_t; #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ - INPLOOKUP_WLOCKPCB) + INPLOOKUP_WLOCKPCB | INPLOOKUP_FIB) #define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) @@ -658,20 +659,18 @@ void in_pcbstorage_destroy(void *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); -int in_pcbbind(struct inpcb *, struct sockaddr_in *, struct ucred *); +#define INPBIND_FIB 0x0001 /* bind to the PCB's FIB only */ +int in_pcbbind(struct inpcb *, struct sockaddr_in *, int, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr_in *, in_addr_t *, - u_short *, struct ucred *); -int in_pcbconnect(struct inpcb *, struct sockaddr_in *, struct ucred *, - bool); -int in_pcbconnect_setup(struct inpcb *, struct sockaddr_in *, in_addr_t *, - u_short *, in_addr_t *, u_short *, struct ucred *); + u_short *, int, struct ucred *); +int in_pcbconnect(struct inpcb *, struct sockaddr_in *, struct ucred *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); -int in_pcbinshash(struct inpcb *); -int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, +int in_pcbladdr(const struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); int in_pcblbgroup_numa(struct inpcb *, int arg); +void in_pcblisten(struct inpcb *); struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); @@ -679,11 +678,10 @@ struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbref(struct inpcb *); -void in_pcbrehash(struct inpcb *); -void in_pcbremhash_locked(struct inpcb *); bool in_pcbrele(struct inpcb *, inp_lookup_t); bool in_pcbrele_rlocked(struct inpcb *); bool in_pcbrele_wlocked(struct inpcb *); +bool in_pcbrele_rlock(struct inpcb *inp); typedef bool inp_match_t(const struct inpcb *, void *); 
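A minimal sketch of how a transport input path could opt into the new FIB-aware lookup; the function and argument names below are illustrative assumptions, only the in_pcblookup() call and the INPLOOKUP_* flags come from the patch (in_pcblookup() derives the FIB from if_getfib(ifp), in_pcblookup_mbuf() from M_GETFIB(m)):

	/*
	 * Sketch: restrict a wildcard lookup to PCBs bound to the FIB of the
	 * receive interface.  Ports are in network byte order, as taken from
	 * the transport header.
	 */
	static struct inpcb *
	example_demux(struct inpcbinfo *pcbinfo, const struct ip *ip,
	    uint16_t sport, uint16_t dport, struct ifnet *ifp)
	{
		return (in_pcblookup(pcbinfo, ip->ip_src, sport, ip->ip_dst,
		    dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB |
		    INPLOOKUP_FIB, ifp));
	}

Without INPLOOKUP_FIB the lookup behaves as before: fib is RT_ALL_FIBS and inc_fibnum is ignored by the match functions.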
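The struct inpcblbgroup below backs the SO_REUSEPORT_LB socket option: per the patch, a PCB that has bound with SO_REUSEPORT_LB but not yet called listen(2) sits on the new il_pending list and is only hashed into il_inp[] once in_pcblisten() moves it over. For context, a minimal user-space listener joining such a group looks like this (plain socket API, nothing in it is added by this patch):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <err.h>
	#include <stdint.h>
	#include <string.h>

	static int
	lb_listener(uint16_t port)
	{
		struct sockaddr_in sin;
		int s, one = 1;

		if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1)
			err(1, "socket");
		/* Must be set before bind() so the socket joins the group. */
		if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, &one,
		    sizeof(one)) == -1)
			err(1, "setsockopt");
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_port = htons(port);
		sin.sin_addr.s_addr = htonl(INADDR_ANY);
		if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
			err(1, "bind");
		/*
		 * Incoming connections are distributed across the listening
		 * members of the group by hashing the 4-tuple.
		 */
		if (listen(s, 128) == -1)
			err(1, "listen");
		return (s);
	}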
struct inpcb_iterator { diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h index 655fd03ee9ba..7e8a1626ab40 100644 --- a/sys/netinet/in_pcb_var.h +++ b/sys/netinet/in_pcb_var.h @@ -50,16 +50,37 @@ int inp_trylock(struct inpcb *inp, const inp_lookup_t lock); bool inp_smr_lock(struct inpcb *, const inp_lookup_t); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); -int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, +int in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags); -struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, - int, struct ucred *); +struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, + int, int, struct ucred *); +int in_pcbinshash(struct inpcb *); +void in_pcbrehash(struct inpcb *); +void in_pcbremhash_locked(struct inpcb *); -struct inpcbport { - struct inpcbhead phd_pcblist; - CK_LIST_ENTRY(inpcbport) phd_hash; - u_short phd_port; +/* + * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group + * (or unique address:port combination) can be re-used at most + * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which + * is dynamically resized as processes bind/unbind to that specific group. + */ +struct inpcblbgroup { + CK_LIST_ENTRY(inpcblbgroup) il_list; + LIST_HEAD(, inpcb) il_pending; /* PCBs waiting for listen() */ + struct epoch_context il_epoch_ctx; + struct ucred *il_cred; + uint16_t il_lport; /* (c) */ + u_char il_vflag; /* (c) */ + uint8_t il_numa_domain; + int il_fibnum; + union in_dependaddr il_dependladdr; /* (c) */ +#define il_laddr il_dependladdr.id46_addr.ia46_addr4 +#define il6_laddr il_dependladdr.id6_addr + uint32_t il_inpsiz; /* max count in il_inp[] (h) */ + uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ + uint32_t il_pendcnt; /* cur count in il_pending (h) */ + struct inpcb *il_inp[]; /* (h) */ }; #endif /* !_NETINET_IN_PCB_VAR_H_ */ diff --git a/sys/netinet/in_prot.c b/sys/netinet/in_prot.c index 204f4f60456e..69f0f3694096 100644 --- a/sys/netinet/in_prot.c +++ b/sys/netinet/in_prot.c @@ -26,21 +26,17 @@ */ /* - * System calls related to processes and protection + * Helpers related to visibility and protection of sockets and inpcb. */ -#include <sys/cdefs.h> -#include "opt_inet.h" -#include "opt_inet6.h" - -#include <sys/param.h> #include <sys/systm.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> +#include <sys/priv.h> #include <sys/proc.h> #include <sys/socket.h> -#include <sys/jail.h> #include <netinet/in.h> #include <netinet/in_pcb.h> @@ -72,3 +68,16 @@ cr_canseeinpcb(struct ucred *cred, struct inpcb *inp) return (0); } + +bool +cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp) +{ + int error; + + if (cr_canseeinpcb(td->td_ucred, inp) == 0 && + cr_xids_subset(td->td_ucred, inp->inp_cred)) + return (true); + error = priv_check(td, PRIV_NETINET_KTLSKEYS); + return (error == 0); + +} diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index aedfd0bc08c7..b8599143b991 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -71,7 +71,7 @@ rib4_set_nh_pfxflags(u_int fibnum, const struct sockaddr *addr, const struct soc * add these routes to support some cases with active-active * load balancing. Given that, retain this support. 
*/ - if (in_broadcast(addr4->sin_addr, nh->nh_ifp)) + if (in_ifnet_broadcast(addr4->sin_addr, nh->nh_ifp)) is_broadcast = true; } else if (mask4->sin_addr.s_addr == 0) nhop_set_pxtype_flag(nh, NHF_DEFAULT); diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h index 2750733335bb..e2f553ec461c 100644 --- a/sys/netinet/in_systm.h +++ b/sys/netinet/in_systm.h @@ -32,6 +32,8 @@ #ifndef _NETINET_IN_SYSTM_H_ #define _NETINET_IN_SYSTM_H_ +#include <sys/types.h> + /* * Miscellaneous internetwork * definitions for kernel. @@ -56,8 +58,10 @@ typedef u_int32_t n_time; /* ms since 00:00 UTC, byte rev */ #ifdef _KERNEL struct inpcb; struct ucred; +struct thread; int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp); +bool cr_canexport_ktlskeys(struct thread *td, struct inpcb *inp); uint32_t iptime(void); #endif diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h index 09d3cd050fc3..1f6f6edb9219 100644 --- a/sys/netinet/in_var.h +++ b/sys/netinet/in_var.h @@ -97,6 +97,11 @@ struct in_ifaddr { #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) +#ifdef _KERNEL +#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ + ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) +#endif + #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* @@ -454,6 +459,7 @@ int in_joingroup_locked(struct ifnet *, const struct in_addr *, int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); +int in_mask2len(struct in_addr *); int in_control(struct socket *, u_long, void *, struct ifnet *, struct thread *); int in_control_ioctl(u_long, void *, struct ifnet *, diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h index 8d205ba07cf5..6de41a7e79fa 100644 --- a/sys/netinet/ip.h +++ b/sys/netinet/ip.h @@ -33,7 +33,8 @@ #ifndef _NETINET_IP_H_ #define _NETINET_IP_H_ -#include <sys/cdefs.h> +#include <sys/types.h> +#include <netinet/in.h> /* * Definitions for internet protocol version 4. @@ -66,7 +67,7 @@ struct ip { u_char ip_p; /* protocol */ u_short ip_sum; /* checksum */ struct in_addr ip_src,ip_dst; /* source and dest address */ -} __packed __aligned(2); +} __packed; #define IP_MAXPACKET 65535 /* maximum packet size */ @@ -186,7 +187,7 @@ struct ip_timestamp { uint32_t ipt_time; /* network format */ } ipt_ta[1]; } ipt_timestamp; -}; +} __packed; /* Flag bits for ipt_flg. */ #define IPOPT_TS_TSONLY 0 /* timestamps only */ diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index ddbc13e7c878..d3d7957cf087 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -37,6 +37,7 @@ #include <sys/systm.h> #include <sys/devctl.h> #include <sys/jail.h> +#include <sys/kassert.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/malloc.h> @@ -95,7 +96,8 @@ static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ - struct sockaddr_dl sc_addr; /* Our link level address. */ + carp_version_t sc_version; /* carp or VRRPv3 */ + uint8_t sc_addr[ETHER_ADDR_LEN]; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. 
*/ @@ -106,11 +108,25 @@ struct carp_softc { struct mtx sc_mtx; int sc_vhid; - int sc_advskew; - int sc_advbase; - struct in_addr sc_carpaddr; - struct in6_addr sc_carpaddr6; - + union { + struct { /* sc_version == CARP_VERSION_CARP */ + int sc_advskew; + int sc_advbase; + struct in_addr sc_carpaddr; + struct in6_addr sc_carpaddr6; + uint64_t sc_counter; + bool sc_init_counter; +#define CARP_HMAC_PAD 64 + unsigned char sc_key[CARP_KEY_LEN]; + unsigned char sc_pad[CARP_HMAC_PAD]; + SHA1_CTX sc_sha1; + }; + struct { /* sc_version == CARP_VERSION_VRRPv3 */ + uint8_t sc_vrrp_prio; + uint16_t sc_vrrp_adv_inter; + uint16_t sc_vrrp_master_inter; + }; + }; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; @@ -121,15 +137,6 @@ struct carp_softc { int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 - int sc_init_counter; - uint64_t sc_counter; - - /* authentication */ -#define CARP_HMAC_PAD 64 - unsigned char sc_key[CARP_KEY_LEN]; - unsigned char sc_pad[CARP_HMAC_PAD]; - SHA1_CTX sc_sha1; - TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; @@ -166,6 +173,9 @@ struct carpkreq { /* Everything above this is identical to carpreq */ struct in_addr carpr_addr; struct in6_addr carpr_addr6; + carp_version_t carpr_version; + uint8_t carpr_vrrp_priority; + uint16_t carpr_vrrp_adv_inter; }; /* @@ -196,8 +206,6 @@ struct carpkreq { * * Known issues with locking: * - * - Sending ad, we put the pointer to the softc in an mtag, and no reference - * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ @@ -325,8 +333,9 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, 0 : ((sc)->sc_advskew + V_carp_demotion))) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t, int); +static void vrrp_input_c(struct mbuf *, int, sa_family_t, int, int, uint16_t); static struct carp_softc - *carp_alloc(struct ifnet *); + *carp_alloc(struct ifnet *, carp_version_t, int); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); @@ -337,8 +346,8 @@ static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *, const char* reason); -static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); +static void vrrp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); @@ -346,7 +355,7 @@ static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); -static LIST_HEAD(, carp_softc) carp_list; +static LIST_HEAD(, carp_softc) carp_list = LIST_HEAD_INITIALIZER(carp_list); static struct mtx carp_mtx; static struct sx carp_sx; static struct task carp_sendall_task = @@ -373,7 +382,7 @@ carp_is_supported_if(if_t ifp) static void carp_hmac_prepare(struct carp_softc *sc) { - uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + uint8_t version = CARP_VERSION_CARP, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; @@ -385,6 +394,7 @@ carp_hmac_prepare(struct carp_softc *sc) #endif CARP_LOCK_ASSERT(sc); + MPASS(sc->sc_version == CARP_VERSION_CARP); /* Compute ipad from key. 
*/ bzero(sc->sc_pad, sizeof(sc->sc_pad)); @@ -478,6 +488,22 @@ carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], return (bcmp(md, md2, sizeof(md2))); } +static int +vrrp_checksum_verify(struct mbuf *m, int off, int len, uint16_t phdrcksum) +{ + uint16_t cksum; + + /* + * Note that VRRPv3 checksums are different from CARP checksums. + * Carp just calculates the checksum over the packet. + * VRRPv3 includes the pseudo-header checksum as well. + */ + cksum = in_cksum_skip(m, off + len, off); + cksum -= phdrcksum; + + return (cksum); +} + /* * process input packet. * we have rearranged checks order compared to the rfc, @@ -488,9 +514,11 @@ static int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; - struct ip *ip = mtod(m, struct ip *); - struct carp_header *ch; - int iplen, len; + struct ip *ip; + struct vrrpv3_header *vh; + int iplen; + int minlen; + int totlen; iplen = *offp; *mp = NULL; @@ -502,60 +530,92 @@ carp_input(struct mbuf **mp, int *offp, int proto) return (IPPROTO_DONE); } - iplen = ip->ip_hl << 2; - - if (m->m_pkthdr.len < iplen + sizeof(*ch)) { + /* Ensure we have enough header to figure out the version. */ + if (m->m_pkthdr.len < iplen + sizeof(*vh)) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " + CARP_DEBUG("%s: received len %zd < sizeof(struct vrrpv3_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } - if (iplen + sizeof(*ch) < m->m_len) { - if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { + if (m->m_len < iplen + sizeof(*vh)) { + if ((m = m_pullup(m, iplen + sizeof(*vh))) == NULL) { CARPSTATS_INC(carps_hdrops); - CARP_DEBUG("%s: pullup failed\n", __func__); + CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); return (IPPROTO_DONE); } - ip = mtod(m, struct ip *); } - ch = (struct carp_header *)((char *)ip + iplen); + ip = mtod(m, struct ip *); + totlen = ntohs(ip->ip_len); + vh = (struct vrrpv3_header *)((char *)ip + iplen); - /* - * verify that the received packet length is - * equal to the CARP header - */ - len = iplen + sizeof(*ch); - if (len > m->m_pkthdr.len) { + switch (vh->vrrp_version) { + case CARP_VERSION_CARP: + minlen = sizeof(struct carp_header); + break; + case CARP_VERSION_VRRPv3: + minlen = sizeof(struct vrrpv3_header); + break; + default: + CARPSTATS_INC(carps_badver); + CARP_DEBUG("%s: unsupported version %d on %s\n", __func__, + vh->vrrp_version, if_name(m->m_pkthdr.rcvif)); + m_freem(m); + return (IPPROTO_DONE); + } + + /* And now check the length again but with the real minimal length. 
*/ + if (m->m_pkthdr.len < iplen + minlen) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("%s: packet too short %d on %s\n", __func__, - m->m_pkthdr.len, + CARP_DEBUG("%s: received len %zd < %d " + "on %s\n", __func__, m->m_len - sizeof(struct ip), + iplen + minlen, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } - if ((m = m_pullup(m, len)) == NULL) { - CARPSTATS_INC(carps_hdrops); - return (IPPROTO_DONE); + if (m->m_len < iplen + minlen) { + if ((m = m_pullup(m, iplen + minlen)) == NULL) { + CARPSTATS_INC(carps_hdrops); + CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); + return (IPPROTO_DONE); + } + ip = mtod(m, struct ip *); + vh = (struct vrrpv3_header *)((char *)ip + iplen); } - ip = mtod(m, struct ip *); - ch = (struct carp_header *)((char *)ip + iplen); - /* verify the CARP checksum */ - m->m_data += iplen; - if (in_cksum(m, len - iplen)) { - CARPSTATS_INC(carps_badsum); - CARP_DEBUG("%s: checksum failed on %s\n", __func__, - if_name(m->m_pkthdr.rcvif)); - m_freem(m); - return (IPPROTO_DONE); + switch (vh->vrrp_version) { + case CARP_VERSION_CARP: { + struct carp_header *ch; + + /* verify the CARP checksum */ + if (in_cksum_skip(m, totlen, iplen)) { + CARPSTATS_INC(carps_badsum); + CARP_DEBUG("%s: checksum failed on %s\n", __func__, + if_name(m->m_pkthdr.rcvif)); + m_freem(m); + break; + } + ch = (struct carp_header *)((char *)ip + iplen); + carp_input_c(m, ch, AF_INET, ip->ip_ttl); + break; + } + case CARP_VERSION_VRRPv3: { + uint16_t phdrcksum; + + phdrcksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htonl((u_short)(totlen - iplen) + ip->ip_p)); + vrrp_input_c(m, iplen, AF_INET, ip->ip_ttl, totlen - iplen, + phdrcksum); + break; + } + default: + KASSERT(false, ("Unsupported version %d", vh->vrrp_version)); } - m->m_data -= iplen; - carp_input_c(m, ch, AF_INET, ip->ip_ttl); return (IPPROTO_DONE); } #endif @@ -566,8 +626,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct carp_header *ch; - u_int len; + struct vrrpv3_header *vh; + u_int len, minlen; CARPSTATS_INC(carps_ipackets6); @@ -585,10 +645,9 @@ carp6_input(struct mbuf **mp, int *offp, int proto) return (IPPROTO_DONE); } - /* verify that we have a complete carp packet */ - if (m->m_len < *offp + sizeof(*ch)) { + if (m->m_len < *offp + sizeof(*vh)) { len = m->m_len; - m = m_pullup(m, *offp + sizeof(*ch)); + m = m_pullup(m, *offp + sizeof(*vh)); if (m == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); @@ -596,20 +655,73 @@ carp6_input(struct mbuf **mp, int *offp, int proto) } ip6 = mtod(m, struct ip6_hdr *); } - ch = (struct carp_header *)(mtod(m, char *) + *offp); + vh = (struct vrrpv3_header *)(mtod(m, char *) + *offp); - /* verify the CARP checksum */ - m->m_data += *offp; - if (in_cksum(m, sizeof(*ch))) { - CARPSTATS_INC(carps_badsum); - CARP_DEBUG("%s: checksum failed, on %s\n", __func__, + switch (vh->vrrp_version) { + case CARP_VERSION_CARP: + minlen = sizeof(struct carp_header); + break; + case CARP_VERSION_VRRPv3: + minlen = sizeof(struct vrrpv3_header); + break; + default: + CARPSTATS_INC(carps_badver); + CARP_DEBUG("%s: unsupported version %d on %s\n", __func__, + vh->vrrp_version, if_name(m->m_pkthdr.rcvif)); + m_freem(m); + return (IPPROTO_DONE); + } + + /* And now check the length again but with the real minimal length. 
*/ + if (m->m_pkthdr.len < sizeof(*ip6) + minlen) { + CARPSTATS_INC(carps_badlen); + CARP_DEBUG("%s: received len %zd < %zd " + "on %s\n", __func__, m->m_len - sizeof(struct ip), + sizeof(*ip6) + minlen, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } - m->m_data -= *offp; - carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim); + if (m->m_len < sizeof(*ip6) + minlen) { + if ((m = m_pullup(m, sizeof(*ip6) + minlen)) == NULL) { + CARPSTATS_INC(carps_hdrops); + CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); + return (IPPROTO_DONE); + } + ip6 = mtod(m, struct ip6_hdr *); + vh = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6)); + } + + switch (vh->vrrp_version) { + case CARP_VERSION_CARP: { + struct carp_header *ch; + + /* verify the CARP checksum */ + if (in_cksum_skip(m, *offp + sizeof(struct carp_header), + *offp)) { + CARPSTATS_INC(carps_badsum); + CARP_DEBUG("%s: checksum failed, on %s\n", __func__, + if_name(m->m_pkthdr.rcvif)); + m_freem(m); + break; + } + ch = (struct carp_header *)((char *)ip6 + sizeof(*ip6)); + carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim); + break; + } + case CARP_VERSION_VRRPv3: { + uint16_t phdrcksum; + + phdrcksum = in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen), + ip6->ip6_nxt, 0); + vrrp_input_c(m, sizeof(*ip6), AF_INET6, ip6->ip6_hlim, + ntohs(ip6->ip6_plen), phdrcksum); + break; + } + default: + KASSERT(false, ("Unsupported version %d", vh->vrrp_version)); + } return (IPPROTO_DONE); } #endif /* INET6 */ @@ -629,7 +741,7 @@ carp6_input(struct mbuf **mp, int *offp, int proto) * The VHID test is outside this mini-function. */ static int -carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) +carp_source_is_self(const struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; @@ -659,16 +771,12 @@ carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) return (0); } -static void -carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) +static struct ifaddr * +carp_find_ifa(const struct mbuf *m, sa_family_t af, uint8_t vhid) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; - struct carp_softc *sc; - uint64_t tmp_counter; - struct timeval sc_tv, ch_tv; int error; - bool multicast = false; NET_EPOCH_ASSERT(); @@ -688,9 +796,9 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && - ifa->ifa_carp->sc_vhid == ch->carp_vhid) + ifa->ifa_carp->sc_vhid == vhid) match = ifa; - if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af)) + if (vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; @@ -705,12 +813,37 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) } else { CARPSTATS_INC(carps_badvhid); } + } + + return (ifa); +} + +static void +carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ifaddr *ifa; + struct carp_softc *sc; + uint64_t tmp_counter; + struct timeval sc_tv, ch_tv; + bool multicast = false; + + NET_EPOCH_ASSERT(); + MPASS(ch->carp_version == CARP_VERSION_CARP); + + ifa = carp_find_ifa(m, af, ch->carp_vhid); + if (ifa == NULL) { m_freem(m); return; } + sc = ifa->ifa_carp; + CARP_LOCK(sc); + /* verify the CARP version. 
*/ - if (ch->carp_version != CARP_VERSION) { + if (sc->sc_version != CARP_VERSION_CARP) { + CARP_UNLOCK(sc); + CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), ch->carp_version); @@ -719,10 +852,8 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) return; } - sc = ifa->ifa_carp; - CARP_LOCK(sc); if (ifa->ifa_addr->sa_family == AF_INET) { - multicast = IN_MULTICAST(sc->sc_carpaddr.s_addr); + multicast = IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)); } else { multicast = IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6); } @@ -749,7 +880,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) /* XXX Replay protection goes here */ - sc->sc_init_counter = 0; + sc->sc_init_counter = false; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; @@ -809,11 +940,133 @@ out: m_freem(m); } +static void +vrrp_input_c(struct mbuf *m, int off, sa_family_t af, int ttl, + int len, uint16_t phdrcksum) +{ + struct vrrpv3_header *vh = mtodo(m, off); + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ifaddr *ifa; + struct carp_softc *sc; + + NET_EPOCH_ASSERT(); + MPASS(vh->vrrp_version == CARP_VERSION_VRRPv3); + + ifa = carp_find_ifa(m, af, vh->vrrp_vrtid); + if (ifa == NULL) { + m_freem(m); + return; + } + + sc = ifa->ifa_carp; + CARP_LOCK(sc); + + ifa_free(ifa); + + /* verify the CARP version. */ + if (sc->sc_version != CARP_VERSION_VRRPv3) { + CARP_UNLOCK(sc); + + CARPSTATS_INC(carps_badver); + CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), + vh->vrrp_version); + m_freem(m); + return; + } + + /* verify that the IP TTL is 255. */ + if (ttl != CARP_DFLTTL) { + CARPSTATS_INC(carps_badttl); + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, + ttl, if_name(m->m_pkthdr.rcvif)); + goto out; + } + + if (vrrp_checksum_verify(m, off, len, phdrcksum)) { + CARPSTATS_INC(carps_badsum); + CARP_DEBUG("%s: incorrect checksum for VRID %u@%s\n", __func__, + sc->sc_vhid, if_name(ifp)); + goto out; + } + + /* RFC9568, 7.1 Receiving VRRP packets. */ + if (sc->sc_vrrp_prio == 255) { + CARP_DEBUG("%s: our priority is 255. Ignore peer announcement.\n", + __func__); + goto out; + } + + /* XXX TODO Check IP address payload. */ + + sc->sc_vrrp_master_inter = ntohs(vh->vrrp_max_adver_int); + + switch (sc->sc_state) { + case INIT: + break; + case MASTER: + /* + * If we receive an advertisement from a master who's going to + * be more frequent than us, go into BACKUP state. + * Same if the peer has a higher priority than us. + */ + if (ntohs(vh->vrrp_max_adver_int) < sc->sc_vrrp_adv_inter || + vh->vrrp_priority > sc->sc_vrrp_prio) { + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP, + "more frequent advertisement received"); + carp_setrun(sc, 0); + carp_delroute(sc); + } + break; + case BACKUP: + /* + * If we're pre-empting masters who advertise slower than us, + * and this one claims to be slower, treat him as down. + */ + if (V_carp_preempt && (ntohs(vh->vrrp_max_adver_int) > sc->sc_vrrp_adv_inter + || vh->vrrp_priority < sc->sc_vrrp_prio)) { + carp_master_down_locked(sc, + "preempting a slower master"); + break; + } + + /* + * Otherwise, we reset the counter and wait for the next + * advertisement. 
+ */ + carp_setrun(sc, af); + break; + } + +out: + CARP_UNLOCK(sc); + m_freem(m); +} + static int -carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) +carp_tag(struct carp_softc *sc, struct mbuf *m) { struct m_tag *mtag; + /* Tag packet for carp_output */ + if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(sc->sc_vhid), + M_NOWAIT)) == NULL) { + m_freem(m); + CARPSTATS_INC(carps_onomem); + return (ENOMEM); + } + bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid)); + m_tag_prepend(m, mtag); + + return (0); +} + +static void +carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) +{ + + MPASS(sc->sc_version == CARP_VERSION_CARP); + if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); @@ -826,18 +1079,19 @@ carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); +} - /* Tag packet for carp_output */ - if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), - M_NOWAIT)) == NULL) { - m_freem(m); - CARPSTATS_INC(carps_onomem); - return (ENOMEM); +static inline void +send_ad_locked(struct carp_softc *sc) +{ + switch (sc->sc_version) { + case CARP_VERSION_CARP: + carp_send_ad_locked(sc); + break; + case CARP_VERSION_VRRPv3: + vrrp_send_ad_locked(sc); + break; } - bcopy(&sc, mtag + 1, sizeof(sc)); - m_tag_prepend(m, mtag); - - return (0); } /* @@ -856,7 +1110,7 @@ carp_send_ad_all(void *ctx __unused, int pending __unused) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); - carp_send_ad_locked(sc); + send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } @@ -866,7 +1120,7 @@ carp_send_ad_all(void *ctx __unused, int pending __unused) /* Send a periodic advertisement, executed in callout context. 
*/ static void -carp_send_ad(void *v) +carp_callout(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; @@ -874,7 +1128,7 @@ carp_send_ad(void *v) NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); - carp_send_ad_locked(sc); + send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); @@ -958,12 +1212,13 @@ carp_send_ad_locked(struct carp_softc *sc) NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); + MPASS(sc->sc_version == CARP_VERSION_CARP); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; - ch.carp_version = CARP_VERSION; + ch.carp_version = CARP_VERSION_CARP; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; @@ -988,7 +1243,7 @@ carp_send_ad_locked(struct carp_softc *sc) m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); - if (IN_MULTICAST(sc->sc_carpaddr.s_addr)) + if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr))) m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; @@ -999,7 +1254,7 @@ carp_send_ad_locked(struct carp_softc *sc) ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; - ip_fillid(ip); + ip_fillid(ip, V_ip_random_id); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { @@ -1012,7 +1267,9 @@ carp_send_ad_locked(struct carp_softc *sc) ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); - if (carp_prepare_ad(m, sc, ch_ptr)) + carp_prepare_ad(m, sc, ch_ptr); + if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)) && + carp_tag(sc, m) != 0) goto resched; m->m_data += sizeof(*ip); @@ -1072,7 +1329,9 @@ carp_send_ad_locked(struct carp_softc *sc) ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); - if (carp_prepare_ad(m, sc, ch_ptr)) + carp_prepare_ad(m, sc, ch_ptr); + if (IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6) && + carp_tag(sc, m) != 0) goto resched; m->m_data += sizeof(*ip6); @@ -1087,7 +1346,188 @@ carp_send_ad_locked(struct carp_softc *sc) #endif /* INET6 */ resched: - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_callout, sc); +} + +static void +vrrp_send_ad_locked(struct carp_softc *sc) +{ + struct vrrpv3_header *vh_ptr; + struct ifaddr *ifa; + struct mbuf *m; + int len; + struct vrrpv3_header vh = { + .vrrp_version = CARP_VERSION_VRRPv3, + .vrrp_type = VRRP_TYPE_ADVERTISEMENT, + .vrrp_vrtid = sc->sc_vhid, + .vrrp_priority = sc->sc_vrrp_prio, + .vrrp_count_addr = 0, + .vrrp_max_adver_int = htons(sc->sc_vrrp_adv_inter), + .vrrp_checksum = 0, + }; + + NET_EPOCH_ASSERT(); + CARP_LOCK_ASSERT(sc); + MPASS(sc->sc_version == CARP_VERSION_VRRPv3); + +#ifdef INET + if (sc->sc_naddrs) { + struct ip *ip; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + CARPSTATS_INC(carps_onomem); + goto resched; + } + len = sizeof(*ip) + sizeof(vh); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + M_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; + ip->ip_off = htons(IP_DF); + ip->ip_ttl = CARP_DFLTTL; + ip->ip_p = IPPROTO_CARP; + ip->ip_sum = 0; + ip_fillid(ip, V_ip_random_id); + + ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); + if (ifa != NULL) { + ip->ip_src.s_addr = + ifatoia(ifa)->ia_addr.sin_addr.s_addr; + ifa_free(ifa); + } else + ip->ip_src.s_addr = 0; + ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); + + /* Include the IP addresses 
in the announcement. */ + for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { + struct sockaddr_in *in; + + MPASS(sc->sc_ifas[i] != NULL); + if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET) + continue; + + in = (struct sockaddr_in *)sc->sc_ifas[i]->ifa_addr; + + if (m_append(m, sizeof(in->sin_addr), + (caddr_t)&in->sin_addr) != 1) { + m_freem(m); + goto resched; + } + + vh.vrrp_count_addr++; + len += sizeof(in->sin_addr); + } + ip->ip_len = htons(len); + + vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip)); + bcopy(&vh, vh_ptr, sizeof(vh)); + + vh_ptr->vrrp_checksum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htonl((uint16_t)(len - sizeof(*ip)) + ip->ip_p)); + vh_ptr->vrrp_checksum = in_cksum_skip(m, len, sizeof(*ip)); + + if (carp_tag(sc, m)) + goto resched; + + CARPSTATS_INC(carps_opackets); + + carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, + &sc->sc_carpdev->if_carp->cif_imo, NULL)); + } +#endif +#ifdef INET6 + if (sc->sc_naddrs6) { + struct ip6_hdr *ip6; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + CARPSTATS_INC(carps_onomem); + goto resched; + } + len = sizeof(*ip6) + sizeof(vh); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + M_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip6 = mtod(m, struct ip6_hdr *); + bzero(ip6, sizeof(*ip6)); + ip6->ip6_vfc |= IPV6_VERSION; + /* Traffic class isn't defined in ip6 struct instead + * it gets offset into flowid field */ + ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + + IPTOS_DSCP_OFFSET)); + ip6->ip6_hlim = CARP_DFLTTL; + ip6->ip6_nxt = IPPROTO_CARP; + + /* set the source address */ + ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); + if (ifa != NULL) { + bcopy(IFA_IN6(ifa), &ip6->ip6_src, + sizeof(struct in6_addr)); + ifa_free(ifa); + } else + /* This should never happen with IPv6. */ + bzero(&ip6->ip6_src, sizeof(struct in6_addr)); + + /* Set the multicast destination. */ + bzero(&ip6->ip6_dst, sizeof(ip6->ip6_dst)); + ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; + ip6->ip6_dst.s6_addr8[15] = 0x12; + + /* Include the IP addresses in the announcement. 
*/ + len = sizeof(vh); + for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { + struct sockaddr_in6 *in6; + + MPASS(sc->sc_ifas[i] != NULL); + if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET6) + continue; + + in6 = (struct sockaddr_in6 *)sc->sc_ifas[i]->ifa_addr; + + if (m_append(m, sizeof(in6->sin6_addr), + (char *)&in6->sin6_addr) != 1) { + m_freem(m); + goto resched; + } + + vh.vrrp_count_addr++; + len += sizeof(in6->sin6_addr); + } + ip6->ip6_plen = htonl(len); + + vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6)); + bcopy(&vh, vh_ptr, sizeof(vh)); + + vh_ptr->vrrp_checksum = in6_cksum_pseudo(ip6, len, ip6->ip6_nxt, 0); + vh_ptr->vrrp_checksum = in_cksum_skip(m, len + sizeof(*ip6), sizeof(*ip6)); + + if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { + m_freem(m); + CARP_DEBUG("%s: in6_setscope failed\n", __func__); + goto resched; + } + + if (carp_tag(sc, m)) + goto resched; + CARPSTATS_INC(carps_opackets6); + + carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, + &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); + } +#endif + +resched: + callout_reset(&sc->sc_ad_tmo, sc->sc_vrrp_adv_inter * hz / 100, + carp_callout, sc); } static void @@ -1178,7 +1618,7 @@ carp_send_arp(struct carp_softc *sc) if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; - arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); + arp_announce_ifaddr(sc->sc_carpdev, addr, sc->sc_addr); } } @@ -1188,7 +1628,7 @@ carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { - *enaddr = LLADDR(&sc->sc_addr); + *enaddr = sc->sc_addr; return (1); } @@ -1246,6 +1686,7 @@ char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; + char *mac = NULL; NET_EPOCH_ASSERT(); @@ -1256,18 +1697,26 @@ carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_CARP, - sizeof(struct carp_softc *), M_NOWAIT); - if (mtag == NULL) - /* Better a bit than nothing. */ - return (LLADDR(&sc->sc_addr)); + sizeof(sc->sc_vhid) + sizeof(sc->sc_addr), + M_NOWAIT); + if (mtag == NULL) { + CARPSTATS_INC(carps_onomem); + break; + } + /* carp_output expects sc_vhid first. */ + bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid)); + /* + * Save sc_addr into mtag data after sc_vhid to avoid + * possible access to destroyed softc. + */ + mac = (char *)(mtag + 1) + sizeof(sc->sc_vhid); + bcopy(sc->sc_addr, mac, sizeof(sc->sc_addr)); - bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); - - return (LLADDR(&sc->sc_addr)); + break; } - return (NULL); + return (mac); } #endif /* INET6 */ @@ -1286,7 +1735,7 @@ carp_forus(struct ifnet *ifp, u_char *dhost) * CARP_LOCK() is not here, since would protect nothing, but * cause deadlock with if_bridge, calling this under its lock. 
*/ - if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), + if (sc->sc_state == MASTER && !bcmp(dhost, sc->sc_addr, ETHER_ADDR_LEN)) { CIF_UNLOCK(ifp->if_carp); return (1); @@ -1327,7 +1776,7 @@ carp_master_down_locked(struct carp_softc *sc, const char *reason) switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); - carp_send_ad_locked(sc); + send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif @@ -1357,6 +1806,7 @@ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; + int timeout; CARP_LOCK_ASSERT(sc); @@ -1373,40 +1823,63 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) break; case BACKUP: callout_stop(&sc->sc_ad_tmo); - tv.tv_sec = 3 * sc->sc_advbase; - tv.tv_usec = sc->sc_advskew * 1000000 / 256; + + switch (sc->sc_version) { + case CARP_VERSION_CARP: + tv.tv_sec = 3 * sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + timeout = tvtohz(&tv); + break; + case CARP_VERSION_VRRPv3: + /* skew time */ + timeout = (256 - sc->sc_vrrp_prio) * + sc->sc_vrrp_master_inter / 256; + timeout += (3 * sc->sc_vrrp_master_inter); + timeout *= hz; + timeout /= 100; /* master interval is in centiseconds */ + break; + } switch (af) { #ifdef INET case AF_INET: - callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + callout_reset(&sc->sc_md_tmo, timeout, carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: - callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + callout_reset(&sc->sc_md6_tmo, timeout, carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) - callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + callout_reset(&sc->sc_md_tmo, timeout, carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) - callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + callout_reset(&sc->sc_md6_tmo, timeout, carp_master_down, sc); #endif break; } break; case MASTER: - tv.tv_sec = sc->sc_advbase; - tv.tv_usec = sc->sc_advskew * 1000000 / 256; - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + switch (sc->sc_version) { + case CARP_VERSION_CARP: + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_callout, sc); + break; + case CARP_VERSION_VRRPv3: + callout_reset(&sc->sc_ad_tmo, + sc->sc_vrrp_adv_inter * hz / 100, + carp_callout, sc); + break; + } break; } } @@ -1559,7 +2032,7 @@ int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; - struct carp_softc *sc; + int vhid; if (!sa) return (0); @@ -1581,20 +2054,7 @@ carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) if (mtag == NULL) return (0); - bcopy(mtag + 1, &sc, sizeof(sc)); - - switch (sa->sa_family) { - case AF_INET: - if (! IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr))) - return (0); - break; - case AF_INET6: - if (! IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6)) - return (0); - break; - default: - panic("Unknown af"); - } + bcopy(mtag + 1, &vhid, sizeof(vhid)); /* Set the source MAC address to the Virtual Router MAC Address. 
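For readers less familiar with RFC 9568 timer terms, the BACKUP-state timeout assembled above is the Master_Down_Interval: Skew_Time = ((256 - Priority) * Master_Adver_Interval) / 256 plus three advertisement intervals, all in centiseconds, converted to callout ticks at the end. A minimal restatement as a helper (vrrp_master_down_ticks() is purely illustrative and not part of the change):

	static int
	vrrp_master_down_ticks(uint8_t prio, uint16_t master_inter)
	{
		int timeout;

		timeout = (256 - prio) * master_inter / 256;	/* Skew_Time */
		timeout += 3 * master_inter;	/* 3 * Master_Adver_Interval */
		return (timeout * hz / 100);	/* centiseconds -> ticks */
	}

With the defaults set in carp_alloc() later in this diff (priority 100, interval 100 cs) this gives 60 + 300 = 360 cs, i.e. a backup declares the master down after roughly 3.6 seconds of silence.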
*/ switch (ifp->if_type) { @@ -1609,7 +2069,7 @@ carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; - eh->ether_shost[5] = sc->sc_vhid; + eh->ether_shost[5] = vhid; } break; default: @@ -1622,7 +2082,7 @@ carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) } static struct carp_softc* -carp_alloc(struct ifnet *ifp) +carp_alloc(struct ifnet *ifp, carp_version_t version, int vhid) { struct carp_softc *sc; struct carp_if *cif; @@ -1632,20 +2092,31 @@ carp_alloc(struct ifnet *ifp) if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); - sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); - - sc->sc_advbase = CARP_DFLTINTV; - sc->sc_vhid = -1; /* required setting */ - sc->sc_init_counter = 1; - sc->sc_state = INIT; - - sc->sc_ifasiz = sizeof(struct ifaddr *); + sc = malloc(sizeof(*sc), M_CARP, M_WAITOK); + *sc = (struct carp_softc ){ + .sc_vhid = vhid, + .sc_version = version, + .sc_state = INIT, + .sc_carpdev = ifp, + .sc_ifasiz = sizeof(struct ifaddr *), + .sc_addr = { 0, 0, 0x5e, 0, 1, vhid }, + }; sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); - sc->sc_carpdev = ifp; - sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP); - sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; - sc->sc_carpaddr6.s6_addr8[15] = 0x12; + switch (version) { + case CARP_VERSION_CARP: + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_init_counter = true; + sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP); + sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; + sc->sc_carpaddr6.s6_addr8[15] = 0x12; + break; + case CARP_VERSION_VRRPv3: + sc->sc_vrrp_adv_inter = 100; + sc->sc_vrrp_master_inter = sc->sc_vrrp_adv_inter; + sc->sc_vrrp_prio = 100; + break; + } CARP_LOCK_INIT(sc); #ifdef INET @@ -1770,12 +2241,19 @@ carp_carprcp(void *arg, struct carp_softc *sc, int priv) CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; - carpr->carpr_advbase = sc->sc_advbase; - carpr->carpr_advskew = sc->sc_advskew; - if (priv) - bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); - else - bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); + switch (sc->sc_version) { + case CARP_VERSION_CARP: + carpr->carpr_advbase = sc->sc_advbase; + carpr->carpr_advskew = sc->sc_advskew; + if (priv) + bcopy(sc->sc_key, carpr->carpr_key, + sizeof(carpr->carpr_key)); + else + bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); + break; + case CARP_VERSION_VRRPv3: + break; + } CARP_UNLOCK(sc); return (true); @@ -1788,9 +2266,21 @@ carp_ioctl_set(if_t ifp, struct carpkreq *carpr) struct carp_softc *sc = NULL; int error = 0; + if (carpr->carpr_vhid <= 0 || carpr->carpr_vhid > CARP_MAXVHID) + return (EINVAL); - if (carpr->carpr_vhid <= 0 || carpr->carpr_vhid > CARP_MAXVHID || - carpr->carpr_advbase < 0 || carpr->carpr_advskew < 0) { + switch (carpr->carpr_version) { + case CARP_VERSION_CARP: + if (carpr->carpr_advbase != 0 && (carpr->carpr_advbase > 255 || + carpr->carpr_advbase < CARP_DFLTINTV)) + return (EINVAL); + if (carpr->carpr_advskew < 0 || carpr->carpr_advskew >= 255) + return (EINVAL); + break; + case CARP_VERSION_VRRPv3: + /* XXXGL: shouldn't we check anything? 
*/ + break; + default: return (EINVAL); } @@ -1799,41 +2289,37 @@ carp_ioctl_set(if_t ifp, struct carpkreq *carpr) if (sc->sc_vhid == carpr->carpr_vhid) break; } - if (sc == NULL) { - sc = carp_alloc(ifp); - CARP_LOCK(sc); - sc->sc_vhid = carpr->carpr_vhid; - LLADDR(&sc->sc_addr)[0] = 0; - LLADDR(&sc->sc_addr)[1] = 0; - LLADDR(&sc->sc_addr)[2] = 0x5e; - LLADDR(&sc->sc_addr)[3] = 0; - LLADDR(&sc->sc_addr)[4] = 1; - LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; - } else - CARP_LOCK(sc); - if (carpr->carpr_advbase > 0) { - if (carpr->carpr_advbase > 255 || - carpr->carpr_advbase < CARP_DFLTINTV) { - error = EINVAL; - goto out; + + if (sc == NULL) + sc = carp_alloc(ifp, carpr->carpr_version, carpr->carpr_vhid); + else if (sc->sc_version != carpr->carpr_version) + return (EINVAL); + + CARP_LOCK(sc); + switch (sc->sc_version) { + case CARP_VERSION_CARP: + if (carpr->carpr_advbase != 0) + sc->sc_advbase = carpr->carpr_advbase; + sc->sc_advskew = carpr->carpr_advskew; + if (carpr->carpr_addr.s_addr != INADDR_ANY) + sc->sc_carpaddr = carpr->carpr_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&carpr->carpr_addr6)) { + memcpy(&sc->sc_carpaddr6, &carpr->carpr_addr6, + sizeof(sc->sc_carpaddr6)); } - sc->sc_advbase = carpr->carpr_advbase; - } - if (carpr->carpr_advskew >= 255) { - error = EINVAL; - goto out; - } - sc->sc_advskew = carpr->carpr_advskew; - if (carpr->carpr_addr.s_addr != INADDR_ANY) - sc->sc_carpaddr = carpr->carpr_addr; - if (! IN6_IS_ADDR_UNSPECIFIED(&carpr->carpr_addr6)) { - memcpy(&sc->sc_carpaddr6, &carpr->carpr_addr6, - sizeof(sc->sc_carpaddr6)); - } - if (carpr->carpr_key[0] != '\0') { - bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key)); - carp_hmac_prepare(sc); + if (carpr->carpr_key[0] != '\0') { + bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key)); + carp_hmac_prepare(sc); + } + break; + case CARP_VERSION_VRRPv3: + if (carpr->carpr_vrrp_priority != 0) + sc->sc_vrrp_prio = carpr->carpr_vrrp_priority; + if (carpr->carpr_vrrp_adv_inter) + sc->sc_vrrp_adv_inter = carpr->carpr_vrrp_adv_inter; + break; } + if (sc->sc_state != INIT && carpr->carpr_state != sc->sc_state) { switch (carpr->carpr_state) { @@ -1854,8 +2340,6 @@ carp_ioctl_set(if_t ifp, struct carpkreq *carpr) break; } } - -out: CARP_UNLOCK(sc); return (error); @@ -1910,7 +2394,9 @@ int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; - struct carpkreq carprk = { }; + struct carpkreq carprk = { + .carpr_version = CARP_VERSION_CARP, + }; struct ifnet *ifp; int error = 0; @@ -2034,7 +2520,8 @@ carp_attach(struct ifaddr *ifa, int vhid) CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; - carp_hmac_prepare(sc); + if (sc->sc_version == CARP_VERSION_CARP) + carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); @@ -2087,7 +2574,8 @@ carp_detach(struct ifaddr *ifa, bool keep_cif) ifa->ifa_carp = NULL; ifa_free(ifa); - carp_hmac_prepare(sc); + if (sc->sc_version == CARP_VERSION_CARP) + carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) @@ -2279,13 +2767,23 @@ carp_nl_send(void *arg, struct carp_softc *sc, int priv) nlattr_add_u32(nw, CARP_NL_VHID, sc->sc_vhid); nlattr_add_u32(nw, CARP_NL_STATE, sc->sc_state); - nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase); - nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew); - nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr); - nlattr_add_in6_addr(nw, CARP_NL_ADDR6, &sc->sc_carpaddr6); - - if (priv) - nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key), sc->sc_key); + nlattr_add_u8(nw, 
CARP_NL_VERSION, sc->sc_version); + switch (sc->sc_version) { + case CARP_VERSION_CARP: + nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase); + nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew); + nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr); + nlattr_add_in6_addr(nw, CARP_NL_ADDR6, &sc->sc_carpaddr6); + if (priv) + nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key), + sc->sc_key); + break; + case CARP_VERSION_VRRPv3: + nlattr_add_u8(nw, CARP_NL_VRRP_PRIORITY, sc->sc_vrrp_prio); + nlattr_add_u16(nw, CARP_NL_VRRP_ADV_INTER, + sc->sc_vrrp_adv_inter); + break; + } CARP_UNLOCK(sc); @@ -2307,11 +2805,12 @@ struct nl_carp_parsed { char key[CARP_KEY_LEN]; struct in_addr addr; struct in6_addr addr6; + carp_version_t version; + uint8_t vrrp_prio; + uint16_t vrrp_adv_inter; }; -#define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_carp_parsed, _field) - static const struct nlattr_parser nla_p_set[] = { { .type = CARP_NL_VHID, .off = _OUT(vhid), .cb = nlattr_get_uint32 }, { .type = CARP_NL_STATE, .off = _OUT(state), .cb = nlattr_get_uint32 }, @@ -2322,11 +2821,11 @@ static const struct nlattr_parser nla_p_set[] = { { .type = CARP_NL_ADDR, .off = _OUT(addr), .cb = nlattr_get_in_addr }, { .type = CARP_NL_ADDR6, .off = _OUT(addr6), .cb = nlattr_get_in6_addr }, { .type = CARP_NL_IFNAME, .off = _OUT(ifname), .cb = nlattr_get_string }, + { .type = CARP_NL_VERSION, .off = _OUT(version), .cb = nlattr_get_uint8 }, + { .type = CARP_NL_VRRP_PRIORITY, .off = _OUT(vrrp_prio), .cb = nlattr_get_uint8 }, + { .type = CARP_NL_VRRP_ADV_INTER, .off = _OUT(vrrp_adv_inter), .cb = nlattr_get_uint16 }, }; -static const struct nlfield_parser nlf_p_set[] = { -}; -NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_set, nla_p_set); -#undef _IN +NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_empty, nla_p_set); #undef _OUT @@ -2393,12 +2892,24 @@ carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) return (EINVAL); if (attrs.state > CARP_MAXSTATE) return (EINVAL); - if (attrs.advbase < 0 || attrs.advskew < 0) - return (EINVAL); - if (attrs.advbase > 255) - return (EINVAL); - if (attrs.advskew >= 255) + if (attrs.version == 0) /* compat with pre-VRRPv3 */ + attrs.version = CARP_VERSION_CARP; + switch (attrs.version) { + case CARP_VERSION_CARP: + if (attrs.advbase < 0 || attrs.advskew < 0) + return (EINVAL); + if (attrs.advbase > 255) + return (EINVAL); + if (attrs.advskew >= 255) + return (EINVAL); + break; + case CARP_VERSION_VRRPv3: + if (attrs.vrrp_adv_inter > VRRP_MAX_INTERVAL) + return (EINVAL); + break; + default: return (EINVAL); + } NET_EPOCH_ENTER(et); if (attrs.ifname != NULL) @@ -2418,12 +2929,20 @@ carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) carpr.carpr_count = 1; carpr.carpr_vhid = attrs.vhid; carpr.carpr_state = attrs.state; - carpr.carpr_advbase = attrs.advbase; - carpr.carpr_advskew = attrs.advskew; - carpr.carpr_addr = attrs.addr; - carpr.carpr_addr6 = attrs.addr6; - - memcpy(&carpr.carpr_key, &attrs.key, sizeof(attrs.key)); + carpr.carpr_version = attrs.version; + switch (attrs.version) { + case CARP_VERSION_CARP: + carpr.carpr_advbase = attrs.advbase; + carpr.carpr_advskew = attrs.advskew; + carpr.carpr_addr = attrs.addr; + carpr.carpr_addr6 = attrs.addr6; + memcpy(&carpr.carpr_key, &attrs.key, sizeof(attrs.key)); + break; + case CARP_VERSION_VRRPv3: + carpr.carpr_vrrp_priority = attrs.vrrp_prio; + carpr.carpr_vrrp_adv_inter = attrs.vrrp_adv_inter; + break; + } sx_xlock(&carp_sx); error = carp_ioctl_set(ifp, &carpr); @@ -2457,26 
+2976,25 @@ static const struct genl_cmd carp_cmds[] = { }, }; +static uint16_t carp_family_id; static void carp_nl_register(void) { bool ret __diagused; - int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); - family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, + carp_family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, CARP_NL_CMD_MAX); - MPASS(family_id != 0); + MPASS(carp_family_id != 0); - ret = genl_register_cmds(CARP_NL_FAMILY_NAME, carp_cmds, - NL_ARRAY_LEN(carp_cmds)); + ret = genl_register_cmds(carp_family_id, carp_cmds, nitems(carp_cmds)); MPASS(ret); } static void carp_nl_unregister(void) { - genl_unregister_family(CARP_NL_FAMILY_NAME); + genl_unregister_family(carp_family_id); } static void @@ -2525,7 +3043,6 @@ carp_mod_load(void) mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); - LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; diff --git a/sys/netinet/ip_carp.h b/sys/netinet/ip_carp.h index 0c22e9434797..dc3d9a68b43b 100644 --- a/sys/netinet/ip_carp.h +++ b/sys/netinet/ip_carp.h @@ -31,6 +31,7 @@ #ifndef _IP_CARP_H #define _IP_CARP_H +#ifdef _KERNEL /* * The CARP header layout is as follows: * @@ -77,14 +78,53 @@ struct carp_header { unsigned char carp_md[20]; /* SHA1 HMAC */ } __packed; -#ifdef CTASSERT CTASSERT(sizeof(struct carp_header) == 36); + +/* + * The VRRPv3 header layout is as follows: + * See RFC9568, 5.1. VRRP Packet Format + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Version| Type | Virtual Rtr ID| Priority |Count IPvX Addr| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |(rsvd) | Max Adver Int | Checksum | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * + + + * | IPvX Address(es) | + * + + + * + + + * + + + * + + + * | | + * + + + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ + +struct vrrpv3_header { +#if BYTE_ORDER == LITTLE_ENDIAN + uint8_t vrrp_type:4, + vrrp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + uint8_t vrrp_version:4, + vrrp_type:4; #endif + uint8_t vrrp_vrtid; + uint8_t vrrp_priority; + uint8_t vrrp_count_addr; + uint16_t vrrp_max_adver_int; + uint16_t vrrp_checksum; +} __packed; -#define CARP_DFLTTL 255 +CTASSERT(sizeof(struct vrrpv3_header) == 8); +#endif /* _KERNEL */ -/* carp_version */ -#define CARP_VERSION 2 +#define CARP_DFLTTL 255 /* carp_type */ #define CARP_ADVERTISEMENT 0x01 @@ -94,6 +134,8 @@ CTASSERT(sizeof(struct carp_header) == 36); /* carp_advbase */ #define CARP_DFLTINTV 1 +#define VRRP_TYPE_ADVERTISEMENT 0x01 +#define VRRP_MAX_INTERVAL (0x1000 - 1) /* * Statistics. 
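Since the on-wire Max Adver Int field is only 12 bits wide and counts centiseconds, the cap above works out to 4095 cs, just under 41 seconds; the VRRPv3 default programmed in carp_alloc() is 100 cs, i.e. one advertisement per second. A compile-time restatement in the CTASSERT style this header already uses (illustrative only):

	CTASSERT(VRRP_MAX_INTERVAL == 4095);	/* 0x1000 - 1, ~40.95 s */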
*/ @@ -136,6 +178,11 @@ struct carpreq { #define SIOCSVH _IOWR('i', 245, struct ifreq) #define SIOCGVH _IOWR('i', 246, struct ifreq) +typedef enum carp_version { + CARP_VERSION_CARP = 2, + CARP_VERSION_VRRPv3 = 3, +} carp_version_t; + #ifdef _KERNEL int carp_ioctl(struct ifreq *, u_long, struct thread *); int carp_attach(struct ifaddr *, int); diff --git a/sys/netinet/ip_carp_nl.h b/sys/netinet/ip_carp_nl.h index 89720af3e0dc..de4c0367c1d3 100644 --- a/sys/netinet/ip_carp_nl.h +++ b/sys/netinet/ip_carp_nl.h @@ -32,6 +32,9 @@ enum carp_nl_type_t { CARP_NL_ADDR = 7, /* in_addr_t */ CARP_NL_ADDR6 = 8, /* in6_addr_t */ CARP_NL_IFNAME = 9, /* string */ + CARP_NL_VERSION = 10, /* u8 */ + CARP_NL_VRRP_PRIORITY = 11, /* u8 */ + CARP_NL_VRRP_ADV_INTER = 12, /* u16, 12-bit field in centiseconds*/ }; #endif diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 6bc76e0be111..5a561814cdb5 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -538,7 +538,7 @@ div_output_inbound(int family, struct socket *so, struct mbuf *m, */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) m->m_flags |= M_MCAST; - else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + else if (in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) m->m_flags |= M_BCAST; netisr_queue_src(NETISR_IP, (uintptr_t)so, m); DIVSTAT_INC(inbound); diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c index 6d34ba4f5420..9b81760e58f3 100644 --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -278,14 +278,12 @@ ip_tryforward(struct mbuf *m) */ if ((m->m_flags & (M_BCAST|M_MCAST)) || (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || - ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || - ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || + in_broadcast(ip->ip_src) || + in_broadcast(ip->ip_dst) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || - IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || - ip->ip_src.s_addr == INADDR_ANY || - ip->ip_dst.s_addr == INADDR_ANY ) + IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ) return m; /* @@ -401,7 +399,7 @@ passin: if (!PFIL_HOOKED_OUT(V_inet_pfil_head)) goto passout; - if (pfil_mbuf_out(V_inet_pfil_head, &m, nh->nh_ifp, + if (pfil_mbuf_fwd(V_inet_pfil_head, &m, nh->nh_ifp, NULL) != PFIL_PASS) goto drop; diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 2b59e46b5bcc..c440223b81f8 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -75,6 +75,10 @@ typedef struct _ip_fw3_opheader { uint16_t reserved[2]; /* Align to 64-bit boundary */ } ip_fw3_opheader; +#define IP_FW3_OPVER_0 0 +#define IP_FW3_OPVER_1 1 /* 32bit rulenum */ +#define IP_FW3_OPVER IP_FW3_OPVER_1 + /* IP_FW3 opcodes */ #define IP_FW_TABLE_XADD 86 /* add entry */ #define IP_FW_TABLE_XDEL 87 /* delete entry */ @@ -109,6 +113,7 @@ typedef struct _ip_fw3_opheader { #define IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */ #define IP_FW_DUMP_SRVOBJECTS 117 /* Dump existing named objects */ +#define IP_FW_SKIPTO_CACHE 118 /* Manage skipto cache */ #define IP_FW_NAT64STL_CREATE 130 /* Create stateless NAT64 instance */ #define IP_FW_NAT64STL_DESTROY 131 /* Destroy stateless NAT64 instance */ @@ -211,8 +216,8 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_VERREVPATH, /* none */ O_VERSRCREACH, /* none */ - O_PROBE_STATE, /* none */ - O_KEEP_STATE, /* none */ + O_PROBE_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ + O_KEEP_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ O_LIMIT, /* ipfw_insn_limit */ O_LIMIT_PARENT, 
/* dyn_type, not an opcode. */ @@ -223,12 +228,13 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_LOG, /* ipfw_insn_log */ O_PROB, /* u32 = match probability */ - O_CHECK_STATE, /* none */ + O_CHECK_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ O_ACCEPT, /* none */ O_DENY, /* none */ O_REJECT, /* arg1=icmp arg (same as deny) */ O_COUNT, /* none */ - O_SKIPTO, /* arg1=next rule number */ + O_SKIPTO, /* v0:arg1=next rule number */ + /* v1:kidx= next rule number */ O_PIPE, /* arg1=pipe number */ O_QUEUE, /* arg1=queue number */ O_DIVERT, /* arg1=port number */ @@ -242,8 +248,10 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ * More opcodes. */ O_IPSEC, /* has ipsec history */ - O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ + O_IP_SRC_LOOKUP, /* v0:arg1=table number, u32=value */ + /* v1:kidx=name, u32=value, arg1=key */ O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + /* v1:kidx=name, u32=value, arg1=key */ O_ANTISPOOF, /* none */ O_JAIL, /* u32 = id */ O_ALTQ, /* u32 = altq classif. qid */ @@ -278,23 +286,27 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_SOCKARG, /* socket argument */ - O_CALLRETURN, /* arg1=called rule number */ + O_CALLRETURN, /* v0:arg1=called rule number */ + /* v1:kidx=called rule number */ O_FORWARD_IP6, /* fwd sockaddr_in6 */ O_DSCP, /* 2 u32 = DSCP mask */ O_SETDSCP, /* arg1=DSCP value */ - O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */ + O_IP_FLOW_LOOKUP, /* v0:arg1=table number, u32=value */ + /* v1:kidx=name, u32=value */ - O_EXTERNAL_ACTION, /* arg1=id of external action handler */ - O_EXTERNAL_INSTANCE, /* arg1=id of eaction handler instance */ + O_EXTERNAL_ACTION, /* v0:arg1=id of external action handler */ + /* v1:kidx=id of external action handler */ + O_EXTERNAL_INSTANCE, /* v0:arg1=id of eaction handler instance */ + /* v1:kidx=id of eaction handler instance */ O_EXTERNAL_DATA, /* variable length data */ O_SKIP_ACTION, /* none */ O_TCPMSS, /* arg1=MSS value */ - O_MAC_SRC_LOOKUP, /* arg1=table number, u32=value */ - O_MAC_DST_LOOKUP, /* arg1=table number, u32=value */ + O_MAC_SRC_LOOKUP, /* kidx=name, u32=value, arg1=key */ + O_MAC_DST_LOOKUP, /* kidx=name, u32=value, arg1=key */ O_SETMARK, /* u32 = value */ O_MARK, /* 2 u32 = value, bitmask */ @@ -303,22 +315,6 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ }; /* - * Defines key types used by lookup instruction - */ -enum ipfw_table_lookup_type { - LOOKUP_DST_IP, - LOOKUP_SRC_IP, - LOOKUP_DST_PORT, - LOOKUP_SRC_PORT, - LOOKUP_UID, - LOOKUP_JAIL, - LOOKUP_DSCP, - LOOKUP_DST_MAC, - LOOKUP_SRC_MAC, - LOOKUP_MARK, -}; - -/* * The extension header are filtered only for presence using a bit * vector with a flag for each header. */ @@ -392,6 +388,11 @@ typedef struct _ipfw_insn_u32 { u_int32_t d[1]; /* one or more */ } ipfw_insn_u32; +typedef struct _ipfw_insn_kidx { + ipfw_insn o; + uint32_t kidx; +} ipfw_insn_kidx; + /* * This is used to store IP addr-mask pairs. 
*/ @@ -401,6 +402,47 @@ typedef struct _ipfw_insn_ip { struct in_addr mask; } ipfw_insn_ip; +typedef struct _ipfw_insn_table { + ipfw_insn o; /* arg1 is optional lookup key */ + uint32_t kidx; /* table name index */ + uint32_t value; /* table value */ +} ipfw_insn_table; + +#define IPFW_LOOKUP_TYPE_MASK 0x00FF +#define IPFW_LOOKUP_TYPE(insn) ((insn)->arg1 & IPFW_LOOKUP_TYPE_MASK) +#define IPFW_SET_LOOKUP_TYPE(insn, type) do { \ + (insn)->arg1 &= ~IPFW_LOOKUP_TYPE_MASK; \ + (insn)->arg1 |= (type) & IPFW_LOOKUP_TYPE_MASK; \ +} while (0) + +/* + * Defines key types used by lookup instruction + */ +enum ipfw_table_lookup_type { + LOOKUP_NONE = 0, + LOOKUP_DST_IP, + LOOKUP_SRC_IP, + LOOKUP_DST_PORT, + LOOKUP_SRC_PORT, + LOOKUP_UID, + LOOKUP_JAIL, + LOOKUP_DSCP, + LOOKUP_DST_MAC, + LOOKUP_SRC_MAC, + LOOKUP_MARK, + LOOKUP_RULENUM, +}; + +enum ipfw_return_type { + RETURN_NEXT_RULENUM = 0, + RETURN_NEXT_RULE, +}; + +enum ipfw_skipto_cache_op { + SKIPTO_CACHE_DISABLE = 0, + SKIPTO_CACHE_ENABLE, +}; + /* * This is used to forward to a given address (ip). */ @@ -434,7 +476,8 @@ typedef struct _ipfw_insn_if { union { struct in_addr ip; int glob; - uint16_t kidx; + uint16_t kidx_v0; + uint32_t kidx; } p; char name[IFNAMSIZ]; } ipfw_insn_if; @@ -452,6 +495,7 @@ typedef struct _ipfw_insn_altq { */ typedef struct _ipfw_insn_limit { ipfw_insn o; + u_int32_t kidx; u_int8_t _pad; u_int8_t limit_mask; /* combination of DYN_* below */ #define DYN_SRC_ADDR 0x1 @@ -462,6 +506,9 @@ typedef struct _ipfw_insn_limit { u_int16_t conn_limit; } ipfw_insn_limit; +/* MAC/InfiniBand/etc address length */ +#define IPFW_MAX_L2_ADDR_LEN 20 + /* * This is used for log instructions. */ @@ -471,6 +518,22 @@ typedef struct _ipfw_insn_log { u_int32_t log_left; /* how many left to log */ } ipfw_insn_log; +/* ipfw_insn_log->o.arg1 bitmasks */ +#define IPFW_LOG_DEFAULT 0x0000 +#define IPFW_LOG_SYSLOG (1 << 15) +#define IPFW_LOG_IPFW0 (1 << 14) +#define IPFW_LOG_RTSOCK (1 << 13) + +typedef struct _ipfwlog_rtsock_hdr_v2 { + uint32_t rulenum; + uint32_t tablearg; + ipfw_insn cmd; + u_char ether_shost[IPFW_MAX_L2_ADDR_LEN]; + u_char ether_dhost[IPFW_MAX_L2_ADDR_LEN]; + uint32_t mark; + char comment[0]; +} ipfwlog_rtsock_hdr_v2; + /* Legacy NAT structures, compat only */ #ifndef _KERNEL /* @@ -604,6 +667,10 @@ typedef struct _ipfw_insn_icmp6 { */ } ipfw_insn_icmp6; +/* Convert pointer to instruction with specified type */ +#define insntod(p, type) ((ipfw_insn_ ## type *)(p)) +#define insntoc(p, type) ((const ipfw_insn_ ## type *)(p)) + /* * Here we have the structure representing an ipfw rule. * @@ -719,30 +786,29 @@ struct ipfw_flow_id { /* * Dynamic ipfw rule. */ -typedef struct _ipfw_dyn_rule ipfw_dyn_rule; - -struct _ipfw_dyn_rule { - ipfw_dyn_rule *next; /* linked list of rules. 
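The new ipfw_insn_kidx and ipfw_insn_table layouts above move the table name index out of the 16-bit arg1 field; in the v1 encoding arg1's low byte instead selects the lookup key via IPFW_LOOKUP_TYPE. A hypothetical consumer-side sketch, assuming a v1 O_IP_SRC_LOOKUP instruction laid out as ipfw_insn_table:

	/* Hypothetical example: inspect a v1 O_IP_SRC_LOOKUP instruction. */
	static void
	show_lookup(const ipfw_insn *cmd)
	{
		const ipfw_insn_table *ti = insntoc(cmd, table);

		printf("lookup key %d, table kidx %u\n",
		    IPFW_LOOKUP_TYPE(cmd), ti->kidx);
	}

The optional value match and the table value type (see the IPFW_TVALUE_TYPE macros further down) ride in the remaining arg1 byte and the u32 operand, so a 16-bit table limit no longer constrains the instruction.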
*/ - struct ip_fw *rule; /* pointer to rule */ - /* 'rule' is used to pass up the rule number (from the parent) */ +#define IPFW_DYN_ORPHANED 0x40000 /* state's parent rule was deleted */ - ipfw_dyn_rule *parent; /* pointer to parent rule */ - u_int64_t pcnt; /* packet match counter */ - u_int64_t bcnt; /* byte match counter */ +typedef struct _ipfw_dyn_rule { struct ipfw_flow_id id; /* (masked) flow id */ - u_int32_t expire; /* expire time */ - u_int32_t bucket; /* which bucket in hash table */ - u_int32_t state; /* state of this rule (typically a + uint8_t set; + uint8_t type; /* rule type */ + uint16_t pad; + uint32_t expire; /* expire time */ + uint32_t rulenum; /* parent's rule number */ + uint32_t kidx; /* index of named object */ + uint64_t pcnt; /* packet match counter */ + uint64_t bcnt; /* byte match counter */ + uint32_t hashval; /* hash value */ + union { + uint32_t state; /* state of this rule (typically a * combination of TCP flags) */ -#define IPFW_DYN_ORPHANED 0x40000 /* state's parent rule was deleted */ - u_int32_t ack_fwd; /* most recent ACKs in forward */ - u_int32_t ack_rev; /* and reverse directions (used */ + uint32_t count; /* number of linked states */ + }; + uint32_t ack_fwd; /* most recent ACKs in forward */ + uint32_t ack_rev; /* and reverse directions (used */ /* to generate keepalives) */ - u_int16_t dyn_type; /* rule type */ - u_int16_t count; /* refcount */ - u_int16_t kidx; /* index of named object */ -} __packed __aligned(8); +} __packed __aligned(8) ipfw_dyn_rule; /* * Definitions for IP option names. @@ -794,16 +860,6 @@ struct _ipfw_dyn_rule { #define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */ #define IPFW_VTYPE_MARK 0x00000800 /* [fw]mark */ -/* MAC/InfiniBand/etc address length */ -#define IPFW_MAX_L2_ADDR_LEN 20 - -typedef struct _ipfw_table_entry { - in_addr_t addr; /* network address */ - u_int32_t value; /* value */ - u_int16_t tbl; /* table number */ - u_int8_t masklen; /* mask length */ -} ipfw_table_entry; - typedef struct _ipfw_table_xentry { uint16_t len; /* Total entry length */ uint8_t type; /* entry type */ @@ -819,13 +875,6 @@ typedef struct _ipfw_table_xentry { } ipfw_table_xentry; #define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ -typedef struct _ipfw_table { - u_int32_t size; /* size of entries in bytes */ - u_int32_t cnt; /* # of entries */ - u_int16_t tbl; /* table number */ - ipfw_table_entry ent[0]; /* entries */ -} ipfw_table; - typedef struct _ipfw_xtable { ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t size; /* size of entries in bytes */ @@ -865,10 +914,10 @@ typedef struct _ipfw_obj_data { /* Object name TLV */ typedef struct _ipfw_obj_ntlv { ipfw_obj_tlv head; /* TLV header */ - uint16_t idx; /* Name index */ + uint32_t idx; /* Name index */ uint8_t set; /* set, if applicable */ uint8_t type; /* object type, if applicable */ - uint32_t spare; /* unused */ + uint16_t spare; /* unused */ char name[64]; /* Null-terminated name */ } ipfw_obj_ntlv; @@ -891,19 +940,40 @@ struct tflow_entry { } a; }; +#define IPFW_TVALUE_TYPE_MASK 0xFF00 +#define IPFW_TVALUE_TYPE(insn) (((insn)->arg1 & IPFW_TVALUE_TYPE_MASK) >> 8) +#define IPFW_SET_TVALUE_TYPE(insn, type) do { \ + (insn)->arg1 &= ~IPFW_TVALUE_TYPE_MASK; \ + (insn)->arg1 |= ((type) << 8) & IPFW_TVALUE_TYPE_MASK; \ +} while (0) + +enum ipfw_table_value_type { + TVALUE_TAG = 0, + TVALUE_PIPE, + TVALUE_DIVERT, + TVALUE_SKIPTO, + TVALUE_NETGRAPH, + TVALUE_FIB, + TVALUE_NAT, + TVALUE_NH4, + TVALUE_DSCP, + TVALUE_LIMIT, + TVALUE_MARK, +}; + /* 64-byte structure 
representing multi-field table value */ typedef struct _ipfw_table_value { uint32_t tag; /* O_TAG/O_TAGGED */ - uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t pipe; /* O_PIPE/O_QUEUE */ uint16_t divert; /* O_DIVERT/O_TEE */ - uint16_t skipto; /* skipto, CALLRET */ + uint32_t skipto; /* skipto, CALLRET */ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ - uint32_t fib; /* O_SETFIB */ uint32_t nat; /* O_NAT */ uint32_t nh4; + uint16_t fib; /* O_SETFIB */ uint8_t dscp; uint8_t spare0; - uint16_t kidx; /* value kernel index */ + uint32_t kidx; /* value kernel index */ struct in6_addr nh6; uint32_t limit; /* O_LIMIT */ uint32_t zoneid; /* scope zone id for nh6 */ @@ -918,8 +988,7 @@ typedef struct _ipfw_obj_tentry { uint8_t masklen; /* mask length */ uint8_t result; /* request result */ uint8_t spare0; - uint16_t idx; /* Table name index */ - uint16_t spare1; + uint32_t idx; /* Table name index */ union { /* Longest field needs to be aligned by 8-byte boundary */ struct in_addr addr; /* IPv4 address */ @@ -966,8 +1035,8 @@ typedef struct _ipfw_obj_ctlv { typedef struct _ipfw_range_tlv { ipfw_obj_tlv head; /* TLV header */ uint32_t flags; /* Range flags */ - uint16_t start_rule; /* Range start */ - uint16_t end_rule; /* Range end */ + uint32_t start_rule; /* Range start */ + uint32_t end_rule; /* Range end */ uint32_t set; /* Range set to match */ uint32_t new_set; /* New set to move/swap to */ } ipfw_range_tlv; @@ -979,7 +1048,7 @@ typedef struct _ipfw_range_tlv { #define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | IPFW_RCFLAG_ALL | \ IPFW_RCFLAG_SET | IPFW_RCFLAG_DYNAMIC) /* Internally used flags */ -#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip defaul rule */ +#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */ typedef struct _ipfw_ta_tinfo { uint32_t flags; /* Format flags */ @@ -1051,10 +1120,16 @@ typedef struct _ipfw_ta_info { uint64_t spare1; } ipfw_ta_info; +typedef struct _ipfw_cmd_header { /* control command header */ + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t size; /* Total size (incl. 
header) */ + uint32_t cmd; /* command */ +} ipfw_cmd_header; + typedef struct _ipfw_obj_header { ip_fw3_opheader opheader; /* IP_FW3 opcode */ - uint32_t spare; - uint16_t idx; /* object name index */ + uint32_t idx; /* object name index */ + uint16_t spare; uint8_t objtype; /* object type */ uint8_t objsubtype; /* object subtype */ ipfw_obj_ntlv ntlv; /* object name tlv */ diff --git a/sys/netinet/ip_gre.c b/sys/netinet/ip_gre.c index c9356edb0608..01a6ef4cd670 100644 --- a/sys/netinet/ip_gre.c +++ b/sys/netinet/ip_gre.c @@ -534,7 +534,7 @@ in_gre_output(struct mbuf *m, int af, int hlen) #ifdef INET6 case AF_INET6: gi->gi_ip.ip_tos = 0; /* XXX */ - ip_fillid(&gi->gi_ip); + ip_fillid(&gi->gi_ip, V_ip_random_id); break; #endif } diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 1149796c0db3..71b75d18efd0 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -88,7 +88,7 @@ SYSCTL_PROC(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLTYPE_UINT | &sysctl_icmplim_and_jitter, "IU", "Maximum number of ICMP responses per second"); -VNET_DEFINE_STATIC(int, icmplim_curr_jitter) = 0; +VNET_DEFINE_STATIC(int, icmplim_curr_jitter[BANDLIM_MAX]) = {0}; #define V_icmplim_curr_jitter VNET(icmplim_curr_jitter) VNET_DEFINE_STATIC(u_int, icmplim_jitter) = 16; #define V_icmplim_jitter VNET(icmplim_jitter) @@ -635,15 +635,10 @@ icmp_input(struct mbuf **mp, int *offp, int proto) */ if (icmplen < ICMP_MASKLEN) break; - switch (ip->ip_dst.s_addr) { - case INADDR_BROADCAST: - case INADDR_ANY: + if (in_broadcast(ip->ip_dst)) icmpdst.sin_addr = ip->ip_src; - break; - - default: + else icmpdst.sin_addr = ip->ip_dst; - } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) @@ -788,10 +783,11 @@ icmp_reflect(struct mbuf *m) if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || (IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net240) || - (IN_ZERONET(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net0) ) { + (IN_ZERONET(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net0) || + in_nullhost(ip->ip_src) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); - goto done; /* Ip_output() will check for broadcast */ + goto done; /* ip_output() will check for broadcast */ } t = ip->ip_dst; @@ -1094,28 +1090,29 @@ ip_next_mtu(int mtu, int dir) * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. 
*/ -VNET_DEFINE_STATIC(struct counter_rate, icmp_rates[BANDLIM_MAX]); +VNET_DEFINE_STATIC(struct counter_rate *, icmp_rates[BANDLIM_MAX]); #define V_icmp_rates VNET(icmp_rates) static const char *icmp_rate_descrs[BANDLIM_MAX] = { [BANDLIM_ICMP_UNREACH] = "icmp unreach", [BANDLIM_ICMP_ECHO] = "icmp ping", [BANDLIM_ICMP_TSTAMP] = "icmp tstamp", - [BANDLIM_RST_CLOSEDPORT] = "closed port RST", - [BANDLIM_RST_OPENPORT] = "open port RST", + [BANDLIM_TCP_RST] = "tcp reset", [BANDLIM_ICMP6_UNREACH] = "icmp6 unreach", [BANDLIM_SCTP_OOTB] = "sctp ootb", }; static void -icmplim_new_jitter(void) +icmplim_new_jitter(int which) { /* * Adjust limit +/- to jitter the measurement to deny a side-channel * port scan as in https://dl.acm.org/doi/10.1145/3372297.3417280 */ + KASSERT(which >= 0 && which < BANDLIM_MAX, + ("%s: which %d", __func__, which)); if (V_icmplim_jitter > 0) - V_icmplim_curr_jitter = + V_icmplim_curr_jitter[which] = arc4random_uniform(V_icmplim_jitter * 2 + 1) - V_icmplim_jitter; } @@ -1144,11 +1141,13 @@ sysctl_icmplim_and_jitter(SYSCTL_HANDLER_ARGS) error = EINVAL; else { V_icmplim_jitter = new; - icmplim_new_jitter(); + for (int i = 0; i < BANDLIM_MAX; i++) { + icmplim_new_jitter(i); + } } } } - MPASS(V_icmplim + V_icmplim_curr_jitter >= 0); + MPASS(V_icmplim == 0 || V_icmplim > V_icmplim_jitter); return (error); } @@ -1158,10 +1157,9 @@ icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { - V_icmp_rates[i].cr_rate = counter_u64_alloc(M_WAITOK); - V_icmp_rates[i].cr_ticks = ticks; + V_icmp_rates[i] = counter_rate_alloc(M_WAITOK, 1); + icmplim_new_jitter(i); } - icmplim_new_jitter(); } VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp_bandlimit_init, NULL); @@ -1172,7 +1170,7 @@ icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) - counter_u64_free(V_icmp_rates[i].cr_rate); + counter_rate_free(V_icmp_rates[i]); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); @@ -1189,15 +1187,15 @@ badport_bandlim(int which) KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); - pps = counter_ratecheck(&V_icmp_rates[which], V_icmplim + - V_icmplim_curr_jitter); + pps = counter_ratecheck(V_icmp_rates[which], V_icmplim + + V_icmplim_curr_jitter[which]); if (pps > 0) { if (V_icmplim_output) log(LOG_NOTICE, "Limiting %s response from %jd to %d packets/sec\n", icmp_rate_descrs[which], (intmax_t )pps, - V_icmplim + V_icmplim_curr_jitter); - icmplim_new_jitter(); + V_icmplim + V_icmplim_curr_jitter[which]); + icmplim_new_jitter(which); } if (pps == -1) return (-1); diff --git a/sys/netinet/ip_id.c b/sys/netinet/ip_id.c index 12dd6c8bf972..738b7eceb448 100644 --- a/sys/netinet/ip_id.c +++ b/sys/netinet/ip_id.c @@ -97,9 +97,9 @@ * user wants to, we can turn on random ID generation. */ VNET_DEFINE_STATIC(int, ip_rfc6864) = 1; -VNET_DEFINE_STATIC(int, ip_do_randomid) = 0; #define V_ip_rfc6864 VNET(ip_rfc6864) -#define V_ip_do_randomid VNET(ip_do_randomid) + +VNET_DEFINE(int, ip_random_id) = 0; /* * Random ID state engine. 
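It may help to spell out the arithmetic in icmplim_new_jitter() above, using the default icmplim_jitter of 16 declared earlier in this file (illustration only):

	arc4random_uniform(2 * 16 + 1)	-> uniform over 0 .. 32
	(0 .. 32) - 16			-> uniform over -16 .. +16

so each band's enforced limit floats within [icmplim - 16, icmplim + 16] responses per second. Because the offset is now stored per band and re-drawn only when that band's limit trips, the bands jitter independently of one another.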
@@ -126,7 +126,7 @@ VNET_DEFINE_STATIC(struct mtx, ip_id_mtx); VNET_DEFINE_STATIC(counter_u64_t, ip_id); #define V_ip_id VNET(ip_id) -static int sysctl_ip_randomid(SYSCTL_HANDLER_ARGS); +static int sysctl_ip_random_id(SYSCTL_HANDLER_ARGS); static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS); static void ip_initid(int); static uint16_t ip_randomid(void); @@ -136,7 +136,7 @@ static void ipid_sysuninit(void); SYSCTL_DECL(_net_inet_ip); SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id, CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_MPSAFE, - &VNET_NAME(ip_do_randomid), 0, sysctl_ip_randomid, "IU", + &VNET_NAME(ip_random_id), 0, sysctl_ip_random_id, "IU", "Assign random ip_id values"); SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_rfc6864), 0, @@ -151,22 +151,22 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(random_id_total), 0, "Count of IP IDs created"); static int -sysctl_ip_randomid(SYSCTL_HANDLER_ARGS) +sysctl_ip_random_id(SYSCTL_HANDLER_ARGS) { int error, new; - new = V_ip_do_randomid; + new = V_ip_random_id; error = sysctl_handle_int(oidp, &new, 0, req); if (error || req->newptr == NULL) return (error); if (new != 0 && new != 1) return (EINVAL); - if (new == V_ip_do_randomid) + if (new == V_ip_random_id) return (0); - if (new == 1 && V_ip_do_randomid == 0) + if (new == 1 && V_ip_random_id == 0) ip_initid(8192); /* We don't free memory when turning random ID off, due to race. */ - V_ip_do_randomid = new; + V_ip_random_id = new; return (0); } @@ -238,7 +238,7 @@ ip_randomid(void) } void -ip_fillid(struct ip *ip) +ip_fillid(struct ip *ip, bool do_randomid) { /* @@ -249,7 +249,7 @@ ip_fillid(struct ip *ip) */ if (V_ip_rfc6864 && (ip->ip_off & htons(IP_DF)) == htons(IP_DF)) ip->ip_id = 0; - else if (V_ip_do_randomid) + else if (do_randomid) ip->ip_id = ip_randomid(); else { counter_u64_add(V_ip_id, 1); diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 82d7acdd0710..4d614dfeb0a2 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -521,11 +521,6 @@ ip_input(struct mbuf *m) goto bad; } } - /* The unspecified address can appear only as a src address - RFC1122 */ - if (__predict_false(ntohl(ip->ip_dst.s_addr) == INADDR_ANY)) { - IPSTAT_INC(ips_badaddr); - goto bad; - } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); @@ -641,6 +636,17 @@ tooshort: } } passin: + /* + * The unspecified address can appear only as a src address - RFC1122. + * + * The check is deferred to here to give firewalls a chance to block + * (and log) such packets. ip_tryforward() will not process such + * packets. + */ + if (__predict_false(ntohl(ip->ip_dst.s_addr) == INADDR_ANY)) { + IPSTAT_INC(ips_badaddr); + goto bad; + } /* * Process options and, if not destined for us, @@ -783,9 +789,7 @@ passin: */ goto ours; } - if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) - goto ours; - if (ip->ip_dst.s_addr == INADDR_ANY) + if (in_broadcast(ip->ip_dst)) goto ours; /* RFC 3927 2.7: Do not forward packets to or from IN_LINKLOCAL. 
*/ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || @@ -920,7 +924,7 @@ ip_forward(struct mbuf *m, int srcrt) NET_EPOCH_ASSERT(); - if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { + if (m->m_flags & (M_BCAST|M_MCAST) || !in_canforward(ip->ip_dst)) { IPSTAT_INC(ips_cantforward); m_freem(m); return; @@ -942,6 +946,18 @@ ip_forward(struct mbuf *m, int srcrt) flowid = m->m_pkthdr.flowid; ro.ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_REF, flowid); if (ro.ro_nh != NULL) { + if (ro.ro_nh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + NH_FREE(ro.ro_nh); + return; + } + if (ro.ro_nh->nh_flags & NHF_REJECT) { + IPSTAT_INC(ips_cantforward); + NH_FREE(ro.ro_nh); + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); + return; + } ia = ifatoia(ro.ro_nh->nh_ifa); } else ia = NULL; diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index b864a4db5abc..d30bd42ec578 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -139,6 +139,13 @@ static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); * structures. */ +static struct sx __exclusive_cache_line mrouter_teardown; +#define MRW_TEARDOWN_WLOCK() sx_xlock(&mrouter_teardown) +#define MRW_TEARDOWN_WUNLOCK() sx_xunlock(&mrouter_teardown) +#define MRW_TEARDOWN_LOCK_INIT() \ + sx_init(&mrouter_teardown, "IPv4 multicast forwarding teardown") +#define MRW_TEARDOWN_LOCK_DESTROY() sx_destroy(&mrouter_teardown) + static struct rwlock mrouter_lock; #define MRW_RLOCK() rw_rlock(&mrouter_lock) #define MRW_WLOCK() rw_wlock(&mrouter_lock) @@ -692,15 +699,18 @@ ip_mrouter_init(struct socket *so, int version) if (version != 1) return ENOPROTOOPT; + MRW_TEARDOWN_WLOCK(); MRW_WLOCK(); if (ip_mrouter_unloading) { MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); return EADDRINUSE; } @@ -708,6 +718,7 @@ ip_mrouter_init(struct socket *so, int version) HASH_NOWAIT); if (V_mfchashtbl == NULL) { MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); return (ENOMEM); } @@ -717,6 +728,7 @@ ip_mrouter_init(struct socket *so, int version) M_NOWAIT, &V_bw_upcalls_ring_mtx); if (!V_bw_upcalls_ring) { MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); return (ENOMEM); } @@ -736,6 +748,7 @@ ip_mrouter_init(struct socket *so, int version) mtx_init(&V_buf_ring_mtx, "mroute buf_ring mtx", NULL, MTX_DEF); MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); @@ -754,8 +767,12 @@ X_ip_mrouter_done(void) vifi_t vifi; struct bw_upcall *bu; - if (V_ip_mrouter == NULL) + MRW_TEARDOWN_WLOCK(); + + if (V_ip_mrouter == NULL) { + MRW_TEARDOWN_WUNLOCK(); return (EINVAL); + } /* * Detach/disable hooks to the reset of the system. @@ -768,7 +785,7 @@ X_ip_mrouter_done(void) * Wait for all epoch sections to complete to ensure * V_ip_mrouter = NULL is visible to others. 
*/ - epoch_wait_preempt(net_epoch_preempt); + NET_EPOCH_WAIT(); /* Stop and drain task queue */ taskqueue_block(V_task_queue); @@ -830,6 +847,7 @@ X_ip_mrouter_done(void) mtx_destroy(&V_buf_ring_mtx); MRW_WUNLOCK(); + MRW_TEARDOWN_WUNLOCK(); /* * Now drop our claim on promiscuous multicast on the interfaces recorded @@ -1311,6 +1329,8 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, u_long hash; int hlen; + M_ASSERTMAPPED(m); + CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); @@ -1562,6 +1582,7 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) vifi_t vifi; int plen = ntohs(ip->ip_len); + M_ASSERTMAPPED(m); MRW_LOCK_ASSERT(); NET_EPOCH_ASSERT(); @@ -1745,6 +1766,7 @@ phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) int hlen = ip->ip_hl << 2; MRW_LOCK_ASSERT(); + M_ASSERTMAPPED(m); /* * Make a new reference to the packet; make sure that @@ -2444,7 +2466,7 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); - ip_fillid(ip_outer); + ip_fillid(ip_outer, V_ip_random_id); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; @@ -2717,6 +2739,9 @@ sysctl_mfctable(SYSCTL_HANDLER_ARGS) return (error); MRW_RLOCK(); + if (V_mfchashtbl == NULL) + goto out_locked; + for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); @@ -2805,6 +2830,7 @@ ip_mroute_modevent(module_t mod, int type, void *unused) switch (type) { case MOD_LOAD: + MRW_TEARDOWN_LOCK_INIT(); MRW_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, @@ -2876,6 +2902,7 @@ ip_mroute_modevent(module_t mod, int type, void *unused) rsvp_input_p = NULL; MRW_LOCK_DESTROY(); + MRW_TEARDOWN_LOCK_DESTROY(); break; default: diff --git a/sys/netinet/ip_options.c b/sys/netinet/ip_options.c index 41f77a7491f2..a9d6836d9e97 100644 --- a/sys/netinet/ip_options.c +++ b/sys/netinet/ip_options.c @@ -514,6 +514,8 @@ ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) *phlen = 0; return (m); /* XXX should fail */ } + KASSERT((m->m_flags & M_EXTPG) == 0, ("%s: mbuf %p is unmapped", + __func__, m)); if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 28fb651a0bc9..ec6ba8d92015 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -112,13 +112,19 @@ ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, struct mbuf *m; struct in_addr odst; struct ip *ip; + int ret; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. 
*/ odst.s_addr = ip->ip_dst.s_addr; - switch (pfil_mbuf_out(V_inet_pfil_head, mp, ifp, inp)) { + if (flags & IP_FORWARDING) + ret = pfil_mbuf_fwd(V_inet_pfil_head, mp, ifp, inp); + else + ret = pfil_mbuf_out(V_inet_pfil_head, mp, ifp, inp); + + switch (ret) { case PFIL_DROPPED: *error = EACCES; /* FALLTHROUGH */ @@ -323,7 +329,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, const struct sockaddr *gw; struct in_ifaddr *ia = NULL; struct in_addr src; - int isbroadcast; + bool isbroadcast; uint16_t ip_len, ip_off; struct route iproute; uint32_t fibnum; @@ -362,7 +368,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; - ip_fillid(ip); + ip_fillid(ip, V_ip_random_id); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; @@ -428,7 +434,7 @@ again: ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; - isbroadcast = 1; + isbroadcast = true; src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), @@ -443,7 +449,8 @@ again: mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? - in_ifaddr_broadcast(dst->sin_addr, ia) : 0; + (in_broadcast(ip->ip_dst) || + in_ifaddr_broadcast(dst->sin_addr, ia)) : 0; src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { @@ -454,7 +461,7 @@ again: ifp = imo->imo_multicast_ifp; mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia); - isbroadcast = 0; /* fool gcc */ + isbroadcast = false; /* Interface may have no addresses. */ if (ia != NULL) src = IA_SIN(ia)->sin_addr; @@ -496,10 +503,13 @@ again: gw = &nh->gw_sa; if (nh->nh_flags & NHF_HOST) isbroadcast = (nh->nh_flags & NHF_BROADCAST); - else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET)) - isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia); + else if ((ifp->if_flags & IFF_BROADCAST) && + (gw->sa_family == AF_INET)) + isbroadcast = in_broadcast(ip->ip_dst) || + in_ifaddr_broadcast( + ((const struct sockaddr_in *)gw)->sin_addr, ia); else - isbroadcast = 0; + isbroadcast = false; mtu = nh->nh_mtu; src = IA_SIN(ia)->sin_addr; } else { @@ -527,11 +537,12 @@ again: gw = &nh->gw_sa; ia = ifatoia(nh->nh_ifa); src = IA_SIN(ia)->sin_addr; - isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) == + isbroadcast = ((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET) && - in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia))); + (in_broadcast(ip->ip_dst) || in_ifaddr_broadcast( + ((const struct sockaddr_in *)gw)->sin_addr, ia))); } /* Catch a possible divide by zero later. */ @@ -667,18 +678,19 @@ again: sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { - m = mb_unmapped_to_ext(m); - if (m == NULL) { - IPSTAT_INC(ips_odropped); - error = ENOBUFS; - goto bad; - } - if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { + struct ip ip_hdr; + + if ((error = IPSEC_OUTPUT(ipv4, ifp, m, inp, mtu)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } + + /* Update variables that are affected by ipsec4_output(). */ + m_copydata(m, 0, sizeof(ip_hdr), (char *)&ip_hdr); + hlen = ip_hdr.ip_hl << 2; } + /* * Check if there was a route for this packet; return error if not. 
*/ @@ -687,9 +699,6 @@ sendit: error = EHOSTUNREACH; goto bad; } - /* Update variables that are affected by ipsec4_output(). */ - ip = mtod(m, struct ip *); - hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ @@ -731,11 +740,20 @@ sendit: /* Ensure the packet data is mapped if the interface requires it. */ if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { - m = mb_unmapped_to_ext(m); - if (m == NULL) { + struct mbuf *m1; + + error = mb_unmapped_to_ext(m, &m1); + if (error != 0) { + if (error == EINVAL) { + if_printf(ifp, "TLS packet\n"); + /* XXXKIB */ + } else if (error == ENOMEM) { + error = ENOBUFS; + } IPSTAT_INC(ips_odropped); - error = ENOBUFS; - goto bad; + goto done; + } else { + m = m1; } } @@ -841,7 +859,7 @@ sendit: done: return (error); - bad: +bad: m_freem(m); goto done; } @@ -1081,10 +1099,22 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_SETFIB: + error = sooptcopyin(sopt, &optval, + sizeof(optval), sizeof(optval)); + if (error != 0) + break; + INP_WLOCK(inp); - inp->inp_inc.inc_fibnum = so->so_fibnum; + if ((inp->inp_flags & INP_BOUNDFIB) != 0 && + optval != so->so_fibnum) { + INP_WUNLOCK(inp); + error = EISCONN; + break; + } + error = sosetfib(inp->inp_socket, optval); + if (error == 0) + inp->inp_inc.inc_fibnum = optval; INP_WUNLOCK(inp); - error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c index a95780aa2f27..177069f5e010 100644 --- a/sys/netinet/ip_reass.c +++ b/sys/netinet/ip_reass.c @@ -670,6 +670,11 @@ ipreass_drain(void) VNET_LIST_RUNLOCK(); } +static void +ipreass_drain_lowmem(void *arg __unused, int flags __unused) +{ + ipreass_drain(); +} /* * Initialize IP reassembly structures. 
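Stepping back briefly to the ip_ctloutput() hunk above: SO_SETFIB handled at the IP layer now goes through sosetfib() and refuses to move a FIB-bound socket (INP_BOUNDFIB) onto a different FIB, failing with EISCONN. From userland the option is exercised as before; a minimal sketch (illustrative only, FIB number 2 is an arbitrary example):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <err.h>

	/* Pin an already-created socket "s" to FIB 2. */
	static void
	set_fib(int s)
	{
		int fib = 2;

		if (setsockopt(s, SOL_SOCKET, SO_SETFIB,
		    &fib, sizeof(fib)) == -1)
			err(1, "SO_SETFIB");	/* EISCONN if bound to another FIB */
	}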
@@ -711,10 +716,10 @@ ipreass_init(void) maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); - EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL, + EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain_lowmem, NULL, + LOWMEM_PRI_DEFAULT); + EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain_lowmem, NULL, LOWMEM_PRI_DEFAULT); - EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL, - LOWMEM_PRI_DEFAULT); } /* diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 0f2ed8c43e64..f782ebc53eb0 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -47,7 +47,7 @@ struct ipovly { u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ -}; +} __packed; #ifdef _KERNEL /* @@ -204,6 +204,7 @@ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); VNET_DECLARE(int, rsvp_on); VNET_DECLARE(int, drop_redirect); +VNET_DECLARE(int, ip_random_id); #define V_ip_id VNET(ip_id) #define V_ip_defttl VNET(ip_defttl) @@ -216,6 +217,7 @@ VNET_DECLARE(int, drop_redirect); #define V_ip_mrouter VNET(ip_mrouter) #define V_rsvp_on VNET(rsvp_on) #define V_drop_redirect VNET(drop_redirect) +#define V_ip_random_id VNET(ip_random_id) void inp_freemoptions(struct ip_moptions *); int inp_getmoptions(struct inpcb *, struct sockopt *); @@ -235,7 +237,7 @@ struct mbuf * ip_reass(struct mbuf *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); -void ip_fillid(struct ip *); +void ip_fillid(struct ip *, bool); int rip_ctloutput(struct socket *, struct sockopt *); int ipip_input(struct mbuf **, int *, int); int rsvp_input(struct mbuf **, int *, int); diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c index 9bf6b82b9369..6758813f6a21 100644 --- a/sys/netinet/libalias/alias.c +++ b/sys/netinet/libalias/alias.c @@ -183,12 +183,12 @@ a timeout period. 
*/ /* Local prototypes */ -static void TcpMonitorIn(u_char, struct alias_link *); +static void TcpMonitorIn(uint16_t, struct alias_link *); -static void TcpMonitorOut(u_char, struct alias_link *); +static void TcpMonitorOut(uint16_t, struct alias_link *); static void -TcpMonitorIn(u_char th_flags, struct alias_link *lnk) +TcpMonitorIn(uint16_t th_flags, struct alias_link *lnk) { switch (GetStateIn(lnk)) { case ALIAS_TCP_STATE_NOT_CONNECTED: @@ -205,7 +205,7 @@ TcpMonitorIn(u_char th_flags, struct alias_link *lnk) } static void -TcpMonitorOut(u_char th_flags, struct alias_link *lnk) +TcpMonitorOut(uint16_t th_flags, struct alias_link *lnk) { switch (GetStateOut(lnk)) { case ALIAS_TCP_STATE_NOT_CONNECTED: @@ -290,13 +290,14 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip) { struct alias_link *lnk; struct icmp *ic; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); /* Get source address from ICMP data field and restore original data */ - lnk = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1); - if (lnk != NULL) { + ret = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1, &lnk); + if (ret == PKT_ALIAS_OK) { u_short original_id; int accumulate; @@ -319,10 +320,8 @@ IcmpAliasIn1(struct libalias *la, struct ip *pip) &original_address, &pip->ip_dst, 2); pip->ip_dst = original_address; } - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -337,6 +336,7 @@ IcmpAliasIn2(struct libalias *la, struct ip *pip) struct udphdr *ud; struct tcphdr *tc; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); @@ -346,18 +346,26 @@ IcmpAliasIn2(struct libalias *la, struct ip *pip) tc = (struct tcphdr *)ip_next(ip); ic2 = (struct icmp *)ip_next(ip); - if (ip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + if (ip->ip_p == IPPROTO_UDP) { + ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (ip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_TCP) { + ret = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (ip->ip_p == IPPROTO_ICMP) { - if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) - lnk = FindIcmpIn(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || + ic2->icmp_type == ICMP_TSTAMP) { + ret = FindIcmpIn(la, ip->ip_dst, ip->ip_src, + ic2->icmp_id, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else lnk = NULL; } else lnk = NULL; @@ -479,13 +487,15 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create) { struct alias_link *lnk; struct icmp *ic; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); /* Save overwritten data for when echo packet returns */ - lnk = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create); - if (lnk != NULL) { + ret = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create, + &lnk); + if (ret == PKT_ALIAS_OK) { u_short alias_id; int accumulate; @@ -508,10 +518,8 @@ IcmpAliasOut1(struct libalias *la, struct ip *pip, int create) &alias_address, &pip->ip_src, 2); pip->ip_src = alias_address; } - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -526,6 +534,7 @@ IcmpAliasOut2(struct libalias *la, 
struct ip *pip) struct udphdr *ud; struct tcphdr *tc; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); ic = (struct icmp *)ip_next(pip); @@ -535,18 +544,26 @@ IcmpAliasOut2(struct libalias *la, struct ip *pip) tc = (struct tcphdr *)ip_next(ip); ic2 = (struct icmp *)ip_next(ip); - if (ip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + if (ip->ip_p == IPPROTO_UDP) { + ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (ip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_TCP) { + ret = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (ip->ip_p == IPPROTO_ICMP) { - if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) - lnk = FindIcmpOut(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || + ic2->icmp_type == ICMP_TSTAMP) { + ret = FindIcmpOut(la, ip->ip_dst, ip->ip_src, + ic2->icmp_id, 0, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + } else lnk = NULL; } else lnk = NULL; @@ -661,14 +678,15 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src, struct ip *pip, u_char ip_p, u_short *ip_sum) { struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); /* Return if proxy-only mode is enabled */ if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) return (PKT_ALIAS_OK); - lnk = FindProtoIn(la, ip_src, pip->ip_dst, ip_p); - if (lnk != NULL) { + ret = FindProtoIn(la, ip_src, pip->ip_dst, ip_p, &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr original_address; original_address = GetOriginalAddress(lnk); @@ -677,10 +695,8 @@ ProtoAliasIn(struct libalias *la, struct in_addr ip_src, DifferentialChecksum(ip_sum, &original_address, &pip->ip_dst, 2); pip->ip_dst = original_address; - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } /* @@ -693,6 +709,7 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, int create) { struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -703,8 +720,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, if (!create) return (PKT_ALIAS_IGNORED); - lnk = FindProtoOut(la, pip->ip_src, ip_dst, ip_p); - if (lnk != NULL) { + ret = FindProtoOut(la, pip->ip_src, ip_dst, ip_p, &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr alias_address; alias_address = GetAliasAddress(lnk); @@ -713,10 +730,8 @@ ProtoAliasOut(struct libalias *la, struct ip *pip, DifferentialChecksum(ip_sum, &alias_address, &pip->ip_src, 2); pip->ip_src = alias_address; - - return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } #define MF_ISSET(_pip) (ntohs((_pip)->ip_off) & IP_MF) @@ -745,6 +760,7 @@ UdpAliasIn(struct libalias *la, struct ip *pip) { struct udphdr *ud; struct alias_link *lnk; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -752,10 +768,12 @@ UdpAliasIn(struct libalias *la, struct ip *pip) if (ud == NULL) return (PKT_ALIAS_IGNORED); - lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, ud->uh_sport, ud->uh_dport, - IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); - if (lnk != NULL) { + IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY), &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { struct in_addr 
alias_address; struct in_addr original_address; struct in_addr proxy_address; @@ -828,7 +846,6 @@ UdpAliasIn(struct libalias *la, struct ip *pip) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } static int @@ -840,7 +857,7 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) struct in_addr proxy_server_address; u_short dest_port; u_short proxy_server_port; - int proxy_type; + int proxy_type, ret; LIBALIAS_LOCK_ASSERT(la); @@ -877,10 +894,12 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) pip->ip_dst = proxy_server_address; ud->uh_dport = proxy_server_port; } - lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, ud->uh_sport, ud->uh_dport, - IPPROTO_UDP, create); - if (lnk != NULL) { + IPPROTO_UDP, create, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { u_short alias_port; struct in_addr alias_address; struct alias_data ad = { @@ -930,7 +949,6 @@ UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } static int @@ -939,6 +957,7 @@ TcpAliasIn(struct libalias *la, struct ip *pip) struct tcphdr *tc; struct alias_link *lnk; size_t dlen; + int ret; LIBALIAS_LOCK_ASSERT(la); @@ -947,11 +966,12 @@ TcpAliasIn(struct libalias *la, struct ip *pip) return (PKT_ALIAS_IGNORED); tc = (struct tcphdr *)ip_next(pip); - lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, tc->th_sport, tc->th_dport, IPPROTO_TCP, - !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); - if (lnk != NULL) { + !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY), + &lnk); + if (ret == PKT_ALIAS_OK) { struct in_addr alias_address; struct in_addr original_address; struct in_addr proxy_address; @@ -1053,17 +1073,17 @@ TcpAliasIn(struct libalias *la, struct ip *pip) /* Monitor TCP connection state */ tc = (struct tcphdr *)ip_next(pip); - TcpMonitorIn(tc->th_flags, lnk); + TcpMonitorIn(__tcp_get_flags(tc), lnk); return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); + return (ret); } static int TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) { - int proxy_type; + int proxy_type, ret; u_short dest_port; u_short proxy_server_port; size_t dlen; @@ -1108,12 +1128,12 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) accumulate -= twowords(&pip->ip_dst); ADJUST_CHECKSUM(accumulate, pip->ip_sum); } - lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + ret = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, tc->th_sport, tc->th_dport, - IPPROTO_TCP, create); - if (lnk == NULL) - return (PKT_ALIAS_IGNORED); - if (lnk != NULL) { + IPPROTO_TCP, create, &lnk); + if (ret != PKT_ALIAS_OK) + return (ret); + { u_short alias_port; struct in_addr alias_address; int accumulate; @@ -1142,7 +1162,7 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) /* Monitor TCP connection state */ tc = (struct tcphdr *)ip_next(pip); - TcpMonitorOut(tc->th_flags, lnk); + TcpMonitorOut(__tcp_get_flags(tc), lnk); /* Walk out chain. 
*/ find_handler(OUT, TCP, la, pip, &ad); @@ -1177,7 +1197,6 @@ TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) return (PKT_ALIAS_OK); } - return (PKT_ALIAS_IGNORED); } /* Fragment Handling @@ -1581,17 +1600,24 @@ LibAliasUnaliasOut(struct libalias *la, ic = (struct icmp *)ip_next(pip); /* Find a link */ - if (pip->ip_p == IPPROTO_UDP) - lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + if (pip->ip_p == IPPROTO_UDP) { + iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, ud->uh_dport, ud->uh_sport, - IPPROTO_UDP, 0); - else if (pip->ip_p == IPPROTO_TCP) - lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + IPPROTO_UDP, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else if (pip->ip_p == IPPROTO_TCP) { + iresult = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, tc->th_dport, tc->th_sport, - IPPROTO_TCP, 0); - else if (pip->ip_p == IPPROTO_ICMP) - lnk = FindIcmpIn(la, pip->ip_dst, pip->ip_src, ic->icmp_id, 0); - else + IPPROTO_TCP, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else if (pip->ip_p == IPPROTO_ICMP) { + iresult = FindIcmpIn(la, pip->ip_dst, pip->ip_src, + ic->icmp_id, 0, &lnk); + if (iresult != PKT_ALIAS_OK) + goto getout; + } else lnk = NULL; /* Change it from an aliased packet to an unaliased packet */ diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h index 706184552429..96d8ceec28be 100644 --- a/sys/netinet/libalias/alias.h +++ b/sys/netinet/libalias/alias.h @@ -227,6 +227,26 @@ struct mbuf *m_megapullup(struct mbuf *, int); */ #define PKT_ALIAS_UNREGISTERED_CGN 0x400 +/* + * When this bit is set, UDP uses endpoint-independent mapping (EIM), as per + * RFC 4787 ("full cone" NAT of RFC 3489). All packets from the same internal + * address:port are mapped to the same NAT address:port, regardless of their + * destination address:port. If filtering rules allow, and if + * PKT_ALIAS_DENY_INCOMING is unset, any other external address:port can also + * send to the internal address:port through its mapped NAT address:port. This + * is more compatible with applications, and can reduce the need for port + * forwarding, but less scalable as each NAT address:port can only be + * concurrently used by at most one internal address:port. + * + * When this bit is unset, UDP packets use endpoint-dependent mapping (EDM) + * ("symmetric" NAT). Each connection from a particular internal address:port + * to different external addresses:ports is mapped to a random and + * unpredictable NAT address:port. Two applications behind EDM NATs can only + * connect to each other by port forwarding on the NAT, or tunnelling through + * an in-between server. + */ +#define PKT_ALIAS_UDP_EIM 0x800 + /* Function return codes.
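A userland consumer would opt into the endpoint-independent mapping behaviour described above through the normal libalias mode interface. A hedged sketch; the instance and address setup are generic libalias usage and only PKT_ALIAS_UDP_EIM is new in this change:

    #include <netinet/in.h>
    #include <alias.h>

    struct libalias *la;
    struct in_addr nat_addr;    /* public NAT address, filled in elsewhere */

    la = LibAliasInit(NULL);
    LibAliasSetAddress(la, nat_addr);
    /* Keep every UDP flow from one internal addr:port on one NAT addr:port. */
    LibAliasSetMode(la, PKT_ALIAS_UDP_EIM, PKT_ALIAS_UDP_EIM);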
*/ #define PKT_ALIAS_ERROR -1 #define PKT_ALIAS_OK 1 diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index 167201fa1b8f..c143d74a2f45 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -28,13 +28,13 @@ #include <sys/cdefs.h> #ifdef _KERNEL -#include <machine/stdarg.h> #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/lock.h> #include <sys/module.h> #include <sys/rwlock.h> +#include <sys/stdarg.h> #include <sys/syslog.h> #else #include <stdarg.h> @@ -93,6 +93,8 @@ DECLARE_MODULE(alias, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); SPLAY_GENERATE(splay_out, alias_link, all.out, cmp_out); SPLAY_GENERATE(splay_in, group_in, in, cmp_in); +SPLAY_GENERATE(splay_internal_endpoint, alias_link, all.internal_endpoint, + cmp_internal_endpoint); static struct group_in * StartPointIn(struct libalias *la, @@ -235,6 +237,19 @@ GetNewPort(struct libalias *la, struct alias_link *lnk, int alias_port_param) max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + if ((la->packetAliasMode & PKT_ALIAS_UDP_EIM) && + lnk->link_type == LINK_UDP) { + /* Try reuse the same alias address:port for all destinations + * from the same internal address:port, as per RFC 4787. + */ + struct alias_link *search_result = FindLinkByInternalEndpoint( + la, lnk->src_addr, lnk->src_port, lnk->link_type); + if (search_result != NULL) { + lnk->alias_port = search_result->alias_port; + return (0); + } + } + /* * When the PKT_ALIAS_SAME_PORTS option is chosen, * the first try will be the actual source port. If @@ -254,10 +269,18 @@ GetNewPort(struct libalias *la, struct alias_link *lnk, int alias_port_param) if (grp == NULL) break; + /* As per RFC 4787, UDP cannot share the same alias port among + * multiple internal endpoints + */ + if ((la->packetAliasMode & PKT_ALIAS_UDP_EIM) && + lnk->link_type == LINK_UDP) + continue; + LIST_FOREACH(search_result, &grp->full, all.in) { - if (lnk->dst_addr.s_addr == search_result->dst_addr.s_addr && + if (lnk->dst_addr.s_addr == + search_result->dst_addr.s_addr && lnk->dst_port == search_result->dst_port) - break; /* found match */ + break; /* found match */ } if (search_result == NULL) break; @@ -496,6 +519,10 @@ DeleteLink(struct alias_link **plnk, int deletePermanent) /* Adjust input table pointers */ LIST_REMOVE(lnk, all.in); + /* Adjust "internal endpoint" table pointer */ + SPLAY_REMOVE(splay_internal_endpoint, + &la->linkSplayInternalEndpoint, lnk); + /* Remove intermediate node, if empty */ grp = StartPointIn(la, lnk->alias_addr, lnk->alias_port, lnk->link_type, 0); if (grp != NULL && @@ -696,6 +723,10 @@ AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, LIST_INSERT_HEAD(&grp->partial, lnk, all.in); else LIST_INSERT_HEAD(&grp->full, lnk, all.in); + + /* Set up pointers for "internal endpoint" lookup table */ + SPLAY_INSERT(splay_internal_endpoint, + &la->linkSplayInternalEndpoint, lnk); } break; } @@ -868,8 +899,18 @@ _FindLinkIn(struct libalias *la, struct in_addr dst_addr, case 0: LIST_FOREACH(lnk, &grp->full, all.in) { if (lnk->dst_addr.s_addr == dst_addr.s_addr && - lnk->dst_port == dst_port) - return (UseLink(la, lnk)); + lnk->dst_port == dst_port) { + struct alias_link *found; + + found = UseLink(la, lnk); + if (found != NULL) + return (found); + /* link expired */ + grp = StartPointIn(la, alias_addr, alias_port, link_type, 0); + if (grp == NULL) + return (NULL); + break; + } } break; case LINK_UNKNOWN_DEST_PORT: @@ -954,6 +995,14 @@ FindLinkIn(struct libalias *la, struct 
in_addr dst_addr, lnk = _FindLinkIn(la, dst_addr, alias_addr, dst_port, alias_port, link_type, replace_partial_links); + if (lnk == NULL && + (la->packetAliasMode & PKT_ALIAS_UDP_EIM) && + link_type == LINK_UDP && + !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { + lnk = _FindLinkIn(la, ANY_ADDR, alias_addr, 0, alias_port, + link_type, replace_partial_links); + } + if (lnk == NULL) { /* * The following allows permanent links to be specified as @@ -970,6 +1019,20 @@ FindLinkIn(struct libalias *la, struct in_addr dst_addr, return (lnk); } +static struct alias_link * +FindLinkByInternalEndpoint(struct libalias *la, struct in_addr src_addr, + u_short src_port, + int link_type) +{ + struct alias_link needle = { + .src_addr = src_addr, + .src_port = src_port, + .link_type = link_type + }; + LIBALIAS_LOCK_ASSERT(la); + return SPLAY_FIND(splay_internal_endpoint, &la->linkSplayInternalEndpoint, &needle); +} + /* External routines for finding/adding links -- "external" means outside alias_db.c, but within alias*.c -- @@ -986,15 +1049,19 @@ FindLinkIn(struct libalias *la, struct in_addr dst_addr, (prototypes in alias_local.h) */ -struct alias_link * +int FindIcmpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short id_alias, - int create) + int create, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, id_alias, LINK_ICMP, 0); @@ -1005,19 +1072,26 @@ FindIcmpIn(struct libalias *la, struct in_addr dst_addr, lnk = AddLink(la, target_addr, dst_addr, alias_addr, id_alias, NO_DEST_PORT, id_alias, LINK_ICMP); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindIcmpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short id, - int create) + int create, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkOut(la, src_addr, dst_addr, id, NO_DEST_PORT, LINK_ICMP, 0); @@ -1028,8 +1102,11 @@ FindIcmpOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, id, NO_DEST_PORT, GET_ALIAS_ID, LINK_ICMP); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } struct alias_link * @@ -1083,18 +1160,21 @@ FindFragmentPtr(struct libalias *la, struct in_addr dst_addr, LINK_FRAGMENT_PTR, 0); } -struct alias_link * +int FindProtoIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, - u_char proto) + u_char proto, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkIn(la, dst_addr, alias_addr, NO_DEST_PORT, 0, proto, 1); - if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { struct in_addr target_addr; @@ -1102,22 +1182,28 @@ FindProtoIn(struct libalias *la, struct in_addr dst_addr, lnk = AddLink(la, target_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? 
PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindProtoOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, - u_char proto) + u_char proto, + struct alias_link **lnkp) { struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + lnk = FindLinkOut(la, src_addr, dst_addr, NO_SRC_PORT, NO_DEST_PORT, proto, 1); - if (lnk == NULL) { struct in_addr alias_addr; @@ -1125,22 +1211,29 @@ FindProtoOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, NO_SRC_PORT, NO_DEST_PORT, 0, proto); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, struct in_addr alias_addr, u_short dst_port, u_short alias_port, u_char proto, - int create) + int create, + struct alias_link **lnkp) { int link_type; struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; @@ -1149,8 +1242,7 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, link_type = LINK_TCP; break; default: - return (NULL); - break; + return (PKT_ALIAS_IGNORED); } lnk = FindLinkIn(la, dst_addr, alias_addr, @@ -1164,22 +1256,30 @@ FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, lnk = AddLink(la, target_addr, dst_addr, alias_addr, alias_port, dst_port, alias_port, link_type); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); + } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } -struct alias_link * +int FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, u_short src_port, u_short dst_port, u_char proto, - int create) + int create, + struct alias_link **lnkp) { int link_type; struct alias_link *lnk; LIBALIAS_LOCK_ASSERT(la); + + *lnkp = NULL; + switch (proto) { case IPPROTO_UDP: link_type = LINK_UDP; @@ -1188,12 +1288,10 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, link_type = LINK_TCP; break; default: - return (NULL); - break; + return (PKT_ALIAS_IGNORED); } lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create); - if (lnk == NULL && create) { struct in_addr alias_addr; @@ -1201,8 +1299,11 @@ FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, lnk = AddLink(la, src_addr, dst_addr, alias_addr, src_port, dst_port, GET_ALIAS_PORT, link_type); + if (lnk == NULL) + return (PKT_ALIAS_ERROR); } - return (lnk); + *lnkp = lnk; + return (lnk != NULL ? PKT_ALIAS_OK : PKT_ALIAS_IGNORED); } struct alias_link * @@ -2100,6 +2201,7 @@ LibAliasInit(struct libalias *la) SPLAY_INIT(&la->linkSplayIn); SPLAY_INIT(&la->linkSplayOut); + SPLAY_INIT(&la->linkSplayInternalEndpoint); LIST_INIT(&la->pptpList); TAILQ_INIT(&la->checkExpire); #ifdef _KERNEL diff --git a/sys/netinet/libalias/alias_db.h b/sys/netinet/libalias/alias_db.h index 35858099bce2..7175d0a50f4b 100644 --- a/sys/netinet/libalias/alias_db.h +++ b/sys/netinet/libalias/alias_db.h @@ -208,12 +208,14 @@ static struct in_addr const ANY_ADDR = { INADDR_ANY }; stored in the auxiliary space. Pointers to unresolved fragments can also be stored. - The link records support two independent chainings. Lookup + The link records support several independent chainings. Lookup tables for input and out tables hold the initial pointers the link chains. On input, the lookup table indexes on alias port and link type. 
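The Find*In/Out helpers converted above no longer return a bare link pointer; they return PKT_ALIAS_OK, PKT_ALIAS_IGNORED or PKT_ALIAS_ERROR and hand the link back through an output parameter, so callers can tell "no matching link" apart from an allocation failure in AddLink(). The calling pattern the converted sites follow, in sketch form (variable names are illustrative):

    struct alias_link *lnk;
    int ret;

    ret = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst,
        ud->uh_sport, ud->uh_dport, IPPROTO_UDP, 1, &lnk);
    if (ret != PKT_ALIAS_OK)
            return (ret);   /* IGNORED: no link; ERROR: link allocation failed */
    /* ... rewrite the packet using lnk ... */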
On output, the lookup table indexes on source address, destination address, source port, destination - port and link type. + port and link type. A internal_endpoint table is used for + endpoint-independent mapping, and indexes on source address, + source port and link type. */ /* used to save changes to ACK/sequence numbers */ @@ -292,6 +294,7 @@ struct alias_link { struct { SPLAY_ENTRY(alias_link) out; LIST_ENTRY (alias_link) in; + SPLAY_ENTRY(alias_link) internal_endpoint; } all; struct { LIST_ENTRY (alias_link) list; @@ -374,25 +377,38 @@ cmp_in(struct group_in *a, struct group_in *b) { } SPLAY_PROTOTYPE(splay_in, group_in, in, cmp_in); +static inline int +cmp_internal_endpoint(struct alias_link *a, struct alias_link *b) { + int i = a->link_type - b->link_type; + if (i != 0) return (i); + if (a->src_addr.s_addr > b->src_addr.s_addr) return (1); + if (a->src_addr.s_addr < b->src_addr.s_addr) return (-1); + i = a->src_port - b->src_port; + return (i); +} +SPLAY_PROTOTYPE(splay_internal_endpoint, alias_link, all.internal_endpoint, + cmp_internal_endpoint); + /* Internal routines for finding, deleting and adding links Port Allocation: - GetNewPort() -- find and reserve new alias port number - GetSocket() -- try to allocate a socket for a given port + GetNewPort() -- find and reserve new alias port number + GetSocket() -- try to allocate a socket for a given port Link creation and deletion: - CleanupAliasData() - remove all link chains from lookup table - CleanupLink() - look for a stale link - DeleteLink() - remove link - AddLink() - add link - ReLink() - change link + CleanupAliasData() - remove all link chains from lookup table + CleanupLink() - look for a stale link + DeleteLink() - remove link + AddLink() - add link + ReLink() - change link Link search: - FindLinkOut() - find link for outgoing packets - FindLinkIn() - find link for incoming packets + FindLinkOut() - find link for outgoing packets + FindLinkIn() - find link for incoming packets + FindLinkByInternalEndpoint() - find link by a packet's internal endpoint Port search: - FindNewPortGroup() - find an available group of ports + FindNewPortGroup() - find an available group of ports */ /* Local prototypes */ @@ -417,6 +433,9 @@ FindLinkOut(struct libalias *, struct in_addr, struct in_addr, u_short, u_short, static struct alias_link * FindLinkIn(struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int); +static struct alias_link * +FindLinkByInternalEndpoint(struct libalias *, struct in_addr, u_short, int); + static u_short _RandomPort(struct libalias *la); #define GET_NEW_PORT_MAX_ATTEMPTS 20 diff --git a/sys/netinet/libalias/alias_ftp.c b/sys/netinet/libalias/alias_ftp.c index 4a0b616ccf27..4119221e9b35 100644 --- a/sys/netinet/libalias/alias_ftp.c +++ b/sys/netinet/libalias/alias_ftp.c @@ -752,7 +752,7 @@ NewFtpMessage(struct libalias *la, struct ip *pip, /* Compute TCP checksum for revised packet */ tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c index 3ef336b7333d..30cee74fff21 100644 --- a/sys/netinet/libalias/alias_irc.c +++ b/sys/netinet/libalias/alias_irc.c @@ -360,9 +360,9 @@ AliasHandleIrcOut(struct libalias *la, * matter, and this would probably allow it through * at least _some_ firewalls. 
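The alias_ftp.c hunk above, and the matching ones in alias_irc.c, alias_proxy.c, alias_skinny.c and alias_smedia.c below, stop writing th_x2 directly and go through the TCP flag accessors, which expose th_x2 and th_flags as a single 16-bit value; as the surrounding #ifdef _KERNEL blocks suggest, TH_RES1 marks the segment so the checksum is recomputed later instead of being calculated in place. The idiom, spelled out:

    uint16_t flags;

    flags = tcp_get_flags(tc);      /* 16-bit view spanning th_x2 and th_flags */
    flags |= TH_RES1;               /* mark segment: checksum still to be redone */
    tcp_set_flags(tc, flags);       /* writes both th_x2 and th_flags back */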
*/ - dcc_lnk = FindUdpTcpOut(la, true_addr, destaddr, + (void)FindUdpTcpOut(la, true_addr, destaddr, true_port, 0, - IPPROTO_TCP, 1); + IPPROTO_TCP, 1, &dcc_lnk); DBprintf(("Got a DCC link\n")); if (dcc_lnk) { struct in_addr alias_address; /* Address from aliasing */ @@ -456,7 +456,7 @@ AliasHandleIrcOut(struct libalias *la, /* Compute TCP checksum for revised packet */ tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h index 7b82621a105b..7c1dcb0c8eb0 100644 --- a/sys/netinet/libalias/alias_local.h +++ b/sys/netinet/libalias/alias_local.h @@ -94,10 +94,12 @@ struct libalias { * if no aliasing link already exists */ struct in_addr targetAddress; /* Lookup table of pointers to chains of link records. - * Each link record is doubly indexed into input and - * output lookup tables. */ + * Each link record is indexed into input, + * output and "internal endpoint" lookup tables. */ SPLAY_HEAD(splay_out, alias_link) linkSplayOut; SPLAY_HEAD(splay_in, group_in) linkSplayIn; + SPLAY_HEAD(splay_internal_endpoint, alias_link) + linkSplayInternalEndpoint; LIST_HEAD (, alias_link) pptpList; /* HouseKeeping */ TAILQ_HEAD (, alias_link) checkExpire; @@ -237,12 +239,12 @@ struct alias_link * AddLink(struct libalias *la, struct in_addr src_addr, struct in_addr dst_addr, struct in_addr alias_addr, u_short src_port, u_short dst_port, int alias_param, int link_type); -struct alias_link * +int FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_short _id_alias, int _create); -struct alias_link * + u_short _id_alias, int _create, struct alias_link **_lnkp); +int FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, - u_short _id, int _create); + u_short _id, int _create, struct alias_link **_lnkp); struct alias_link * FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, u_short _ip_id); @@ -253,18 +255,20 @@ struct alias_link * AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); struct alias_link * FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); -struct alias_link * +int FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_char _proto); -struct alias_link * + u_char _proto, struct alias_link **_lnkp); +int FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, - u_char _proto); -struct alias_link * + u_char _proto, struct alias_link **_lnkp); +int FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, - u_short _dst_port, u_short _alias_port, u_char _proto, int _create); -struct alias_link * + u_short _dst_port, u_short _alias_port, u_char _proto, int _create, + struct alias_link **_lnkp); +int FindUdpTcpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, - u_short _src_port, u_short _dst_port, u_char _proto, int _create); + u_short _src_port, u_short _dst_port, u_char _proto, int _create, + struct alias_link **_lnkp); struct alias_link * AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, struct in_addr _alias_addr, u_int16_t _src_call_id); diff --git a/sys/netinet/libalias/alias_proxy.c b/sys/netinet/libalias/alias_proxy.c index dd685bed760d..0ff4b87b5000 100644 --- a/sys/netinet/libalias/alias_proxy.c 
+++ b/sys/netinet/libalias/alias_proxy.c @@ -366,7 +366,7 @@ ProxyEncodeTcpStream(struct alias_link *lnk, tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif diff --git a/sys/netinet/libalias/alias_sctp.c b/sys/netinet/libalias/alias_sctp.c index 6781c33f5edb..5ccf31697b42 100644 --- a/sys/netinet/libalias/alias_sctp.c +++ b/sys/netinet/libalias/alias_sctp.c @@ -72,12 +72,12 @@ #ifdef _KERNEL -#include <machine/stdarg.h> #include <sys/param.h> #include <sys/gsb_crc32.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/module.h> +#include <sys/stdarg.h> #include <sys/syslog.h> #include <netinet/libalias/alias_sctp.h> #include <netinet/libalias/alias.h> diff --git a/sys/netinet/libalias/alias_skinny.c b/sys/netinet/libalias/alias_skinny.c index 47d66a474fb4..fd9e15d3ad40 100644 --- a/sys/netinet/libalias/alias_skinny.c +++ b/sys/netinet/libalias/alias_skinny.c @@ -214,7 +214,7 @@ alias_skinny_reg_msg(struct RegisterMessage *reg_msg, struct ip *pip, tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif @@ -257,7 +257,7 @@ alias_skinny_port_msg(struct IpPortMessage *port_msg, struct ip *pip, tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif @@ -279,15 +279,15 @@ alias_skinny_opnrcvch_ack(struct libalias *la, struct OpenReceiveChannelAck *opn *localIpAddr = (u_int32_t)opnrcvch_ack->ipAddr; null_addr.s_addr = INADDR_ANY; - opnrcv_lnk = FindUdpTcpOut(la, pip->ip_src, null_addr, + (void)FindUdpTcpOut(la, pip->ip_src, null_addr, htons((u_short) opnrcvch_ack->port), 0, - IPPROTO_UDP, 1); + IPPROTO_UDP, 1, &opnrcv_lnk); opnrcvch_ack->ipAddr = (u_int32_t)GetAliasAddress(opnrcv_lnk).s_addr; opnrcvch_ack->port = (u_int32_t)ntohs(GetAliasPort(opnrcv_lnk)); tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c index 6c67e0d8f006..badd75a45c61 100644 --- a/sys/netinet/libalias/alias_smedia.c +++ b/sys/netinet/libalias/alias_smedia.c @@ -402,7 +402,7 @@ alias_rtsp_out(struct libalias *la, struct ip *pip, tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif @@ -435,8 +435,8 @@ alias_pna_out(struct libalias *la, struct ip *pip, if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { memcpy(&port, work, 2); - pna_links = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), - port, 0, IPPROTO_UDP, 1); + (void)FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), + port, 0, IPPROTO_UDP, 1, &pna_links); if (pna_links != NULL) { #ifndef NO_FW_PUNCH /* Punch hole in firewall */ @@ -449,7 +449,7 @@ alias_pna_out(struct libalias *la, struct ip *pip, /* Compute TCP checksum for revised packet */ tc->th_sum = 0; #ifdef _KERNEL - tc->th_x2 = (TH_RES1 >> 8); + tcp_set_flags(tc, tcp_get_flags(tc) | TH_RES1); #else tc->th_sum = TcpChecksum(pip); #endif diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3 index b4d123682f0b..1b8ecc14059d 100644 --- a/sys/netinet/libalias/libalias.3 +++ b/sys/netinet/libalias/libalias.3 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd May 31, 2021 +.Dd November 29, 2024 .Dt LIBALIAS 3 .Os .Sh NAME @@ -200,11 +200,11 @@ is closed. .It Dv PKT_ALIAS_UNREGISTERED_ONLY If this mode bit is set, traffic on the local network which does not originate from unregistered address spaces will be ignored. -Standard Class A, B and C unregistered addresses are: +The standard private IP address ranges are: .Pp -10.0.0.0 -> 10.255.255.255 (Class A subnet) -172.16.0.0 -> 172.31.255.255 (Class B subnets) -192.168.0.0 -> 192.168.255.255 (Class C subnets) +10.0.0.0 -> 10.255.255.255 (/8) +172.16.0.0 -> 172.31.255.255 (/12) +192.168.0.0 -> 192.168.255.255 (/16) .Pp This option is useful in the case that the packet aliasing host has both registered and unregistered subnets on different interfaces. @@ -270,6 +270,26 @@ See section in .Xr ipfw 8 for more details. +.It Dv PKT_ALIAS_UDP_EIM +When this bit is set, UDP uses endpoint-independent mapping (EIM), as per +RFC 4787 ("full cone" NAT of RFC 3489). +All packets from the same internal address:port are mapped to the same NAT +address:port, regardless of their destination address:port. +If filtering rules allow, and if +.Em PKT_ALIAS_DENY_INCOMING +is unset, any other external address:port can +also send to the internal address:port through its mapped NAT address:port. +This is more compatible with applications, and can reduce the need for port +forwarding, but less scalable as each NAT address:port can only be +concurrently used by at most one internal address:port. +.Pp +When this bit is unset, UDP packets use endpoint-dependent mapping (EDM) +("symmetric" NAT). +Each connection from a particular internal address:port to different +external addresses:ports is mapped to a random and unpredictable NAT +address:port. +Two applications behind EDM NATs can only connect to each other +by port forwarding on the NAT, or tunnelling through an in-between server. .El .Ed .Pp diff --git a/sys/netinet/pim.h b/sys/netinet/pim.h index 98230fc6ae2d..4744ffc7e9d8 100644 --- a/sys/netinet/pim.h +++ b/sys/netinet/pim.h @@ -71,7 +71,7 @@ struct pim { #endif /* ! _PIM_VT */ uint8_t pim_reserved; /* Reserved */ uint16_t pim_cksum; /* IP-style checksum */ -}; +} __packed; /* KAME-related name backward compatibility */ #define pim_ver pim_vers #define pim_rsv pim_reserved diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index a6bef1c7e275..66070faf97e9 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -30,7 +30,6 @@ * SUCH DAMAGE.
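Marking wire-format structures __packed, as done here for struct pim and earlier for struct ipovly, pins the C layout to the on-wire layout regardless of the target's alignment rules. A compile-time check is a cheap way to document that expectation; the 4-byte size below is the PIM fixed header (version/type, reserved, checksum) and the assertion itself is illustrative, not part of this change:

    #include <netinet/pim.h>

    /* The PIM fixed header occupies exactly 4 octets on the wire. */
    _Static_assert(sizeof(struct pim) == 4, "struct pim must match the wire format");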
*/ -#include <sys/cdefs.h> #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" @@ -50,6 +49,7 @@ #include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/stdarg.h> #include <sys/sx.h> #include <sys/sysctl.h> #include <sys/systm.h> @@ -75,7 +75,6 @@ #include <netipsec/ipsec_support.h> -#include <machine/stdarg.h> #include <security/mac/mac_framework.h> extern ipproto_input_t *ip_protox[]; @@ -128,6 +127,12 @@ int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ +#define V_rip_bind_all_fibs VNET(rip_bind_all_fibs) +VNET_DEFINE(int, rip_bind_all_fibs) = 1; +SYSCTL_INT(_net_inet_raw, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, + &VNET_NAME(rip_bind_all_fibs), 0, + "Bound sockets receive traffic from all FIBs"); + u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); @@ -302,7 +307,9 @@ rip_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp; struct inpcb *inp; struct sockaddr_in ripsrc; - int appended; + int appended, fib; + + M_ASSERTPKTHDR(m); *mp = NULL; appended = 0; @@ -312,6 +319,7 @@ rip_input(struct mbuf **mp, int *offp, int proto) ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ctx.ip->ip_src; + fib = M_GETFIB(m); ifp = m->m_pkthdr.rcvif; inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr, @@ -326,6 +334,12 @@ rip_input(struct mbuf **mp, int *offp, int proto) */ continue; } + if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum) + /* + * Sockets bound to a specific FIB can only receive + * packets from that FIB. + */ + continue; appended += rip_append(inp, ctx.ip, m, &ripsrc); } @@ -343,6 +357,9 @@ rip_input(struct mbuf **mp, int *offp, int proto) * and fall through into normal filter path if so. */ continue; + if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum) + continue; + /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket @@ -584,7 +601,7 @@ rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) - ip_fillid(ip); + ip_fillid(ip, V_ip_random_id); /* * XXX prevent ip_output from overwriting header fields. @@ -625,8 +642,6 @@ rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam, * * When adding new socket options here, make sure to add access control * checks here as necessary. - * - * XXX-BZ inp locking? 
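The new net.inet.raw.bind_all_fibs tunable defaults to 1, the historical behaviour; when set to 0, a raw socket only receives packets whose mbuf FIB matches the socket's own FIB, and the SO_SETFIB handling just below is what attaches a socket to a FIB in the first place. A short userland sketch, error handling omitted:

    #include <sys/socket.h>
    #include <netinet/in.h>

    int s, fib = 1;

    s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
    /* With bind_all_fibs=0 this socket now sees only FIB 1 traffic. */
    setsockopt(s, SOL_SOCKET, SO_SETFIB, &fib, sizeof(fib));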
*/ int rip_ctloutput(struct socket *so, struct sockopt *sopt) @@ -635,11 +650,10 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) int error, optval; if (sopt->sopt_level != IPPROTO_IP) { - if ((sopt->sopt_level == SOL_SOCKET) && - (sopt->sopt_name == SO_SETFIB)) { - inp->inp_inc.inc_fibnum = so->so_fibnum; - return (0); - } + if (sopt->sopt_dir == SOPT_SET && + sopt->sopt_level == SOL_SOCKET && + sopt->sopt_name == SO_SETFIB) + return (ip_ctloutput(so, sopt)); return (EINVAL); } @@ -707,10 +721,12 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) sizeof optval); if (error) break; + INP_WLOCK(inp); if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; + INP_WUNLOCK(inp); break; case IP_FW3: /* generic ipfw v.3 functions */ diff --git a/sys/netinet/sctp_asconf.c b/sys/netinet/sctp_asconf.c index 3a30b0ba3740..6b98557c45a6 100644 --- a/sys/netinet/sctp_asconf.c +++ b/sys/netinet/sctp_asconf.c @@ -1313,13 +1313,13 @@ sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, #ifdef SCTP_DEBUG if (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_ASCONF2) { if (type == SCTP_ADD_IP_ADDRESS) { - SCTP_PRINTF("asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: "); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else if (type == SCTP_DEL_IP_ADDRESS) { - SCTP_PRINTF("asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: "); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else { - SCTP_PRINTF("asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: "); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } } diff --git a/sys/netinet/sctp_bsd_addr.c b/sys/netinet/sctp_bsd_addr.c index a91b0dde5967..ac715d8298ec 100644 --- a/sys/netinet/sctp_bsd_addr.c +++ b/sys/netinet/sctp_bsd_addr.c @@ -117,25 +117,26 @@ sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) { struct in6_ifaddr *ifa6; + KASSERT(ifa->address.sa.sa_family == AF_INET6, + ("sctp_gather_internal_ifa_flags() called with address family %u", + ifa->address.sa.sa_family)); ifa6 = (struct in6_ifaddr *)ifa->ifa; ifa->flags = ifa6->ia6_flags; - if (!MODULE_GLOBAL(ip6_use_deprecated)) { - if (ifa->flags & - IN6_IFF_DEPRECATED) { + if (MODULE_GLOBAL(ip6_use_deprecated)) { + ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; + } else { + if (ifa->flags & IN6_IFF_DEPRECATED) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } - } else { - ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } - if (ifa->flags & - (IN6_IFF_DETACHED | - IN6_IFF_ANYCAST | - IN6_IFF_NOTREADY)) { + if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_DUPLICATED)) { + ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; + } + /* Right now, do not support IPv6 anycast addresses */ + if (ifa->flags & IN6_IFF_ANYCAST) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; - } else { - ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } #endif /* INET6 */ @@ -338,8 +339,8 @@ sctp_addr_change(struct ifaddr *ifa, int cmd) (void *)ifa, ifa->ifa_addr, ifa_flags, 1); } else { sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr, - ifa->ifa_ifp->if_index, - ifa->ifa_ifp->if_xname); + (void *)ifa->ifa_ifp, + ifa->ifa_ifp->if_index); /* * We don't bump refcount here so when it completes the diff --git a/sys/netinet/sctp_header.h 
b/sys/netinet/sctp_header.h index 9696c4e954ba..c9fd0341f83a 100644 --- a/sys/netinet/sctp_header.h +++ b/sys/netinet/sctp_header.h @@ -83,7 +83,7 @@ struct sctp_supported_addr_param { /* heartbeat info parameter */ struct sctp_heartbeat_info_param { struct sctp_paramhdr ph; - uint32_t time_value_1; + time_t time_value_1; uint32_t time_value_2; uint32_t random_value1; uint32_t random_value2; diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index a30fd95fef30..693de313b970 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -746,21 +746,6 @@ sctp_build_readq_entry_from_ctl(struct sctp_queued_to_read *nc, struct sctp_queu nc->do_not_ref_stcb = control->do_not_ref_stcb; } -static void -sctp_reset_a_control(struct sctp_queued_to_read *control, - struct sctp_inpcb *inp, uint32_t tsn) -{ - control->fsn_included = tsn; - if (control->on_read_q) { - /* - * We have to purge it from there, hopefully this will work - * :-) - */ - TAILQ_REMOVE(&inp->read_queue, control, next); - control->on_read_q = 0; - } -} - static int sctp_handle_old_unordered_data(struct sctp_tcb *stcb, struct sctp_association *asoc, @@ -1922,7 +1907,8 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, SCTP_SNPRINTF(msg, sizeof(msg), "Duplicate MID=%8.8x detected.", mid); goto err_out; } else { - if ((tsn == control->fsn_included + 1) && + if ((control->first_frag_seen) && + (tsn == control->fsn_included + 1) && (control->end_added == 0)) { SCTP_SNPRINTF(msg, sizeof(msg), "Illegal message sequence, missing end for MID: %8.8x", @@ -5241,6 +5227,10 @@ sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, uint32_t mid; int need_reasm_check = 0; + KASSERT(stcb != NULL, ("stcb == NULL")); + SCTP_TCB_LOCK_ASSERT(stcb); + SCTP_INP_READ_LOCK_ASSERT(stcb->sctp_ep); + asoc = &stcb->asoc; mid = strmin->last_mid_delivered; /* @@ -5278,11 +5268,9 @@ sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, /* deliver it to at least the delivery-q */ if (stcb->sctp_socket) { sctp_mark_non_revokable(asoc, control->sinfo_tsn); - sctp_add_to_readq(stcb->sctp_ep, stcb, - control, - &stcb->sctp_socket->so_rcv, - 1, SCTP_READ_LOCK_HELD, - SCTP_SO_NOT_LOCKED); + sctp_add_to_readq(stcb->sctp_ep, stcb, control, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); } } else { /* Its a fragmented message */ @@ -5352,8 +5340,7 @@ sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, strmin->last_mid_delivered = control->mid; if (stcb->sctp_socket) { sctp_mark_non_revokable(asoc, control->sinfo_tsn); - sctp_add_to_readq(stcb->sctp_ep, stcb, - control, + sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); } @@ -5394,6 +5381,11 @@ sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb, * it can be delivered... But for now we just dump everything on the * queue. 
*/ + + KASSERT(stcb != NULL, ("stcb == NULL")); + SCTP_TCB_LOCK_ASSERT(stcb); + SCTP_INP_READ_LOCK_ASSERT(stcb->sctp_ep); + if (!asoc->idata_supported && !ordered && control->first_frag_seen && SCTP_TSN_GT(control->fsn_included, cumtsn)) { @@ -5424,12 +5416,25 @@ sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb, sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } if (!TAILQ_EMPTY(&control->reasm)) { - /* This has to be old data, unordered */ + KASSERT(!asoc->idata_supported, + ("Reassembly queue not empty for I-DATA")); + KASSERT(!ordered, + ("Reassembly queue not empty for ordered data")); if (control->data) { sctp_m_freem(control->data); control->data = NULL; } - sctp_reset_a_control(control, stcb->sctp_ep, cumtsn); + control->fsn_included = 0xffffffff; + control->first_frag_seen = 0; + control->last_frag_seen = 0; + if (control->on_read_q) { + /* + * We have to purge it from there, hopefully this + * will work :-) + */ + TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); + control->on_read_q = 0; + } chk = TAILQ_FIRST(&control->reasm); if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { TAILQ_REMOVE(&control->reasm, chk, sctp_next); diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index a55ef5ac1eab..dc31ffbc2161 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -2329,7 +2329,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, } ep = &(*inp_p)->sctp_ep; /* which cookie is it? */ - if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) && + if ((cookie->time_entered.tv_sec < ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* it's the old cookie */ (void)sctp_hmac_m(SCTP_HMAC, @@ -2352,7 +2352,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, /* compare the received digest with the computed digest */ if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ - if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && + if ((cookie->time_entered.tv_sec == ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* compute digest with old */ (void)sctp_hmac_m(SCTP_HMAC, @@ -4231,6 +4231,8 @@ sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, SCTP_STAT_INCR(sctps_pdrpmbda); } } else { + desc.tsn_ifany = htonl(0); + memset(desc.data_bytes, 0, SCTP_NUM_DB_TO_VERIFY); if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) { SCTP_STAT_INCR(sctps_pdrpmbct); } diff --git a/sys/netinet/sctp_os_bsd.h b/sys/netinet/sctp_os_bsd.h index eb0caec942e9..9cec02aa6a07 100644 --- a/sys/netinet/sctp_os_bsd.h +++ b/sys/netinet/sctp_os_bsd.h @@ -342,7 +342,7 @@ typedef struct callout sctp_os_timer_t; } while(0) /* Other m_pkthdr type things */ -#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0) +#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? 
in_ifnet_broadcast(dst, m->m_pkthdr.rcvif) : 0) #define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP))) /* This converts any input packet header diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index a8facff6b917..e4bdb4291972 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -3655,8 +3655,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = stcb->rport; m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); - if ((sin.sin_addr.s_addr == INADDR_ANY) || - (sin.sin_addr.s_addr == INADDR_BROADCAST) || + if (in_broadcast(sin.sin_addr) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); @@ -3687,8 +3686,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); - if ((sin.sin_addr.s_addr == INADDR_ANY) || - (sin.sin_addr.s_addr == INADDR_BROADCAST) || + if (in_broadcast(sin.sin_addr) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); @@ -4073,7 +4071,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, ip->ip_off = htons(0); } /* FreeBSD has a function for ip_id's */ - ip_fillid(ip); + ip_fillid(ip, V_ip_random_id); ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl; ip->ip_len = htons(packet_length); @@ -6705,7 +6703,9 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, } else { m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); - SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr); + if (m != NULL) { + SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr); + } } if (m != NULL) { struct sctp_paramhdr *ph; @@ -6909,10 +6909,20 @@ static int sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, struct sctp_nonpad_sndrcvinfo *srcv) { - int ret; struct sctp_copy_all *ca; + struct mbuf *mat; + ssize_t sndlen; + int ret; - if (uio->uio_resid > (ssize_t)SCTP_BASE_SYSCTL(sctp_sendall_limit)) { + if (uio != NULL) { + sndlen = uio->uio_resid; + } else { + sndlen = 0; + for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { + sndlen += SCTP_BUF_LEN(mat); + } + } + if (sndlen > (ssize_t)SCTP_BASE_SYSCTL(sctp_sendall_limit)) { /* You must not be larger than the limit! */ return (EMSGSIZE); } @@ -6924,12 +6934,10 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, return (ENOMEM); } memset(ca, 0, sizeof(struct sctp_copy_all)); - ca->inp = inp; if (srcv != NULL) { memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); } - /* Serialize. 
*/ SCTP_INP_WLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SND_ITERATOR_UP) != 0) { @@ -6940,15 +6948,14 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, } inp->sctp_flags |= SCTP_PCB_FLAGS_SND_ITERATOR_UP; SCTP_INP_WUNLOCK(inp); - /* * take off the sendall flag, it would be bad if we failed to do * this :-0 */ ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; /* get length and mbuf chain */ - if (uio) { - ca->sndlen = uio->uio_resid; + ca->sndlen = sndlen; + if (uio != NULL) { ca->m = sctp_copy_out_all(uio, ca->sndlen); if (ca->m == NULL) { SCTP_FREE(ca, SCTP_M_COPYAL); @@ -6960,20 +6967,14 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, return (ENOMEM); } } else { - /* Gather the length of the send */ - struct mbuf *mat; - - ca->sndlen = 0; - for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { - ca->sndlen += SCTP_BUF_LEN(mat); - } + ca->m = m; } ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)ca, 0, sctp_sendall_completes, inp, 1); - if (ret) { + if (ret != 0) { SCTP_INP_WLOCK(inp); inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP; SCTP_INP_WUNLOCK(inp); @@ -11196,7 +11197,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, ip->ip_hl = (sizeof(struct ip) >> 2); ip->ip_tos = 0; ip->ip_off = htons(IP_DF); - ip_fillid(ip); + ip_fillid(ip, V_ip_random_id); ip->ip_ttl = MODULE_GLOBAL(ip_defttl); if (port) { ip->ip_p = IPPROTO_UDP; @@ -11424,7 +11425,7 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked) /* Fill out hb parameter */ hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO); hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param)); - hb->heartbeat.hb_info.time_value_1 = (uint32_t)now.tv_sec; + hb->heartbeat.hb_info.time_value_1 = now.tv_sec; hb->heartbeat.hb_info.time_value_2 = now.tv_usec; /* Did our user request this one, put it in */ hb->heartbeat.hb_info.addr_family = (uint8_t)net->ro._l_addr.sa.sa_family; @@ -13910,15 +13911,15 @@ sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t *ro) mask = (struct sockaddr_in *)(ifa->ifa_netmask); sin = &sifa->address.sin; srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); - SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is "); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "match_nexthop4: src address is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); - SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "network address is %x\n", srcnetaddr.s_addr); sin = &ro->ro_nh->gw4_sa; gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); - SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is "); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "match_nexthop4: nexthop is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa); - SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "network address is %x\n", gwnetaddr.s_addr); if (srcnetaddr.s_addr == gwnetaddr.s_addr) { return (1); } diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index 1509ac13901e..2092f20e3c22 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -192,21 +192,16 @@ sctp_find_ifn(void *ifn, uint32_t ifn_index) struct sctp_ifn *sctp_ifnp; struct sctp_ifnlist *hash_ifn_head; - /* - * We assume the lock is held for the addresses if that's wrong - * problems could occur :-) - */ SCTP_IPI_ADDR_LOCK_ASSERT(); + KASSERT(ifn != 
NULL, ("sctp_find_ifn(NULL, %u) called", ifn_index)); hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { - if (sctp_ifnp->ifn_index == ifn_index) { - return (sctp_ifnp); - } - if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) { - return (sctp_ifnp); + if (sctp_ifnp->ifn_index == ifn_index && + sctp_ifnp->ifn_p == ifn) { + break; } } - return (NULL); + return (sctp_ifnp); } struct sctp_vrf * @@ -239,7 +234,7 @@ sctp_free_vrf(struct sctp_vrf *vrf) } } -void +static void sctp_free_ifn(struct sctp_ifn *sctp_ifnp) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { @@ -253,17 +248,6 @@ sctp_free_ifn(struct sctp_ifn *sctp_ifnp) } void -sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu) -{ - struct sctp_ifn *sctp_ifnp; - - sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index); - if (sctp_ifnp != NULL) { - sctp_ifnp->ifn_mtu = mtu; - } -} - -void sctp_free_ifa(struct sctp_ifa *sctp_ifap) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { @@ -277,123 +261,30 @@ sctp_free_ifa(struct sctp_ifa *sctp_ifap) } static void -sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock) +sctp_delete_ifn(struct sctp_ifn *sctp_ifnp) { - struct sctp_ifn *found; - found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index); - if (found == NULL) { + SCTP_IPI_ADDR_WLOCK_ASSERT(); + if (sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index) == NULL) { /* Not in the list.. sorry */ return; } - if (hold_addr_lock == 0) { - SCTP_IPI_ADDR_WLOCK(); - } else { - SCTP_IPI_ADDR_WLOCK_ASSERT(); - } LIST_REMOVE(sctp_ifnp, next_bucket); LIST_REMOVE(sctp_ifnp, next_ifn); - if (hold_addr_lock == 0) { - SCTP_IPI_ADDR_WUNLOCK(); - } /* Take away the reference, and possibly free it */ sctp_free_ifn(sctp_ifnp); } -void -sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, - const char *if_name, uint32_t ifn_index) -{ - struct sctp_vrf *vrf; - struct sctp_ifa *sctp_ifap; - - SCTP_IPI_ADDR_RLOCK(); - vrf = sctp_find_vrf(vrf_id); - if (vrf == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); - goto out; - } - sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); - if (sctp_ifap == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); - goto out; - } - if (sctp_ifap->ifn_p == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); - goto out; - } - if (if_name) { - if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", - sctp_ifap->ifn_p->ifn_name, if_name); - goto out; - } - } else { - if (sctp_ifap->ifn_p->ifn_index != ifn_index) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", - sctp_ifap->ifn_p->ifn_index, ifn_index); - goto out; - } - } - - sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID); - sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; -out: - SCTP_IPI_ADDR_RUNLOCK(); -} - -void -sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, - const char *if_name, uint32_t ifn_index) -{ - struct sctp_vrf *vrf; - struct sctp_ifa *sctp_ifap; - - SCTP_IPI_ADDR_RLOCK(); - vrf = sctp_find_vrf(vrf_id); - if (vrf == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); - goto out; - } - sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); - if (sctp_ifap == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); - goto 
out; - } - if (sctp_ifap->ifn_p == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); - goto out; - } - if (if_name) { - if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", - sctp_ifap->ifn_p->ifn_name, if_name); - goto out; - } - } else { - if (sctp_ifap->ifn_p->ifn_index != ifn_index) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", - sctp_ifap->ifn_p->ifn_index, ifn_index); - goto out; - } - } - - sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE); - sctp_ifap->localifa_flags |= SCTP_ADDR_VALID; -out: - SCTP_IPI_ADDR_RUNLOCK(); -} - /*- * Add an ifa to an ifn. * Register the interface as necessary. - * NOTE: ADDR write lock MUST be held. */ static void sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) { int ifa_af; + SCTP_IPI_ADDR_WLOCK_ASSERT(); LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); @@ -424,11 +315,11 @@ sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) * Remove an ifa from its ifn. * If no more addresses exist, remove the ifn too. Otherwise, re-register * the interface based on the remaining address families left. - * NOTE: ADDR write lock MUST be held. */ static void sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) { + SCTP_IPI_ADDR_WLOCK_ASSERT(); LIST_REMOVE(sctp_ifap, next_ifa); if (sctp_ifap->ifn_p) { /* update address counts */ @@ -450,7 +341,7 @@ sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { /* remove the ifn, possibly freeing it */ - sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED); + sctp_delete_ifn(sctp_ifap->ifn_p); } else { /* re-register address family type, if needed */ if ((sctp_ifap->ifn_p->num_v6 == 0) && @@ -479,7 +370,6 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, struct sctp_ifalist *hash_addr_head; struct sctp_ifnlist *hash_ifn_head; uint32_t hash_of_addr; - int new_ifn_af = 0; #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); @@ -543,59 +433,74 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); - new_ifn_af = 1; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); - if (sctp_ifap) { - /* Hmm, it already exists? 
*/ - if ((sctp_ifap->ifn_p) && - (sctp_ifap->ifn_p->ifn_index == ifn_index)) { - SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", - sctp_ifap->ifn_p->ifn_name, ifn_index, - (void *)sctp_ifap); - if (new_ifn_af) { - /* Remove the created one that we don't want */ - sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED); - } - if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { - /* easy to solve, just switch back to active */ - SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); - sctp_ifap->localifa_flags = SCTP_ADDR_VALID; - sctp_ifap->ifn_p = sctp_ifnp; - atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); - } - exit_stage_left: - SCTP_IPI_ADDR_WUNLOCK(); - if (new_sctp_ifnp != NULL) { - SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); - } - SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); - return (sctp_ifap); - } else { - if (sctp_ifap->ifn_p) { + if (sctp_ifap != NULL) { + /* The address being added is already or still known. */ + if (sctp_ifap->ifn_p != NULL) { + if (sctp_ifap->ifn_p->ifn_index == ifn_index && + sctp_ifap->ifn_p->ifn_p == ifn) { + SCTPDBG(SCTP_DEBUG_PCB4, + "Using existing ifn %s (0x%x) for ifa %p\n", + sctp_ifap->ifn_p->ifn_name, ifn_index, + (void *)sctp_ifap); + if (new_sctp_ifnp == NULL) { + /* Remove the created one not used. */ + sctp_delete_ifn(sctp_ifnp); + } + if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { + /* Switch back to active. */ + SCTPDBG(SCTP_DEBUG_PCB4, + "Clearing deleted ifa flag\n"); + sctp_ifap->localifa_flags = SCTP_ADDR_VALID; +#ifdef INET6 + if (sctp_ifap->address.sa.sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif + sctp_ifap->ifn_p = sctp_ifnp; + atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); + } + } else { /* * The last IFN gets the address, remove the - * old one + * old one. */ - SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", - (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name, + SCTPDBG(SCTP_DEBUG_PCB4, + "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", + (void *)sctp_ifap, + sctp_ifap->ifn_p->ifn_name, sctp_ifap->ifn_p->ifn_index, if_name, ifn_index); /* remove the address from the old ifn */ sctp_remove_ifa_from_ifn(sctp_ifap); /* move the address over to the new ifn */ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); - goto exit_stage_left; - } else { - /* repair ifnp which was NULL ? */ - sctp_ifap->localifa_flags = SCTP_ADDR_VALID; - SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", - (void *)sctp_ifnp, (void *)sctp_ifap); - sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } - goto exit_stage_left; + } else { + /* Repair ifn_p, which was NULL... 
*/ + sctp_ifap->localifa_flags = SCTP_ADDR_VALID; +#ifdef INET6 + if (sctp_ifap->address.sa.sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif + SCTPDBG(SCTP_DEBUG_PCB4, + "Repairing ifn %p for ifa %p\n", + (void *)sctp_ifnp, (void *)sctp_ifap); + sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); + } + SCTP_IPI_ADDR_WUNLOCK(); + if (new_sctp_ifnp != NULL) { + SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } + SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); + return (sctp_ifap); } + KASSERT(sctp_ifnp != NULL, + ("sctp_add_addr_to_vrf: sctp_ifnp == NULL")); + KASSERT(sctp_ifap == NULL, + ("sctp_add_addr_to_vrf: sctp_ifap (%p) != NULL", sctp_ifap)); sctp_ifap = new_sctp_ifap; memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); sctp_ifap->ifn_p = sctp_ifnp; @@ -605,6 +510,11 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; +#ifdef INET6 + if (addr->sa_family == AF_INET6) { + sctp_gather_internal_ifa_flags(sctp_ifap); + } +#endif /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET @@ -621,8 +531,8 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v4++; - if (new_ifn_af) - new_ifn_af = AF_INET; + if (new_sctp_ifnp == NULL) + sctp_ifnp->registered_af = AF_INET; break; } #endif @@ -641,13 +551,12 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v6++; - if (new_ifn_af) - new_ifn_af = AF_INET6; + if (new_sctp_ifnp == NULL) + sctp_ifnp->registered_af = AF_INET6; break; } #endif default: - new_ifn_af = 0; break; } hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); @@ -663,9 +572,6 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, sctp_ifnp->ifa_count++; vrf->total_ifa_count++; atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); - if (new_ifn_af) { - sctp_ifnp->registered_af = new_ifn_af; - } SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); @@ -687,8 +593,7 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Opps, must decrement the count */ - sctp_del_addr_from_vrf(vrf_id, addr, ifn_index, - if_name); + sctp_del_addr_from_vrf(vrf_id, addr, ifn, ifn_index); return (NULL); } SCTP_INCR_LADDR_COUNT(); @@ -713,16 +618,17 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, void sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, - uint32_t ifn_index, const char *if_name) + void *ifn, uint32_t ifn_index) { struct sctp_vrf *vrf; - struct sctp_ifa *sctp_ifap = NULL; + struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_WLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); - goto out_now; + SCTP_IPI_ADDR_WUNLOCK(); + return; } #ifdef SCTP_DEBUG @@ -730,38 +636,21 @@ sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); - if (sctp_ifap) { + if (sctp_ifap != NULL) { /* Validate the delete */ if (sctp_ifap->ifn_p) { - int valid = 0; - - /*- - * The name has priority over the ifn_index - * if its given. 
- */ - if (if_name) { - if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) == 0) { - /* They match its a correct delete */ - valid = 1; - } - } - if (!valid) { - /* last ditch check ifn_index */ - if (ifn_index == sctp_ifap->ifn_p->ifn_index) { - valid = 1; - } - } - if (!valid) { - SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n", - ifn_index, ((if_name == NULL) ? "NULL" : if_name)); - SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n", - sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name); + if (ifn_index != sctp_ifap->ifn_p->ifn_index || + ifn != sctp_ifap->ifn_p->ifn_p) { + SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d (%p) ifname:%s - ignoring delete\n", + sctp_ifap->ifn_p->ifn_index, + sctp_ifap->ifn_p->ifn_p, + sctp_ifap->ifn_p->ifn_name); SCTP_IPI_ADDR_WUNLOCK(); return; } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); - sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; + sctp_ifap->localifa_flags &= ~SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until @@ -778,13 +667,12 @@ sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, else { SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", ifn_index); - SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); + SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); } #endif -out_now: SCTP_IPI_ADDR_WUNLOCK(); - if (sctp_ifap) { + if (sctp_ifap != NULL) { struct sctp_laddr *wi; wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); @@ -2570,7 +2458,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) /* Setup the initial secret */ (void)SCTP_GETTIME_TIMEVAL(&time); - m->time_of_secret_change = (unsigned int)time.tv_sec; + m->time_of_secret_change = time.tv_sec; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { m->secret_key[0][i] = sctp_select_initial_TSN(m); @@ -3177,7 +3065,7 @@ continue_anyway: /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. 
*/ - error = EINVAL; + error = EADDRNOTAVAIL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); goto out; } @@ -4257,8 +4145,7 @@ sctp_aloc_assoc_locked(struct sctp_inpcb *inp, struct sockaddr *firstaddr, sin = (struct sockaddr_in *)firstaddr; if ((ntohs(sin->sin_port) == 0) || - (sin->sin_addr.s_addr == INADDR_ANY) || - (sin->sin_addr.s_addr == INADDR_BROADCAST) || + in_broadcast(sin->sin_addr) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (SCTP_IPV6_V6ONLY(inp) != 0))) { @@ -4360,7 +4247,6 @@ sctp_aloc_assoc_locked(struct sctp_inpcb *inp, struct sockaddr *firstaddr, LIST_REMOVE(stcb, sctp_asocs); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); - SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); @@ -4541,7 +4427,7 @@ sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr) } static bool -sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport, uint32_t now) +sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport, time_t now) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; @@ -4563,7 +4449,7 @@ sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport, uint32_t now) } static void -sctp_set_vtag_block(struct sctp_timewait *vtag_block, uint32_t time, +sctp_set_vtag_block(struct sctp_timewait *vtag_block, time_t time, uint32_t tag, uint16_t lport, uint16_t rport) { vtag_block->tv_sec_at_expire = time; @@ -4578,13 +4464,13 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint16_t lport, uint16_t rport) struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct timeval now; - uint32_t time; + time_t time; int i; bool set; SCTP_INP_INFO_WLOCK_ASSERT(); (void)SCTP_GETTIME_TIMEVAL(&now); - time = (uint32_t)now.tv_sec + SCTP_BASE_SYSCTL(sctp_vtag_time_wait); + time = now.tv_sec + SCTP_BASE_SYSCTL(sctp_vtag_time_wait); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; set = false; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { @@ -4596,7 +4482,7 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint16_t lport, uint16_t rport) continue; } if ((twait_block->vtag_block[i].v_tag != 0) && - (twait_block->vtag_block[i].tv_sec_at_expire < (uint32_t)now.tv_sec)) { + (twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { if (set) { /* Audit expires this guy */ sctp_set_vtag_block(twait_block->vtag_block + i, 0, 0, 0, 0); @@ -6136,8 +6022,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, /* Skip multi-cast addresses */ goto next_param; } - if ((sin.sin_addr.s_addr == INADDR_BROADCAST) || - (sin.sin_addr.s_addr == INADDR_ANY)) { + if (in_broadcast(sin.sin_addr)) { goto next_param; } sa = (struct sockaddr *)&sin; @@ -6745,7 +6630,7 @@ sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval * return (false); } } - return (!sctp_is_in_timewait(tag, lport, rport, (uint32_t)now->tv_sec)); + return (!sctp_is_in_timewait(tag, lport, rport, now->tv_sec)); } static void @@ -6952,7 +6837,7 @@ sctp_drain_mbufs(struct sctp_tcb *stcb) } static void -sctp_drain(void) +sctp_drain(void *arg __unused, int flags __unused) { struct epoch_tracker et; diff --git a/sys/netinet/sctp_pcb.h b/sys/netinet/sctp_pcb.h index e57e13654073..2bec2bc32d4e 100644 --- a/sys/netinet/sctp_pcb.h +++ b/sys/netinet/sctp_pcb.h @@ -130,7 +130,7 @@ struct sctp_block_entry { }; struct sctp_timewait { - uint32_t tv_sec_at_expire; /* the 
seconds from boot to expire */ + time_t tv_sec_at_expire; /* the seconds from boot to expire */ uint32_t v_tag; /* the vtag that can not be reused */ uint16_t lport; /* the local port used in vtag */ uint16_t rport; /* the remote port used in vtag */ @@ -263,8 +263,8 @@ struct sctp_base_info { * access /dev/random. */ struct sctp_pcb { - unsigned int time_of_secret_change; /* number of seconds from - * timeval.tv_sec */ + time_t time_of_secret_change; /* number of seconds from + * timeval.tv_sec */ uint32_t secret_key[SCTP_HOW_MANY_SECRETS][SCTP_NUMBER_OF_SECRETS]; unsigned int size_of_a_cookie; @@ -487,18 +487,6 @@ struct sctp_vrf *sctp_allocate_vrf(int vrfid); struct sctp_vrf *sctp_find_vrf(uint32_t vrfid); void sctp_free_vrf(struct sctp_vrf *vrf); -/*- - * Change address state, can be used if - * O/S supports telling transports about - * changes to IFA/IFN's (link layer triggers). - * If a ifn goes down, we will do src-addr-selection - * and NOT use that, as a source address. This does - * not stop the routing system from routing out - * that interface, but we won't put it as a source. - */ -void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); -void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); - struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrfid, void *ifn, uint32_t ifn_index, uint32_t ifn_type, @@ -506,14 +494,11 @@ sctp_add_addr_to_vrf(uint32_t vrfid, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add); -void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu); - -void sctp_free_ifn(struct sctp_ifn *sctp_ifnp); void sctp_free_ifa(struct sctp_ifa *sctp_ifap); void sctp_del_addr_from_vrf(uint32_t vrfid, struct sockaddr *addr, - uint32_t ifn_index, const char *if_name); + void *ifn, uint32_t ifn_index); struct sctp_nets *sctp_findnet(struct sctp_tcb *, struct sockaddr *); diff --git a/sys/netinet/sctp_syscalls.c b/sys/netinet/sctp_syscalls.c index d67e260b6f99..9d85576e2592 100644 --- a/sys/netinet/sctp_syscalls.c +++ b/sys/netinet/sctp_syscalls.c @@ -141,13 +141,14 @@ sys_sctp_peeloff(struct thread *td, struct sctp_peeloff_args *uap) { struct file *headfp, *nfp = NULL; struct socket *head, *so; + struct filecaps fcaps; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); - error = getsock(td, uap->sd, cap_rights_init_one(&rights, CAP_PEELOFF), - &headfp); + error = getsock_cap(td, uap->sd, + cap_rights_init_one(&rights, CAP_PEELOFF), &headfp, &fcaps); if (error != 0) goto done2; fflag = atomic_load_int(&headfp->f_flag); @@ -165,7 +166,7 @@ sys_sctp_peeloff(struct thread *td, struct sctp_peeloff_args *uap) * but that is ok. 
*/ - error = falloc(td, &nfp, &fd, 0); + error = falloc_caps(td, &nfp, &fd, 0, &fcaps); if (error != 0) goto done; td->td_retval[0] = fd; diff --git a/sys/netinet/sctp_sysctl.c b/sys/netinet/sctp_sysctl.c index a4be3471e2fd..bd2f23f40727 100644 --- a/sys/netinet/sctp_sysctl.c +++ b/sys/netinet/sctp_sysctl.c @@ -265,6 +265,10 @@ sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *st if (sctp_is_addr_restricted(stcb, sctp_ifa)) { continue; } + } else { + if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + continue; + } } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET @@ -894,7 +898,7 @@ sctp_sysctl_handle_trace_log_clear(SYSCTL_HANDLER_ARGS) return (error); \ } \ SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mib_name, flags, NULL, 0, \ - sctp_sysctl_handle_##mib_name, "UI", prefix##_DESC) + sctp_sysctl_handle_##mib_name, "IU", prefix##_DESC) #define SCTP_UINT_SYSCTL_RDTUN(mib_name, var_name, prefix) \ SYSCTL_UINT(_net_inet_sctp, OID_AUTO, mib_name, \ diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c index 3b0da87edce3..94d57225c20b 100644 --- a/sys/netinet/sctp_usrreq.c +++ b/sys/netinet/sctp_usrreq.c @@ -361,8 +361,9 @@ sctp_getcred(SYSCTL_HANDLER_ARGS) /* FIX, for non-bsd is this right? */ vrf_id = SCTP_DEFAULT_VRFID; + if (req->newptr == NULL) + return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); - if (error) return (error); @@ -843,8 +844,10 @@ sctp_shutdown(struct socket *so, enum shutdown_how how) } sctp_free_a_readq(stcb, control); } else { - stcb->asoc.size_on_all_streams += - control->length; + if (stcb != NULL) { + stcb->asoc.size_on_all_streams += + control->length; + } } } SOCK_UNLOCK(so); @@ -7514,7 +7517,6 @@ sctp_peeraddr(struct socket *so, struct sockaddr *sa) .pr_control = in_control, \ .pr_close = sctp_close, \ .pr_detach = sctp_close, \ - .pr_sopoll = sopoll_generic, \ .pr_disconnect = sctp_disconnect, \ .pr_listen = sctp_listen, \ .pr_peeraddr = sctp_peeraddr, \ diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index b23efd9c8968..ddfa71d5c7ed 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -1944,7 +1944,7 @@ sctp_timeout_handler(void *t) type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timosecret); (void)SCTP_GETTIME_TIMEVAL(&tv); - inp->sctp_ep.time_of_secret_change = (unsigned int)tv.tv_sec; + inp->sctp_ep.time_of_secret_change = tv.tv_sec; inp->sctp_ep.last_secret_number = inp->sctp_ep.current_secret_number; inp->sctp_ep.current_secret_number++; @@ -2289,19 +2289,19 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, } else { to_ticks = net->RTO; } - rndval = sctp_select_initial_TSN(&inp->sctp_ep); - jitter = rndval % to_ticks; - if (to_ticks > 1) { - to_ticks >>= 1; - } - if (jitter < (UINT32_MAX - to_ticks)) { - to_ticks += jitter; - } else { - to_ticks = UINT32_MAX; - } if (!((net->dest_state & SCTP_ADDR_UNCONFIRMED) && (net->dest_state & SCTP_ADDR_REACHABLE)) && ((net->dest_state & SCTP_ADDR_PF) == 0)) { + if (to_ticks > 1) { + rndval = sctp_select_initial_TSN(&inp->sctp_ep); + jitter = rndval % to_ticks; + to_ticks >>= 1; + if (jitter < (UINT32_MAX - to_ticks)) { + to_ticks += jitter; + } else { + to_ticks = UINT32_MAX; + } + } if (net->heart_beat_delay < (UINT32_MAX - to_ticks)) { to_ticks += net->heart_beat_delay; } else { @@ -6634,8 +6634,7 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, case AF_INET: incr = sizeof(struct sockaddr_in); sin = (struct sockaddr_in *)sa; - if ((sin->sin_addr.s_addr == INADDR_ANY) || - 
(sin->sin_addr.s_addr == INADDR_BROADCAST) || + if (in_broadcast(sin->sin_addr) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index bf0cdc2ac4cc..374b5595fcbc 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -273,6 +273,7 @@ static struct mtx siftr_pkt_queue_mtx; static struct mtx siftr_pkt_mgr_mtx; static struct thread *siftr_pkt_manager_thr = NULL; static char direction[2] = {'i','o'}; +static eventhandler_tag siftr_shutdown_tag; /* Required function prototypes. */ static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS); @@ -596,9 +597,6 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, { struct inpcb *inp; - /* We need the tcbinfo lock. */ - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, @@ -1310,6 +1308,7 @@ static int deinit_siftr(void) { /* Cleanup. */ + EVENTHANDLER_DEREGISTER(shutdown_pre_sync, siftr_shutdown_tag); siftr_manage_ops(SIFTR_DISABLE); hashdestroy(counter_hash, M_SIFTR, siftr_hashmask); mtx_destroy(&siftr_pkt_queue_mtx); @@ -1324,8 +1323,8 @@ deinit_siftr(void) static int init_siftr(void) { - EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL, - SHUTDOWN_PRI_FIRST); + siftr_shutdown_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, + siftr_shutdown_handler, NULL, SHUTDOWN_PRI_FIRST); /* Initialise our flow counter hash table. */ counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR, diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index c97a3e04d9b6..41a49b318cd5 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -71,13 +71,13 @@ struct tcphdr { #define TH_RES3 0x200 #define TH_RES2 0x400 #define TH_RES1 0x800 -#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR|TH_AE) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR\11AE" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ -}; +} __packed; static __inline uint16_t __tcp_get_flags(const struct tcphdr *th) @@ -166,8 +166,6 @@ __tcp_set_flags(struct tcphdr *th, uint16_t flags) #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ -#define TCP_MAXBURST 4 /* maximum segments in a burst */ - #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ @@ -182,176 +180,169 @@ __tcp_set_flags(struct tcphdr *th, uint16_t flags) * values and are not masked together. Some values appear to be * bitmasks for historical reasons. 
*/ -#define TCP_NODELAY 1 /* don't delay send to coalesce packets */ +#define TCP_NODELAY 1 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE -#define TCP_MAXSEG 2 /* set maximum segment size */ -#define TCP_NOPUSH 4 /* don't push last block of write */ -#define TCP_NOOPT 8 /* don't use TCP options */ -#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ -#define TCP_INFO 32 /* retrieve tcp_info structure */ -#define TCP_STATS 33 /* retrieve stats blob structure */ -#define TCP_LOG 34 /* configure event logging for connection */ -#define TCP_LOGBUF 35 /* retrieve event log for connection */ -#define TCP_LOGID 36 /* configure log ID to correlate connections */ -#define TCP_LOGDUMP 37 /* dump connection log events to device */ -#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to - device */ -#define TCP_TXTLS_ENABLE 39 /* TLS framing and encryption for transmit */ -#define TCP_TXTLS_MODE 40 /* Transmit TLS mode */ -#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */ -#define TCP_RXTLS_MODE 42 /* Receive TLS mode */ -#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */ -#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */ +#define TCP_MAXSEG 2 /* set maximum segment size */ +#define TCP_NOPUSH 4 /* don't push last block of write */ +#define TCP_NOOPT 8 /* don't use TCP options */ +#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ +#define TCP_INFO 32 /* retrieve tcp_info structure */ +#define TCP_STATS 33 /* retrieve stats blob structure */ +#define TCP_LOG 34 /* configure event logging for connection */ +#define TCP_LOGBUF 35 /* retrieve event log for connection */ +#define TCP_LOGID 36 /* configure log ID to correlate connections */ +#define TCP_LOGDUMP 37 /* dump connection log events to device */ +#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to + device */ +#define TCP_TXTLS_ENABLE 39 /* TLS framing and encryption for transmit */ +#define TCP_TXTLS_MODE 40 /* Transmit TLS mode */ +#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */ +#define TCP_RXTLS_MODE 42 /* Receive TLS mode */ +#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */ +#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */ #ifdef _KERNEL -#define TCP_USE_DDP 45 /* Use direct data placement for so_rcvbuf */ +#define TCP_USE_DDP 45 /* Use direct data placement for so_rcvbuf */ #endif -#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */ -#define TCP_LOG_TAG 47 /* configure tag for grouping logs */ -#define TCP_USER_LOG 48 /* userspace log event */ -#define TCP_CONGESTION 64 /* get/set congestion control algorithm */ -#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ -#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */ -#define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */ -#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */ -#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */ -#define TCP_DELACK 72 /* socket option for delayed ack */ -#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */ -#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */ -#define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */ -#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */ -#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */ -#define 
TCP_PERF_INFO 78 /* retrieve accounting counters */ -#define TCP_KEEPINIT 128 /* N, time to establish connection */ -#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ -#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ -#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ -#define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ -#define TCP_PCAP_OUT 2048 /* number of output packets to keep */ -#define TCP_PCAP_IN 4096 /* number of input packets to keep */ -#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ -#define TCP_FUNCTION_ALIAS 8193 /* Get the current tcp function pointer name alias */ +#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */ +#define TCP_LOG_TAG 47 /* configure tag for grouping logs */ +#define TCP_USER_LOG 48 /* userspace log event */ +#define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ +#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */ +/* unused 69 */ +#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */ +#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */ +#define TCP_DELACK 72 /* socket option for delayed ack */ +#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */ +#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */ +#define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */ +#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */ +#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */ +#define TCP_PERF_INFO 78 /* retrieve accounting counters */ +#define TCP_KEEPINIT 128 /* N, time to establish connection */ +#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ +#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ +#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ +#define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ +/* unused 2048 was TCP_PCAP_OUT */ +/* unused 4096 was TCP_PCAP_IN */ +#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ +#define TCP_FUNCTION_ALIAS 8193 /* Get the current tcp function pointer name alias */ /* Options for Rack and BBR */ -#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ -#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ -#define TCP_RACK_PROP 1051 /* Not used */ -#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ -#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */ -#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */ -#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ -#define TCP_RACK_PROP_RATE 1056 /* Not used */ -#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */ -#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */ -#define TCP_RACK_EARLY_RECOV 1059 /* Not used */ -#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */ -#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */ -#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */ -#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ -#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. 
rack-rtt + reord + N */ -#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ -#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ -#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ -#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ -#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */ -#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ -#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ -#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ -#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ -#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ -#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ -#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ -#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ -#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ -#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */ -#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ -#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ -#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ -#define TCP_BBR_PACE_PER_SEC 1086 -#define TCP_BBR_PACE_DEL_TAR 1087 -#define TCP_BBR_PACE_SEG_MAX 1088 -#define TCP_BBR_PACE_SEG_MIN 1089 -#define TCP_BBR_PACE_CROSS 1090 -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ -#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ -#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase in older rack */ -#define TCP_RACK_TLP_USE 1095 -#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ -#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ -#define TCP_BBR_EXTRA_GAIN 1097 -#define TCP_RACK_DO_DETECTION 1097 /* Recycle of extra gain for rack, attack detection */ -#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ -#define TCP_BBR_RETRAN_WTSO 1099 -#define TCP_DATA_AFTER_CLOSE 1100 -#define TCP_BBR_PROBE_RTT_GAIN 1101 -#define TCP_BBR_PROBE_RTT_LEN 1102 -#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ -#define TCP_BBR_USE_RACK_RR 1104 /* Do we use the rack rapid recovery for pacing rxt's */ -#define TCP_BBR_USE_RACK_CHEAT TCP_BBR_USE_RACK_RR /* Compat. 
*/ -#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ -#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ -#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ -#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ -#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ -#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ -#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ -#define TCP_BBR_RACK_INIT_RATE 1112 /* Set an initial pacing rate for when we have no b/w in kbits per sec */ -#define TCP_RACK_RR_CONF 1113 /* Rack rapid recovery configuration control*/ -#define TCP_RACK_CHEAT_NOT_CONF_RATE TCP_RACK_RR_CONF -#define TCP_RACK_GP_INCREASE_CA 1114 /* GP increase for Congestion Avoidance */ -#define TCP_RACK_GP_INCREASE_SS 1115 /* GP increase for Slow Start */ -#define TCP_RACK_GP_INCREASE_REC 1116 /* GP increase for Recovery */ -#define TCP_RACK_FORCE_MSEG 1117 /* Override to use the user set max-seg value */ -#define TCP_RACK_PACE_RATE_CA 1118 /* Pacing rate for Congestion Avoidance */ -#define TCP_RACK_PACE_RATE_SS 1119 /* Pacing rate for Slow Start */ -#define TCP_RACK_PACE_RATE_REC 1120 /* Pacing rate for Recovery */ -#define TCP_NO_PRR 1122 /* If pacing, don't use prr */ -#define TCP_RACK_NONRXT_CFG_RATE 1123 /* In recovery does a non-rxt use the cfg rate */ -#define TCP_SHARED_CWND_ENABLE 1124 /* Use a shared cwnd if allowed */ -#define TCP_TIMELY_DYN_ADJ 1125 /* Do we attempt dynamic multipler adjustment with timely. */ -#define TCP_RACK_NO_PUSH_AT_MAX 1126 /* For timely do not push if we are over max rtt */ -#define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */ -#define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */ -#define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */ -#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */ -#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */ -#define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */ -#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */ -#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */ -#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */ -#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */ -#define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */ -#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */ -#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */ -#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */ -#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */ -#define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */ -#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */ -#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */ -#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */ -#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */ -#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */ -#define 
TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */ -#define TCP_POLICER_DETECT 1149 /* Do we apply a thresholds to rack to detect and compensate for policers? */ -#define TCP_RXT_CLAMP TCP_POLICER_DETECT -#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */ -#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */ -#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowtart if no b/w growth */ -#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */ -#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */ -#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor hpts min to */ -#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */ -#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */ -#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */ -#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */ -#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */ -#define RACK_CSPR_IS_FCC 1161 -#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */ +#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ +#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ +/* unused 1051 */ +#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ +/* unused 1053 */ +#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */ +#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ +/* unused 1056 */ +#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */ +#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */ +/* unused 1059 */ +#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */ +#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */ +#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */ +#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ +#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */ +/* unused 1065 */ +/* unused 1066 */ +#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ +/* unused 1068 */ +#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ +#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ +/* unused 1071 */ +#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */ +/* unused 1073 */ +#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ +/* unused 1075 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_PACE_OH 1077 /* pacing overhead setting */ +/* unused 1078 */ +#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ +#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ +#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ +/* unused 1082 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ +/* unused 1084 */ +/* unused 1085 */ +#define TCP_BBR_PACE_PER_SEC 1086 +#define TCP_BBR_PACE_DEL_TAR 1087 +#define TCP_BBR_PACE_SEG_MAX 1088 +#define TCP_BBR_PACE_SEG_MIN 1089 +#define TCP_BBR_PACE_CROSS 1090 +/* unused 1091 */ +/* unused 1092 */ +/* unused 1093 */ +/* unused 1094 */ +#define TCP_RACK_TLP_USE 1095 +#define TCP_BBR_TMR_PACE_OH 1096 /* ??? 
*/ +#define TCP_RACK_DO_DETECTION 1097 /* Recycle of extra gain for rack, attack detection */ +#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ +#define TCP_BBR_RETRAN_WTSO 1099 +#define TCP_DATA_AFTER_CLOSE 1100 +#define TCP_BBR_PROBE_RTT_GAIN 1101 +#define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ +#define TCP_BBR_USE_RACK_RR 1104 /* Do we use the rack rapid recovery for pacing rxt's */ +#define TCP_BBR_USE_RACK_CHEAT TCP_BBR_USE_RACK_RR /* Compat. */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ +#define TCP_BBR_RACK_INIT_RATE 1112 /* Set an initial pacing rate for when we have no b/w in kbits per sec */ +#define TCP_RACK_RR_CONF 1113 /* Rack rapid recovery configuration control*/ +#define TCP_RACK_GP_INCREASE_CA 1114 /* GP increase for Congestion Avoidance */ +#define TCP_RACK_GP_INCREASE_SS 1115 /* GP increase for Slow Start */ +#define TCP_RACK_GP_INCREASE_REC 1116 /* GP increase for Recovery */ +#define TCP_RACK_FORCE_MSEG 1117 /* Override to use the user set max-seg value */ +#define TCP_RACK_PACE_RATE_CA 1118 /* Pacing rate for Congestion Avoidance */ +#define TCP_RACK_PACE_RATE_SS 1119 /* Pacing rate for Slow Start */ +#define TCP_RACK_PACE_RATE_REC 1120 /* Pacing rate for Recovery */ +#define TCP_NO_PRR 1122 /* If pacing, don't use prr */ +#define TCP_RACK_NONRXT_CFG_RATE 1123 /* In recovery does a non-rxt use the cfg rate */ +#define TCP_SHARED_CWND_ENABLE 1124 /* Use a shared cwnd if allowed */ +#define TCP_TIMELY_DYN_ADJ 1125 /* Do we attempt dynamic multipler adjustment with timely. 
*/ +#define TCP_RACK_NO_PUSH_AT_MAX 1126 /* For timely do not push if we are over max rtt */ +#define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */ +#define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */ +#define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */ +#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */ +#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */ +#define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */ +#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */ +#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */ +#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */ +#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */ +/* unused 1137 */ +#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */ +#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */ +#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */ +#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */ +#define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */ +#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */ +#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */ +#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */ +#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */ +#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */ +#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */ +/* unused 1149 */ +#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */ +#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */ +#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowtart if no b/w growth */ +#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */ +#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */ +#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor hpts min to */ +#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */ +#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */ +#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */ +/* unused 1159 */ +#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */ +#define RACK_CSPR_IS_FCC 1161 +#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */ /* Start of reserved space for third-party user-settable options. 
*/ diff --git a/sys/netinet/tcp_hostcache.c b/sys/netinet/tcp_hostcache.c index ed90a9ba7196..dbc966acc56b 100644 --- a/sys/netinet/tcp_hostcache.c +++ b/sys/netinet/tcp_hostcache.c @@ -80,7 +80,6 @@ #include <sys/sbuf.h> #include <sys/smr.h> #include <sys/socket.h> -#include <sys/socketvar.h> #include <sys/sysctl.h> #include <net/vnet.h> @@ -100,23 +99,23 @@ struct hc_head { struct hc_metrics { /* housekeeping */ - CK_SLIST_ENTRY(hc_metrics) rmx_q; + CK_SLIST_ENTRY(hc_metrics) hc_q; struct in_addr ip4; /* IP address */ struct in6_addr ip6; /* IP6 address */ uint32_t ip6_zoneid; /* IPv6 scope zone id */ /* endpoint specific values for tcp */ - uint32_t rmx_mtu; /* MTU for this path */ - uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ - uint32_t rmx_rtt; /* estimated round trip time */ - uint32_t rmx_rttvar; /* estimated rtt variance */ - uint32_t rmx_cwnd; /* congestion window */ - uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ - uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ + uint32_t hc_mtu; /* MTU for this path */ + uint32_t hc_ssthresh; /* outbound gateway buffer limit */ + uint32_t hc_rtt; /* estimated round trip time */ + uint32_t hc_rttvar; /* estimated rtt variance */ + uint32_t hc_cwnd; /* congestion window */ + uint32_t hc_sendpipe; /* outbound delay-bandwidth product */ + uint32_t hc_recvpipe; /* inbound delay-bandwidth product */ /* TCP hostcache internal data */ - int rmx_expire; /* lifetime for object */ + int hc_expire; /* lifetime for object */ #ifdef TCP_HC_COUNTERS - u_long rmx_hits; /* number of hits */ - u_long rmx_updates; /* number of updates */ + u_long hc_hits; /* number of hits */ + u_long hc_updates; /* number of updates */ #endif }; @@ -147,7 +146,7 @@ VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache); VNET_DEFINE_STATIC(struct callout, tcp_hc_callout); #define V_tcp_hc_callout VNET(tcp_hc_callout) -static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); +static struct hc_metrics *tcp_hc_lookup(const struct in_conninfo *); static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS); @@ -313,7 +312,7 @@ tcp_hc_destroy(void) * Internal function: compare cache entry to a connection. */ static bool -tcp_hc_cmp(struct hc_metrics *hc_entry, struct in_conninfo *inc) +tcp_hc_cmp(struct hc_metrics *hc_entry, const struct in_conninfo *inc) { if (inc->inc_flags & INC_ISIPV6) { @@ -335,7 +334,7 @@ tcp_hc_cmp(struct hc_metrics *hc_entry, struct in_conninfo *inc) * On success returns in SMR section. */ static struct hc_metrics * -tcp_hc_lookup(struct in_conninfo *inc) +tcp_hc_lookup(const struct in_conninfo *inc) { struct hc_head *hc_head; struct hc_metrics *hc_entry; @@ -348,17 +347,17 @@ tcp_hc_lookup(struct in_conninfo *inc) * Iterate through entries in bucket row looking for a match. */ smr_enter(V_tcp_hostcache.smr); - CK_SLIST_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) + CK_SLIST_FOREACH(hc_entry, &hc_head->hch_bucket, hc_q) if (tcp_hc_cmp(hc_entry, inc)) break; if (hc_entry != NULL) { - if (atomic_load_int(&hc_entry->rmx_expire) != + if (atomic_load_int(&hc_entry->hc_expire) != V_tcp_hostcache.expire) - atomic_store_int(&hc_entry->rmx_expire, + atomic_store_int(&hc_entry->hc_expire, V_tcp_hostcache.expire); #ifdef TCP_HC_COUNTERS - hc_entry->rmx_hits++; + hc_entry->hc_hits++; #endif } else smr_exit(V_tcp_hostcache.smr); @@ -372,7 +371,8 @@ tcp_hc_lookup(struct in_conninfo *inc) * a value is not set. 
*/ void -tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) +tcp_hc_get(const struct in_conninfo *inc, + struct hc_metrics_lite *hc_metrics_lite) { struct hc_metrics *hc_entry; @@ -394,13 +394,13 @@ tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) return; } - hc_metrics_lite->rmx_mtu = atomic_load_32(&hc_entry->rmx_mtu); - hc_metrics_lite->rmx_ssthresh = atomic_load_32(&hc_entry->rmx_ssthresh); - hc_metrics_lite->rmx_rtt = atomic_load_32(&hc_entry->rmx_rtt); - hc_metrics_lite->rmx_rttvar = atomic_load_32(&hc_entry->rmx_rttvar); - hc_metrics_lite->rmx_cwnd = atomic_load_32(&hc_entry->rmx_cwnd); - hc_metrics_lite->rmx_sendpipe = atomic_load_32(&hc_entry->rmx_sendpipe); - hc_metrics_lite->rmx_recvpipe = atomic_load_32(&hc_entry->rmx_recvpipe); + hc_metrics_lite->hc_mtu = atomic_load_32(&hc_entry->hc_mtu); + hc_metrics_lite->hc_ssthresh = atomic_load_32(&hc_entry->hc_ssthresh); + hc_metrics_lite->hc_rtt = atomic_load_32(&hc_entry->hc_rtt); + hc_metrics_lite->hc_rttvar = atomic_load_32(&hc_entry->hc_rttvar); + hc_metrics_lite->hc_cwnd = atomic_load_32(&hc_entry->hc_cwnd); + hc_metrics_lite->hc_sendpipe = atomic_load_32(&hc_entry->hc_sendpipe); + hc_metrics_lite->hc_recvpipe = atomic_load_32(&hc_entry->hc_recvpipe); smr_exit(V_tcp_hostcache.smr); } @@ -411,7 +411,7 @@ tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) * set. */ uint32_t -tcp_hc_getmtu(struct in_conninfo *inc) +tcp_hc_getmtu(const struct in_conninfo *inc) { struct hc_metrics *hc_entry; uint32_t mtu; @@ -424,7 +424,7 @@ tcp_hc_getmtu(struct in_conninfo *inc) return (0); } - mtu = atomic_load_32(&hc_entry->rmx_mtu); + mtu = atomic_load_32(&hc_entry->hc_mtu); smr_exit(V_tcp_hostcache.smr); return (mtu); @@ -435,9 +435,9 @@ tcp_hc_getmtu(struct in_conninfo *inc) * Creates a new entry if none was found. */ void -tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu) +tcp_hc_updatemtu(const struct in_conninfo *inc, uint32_t mtu) { - struct hc_metrics_lite hcml = { .rmx_mtu = mtu }; + struct hc_metrics_lite hcml = { .hc_mtu = mtu }; return (tcp_hc_update(inc, &hcml)); } @@ -447,7 +447,7 @@ tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu) * Creates a new entry if none was found. 
*/ void -tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) +tcp_hc_update(const struct in_conninfo *inc, struct hc_metrics_lite *hcml) { struct hc_head *hc_head; struct hc_metrics *hc_entry, *hc_prev; @@ -461,20 +461,20 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) hc_prev = NULL; THC_LOCK(hc_head); - CK_SLIST_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { + CK_SLIST_FOREACH(hc_entry, &hc_head->hch_bucket, hc_q) { if (tcp_hc_cmp(hc_entry, inc)) break; - if (CK_SLIST_NEXT(hc_entry, rmx_q) != NULL) + if (CK_SLIST_NEXT(hc_entry, hc_q) != NULL) hc_prev = hc_entry; } if (hc_entry != NULL) { - if (atomic_load_int(&hc_entry->rmx_expire) != + if (atomic_load_int(&hc_entry->hc_expire) != V_tcp_hostcache.expire) - atomic_store_int(&hc_entry->rmx_expire, + atomic_store_int(&hc_entry->hc_expire, V_tcp_hostcache.expire); #ifdef TCP_HC_COUNTERS - hc_entry->rmx_updates++; + hc_entry->hc_updates++; #endif new = false; } else { @@ -492,18 +492,18 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) atomic_load_int(&V_tcp_hostcache.cache_count) >= V_tcp_hostcache.cache_limit) { if (hc_prev != NULL) { - hc_entry = CK_SLIST_NEXT(hc_prev, rmx_q); - KASSERT(CK_SLIST_NEXT(hc_entry, rmx_q) == NULL, + hc_entry = CK_SLIST_NEXT(hc_prev, hc_q); + KASSERT(CK_SLIST_NEXT(hc_entry, hc_q) == NULL, ("%s: %p is not one to last", __func__, hc_prev)); - CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q); + CK_SLIST_REMOVE_AFTER(hc_prev, hc_q); } else if ((hc_entry = CK_SLIST_FIRST(&hc_head->hch_bucket)) != NULL) { - KASSERT(CK_SLIST_NEXT(hc_entry, rmx_q) == NULL, + KASSERT(CK_SLIST_NEXT(hc_entry, hc_q) == NULL, ("%s: %p is not the only element", __func__, hc_entry)); CK_SLIST_REMOVE_HEAD(&hc_head->hch_bucket, - rmx_q); + hc_q); } else { THC_UNLOCK(hc_head); return; @@ -536,7 +536,7 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) hc_entry->ip6_zoneid = inc->inc6_zoneid; } else hc_entry->ip4 = inc->inc_faddr; - hc_entry->rmx_expire = V_tcp_hostcache.expire; + hc_entry->hc_expire = V_tcp_hostcache.expire; new = true; } @@ -544,60 +544,60 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) * Fill in data. Use atomics, since an existing entry is * accessible by readers in SMR section. 
*/ - if (hcml->rmx_mtu != 0) { - atomic_store_32(&hc_entry->rmx_mtu, hcml->rmx_mtu); + if (hcml->hc_mtu != 0) { + atomic_store_32(&hc_entry->hc_mtu, hcml->hc_mtu); } - if (hcml->rmx_rtt != 0) { - if (hc_entry->rmx_rtt == 0) - v = hcml->rmx_rtt; + if (hcml->hc_rtt != 0) { + if (hc_entry->hc_rtt == 0) + v = hcml->hc_rtt; else - v = ((uint64_t)hc_entry->rmx_rtt + - (uint64_t)hcml->rmx_rtt) / 2; - atomic_store_32(&hc_entry->rmx_rtt, v); + v = ((uint64_t)hc_entry->hc_rtt + + (uint64_t)hcml->hc_rtt) / 2; + atomic_store_32(&hc_entry->hc_rtt, v); TCPSTAT_INC(tcps_cachedrtt); } - if (hcml->rmx_rttvar != 0) { - if (hc_entry->rmx_rttvar == 0) - v = hcml->rmx_rttvar; + if (hcml->hc_rttvar != 0) { + if (hc_entry->hc_rttvar == 0) + v = hcml->hc_rttvar; else - v = ((uint64_t)hc_entry->rmx_rttvar + - (uint64_t)hcml->rmx_rttvar) / 2; - atomic_store_32(&hc_entry->rmx_rttvar, v); + v = ((uint64_t)hc_entry->hc_rttvar + + (uint64_t)hcml->hc_rttvar) / 2; + atomic_store_32(&hc_entry->hc_rttvar, v); TCPSTAT_INC(tcps_cachedrttvar); } - if (hcml->rmx_ssthresh != 0) { - if (hc_entry->rmx_ssthresh == 0) - v = hcml->rmx_ssthresh; + if (hcml->hc_ssthresh != 0) { + if (hc_entry->hc_ssthresh == 0) + v = hcml->hc_ssthresh; else - v = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; - atomic_store_32(&hc_entry->rmx_ssthresh, v); + v = (hc_entry->hc_ssthresh + hcml->hc_ssthresh) / 2; + atomic_store_32(&hc_entry->hc_ssthresh, v); TCPSTAT_INC(tcps_cachedssthresh); } - if (hcml->rmx_cwnd != 0) { - if (hc_entry->rmx_cwnd == 0) - v = hcml->rmx_cwnd; + if (hcml->hc_cwnd != 0) { + if (hc_entry->hc_cwnd == 0) + v = hcml->hc_cwnd; else - v = ((uint64_t)hc_entry->rmx_cwnd + - (uint64_t)hcml->rmx_cwnd) / 2; - atomic_store_32(&hc_entry->rmx_cwnd, v); + v = ((uint64_t)hc_entry->hc_cwnd + + (uint64_t)hcml->hc_cwnd) / 2; + atomic_store_32(&hc_entry->hc_cwnd, v); /* TCPSTAT_INC(tcps_cachedcwnd); */ } - if (hcml->rmx_sendpipe != 0) { - if (hc_entry->rmx_sendpipe == 0) - v = hcml->rmx_sendpipe; + if (hcml->hc_sendpipe != 0) { + if (hc_entry->hc_sendpipe == 0) + v = hcml->hc_sendpipe; else - v = ((uint64_t)hc_entry->rmx_sendpipe + - (uint64_t)hcml->rmx_sendpipe) /2; - atomic_store_32(&hc_entry->rmx_sendpipe, v); + v = ((uint64_t)hc_entry->hc_sendpipe + + (uint64_t)hcml->hc_sendpipe) /2; + atomic_store_32(&hc_entry->hc_sendpipe, v); /* TCPSTAT_INC(tcps_cachedsendpipe); */ } - if (hcml->rmx_recvpipe != 0) { - if (hc_entry->rmx_recvpipe == 0) - v = hcml->rmx_recvpipe; + if (hcml->hc_recvpipe != 0) { + if (hc_entry->hc_recvpipe == 0) + v = hcml->hc_recvpipe; else - v = ((uint64_t)hc_entry->rmx_recvpipe + - (uint64_t)hcml->rmx_recvpipe) /2; - atomic_store_32(&hc_entry->rmx_recvpipe, v); + v = ((uint64_t)hc_entry->hc_recvpipe + + (uint64_t)hcml->hc_recvpipe) /2; + atomic_store_32(&hc_entry->hc_recvpipe, v); /* TCPSTAT_INC(tcps_cachedrecvpipe); */ } @@ -605,17 +605,17 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) * Put it upfront. 
*/ if (new) { - CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); + CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, hc_q); hc_head->hch_length++; KASSERT(hc_head->hch_length <= V_tcp_hostcache.bucket_limit, ("tcp_hostcache: bucket length too high at %p", hc_head)); atomic_add_int(&V_tcp_hostcache.cache_count, 1); TCPSTAT_INC(tcps_hc_added); } else if (hc_entry != CK_SLIST_FIRST(&hc_head->hch_bucket)) { - KASSERT(CK_SLIST_NEXT(hc_prev, rmx_q) == hc_entry, + KASSERT(CK_SLIST_NEXT(hc_prev, hc_q) == hc_entry, ("%s: %p next is not %p", __func__, hc_prev, hc_entry)); - CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q); - CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); + CK_SLIST_REMOVE_AFTER(hc_prev, hc_q); + CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, hc_q); } THC_UNLOCK(hc_head); } @@ -668,7 +668,7 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i]); CK_SLIST_FOREACH(hc_entry, - &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { + &V_tcp_hostcache.hashbase[i].hch_bucket, hc_q) { sbuf_printf(&sb, "%-15s %5u %8u %6lums %6lums %8u %8u %8u " #ifdef TCP_HC_COUNTERS @@ -682,20 +682,20 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) #else "IPv6?", #endif - hc_entry->rmx_mtu, - hc_entry->rmx_ssthresh, - msec((u_long)hc_entry->rmx_rtt * + hc_entry->hc_mtu, + hc_entry->hc_ssthresh, + msec((u_long)hc_entry->hc_rtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), - msec((u_long)hc_entry->rmx_rttvar * + msec((u_long)hc_entry->hc_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))), - hc_entry->rmx_cwnd, - hc_entry->rmx_sendpipe, - hc_entry->rmx_recvpipe, + hc_entry->hc_cwnd, + hc_entry->hc_sendpipe, + hc_entry->hc_recvpipe, #ifdef TCP_HC_COUNTERS - hc_entry->rmx_hits, - hc_entry->rmx_updates, + hc_entry->hc_hits, + hc_entry->hc_updates, #endif - hc_entry->rmx_expire); + hc_entry->hc_expire); } THC_UNLOCK(&V_tcp_hostcache.hashbase[i]); sbuf_drain(&sb); @@ -762,33 +762,33 @@ tcp_hc_purge_internal(int all) head = &V_tcp_hostcache.hashbase[i]; hc_prev = NULL; THC_LOCK(head); - CK_SLIST_FOREACH_SAFE(hc_entry, &head->hch_bucket, rmx_q, + CK_SLIST_FOREACH_SAFE(hc_entry, &head->hch_bucket, hc_q, hc_next) { KASSERT(head->hch_length > 0 && head->hch_length <= V_tcp_hostcache.bucket_limit, ("tcp_hostcache: " "bucket length out of range at %u: %u", i, head->hch_length)); if (all || - atomic_load_int(&hc_entry->rmx_expire) <= 0) { + atomic_load_int(&hc_entry->hc_expire) <= 0) { if (hc_prev != NULL) { KASSERT(hc_entry == - CK_SLIST_NEXT(hc_prev, rmx_q), + CK_SLIST_NEXT(hc_prev, hc_q), ("%s: %p is not next to %p", __func__, hc_entry, hc_prev)); - CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q); + CK_SLIST_REMOVE_AFTER(hc_prev, hc_q); } else { KASSERT(hc_entry == CK_SLIST_FIRST(&head->hch_bucket), ("%s: %p is not first", __func__, hc_entry)); CK_SLIST_REMOVE_HEAD(&head->hch_bucket, - rmx_q); + hc_q); } uma_zfree_smr(V_tcp_hostcache.zone, hc_entry); head->hch_length--; atomic_subtract_int(&V_tcp_hostcache.cache_count, 1); } else { - atomic_subtract_int(&hc_entry->rmx_expire, + atomic_subtract_int(&hc_entry->hc_expire, V_tcp_hostcache.prune); hc_prev = hc_entry; } diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 85341cab0750..b60cdf45af52 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -173,6 +173,7 @@ /* Each hpts has its own p_mtx which is used for locking */ #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) #define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) +#define HPTS_TRYLOCK(hpts) 
mtx_trylock(&(hpts)->p_mtx) #define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) struct tcp_hpts_entry { /* Cache line 0x00 */ @@ -239,7 +240,7 @@ static int tcp_bind_threads = 2; static int tcp_use_irq_cpu = 0; static int hpts_does_tp_logging = 0; -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout); +static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); static void tcp_hpts_thread(void *ctx); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; @@ -430,40 +431,42 @@ hpts_random_cpu(void) static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, - int slots_to_run, int idx, int from_callout) + int slots_to_run, int idx, bool from_callout) { - union tcp_log_stackspecific log; - /* - * Unused logs are - * 64 bit - delRate, rttProp, bw_inuse - * 16 bit - cwnd_gain - * 8 bit - bbr_state, bbr_substate, inhpts; - */ - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.flex1 = hpts->p_nxt_slot; - log.u_bbr.flex2 = hpts->p_cur_slot; - log.u_bbr.flex3 = hpts->p_prev_slot; - log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; - log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.flex7 = hpts->p_cpu; - log.u_bbr.flex8 = (uint8_t)from_callout; - log.u_bbr.inflight = slots_to_run; - log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; - log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); - log.u_bbr.epoch = hpts->saved_curslot; - log.u_bbr.lt_epoch = hpts->saved_prev_slot; - log.u_bbr.pkts_out = hpts->p_delayed_by; - log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.pacing_gain = hpts->p_cpu; - log.u_bbr.pkt_epoch = hpts->p_runningslot; - log.u_bbr.use_lt_bw = 1; - TCP_LOG_EVENTP(tp, NULL, - &tptosocket(tp)->so_rcv, - &tptosocket(tp)->so_snd, - BBR_LOG_HPTSDIAG, 0, - 0, &log, false, tv); + if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + * 8 bit - bbr_state, bbr_substate, inhpts; + */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); + } } static void @@ -1075,7 +1078,7 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) } static int32_t -tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) +tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) { struct tcpcb *tp; struct timeval tv; @@ -1086,7 +1089,10 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; int32_t orig_exit_slot; - int8_t completed_measure = 0, seen_endpoint = 0; + bool completed_measure, seen_endpoint; + + completed_measure = false; + seen_endpoint = 
false; HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); @@ -1251,11 +1257,11 @@ again: } /* For debugging */ - if (seen_endpoint == 0) { - seen_endpoint = 1; + if (!seen_endpoint) { + seen_endpoint = true; orig_exit_slot = slot_pos_of_endpoint = runningslot; - } else if (completed_measure == 0) { + } else if (!completed_measure) { /* Record the new position */ orig_exit_slot = runningslot; } @@ -1349,9 +1355,7 @@ again: } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ - if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { - tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); - } + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); @@ -1369,24 +1373,20 @@ again: * cause a call to output if it is needed so we do * not need a second call to tcp_output(). So we do * one or the other but not both. + * + * XXXGL: some KPI abuse here. tfb_do_queued_segments + * returns unlocked with positive error (always 1) and + * tcp_output returns unlocked with negative error. */ tp->t_flags2 |= TF2_HPTS_CALLS; if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) && - !STAILQ_EMPTY(&tp->t_inqueue)) { - error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0); - /* - * A non-zero return for input queue processing - * is the lock is released and most likely the - * inp is gone. - */ - if (error) - goto skip_pacing; - } else + !STAILQ_EMPTY(&tp->t_inqueue)) + error = -(*tp->t_fb->tfb_do_queued_segments)(tp, + 0); + else error = tcp_output(tp); - if (error < 0) - goto skip_pacing; - INP_WUNLOCK(inp); - skip_pacing: + if (__predict_true(error >= 0)) + INP_WUNLOCK(inp); CURVNET_RESTORE(); } if (seen_endpoint) { @@ -1397,7 +1397,7 @@ again: * is where we calculated the end of our cycle to * be when we first entered. */ - completed_measure = 1; + completed_measure = true; } HPTS_LOCK(hpts); hpts->p_runningslot++; @@ -1414,7 +1414,7 @@ no_one: */ hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; - if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) { + if (!from_callout || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have * looped through processing the wheel @@ -1435,7 +1435,7 @@ no_one: } hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); - if (seen_endpoint == 0) { + if (!seen_endpoint) { /* We saw no endpoint but we may be looping */ orig_exit_slot = hpts->p_cur_slot; } @@ -1462,11 +1462,11 @@ no_run: * multiple times so the slots may not align either. 
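[Editor's sketch] The XXXGL note above records the convention the loop now relies on: tfb_do_queued_segments() returns with the inpcb unlocked and a positive error (always 1), while tcp_output() signals the same situation with a negative error, so negating the former lets one error >= 0 test decide whether INP_WUNLOCK() is still the caller's job. A small stand-alone sketch of that pattern; queued_segments(), output() and unlock() are stand-ins, not the kernel functions:

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative stand-ins for the two callees in the hunk above:
 * queued_segments() reports "I already dropped the lock" with a
 * positive value, output() reports it with a negative value.
 */
static int queued_segments(bool drop_lock) { return (drop_lock ? 1 : 0); }
static int output(bool drop_lock) { return (drop_lock ? -1 : 0); }

static bool locked;
static void unlock(void) { locked = false; }

/*
 * Negating the first convention folds both onto one sign, so a single
 * "error >= 0" test decides whether the caller still owns the lock,
 * which is the pattern the hunk switches to.
 */
static void
run_once(bool use_queue, bool callee_drops_lock)
{
	int error;

	locked = true;
	if (use_queue)
		error = -queued_segments(callee_drops_lock);
	else
		error = output(callee_drops_lock);
	if (error >= 0)
		unlock();		/* callee left the lock to us */
	printf("error=%d still-locked=%d\n", error, locked);
}

int
main(void)
{
	run_once(true, false);		/* queue path, lock retained */
	run_once(true, true);		/* queue path, callee dropped lock */
	run_once(false, true);		/* output path, callee dropped lock */
	return (0);
}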
*/ KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || - (wrap_loop_cnt >= 2) || (from_callout == 0)), + (wrap_loop_cnt >= 2) || !from_callout), ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, hpts->p_prev_slot, hpts->p_cur_slot)); KASSERT(((hpts->p_lasttick == hpts->p_curtick) - || (wrap_loop_cnt >= 2) || (from_callout == 0)), + || (wrap_loop_cnt >= 2) || !from_callout), ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, hpts->p_lasttick, hpts->p_curtick)); if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { @@ -1476,7 +1476,7 @@ no_run: goto again; } - if (from_callout){ + if (from_callout) { tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); } if (seen_endpoint) @@ -1486,7 +1486,7 @@ no_run: } void -__tcp_set_hpts(struct tcpcb *tp, int32_t line) +tcp_set_hpts(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; @@ -1499,7 +1499,7 @@ __tcp_set_hpts(struct tcpcb *tp, int32_t line) if (failed == 0) tp->t_flags2 |= TF2_HPTS_CPU_SET; } - mtx_unlock(&hpts->p_mtx); + HPTS_UNLOCK(hpts); } static struct tcp_hpts_entry * @@ -1556,7 +1556,7 @@ __tcp_run_hpts(void) /* Already active */ return; } - if (mtx_trylock(&hpts->p_mtx) == 0) { + if (!HPTS_TRYLOCK(hpts)) { /* Someone else got the lock */ return; } @@ -1566,7 +1566,7 @@ __tcp_run_hpts(void) hpts->syscall_cnt++; counter_u64_add(hpts_direct_call, 1); hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, 0); + ticks_ran = tcp_hptsi(hpts, false); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { if (ticks_ran > ticks_indicate_less_sleep) { @@ -1611,8 +1611,7 @@ __tcp_run_hpts(void) } hpts->p_hpts_active = 0; out_with_mtx: - HPTS_MTX_ASSERT(hpts); - mtx_unlock(&hpts->p_mtx); + HPTS_UNLOCK(hpts); NET_EPOCH_EXIT(et); } @@ -1626,7 +1625,7 @@ tcp_hpts_thread(void *ctx) int ticks_ran; hpts = (struct tcp_hpts_entry *)ctx; - mtx_lock(&hpts->p_mtx); + HPTS_LOCK(hpts); if (hpts->p_direct_wake) { /* Signaled by input or output with low occupancy count. */ callout_stop(&hpts->co); @@ -1636,7 +1635,7 @@ tcp_hpts_thread(void *ctx) counter_u64_add(hpts_wake_timeout, 1); if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { - mtx_unlock(&hpts->p_mtx); + HPTS_UNLOCK(hpts); return; } } @@ -1682,7 +1681,7 @@ tcp_hpts_thread(void *ctx) } hpts->sleeping = 0; hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, 1); + ticks_ran = tcp_hptsi(hpts, true); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1765,7 +1764,7 @@ back_to_sleep: hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); NET_EPOCH_EXIT(et); - mtx_unlock(&hpts->p_mtx); + HPTS_UNLOCK(hpts); } #undef timersub diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index b097a2b98db9..f5856ed8e688 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, #define tcp_hpts_insert(inp, slot) \ tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) -void __tcp_set_hpts(struct tcpcb *tp, int32_t line); -#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) +void tcp_set_hpts(struct tcpcb *tp); void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); @@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time; * The following functions should also be available * to userspace as well. 
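[Editor's sketch] As the comment notes, these conversion helpers are meant to be usable from userspace too. Here is a stand-alone restatement of the two simplest ones from the tcp_hpts.h hunk: struct timeval to pacing slots (tcp_tv_to_hptstick() multiplies seconds by 100000, which is where the assumed 10-microsecond slot width below comes from) and to whole microseconds (tcp_tv_to_usectick()). The local function and macro names are restatements, not the kernel header itself:

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

/* Assumed from the header: 100000 slots per second, i.e. 10 us each. */
#define HPTS_TICKS_PER_SLOT	10
#define HPTS_USEC_IN_SEC	1000000

/* struct timeval -> pacing slots, mirroring tcp_tv_to_hptstick(). */
static uint32_t
tv_to_hptstick(const struct timeval *sv)
{
	return ((uint32_t)(sv->tv_sec * 100000 +
	    sv->tv_usec / HPTS_TICKS_PER_SLOT));
}

/* struct timeval -> whole microseconds, mirroring tcp_tv_to_usectick(). */
static uint32_t
tv_to_usectick(const struct timeval *sv)
{
	return ((uint32_t)(sv->tv_sec * HPTS_USEC_IN_SEC + sv->tv_usec));
}

int
main(void)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 345678 };

	printf("slots=%u usecs=%u\n", tv_to_hptstick(&tv),
	    tv_to_usectick(&tv));
	return (0);
}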
*/ -static __inline uint32_t +static inline uint32_t tcp_tv_to_hptstick(const struct timeval *sv) { return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_usectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_mssectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } -static __inline uint64_t +static inline uint64_t tcp_tv_to_lusectick(const struct timeval *sv) { return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT); } -static __inline uint32_t +static inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; @@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv) return (tcp_tv_to_hptstick(sv)); } -static __inline uint64_t +static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { struct timeval tvd; @@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv) return (tcp_tv_to_lusectick(tv)); } -static __inline uint32_t +static inline uint32_t tcp_get_usecs(struct timeval *tv) { struct timeval tvd; diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 3fda6e903738..de428ae1af6f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -112,9 +112,6 @@ #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_fastopen.h> -#ifdef TCPPCAP -#include <netinet/tcp_pcap.h> -#endif #include <netinet/tcp_syncache.h> #ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> @@ -135,6 +132,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); +VNET_DEFINE(int, tcp_bind_all_fibs) = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, + &VNET_NAME(tcp_bind_all_fibs), 0, + "Bound sockets receive traffic from all FIBs"); + VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, @@ -202,6 +204,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); +VNET_DEFINE(int, tcp_insecure_ack) = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_ack, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_insecure_ack), 0, + "Follow RFC793 criteria for validating SEG.ACK"); + VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, @@ -363,11 +370,11 @@ cc_conn_init(struct tcpcb *tp) tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); - if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + if (tp->t_srtt == 0 && (rtt = metrics.hc_rtt)) { tp->t_srtt = rtt; TCPSTAT_INC(tcps_usedrtt); - if (metrics.rmx_rttvar) { - tp->t_rttvar = metrics.rmx_rttvar; + if (metrics.hc_rttvar) { + tp->t_rttvar = metrics.hc_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ @@ -376,16 +383,16 @@ cc_conn_init(struct tcpcb *tp) } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, tcp_rexmit_max); } - if (metrics.rmx_ssthresh) { + if (metrics.hc_ssthresh) { /* * 
There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ - tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); + tp->snd_ssthresh = max(2 * maxseg, metrics.hc_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } @@ -439,10 +446,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; - if ((tp->t_rxtshift > 1) || - !((tp->t_flags & TF_SACK_PERMIT) && - (!TAILQ_EMPTY(&tp->snd_holes)))) - EXIT_RECOVERY(tp->t_flags); + EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; @@ -458,6 +462,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; + tp->t_rxtshift = 0; tp->t_badrxtwin = 0; break; } @@ -562,8 +567,6 @@ int tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m; - struct in6_ifaddr *ia6; - struct ip6_hdr *ip6; m = *mp; if (m->m_len < *offp + sizeof(struct tcphdr)) { @@ -575,19 +578,6 @@ tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) } } - /* - * draft-itojun-ipv6-tcp-to-anycast - * better place to put this in? - */ - ip6 = mtod(m, struct ip6_hdr *); - ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); - if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { - icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, - (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - *mp = NULL; - return (IPPROTO_DONE); - } - *mp = m; return (tcp_input_with_port(mp, offp, proto, port)); } @@ -631,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ + bool closed_port = false; /* segment is hitting a closed port */ NET_EPOCH_ASSERT(); @@ -831,8 +822,10 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) */ lookupflag = INPLOOKUP_WILDCARD | ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ? - INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB); + INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) | + (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB); findpcb: + tp = NULL; #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; @@ -915,24 +908,8 @@ findpcb: log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } - /* - * When blackholing do not respond with a RST but - * completely ignore the segment and drop it. - */ - if (((V_blackhole == 1 && (thflags & TH_SYN)) || - V_blackhole == 2) && (V_blackhole_local || ( -#ifdef INET6 - isipv6 ? !in6_localaddr(&ip6->ip6_src) : -#endif -#ifdef INET - !in_localip(ip->ip_src) -#else - true -#endif - ))) - goto dropunlock; - - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } INP_LOCK_ASSERT(inp); @@ -1023,12 +1000,14 @@ findpcb: * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } @@ -1080,6 +1059,8 @@ findpcb: * socket appended to the listen queue in SYN_RECEIVED state. 
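[Editor's sketch] The cc_conn_init() hunk above seeds t_srtt and t_rttvar from the host cache and recomputes the retransmission timer, now bounded by tcp_rexmit_max instead of TCPTV_REXMTMAX. Ignoring the kernel's fixed-point scaling (TCP_RTT_SCALE / TCP_RTTVAR_SCALE), the underlying idea is the RFC 6298 rule RTO = SRTT + 4 * RTTVAR clamped between a floor and a ceiling. A sketch with millisecond values; RTO_MIN_MS and RTO_MAX_MS are illustrative bounds, not the kernel defaults:

#include <stdint.h>
#include <stdio.h>

#define RTO_MIN_MS	1000	/* illustrative floor (t_rttmin's role) */
#define RTO_MAX_MS	64000	/* illustrative ceiling (tcp_rexmit_max's role) */

/* Clamp helper playing the role of TCPT_RANGESET(). */
static uint32_t
rangeset(uint32_t value, uint32_t lo, uint32_t hi)
{
	if (value < lo)
		return (lo);
	if (value > hi)
		return (hi);
	return (value);
}

/*
 * RFC 6298-style RTO from a smoothed RTT and an RTT variance estimate,
 * as seeded here from the cached hc_rtt / hc_rttvar values.
 */
static uint32_t
rto_from_cached(uint32_t srtt_ms, uint32_t rttvar_ms)
{
	return (rangeset(srtt_ms + 4 * rttvar_ms, RTO_MIN_MS, RTO_MAX_MS));
}

int
main(void)
{
	/* 40 ms RTT with 10 ms variance: 80 ms, raised to the 1000 ms floor. */
	printf("rto = %u ms\n", rto_from_cached(40, 10));
	/* 200 ms RTT with 300 ms variance: 200 + 1200 = 1400 ms. */
	printf("rto = %u ms\n", rto_from_cached(200, 300));
	return (0);
}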
*/ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + int result; + /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1089,8 +1070,8 @@ findpcb: /* * NB: syncache_expand() doesn't unlock inp. */ - rstreason = syncache_expand(&inc, &to, th, &so, m, port); - if (rstreason < 0) { + result = syncache_expand(&inc, &to, th, &so, m, port); + if (result < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped @@ -1098,7 +1079,7 @@ findpcb: * to the sender. */ goto dropunlock; - } else if (rstreason == 0) { + } else if (result == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double @@ -1117,7 +1098,7 @@ findpcb: * of the failure cause. */ INP_WUNLOCK(inp); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } @@ -1208,7 +1189,7 @@ tfo_socket_result: s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } /* @@ -1284,7 +1265,7 @@ tfo_socket_result: "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } } @@ -1298,7 +1279,7 @@ tfo_socket_result: * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use - * in_broadcast() to find them. + * in_ifnet_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) @@ -1343,7 +1324,7 @@ tfo_socket_result: if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " @@ -1368,15 +1349,6 @@ tfo_socket_result: * Only the listen socket is unlocked by syncache_add(). */ return (IPPROTO_DONE); - } else if (tp->t_state == TCPS_LISTEN) { - /* - * When a listen socket is torn down the SO_ACCEPTCONN - * flag is removed first while connections are drained - * from the accept queue in a unlock/lock cycle of the - * ACCEPT_LOCK, opening a race condition allowing a SYN - * attempt go through unhandled. - */ - goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { @@ -1410,15 +1382,28 @@ tfo_socket_result: return (IPPROTO_DONE); dropwithreset: + /* + * When blackholing do not respond with a RST but + * completely ignore the segment and drop it. + */ + if (rstreason == BANDLIM_TCP_RST && + ((!closed_port && V_blackhole == 3) || + (closed_port && + ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && + (V_blackhole_local || ( +#ifdef INET6 + isipv6 ? !in6_localip(&ip6->ip6_src) : +#endif +#ifdef INET + !in_localip(ip->ip_src) +#else + true +#endif + ))) + goto dropunlock; TCP_PROBE5(receive, NULL, tp, m, tp, th); - - if (inp != NULL) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); - INP_UNLOCK(inp); - } else - tcp_dropwithreset(m, th, NULL, tlen, rstreason); + tcp_dropwithreset(m, th, tp, tlen, rstreason); m = NULL; /* mbuf chain got consumed. 
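[Editor's sketch] The consolidated dropwithreset path above gates the RST behind the net.inet.tcp.blackhole level, whether the segment hit a closed port, and whether it was a SYN. A stand-alone restatement of just that predicate; the rstreason and local-address conditions from the hunk are omitted for brevity, and blackhole_drop() is a made-up name:

#include <stdbool.h>
#include <stdio.h>

#define TH_SYN	0x02

/*
 * Suppress the RST (drop silently) when
 *   - blackhole == 3 and the segment hit an open port, or
 *   - the port was closed and either blackhole > 1, or blackhole == 1
 *     and the offending segment was a SYN,
 * mirroring the condition in the dropwithreset hunk above.
 */
static bool
blackhole_drop(int blackhole, bool closed_port, int thflags)
{
	return ((!closed_port && blackhole == 3) ||
	    (closed_port &&
	    ((blackhole == 1 && (thflags & TH_SYN)) || blackhole > 1)));
}

int
main(void)
{
	printf("%d\n", blackhole_drop(1, true, TH_SYN));	/* 1: drop */
	printf("%d\n", blackhole_drop(1, true, 0));		/* 0: send RST */
	printf("%d\n", blackhole_drop(2, true, 0));		/* 1: drop */
	printf("%d\n", blackhole_drop(3, false, 0));		/* 1: drop */
	return (0);
}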
*/ - goto drop; dropunlock: if (m != NULL) @@ -1460,7 +1445,7 @@ drop: * is at least 3/8 of the current socket buffer size. * 3. receive buffer size has not hit maximal automatic size; * - * If all of the criteria are met we increaset the socket buffer + * If all of the criteria are met, we increase the socket buffer * by a 1/2 (bounded by the max). This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. @@ -1514,7 +1499,7 @@ tcp_handle_wakeup(struct tcpcb *tp) struct socket *so = tptosocket(tp); tp->t_flags &= ~TF_WAKESOR; - SOCKBUF_LOCK_ASSERT(&so->so_rcv); + SOCK_RECVBUF_LOCK_ASSERT(so); sorwakeup_locked(so); } } @@ -1537,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct tcpopt to; int tfo_syn; u_int maxseg = 0; + bool no_data; + no_data = (tlen == 0); thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; @@ -1550,10 +1537,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); -#ifdef TCPPCAP - /* Save segment, if requested. */ - tcp_pcap_add(th, m, &(tp->t_inpkts)); -#endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); @@ -1615,7 +1598,14 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); - + if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { + /* + * We don't look at sack's from the + * peer because the MSS is too small which + * can subject us to an attack. + */ + to.to_flags &= ~TOF_SACK; + } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { @@ -1633,11 +1623,6 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) { to.to_tsecr = 0; - } else if (tp->t_rxtshift == 1 && - tp->t_flags & TF_PREVVALID && - tp->t_badrxtwin != 0 && - TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) { - cc_cong_signal(tp, th, CC_RTO_ERR); } } /* @@ -1778,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tp->ts_recent = to.to_tsval; } - if (tlen == 0) { + if (no_data) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && @@ -1790,15 +1775,17 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, TCPSTAT_INC(tcps_predack); /* - * "bad retransmit" recovery without timestamps. + * "bad retransmit" recovery. */ - if ((to.to_flags & TOF_TS) == 0 && - tp->t_rxtshift == 1 && + if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && - TSTMP_LT(ticks, tp->t_badrxtwin)) { + (((to.to_flags & TOF_TS) != 0 && + to.to_tsecr != 0 && + TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) || + ((to.to_flags & TOF_TS) == 0 && + TSTMP_LT(ticks, tp->t_badrxtwin)))) cc_cong_signal(tp, th, CC_RTO_ERR); - } /* * Recalculate the transmit timer / rtt. @@ -1934,7 +1921,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. 
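[Editor's sketch] The header-prediction hunk above unifies the "bad retransmit" detection so the timestamp-based and tick-based checks share one branch: after the first RTO retransmission (with a valid baseline), the RTO is treated as spurious when the echoed timestamp, or the current tick count when timestamps are absent, is still older than t_badrxtwin. A hedged restatement as a pure function; the parameter names are illustrative, not the tcpcb fields:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TSTMP_LT(a, b)	((int32_t)((a) - (b)) < 0)	/* wrapping compare */

/*
 * True when a first retransmission timeout should be judged spurious,
 * following the unified condition in the hunk above.
 */
static bool
rto_was_spurious(int rxtshift, bool prev_valid, uint32_t badrxtwin,
    bool have_ts, uint32_t tsecr, uint32_t ticks_now)
{
	if (rxtshift != 1 || !prev_valid || badrxtwin == 0)
		return (false);
	if (have_ts && tsecr != 0)
		return (TSTMP_LT(tsecr, badrxtwin));
	if (!have_ts)
		return (TSTMP_LT(ticks_now, badrxtwin));
	return (false);
}

int
main(void)
{
	/* Echoed timestamp predates the bad-retransmit window: spurious. */
	printf("%d\n", rto_was_spurious(1, true, 1000, true, 900, 0));
	/* No timestamps and enough ticks have passed: genuine loss. */
	printf("%d\n", rto_was_spurious(1, true, 1000, false, 0, 1500));
	return (0);
}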
*/ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { @@ -1985,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -1998,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { @@ -2200,10 +2187,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, } } else { TCPSTAT_INC(tcps_badrst); - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, - tp->rcv_nxt, tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; + tcp_send_challenge_ack(tp, th, m); m = NULL; } } @@ -2225,10 +2209,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, rstreason = BANDLIM_UNLIMITED; } else { tcp_ecn_input_syn_sent(tp, thflags, iptos); - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, - tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; + tcp_send_challenge_ack(tp, th, m); m = NULL; } goto drop; @@ -2272,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -2435,6 +2416,42 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, /* * Ack processing. */ + if (SEQ_GEQ(tp->snd_una, tp->iss + (TCP_MAXWIN << tp->snd_scale))) { + /* Checking SEG.ACK against ISS is definitely redundant. */ + tp->t_flags2 |= TF2_NO_ISS_CHECK; + } + if (!V_tcp_insecure_ack) { + tcp_seq seq_min; + bool ghost_ack_check; + + if (tp->t_flags2 & TF2_NO_ISS_CHECK) { + /* Check for too old ACKs (RFC 5961, Section 5.2). */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } else { + if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { + /* Checking for ghost ACKs is stricter. */ + seq_min = tp->iss + 1; + ghost_ack_check = true; + } else { + /* + * Checking for too old ACKs (RFC 5961, + * Section 5.2) is stricter. 
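[Editor's sketch] The new block above (which continues just below) picks the stricter of two lower bounds for an acceptable SEG.ACK, either SND.UNA - MAX.SND.WND per RFC 5961 Section 5.2 or ISS + 1 to catch ghost ACKs for data never sent, and answers anything below it with a challenge ACK. A stand-alone sketch of the acceptance test for the branch where the ISS check still applies, using the usual modular sequence comparison; ack_acceptable() is an illustrative name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/* Modular 32-bit sequence comparison, as the SEQ_LT()/SEQ_GT() macros do. */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

/*
 * Reject (and in the kernel, challenge-ACK) any SEG.ACK below the
 * stricter of the two floors chosen in the hunk above.
 */
static bool
ack_acceptable(tcp_seq th_ack, tcp_seq snd_una, tcp_seq iss,
    uint32_t max_sndwnd)
{
	tcp_seq seq_min;

	if (SEQ_GT(iss + 1, snd_una - max_sndwnd))
		seq_min = iss + 1;		/* ghost-ACK check is stricter */
	else
		seq_min = snd_una - max_sndwnd;	/* too-old-ACK check is stricter */
	return (!SEQ_LT(th_ack, seq_min));
}

int
main(void)
{
	tcp_seq iss = 1000, snd_una = 5000;
	uint32_t max_sndwnd = 2000;

	printf("%d\n", ack_acceptable(4000, snd_una, iss, max_sndwnd)); /* 1: inside window */
	printf("%d\n", ack_acceptable(2500, snd_una, iss, max_sndwnd)); /* 0: older than snd_una - max_sndwnd */
	printf("%d\n", ack_acceptable(900, snd_una, iss, max_sndwnd));  /* 0: even below iss + 1 */
	return (0);
}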
+ */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } + } + if (SEQ_LT(th->th_ack, seq_min)) { + if (ghost_ack_check) + TCPSTAT_INC(tcps_rcvghostack); + else + TCPSTAT_INC(tcps_rcvacktooold); + tcp_send_challenge_ack(tp, th, m); + m = NULL; + goto drop; + } + } switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter @@ -2549,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); - if (tlen == 0 && + if (no_data && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* @@ -2618,26 +2635,30 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_do_prr_ack(tp, th, &to, sack_changed, &maxseg); } else if (tcp_is_sack_recovery(tp, &to) && - IN_FASTRECOVERY(tp->t_flags)) { + IN_FASTRECOVERY(tp->t_flags) && + (tp->snd_nxt == tp->snd_max)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff - * we have less than 1/2 the original window's + * we have less than ssthresh * worth of data in flight. */ - if (V_tcp_do_newsack) { - awnd = tcp_compute_pipe(tp); - } else { - awnd = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; - } + awnd = tcp_compute_pipe(tp); if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += maxseg; + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } + } else if (tcp_is_sack_recovery(tp, &to) && + IN_FASTRECOVERY(tp->t_flags) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); } else { tp->snd_cwnd += maxseg; } @@ -2661,14 +2682,13 @@ enter_recovery: tcp_seq onxt = tp->snd_nxt; /* - * If we're doing sack, or prr, check - * to see if we're already in sack + * If we're doing sack, check to + * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ - if (V_tcp_do_prr || - (tp->t_flags & TF_SACK_PERMIT)) { + if (tcp_is_sack_recovery(tp, &to)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; @@ -2688,30 +2708,40 @@ enter_recovery: tp->t_rtttime = 0; if (V_tcp_do_prr) { /* - * snd_ssthresh is already updated by - * cc_cong_signal. + * snd_ssthresh and snd_recover are + * already updated by cc_cong_signal. */ if (tcp_is_sack_recovery(tp, &to)) { /* - * Exclude Limited Transmit + * Include Limited Transmit * segments here */ tp->sackhint.prr_delivered = - maxseg; + imin(tp->snd_max - th->th_ack, + (tp->snd_limited + 1) * maxseg); } else { tp->sackhint.prr_delivered = - imin(tp->snd_max - tp->snd_una, - imin(INT_MAX / 65536, - tp->t_dupacks) * maxseg); + maxseg; } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } + tp->snd_limited = 0; if (tcp_is_sack_recovery(tp, &to)) { TCPSTAT_INC(tcps_sack_recovery_episode); - tp->snd_recover = tp->snd_nxt; - tp->snd_cwnd = maxseg; + /* + * When entering LR after RTO due to + * Duplicate ACKs, retransmit existing + * holes from the scoreboard. 
+ */ + tcp_resend_sackholes(tp); + /* Avoid inflating cwnd in tcp_output */ + tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tcp_compute_pipe(tp) + + maxseg; (void) tcp_output(tp); + /* Set cwnd to the expected flightsize */ + tp->snd_cwnd = tp->snd_ssthresh; if (SEQ_GT(th->th_ack, tp->snd_una)) { goto resume_partialack; } @@ -2752,18 +2782,23 @@ enter_recovery: __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; - tp->snd_cwnd = - (tp->snd_nxt - tp->snd_una) + + if ((tp->snd_nxt == tp->snd_max) && + (tp->t_rxtshift == 0)) + tp->snd_cwnd = + SEQ_SUB(tp->snd_nxt, + tp->snd_una) - + tcp_sack_adjust(tp); + tp->snd_cwnd += (tp->t_dupacks - tp->snd_limited) * - maxseg; + maxseg - tcp_sack_adjust(tp); /* * Only call tcp_output when there * is new data available to be sent * or we need to send an ACK. */ - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); avail = sbavail(&so->so_snd); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW || (avail >= SEQ_SUB(tp->snd_nxt, tp->snd_una))) { @@ -2774,9 +2809,11 @@ enter_recovery: KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && - tp->t_flags & TF_SENTFIN), - ("%s: sent too much", - __func__)); + tp->t_flags & TF_SENTFIN) || + (sent < 2 * maxseg && + tp->t_flags & TF_NODELAY), + ("%s: sent too much: %u>%u", + __func__, sent, maxseg)); tp->snd_limited = 2; } else if (sent > 0) { ++tp->snd_limited; @@ -2802,7 +2839,9 @@ enter_recovery: * counted as dupacks here. */ if (tcp_is_sack_recovery(tp, &to) && - (sack_changed != SACK_NOCHANGE)) { + (((tp->t_rxtshift == 0) && (sack_changed != SACK_NOCHANGE)) || + ((tp->t_rxtshift > 0) && (sack_changed == SACK_NEWLOSS))) && + (tp->snd_nxt == tp->snd_max)) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && @@ -2938,7 +2977,7 @@ process_ACK: tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); /* * Clear t_acktime if remote side has ACKd all data in the * socket buffer and FIN (if applicable). @@ -2969,7 +3008,7 @@ process_ACK: * skip rest of ACK processing. */ if (acked == 0) { - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); goto step6; } @@ -3009,9 +3048,8 @@ process_ACK: SEQ_GEQ(th->th_ack, tp->snd_recover)) { cc_post_recovery(tp, th); } - if (tp->t_flags & TF_SACK_PERMIT) { - if (SEQ_GT(tp->snd_una, tp->snd_recover)) - tp->snd_recover = tp->snd_una; + if (SEQ_GT(tp->snd_una, tp->snd_recover)) { + tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -3084,8 +3122,7 @@ step6: (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (tlen == 0 && - tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; @@ -3106,11 +3143,11 @@ step6: * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. 
*/ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ - SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ + SOCK_RECVBUF_UNLOCK(so); /* XXX */ goto dodata; /* XXX */ } /* @@ -3136,7 +3173,7 @@ step6: sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } - SOCKBUF_UNLOCK(&so->so_rcv); + SOCK_RECVBUF_UNLOCK(so); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, @@ -3209,7 +3246,7 @@ dodata: /* XXX */ thflags = tcp_get_flags(th) & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else @@ -3395,7 +3432,7 @@ dropafterack: if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -3466,7 +3503,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif @@ -3703,7 +3740,7 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + max(tp->t_rttmin, rtt + 2), tcp_rexmit_max); /* * We received an ack for a packet that wasn't retransmitted; @@ -3818,19 +3855,16 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, offer = max(offer, V_tcp_minmss); } - /* - * rmx information is now retrieved from tcp_hostcache. - */ - tcp_hc_get(&inp->inp_inc, &metrics); - if (metricptr != NULL) - bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); + if (metricptr == NULL) + metricptr = &metrics; + tcp_hc_get(&inp->inp_inc, metricptr); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ - if (metrics.rmx_mtu) - mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; + if (metricptr->hc_mtu) + mss = min(metricptr->hc_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { @@ -3883,6 +3917,17 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, mss = max(mss, 64); tp->t_maxseg = mss; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } + } void @@ -3910,9 +3955,9 @@ tcp_mss(struct tcpcb *tp, int offer) * if the mss is larger than the socket buffer, decrease the mss. 
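[Editor's sketch] Right after this comment, tcp_mss() clamps the MSS to the send buffer (preferring a cached hc_sendpipe size while the buffer is still at its default) and in the opposite case sizes the buffer in whole segments. A small sketch of that fitting step; fit_mss_and_buffer() and the sb_max_adj cap are illustrative, and the round-up to a multiple of the MSS is an assumption about the unshown part of the function rather than something visible in this hunk:

#include <stdint.h>
#include <stdio.h>

/* Same rounding as the sys/param.h roundup() macro. */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

/*
 * If the buffer is smaller than the MSS, shrink the MSS to the buffer;
 * otherwise round the buffer up to a whole number of segments, capped
 * by a hypothetical maximum, so full segments fit exactly.
 */
static void
fit_mss_and_buffer(uint32_t *mss, uint32_t *bufsize, uint32_t sb_max_adj)
{
	if (*bufsize < *mss) {
		*mss = *bufsize;
	} else {
		*bufsize = roundup(*bufsize, *mss);
		if (*bufsize > sb_max_adj)
			*bufsize = sb_max_adj;
	}
}

int
main(void)
{
	uint32_t mss = 1460, buf = 65536;

	fit_mss_and_buffer(&mss, &buf, 2 * 1024 * 1024);
	printf("mss=%u bufsize=%u\n", mss, buf);	/* 65700 = 45 * 1460 */
	return (0);
}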
*/ so = inp->inp_socket; - SOCKBUF_LOCK(&so->so_snd); - if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) - bufsize = metrics.rmx_sendpipe; + SOCK_SENDBUF_LOCK(so); + if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.hc_sendpipe) + bufsize = metrics.hc_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) @@ -3924,7 +3969,7 @@ tcp_mss(struct tcpcb *tp, int offer) if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(so, SO_SND, bufsize, NULL); } - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the @@ -3934,10 +3979,20 @@ tcp_mss(struct tcpcb *tp, int offer) * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ tp->t_maxseg = max(mss, 64); + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } - SOCKBUF_LOCK(&so->so_rcv); - if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) - bufsize = metrics.rmx_recvpipe; + SOCK_RECVBUF_LOCK(so); + if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.hc_recvpipe) + bufsize = metrics.hc_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { @@ -3947,7 +4002,7 @@ tcp_mss(struct tcpcb *tp, int offer) if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(so, SO_RCV, bufsize, NULL); } - SOCKBUF_UNLOCK(&so->so_rcv); + SOCK_RECVBUF_UNLOCK(so); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { @@ -3955,6 +4010,8 @@ tcp_mss(struct tcpcb *tp, int offer) tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; + if (cap.ipsec_tso) + tp->t_flags2 |= TF2_IPSEC_TSO; } } @@ -4022,11 +4079,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, (IN_CONGRECOVERY(tp->t_flags) && !IN_FASTRECOVERY(tp->t_flags))) { del_data = tp->sackhint.delivered_data; - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(tp); - else - pipe = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; + pipe = tcp_compute_pipe(tp); } else { if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg + tp->snd_recover - tp->snd_una)) { @@ -4075,9 +4128,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, */ if (IN_FASTRECOVERY(tp->t_flags)) { if (tcp_is_sack_recovery(tp, to)) { - tp->snd_cwnd = tp->snd_nxt - tp->snd_recover + - tp->sackhint.sack_bytes_rexmit + - (snd_cnt * maxseg); + tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } else { tp->snd_cwnd = (tp->snd_max - tp->snd_una) + (snd_cnt * maxseg); @@ -4105,17 +4156,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; - tp->snd_nxt = th->th_ack; - /* - * Set snd_cwnd to one segment beyond acknowledged offset. - * (tp->snd_una has not yet been updated when this function is called.) - */ - tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); - tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); - tp->snd_cwnd = ocwnd; - if (SEQ_GT(onxt, tp->snd_nxt)) - tp->snd_nxt = onxt; + if (IN_FASTRECOVERY(tp->t_flags)) { + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset. + * (tp->snd_una has not yet been updated when this function is called.) 
+ */ + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + } /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. @@ -4130,14 +4183,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) int tcp_compute_pipe(struct tcpcb *tp) { - if (tp->t_fb->tfb_compute_pipe == NULL) { - return (tp->snd_max - tp->snd_una + + int pipe; + + if (tp->t_fb->tfb_compute_pipe != NULL) { + pipe = (*tp->t_fb->tfb_compute_pipe)(tp); + } else if (V_tcp_do_newsack) { + pipe = tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes - - tp->sackhint.lost_bytes); + tp->sackhint.lost_bytes; } else { - return((*tp->t_fb->tfb_compute_pipe)(tp)); + pipe = tp->snd_nxt - tp->snd_fack + tp->sackhint.sack_bytes_rexmit; } + return (imax(pipe, 0)); } uint32_t diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index 7b937958a4fb..e24790ece43d 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -29,6 +29,7 @@ #include <sys/cdefs.h> #include "opt_inet.h" +#include "opt_ddb.h" #include <sys/param.h> #include <sys/arb.h> #include <sys/hash.h> @@ -43,11 +44,18 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> +#ifdef DDB +#include <sys/time.h> +#endif #include <sys/tree.h> #include <sys/stats.h> /* Must come after qmath.h and tree.h */ #include <sys/counter.h> #include <dev/tcp_log/tcp_log_dev.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + #include <net/if.h> #include <net/if_var.h> #include <net/vnet.h> @@ -1840,35 +1848,36 @@ retry: log_buf->tlb_txbuf.tls_sb_ccc = 0; } /* Copy values from tp to the log entry. */ -#define COPY_STAT(f) log_buf->tlb_ ## f = tp->f -#define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f - COPY_STAT_T(state); - COPY_STAT_T(starttime); - COPY_STAT(iss); - COPY_STAT_T(flags); - COPY_STAT(snd_una); - COPY_STAT(snd_max); - COPY_STAT(snd_cwnd); - COPY_STAT(snd_nxt); - COPY_STAT(snd_recover); - COPY_STAT(snd_wnd); - COPY_STAT(snd_ssthresh); - COPY_STAT_T(srtt); - COPY_STAT_T(rttvar); - COPY_STAT(rcv_up); - COPY_STAT(rcv_adv); - COPY_STAT(rcv_nxt); - COPY_STAT(rcv_wnd); - COPY_STAT_T(dupacks); - COPY_STAT_T(segqlen); - COPY_STAT(snd_numholes); - COPY_STAT(snd_scale); - COPY_STAT(rcv_scale); - COPY_STAT_T(flags2); - COPY_STAT_T(fbyte_in); - COPY_STAT_T(fbyte_out); -#undef COPY_STAT -#undef COPY_STAT_T + log_buf->tlb_state = tp->t_state; + log_buf->tlb_starttime = tp->t_starttime; + log_buf->tlb_iss = tp->iss; + log_buf->tlb_flags = tp->t_flags; + log_buf->tlb_snd_una = tp->snd_una; + log_buf->tlb_snd_max = tp->snd_max; + log_buf->tlb_snd_cwnd = tp->snd_cwnd; + log_buf->tlb_snd_nxt = tp->snd_nxt; + log_buf->tlb_snd_recover = tp->snd_recover; + log_buf->tlb_snd_wnd = tp->snd_wnd; + log_buf->tlb_snd_ssthresh = tp->snd_ssthresh; + log_buf->tlb_srtt = tp->t_srtt; + log_buf->tlb_rttvar = tp->t_rttvar; + log_buf->tlb_rcv_up = tp->rcv_up; + log_buf->tlb_rcv_adv = tp->rcv_adv; + log_buf->tlb_flags2 = tp->t_flags2; + log_buf->tlb_rcv_nxt = tp->rcv_nxt; + log_buf->tlb_rcv_wnd = tp->rcv_wnd; + log_buf->tlb_dupacks = tp->t_dupacks; + log_buf->tlb_segqlen = tp->t_segqlen; + log_buf->tlb_snd_numholes = tp->snd_numholes; + log_buf->tlb_flex1 = 0; + log_buf->tlb_flex2 = 0; + log_buf->tlb_fbyte_in = tp->t_fbyte_in; + log_buf->tlb_fbyte_out = tp->t_fbyte_out; + log_buf->tlb_snd_scale = tp->snd_scale; + log_buf->tlb_rcv_scale = tp->rcv_scale; + log_buf->_pad[0] = 0; + 
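[Editor's sketch] The tcp_compute_pipe() rewrite earlier in this chunk folds the two variants behind one function and clamps the result at zero; its V_tcp_do_newsack branch counts bytes between snd_una and snd_max, plus retransmitted-but-unacknowledged SACK bytes, minus bytes the peer already SACKed and bytes scored as lost. A stand-alone restatement with plain integers; compute_pipe() here is a local sketch, not the kernel function:

#include <stdint.h>
#include <stdio.h>

static int imax(int a, int b) { return (a > b ? a : b); }

/*
 * Estimate of data in flight in the spirit of the newsack branch of
 * the tcp_compute_pipe() hunk above, clamped at zero as imax() does.
 */
static int
compute_pipe(uint32_t snd_max, uint32_t snd_una, int sack_bytes_rexmit,
    int sacked_bytes, int lost_bytes)
{
	int pipe;

	pipe = (int)(snd_max - snd_una) + sack_bytes_rexmit -
	    sacked_bytes - lost_bytes;
	return (imax(pipe, 0));
}

int
main(void)
{
	/* 30000 bytes outstanding, 1460 retransmitted, 4380 SACKed, 2920 lost. */
	printf("pipe = %d\n", compute_pipe(130000, 100000, 1460, 4380, 2920));
	return (0);
}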
log_buf->_pad[1] = 0; + log_buf->_pad[2] = 0; /* Copy stack-specific info. */ if (stackinfo != NULL) { memcpy(&log_buf->tlb_stackinfo, stackinfo, @@ -2869,10 +2878,11 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags) /* double check log state now that we have the lock */ if (inp->inp_flags & INP_DROPPED) goto done; - if (tp->_t_logstate != TCP_LOG_STATE_OFF) { + if (tcp_bblogging_on(tp)) { struct timeval tv; tcp_log_eventspecific_t log; + memset(&log, 0, sizeof(log)); microuptime(&tv); log.u_sf.offset = offset; log.u_sf.length = nbytes; @@ -2970,3 +2980,370 @@ skip_closed_req: done: INP_WUNLOCK(inp); } + +#ifdef DDB +static void +db_print_indent(int indent) +{ + int i; + + for (i = 0; i < indent; i++) + db_printf(" "); +} + +static void +db_print_tcphdr(struct tcp_log_buffer *tlm_buf) +{ + struct sackblk sack; + struct tcphdr *th; + int cnt, i, j, opt, optlen, num_sacks; + uint32_t val, ecr; + uint16_t mss; + uint16_t flags; + + if ((tlm_buf->tlb_eventflags & TLB_FLAG_HDR) == 0) { + return; + } + th = &tlm_buf->tlb_th; + flags = tcp_get_flags(th); + if (flags & TH_FIN) { + db_printf("F"); + } + if (flags & TH_SYN) { + db_printf("S"); + } + if (flags & TH_RST) { + db_printf("R"); + } + if (flags & TH_PUSH) { + db_printf("P"); + } + if (flags & TH_ACK) { + db_printf("."); + } + if (flags & TH_URG) { + db_printf("U"); + } + if (flags & TH_ECE) { + db_printf("E"); + } + if (flags & TH_CWR) { + db_printf("W"); + } + if (flags & TH_AE) { + db_printf("A"); + } + db_printf(" %u:%u(%u)", ntohl(th->th_seq), + ntohl(th->th_seq) + tlm_buf->tlb_len, tlm_buf->tlb_len); + if (flags & TH_ACK) { + db_printf(" ack %u", ntohl(th->th_ack)); + } + db_printf(" win %u", ntohs(th->th_win)); + if (flags & TH_URG) { + db_printf(" urg %u", ntohs(th->th_urp)); + } + cnt = (th->th_off << 2) - sizeof(struct tcphdr); + if (cnt > 0) { + db_printf(" <"); + for (i = 0; i < cnt; i += optlen) { + opt = tlm_buf->tlb_opts[i]; + if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) { + optlen = 1; + } else { + if (cnt - i < 2) { + break; + } + optlen = tlm_buf->tlb_opts[i + 1]; + if (optlen < 2 || optlen > cnt - i) { + break; + } + } + if (i > 0) { + db_printf(","); + } + switch (opt) { + case TCPOPT_EOL: + db_printf("eol"); + break; + case TCPOPT_NOP: + db_printf("nop"); + break; + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) { + break; + } + bcopy(tlm_buf->tlb_opts + i + 2, &mss, + sizeof(uint16_t)); + db_printf("mss %u", ntohs(mss)); + break; + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) { + break; + } + db_printf("wscale %u", + tlm_buf->tlb_opts[i + 2]); + break; + case TCPOPT_SACK_PERMITTED: + if (optlen != TCPOLEN_SACK_PERMITTED) { + break; + } + db_printf("sackOK"); + break; + case TCPOPT_SACK: + if (optlen == TCPOLEN_SACKHDR || + (optlen - 2) % TCPOLEN_SACK != 0) { + break; + } + num_sacks = (optlen - 2) / TCPOLEN_SACK; + db_printf("sack"); + for (j = 0; j < num_sacks; j++) { + bcopy(tlm_buf->tlb_opts + i + 2 + + j * TCPOLEN_SACK, &sack, + TCPOLEN_SACK); + db_printf(" %u:%u", ntohl(sack.start), + ntohl(sack.end)); + } + break; + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) { + break; + } + bcopy(tlm_buf->tlb_opts + i + 2, &val, + sizeof(uint32_t)); + bcopy(tlm_buf->tlb_opts + i + 6, &ecr, + sizeof(uint32_t)); + db_printf("TS val %u ecr %u", ntohl(val), + ntohl(ecr)); + break; + case TCPOPT_SIGNATURE: + db_printf("md5"); + if (optlen > 2) { + db_printf(" "); + } + for (j = 0; j < optlen - 2; j++) { + db_printf("%02x", + tlm_buf->tlb_opts[i + 2 + j]); + } + break; + case 
TCPOPT_FAST_OPEN: + db_printf("FO"); + if (optlen > 2) { + db_printf(" "); + } + for (j = 0; j < optlen - 2; j++) { + db_printf("%02x", + tlm_buf->tlb_opts[i + 2 + j]); + } + break; + default: + db_printf("opt=%u len=%u", opt, optlen); + break; + } + } + db_printf(">"); + } +} +static void +db_print_pru(struct tcp_log_buffer *tlm_buf) +{ + switch (tlm_buf->tlb_flex1) { + case PRU_ATTACH: + db_printf("ATTACH"); + break; + case PRU_DETACH: + db_printf("DETACH"); + break; + case PRU_BIND: + db_printf("BIND"); + break; + case PRU_LISTEN: + db_printf("LISTEN"); + break; + case PRU_CONNECT: + db_printf("CONNECT"); + break; + case PRU_ACCEPT: + db_printf("ACCEPT"); + break; + case PRU_DISCONNECT: + db_printf("DISCONNECT"); + break; + case PRU_SHUTDOWN: + db_printf("SHUTDOWN"); + break; + case PRU_RCVD: + db_printf("RCVD"); + break; + case PRU_SEND: + db_printf("SEND"); + break; + case PRU_ABORT: + db_printf("ABORT"); + break; + case PRU_CONTROL: + db_printf("CONTROL"); + break; + case PRU_SENSE: + db_printf("SENSE"); + break; + case PRU_RCVOOB: + db_printf("RCVOOB"); + break; + case PRU_SENDOOB: + db_printf("SENDOOB"); + break; + case PRU_SOCKADDR: + db_printf("SOCKADDR"); + break; + case PRU_PEERADDR: + db_printf("PEERADDR"); + break; + case PRU_CONNECT2: + db_printf("CONNECT2"); + break; + case PRU_FASTTIMO: + db_printf("FASTTIMO"); + break; + case PRU_SLOWTIMO: + db_printf("SLOWTIMO"); + break; + case PRU_PROTORCV: + db_printf("PROTORCV"); + break; + case PRU_PROTOSEND: + db_printf("PROTOSEND"); + break; + case PRU_SEND_EOF: + db_printf("SEND_EOF"); + break; + case PRU_SOSETLABEL: + db_printf("SOSETLABEL"); + break; + case PRU_CLOSE: + db_printf("CLOSE"); + break; + case PRU_FLUSH: + db_printf("FLUSH"); + break; + default: + db_printf("Unknown PRU (%u)", tlm_buf->tlb_flex1); + break; + } + if (tlm_buf->tlb_errno >= 0) { + db_printf(", error: %d", tlm_buf->tlb_errno); + } +} + +static void +db_print_rto(struct tcp_log_buffer *tlm_buf) +{ + tt_what what; + tt_which which; + + what = (tlm_buf->tlb_flex1 & 0xffffff00) >> 8; + which = tlm_buf->tlb_flex1 & 0x000000ff; + switch (what) { + case TT_PROCESSING: + db_printf("Processing "); + break; + case TT_PROCESSED: + db_printf("Processed "); + break; + case TT_STARTING: + db_printf("Starting "); + break; + case TT_STOPPING: + db_printf("Stopping "); + break; + default: + db_printf("Unknown operation (%u) for ", what); + break; + } + switch (which) { + case TT_REXMT: + db_printf("Retransmission "); + break; + case TT_PERSIST: + db_printf("Persist "); + break; + case TT_KEEP: + db_printf("Keepalive "); + break; + case TT_2MSL: + db_printf("2 MSL "); + break; + case TT_DELACK: + db_printf("Delayed ACK "); + break; + default: + db_printf("Unknown (%u) ", which); + break; + } + db_printf("timer"); + if (what == TT_STARTING) { + db_printf(": %u ms", tlm_buf->tlb_flex2); + } +} + +static void +db_print_usersend(struct tcp_log_buffer *tlm_buf) +{ + if ((tlm_buf->tlb_eventflags & TLB_FLAG_RXBUF) == 0) { + return; + } + if ((tlm_buf->tlb_eventflags & TLB_FLAG_TXBUF) == 0) { + return; + } + db_printf("usersend: rcv.acc: %u rcv.ccc: %u snd.acc: %u snd.ccc: %u", + tlm_buf->tlb_rxbuf.tls_sb_acc, tlm_buf->tlb_rxbuf.tls_sb_ccc, + tlm_buf->tlb_txbuf.tls_sb_acc, tlm_buf->tlb_txbuf.tls_sb_ccc); +} + +void +db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent) +{ + struct tcp_log_mem *log_entry; + struct tcp_log_buffer *tlm_buf, *prev_tlm_buf; + int64_t delta_t; + + indent += 2; + prev_tlm_buf = NULL; + STAILQ_FOREACH(log_entry, log_entries, tlm_queue) 
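[Editor's sketch] db_print_tcphdr() above walks the raw options block defensively: EOL and NOP advance by one byte, every other kind must carry a length byte of at least 2 that stays inside the block, and anything malformed ends the walk. The same pattern as a stand-alone program; the option constants are spelled out locally with their standard values:

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL		0
#define TCPOPT_NOP		1
#define TCPOPT_MAXSEG		2
#define TCPOPT_WINDOW		3
#define TCPOPT_SACK_PERMITTED	4
#define TCPOPT_TIMESTAMP	8

/*
 * Bounds-checked walk over a TCP options block, following the loop
 * structure of db_print_tcphdr() above.
 */
static void
walk_options(const uint8_t *opts, int cnt)
{
	int i, opt, optlen;

	for (i = 0; i < cnt; i += optlen) {
		opt = opts[i];
		if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			if (cnt - i < 2)
				break;
			optlen = opts[i + 1];
			if (optlen < 2 || optlen > cnt - i)
				break;
		}
		printf("option %d, length %d\n", opt, optlen);
	}
}

int
main(void)
{
	/* mss 1460, nop, wscale 7, sackOK, timestamps, trailing EOL. */
	const uint8_t opts[] = {
		TCPOPT_MAXSEG, 4, 0x05, 0xb4,
		TCPOPT_NOP,
		TCPOPT_WINDOW, 3, 7,
		TCPOPT_SACK_PERMITTED, 2,
		TCPOPT_TIMESTAMP, 10, 0, 0, 0, 1, 0, 0, 0, 0,
		TCPOPT_EOL,
	};

	walk_options(opts, (int)sizeof(opts));
	return (0);
}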
{ + db_print_indent(indent); + tlm_buf = &log_entry->tlm_buf; + if (prev_tlm_buf == NULL) { + db_printf(" 0.000 "); + } else { + delta_t = sbttoms(tvtosbt(tlm_buf->tlb_tv) - + tvtosbt(prev_tlm_buf->tlb_tv)); + db_printf("+%u.%03u ", (uint32_t)(delta_t / 1000), + (uint32_t)(delta_t % 1000)); + } + switch (tlm_buf->tlb_eventid) { + case TCP_LOG_IN: + db_printf("< "); + db_print_tcphdr(tlm_buf); + break; + case TCP_LOG_OUT: + db_printf("> "); + db_print_tcphdr(tlm_buf); + break; + case TCP_LOG_RTO: + db_print_rto(tlm_buf); + break; + case TCP_LOG_PRU: + db_print_pru(tlm_buf); + break; + case TCP_LOG_USERSEND: + db_print_usersend(tlm_buf); + break; + default: + break; + } + db_printf("\n"); + prev_tlm_buf = tlm_buf; + if (db_pager_quit) + break; + } +} +#endif diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index 38f66e69b093..3e7eef8a1cda 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -60,14 +60,6 @@ struct tcp_log_verbose uint8_t _pad[4]; } ALIGN_TCP_LOG; -/* Internal RACK state variables. */ -struct tcp_log_rack -{ - uint32_t tlr_rack_rtt; /* rc_rack_rtt */ - uint8_t tlr_state; /* Internal RACK state */ - uint8_t _pad[3]; /* Padding */ -}; - struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; @@ -126,7 +118,6 @@ struct tcp_log_sendfile { */ union tcp_log_stackspecific { - struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; struct tcp_log_sendfile u_sf; struct tcp_log_raw u_raw; /* "raw" log access */ @@ -185,7 +176,6 @@ struct tcp_log_buffer uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; -#define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ @@ -201,14 +191,14 @@ enum tcp_log_events { TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */ - TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ + TCP_UNUSED_5, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ - TCP_LOG_REORDER, /* Detected reorder 7 */ + TCP_UNUSED_7, /* Detected reorder 7 */ TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ - BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ + TCP_UNUSED_12, /* The tcb had a packet input to it 12 */ BBR_LOG_TIMERSTAR, /* Start a timer 13 */ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ BBR_LOG_ENTREC, /* Entered recovery 15 */ @@ -219,18 +209,18 @@ enum tcp_log_events { BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ BBR_LOG_JUSTRET, /* We just returned out of output 21 */ BBR_LOG_STATE, /* A BBR state change occurred 22 */ - BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */ - BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ - TCP_LOG_FLOWEND, /* End of a flow 25 */ - BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ - BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ + BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */ + BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ + TCP_LOG_FLOWEND, /* End of a flow 25 */ + BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ + BBR_LOG_THRESH_CALC, /* Doing threshold 
calculation 29 */ TCP_LOG_MAPCHG, /* Map Changes to the sendmap 30 */ - TCP_LOG_USERSEND, /* User level sends data 31 */ + TCP_LOG_USERSEND, /* User level sends data 31 */ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ - BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ - BBR_LOG_TIME_EPOCH, /* A timed based Epoch occurred 34 */ + BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ + BBR_LOG_TIME_EPOCH, /* A timed based Epoch occurred 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ @@ -245,7 +235,7 @@ enum tcp_log_events { BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ + TCP_UNUSED_49, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ TCP_HDWR_PACE_SIZE, /* TCP pacing size set (rl and rack uses this) 51 */ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ @@ -253,23 +243,23 @@ enum tcp_log_events { TCP_LOG_CONNEND, /* End of connection 54 */ TCP_LOG_LRO, /* LRO entry 55 */ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ - TCP_SAD_DETECT, /* Sack Attack Detection 57 */ + TCP_UNUSED_57, /* Sack Attack Detection 57 */ TCP_TIMELY_WORK, /* Logs regarding Timely CC tweaks 58 */ - TCP_LOG_USER_EVENT, /* User space event data 59 */ + TCP_UNUSED_59, /* User space event data 59 */ TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */ - TCP_LOG_REQ_T, /* logging of request tracking 61 */ - TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */ - TCP_LOG_FSB, /* FSB information 63 */ + TCP_LOG_REQ_T, /* logging of request tracking 61 */ + TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */ + TCP_LOG_FSB, /* FSB information 63 */ RACK_DSACK_HANDLING, /* Handling of DSACK in rack for reordering window 64 */ - TCP_HYSTART, /* TCP Hystart logging 65 */ - TCP_CHG_QUERY, /* Change query during fnc_init() 66 */ - TCP_RACK_LOG_COLLAPSE, /* Window collapse by peer 67 */ - TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */ - TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */ + TCP_HYSTART, /* TCP Hystart logging 65 */ + TCP_CHG_QUERY, /* Change query during fnc_init() 66 */ + TCP_RACK_LOG_COLLAPSE, /* Window collapse by peer 67 */ + TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */ + TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */ TCP_LOG_PRU, /* TCP protocol user request 70 */ - TCP_POLICER_DET, /* TCP Policer detectionn 71 */ - TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */ - TCP_LOG_END /* End (keep at end) 72 */ + TCP_UNUSED_71, /* old TCP Policer detectionn, not used 71 */ + TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */ + TCP_LOG_END /* End (keep at end) 73 */ }; enum tcp_log_states { @@ -549,12 +539,12 @@ struct tcpcb; NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ +/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). 
*/ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ - if (tcp_bblogging_on(tp)) \ - tcp_log_event(tp, th, rxbuf, txbuf, eventid, \ - errornum, len, stackinfo, th_hostorder, \ - NULL, NULL, 0, tv); \ + KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \ + tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \ + stackinfo, th_hostorder, NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX @@ -580,6 +570,9 @@ void tcp_log_flowend(struct tcpcb *tp); void tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags); int tcp_log_apply_ratio(struct tcpcb *tp, int ratio); +#ifdef DDB +void db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent); +#endif #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 921d28f82517..10afed17bf3b 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -83,6 +83,7 @@ static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); static void tcp_lro_rx_done(struct lro_ctrl *lc); static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash); +static void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); @@ -175,7 +176,7 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, { struct lro_entry *le; size_t size; - unsigned i, elements; + unsigned i; lc->lro_bad_csum = 0; lc->lro_queued = 0; @@ -190,11 +191,7 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, LIST_INIT(&lc->lro_active); /* create hash table to accelerate entry lookup */ - if (lro_entries > lro_mbufs) - elements = lro_entries; - else - elements = lro_mbufs; - lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, + lc->lro_hash = phashinit_flags(lro_entries, M_LRO, &lc->lro_hashsz, HASH_NOWAIT); if (lc->lro_hash == NULL) { memset(lc, 0, sizeof(*lc)); @@ -599,7 +596,7 @@ tcp_lro_rx_done(struct lro_ctrl *lc) static void tcp_lro_flush_active(struct lro_ctrl *lc) { - struct lro_entry *le; + struct lro_entry *le, *le_tmp; /* * Walk through the list of le entries, and @@ -611,7 +608,7 @@ tcp_lro_flush_active(struct lro_ctrl *lc) * is being freed. This is ok it will just get * reallocated again like it was new. 
*/ - LIST_FOREACH(le, &lc->lro_active, next) { + LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (le->m_head != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); @@ -1108,7 +1105,7 @@ again: } } -void +static void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h index b4b5e3f811e4..a94eca665eb5 100644 --- a/sys/netinet/tcp_lro.h +++ b/sys/netinet/tcp_lro.h @@ -216,7 +216,6 @@ int tcp_lro_init(struct lro_ctrl *); int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); -void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); extern int (*tcp_lro_flush_tcphpts)(struct lro_ctrl *, struct lro_entry *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index cd757d5a6164..7e756285da45 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -39,6 +39,7 @@ #include <net/if.h> #include <net/if_var.h> +#include <net/if_private.h> #include <net/ethernet.h> #include <net/bpf.h> #include <net/vnet.h> @@ -61,7 +62,9 @@ #include <netinet/tcp_lro.h> #include <netinet/tcp_var.h> #include <netinet/tcp_hpts.h> +#ifdef TCP_BLACKBOX #include <netinet/tcp_log_buf.h> +#endif static void build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m, @@ -147,6 +150,7 @@ tcp_lro_check_wake_status(struct tcpcb *tp) return (false); } +#ifdef TCP_BLACKBOX static void tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc, const struct lro_entry *le, const struct mbuf *m, @@ -196,6 +200,7 @@ tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc, TCP_LOG_LRO, 0, 0, &log, false, &tv); } } +#endif static struct mbuf * tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, @@ -208,7 +213,9 @@ tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt); if (m != NULL && (m->m_flags & M_ACKCMP) != 0) { if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) { +#ifdef TCP_BLACKBOX tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0); +#endif *new_m = 0; counter_u64_add(tcp_extra_mbuf, 1); return (m); @@ -219,7 +226,9 @@ tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, } } /* Decide mbuf size. */ +#ifdef TCP_BLACKBOX tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0); +#endif if (tp->t_flags2 & TF2_MBUF_L_ACKS) m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR); else @@ -611,13 +620,19 @@ _tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le) * ack will be required. */ cmp = NULL; +#ifdef TCP_BLACKBOX tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0); +#endif } else if (mv_to != NULL) { /* We are asked to move pp up */ pp = &mv_to->m_nextpkt; +#ifdef TCP_BLACKBOX tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0); } else tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0); +#else + } +#endif } /* Update "m_last_mbuf", if any. */ if (pp == &le->m_head) @@ -628,7 +643,9 @@ _tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le) /* Check if any data mbufs left. 
*/ if (le->m_head != NULL) { counter_u64_add(tcp_inp_lro_direct_queue, 1); +#ifdef TCP_BLACKBOX tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1); +#endif tcp_queue_pkts(tp, le); } if (should_wake) { diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 2bbc9414197c..bc5b42ee6f2c 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -85,9 +85,6 @@ #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_fastopen.h> -#ifdef TCPPCAP -#include <netinet/tcp_pcap.h> -#endif #ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> #endif @@ -201,11 +198,9 @@ tcp_default_output(struct tcpcb *tp) struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen; -#if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; -#endif int idle, sendalot, curticks; - int sack_rxmit, sack_bytes_rxmt; + int sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; @@ -213,9 +208,7 @@ tcp_default_output(struct tcpcb *tp) struct tcp_log_buffer *lgb; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; -#if 0 - int maxburst = TCP_MAXBURST; -#endif + bool sack_rxmit; #ifdef INET6 struct ip6_hdr *ip6 = NULL; const bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; @@ -264,19 +257,22 @@ tcp_default_output(struct tcpcb *tp) } } again: + sendwin = 0; /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) - tcp_sack_adjust(tp); + (tp->sackhint.nexthole != NULL) && + !IN_FASTRECOVERY(tp->t_flags)) { + sendwin = tcp_sack_adjust(tp); + } sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); + sendwin = min(tp->snd_wnd, tp->snd_cwnd + sendwin); flags = tcp_outflags[tp->t_state]; /* @@ -289,16 +285,19 @@ again: /* * Still in sack recovery , reset rxmit flag to zero. */ - sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; - p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && + if ((tp->t_flags & TF_SACK_PERMIT) && + (IN_FASTRECOVERY(tp->t_flags) || + (SEQ_LT(tp->snd_nxt, tp->snd_max) && (tp->t_dupacks >= tcprexmtthresh))) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { - uint32_t cwin; + int32_t cwin; - cwin = - imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); + if (IN_FASTRECOVERY(tp->t_flags)) { + cwin = imax(sendwin - tcp_compute_pipe(tp), 0); + } else { + cwin = imax(sendwin - off, 0); + } /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* @@ -314,23 +313,42 @@ again: * moves past p->rxmit. */ p = NULL; + sack_rxmit = false; goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ - len = ((int32_t)ulmin(cwin, - SEQ_SUB(tp->snd_recover, p->rxmit))); + len = SEQ_SUB(tp->snd_recover, p->rxmit); + if (cwin <= len) { + len = cwin; + } else { + sendalot = 1; + } } } else { - len = ((int32_t)ulmin(cwin, - SEQ_SUB(p->end, p->rxmit))); + len = SEQ_SUB(p->end, p->rxmit); + if (cwin <= len) { + len = cwin; + } else { + sendalot = 1; + } } + /* we could have transmitted from the scoreboard, + * but sendwin (expected flightsize) - pipe didn't + * allow any transmission. + * Bypass recalculating the possible transmission + * length further down by setting sack_rxmit. + * Wouldn't be here if there would have been + * nothing in the scoreboard to transmit. 
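To make the reworked send-window bookkeeping above concrete, here is a small worked example with made-up numbers (illustrative only, not taken from the change): in fast recovery the SACK retransmit budget is the expected flightsize minus what tcp_compute_pipe() reports as already outstanding.

	#include <stdio.h>

	int
	main(void)
	{
		int snd_wnd  = 64000;	/* peer's advertised window */
		int snd_cwnd = 20000;	/* congestion window */
		int sacked   = 0;	/* tcp_sack_adjust() contribution (zero in fast recovery) */
		int pipe     = 17000;	/* tcp_compute_pipe(): data still in flight */

		int sendwin = snd_wnd < snd_cwnd + sacked ? snd_wnd : snd_cwnd + sacked;
		int cwin = sendwin - pipe > 0 ? sendwin - pipe : 0;

		/* Prints cwin = 3000: only that much of the SACK hole may go out now. */
		printf("cwin = %d\n", cwin);
		return (0);
	}
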
+ */ if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); - sack_rxmit = 1; - sendalot = 1; } + sack_rxmit = true; + } else { + p = NULL; + sack_rxmit = false; } after_sack_rexmit: /* @@ -342,7 +360,7 @@ after_sack_rexmit: if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero @@ -391,36 +409,18 @@ after_sack_rexmit: * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ - if (sack_rxmit == 0) { - if (sack_bytes_rxmt == 0) { - len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - - off); + if (!sack_rxmit) { + if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) { + len = imin(sbavail(&so->so_snd), sendwin) - off; } else { - int32_t cwin; - /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - - off); - /* - * Don't remove this (len > 0) check ! - * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc - * optimization issue - to force gcc to compute - * len above. Without this check, the computation - * of len is bungled by the optimizer. - */ - if (len > 0) { - cwin = tp->snd_cwnd - imax(0, (int32_t) - (tp->snd_nxt - tp->snd_recover)) - - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - len = imin(len, cwin); - } + len = imax( + imin(sbavail(&so->so_snd), sendwin) - + imax(tcp_compute_pipe(tp), off), 0); } } @@ -515,8 +515,8 @@ after_sack_rexmit: * hardware). * * TSO may only be used if we are in a pure bulk sending state. The - * presence of TCP-MD5, SACK retransmits, SACK advertizements and - * IP options prevent using TSO. With TSO the TCP header is the same + * presence of TCP-MD5, IP options (IPsec), and possibly SACK + * retransmits prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. @@ -553,15 +553,15 @@ after_sack_rexmit: offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; -#if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; -#endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && - ipoptlen == 0 && !(flags & TH_SYN)) + (!sack_rxmit || V_tcp_sack_tso) && + (ipoptlen == 0 || (ipoptlen == ipsec_optlen && + (tp->t_flags2 & TF2_IPSEC_TSO) != 0)) && + !(flags & TH_SYN)) tso = 1; if (SEQ_LT((sack_rxmit ? p->rxmit : tp->snd_nxt) + len, @@ -754,11 +754,11 @@ dontupdate: * No reason to send a segment, just return. */ just_return: - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); return (0); send: - SOCKBUF_LOCK_ASSERT(&so->so_snd); + SOCK_SENDBUF_LOCK_ASSERT(so); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; @@ -888,7 +888,7 @@ send: if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); @@ -917,7 +917,7 @@ send: * overflowing or exceeding the maximum length * allowed by the network interface: */ - KASSERT(ipoptlen == 0, + KASSERT(ipoptlen == ipsec_optlen, ("%s: TSO can't do IP options", __func__)); /* @@ -926,8 +926,8 @@ send: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ - max_len = (if_hw_tsomax - hdrlen - - max_linkhdr); + max_len = if_hw_tsomax - hdrlen - + ipsec_optlen - max_linkhdr; if (max_len <= 0) { len = 0; } else if (len > max_len) { @@ -941,7 +941,7 @@ send: * fractional unless the send sockbuf can be * emptied: */ - max_len = (tp->t_maxseg - optlen); + max_len = tp->t_maxseg - optlen - ipsec_optlen; if (((uint32_t)off + (uint32_t)len) < sbavail(&so->so_snd)) { moff = len % max_len; @@ -980,9 +980,9 @@ send: * byte of the payload can be put into the * TCP segment. */ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); error = EMSGSIZE; - sack_rxmit = 0; + sack_rxmit = false; goto out; } len = tp->t_maxseg - optlen - ipoptlen; @@ -1037,6 +1037,9 @@ send: TCPSTAT_ADD(tcps_sndrexmitbyte, len); if (sack_rxmit) { TCPSTAT_INC(tcps_sack_rexmits); + if (tso) { + TCPSTAT_INC(tcps_sack_rexmits_tso); + } TCPSTAT_ADD(tcps_sack_rexmit_bytes, len); } #ifdef STATS @@ -1059,9 +1062,9 @@ send: m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); error = ENOBUFS; - sack_rxmit = 0; + sack_rxmit = false; goto out; } @@ -1080,13 +1083,18 @@ send: sbsndptr_adv(&so->so_snd, mb, len); m->m_len += len; } else { + int32_t old_len; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = &so->so_snd; + old_len = len; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, hw_tls); + if (old_len != len) + flags &= ~TH_FIN; if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -1097,10 +1105,10 @@ send: tso = 0; } if (m->m_next == NULL) { - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); (void) m_free(m); error = ENOBUFS; - sack_rxmit = 0; + sack_rxmit = false; goto out; } } @@ -1114,9 +1122,9 @@ send: if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); } else { - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) @@ -1129,7 +1137,7 @@ send: m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; - sack_rxmit = 0; + sack_rxmit = false; goto out; } #ifdef INET6 @@ -1141,7 +1149,7 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } - SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + SOCK_SENDBUF_UNLOCK_ASSERT(so); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); @@ -1226,7 +1234,7 @@ send: * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ - if (sack_rxmit == 0) { + if (!sack_rxmit) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); @@ -1258,7 +1266,6 @@ send: bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } - tcp_set_flags(th, flags); /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. 
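With IPsec-aware TSO (TF2_IPSEC_TSO), both the burst limit handed to the driver and the advertised tso_segsz in the surrounding hunks subtract the per-packet ESP overhead. A standalone arithmetic sketch with assumed values; all numbers here, including the ESP overhead, are illustrative only.

	#include <stdio.h>

	int
	main(void)
	{
		unsigned if_hw_tsomax = 65535;	/* NIC TSO burst limit */
		unsigned hdrlen       = 52;	/* IP + TCP header incl. options */
		unsigned max_linkhdr  = 16;	/* reserved link-layer header space */
		unsigned ipsec_optlen = 73;	/* assumed ESP overhead for this SA */
		unsigned t_maxseg     = 1448;
		unsigned optlen       = 12;	/* TCP options actually sent (e.g. timestamps) */

		/* Largest TSO burst the driver may be handed. */
		unsigned max_len = if_hw_tsomax - hdrlen - ipsec_optlen - max_linkhdr;
		/* Size of each segment the NIC will cut, advertised via tso_segsz. */
		unsigned tso_segsz = t_maxseg - optlen - ipsec_optlen;

		printf("max_len = %u, tso_segsz = %u\n", max_len, tso_segsz);
		return (0);
	}
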
@@ -1303,8 +1310,8 @@ send: tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); - th->th_flags |= TH_URG; - } else + flags |= TH_URG; + } else { /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window @@ -1312,6 +1319,8 @@ send: * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ + } + tcp_set_flags(th, flags); /* * Put TCP length in extended header, and then @@ -1390,10 +1399,10 @@ send: * The TCP pseudo header checksum is always provided. */ if (tso) { - KASSERT(len > tp->t_maxseg - optlen, + KASSERT(len > tp->t_maxseg - optlen - ipsec_optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen - ipsec_optlen; } KASSERT(len + hdrlen == m_length(m, NULL), @@ -1454,10 +1463,6 @@ send: TCP_PROBE5(send, NULL, tp, ip6, tp, th); -#ifdef TCPPCAP - /* Save packet, if requested. */ - tcp_pcap_add(th, m, &(tp->t_outpkts)); -#endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, @@ -1500,11 +1505,6 @@ send: TCP_PROBE5(send, NULL, tp, ip, tp, th); -#ifdef TCPPCAP - /* Save packet, if requested. */ - tcp_pcap_add(th, m, &(tp->t_outpkts)); -#endif - error = ip_output(m, inp->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); @@ -1633,11 +1633,20 @@ timer: tp->snd_max = tp->snd_nxt + xlen; } if ((error == 0) && - (TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->t_flags & TF_SACK_PERMIT) && - tp->rcv_numsacks > 0)) { - /* Clean up any DSACK's sent */ - tcp_clean_dsack_blocks(tp); + (tp->rcv_numsacks > 0) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT)) { + /* Clean up any DSACK's sent */ + tcp_clean_dsack_blocks(tp); + } + if ((error == 0) && + sack_rxmit && + SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) { + /* + * When transmitting from SACK scoreboard + * after an RTO, pull snd_nxt along. + */ + tp->snd_nxt = SEQ_MIN(p->rxmit, p->end); } if (error) { /* @@ -1672,7 +1681,7 @@ timer: if (IN_RECOVERY(tp->t_flags)) tp->sackhint.prr_out -= len; } - SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ + SOCK_SENDBUF_UNLOCK_ASSERT(so); /* Check gotos. */ switch (error) { case EACCES: case EPERM: @@ -1680,7 +1689,7 @@ timer: return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); - tp->snd_cwnd = tp->t_maxseg; + tp->snd_cwnd = tcp_maxseg(tp); return (0); case EMSGSIZE: /* diff --git a/sys/netinet/tcp_pcap.c b/sys/netinet/tcp_pcap.c deleted file mode 100644 index f26287bd7f03..000000000000 --- a/sys/netinet/tcp_pcap.c +++ /dev/null @@ -1,452 +0,0 @@ -/*- - * Copyright (c) 2015 - * Jonathan Looney. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/queue.h> -#include <sys/param.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/sysctl.h> -#include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/eventhandler.h> -#include <machine/atomic.h> -#include <netinet/in.h> -#include <netinet/in_pcb.h> -#include <netinet/tcp_var.h> -#include <netinet/tcp_pcap.h> - -#define M_LEADINGSPACE_NOWRITE(m) \ - ((m)->m_data - M_START(m)) - -int tcp_pcap_aggressive_free = 1; -static int tcp_pcap_clusters_referenced_cur = 0; -static int tcp_pcap_clusters_referenced_max = 0; - -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free, - CTLFLAG_RW, &tcp_pcap_aggressive_free, 0, - "Free saved packets when the memory system comes under pressure"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur, - CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0, - "Number of clusters currently referenced on TCP PCAP queues"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max, - CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0, - "Maximum number of clusters allowed to be referenced on TCP PCAP " - "queues"); - -static int tcp_pcap_alloc_reuse_ext = 0; -static int tcp_pcap_alloc_reuse_mbuf = 0; -static int tcp_pcap_alloc_new_mbuf = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext, - CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0, - "Number of mbufs with external storage reused for the TCP PCAP " - "functionality"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf, - CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0, - "Number of mbufs with internal storage reused for the TCP PCAP " - "functionality"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf, - CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0, - "Number of new mbufs allocated for the TCP PCAP functionality"); - -VNET_DEFINE(int, tcp_pcap_packets) = 0; -#define V_tcp_pcap_packets VNET(tcp_pcap_packets) -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0, - "Default number of packets saved per direction per TCPCB"); - -/* Initialize the values. */ -static void -tcp_pcap_max_set(void) -{ - - tcp_pcap_clusters_referenced_max = nmbclusters / 4; -} - -void -tcp_pcap_init(void) -{ - - tcp_pcap_max_set(); - EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set, - NULL, EVENTHANDLER_PRI_ANY); -} - -/* - * If we are below the maximum allowed cluster references, - * increment the reference count and return TRUE. Otherwise, - * leave the reference count alone and return FALSE. 
- */ -static __inline bool -tcp_pcap_take_cluster_reference(void) -{ - if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >= - tcp_pcap_clusters_referenced_max) { - atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1); - return FALSE; - } - return TRUE; -} - -/* - * For all the external entries in m, apply the given adjustment. - * This can be used to adjust the counter when an mbuf chain is - * copied or freed. - */ -static __inline void -tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj) -{ - while (m) { - if (m->m_flags & M_EXT) - atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj); - - m = m->m_next; - } -} - -/* - * Free all mbufs in a chain, decrementing the reference count as - * necessary. - * - * Functions in this file should use this instead of m_freem() when - * they are freeing mbuf chains that may contain clusters that were - * already included in tcp_pcap_clusters_referenced_cur. - */ -static void -tcp_pcap_m_freem(struct mbuf *mb) -{ - while (mb != NULL) { - if (mb->m_flags & M_EXT) - atomic_subtract_int(&tcp_pcap_clusters_referenced_cur, - 1); - mb = m_free(mb); - } -} - -/* - * Copy data from m to n, where n cannot fit all the data we might - * want from m. - * - * Prioritize data like this: - * 1. TCP header - * 2. IP header - * 3. Data - */ -static void -tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n) -{ - struct mbuf *m_cur = m; - int bytes_to_copy=0, trailing_data, skip=0, tcp_off; - - /* Below, we assume these will be non-NULL. */ - KASSERT(th, ("%s: called with th == NULL", __func__)); - KASSERT(m, ("%s: called with m == NULL", __func__)); - KASSERT(n, ("%s: called with n == NULL", __func__)); - - /* We assume this initialization occurred elsewhere. */ - KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)", - __func__, n->m_len)); - KASSERT(n->m_data == M_START(n), - ("%s: called with n->m_data != M_START(n)", __func__)); - - /* - * Calculate the size of the TCP header. We use this often - * enough that it is worth just calculating at the start. - */ - tcp_off = th->th_off << 2; - - /* Trim off leading empty mbufs. */ - while (m && m->m_len == 0) - m = m->m_next; - - if (m) { - m_cur = m; - } - else { - /* - * No data? Highly unusual. We would expect to at - * least see a TCP header in the mbuf. - * As we have a pointer to the TCP header, I guess - * we should just copy that. (???) - */ -fallback: - bytes_to_copy = tcp_off; - if (bytes_to_copy > M_SIZE(n)) - bytes_to_copy = M_SIZE(n); - bcopy(th, n->m_data, bytes_to_copy); - n->m_len = bytes_to_copy; - return; - } - - /* - * Find TCP header. Record the total number of bytes up to, - * and including, the TCP header. - */ - while (m_cur) { - if ((caddr_t) th >= (caddr_t) m_cur->m_data && - (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len)) - break; - bytes_to_copy += m_cur->m_len; - m_cur = m_cur->m_next; - } - if (m_cur) - bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data; - else - goto fallback; - bytes_to_copy += tcp_off; - - /* - * If we already want to copy more bytes than we can hold - * in the destination mbuf, skip leading bytes and copy - * what we can. - * - * Otherwise, consider trailing data. - */ - if (bytes_to_copy > M_SIZE(n)) { - skip = bytes_to_copy - M_SIZE(n); - bytes_to_copy = M_SIZE(n); - } - else { - /* - * Determine how much trailing data is in the chain. 
- * We start with the length of this mbuf (the one - * containing th) and subtract the size of the TCP - * header (tcp_off) and the size of the data prior - * to th (th - m_cur->m_data). - * - * This *should not* be negative, as the TCP code - * should put the whole TCP header in a single - * mbuf. But, it isn't a problem if it is. We will - * simple work off our negative balance as we look - * at subsequent mbufs. - */ - trailing_data = m_cur->m_len - tcp_off; - trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data; - m_cur = m_cur->m_next; - while (m_cur) { - trailing_data += m_cur->m_len; - m_cur = m_cur->m_next; - } - if ((bytes_to_copy + trailing_data) > M_SIZE(n)) - bytes_to_copy = M_SIZE(n); - else - bytes_to_copy += trailing_data; - } - - m_copydata(m, skip, bytes_to_copy, n->m_data); - n->m_len = bytes_to_copy; -} - -void -tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue) -{ - struct mbuf *n = NULL, *mhead; - - KASSERT(th, ("%s: called with th == NULL", __func__)); - KASSERT(m, ("%s: called with m == NULL", __func__)); - KASSERT(queue, ("%s: called with queue == NULL", __func__)); - - /* We only care about data packets. */ - while (m && m->m_type != MT_DATA) - m = m->m_next; - - /* We only need to do something if we still have an mbuf. */ - if (!m) - return; - - /* If we are not saving mbufs, return now. */ - if (queue->mq_maxlen == 0) - return; - - /* - * Check to see if we will need to recycle mbufs. - * - * If we need to get rid of mbufs to stay below - * our packet count, try to reuse the mbuf. Once - * we already have a new mbuf (n), then we can - * simply free subsequent mbufs. - * - * Note that most of the logic in here is to deal - * with the reuse. If we are fine with constant - * mbuf allocs/deallocs, we could ditch this logic. - * But, it only seems to make sense to reuse - * mbufs we already have. - */ - while (mbufq_full(queue)) { - mhead = mbufq_dequeue(queue); - - if (n) { - tcp_pcap_m_freem(mhead); - } - else { - /* - * If this held an external cluster, try to - * detach the cluster. But, if we held the - * last reference, go through the normal - * free-ing process. - */ - if (mhead->m_flags & M_EXTPG) { - /* Don't mess around with these. */ - tcp_pcap_m_freem(mhead); - continue; - } else if (mhead->m_flags & M_EXT) { - switch (mhead->m_ext.ext_type) { - case EXT_SFBUF: - /* Don't mess around with these. */ - tcp_pcap_m_freem(mhead); - continue; - default: - if (atomic_fetchadd_int( - mhead->m_ext.ext_cnt, -1) == 1) - { - /* - * We held the last reference - * on this cluster. Restore - * the reference count and put - * it back in the pool. - */ - *(mhead->m_ext.ext_cnt) = 1; - tcp_pcap_m_freem(mhead); - continue; - } - /* - * We were able to cleanly free the - * reference. - */ - atomic_subtract_int( - &tcp_pcap_clusters_referenced_cur, - 1); - tcp_pcap_alloc_reuse_ext++; - break; - } - } else { - tcp_pcap_alloc_reuse_mbuf++; - } - - n = mhead; - tcp_pcap_m_freem(n->m_next); - m_init(n, M_NOWAIT, MT_DATA, 0); - } - } - - /* Check to see if we need to get a new mbuf. */ - if (!n) { - if (!(n = m_get(M_NOWAIT, MT_DATA))) - return; - tcp_pcap_alloc_new_mbuf++; - } - - /* - * What are we dealing with? If a cluster, attach it. Otherwise, - * try to copy the data from the beginning of the mbuf to the - * end of data. (There may be data between the start of the data - * area and the current data pointer. We want to get this, because - * it may contain header information that is useful.) 
- * In cases where that isn't possible, settle for what we can - * get. - */ - if ((m->m_flags & (M_EXT | M_EXTPG)) && - tcp_pcap_take_cluster_reference()) { - n->m_data = m->m_data; - n->m_len = m->m_len; - mb_dupcl(n, m); - } - else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) { - /* - * At this point, n is guaranteed to be a normal mbuf - * with no cluster and no packet header. Because the - * logic in this code block requires this, the assert - * is here to catch any instances where someone - * changes the logic to invalidate that assumption. - */ - KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0, - ("%s: Unexpected flags (%#x) for mbuf", - __func__, n->m_flags)); - n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m); - n->m_len = m->m_len; - if (m->m_flags & M_EXTPG) - m_copydata(m, 0, m->m_len, n->m_data); - else - bcopy(M_START(m), n->m_dat, - m->m_len + M_LEADINGSPACE_NOWRITE(m)); - } - else { - /* - * This is the case where we need to "settle for what - * we can get". The most probable way to this code - * path is that we've already taken references to the - * maximum number of mbuf clusters we can, and the data - * is too long to fit in an mbuf's internal storage. - * Try for a "best fit". - */ - tcp_pcap_copy_bestfit(th, m, n); - - /* Don't try to get additional data. */ - goto add_to_queue; - } - - if (m->m_next) { - n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT); - tcp_pcap_adj_cluster_reference(n->m_next, 1); - } - -add_to_queue: - /* Add the new mbuf to the list. */ - if (mbufq_enqueue(queue, n)) { - /* This shouldn't happen. If INVARIANTS is defined, panic. */ - KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__)); - tcp_pcap_m_freem(n); - } -} - -void -tcp_pcap_drain(struct mbufq *queue) -{ - struct mbuf *m; - while ((m = mbufq_dequeue(queue))) - tcp_pcap_m_freem(m); -} - -void -tcp_pcap_tcpcb_init(struct tcpcb *tp) -{ - mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets); - mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets); -} - -void -tcp_pcap_set_sock_max(struct mbufq *queue, int newval) -{ - queue->mq_maxlen = newval; - while (queue->mq_len > queue->mq_maxlen) - tcp_pcap_m_freem(mbufq_dequeue(queue)); -} - -int -tcp_pcap_get_sock_max(struct mbufq *queue) -{ - return queue->mq_maxlen; -} diff --git a/sys/netinet/tcp_pcap.h b/sys/netinet/tcp_pcap.h deleted file mode 100644 index 8250c06d4ce0..000000000000 --- a/sys/netinet/tcp_pcap.h +++ /dev/null @@ -1,39 +0,0 @@ -/*- - * Copyright (c) 2015 - * Jonathan Looney. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _NETINET_TCP_PCAP_H_ -#define _NETINET_TCP_PCAP_H_ - -void tcp_pcap_init(void); -void tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue); -void tcp_pcap_drain(struct mbufq *queue); -void tcp_pcap_tcpcb_init(struct tcpcb *tp); -void tcp_pcap_set_sock_max(struct mbufq *queue, int newval); -int tcp_pcap_get_sock_max(struct mbufq *queue); - -extern int tcp_pcap_aggressive_free; - -#endif /* _NETINET_TCP_PCAP_H_ */ diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c index 1834c702c493..a0e837cc7d76 100644 --- a/sys/netinet/tcp_ratelimit.c +++ b/sys/netinet/tcp_ratelimit.c @@ -246,10 +246,10 @@ const uint64_t desired_rates[] = { #define RS_ONE_GIGABIT_PERSEC 1000000000 #define RS_TEN_GIGABIT_PERSEC 10000000000 -static struct head_tcp_rate_set int_rs; +static struct head_tcp_rate_set int_rs = CK_LIST_HEAD_INITIALIZER(); static struct mtx rs_mtx; -uint32_t rs_number_alive; -uint32_t rs_number_dead; +uint32_t rs_number_alive = 0; +uint32_t rs_number_dead = 0; static uint32_t rs_floor_mss = 0; static uint32_t wait_time_floor = 8000; /* 8 ms */ static uint32_t rs_hw_floor_mss = 16; @@ -1298,6 +1298,12 @@ tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) NET_EPOCH_EXIT(et); } +void +tcp_rl_release_ifnet(struct ifnet *ifp) +{ + tcp_rl_ifnet_departure(NULL, ifp); +} + static void tcp_rl_shutdown(void *arg __unused, int howto __unused) { @@ -1772,9 +1778,6 @@ static eventhandler_tag rl_shutdown_start; static void tcp_rs_init(void *st __unused) { - CK_LIST_INIT(&int_rs); - rs_number_alive = 0; - rs_number_dead = 0; mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, tcp_rl_ifnet_departure, diff --git a/sys/netinet/tcp_ratelimit.h b/sys/netinet/tcp_ratelimit.h index cd540d1164e1..0ce42dea0d90 100644 --- a/sys/netinet/tcp_ratelimit.h +++ b/sys/netinet/tcp_ratelimit.h @@ -94,6 +94,8 @@ CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set); #ifndef ETHERNET_SEGMENT_SIZE #define ETHERNET_SEGMENT_SIZE 1514 #endif +struct tcpcb; + #ifdef RATELIMIT #define DETAILED_RATELIMIT_SYSCTL 1 /* * Undefine this if you don't want @@ -131,6 +133,9 @@ tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segs void tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte); +void +tcp_rl_release_ifnet(struct ifnet *ifp); + #else static inline const struct tcp_hwrate_limit_table * tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, @@ -218,6 +223,10 @@ tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte) { } +static inline void +tcp_rl_release_ifnet(struct ifnet *ifp) +{ +} #endif /* diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c index 5768d90a9337..5f73e83dc8a9 100644 --- a/sys/netinet/tcp_reass.c +++ b/sys/netinet/tcp_reass.c @@ -957,7 +957,7 @@ new_entry: flags = tcp_get_flags(th) & TH_FIN; TCPSTAT_INC(tcps_rcvoopack); TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & 
SBS_CANTRCVMORE) { m_freem(m); } else { @@ -1058,7 +1058,7 @@ present: #endif return (0); } - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_flags & TH_FIN; diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index f59cc5fe0d0b..90d789f0e224 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -137,6 +137,11 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, lrd, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_lrd), 1, "Perform Lost Retransmission Detection"); +VNET_DEFINE(int, tcp_sack_tso) = 0; +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_sack_tso), 0, + "Allow TSO during SACK loss recovery"); + VNET_DEFINE(int, tcp_sack_maxholes) = 128; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_maxholes), 0, @@ -558,6 +563,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) int i, j, num_sack_blks; sackstatus_t sack_changed; int delivered_data, left_edge_delta; + int maxseg = tp->t_maxseg - MAX_TCPOPTLEN; tcp_seq loss_hiack = 0; int loss_thresh = 0; @@ -604,7 +610,9 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && - SEQ_LEQ(sack.end, tp->snd_max)) { + SEQ_LEQ(sack.end, tp->snd_max) && + ((sack.end - sack.start) >= maxseg || + SEQ_GEQ(sack.end, tp->snd_max))) { sack_blocks[num_sack_blks++] = sack; } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { @@ -645,8 +653,6 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); - tp->sackhint.sacked_bytes = 0; /* reset */ - tp->sackhint.hole_bytes = 0; } /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and @@ -862,12 +868,26 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) } } - KASSERT(!(TAILQ_EMPTY(&tp->snd_holes) && (tp->sackhint.hole_bytes != 0)), - ("SACK scoreboard empty, but accounting non-zero\n")); - + KASSERT(delivered_data >= 0, ("delivered_data < 0")); KASSERT(notlost_bytes <= tp->sackhint.hole_bytes, ("SACK: more bytes marked notlost than in scoreboard holes")); + if (TAILQ_EMPTY(&tp->snd_holes)) { + KASSERT(tp->sackhint.hole_bytes == 0, + ("SACK scoreboard empty, but accounting non-zero\n")); + tp->sackhint.sack_bytes_rexmit = 0; + tp->sackhint.sacked_bytes = 0; + tp->sackhint.lost_bytes = 0; + } else { + KASSERT(tp->sackhint.hole_bytes > 0, + ("SACK scoreboard not empty, but has no bytes\n")); + tp->sackhint.delivered_data = delivered_data; + tp->sackhint.sacked_bytes += delivered_data - left_edge_delta; + KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0")); + tp->sackhint.lost_bytes = tp->sackhint.hole_bytes - + notlost_bytes; + } + if (!(to->to_flags & TOF_SACK)) /* * If this ACK did not contain any @@ -878,11 +898,6 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) * for RFC6675 rescue retransmission. 
*/ sack_changed = SACK_NOCHANGE; - tp->sackhint.delivered_data = delivered_data; - tp->sackhint.sacked_bytes += delivered_data - left_edge_delta; - tp->sackhint.lost_bytes = tp->sackhint.hole_bytes - notlost_bytes; - KASSERT((delivered_data >= 0), ("delivered_data < 0")); - KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0")); return (sack_changed); } @@ -953,16 +968,15 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp) /* Send one or 2 segments based on how much new data was acked. */ if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2) num_segs = 2; - if (V_tcp_do_newsack) { - tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack + - tp->sackhint.sack_bytes_rexmit - - tp->sackhint.sacked_bytes - - tp->sackhint.lost_bytes, maxseg) + - num_segs * maxseg; - } else { + if (tp->snd_nxt == tp->snd_max) { tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + - imax(0, tp->snd_nxt - tp->snd_recover) + - num_segs * maxseg); + (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg); + } else { + /* + * Since cwnd is not the expected flightsize during + * SACK LR, not deflating cwnd allows the partial + * ACKed amount to be sent. + */ } if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; @@ -998,7 +1012,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp) highdata--; highdata = SEQ_MIN(highdata, tp->snd_recover); if (SEQ_LT(th->th_ack, highdata)) { - tp->snd_fack = th->th_ack; + tp->snd_fack = SEQ_MAX(th->th_ack, tp->snd_fack); if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, highdata - maxseg), highdata, NULL)) != NULL) { tp->sackhint.hole_bytes += @@ -1068,41 +1082,47 @@ tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. + * In addition, cwnd will be inflated by the sacked bytes traversed when + * moving snd_nxt forward. This prevents a traffic burst after the final + * full ACK, and also keeps ACKs coming back. 
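As a concrete illustration of that return value: suppose the scoreboard holds holes [1000,2000) and [3000,4000), snd_fack is 5000, and snd_nxt sits at 2000. The walk skips the SACKed range [2000,3000), moves snd_nxt to 3000, and reports 1000 bytes for the caller to credit against cwnd. A simplified model of the walk with these hypothetical numbers (ordinary integers instead of modular sequence space):

	#include <stdio.h>

	struct hole { unsigned start, end; };

	int
	main(void)
	{
		/* Illustrative scoreboard: SACKed data lives between the holes. */
		struct hole holes[] = { { 1000, 2000 }, { 3000, 4000 } };
		unsigned nholes = 2, snd_fack = 5000, snd_nxt = 2000, sacked = 0, i;

		for (i = 0; i + 1 < nholes && snd_nxt >= holes[i].end; i++) {
			sacked += holes[i + 1].start - holes[i].end;	/* SACKed span skipped */
			if (snd_nxt < holes[i + 1].start) {
				snd_nxt = holes[i + 1].start;	/* jump over delivered data */
				break;
			}
		}
		if (i + 1 == nholes && snd_nxt >= holes[i].end)
			snd_nxt = snd_fack;	/* past the last hole entirely */

		/* Prints snd_nxt=3000 sacked=1000. */
		printf("snd_nxt=%u sacked=%u\n", snd_nxt, sacked);
		return (0);
	}
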
*/ -void +int tcp_sack_adjust(struct tcpcb *tp) { + int sacked = 0; struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (cur == NULL) { /* No holes */ - return; + return (0); } if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) { /* We're already beyond any SACKed blocks */ - return; + return (tp->sackhint.sacked_bytes); } - /*- + /* * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) { - return; + return (sacked); } + sacked += p->start - cur->end; if (SEQ_GEQ(tp->snd_nxt, p->start)) { cur = p; } else { tp->snd_nxt = p->start; - return; + return (sacked); } } if (SEQ_LT(tp->snd_nxt, cur->end)) { - return; + return (sacked); } tp->snd_nxt = tp->snd_fack; + return (tp->sackhint.sacked_bytes); } /* diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 946b65cda6a5..d2636f01714e 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -607,7 +607,7 @@ activate_rxt: TCPT_RANGESET_NOSLOP(to, tov, (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC), (bbr->rc_max_rto_sec * USECS_IN_SECOND)); - bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to); + bbr_log_timer_var(bbr, 2, cts, 0, bbr_get_rtt(bbr, BBR_SRTT), 0, to); return (to); } return (0); @@ -978,14 +978,6 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock * and we do */ return; - } else if (sbavail(&inp->inp_socket->so_snd) && - (tmr_up == PACE_TMR_RXT)) { - /* - * if we hit enobufs then we would expect the - * possibility of nothing outstanding and the RXT up - * (and the hptsi timer). - */ - return; } else if (((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && @@ -2356,11 +2348,11 @@ bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num) log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; - ar = (uint64_t)(bbr->r_ctl.rc_resend); + ar = (uintptr_t)(bbr->r_ctl.rc_resend); ar >>= 32; ar &= 0x00000000ffffffff; log.u_bbr.flex4 = (uint32_t)ar; - ar = (uint64_t)bbr->r_ctl.rc_resend; + ar = (uintptr_t)bbr->r_ctl.rc_resend; ar &= 0x00000000ffffffff; log.u_bbr.flex5 = (uint32_t)ar; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); @@ -2718,12 +2710,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; + uint64_t ifp64 = (uintptr_t)ifp; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); - log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); - log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); + log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff); log.u_bbr.bw_inuse = rate; log.u_bbr.flex5 = line; log.u_bbr.flex6 = error; @@ -2992,9 +2985,6 @@ use_initial_window: /* We should not be at 0, go to the initial window then */ goto use_initial_window; } - if (bw < 1) - /* Probably should panic */ - bw = 1; if (bw < min_bw) bw = min_bw; return (bw); @@ -3842,7 +3832,7 @@ bbr_post_recovery(struct tcpcb *tp) else if (bbr->r_ctl.rc_delivered == 0) lr2use = 1000; else { - lr2use = bbr->r_ctl.rc_lost * 1000; + lr2use = (uint64_t)bbr->r_ctl.rc_lost * (uint64_t)1000; lr2use /= 
bbr->r_ctl.rc_delivered; } lr2use += bbr->r_ctl.recovery_lr; @@ -4613,7 +4603,7 @@ need_retran: */ if (collapsed_win == 0) { rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); - if (rsm && (BBR_ACKED | BBR_HAS_FIN)) { + if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) { rsm = bbr_find_high_nonack(bbr, rsm); } if (rsm == NULL) { @@ -5134,6 +5124,16 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not + * process incoming SACK's since we are + * subject to attack in such a case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } @@ -5542,7 +5542,7 @@ lost_rate: bbr_type_log_hdwr_pacing(bbr, bbr->r_ctl.crte->ptbl->rs_ifp, rate, - ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate), + bbr->r_ctl.crte->rate, __LINE__, cts, error); @@ -6318,8 +6318,6 @@ tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts) } /* Round it up */ rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1))); - if (rtt_ticks == 0) - rtt_ticks = 1; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the @@ -6703,7 +6701,7 @@ bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } - bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts, + bbr_log_type_bbrrttprop(bbr, rtt, rsm->r_end, uts, cts, match, rsm->r_start, rsm->r_flags); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) { @@ -6783,8 +6781,6 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, t = cts - rsm->r_tim_lastsent[0]; else t = 1; - if ((int)t <= 0) - t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); @@ -6825,8 +6821,6 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, t = cts - rsm->r_tim_lastsent[i]; else t = 1; - if ((int)t <= 0) - t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, rsm->r_tim_lastsent[i], ack_type, to); @@ -7313,11 +7307,9 @@ bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg); th_ack = th->th_ack; if (SEQ_GT(th_ack, tp->snd_una)) { - acked = th_ack - tp->snd_una; bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__); bbr->rc_tp->t_acktime = ticks; - } else - acked = 0; + } if (SEQ_LEQ(th_ack, tp->snd_una)) { /* Only sent here for sack processing */ goto proc_sack; @@ -7556,7 +7548,7 @@ proc_sack: * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. 
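One of the quieter fixes earlier in this file is the lr2use computation in bbr_post_recovery(): rc_lost is a 32-bit counter, so multiplying by 1000 before widening could wrap and understate the loss ratio; casting to uint64_t first avoids that. A tiny standalone illustration (the 6 MB loss figure is made up):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t rc_lost = 6000000;	/* assumed bytes lost in this recovery */

		/* Old expression: the 32-bit multiply wraps before it is widened. */
		uint64_t wrong = rc_lost * 1000u;
		/* New expression: widen first, then multiply. */
		uint64_t right = (uint64_t)rc_lost * (uint64_t)1000;

		/* Prints wrong=1705032704 right=6000000000. */
		printf("wrong=%llu right=%llu\n",
		    (unsigned long long)wrong, (unsigned long long)right);
		return (0);
	}
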
*/ - new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks, + new_sb = sack_filter_blks(tp, &bbr->r_ctl.bbr_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks); BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks); @@ -7700,6 +7692,43 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, bbr = (struct tcp_bbr *)tp->t_fb_ptr; lost = bbr->r_ctl.rc_lost; nsegs = max(1, m->m_pkthdr.lro_nsegs); + if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) { + /* Checking SEG.ACK against ISS is definitely redundant. */ + tp->t_flags2 |= TF2_NO_ISS_CHECK; + } + if (!V_tcp_insecure_ack) { + tcp_seq seq_min; + bool ghost_ack_check; + + if (tp->t_flags2 & TF2_NO_ISS_CHECK) { + /* Check for too old ACKs (RFC 5961, Section 5.2). */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } else { + if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { + /* Checking for ghost ACKs is stricter. */ + seq_min = tp->iss + 1; + ghost_ack_check = true; + } else { + /* + * Checking for too old ACKs (RFC 5961, + * Section 5.2) is stricter. + */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } + } + if (SEQ_LT(th->th_ack, seq_min)) { + if (ghost_ack_check) + TCPSTAT_INC(tcps_rcvghostack); + else + TCPSTAT_INC(tcps_rcvacktooold); + /* Send challenge ACK. */ + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + bbr->r_wanted_output = 1; + return (1); + } + } if (SEQ_GT(th->th_ack, tp->snd_max)) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); bbr->r_wanted_output = 1; @@ -7775,7 +7804,7 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, (int)(ticks - tp->t_badrxtwin) < 0) bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); } - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); @@ -8247,7 +8276,7 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, thflags = tcp_get_flags(th) & TH_FIN; KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else @@ -8480,7 +8509,7 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. 
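The ACK validation added above follows RFC 5961 section 5.2: an ACK is only acceptable if it is not older than snd_una - max_sndwnd and, until the window has moved far enough past the ISS, not below iss + 1 (the ghost-ACK case). A self-contained sketch of the bound selection, assuming the path where TF2_NO_ISS_CHECK is not yet set and using made-up sequence numbers:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	/* Modular sequence-number compares, mirroring the kernel's SEQ_GT()/SEQ_LT(). */
	#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
	#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

	int
	main(void)
	{
		uint32_t iss = 1000, snd_una = 5000, max_sndwnd = 65535;
		uint32_t th_ack = 900;		/* ACK below the ISS: a "ghost" ACK */
		uint32_t seq_min;
		bool ghost_ack_check;

		if (SEQ_GT(iss + 1, snd_una - max_sndwnd)) {
			seq_min = iss + 1;		/* ghost-ACK bound is stricter */
			ghost_ack_check = true;
		} else {
			seq_min = snd_una - max_sndwnd;	/* RFC 5961 5.2 too-old bound */
			ghost_ack_check = false;
		}
		if (SEQ_LT(th_ack, seq_min))
			printf("drop and send challenge ACK (%s)\n",
			    ghost_ack_check ? "ghost ACK" : "too old");
		return (0);
	}
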
*/ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { @@ -8734,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -8936,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -8948,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -8981,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -9259,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9356,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9506,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9608,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9710,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, 
BANDLIM_TCP_RST, tlen); return (1); } } @@ -9819,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -10121,7 +10150,7 @@ bbr_init(struct tcpcb *tp, void **ptr) tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, tcp_rexmit_max); bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); return (0); } @@ -10269,10 +10298,6 @@ bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; } bbr->rc_bbr_substate++; - if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { - /* Cycle back to first state-> gain */ - bbr->rc_bbr_substate = 0; - } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * We enter the gain(5/4) cycle (possibly less if @@ -11323,7 +11348,14 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); - + if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { + /* + * We don't look at sack's from the + * peer because the MSS is too small which + * can subject us to an attack. + */ + to.to_flags &= ~TOF_SACK; + } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop @@ -11478,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tiwin > bbr->r_ctl.rc_high_rwnd) @@ -12083,7 +12115,7 @@ again: len = 0; rsm = NULL; if (flags & TH_RST) { - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); goto send; } recheck_resend: @@ -12150,7 +12182,7 @@ recheck_resend: } else { /* Retransmitting SYN */ rsm = NULL; - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); goto send; } } else @@ -12249,7 +12281,7 @@ recheck_resend: kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a @@ -12564,7 +12596,6 @@ recheck_resend: (len > maxseg) && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && ipoptlen == 0) tso = 1; @@ -12667,7 +12698,7 @@ recheck_resend: * No reason to send a segment, just return. */ just_return: - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); just_return_nolock: if (tot_len) slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); @@ -12775,7 +12806,7 @@ send: len--; } } - SOCKBUF_LOCK_ASSERT(sb); + SOCK_SENDBUF_LOCK_ASSERT(so); if (len > 0) { if ((tp->snd_una == tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { @@ -12891,7 +12922,7 @@ send: if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); @@ -12982,7 +13013,7 @@ send: * byte of the payload can be put into the * TCP segment. */ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); error = EMSGSIZE; sack_rxmit = 0; goto out; @@ -13052,7 +13083,7 @@ send: if (m == NULL) { BBR_STAT_INC(bbr_failed_mbuf_aloc); bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); error = ENOBUFS; sack_rxmit = 0; goto out; @@ -13096,7 +13127,7 @@ send: * is the only thing to do. */ BBR_STAT_INC(bbr_offset_drop); - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); (void)m_free(m); return (-EFAULT); /* tcp_drop() */ } @@ -13156,7 +13187,7 @@ send: tso = 0; } if (m->m_next == NULL) { - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; @@ -13192,9 +13223,9 @@ send: !(flags & TH_SYN)) { flags |= TH_PUSH; } - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); } else { - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW) KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) @@ -13220,7 +13251,7 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } - SOCKBUF_UNLOCK_ASSERT(sb); + SOCK_SENDBUF_UNLOCK_ASSERT(so); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); @@ -13712,7 +13743,7 @@ nomore: * Everything else will just have to retransmit with the timer * (no pacer). */ - SOCKBUF_UNLOCK_ASSERT(sb); + SOCK_SENDBUF_UNLOCK_ASSERT(so); BBR_STAT_INC(bbr_saw_oerr); /* Clear all delay/early tracks */ bbr->r_ctl.rc_hptsi_agg_delay = 0; @@ -13773,6 +13804,16 @@ nomore: if (old_maxseg <= tp->t_maxseg) { /* Huh it did not shrink? */ tp->t_maxseg = old_maxseg - 40; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not + * process incoming SACK's since we are + * subject to attack in such a case. 
+ */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); } /* @@ -13802,6 +13843,7 @@ nomore: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + error = 0; } /* FALLTHROUGH */ default: @@ -14116,7 +14158,7 @@ struct tcp_function_block __tcp_bbr = { .tfb_tcp_mtu_chg = bbr_mtu_chg, .tfb_pru_options = bbr_pru_options, .tfb_switch_failed = bbr_switch_failed, - .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, + .tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK, }; /* @@ -14149,10 +14191,8 @@ bbr_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) case TCP_BBR_ALGORITHM: case TCP_BBR_TSLIMITS: case TCP_BBR_IWINTSO: - case TCP_BBR_RECFORCE: case TCP_BBR_STARTUP_PG: case TCP_BBR_DRAIN_PG: - case TCP_BBR_RWND_IS_APP: case TCP_BBR_PROBE_RTT_INT: case TCP_BBR_PROBE_RTT_GAIN: case TCP_BBR_PROBE_RTT_LEN: @@ -14526,6 +14566,7 @@ bbr_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) { struct inpcb *inp = tptoinpcb(tp); struct tcp_bbr *bbr; + uint64_t loptval; int32_t error, optval; bbr = (struct tcp_bbr *)tp->t_fb_ptr; @@ -14586,7 +14627,7 @@ bbr_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) optval = bbr->rc_loss_exit; break; case TCP_BBR_USEDEL_RATE: - error = EINVAL; + loptval = get_filter_value(&bbr->r_ctl.rc_delrate); break; case TCP_BBR_MIN_RTO: optval = bbr->r_ctl.rc_min_rto_ms; @@ -14670,7 +14711,10 @@ bbr_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) break; } INP_WUNLOCK(inp); - error = sooptcopyout(sopt, &optval, sizeof optval); + if (sopt->sopt_name == TCP_BBR_USEDEL_RATE) + error = sooptcopyout(sopt, &loptval, sizeof loptval); + else + error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 669d213e58fb..5280f18dc983 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -40,7 +40,6 @@ #endif #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/lock.h> #include <sys/mutex.h> #include <sys/mbuf.h> #include <sys/proc.h> /* for proc0 declaration */ @@ -193,20 +192,12 @@ static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 * - 60 seconds */ -static uint16_t rack_policer_rxt_thresh= 0; /* 499 = 49.9%, 0 is off */ -static uint8_t rack_policer_avg_thresh = 0; /* 3.2 */ -static uint8_t rack_policer_med_thresh = 0; /* 1 - 16 */ -static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */ -static uint64_t rack_pol_min_bw = 125000; /* 1mbps in Bytes per sec */ -static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */ -static uint32_t rack_policing_do_bw_comp = 1; static uint32_t rack_pcm_every_n_rounds = 100; static uint32_t rack_pcm_blast = 0; static uint32_t rack_pcm_is_enabled = 1; -static uint8_t rack_req_del_mss = 18; /* How many segments need to be sent in a recovery episode to do policer_detection */ static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ -static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */ +static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */ static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req 
where we exit ss */ @@ -220,7 +211,6 @@ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; static uint32_t rack_merge_out_sacks_on_attack = 0; static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ -static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */ static int32_t rack_hw_rate_caps = 0; /* 1; */ static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ static int32_t rack_hw_rate_min = 0; /* 1500000;*/ @@ -271,7 +261,7 @@ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ -static int32_t rack_full_buffer_discount = 10; + /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -364,8 +354,6 @@ static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a va static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ -static int32_t rack_use_max_for_nobackoff = 0; -static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */ static int32_t rack_timely_no_stopping = 0; static int32_t rack_down_raise_thresh = 100; static int32_t rack_req_segs = 1; @@ -392,7 +380,6 @@ counter_u64_t rack_tlp_retran; counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_to_tot; counter_u64_t rack_hot_alloc; -counter_u64_t tcp_policer_detected; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; @@ -536,7 +523,7 @@ static int32_t rack_output(struct tcpcb *tp); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, - uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz); + uint32_t cts, uint32_t segsiz); static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); @@ -558,9 +545,6 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); -static void -rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz); - static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -633,9 +617,10 @@ rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) { struct sockopt sopt; struct cc_newreno_opts opt; - struct newreno old; struct tcpcb *tp; - int error, failed = 0; + uint32_t old_beta; + uint32_t old_beta_ecn; + int error = 0, failed = 0; tp = rack->rc_tp; if (tp->t_cc == NULL) { @@ -663,33 +648,34 @@ rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) failed = 3; goto out; } - old.beta = opt.val; + old_beta = opt.val; opt.name = CC_NEWRENO_BETA_ECN; error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); if (error) { failed = 4; goto out; } - old.beta_ecn = opt.val; + old_beta_ecn = opt.val; /* Now lets set in the values we have stored */ sopt.sopt_dir = 
SOPT_SET; opt.name = CC_NEWRENO_BETA; - opt.val = rack->r_ctl.rc_saved_beta.beta; + opt.val = rack->r_ctl.rc_saved_beta; error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); if (error) { failed = 5; goto out; } opt.name = CC_NEWRENO_BETA_ECN; - opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; + opt.val = rack->r_ctl.rc_saved_beta_ecn; error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); if (error) { failed = 6; goto out; } /* Save off the values for restoral */ - memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); + rack->r_ctl.rc_saved_beta = old_beta; + rack->r_ctl.rc_saved_beta_ecn = old_beta_ecn; out: if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -697,13 +683,13 @@ out: struct newreno *ptr; ptr = ((struct newreno *)tp->t_ccv.cc_data); - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = ptr->beta; log.u_bbr.flex2 = ptr->beta_ecn; log.u_bbr.flex3 = ptr->newreno_flags; - log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; - log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; + log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta; + log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta_ecn; log.u_bbr.flex6 = failed; log.u_bbr.flex7 = rack->gp_ready; log.u_bbr.flex7 <<= 1; @@ -898,7 +884,6 @@ rack_init_sysctls(void) struct sysctl_oid *rack_measure; struct sysctl_oid *rack_probertt; struct sysctl_oid *rack_hw_pacing; - struct sysctl_oid *rack_policing; rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -952,7 +937,7 @@ rack_init_sysctls(void) SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "time_between", CTLFLAG_RW, - & rack_time_between_probertt, 96000000, + &rack_time_between_probertt, 96000000, "How many useconds between the lowest rtt falling must past before we enter probertt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), @@ -1068,11 +1053,6 @@ rack_init_sysctls(void) "Do we not use timely in DGP?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "fullbufdisc", CTLFLAG_RW, - &rack_full_buffer_discount, 10, - "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "fillcw", CTLFLAG_RW, &rack_fill_cw_state, 0, "Enable fillcw on new connections (default=0 off)?"); @@ -1213,11 +1193,6 @@ rack_init_sysctls(void) OID_AUTO, "up_only", CTLFLAG_RW, &rack_hw_up_only, 0, "Do we allow hw pacing to lower the rate selected?"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_hw_pacing), - OID_AUTO, "extra_mss_precise", CTLFLAG_RW, - &rack_hw_pace_extra_slots, 0, - "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, @@ -1313,16 +1288,6 @@ rack_init_sysctls(void) "Rack timely when setting the cwnd what is the min num segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), - OID_AUTO, "noback_max", CTLFLAG_RW, - &rack_use_max_for_nobackoff, 0, - "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_timely), - OID_AUTO, "interim_timely_only", CTLFLAG_RW, - &rack_timely_int_timely_only, 0, - "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); - 
SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_timely), OID_AUTO, "nonstop", CTLFLAG_RW, &rack_timely_no_stopping, 0, "Rack timely don't stop increase"); @@ -1551,53 +1516,6 @@ rack_init_sysctls(void) OID_AUTO, "hystartplusplus", CTLFLAG_RW, &rack_do_hystart, 0, "Should RACK enable HyStart++ on connections?"); - /* Policer detection */ - rack_policing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, - "policing", - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "policer detection"); - SYSCTL_ADD_U16(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "rxt_thresh", CTLFLAG_RW, - &rack_policer_rxt_thresh, 0, - "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)"); - SYSCTL_ADD_U8(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "avg_thresh", CTLFLAG_RW, - &rack_policer_avg_thresh, 0, - "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?"); - SYSCTL_ADD_U8(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "med_thresh", CTLFLAG_RW, - &rack_policer_med_thresh, 0, - "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "data_thresh", CTLFLAG_RW, - &rack_policer_data_thresh, 64000, - "How many bytes must have gotten through before we can start doing policer detection?"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "bwcomp", CTLFLAG_RW, - &rack_policing_do_bw_comp, 1, - "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?"); - SYSCTL_ADD_U8(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "recmss", CTLFLAG_RW, - &rack_req_del_mss, 18, - "How many MSS must be delivered during recovery to engage policer detection?"); - SYSCTL_ADD_U16(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "res_div", CTLFLAG_RW, - &rack_policer_bucket_reserve, 20, - "What percentage is reserved in the policer bucket?"); - SYSCTL_ADD_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_policing), - OID_AUTO, "min_comp_bw", CTLFLAG_RW, - &rack_pol_min_bw, 125000, - "Do we have a min b/w for b/w compensation (0 = no)?"); /* Misc rack controls */ rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1880,13 +1798,6 @@ rack_init_sysctls(void) OID_AUTO, "alloc_hot", CTLFLAG_RD, &rack_hot_alloc, "Total allocations from the top of our list"); - tcp_policer_detected = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_counters), - OID_AUTO, "policer_detected", CTLFLAG_RD, - &tcp_policer_detected, - "Total policer_detections"); - rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -2643,6 +2554,7 @@ rack_log_hdwr_pacing(struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; const struct ifnet *ifp; + uint64_t ifp64; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); @@ -2655,8 +2567,9 @@ rack_log_hdwr_pacing(struct tcp_rack *rack, } else ifp = NULL; if (ifp) { - log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); - log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); + ifp64 = (uintptr_t)ifp; + log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff); } log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.bw_inuse = rate; @@ -2752,8 +2665,6 
@@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t union tcp_log_stackspecific log; struct timeval tv; - if (rack->sack_attack_disable > 0) - goto log_anyway; if ((mod != 1) && (rack_verbose_logging == 0)) { /* * We get 3 values currently for mod @@ -2766,8 +2677,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t */ return; } -log_anyway: - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = tsused; log.u_bbr.flex2 = thresh; log.u_bbr.flex3 = rsm->r_flags; @@ -2798,7 +2708,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rack->rc_tp->t_srtt; log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; @@ -2841,7 +2751,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; @@ -2881,12 +2791,12 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex8 = flag; log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.cur_del_rate = (uint64_t)prev; - log.u_bbr.delRate = (uint64_t)rsm; - log.u_bbr.rttProp = (uint64_t)next; + log.u_bbr.cur_del_rate = (uintptr_t)prev; + log.u_bbr.delRate = (uintptr_t)rsm; + log.u_bbr.rttProp = (uintptr_t)next; log.u_bbr.flex7 = 0; if (prev) { log.u_bbr.flex1 = prev->r_start; @@ -2929,7 +2839,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = t; log.u_bbr.flex2 = len; @@ -3007,13 +2917,8 @@ rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) /* Convert our ms to a microsecond */ memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rtt; - log.u_bbr.flex2 = rack->r_ctl.ack_count; - log.u_bbr.flex3 = rack->r_ctl.sack_count; - log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; - log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = 1; - log.u_bbr.flex8 = rack->sack_attack_disable; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; @@ -3107,7 +3012,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; @@ -3136,7 +3041,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; if (rack->rack_no_prr) @@ -3144,7 +3049,6 @@ 
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ else log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex5 = rack->r_ctl.ack_during_sd; log.u_bbr.flex6 = line; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; @@ -3244,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; @@ -3280,7 +3184,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32 if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; @@ -3325,7 +3229,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack, /* No you can't use 1, its for the real to cancel */ return; } - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; @@ -3350,7 +3254,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; @@ -3380,7 +3284,7 @@ rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; if (rack->rack_no_prr) @@ -3406,40 +3310,6 @@ rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) } } -#ifdef TCP_SAD_DETECTION -static void -rack_log_sad(struct tcp_rack *rack, int event) -{ - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.flex1 = rack->r_ctl.sack_count; - log.u_bbr.flex2 = rack->r_ctl.ack_count; - log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; - log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; - log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; - log.u_bbr.flex6 = tcp_sack_to_ack_thresh; - log.u_bbr.pkts_out = tcp_sack_to_move_thresh; - log.u_bbr.lt_epoch = (tcp_force_detection << 8); - log.u_bbr.lt_epoch |= rack->do_detection; - log.u_bbr.applimited = tcp_map_minimum; - log.u_bbr.flex7 = rack->sack_attack_disable; - log.u_bbr.flex8 = event; - log.u_bbr.bbr_state = rack->rc_suspicious; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.delivered = tcp_sad_decay_val; - TCP_LOG_EVENTP(rack->rc_tp, NULL, - &rack->rc_inp->inp_socket->so_rcv, - &rack->rc_inp->inp_socket->so_snd, - TCP_SAD_DETECT, 0, - 0, &log, false, &tv); - } -} -#endif - static void rack_counter_destroy(void) { @@ -3470,7 +3340,6 @@ rack_counter_destroy(void) counter_u64_free(rack_saw_enobuf_hw); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_hot_alloc); - counter_u64_free(tcp_policer_detected); counter_u64_free(rack_to_alloc); 
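Several hunks above and below replace memset(&log.u_bbr, 0, sizeof(log.u_bbr)) with memset(&log, 0, sizeof(log)). A minimal standalone sketch of why zeroing the whole on-stack union differs from zeroing only one member; the types here (union log_blob) are hypothetical and are not the kernel's union tcp_log_stackspecific:

	#include <string.h>

	union log_blob {
		struct {
			int flex1;
			int flex2;
		} u_bbr;
		unsigned char raw[64];	/* some other, larger member */
	};

	static void
	log_example(void)
	{
		union log_blob log;

		/* Clears all 64 bytes of the union's storage. */
		memset(&log, 0, sizeof(log));

		/*
		 * memset(&log.u_bbr, 0, sizeof(log.u_bbr)) would clear only
		 * sizeof(log.u_bbr) bytes, leaving the remainder of the
		 * union uninitialized if u_bbr is not its largest member.
		 */
		log.u_bbr.flex1 = 1;
		log.u_bbr.flex2 = 2;
		(void)log;
	}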
counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); @@ -3549,7 +3418,6 @@ static struct rack_sendmap * rack_alloc_full_limit(struct tcp_rack *rack) { if ((V_tcp_map_entries_limit > 0) && - (rack->do_detection == 0) && (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { @@ -3570,7 +3438,6 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) if (limit_type) { /* currently there is only one limit type */ if (rack->r_ctl.rc_split_limit > 0 && - (rack->do_detection == 0) && rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { counter_u64_add(rack_split_limited, 1); if (!rack->alloc_limit_reported) { @@ -3578,17 +3445,6 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); -#ifdef TCP_SAD_DETECTION - } else if ((tcp_sad_limit != 0) && - (rack->do_detection == 1) && - (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) { - counter_u64_add(rack_split_limited, 1); - if (!rack->alloc_limit_reported) { - rack->alloc_limit_reported = 1; - counter_u64_add(rack_alloc_limited_conns, 1); - } - return (NULL); -#endif } } @@ -3623,16 +3479,16 @@ static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { - if (rack->r_ctl.rc_app_limited_cnt > 0) { - rack->r_ctl.rc_app_limited_cnt--; - } + KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), + ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); + rack->r_ctl.rc_app_limited_cnt--; } if (rsm->r_limit_type) { /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } if (rsm == rack->r_ctl.rc_first_appl) { - rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start); + rack->r_ctl.cleared_app_ack_seq = rsm->r_end; rack->r_ctl.cleared_app_ack = 1; if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_first_appl = NULL; @@ -3697,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) * earlier. * * So lets calculate the BDP with the "known" b/w using - * the SRTT has our rtt and then multiply it by the - * goal. + * the SRTT as our rtt and then multiply it by the goal. 
*/ bw = rack_get_bw(rack); srtt = (uint64_t)tp->t_srtt; @@ -4261,7 +4116,7 @@ rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; @@ -5007,7 +4862,7 @@ rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, ui union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = add_part; log.u_bbr.flex2 = sub_part; @@ -5357,7 +5212,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; @@ -5393,7 +5248,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = (uint32_t)gp_est; @@ -5583,7 +5438,7 @@ skip_measurement: rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, - (uint64_t)rsm, + (uintptr_t)rsm, tp->gput_ts, (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, @@ -5676,7 +5531,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = th_ack; log.u_bbr.flex2 = tp->t_ccv.flags; @@ -5756,459 +5611,12 @@ tcp_rack_partialack(struct tcpcb *tp) rack->r_wanted_output = 1; } -static inline uint64_t -rack_get_rxt_per(uint64_t snds, uint64_t rxts) -{ - uint64_t rxt_per; - - if (snds > 0) { - rxt_per = rxts * 1000; - rxt_per /= snds; - } else { - /* This is an unlikely path */ - if (rxts) { - /* Its the max it was all re-transmits */ - rxt_per = 0xffffffffffffffff; - } else { - rxt_per = 0; - } - } - return (rxt_per); -} - -static void -policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8) -{ - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = flex1; - log.u_bbr.flex2 = flex2; - log.u_bbr.flex3 = flex3; - log.u_bbr.flex4 = flex4; - log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket; - log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size; - log.u_bbr.flex7 = 0; - log.u_bbr.flex8 = flex8; - log.u_bbr.bw_inuse = rack->r_ctl.policer_bw; - log.u_bbr.applimited = rack->r_ctl.current_round; - log.u_bbr.epoch = rack->r_ctl.policer_max_seg; - log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery; - log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; - log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; - log.u_bbr.rttProp = rack->r_ctl.gp_bw; - log.u_bbr.bbr_state = rack->rc_policer_detected; - log.u_bbr.bbr_substate = 0; - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, 
rack->r_ctl.rc_sacked); - log.u_bbr.use_lt_bw = rack->policer_detect_on; - log.u_bbr.lt_epoch = 0; - log.u_bbr.pkts_out = 0; - tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - -} - -static void -policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery) -{ - /* - * Rack excess rxt accounting is turned on. If we - * are above a threshold of rxt's in at least N - * rounds, then back off the cwnd and ssthresh - * to fit into the long-term b/w. - */ - - uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0; - uint32_t cnt_of_mape_rxt = 0; - uint64_t snds, rxts, rxt_per, tim, del, del_bw; - int i; - struct timeval tv; - - - /* - * First is there enough packets delivered during recovery to make - * a determiniation of b/w? - */ - segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); - if ((rack->rc_policer_detected == 0) && - (rack->r_ctl.policer_del_mss > 0) && - ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) { - /* - * Not enough data sent in recovery for initial detection. Once - * we have deteced a policer we allow less than the threshold (polcer_del_mss) - * amount of data in a recovery to let us fall through and double check - * our policer settings and possibly expand or collapse the bucket size and - * the polcier b/w. - * - * Once you are declared to be policed. this block of code cannot be - * reached, instead blocks further down will re-check the policer detection - * triggers and possibly reset the measurements if somehow we have let the - * policer bucket size grow too large. - */ - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { - policer_detection_log(rack, rack->r_ctl.policer_del_mss, - ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz), - rack->r_ctl.bytes_acked_in_recovery, segsiz, 18); - } - return; - } - tcp_get_usecs(&tv); - tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery; - del = rack->r_ctl.bytes_acked_in_recovery; - if (tim > 0) - del_bw = (del * (uint64_t)1000000) / tim; - else - del_bw = 0; - /* B/W compensation? */ - - if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) || - (del_bw > 0))) { - /* - * Sanity check now that the data is in. How long does it - * take for us to pace out two of our policer_max_seg's? - * - * If it is longer than the RTT then we are set - * too slow, maybe because of not enough data - * sent during recovery. - */ - uint64_t lentime, res, srtt, max_delbw, alt_bw; - - srtt = (uint64_t)rack_grab_rtt(tp, rack); - if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) - srtt = tp->t_srtt; - lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2; - if (del_bw > rack->r_ctl.policer_bw) { - max_delbw = del_bw; - } else { - max_delbw = rack->r_ctl.policer_bw; - } - res = lentime / max_delbw; - if ((srtt > 0) && (res > srtt)) { - /* - * At this rate we can not get two policer_maxsegs - * out before the ack arrives back. - * - * Lets at least get it raised up so that - * we can be a bit faster than that if possible. 
- */ - lentime = (rack->r_ctl.policer_max_seg * 2); - tim = srtt; - alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim; - if (alt_bw > max_delbw) { - uint64_t cap_alt_bw; - - cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp)); - if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) { - /* We place a min on the cap which defaults to 1Mbps */ - cap_alt_bw = rack_pol_min_bw; - } - if (alt_bw <= cap_alt_bw) { - /* It should be */ - del_bw = alt_bw; - policer_detection_log(rack, - (uint32_t)tim, - rack->r_ctl.policer_max_seg, - 0, - 0, - 16); - } else { - /* - * This is an odd case where likely the RTT is very very - * low. And yet it is still being policed. We don't want - * to get more than (rack_policing_do_bw_comp+1) x del-rate - * where del-rate is what we got in recovery for either the - * first Policer Detection(PD) or this PD we are on now. - */ - del_bw = cap_alt_bw; - policer_detection_log(rack, - (uint32_t)tim, - rack->r_ctl.policer_max_seg, - (uint32_t)max_delbw, - (rack->r_ctl.pol_bw_comp + 1), - 16); - } - } - } - } - snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes; - rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes; - rxt_per = rack_get_rxt_per(snds, rxts); - /* Figure up the average and median */ - for(i = 0; i < RETRAN_CNT_SIZE; i++) { - if (rack->r_ctl.rc_cnt_of_retran[i] > 0) { - tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; - cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i]; - } - } - if (cnt_of_mape_rxt) - avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt; - else - avg = 0; - alt_med = med = 0; - mid = tot_retran_pkt_count/2; - for(i = 0; i < RETRAN_CNT_SIZE; i++) { - pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; - if (mid > pkts) { - mid -= pkts; - continue; - } - med = (i + 1); - break; - } - mid = cnt_of_mape_rxt / 2; - for(i = 0; i < RETRAN_CNT_SIZE; i++) { - if (mid > rack->r_ctl.rc_cnt_of_retran[i]) { - mid -= rack->r_ctl.rc_cnt_of_retran[i]; - continue; - } - alt_med = (i + 1); - break; - } - if (rack->r_ctl.policer_alt_median) { - /* Swap the medians */ - uint32_t swap; - - swap = med; - med = alt_med; - alt_med = swap; - } - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = avg; - log.u_bbr.flex2 = med; - log.u_bbr.flex3 = (uint32_t)rxt_per; - log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; - log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold; - log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold; - log.u_bbr.flex7 = rack->r_ctl.policer_alt_median; - log.u_bbr.flex8 = 1; - log.u_bbr.delivered = rack->r_ctl.policer_bucket_size; - log.u_bbr.applimited = rack->r_ctl.current_round; - log.u_bbr.epoch = rack->r_ctl.policer_max_seg; - log.u_bbr.bw_inuse = del_bw; - log.u_bbr.cur_del_rate = rxts; - log.u_bbr.delRate = snds; - log.u_bbr.rttProp = rack->r_ctl.gp_bw; - log.u_bbr.bbr_state = rack->rc_policer_detected; - log.u_bbr.bbr_substate = 0; - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.use_lt_bw = rack->policer_detect_on; - log.u_bbr.lt_epoch = (uint32_t)tim; - log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery; - tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - if (med == RETRAN_CNT_SIZE) { - /* - * If the median is the maximum, then what we - * likely have here is a network breakage. 
Either that - * or we are so unlucky that all of our traffic is being - * dropped and having to be retransmitted the maximum times - * and this just is not how a policer works. - * - * If it is truely a policer eventually we will come - * through and it won't be the maximum. - */ - return; - } - /* Has enough rounds progressed for us to re-measure? */ - if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) && - (avg >= rack->r_ctl.policer_avg_threshold) && - (med >= rack->r_ctl.policer_med_threshold)) { - /* - * We hit all thresholds that indicate we are - * being policed. Now we may be doing this from a rack timeout - * which then means the rest of recovery will hopefully go - * smoother as we pace. At the end of recovery we will - * fall back in here and reset the values using the - * results of the entire recovery episode (we could also - * hit this as we exit recovery as well which means only - * one time in here). - * - * This is done explicitly that if we hit the thresholds - * again in a second recovery we overwrite the values. We do - * that because over time, as we pace the policer_bucket_size may - * continue to grow. This then provides more and more times when - * we are not pacing to the policer rate. This lets us compensate - * for when we hit a false positive and those flows continue to - * increase. However if its a real policer we will then get over its - * limit, over time, again and thus end up back here hitting the - * thresholds again. - * - * The alternative to this is to instead whenever we pace due to - * policing in rack_policed_sending we could add the amount len paced to the - * idle_snd_una value (which decreases the amount in last_amount_before_rec - * since that is always [th_ack - idle_snd_una]). This would then prevent - * the polcier_bucket_size from growing in additional recovery episodes - * Which would then mean false postives would be pretty much stuck - * after things got back to normal (assuming that what caused the - * false positive was a small network outage). - * - */ - tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET); - if (rack->rc_policer_detected == 0) { - /* - * Increment the stat that tells us we identified - * a policer only once. Note that if we ever allow - * the flag to be cleared (reverted) then we need - * to adjust this to not do multi-counting. 
- */ - counter_u64_add(tcp_policer_detected, 1); - } - rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes; - rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes; - rack->r_ctl.policer_bw = del_bw; - rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, - rack->r_ctl.policer_bw, - min(ctf_fixed_maxseg(rack->rc_tp), - rack->r_ctl.rc_pace_min_segs), - 0, NULL, - NULL, rack->r_ctl.pace_len_divisor); - /* Now what about the policer bucket size */ - rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; - if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { - /* We must be able to send our max-seg or else chaos ensues */ - rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; - } - if (rack->rc_policer_detected == 0) - rack->r_ctl.current_policer_bucket = 0; - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = avg; - log.u_bbr.flex2 = med; - log.u_bbr.flex3 = rxt_per; - log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; - log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold; - log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold; - log.u_bbr.flex7 = rack->r_ctl.policer_alt_median; - log.u_bbr.flex8 = 2; - log.u_bbr.applimited = rack->r_ctl.current_round; - log.u_bbr.bw_inuse = del_bw; - log.u_bbr.delivered = rack->r_ctl.policer_bucket_size; - log.u_bbr.cur_del_rate = rxts; - log.u_bbr.delRate = snds; - log.u_bbr.rttProp = rack->r_ctl.gp_bw; - log.u_bbr.bbr_state = rack->rc_policer_detected; - log.u_bbr.bbr_substate = 0; - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.use_lt_bw = rack->policer_detect_on; - log.u_bbr.epoch = rack->r_ctl.policer_max_seg; - log.u_bbr.lt_epoch = (uint32_t)tim; - log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery; - tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, - 0, &log, false, NULL, NULL, 0, &tv); - /* - * Put out an added log, 19, for the sole purpose - * of getting the txt/rxt so that we can benchmark - * in read-bbrlog the ongoing rxt rate after our - * policer invocation in the HYSTART announcments. - */ - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); - log.u_bbr.flex1 = alt_med; - log.u_bbr.flex8 = 19; - log.u_bbr.cur_del_rate = tp->t_sndbytes; - log.u_bbr.delRate = tp->t_snd_rxt_bytes; - tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - /* Turn off any fast output, thats ended */ - rack->r_fast_output = 0; - /* Mark the time for credits */ - rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL); - if (rack->r_rr_config < 2) { - /* - * We need to be stricter on the RR config so - * the pacing has priority. - */ - rack->r_rr_config = 2; - } - policer_detection_log(rack, - rack->r_ctl.idle_snd_una, - rack->r_ctl.ack_for_idle, - 0, - (uint32_t)tim, - 14); - rack->rc_policer_detected = 1; - } else if ((rack->rc_policer_detected == 1) && - (post_recovery == 1)) { - /* - * If we are exiting recovery and have already detected - * we need to possibly update the values. - * - * First: Update the idle -> recovery sent value. 
- */ - uint32_t srtt; - - if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { - rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; - } - srtt = (uint64_t)rack_grab_rtt(tp, rack); - if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) - srtt = tp->t_srtt; - if ((srtt != 0) && - (tim < (uint64_t)srtt)) { - /* - * Not long enough. - */ - if (rack_verbose_logging) - policer_detection_log(rack, - (uint32_t)tim, - 0, - 0, - 0, - 15); - return; - } - /* - * Finally update the b/w if its grown. - */ - if (del_bw > rack->r_ctl.policer_bw) { - rack->r_ctl.policer_bw = del_bw; - rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, - rack->r_ctl.policer_bw, - min(ctf_fixed_maxseg(rack->rc_tp), - rack->r_ctl.rc_pace_min_segs), - 0, NULL, - NULL, rack->r_ctl.pace_len_divisor); - if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { - /* We must be able to send our max-seg or else chaos ensues */ - rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; - } - } - policer_detection_log(rack, - rack->r_ctl.idle_snd_una, - rack->r_ctl.ack_for_idle, - 0, - (uint32_t)tim, - 3); - } -} - static void rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how) { - /* now check with the policer if on */ - if (rack->policer_detect_on == 1) { - policer_detection(tp, rack, 1); - } /* - * Now exit recovery, note we must do the idle set after the policer_detection - * to get the amount acked prior to recovery correct. + * Now exit recovery. */ - rack->r_ctl.idle_snd_una = tp->snd_una; EXIT_RECOVERY(tp->t_flags); } @@ -6238,7 +5646,7 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = th_ack; log.u_bbr.flex2 = tp->t_ccv.flags; @@ -6314,69 +5722,11 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) tp->t_flags &= ~TF_WASFRECOVERY; tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { - struct rack_sendmap *rsm; - struct timeval tv; - uint32_t segsiz; - /* Check if this is the end of the initial Start-up i.e. initial slow-start */ if (rack->rc_initial_ss_comp == 0) { /* Yep it is the end of the initial slowstart */ rack->rc_initial_ss_comp = 1; } - microuptime(&tv); - rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv); - if (SEQ_GEQ(ack, tp->snd_una)) { - /* - * The ack is above snd_una. Lets see - * if we can establish a postive distance from - * our idle mark. - */ - rack->r_ctl.ack_for_idle = ack; - if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) { - rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una; - } else { - /* No data thru yet */ - rack->r_ctl.last_amount_before_rec = 0; - } - } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) { - /* - * The ack is out of order and behind the snd_una. It may - * have contained SACK information which we processed else - * we would have rejected it. - */ - rack->r_ctl.ack_for_idle = tp->snd_una; - rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una; - } else { - rack->r_ctl.ack_for_idle = ack; - rack->r_ctl.last_amount_before_rec = 0; - } - if (rack->rc_policer_detected) { - /* - * If we are being policed and we have a loss, it - * means our bucket is now empty. This can happen - * where some other flow on the same host sends - * that this connection is not aware of. 
- */ - rack->r_ctl.current_policer_bucket = 0; - if (rack_verbose_logging) - policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4); - if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { - rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; - } - } - memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran)); - segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); - TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { - /* - * Go through the outstanding and re-peg - * any that should have been left in the - * retransmit list (on a double recovery). - */ - if (rsm->r_act_rxt_cnt > 0) { - rack_peg_rxt(rack, rsm, segsiz); - } - } - rack->r_ctl.bytes_acked_in_recovery = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_fast_output = 0; @@ -6411,8 +5761,6 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) rack->r_fast_output = 0; if (IN_RECOVERY(tp->t_flags)) rack_exit_recovery(tp, rack, 2); - rack->r_ctl.bytes_acked_in_recovery = 0; - rack->r_ctl.time_entered_recovery = 0; orig_cwnd = tp->snd_cwnd; rack_log_to_prr(rack, 16, orig_cwnd, line); if (CC_ALGO(tp)->cong_signal == NULL) { @@ -6443,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) tp->t_badrxtwin = 0; break; } - if ((CC_ALGO(tp)->cong_signal != NULL) && + if ((CC_ALGO(tp)->cong_signal != NULL) && (type != CC_RTO)){ tp->t_ccv.curack = ack; CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); @@ -6554,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as - * passed we no longer consider reordering has occuring. + * passed we no longer consider reordering as occurring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro @@ -6812,7 +6160,6 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_ } rack->rc_on_min_to = 0; if ((tp->t_state < TCPS_ESTABLISHED) || - (rack->sack_attack_disable > 0) || ((tp->t_flags & TF_SACK_PERMIT) == 0)) { goto activate_rxt; } @@ -6884,16 +6231,6 @@ activate_rxt: goto activate_rxt; } } - if (rack->sack_attack_disable) { - /* - * We don't want to do - * any TLP's if you are an attacker. - * Though if you are doing what - * is expected you may still have - * SACK-PASSED marks. - */ - goto activate_rxt; - } /* Convert from ms to usecs */ if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_flags & RACK_RWND_COLLAPSED) || @@ -7008,7 +6345,7 @@ activate_tlp: if (to < rack_tlp_min) { to = rack_tlp_min; } - if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { + if (to > TICKS_2_USEC(tcp_rexmit_max)) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. 
@@ -7124,7 +6461,6 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) rack->lt_bw_up = 1; rack->r_persist_lt_bw_off = 0; } - rack->r_ctl.idle_snd_una = tp->snd_una; rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_rxtshift = 0; @@ -7143,7 +6479,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = diag->p_nxt_slot; log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; @@ -7182,7 +6518,7 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = sb->sb_flags; log.u_bbr.flex2 = len; log.u_bbr.flex3 = sb->sb_state; @@ -7304,25 +6640,6 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); -#ifdef TCP_SAD_DETECTION - if (rack->sack_attack_disable && - (rack->r_ctl.ack_during_sd > 0) && - (slot < tcp_sad_pacing_interval)) { - /* - * We have a potential attacker on - * the line. We have possibly some - * (or now) pacing time set. We want to - * slow down the processing of sacks by some - * amount (if it is an attacker). Set the default - * slot for attackers in place (unless the original - * interval is longer). Its stored in - * micro-seconds, so lets convert to msecs. - */ - slot = tcp_sad_pacing_interval; - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); - rack->r_ctl.ack_during_sd = 0; - } -#endif if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_USEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; @@ -7472,11 +6789,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, tp->t_flags2 |= TF2_DONT_SACK_QUEUE; } } - /* For sack attackers we want to ignore sack */ - if (rack->sack_attack_disable == 1) { - tp->t_flags2 |= (TF2_DONT_SACK_QUEUE | - TF2_MBUF_QUEUE_READY); - } else if (rack->rc_ack_can_sendout_data) { + if (rack->rc_ack_can_sendout_data) { /* * Ahh but wait, this is that special case * where the pacing timer can be disturbed @@ -7608,16 +6921,6 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 0, 0, 0); return (1); } - if ((rack->policer_detect_on == 1) && - (rack->rc_policer_detected == 0)) { - /* - * We do this early if we have not - * deteceted to attempt to detect - * quicker. Normally we want to do this - * as recovery exits (and we will again). 
- */ - policer_detection(tp, rack, 0); - } return (0); } @@ -7740,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, /* Push bit must go to the right edge as well */ if (rsm->r_flags & RACK_HAD_PUSH) rsm->r_flags &= ~RACK_HAD_PUSH; + /* Update the count if app limited */ + if (nrsm->r_flags & RACK_APP_LIMITED) + rack->r_ctl.rc_app_limited_cnt++; /* Clone over the state of the hw_tls flag */ nrsm->r_hw_tls = rsm->r_hw_tls; /* @@ -7791,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack, l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; - if ((r_rsm->r_flags & RACK_APP_LIMITED) && + if ((r_rsm->r_flags & RACK_APP_LIMITED) && ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { /* * If both are app-limited then let the @@ -8281,11 +7587,8 @@ rack_remxt_tmr(struct tcpcb *tp) rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); if (rack->r_ctl.rc_resend != NULL) rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; - if ((((tp->t_flags & TF_SACK_PERMIT) == 0) -#ifdef TCP_SAD_DETECTION - || (rack->sack_attack_disable != 0) -#endif - ) && ((tp->t_flags & TF_SENTFIN) == 0)) { + if (((tp->t_flags & TF_SACK_PERMIT) == 0) && + ((tp->t_flags & TF_SENTFIN) == 0)) { /* * For non-sack customers new data * needs to go out as retransmits until @@ -8583,6 +7886,16 @@ drop_it: tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not + * process incoming SACK's since we are + * subject to attack in such a case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } @@ -8720,6 +8033,7 @@ skip_time_check: ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { rack->r_ctl.rc_tlp_rxt_last_time = cts; + rack->r_fast_output = 0; ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); } else if (timers & PACE_TMR_RXT) { rack->r_ctl.rc_tlp_rxt_last_time = cts; @@ -8799,86 +8113,6 @@ rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) } } -/* - * We maintain an array fo 16 (RETRAN_CNT_SIZE) entries. This - * array is zeroed at the start of recovery. Each time a segment - * is retransmitted, we translate that into a number of packets - * (based on segsiz) and based on how many times its been retransmitted - * increment by the number of packets the counter that represents - * retansmitted N times. Index 0 is retransmitted 1 time, index 1 - * is retransmitted 2 times etc. - * - * So for example when we send a 4344 byte transmission with a 1448 - * byte segsize, and its the third time we have retransmitted this - * segment, we would add to the rc_cnt_of_retran[2] the value of - * 3. That represents 3 MSS were retransmitted 3 times (index is - * the number of times retranmitted minus 1). - */ -static void -rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz) -{ - int idx; - uint32_t peg; - - peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1; - peg /= segsiz; - idx = rsm->r_act_rxt_cnt - 1; - if (idx >= RETRAN_CNT_SIZE) - idx = RETRAN_CNT_SIZE - 1; - /* Max of a uint16_t retransmits in a bucket */ - if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff) - rack->r_ctl.rc_cnt_of_retran[idx] += peg; - else - rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff; -} - -/* - * We maintain an array fo 16 (RETRAN_CNT_SIZE) entries. This - * array is zeroed at the start of recovery. 
Each time a segment - * is retransmitted, we translate that into a number of packets - * (based on segsiz) and based on how many times its been retransmitted - * increment by the number of packets the counter that represents - * retansmitted N times. Index 0 is retransmitted 1 time, index 1 - * is retransmitted 2 times etc. - * - * The rack_unpeg_rxt is used when we go to retransmit a segment - * again. Basically if the segment had previously been retransmitted - * say 3 times (as our previous example illustrated in the comment - * above rack_peg_rxt() prior to calling that and incrementing - * r_ack_rxt_cnt we would have called rack_unpeg_rxt() that would - * subtract back the previous add from its last rxt (in this - * example r_act_cnt would have been 2 for 2 retransmissions. So - * we would have subtracted 3 from rc_cnt_of_reetran[1] to remove - * those 3 segments. You will see this in the rack_update_rsm() - * below where we do: - * if (rsm->r_act_rxt_cnt > 0) { - * rack_unpeg_rxt(rack, rsm, segsiz); - * } - * rsm->r_act_rxt_cnt++; - * rack_peg_rxt(rack, rsm, segsiz); - * - * This effectively moves the count from rc_cnt_of_retran[1] to - * rc_cnt_of_retran[2]. - */ -static void -rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz) -{ - int idx; - uint32_t peg; - - idx = rsm->r_act_rxt_cnt - 1; - if (idx >= RETRAN_CNT_SIZE) - idx = RETRAN_CNT_SIZE - 1; - peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1; - peg /= segsiz; - if (peg < rack->r_ctl.rc_cnt_of_retran[idx]) - rack->r_ctl.rc_cnt_of_retran[idx] -= peg; - else { - /* TSNH */ - rack->r_ctl.rc_cnt_of_retran[idx] = 0; - } -} - static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz) @@ -8890,13 +8124,8 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } - if (rsm->r_act_rxt_cnt > 0) { - /* Drop the count back for this, its retransmitting again */ - rack_unpeg_rxt(rack, rsm, segsiz); - } rsm->r_act_rxt_cnt++; /* Peg the count/index */ - rack_peg_rxt(rack, rsm, segsiz); rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); rsm->r_dupack = 0; if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { @@ -8909,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, * remove the lost desgination and reduce the * bytes considered lost. 
*/ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -9604,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; - if ((rack->in_probe_rtt == 0) && + if ((rack->in_probe_rtt == 0) && (rack->rc_skip_timely == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); @@ -10092,40 +9321,19 @@ is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, - int *no_extra, - int *moved_two, uint32_t segsiz) + uint32_t segsiz) { uint32_t start, end, changed = 0; struct rack_sendmap stack_map; struct rack_sendmap *rsm, *nrsm, *prev, *next; int insret __diagused; int32_t used_ref = 1; - int moved = 0; -#ifdef TCP_SAD_DETECTION - int allow_segsiz; - int first_time_through = 1; -#endif - int noextra = 0; int can_use_hookery = 0; start = sack->start; end = sack->end; rsm = *prsm; -#ifdef TCP_SAD_DETECTION - /* - * There are a strange number of proxys and meddle boxes in the world - * that seem to cut up segments on different boundaries. This gets us - * smaller sacks that are still ok in terms of it being an attacker. - * We use the base segsiz to calculate an allowable smallness but - * also enforce a min on the segsiz in case it is an attacker playing - * games with MSS. So basically if the sack arrives and it is - * larger than a worse case 960 bytes, we don't classify the guy - * as supicious. - */ - allow_segsiz = max(segsiz, 1200) * sad_seg_size_per; - allow_segsiz /= 1000; -#endif do_rest_ofb: if ((rsm == NULL) || (SEQ_LT(end, rsm->r_start)) || @@ -10137,105 +9345,11 @@ do_rest_ofb: */ used_ref = 0; rsm = tqhash_find(rack->r_ctl.tqh, start); - moved++; } if (rsm == NULL) { /* TSNH */ goto out; } -#ifdef TCP_SAD_DETECTION - /* Now we must check for suspicous activity */ - if ((first_time_through == 1) && - ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) && - ((rsm->r_flags & RACK_PMTU_CHG) == 0) && - ((rsm->r_flags & RACK_TLP) == 0)) { - /* - * Its less than a full MSS or the segment being acked - * this should only happen if the rsm in question had the - * r_just_ret flag set <and> the end matches the end of - * the rsm block. - * - * Note we do not look at segments that have had TLP's on - * them since we can get un-reported rwnd collapses that - * basically we TLP on and then we get back a sack block - * that goes from the start to only a small way. - * - */ - int loss, ok; - - ok = 0; - if (SEQ_GEQ(end, rsm->r_end)) { - if (rsm->r_just_ret == 1) { - /* This was at the end of a send which is ok */ - ok = 1; - } else { - /* A bit harder was it the end of our segment */ - int segs, len; - - len = (rsm->r_end - rsm->r_start); - segs = len / segsiz; - segs *= segsiz; - if ((segs + (rsm->r_end - start)) == len) { - /* - * So this last bit was the - * end of our send if we cut it - * up into segsiz pieces so its ok. - */ - ok = 1; - } - } - } - if (ok == 0) { - /* - * This guy is doing something suspicious - * lets start detection. 
- */ - if (rack->rc_suspicious == 0) { - tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT); - counter_u64_add(rack_sack_attacks_suspect, 1); - rack->rc_suspicious = 1; - rack_log_sad(rack, 4); - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.flex1 = end; - log.u_bbr.flex2 = start; - log.u_bbr.flex3 = rsm->r_end; - log.u_bbr.flex4 = rsm->r_start; - log.u_bbr.flex5 = segsiz; - log.u_bbr.flex6 = rsm->r_fas; - log.u_bbr.flex7 = rsm->r_bas; - log.u_bbr.flex8 = 5; - log.u_bbr.pkts_out = rsm->r_flags; - log.u_bbr.bbr_state = rack->rc_suspicious; - log.u_bbr.bbr_substate = rsm->r_just_ret; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - TCP_LOG_EVENTP(rack->rc_tp, NULL, - &rack->rc_inp->inp_socket->so_rcv, - &rack->rc_inp->inp_socket->so_snd, - TCP_SAD_DETECTION, 0, - 0, &log, false, &tv); - } - } - /* You loose some ack count every time you sack - * a small bit that is not butting to the end of - * what we have sent. This is because we never - * send small bits unless its the end of the sb. - * Anyone sending a sack that is not at the end - * is thus very very suspicious. - */ - loss = (segsiz/2) / (end - start); - if (loss < rack->r_ctl.ack_count) - rack->r_ctl.ack_count -= loss; - else - rack->r_ctl.ack_count = 0; - } - } - first_time_through = 0; -#endif /* Ok we have an ACK for some piece of this rsm */ if (rsm->r_start != start) { if ((rsm->r_flags & RACK_ACKED) == 0) { @@ -10332,7 +9446,6 @@ do_rest_ofb: * use to update all the gizmos. */ /* Copy up our fudge block */ - noextra++; nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); /* Now adjust our tree blocks */ @@ -10383,9 +9496,6 @@ do_rest_ofb: if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); - /* You get a count for acking a whole segment or more */ - if ((nrsm->r_end - nrsm->r_start) >= segsiz) - rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; @@ -10463,7 +9573,6 @@ do_rest_ofb: } counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, start); - moved++; rsm->r_just_ret = 0; #ifndef INVARIANTS (void)tqhash_insert(rack->r_ctl.tqh, nrsm); @@ -10485,14 +9594,12 @@ do_rest_ofb: } else { /* Already sacked this piece */ counter_u64_add(rack_sack_skipped_acked, 1); - moved++; if (end == rsm->r_end) { /* Done with block */ rsm = tqhash_next(rack->r_ctl.tqh, rsm); goto out; } else if (SEQ_LT(end, rsm->r_end)) { /* A partial sack to a already sacked block */ - moved++; rsm = tqhash_next(rack->r_ctl.tqh, rsm); goto out; } else { @@ -10559,8 +9666,6 @@ do_rest_ofb: rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); /* You get a count for acking a whole segment or more */ - if ((rsm->r_end - rsm->r_start) >= segsiz) - rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; @@ -10595,7 +9700,6 @@ do_rest_ofb: rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); } else { counter_u64_add(rack_sack_skipped_acked, 1); - moved++; } if (end == rsm->r_end) { /* This block only - done, setup for next */ @@ -10693,7 +9797,6 @@ do_rest_ofb: * Note if either prev/rsm is a TLP we don't * do this. 
*/ - noextra++; nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); tqhash_update_end(rack->r_ctl.tqh, prev, end); @@ -10752,10 +9855,6 @@ do_rest_ofb: if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); - /* You get a count for acking a whole segment or more */ - if ((nrsm->r_end - nrsm->r_start) >= segsiz) - rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); - rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; @@ -10842,7 +9941,6 @@ do_rest_ofb: */ counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, end); - moved++; rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_just_ret = 0; #ifndef INVARIANTS @@ -10861,9 +9959,6 @@ do_rest_ofb: rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); - /* You get a count for acking a whole segment or more */ - if ((rsm->r_end - rsm->r_start) >= segsiz) - rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); if (rsm->r_flags & RACK_WAS_LOST) { int my_chg; @@ -10903,7 +9998,6 @@ do_rest_ofb: * The block was already acked. */ counter_u64_add(rack_sack_skipped_acked, 1); - moved++; } out: if (rsm && @@ -10940,7 +10034,6 @@ out: if (next->r_flags & RACK_ACKED) { /* yep this and next can be merged */ rsm = rack_merge_rsm(rack, rsm, next); - noextra++; next = tqhash_next(rack->r_ctl.tqh, rsm); } else break; @@ -10972,7 +10065,6 @@ out: if (prev->r_flags & RACK_ACKED) { /* yep the previous and this can be merged */ rsm = rack_merge_rsm(rack, prev, rsm); - noextra++; prev = tqhash_prev(rack->r_ctl.tqh, rsm); } else break; @@ -10986,12 +10078,6 @@ out: /* Save off the next one for quick reference. */ nrsm = tqhash_find(rack->r_ctl.tqh, end); *prsm = rack->r_ctl.rc_sacklast = nrsm; - /* Pass back the moved. */ - *moved_two = moved; - *no_extra = noextra; - if (IN_RECOVERY(tp->t_flags)) { - rack->r_ctl.bytes_acked_in_recovery += changed; - } return (changed); } @@ -11030,66 +10116,6 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac } -static void -rack_do_decay(struct tcp_rack *rack) -{ - struct timeval res; - -#define timersub(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ - (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ - if ((vvp)->tv_usec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_usec += 1000000; \ - } \ - } while (0) - - timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); -#undef timersub - - rack->r_ctl.input_pkt++; - if ((rack->rc_in_persist) || - (res.tv_sec >= 1) || - (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { - /* - * Check for decay of non-SAD, - * we want all SAD detection metrics to - * decay 1/4 per second (or more) passed. - * Current default is 800 so it decays - * 80% every second. - */ -#ifdef TCP_SAD_DETECTION - uint32_t pkt_delta; - - pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; -#endif - /* Update our saved tracking values */ - rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; - rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; - /* Now do we escape without decay? */ -#ifdef TCP_SAD_DETECTION - if (rack->rc_in_persist || - (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || - (pkt_delta < tcp_sad_low_pps)){ - /* - * We don't decay idle connections - * or ones that have a low input pps. 
- */ - return; - } - /* Decay the counters */ - rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, - tcp_sad_decay_val); - rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, - tcp_sad_decay_val); - rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, - tcp_sad_decay_val); - rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, - tcp_sad_decay_val); -#endif - } -} static void inline rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) @@ -11197,7 +10223,7 @@ rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_ * If we have some sack blocks in the filter * lets prune them out by calling sfb with no blocks. */ - sack_filter_blks(&rack->r_ctl.rack_sf, NULL, 0, th_ack); + sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack); } if (SEQ_GT(th_ack, tp->snd_una)) { /* Clear any app ack remembered settings */ @@ -11344,7 +10370,7 @@ more: * and yet before retransmitting we get an ack * which can happen due to reordering. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -11366,10 +10392,6 @@ more: rsm->r_in_tmap = 0; } newly_acked = 1; - if (((rsm->r_flags & RACK_ACKED) == 0) && - (IN_RECOVERY(tp->t_flags))) { - rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start); - } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove @@ -11452,10 +10474,6 @@ more: */ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } else { - if (((rsm->r_flags & RACK_ACKED) == 0) && - (IN_RECOVERY(tp->t_flags))) { - rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start); - } rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack); } /* And what about the lost flag? */ @@ -11606,192 +10624,11 @@ rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; } } - rack->r_ctl.bytes_acked_in_recovery = 0; - rack->r_ctl.time_entered_recovery = 0; } rack->r_might_revert = 0; } } -#ifdef TCP_SAD_DETECTION - -static void -rack_merge_out_sacks(struct tcp_rack *rack) -{ - struct rack_sendmap *cur, *next, *rsm, *trsm = NULL; - - cur = tqhash_min(rack->r_ctl.tqh); - while(cur) { - next = tqhash_next(rack->r_ctl.tqh, cur); - /* - * The idea is to go through all and merge back - * together the pieces sent together, - */ - if ((next != NULL) && - (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) { - rack_merge_rsm(rack, cur, next); - } else { - cur = next; - } - } - /* - * now treat it like a rxt event, everything is outstanding - * and sent nothing acvked and dupacks are all zero. If this - * is not an attacker it will have to dupack its way through - * it all. 
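The rack_do_decay() routine removed here bumped input_pkt on every call and, once the connection was in persist, idle, or at least one second had elapsed since the last decay (measured with a locally defined timersub()), scaled the SAD counters down with ctf_decay_count() using tcp_sad_decay_val (default 800 per the removed comment). A minimal sketch of the elapsed-time gate and of a multiplicative decay follows; decay_counter_ex() is a hypothetical stand-in and the val/1000 scaling is an assumption inferred from that comment, not a statement of what ctf_decay_count() actually does.

#include <stdint.h>
#include <sys/time.h>

/* Assumed semantics: keep cnt * val / 1000 (val == 800 keeps roughly 80%). */
static uint32_t
decay_counter_ex(uint32_t cnt, uint32_t val)
{
	return ((uint32_t)(((uint64_t)cnt * val) / 1000));
}

/* The same subtraction the removed timersub() macro performed. */
static int
one_second_elapsed_ex(const struct timeval *now, const struct timeval *last)
{
	struct timeval res;

	res.tv_sec = now->tv_sec - last->tv_sec;
	res.tv_usec = now->tv_usec - last->tv_usec;
	if (res.tv_usec < 0) {
		res.tv_sec--;
		res.tv_usec += 1000000;
	}
	return (res.tv_sec >= 1);
}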
- */ - TAILQ_INIT(&rack->r_ctl.rc_tmap); - TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { - rsm->r_dupack = 0; - /* We must re-add it back to the tlist */ - if (trsm == NULL) { - TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); - } else { - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); - } - rsm->r_in_tmap = 1; - trsm = rsm; - rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); - } - sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una); -} - -static void -rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) -{ - int do_detection = 0; - - if (rack->sack_attack_disable || rack->rc_suspicious) { - /* - * If we have been disabled we must detect - * to possibly reverse it. Or if the guy has - * sent in suspicious sacks we want to do detection too. - */ - do_detection = 1; - - } else if ((rack->do_detection || tcp_force_detection) && - (tcp_sack_to_ack_thresh > 0) && - (tcp_sack_to_move_thresh > 0) && - (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) { - /* - * We only detect here if: - * 1) System wide forcing is on <or> do_detection is on - * <and> - * 2) We have thresholds for move and ack (set one to 0 and we are off) - * <and> - * 3) We have maps allocated larger than our min (500). - */ - do_detection = 1; - } - if (do_detection > 0) { - /* - * We have thresholds set to find - * possible attackers and disable sack. - * Check them. - */ - uint64_t ackratio, moveratio, movetotal; - - /* Log detecting */ - rack_log_sad(rack, 1); - /* Do we establish a ack ratio */ - if ((rack->r_ctl.sack_count > tcp_map_minimum) || - (rack->rc_suspicious == 1) || - (rack->sack_attack_disable > 0)) { - ackratio = (uint64_t)(rack->r_ctl.sack_count); - ackratio *= (uint64_t)(1000); - if (rack->r_ctl.ack_count) - ackratio /= (uint64_t)(rack->r_ctl.ack_count); - else { - /* We can hit this due to ack totals degregation (via small sacks) */ - ackratio = 1000; - } - } else { - /* - * No ack ratio needed if we have not - * seen more sacks then the number of map entries. - * The exception to that is if we have disabled sack then - * we need to find a ratio. - */ - ackratio = 0; - } - - if ((rack->sack_attack_disable == 0) && - (ackratio > rack_highest_sack_thresh_seen)) - rack_highest_sack_thresh_seen = (uint32_t)ackratio; - /* Do we establish a move ratio? */ - if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) || - (rack->rc_suspicious == 1) || - (rack->sack_attack_disable > 0)) { - /* - * We need to have more sack moves than maps - * allocated to have a move ratio considered. - */ - movetotal = rack->r_ctl.sack_moved_extra; - movetotal += rack->r_ctl.sack_noextra_move; - moveratio = rack->r_ctl.sack_moved_extra; - moveratio *= (uint64_t)1000; - if (movetotal) - moveratio /= movetotal; - else { - /* No moves, thats pretty good */ - moveratio = 0; - } - } else { - /* - * Not enough moves have occured to consider - * if we are out of whack in that ratio. - * The exception to that is if we have disabled sack then - * we need to find a ratio. - */ - moveratio = 0; - } - if ((rack->sack_attack_disable == 0) && - (moveratio > rack_highest_move_thresh_seen)) - rack_highest_move_thresh_seen = (uint32_t)moveratio; - /* Now the tests */ - if (rack->sack_attack_disable == 0) { - /* Not disabled, do we need to disable? 
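rack_merge_out_sacks(), deleted above, walked the send map in sequence order, merged adjacent entries whose first transmit timestamps (r_tim_lastsent[0]) matched, and then rebuilt the transmit queue with the ACKED/SACK_PASSED state cleared so an honest peer would have to dup-ack its way through the outstanding data again. The fragment below only mirrors the coalescing walk over a singly linked list; the seg_ex type and merge helper are invented for the example, while the real code merges tqhash entries with rack_merge_rsm().

#include <stddef.h>
#include <stdint.h>

struct seg_ex {
	struct seg_ex	*next;
	uint64_t	 first_send_ts;	/* stand-in for r_tim_lastsent[0] */
	uint32_t	 start;
	uint32_t	 end;
};

/* Absorb nxt into cur and unlink it (no freeing in this sketch). */
static void
merge_into_ex(struct seg_ex *cur, struct seg_ex *nxt)
{
	cur->end = nxt->end;
	cur->next = nxt->next;
}

/* Coalesce neighbours that were part of the same original send. */
static void
coalesce_same_send_ex(struct seg_ex *head)
{
	struct seg_ex *cur = head;

	while (cur != NULL) {
		struct seg_ex *nxt = cur->next;

		if (nxt != NULL && cur->first_send_ts == nxt->first_send_ts)
			merge_into_ex(cur, nxt);	/* stay on cur, re-check new next */
		else
			cur = nxt;
	}
}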
*/ - if ((ackratio > tcp_sack_to_ack_thresh) && - (moveratio > tcp_sack_to_move_thresh)) { - /* Disable sack processing */ - tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); - rack->sack_attack_disable = 1; - /* set it so we have the built in delay */ - rack->r_ctl.ack_during_sd = 1; - if (rack_merge_out_sacks_on_attack) - rack_merge_out_sacks(rack); - counter_u64_add(rack_sack_attacks_detected, 1); - tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); - /* Clamp the cwnd at flight size */ - rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; - rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - rack_log_sad(rack, 2); - } - } else { - /* We are sack-disabled check for false positives */ - if ((ackratio <= tcp_restoral_thresh) || - ((rack_merge_out_sacks_on_attack == 0) && - (rack->rc_suspicious == 0) && - (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) { - rack->sack_attack_disable = 0; - rack_log_sad(rack, 3); - /* Restart counting */ - rack->r_ctl.sack_count = 0; - rack->r_ctl.sack_moved_extra = 0; - rack->r_ctl.sack_noextra_move = 1; - rack->rc_suspicious = 0; - rack->r_ctl.ack_count = max(1, - (bytes_this_ack / segsiz)); - - counter_u64_add(rack_sack_attacks_reversed, 1); - /* Restore the cwnd */ - if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) - rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; - } - } - } -} -#endif static int rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) @@ -11948,9 +10785,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point; - int loop_start = 0, moved_two = 0, no_extra = 0; + int loop_start = 0; uint32_t tsused; - uint32_t segsiz, o_cnt; + uint32_t segsiz; INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -11963,8 +10800,6 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered rsm = tqhash_min(rack->r_ctl.tqh); changed = 0; th_ack = th->th_ack; - if (rack->sack_attack_disable == 0) - rack_do_decay(rack); segsiz = ctf_fixed_maxseg(rack->rc_tp); if (BYTES_THIS_ACK(tp, th) >= segsiz) { /* @@ -11975,17 +10810,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered int ac; ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); - rack->r_ctl.ack_count += ac; counter_u64_add(rack_ack_total, ac); } - if (rack->r_ctl.ack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } if (SEQ_GT(th_ack, tp->snd_una)) { rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); tp->t_acktime = ticks; @@ -12051,52 +10877,16 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. 
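The detection logic removed in this hunk boiled down to two per-mille ratios: SACK blocks processed per ACK counted, and "extra" scoreboard moves per total move decisions, each compared with a sysctl threshold before SACK processing was disabled, plus a restoral check on the ack ratio afterwards. The helpers below restate only that ratio arithmetic with example-local names; thresholds such as tcp_sack_to_ack_thresh and tcp_sack_to_move_thresh are passed in rather than read from the real sysctls.

#include <stdint.h>

/* SACKs seen per 1000 ACKs; saturate when no ACKs have been counted. */
static uint64_t
sad_ack_ratio_ex(uint64_t sack_count, uint64_t ack_count)
{
	if (ack_count == 0)
		return (1000);
	return ((sack_count * 1000) / ack_count);
}

/* "Extra" scoreboard moves per 1000 total moves. */
static uint64_t
sad_move_ratio_ex(uint64_t moved_extra, uint64_t noextra_move)
{
	uint64_t total = moved_extra + noextra_move;

	if (total == 0)
		return (0);
	return ((moved_extra * 1000) / total);
}

/* Both ratios had to exceed their thresholds before SACK was disabled. */
static int
sad_should_disable_ex(uint64_t ackratio, uint64_t moveratio,
    uint64_t ack_thresh, uint64_t move_thresh)
{
	return ((ackratio > ack_thresh) && (moveratio > move_thresh));
}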
*/ - o_cnt = num_sack_blks; - num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, + num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); if (sacks_seen != NULL) *sacks_seen = num_sack_blks; if (num_sack_blks == 0) { /* Nothing to sack, but we need to update counts */ - if ((o_cnt == 1) && - (*dsack_seen != 1)) - rack->r_ctl.sack_count++; - else if (o_cnt > 1) - rack->r_ctl.sack_count++; goto out_with_totals; } - if (rack->sack_attack_disable) { - /* - * An attacker disablement is in place, for - * every sack block that is not at least a full MSS - * count up sack_count. - */ - for (i = 0; i < num_sack_blks; i++) { - if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) { - rack->r_ctl.sack_count++; - } - if (rack->r_ctl.sack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } - } - goto out; - } /* Its a sack of some sort */ - rack->r_ctl.sack_count += num_sack_blks; - if (rack->r_ctl.sack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } if (num_sack_blks < 2) { /* Only one, we don't need to sort */ goto do_sack_work; @@ -12164,7 +10954,7 @@ do_sack_work: * We probably did the FR and the next * SACK in continues as we would expect. */ - acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz); + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz); if (acked) { rack->r_wanted_output = 1; changed += acked; @@ -12180,40 +10970,8 @@ do_sack_work: * are acked). Count this as ACK'd data to boost * up the chances of recovering any false positives. */ - rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); counter_u64_add(rack_express_sack, 1); - if (rack->r_ctl.ack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } - if (moved_two) { - /* - * If we did not get a SACK for at least a MSS and - * had to move at all, or if we moved more than our - * threshold, it counts against the "extra" move. - */ - rack->r_ctl.sack_moved_extra += moved_two; - rack->r_ctl.sack_noextra_move += no_extra; - counter_u64_add(rack_move_some, 1); - } else { - /* - * else we did not have to move - * any more than we would expect. - */ - rack->r_ctl.sack_noextra_move += no_extra; - rack->r_ctl.sack_noextra_move++; - counter_u64_add(rack_move_none, 1); - } - if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || - (rack->r_ctl.sack_noextra_move > 0xfff00000)) { - rack->r_ctl.sack_moved_extra /= 2; - rack->r_ctl.sack_noextra_move /= 2; - } goto out_with_totals; } else { /* @@ -12226,57 +10984,11 @@ do_sack_work: counter_u64_add(rack_sack_total, 1); rsm = rack->r_ctl.rc_sacklast; for (i = loop_start; i < num_sack_blks; i++) { - acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz); + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz); if (acked) { rack->r_wanted_output = 1; changed += acked; } - if (moved_two) { - /* - * If we did not get a SACK for at least a MSS and - * had to move at all, or if we moved more than our - * threshold, it counts against the "extra" move. 
- */ - rack->r_ctl.sack_moved_extra += moved_two; - rack->r_ctl.sack_noextra_move += no_extra; - counter_u64_add(rack_move_some, 1); - } else { - /* - * else we did not have to move - * any more than we would expect. - */ - rack->r_ctl.sack_noextra_move += no_extra; - rack->r_ctl.sack_noextra_move++; - counter_u64_add(rack_move_none, 1); - } - if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || - (rack->r_ctl.sack_noextra_move > 0xfff00000)) { - rack->r_ctl.sack_moved_extra /= 2; - rack->r_ctl.sack_noextra_move /= 2; - } - if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { - /* - * If the SACK was not a full MSS then - * we add to sack_count the number of - * MSS's (or possibly more than - * a MSS if its a TSO send) we had to skip by. - */ - rack->r_ctl.sack_count += moved_two; - if (rack->r_ctl.sack_count > 0xfff00000) { - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } - counter_u64_add(rack_sack_total, moved_two); - } - /* - * Now we need to setup for the next - * round. First we make sure we won't - * exceed the size of our uint32_t on - * the various counts, and then clear out - * moved_two. - */ - moved_two = 0; - no_extra = 0; } out_with_totals: if (num_sack_blks > 1) { @@ -12288,13 +11000,9 @@ out_with_totals: * it could be an attacker constantly * moving us. */ - rack->r_ctl.sack_moved_extra++; counter_u64_add(rack_move_some, 1); } out: -#ifdef TCP_SAD_DETECTION - rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); -#endif if (changed) { /* Something changed cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); @@ -12358,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) * We need to skip anything already set * to be retransmitted. */ - if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & RACK_MUST_RXT)) { rsm = TAILQ_NEXT(rsm, r_tnext); continue; @@ -13061,10 +11769,45 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, INP_WLOCK_ASSERT(tptoinpcb(tp)); rack = (struct tcp_rack *)tp->t_fb_ptr; + if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) { + /* Checking SEG.ACK against ISS is definitely redundant. */ + tp->t_flags2 |= TF2_NO_ISS_CHECK; + } + if (!V_tcp_insecure_ack) { + tcp_seq seq_min; + bool ghost_ack_check; + + if (tp->t_flags2 & TF2_NO_ISS_CHECK) { + /* Check for too old ACKs (RFC 5961, Section 5.2). */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } else { + if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { + /* Checking for ghost ACKs is stricter. */ + seq_min = tp->iss + 1; + ghost_ack_check = true; + } else { + /* + * Checking for too old ACKs (RFC 5961, + * Section 5.2) is stricter. + */ + seq_min = tp->snd_una - tp->max_sndwnd; + ghost_ack_check = false; + } + } + if (SEQ_LT(th->th_ack, seq_min)) { + if (ghost_ack_check) + TCPSTAT_INC(tcps_rcvghostack); + else + TCPSTAT_INC(tcps_rcvacktooold); + /* Send challenge ACK. 
*/ + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + rack->r_wanted_output = 1; + return (1); + } + } if (SEQ_GT(th->th_ack, tp->snd_max)) { - __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt); + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); rack->r_wanted_output = 1; return (1); } @@ -13092,24 +11835,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, } rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck, &dsack_seen, &sacks_seen); - if ((rack->sack_attack_disable > 0) && - (th->th_ack == tp->snd_una) && - (tiwin == tp->snd_wnd) && - (orig_tlen == 0) && - (dsack_seen == 0) && - (sacks_seen > 0)) { - /* - * If sacks have been disabled we may - * want to strike a dup-ack "ignoring" the - * sack as long as the sack was not a "dsack". Note - * that if no sack is sent (TOF_SACK is off) then the - * normal dsack code above rack_log_ack() would have - * already struck. So this is just to catch the case - * were we are ignoring sacks from this guy due to - * it being a suspected attacker. - */ - rack_strike_dupack(rack, th->th_ack); - } } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { @@ -13248,7 +11973,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Must be non-newreno (cubic) getting too ahead of itself */ tp->snd_cwnd = p_cwnd; } - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); @@ -13289,8 +12014,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ tp->t_flags &= ~TF_PREVVALID; - rack->r_ctl.idle_snd_una = tp->snd_una; - rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack->r_ctl.retran_during_recovery = 0; @@ -13348,7 +12071,7 @@ rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t if (rsm == NULL) log.u_bbr.rttProp = 0; else - log.u_bbr.rttProp = (uint64_t)rsm; + log.u_bbr.rttProp = (uintptr_t)rsm; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -13636,7 +12359,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, thflags = tcp_get_flags(th) & TH_FIN; KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { @@ -13878,7 +12601,7 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. 
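The code added above sets TF2_NO_ISS_CHECK once a full maximum window beyond the ISS has been acknowledged, and otherwise picks whichever ACK floor is stricter: the RFC 5961 Section 5.2 too-old-ACK bound (snd_una - max_sndwnd) or the ghost-ACK bound (iss + 1). Anything below the chosen floor is counted (tcps_rcvghostack or tcps_rcvacktooold) and answered with a challenge ACK. The sketch below shows only the floor selection; it uses plain unsigned integers and omits the SEQ_*() wraparound comparisons, so it is not a drop-in replacement for the real check.

#include <stdbool.h>
#include <stdint.h>

struct ack_floor_ex {
	uint32_t seq_min;	/* lowest acceptable SEG.ACK */
	bool	 ghost_check;	/* true when the floor came from ISS + 1 */
};

static struct ack_floor_ex
pick_ack_floor_ex(uint32_t iss, uint32_t snd_una, uint32_t max_sndwnd,
    bool no_iss_check)
{
	struct ack_floor_ex f;
	uint32_t window_floor = snd_una - max_sndwnd;

	if (no_iss_check || window_floor >= iss + 1) {
		/* Too-old-ACK check (RFC 5961, Section 5.2) is stricter. */
		f.seq_min = window_floor;
		f.ghost_check = false;
	} else {
		/* Checking for ghost ACKs below ISS + 1 is stricter. */
		f.seq_min = iss + 1;
		f.ghost_check = true;
	}
	return (f);
}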
*/ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { @@ -14037,7 +12760,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct mbuf *mfree; rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); mfree = sbcut_locked(&so->so_snd, acked); tp->snd_una = th->th_ack; /* Note we want to hold the sb lock through the sendmap adjust */ @@ -14103,7 +12826,6 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack->r_ctl.retran_during_recovery = 0; rack->rc_suspicious = 0; rack->r_ctl.dsack_byte_cnt = 0; - rack->r_ctl.idle_snd_una = tp->snd_una; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; @@ -14154,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -14363,14 +13085,12 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -14383,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -14417,12 +13137,10 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -14611,9 +13329,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in @@ -14632,9 +13348,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, 
- &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -14686,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -14707,15 +13421,11 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, { int32_t ret_val = 0; int32_t orig_tlen = tlen; - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. @@ -14733,9 +13443,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -14788,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -14837,16 +13545,12 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
@@ -14864,9 +13568,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -14944,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -14965,16 +13667,12 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. @@ -14992,9 +13690,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -15051,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -15072,16 +13768,12 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t orig_tlen; int32_t ourfinisacked = 0; - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
@@ -15100,9 +13792,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } orig_tlen = tlen; - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -15159,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -15180,17 +13870,13 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) - return (__ctf_process_rst(m, th, so, tp, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)); + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. @@ -15208,9 +13894,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, - &rack->r_ctl.challenge_ack_ts, - &rack->r_ctl.challenge_ack_cnt)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -15269,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -15545,7 +14229,7 @@ rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex8 = mod; log.u_bbr.flex1 = flex1; @@ -15860,36 +14544,6 @@ rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, return (0); } -static void -rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval) -{ - /* - * P = Percent of retransmits 499 = 49.9% - * A = Average number 1 (.1%) -> 169 (16.9%) - * M = Median number of retrans 1 - 16 - * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP - * - */ - uint16_t per, upp; - - per = optval & 0x0000ffff; - rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff); - upp = ((optval & 0xffff0000) >> 16); - rack->r_ctl.policer_avg_threshold = (0x00ff & upp); - rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff); - if ((rack->r_ctl.policer_rxt_threshold > 0) && - (rack->r_ctl.policer_avg_threshold > 0) && - (rack->r_ctl.policer_med_threshold > 0)) { - rack->policer_detect_on = 1; - } else { - rack->policer_detect_on = 0; - } - rack->r_ctl.saved_policer_val = optval; - policer_detection_log(rack, optval, - rack->r_ctl.policer_avg_threshold, - rack->r_ctl.policer_med_threshold, - 
rack->r_ctl.policer_rxt_threshold, 11); -} static int32_t rack_init(struct tcpcb *tp, void **ptr) @@ -15957,21 +14611,9 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; /* We want abe like behavior as well */ - rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; - rack->r_ctl.policer_del_mss = rack_req_del_mss; - if ((rack_policer_rxt_thresh > 0) && - (rack_policer_avg_thresh > 0) && - (rack_policer_med_thresh > 0)) { - rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh; - rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh; - rack->r_ctl.policer_med_threshold = rack_policer_med_thresh; - rack->policer_detect_on = 1; - } else { - rack->policer_detect_on = 0; - } if (rack_fill_cw_state) rack->rc_pace_to_cwnd = 1; if (rack_pacing_min_seg) @@ -16008,13 +14650,13 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.max_reduction = rack_max_reduce; rack->rc_force_max_seg = 0; TAILQ_INIT(&rack->r_ctl.opt_list); - rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; - rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; + rack->r_ctl.rc_saved_beta = V_newreno_beta_ecn; + rack->r_ctl.rc_saved_beta_ecn = V_newreno_beta_ecn; if (rack_hibeta_setting) { rack->rack_hibeta = 1; if ((rack_hibeta_setting >= 50) && (rack_hibeta_setting <= 100)) { - rack->r_ctl.rc_saved_beta.beta = rack_hibeta_setting; + rack->r_ctl.rc_saved_beta = rack_hibeta_setting; rack->r_ctl.saved_hibeta = rack_hibeta_setting; } } else { @@ -16028,7 +14670,6 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.last_tm_mark = 0xffffffffffffffff; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; - rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; rack->r_ctl.rc_highest_us_rtt = 0; @@ -16064,7 +14705,6 @@ rack_init(struct tcpcb *tp, void **ptr) if (rack_honors_hpts_min_to) rack->r_use_hpts_min = 1; if (tp->snd_una != 0) { - rack->r_ctl.idle_snd_una = tp->snd_una; rack->rc_sendvars_notset = 0; /* * Make sure any TCP timers are not running. 
@@ -16115,7 +14755,6 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; rack->r_ctl.rc_went_idle_time = us_cts; - rack->r_ctl.challenge_ack_ts = tcp_ts_getticks() - (tcp_ack_war_time_window + 1); rack->r_ctl.rc_time_probertt_starts = 0; rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff; @@ -16142,11 +14781,6 @@ rack_init(struct tcpcb *tp, void **ptr) rack->rack_hdw_pace_ena = 1; if (rack_hw_rate_caps) rack->r_rack_hw_rate_caps = 1; -#ifdef TCP_SAD_DETECTION - rack->do_detection = 1; -#else - rack->do_detection = 0; -#endif if (rack_non_rxt_use_cr) rack->rack_rec_nonrxt_use_cr = 1; /* Lets setup the fsb block */ @@ -16485,16 +15119,16 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.flex8 = 10; log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; log.u_bbr.flex2 = rack->rc_free_cnt; log.u_bbr.flex3 = cnt_free; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); rsm = tqhash_min(rack->r_ctl.tqh); - log.u_bbr.delRate = (uint64_t)rsm; + log.u_bbr.delRate = (uintptr_t)rsm; rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); - log.u_bbr.cur_del_rate = (uint64_t)rsm; + log.u_bbr.cur_del_rate = (uintptr_t)rsm; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.pkt_epoch = __LINE__; (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, @@ -16605,12 +15239,6 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) if (tmr_up == PACE_TMR_DELACK) /* We are supposed to have delayed ack up and we do */ return; - } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) { - /* - * if we hit enobufs then we would expect the possibility - * of nothing outstanding and the RXT up (and the hptsi timer). 
- */ - return; } else if (((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && @@ -16735,7 +15363,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); } #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; @@ -16938,7 +15566,7 @@ rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, struct timeval tv; (void)tcp_get_usecs(&tv); - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex8 = mod; @@ -17021,7 +15649,7 @@ rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.current_round; log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; @@ -17164,10 +15792,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb /* Case C */ ae->ack_val_set = ACK_RWND; } - if (rack->sack_attack_disable > 0) { - rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); - rack->r_ctl.ack_during_sd++; - } + rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); /* Validate timestamp */ if (ae->flags & HAS_TSTMP) { @@ -17278,7 +15903,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb * ack is beyond the largest seq we sent. */ if ((tp->t_flags & TF_ACKNOW) == 0) { - ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); + ctf_ack_war_checks(tp); if (tp->t_flags && TF_ACKNOW) rack->r_wanted_output = 1; } @@ -17374,28 +15999,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb * since cum-ack moved forward. */ rack->probe_not_answered = 0; - if (rack->sack_attack_disable == 0) - rack_do_decay(rack); - if (acked >= segsiz) { - /* - * You only get credit for - * MSS and greater (and you get extra - * credit for larger cum-ack moves). - */ - int ac; - - ac = acked / segsiz; - rack->r_ctl.ack_count += ac; - counter_u64_add(rack_ack_total, ac); - } - if (rack->r_ctl.ack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has @@ -17409,16 +16012,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb } if (acked > sbavail(&so->so_snd)) acked_amount = sbavail(&so->so_snd); -#ifdef TCP_SAD_DETECTION - /* - * We only care on a cum-ack move if we are in a sack-disabled - * state. We have already added in to the ack_count, and we never - * would disable on a cum-ack move, so we only care to do the - * detection if it may "undo" it, i.e. we were in disabled already. 
- */ - if (rack->sack_attack_disable) - rack_do_detection(tp, rack, acked_amount, segsiz); -#endif if (IN_FASTRECOVERY(tp->t_flags) && (rack->rack_no_prr == 0)) rack_update_prr(tp, rack, acked_amount, high_seq); @@ -17489,7 +16082,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb /* Must be non-newreno (cubic) getting too ahead of itself */ tp->snd_cwnd = p_cwnd; } - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); mfree = sbcut_locked(&so->so_snd, acked_amount); tp->snd_una = high_seq; /* Note we want to hold the sb lock through the sendmap adjust */ @@ -17933,7 +16526,14 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); - + if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { + /* + * We don't look at sack's from the + * peer because the MSS is too small which + * can subject us to an attack. + */ + to.to_flags &= ~TOF_SACK; + } if ((tp->t_state >= TCPS_FIN_WAIT_1) && (tp->t_flags & TF_GPUTINPROG)) { /* @@ -17972,7 +16572,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); } #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; @@ -18042,10 +16642,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, /* Remove ack required flag if set, we have one */ if (thflags & TH_ACK) rack->rc_ack_required = 0; - if (rack->sack_attack_disable > 0) { - rack->r_ctl.ack_during_sd++; - rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); - } + rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; retval = 0; @@ -18059,7 +16656,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); #ifdef TCP_ACCOUNTING sched_unpin(); #endif @@ -18323,7 +16920,7 @@ do_output_now: } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { goto do_output_now; } else if ((no_output == 1) && - (nxt_pkt == 0) && + (nxt_pkt == 0) && (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. 
Use @@ -18441,7 +17038,6 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt = 0, thresh = 0, ts_low = 0; - int no_sack = 0; /* Return the next guy to be re-transmitted */ if (tqhash_empty(rack->r_ctl.tqh)) { @@ -18464,11 +17060,7 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) return (NULL); } check_it: - if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) || - (rack->sack_attack_disable > 0)) { - no_sack = 1; - } - if ((no_sack > 0) && + if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { /* * No sack so we automatically do the 3 strikes and @@ -18498,8 +17090,7 @@ check_it: return (NULL); } if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || - ((rsm->r_flags & RACK_SACK_PASSED) && - (rack->sack_attack_disable == 0))) { + ((rsm->r_flags & RACK_SACK_PASSED))) { /* * We have passed the dup-ack threshold <or> * a SACK has indicated this is missing. @@ -18589,6 +17180,12 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->use_fixed_rate; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_always_pace; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->gp_ready; log.u_bbr.bbr_substate = quality; log.u_bbr.bbr_state = rack->dgp_on; log.u_bbr.bbr_state <<= 1; @@ -18755,7 +17352,7 @@ at_lt_bw: union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack_bw_multipler; log.u_bbr.flex2 = len; @@ -18839,122 +17436,11 @@ at_lt_bw: return (slot); } -static uint32_t -rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs) -{ - uint64_t calc; - - rack->rc_policer_should_pace = 0; - calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size; - calc /= 100; - /* - * Now lets look at if we want more than is in the bucket <or> - * we want more than is reserved in the bucket. - */ - if (rack_verbose_logging > 0) - policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8); - if ((calc > rack->r_ctl.current_policer_bucket) || - (len >= (rack->r_ctl.current_policer_bucket - calc))) { - /* - * We may want to pace depending on if we are going - * into the reserve or not. - */ - uint32_t newlen; - - if (calc > rack->r_ctl.current_policer_bucket) { - /* - * This will eat into the reserve if we - * don't have room at all some lines - * below will catch it. - */ - newlen = rack->r_ctl.policer_max_seg; - rack->rc_policer_should_pace = 1; - } else { - /* - * We have all of the reserve plus something in the bucket - * that we can give out. - */ - newlen = rack->r_ctl.current_policer_bucket - calc; - if (newlen < rack->r_ctl.policer_max_seg) { - /* - * Into the reserve to get a full policer_max_seg - * so we set the len to that and eat into - * the reserve. If we go over the code - * below will make us wait. 
- */ - newlen = rack->r_ctl.policer_max_seg; - rack->rc_policer_should_pace = 1; - } - } - if (newlen > rack->r_ctl.current_policer_bucket) { - /* We have to wait some */ - *needs = newlen - rack->r_ctl.current_policer_bucket; - return (0); - } - if (rack_verbose_logging > 0) - policer_detection_log(rack, len, segsiz, newlen, 0, 9); - len = newlen; - } /* else we have all len available above the reserve */ - if (rack_verbose_logging > 0) - policer_detection_log(rack, len, segsiz, calc, 0, 10); - return (len); -} - -static uint32_t -rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line) -{ - /* - * Given a send of len, and a token bucket set at current_policer_bucket_size - * are we close enough to the end of the bucket that we need to pace? If so - * calculate out a time and return it. Otherwise subtract the tokens from - * the bucket. - */ - uint64_t calc; - - if ((rack->r_ctl.policer_bw == 0) || - (rack->r_ctl.policer_bucket_size < segsiz)) { - /* - * We should have an estimate here... - */ - return (0); - } - calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size; - calc /= 100; - if ((rack->r_ctl.current_policer_bucket < len) || - (rack->rc_policer_should_pace == 1) || - ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) { - /* we need to pace */ - uint64_t lentim, res; - uint32_t slot; - - lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC; - res = lentim / rack->r_ctl.policer_bw; - slot = (uint32_t)res; - if (rack->r_ctl.current_policer_bucket > len) - rack->r_ctl.current_policer_bucket -= len; - else - rack->r_ctl.current_policer_bucket = 0; - policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5); - rack->rc_policer_should_pace = 0; - return(slot); - } - /* Just take tokens out of the bucket and let rack do whatever it would have */ - policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6); - if (len < rack->r_ctl.current_policer_bucket) { - rack->r_ctl.current_policer_bucket -= len; - } else { - rack->r_ctl.current_policer_bucket = 0; - } - return (0); -} - - static int32_t rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) { uint64_t srtt; int32_t slot = 0; - int32_t minslot = 0; int can_start_hw_pacing = 1; int err; int pace_one; @@ -18964,25 +17450,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str pace_one = 1; else pace_one = 0; - if (rack->rc_policer_detected == 1) { - /* - * A policer has been detected and we - * have all of our data (policer-bw and - * policer bucket size) calculated. Call - * into the function to find out if we are - * overriding the time. - */ - slot = rack_policed_sending(rack, tp, len, segsiz, line); - if (slot) { - uint64_t logbw; - - logbw = rack->r_ctl.current_policer_bucket; - logbw <<= 32; - logbw |= rack->r_ctl.policer_bucket_size; - rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0); - return(slot); - } - } if (rack->rc_always_pace == 0) { /* * We use the most optimistic possible cwnd/srtt for @@ -18992,7 +17459,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * the peer to have a gap in data sending. 
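The two policer helpers removed above (together with the idle-time credit routine removed later in this diff) implemented a simple token bucket: each send spends its length in tokens, a configurable percentage of the bucket is held in reserve, a send that would dip into the reserve is paced at the measured policer bandwidth, and idle time refills the bucket up to its depth. The sketch below captures only that arithmetic; every name is example-local and USEC_PER_SEC_EX stands in for HPTS_USEC_IN_SEC.

#include <stdint.h>

#define USEC_PER_SEC_EX 1000000ULL

struct policer_ex {
	uint64_t bw;		/* measured policer rate, bytes/sec */
	uint32_t bucket_size;	/* bucket depth, bytes */
	uint32_t bucket;	/* tokens currently available, bytes */
	uint32_t reserve_pct;	/* share of the bucket held in reserve */
};

/* Microseconds needed to pace a send of len bytes at the policer rate. */
static uint32_t
policer_pace_usecs_ex(const struct policer_ex *p, uint32_t len)
{
	if (p->bw == 0)
		return (0);
	return ((uint32_t)(((uint64_t)len * USEC_PER_SEC_EX) / p->bw));
}

/* Spend tokens; return a pacing delay when the send dips into the reserve. */
static uint32_t
policer_send_ex(struct policer_ex *p, uint32_t len)
{
	uint64_t reserve = ((uint64_t)p->reserve_pct * p->bucket_size) / 100;
	uint32_t delay = 0;

	if ((p->bucket < len) || ((uint64_t)(p->bucket - len) <= reserve))
		delay = policer_pace_usecs_ex(p, len);
	p->bucket = (p->bucket > len) ? (p->bucket - len) : 0;
	return (delay);
}

/* Credit back tokens for idle time, capped at the bucket depth. */
static void
policer_idle_refill_ex(struct policer_ex *p, uint64_t idle_usecs)
{
	uint64_t credit = (idle_usecs * p->bw) / USEC_PER_SEC_EX;

	if (credit >= (uint64_t)(p->bucket_size - p->bucket))
		p->bucket = p->bucket_size;
	else
		p->bucket += (uint32_t)credit;
}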
*/ uint64_t cwnd, tr_perms = 0; - int32_t reduce = 0; + int32_t reduce; old_method: /* @@ -19029,7 +17496,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str slot -= reduce; } else slot = 0; - } + } else + reduce = 0; slot *= HPTS_USEC_IN_MSEC; if (rack->rc_pace_to_cwnd) { uint64_t rate_wanted = 0; @@ -19079,8 +17547,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rack->r_ctl.rc_last_us_rtt, 88, __LINE__, NULL, gain); } - if ((bw_est == 0) || (rate_wanted == 0) || - ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { + if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && + (rack->use_fixed_rate == 0)) { /* * No way yet to make a b/w estimate or * our raise is set incorrectly. @@ -19305,11 +17773,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str } } } - if (minslot && (minslot > slot)) { - rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, - 98, __LINE__, NULL, 0); - slot = minslot; - } done_w_hdwr: if (rack_limit_time_with_srtt && (rack->use_fixed_rate == 0) && @@ -19536,7 +17999,7 @@ start_set: rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, - (uint64_t)my_rsm, + (uintptr_t)my_rsm, tp->gput_ts, (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, @@ -19589,7 +18052,7 @@ use_latest: rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, - (uint64_t)my_rsm, + (uintptr_t)my_rsm, tp->gput_ts, (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, __LINE__, NULL, 0); @@ -19647,7 +18110,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_ union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = error; log.u_bbr.flex2 = flags; @@ -19912,7 +18375,7 @@ rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); #endif - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = p_rate; log.u_bbr.flex2 = p_queue; @@ -20365,7 +18828,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma counter_u64_add(rack_collapsed_win_rxt, 1); counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); } - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -20388,7 +18851,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - if (rsm && (rsm->r_rtr_cnt > 0)) { + if (rsm->r_rtr_cnt > 0) { /* * When we have a retransmit we want to log the * burst at send and flight at send from before. 
@@ -20405,7 +18868,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; log.u_bbr.delivered = 0; - log.u_bbr.rttProp = (uint64_t)rsm; + log.u_bbr.rttProp = (uintptr_t)rsm; log.u_bbr.delRate = rsm->r_flags; log.u_bbr.delRate <<= 31; log.u_bbr.delRate |= rack->r_must_retran; @@ -20515,11 +18978,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); } sched_unpin(); @@ -20588,7 +19047,7 @@ rack_sndbuf_autoscale(struct tcp_rack *rack) static int rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, - uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) + uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line) { /* * Enter to do fast output. We are given that the sched_pin is @@ -20761,7 +19220,7 @@ again: } if (rack->r_ctl.fsb.rfo_apply_push && (len == rack->r_ctl.fsb.left_to_send)) { - tcp_set_flags(th, flags | TH_PUSH); + flags |= TH_PUSH; add_flag |= RACK_HAD_PUSH; } if ((m->m_next == NULL) || (len <= 0)){ @@ -20918,7 +19377,7 @@ again: if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -20940,11 +19399,11 @@ again: log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex5 = log.u_bbr.inflight; log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; - log.u_bbr.delivered = 0; + log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send; log.u_bbr.rttProp = 0; log.u_bbr.delRate = rack->r_must_retran; log.u_bbr.delRate <<= 1; - log.u_bbr.pkt_epoch = __LINE__; + log.u_bbr.pkt_epoch = line; /* For fast output no retrans so just inflight and how many mss we send */ log.u_bbr.flex5 = log.u_bbr.inflight; log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); @@ -21008,7 +19467,6 @@ again: } rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); - m = NULL; if (tp->snd_una == tp->snd_max) { rack->r_ctl.rc_tlp_rxt_last_time = cts; rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); @@ -21018,7 +19476,7 @@ again: tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); rack->forced_ack = 0; /* If we send something zap the FA flag */ - tot_len += len; + *tot_len += len; if ((tp->t_flags & TF_GPUTINPROG) == 0) rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); tp->snd_max += len; @@ -21043,9 +19501,9 @@ again: rack->r_fast_output = 0; rack->r_ctl.fsb.left_to_send = 0; /* At the end of fast_output scale up the sb */ - SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); + SOCK_SENDBUF_LOCK(rack->rc_inp->inp_socket); rack_sndbuf_autoscale(rack); - SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); + SOCK_SENDBUF_UNLOCK(rack->rc_inp->inp_socket); } if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; @@ -21054,6 +19512,7 @@ again: } if ((rack->r_ctl.fsb.left_to_send >= segsiz) && (max_val > len) && + (*tot_len < rack->r_ctl.rc_pace_max_segs) && (tso == 0)) { max_val -= len; 
len = segsiz; @@ -21065,18 +19524,14 @@ again: } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__); - rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); + slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); + rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { - tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz); } sched_unpin(); #endif @@ -21189,25 +19644,6 @@ restart: return (NULL); } -static void -rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line) -{ - /* - * We were idle some time (idle_t) and so our policer bucket - * needs to grow. It can go no higher than policer_bucket_size. - */ - uint64_t len; - - len = idle_t * rack->r_ctl.policer_bw; - len /= HPTS_USEC_IN_SEC; - rack->r_ctl.current_policer_bucket += (uint32_t)len; - if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) { - rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size; - } - if (rack_verbose_logging > 0) - policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7); -} - static inline void rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) { @@ -21425,8 +19861,6 @@ rack_output(struct tcpcb *tp) crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_BLOCKED]++; } sched_unpin(); @@ -21459,20 +19893,36 @@ rack_output(struct tcpcb *tp) TCPS_HAVEESTABLISHED(tp->t_state)) { rack_set_state(tp, rack); } + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + minseg = segsiz; + if (rack->r_ctl.rc_pace_max_segs == 0) + pace_max_seg = rack->rc_user_set_max_segs * segsiz; + else + pace_max_seg = rack->r_ctl.rc_pace_max_segs; if ((rack->r_fast_output) && (doing_tlp == 0) && (tp->rcv_numsacks == 0)) { int ret; error = 0; - ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); - if (ret >= 0) + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); + if (ret > 0) return(ret); else if (error) { inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; goto nomore; + } else { + /* Return == 0, if there is more we can send tot_len wise fall through and send */ + if (tot_len_this_send >= pace_max_seg) + return (ret); +#ifdef TCP_ACCOUNTING + /* We need to re-pin since fast_output un-pined */ + sched_pin(); + ts_val = get_cyclecount(); +#endif + /* Fall back out so we can send any more that may bring us to pace_max_seg */ } } inp = rack->rc_inp; @@ -21486,10 +19936,11 @@ rack_output(struct tcpcb *tp) (tp->t_state == TCPS_SYN_SENT)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ - cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; - so = inp->inp_socket; - sb = &so->so_snd; - goto just_return_nolock; + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (0); 
} /* * Determine length of data that should be transmitted, and flags @@ -21524,35 +19975,14 @@ rack_output(struct tcpcb *tp) rack_exit_probertt(rack, cts); } } - } - if(rack->policer_detect_on) { - /* - * If we are doing policer detetion we at a minium - * record the time but if possible add back to - * the bucket based on the idle time. - */ - uint64_t idle_t, u64_cts; - - segsiz = min(ctf_fixed_maxseg(tp), - rack->r_ctl.rc_pace_min_segs); - u64_cts = tcp_tv_to_lusectick(&tv); - if ((rack->rc_policer_detected == 1) && - (rack->r_ctl.policer_bucket_size > segsiz) && - (rack->r_ctl.policer_bw > 0) && - (u64_cts > rack->r_ctl.last_sendtime)) { - /* We are being policed add back the time */ - idle_t = u64_cts - rack->r_ctl.last_sendtime; - rack_credit_back_policer_idle_time(rack, idle_t, __LINE__); - } - rack->r_ctl.last_sendtime = u64_cts; - } + } else + tot_idle = 0; if (rack_use_fsb && (rack->r_ctl.fsb.tcp_ip_hdr) && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); if (rack->rc_sendvars_notset == 1) { - rack->r_ctl.idle_snd_una = tp->snd_una; rack->rc_sendvars_notset = 0; /* * Make sure any TCP timers (keep-alive) is not running. @@ -21599,12 +20029,6 @@ again: ms_cts = tcp_tv_to_mssectick(&tv); tso = 0; mtu = 0; - segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); - minseg = segsiz; - if (rack->r_ctl.rc_pace_max_segs == 0) - pace_max_seg = rack->rc_user_set_max_segs * segsiz; - else - pace_max_seg = rack->r_ctl.rc_pace_max_segs; if (TCPS_HAVEESTABLISHED(tp->t_state) && (rack->r_ctl.pcm_max_seg == 0)) { /* @@ -21620,7 +20044,7 @@ again: rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; } } - if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { + if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { uint32_t rw_avail, cwa; if (tp->snd_wnd > ctf_outstanding(tp)) @@ -21664,7 +20088,7 @@ again: len = 0; rsm = NULL; if (flags & TH_RST) { - SOCKBUF_LOCK(&inp->inp_socket->so_snd); + SOCK_SENDBUF_LOCK(inp->inp_socket); so = inp->inp_socket; sb = &so->so_snd; goto send; @@ -21841,7 +20265,6 @@ again: * as long as we are not retransmiting. */ if ((rsm == NULL) && - (rack->do_detection == 0) && (V_tcp_map_entries_limit > 0) && (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); @@ -21869,19 +20292,10 @@ again: ((rsm->r_flags & RACK_HAS_FIN) == 0)) { int ret; - if ((rack->rc_policer_detected == 1) && - (rack->r_ctl.policer_bucket_size > segsiz) && - (rack->r_ctl.policer_bw > 0)) { - /* Check to see if there is room */ - if (rack->r_ctl.current_policer_bucket < len) { - goto skip_fast_output; - } - } ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); if (ret == 0) return (0); } -skip_fast_output: so = inp->inp_socket; sb = &so->so_snd; if (do_a_prefetch == 0) { @@ -21933,7 +20347,7 @@ skip_fast_output: kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); if ((sack_rxmit == 0) && (TCPS_HAVEESTABLISHED(tp->t_state) || (tp->t_flags & TF_FASTOPEN))) { @@ -22072,43 +20486,6 @@ skip_fast_output: prefetch_so_done = 1; } orig_len = len; - if ((rack->rc_policer_detected == 1) && - (rack->r_ctl.policer_bucket_size > segsiz) && - (rack->r_ctl.policer_bw > 0) && - (len > 0)) { - /* - * Ok we believe we have a policer watching - * what we send, can we send len? If not can - * we tune it down to a smaller value? 
- */ - uint32_t plen, buck_needs; - - plen = rack_policer_check_send(rack, len, segsiz, &buck_needs); - if (plen == 0) { - /* - * We are not allowed to send. How long - * do we need to pace for i.e. how long - * before len is available to send? - */ - uint64_t lentime; - - lentime = buck_needs; - lentime *= HPTS_USEC_IN_SEC; - lentime /= rack->r_ctl.policer_bw; - slot = (uint32_t)lentime; - tot_len_this_send = 0; - SOCKBUF_UNLOCK(sb); - if (rack_verbose_logging > 0) - policer_detection_log(rack, len, slot, buck_needs, 0, 12); - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); - rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use); - goto just_return_clean; - } - if (plen < len) { - sendalot = 0; - len = plen; - } - } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know @@ -22308,7 +20685,7 @@ skip_fast_output: if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && + sack_rxmit == 0 && ipoptlen == 0) tso = 1; { @@ -22480,7 +20857,7 @@ dontupdate: * No reason to send a segment, just return. */ just_return: - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); just_return_nolock: { int app_limited = CTF_JR_SENT_DATA; @@ -22507,14 +20884,13 @@ just_return_nolock: rack->r_ctl.fsb.recwin = recwin; slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); if ((error == 0) && - (rack->rc_policer_detected == 0) && rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && (ipoptlen == 0) && - (tp->rcv_numsacks == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && ((IN_RECOVERY(tp->t_flags)) == 0) && + (doing_tlp == 0) && (rack->r_must_retran == 0) && ((tp->t_flags & TF_NEEDFIN) == 0) && (len > 0) && (orig_len > 0) && @@ -22656,7 +21032,7 @@ just_return_nolock: } else log = 1; } - /* Mark the last packet has app limited */ + /* Mark the last packet as app limited */ rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) @@ -22692,7 +21068,6 @@ just_return_nolock: rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); } -just_return_clean: #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && rack->r_ctl.rc_scw) { @@ -22705,19 +21080,13 @@ just_return_clean: crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_DATA]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); } } else { crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_LIMITED]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); } } @@ -22742,7 +21111,7 @@ send: rack->r_ctl.rc_agg_early = 0; rack->r_early = 0; rack->r_late = 0; - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); goto skip_all_send; } } @@ -22759,7 +21128,8 @@ send: * is acked first. 
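The hunks above and the earlier ones remove the policer token-bucket machinery (the policed-sending check, the idle-time credit, and this pre-send test). For context, a hedged user-space model of the two conversions that code performed — crediting tokens back for idle time and turning a token shortfall into a wait — might look like this; the names are invented and this is not the removed kernel code:

#include <stdint.h>

#define USECS_PER_SEC 1000000ULL

struct toy_bucket {
        uint64_t bw;            /* policed bandwidth, bytes/sec */
        uint32_t size;          /* bucket capacity, bytes */
        uint32_t tokens;        /* currently available bytes */
};

/* Idle for idle_usecs: refill tokens at bw, capped at the bucket size. */
static void
bucket_credit_idle(struct toy_bucket *b, uint64_t idle_usecs)
{
        uint64_t add;

        add = (idle_usecs * b->bw) / USECS_PER_SEC;
        if (b->tokens + add > b->size)
                b->tokens = b->size;
        else
                b->tokens += (uint32_t)add;
}

/* Short "deficit" bytes: how long until the policer would allow them? */
static uint32_t
bucket_wait_usecs(const struct toy_bucket *b, uint32_t deficit)
{
        if (b->bw == 0)
                return (0);     /* no estimate yet, nothing to wait on */
        return ((uint32_t)(((uint64_t)deficit * USECS_PER_SEC) / b->bw));
}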
*/ flags &= ~TH_FIN; - if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (sbused(sb) == (tp->snd_max - tp->snd_una)) && ((tp->snd_max - tp->snd_una) <= segsiz)) { /* * Ok less than or right at a MSS is @@ -22958,13 +21328,11 @@ send: if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_FAIL]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); } sched_unpin(); @@ -23016,7 +21384,8 @@ send: if (max_len <= 0) { len = 0; } else if (len > max_len) { - sendalot = 1; + if (doing_tlp == 0) + sendalot = 1; len = max_len; mark = 2; } @@ -23061,7 +21430,7 @@ send: * byte of the payload can be put into the * TCP segment. */ - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); error = EMSGSIZE; sack_rxmit = 0; goto out; @@ -23144,7 +21513,7 @@ send: m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); error = ENOBUFS; sack_rxmit = 0; goto out; @@ -23202,7 +21571,7 @@ send: tso = 0; } if (m->m_next == NULL) { - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; @@ -23245,10 +21614,9 @@ send: flags |= TH_PUSH; add_flag |= RACK_HAD_PUSH; } - - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); } else { - SOCKBUF_UNLOCK(sb); + SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW) KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) @@ -23271,7 +21639,7 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } - SOCKBUF_UNLOCK_ASSERT(sb); + SOCK_SENDBUF_UNLOCK_ASSERT(so); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); @@ -23538,7 +21906,7 @@ send: if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; @@ -23592,7 +21960,7 @@ send: } log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.delivered = sendalot; - log.u_bbr.rttProp = (uint64_t)rsm; + log.u_bbr.rttProp = (uintptr_t)rsm; log.u_bbr.pkt_epoch = __LINE__; if (rsm) { log.u_bbr.delRate = rsm->r_flags; @@ -23714,6 +22082,8 @@ out: * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. 
*/ + if ((rsm == NULL) && doing_tlp) + add_flag |= RACK_TLP; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, rack_to_usec_ts(&tv), rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); @@ -23800,15 +22170,14 @@ out: rack->r_ctl.rc_prr_sndcnt = 0; } sub_from_prr = 0; - if (doing_tlp) { - /* Make sure the TLP is added */ - add_flag |= RACK_TLP; - } else if (rsm) { - /* If its a resend without TLP then it must not have the flag */ - rsm->r_flags &= ~RACK_TLP; - } - - + if (rsm != NULL) { + if (doing_tlp) + /* Make sure the TLP is added */ + rsm->r_flags |= RACK_TLP; + else + /* If its a resend without TLP then it must not have the flag */ + rsm->r_flags &= ~RACK_TLP; + } if ((error == 0) && (len > 0) && (tp->snd_una == tp->snd_max)) @@ -23933,7 +22302,7 @@ out: len = n_len; sb_offset = tp->snd_max - tp->snd_una; /* Re-lock for the next spin */ - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); goto send; } } else { @@ -23952,7 +22321,7 @@ out: len = n_len; sb_offset = tp->snd_max - tp->snd_una; /* Re-lock for the next spin */ - SOCKBUF_LOCK(sb); + SOCK_SENDBUF_LOCK(so); goto send; } } @@ -23982,8 +22351,6 @@ nomore: crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_FAIL]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); } sched_unpin(); @@ -24037,8 +22404,6 @@ nomore: crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_FAIL]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); } sched_unpin(); @@ -24046,11 +22411,13 @@ nomore: return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); + /* FALLTHROUGH */ case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + error = 0; } /* FALLTHROUGH */ default: @@ -24060,8 +22427,6 @@ nomore: crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_FAIL]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); } sched_unpin(); @@ -24150,8 +22515,7 @@ enobufs: ((flags & (TH_SYN|TH_FIN)) == 0) && (rsm == NULL) && (ipoptlen == 0) && - (tp->rcv_numsacks == 0) && - (rack->rc_policer_detected == 0) && + (doing_tlp == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && ((IN_RECOVERY(tp->t_flags)) == 0) && @@ -24178,8 +22542,8 @@ enobufs: rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && (rsm == NULL) && + (doing_tlp == 0) && (ipoptlen == 0) && - (tp->rcv_numsacks == 0) && (rack->r_must_retran == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && @@ -24195,7 +22559,7 @@ enobufs: segsiz, pace_max_seg, hw_tls, flags); if (rack->r_fast_output) { error = 0; - ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); if (ret >= 0) return (ret); else if (error) @@ -24215,18 +22579,12 @@ skip_all_send: if (tot_len_this_send) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_DATA]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_proc_time[SND_OUT_DATA] += crtsc; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); } } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[SND_OUT_ACK]++; - } - if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 
tp->tcp_proc_time[SND_OUT_ACK] += crtsc; } } @@ -24562,28 +22920,7 @@ process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) static int rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si) { - /* - * Gather rack specific information. - */ - struct tcp_rack *rack; - - rack = (struct tcp_rack *)tp->t_fb_ptr; /* We pulled a SSI info log out what was there */ - policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20); - if (rack->policer_detect_on) { - si->policer_detection_enabled = 1; - if (rack->rc_policer_detected) { - si->policer_detected = 1; - si->policer_bucket_size = rack->r_ctl.policer_bucket_size; - si->policer_last_bw = rack->r_ctl.policer_bw; - } else { - si->policer_detected = 0; - si->policer_bucket_size = 0; - si->policer_last_bw = 0; - } - si->current_round = rack->r_ctl.current_round; - si->highly_buffered = rack->rc_highly_buffered; - } si->bytes_transmitted = tp->t_sndbytes; si->bytes_retransmitted = tp->t_snd_rxt_bytes; return (0); @@ -24603,7 +22940,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, switch (sopt_name) { case TCP_RACK_SET_RXT_OPTIONS: - if ((optval >= 0) && (optval <= 2)) { + if (optval <= 2) { rack_init_retransmit_value(rack, optval); } else { /* @@ -24650,7 +22987,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, rack->r_ctl.saved_hibeta = optval; if (rack->rc_pacing_cc_set) rack_undo_cc_pacing(rack); - rack->r_ctl.rc_saved_beta.beta = optval; + rack->r_ctl.rc_saved_beta = optval; } if (rack->rc_pacing_cc_set == 0) rack_set_cc_pacing(rack); @@ -24701,8 +23038,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, * Not pacing yet so set it into our local * rack pcb storage. */ - rack->r_ctl.rc_saved_beta.beta_ecn = optval; - rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; + rack->r_ctl.rc_saved_beta_ecn = optval; } break; case TCP_DEFER_OPTIONS: @@ -24814,36 +23150,6 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, case TCP_RACK_DGP_IN_REC: error = EINVAL; break; - case TCP_POLICER_DETECT: /* URL:pol_det */ - RACK_OPTS_INC(tcp_pol_detect); - rack_translate_policer_detect(rack, optval); - break; - case TCP_POLICER_MSS: - RACK_OPTS_INC(tcp_pol_mss); - rack->r_ctl.policer_del_mss = (uint8_t)optval; - if (optval & 0x00000100) { - /* - * Value is setup like so: - * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM - * Where MMMM MMMM is MSS setting - * I (9th bit) is the Postive value that - * says it is being set (if its 0 then the - * upper bits 11 - 32 have no meaning. - * This allows setting it off with - * 0x000001MM. - * - * The 10th bit is used to turn on the - * alternate median (not the expanded one). 
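A hedged decoder for the packed option value described in the removed comment above; the struct and function names are invented for illustration only.

#include <stdint.h>

/*
 * Layout per the removed comment: VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM
 *   bits 0-7   (M): MSS value
 *   bit  8     (I): set flag; if clear the upper bits carry no meaning
 *   bit  9     (A): use the alternate median
 *   bits 10-31 (V): bandwidth-compensation value
 */
struct policer_mss_opt {
        uint8_t  del_mss;
        uint8_t  alt_median;
        uint32_t bw_comp;
};

static void
decode_policer_mss(uint32_t optval, struct policer_mss_opt *o)
{
        o->del_mss = (uint8_t)optval;
        o->bw_comp = (optval & 0x00000100) ? (optval >> 10) : 0;
        o->alt_median = (optval & 0x00000200) ? 1 : 0;
}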
- * - */ - rack->r_ctl.pol_bw_comp = (optval >> 10); - } - if (optval & 0x00000200) { - rack->r_ctl.policer_alt_median = 1; - } else { - rack->r_ctl.policer_alt_median = 0; - } - break; case TCP_RACK_PACE_TO_FILL: RACK_OPTS_INC(tcp_fillcw); if (optval == 0) @@ -24929,11 +23235,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, } break; case TCP_RACK_DO_DETECTION: - RACK_OPTS_INC(tcp_rack_do_detection); - if (optval == 0) - rack->do_detection = 0; - else - rack->do_detection = 1; + error = EINVAL; break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { @@ -25462,7 +23764,7 @@ rack_inherit(struct tcpcb *tp, struct inpcb *parent) if (src->rack_hibeta != dest->rack_hibeta) { cnt++; if (src->rack_hibeta) { - dest->r_ctl.rc_saved_beta.beta = src->r_ctl.rc_saved_beta.beta; + dest->r_ctl.rc_saved_beta = src->r_ctl.rc_saved_beta; dest->rack_hibeta = 1; } else { dest->rack_hibeta = 0; @@ -25474,12 +23776,8 @@ rack_inherit(struct tcpcb *tp, struct inpcb *parent) cnt++; } /* TCP_RACK_PACING_BETA_ECN */ - if (dest->r_ctl.rc_saved_beta.beta_ecn != src->r_ctl.rc_saved_beta.beta_ecn) { - dest->r_ctl.rc_saved_beta.beta_ecn = src->r_ctl.rc_saved_beta.beta_ecn; - cnt++; - } - if (dest->r_ctl.rc_saved_beta.newreno_flags != src->r_ctl.rc_saved_beta.newreno_flags) { - dest->r_ctl.rc_saved_beta.newreno_flags = src->r_ctl.rc_saved_beta.newreno_flags; + if (dest->r_ctl.rc_saved_beta_ecn != src->r_ctl.rc_saved_beta_ecn) { + dest->r_ctl.rc_saved_beta_ecn = src->r_ctl.rc_saved_beta_ecn; cnt++; } /* We do not do TCP_DEFER_OPTIONS */ @@ -25514,43 +23812,6 @@ rack_inherit(struct tcpcb *tp, struct inpcb *parent) dest->r_limit_scw = src->r_limit_scw; cnt++; } - /* TCP_POLICER_DETECT */ - if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) { - dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold; - cnt++; - } - if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) { - dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold; - cnt++; - } - if (dest->r_ctl.policer_med_threshold != src->r_ctl.policer_med_threshold) { - dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold; - cnt++; - } - if (dest->policer_detect_on != src->policer_detect_on) { - dest->policer_detect_on = src->policer_detect_on; - cnt++; - } - - if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) { - dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val; - cnt++; - } - /* TCP_POLICER_MSS */ - if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) { - dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss; - cnt++; - } - - if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) { - dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp; - cnt++; - } - - if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) { - dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median; - cnt++; - } /* TCP_RACK_PACE_TO_FILL */ if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) { dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd; @@ -25625,11 +23886,6 @@ rack_inherit(struct tcpcb *tp, struct inpcb *parent) dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; cnt++; } - /* TCP_RACK_DO_DETECTION */ - if (dest->do_detection != src->do_detection) { - dest->do_detection = src->do_detection; - cnt++; - } /* TCP_RACK_TLP_USE */ if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) { dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use; @@ -25908,7 +24164,7 @@ static struct 
tcp_function_block __tcp_rack = { .tfb_compute_pipe = rack_compute_pipe, .tfb_stack_info = rack_stack_information, .tfb_inherit = rack_inherit, - .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, + .tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK, }; @@ -26007,8 +24263,6 @@ rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ /* End pacing related */ - case TCP_POLICER_DETECT: /* URL:pol_det */ - case TCP_POLICER_MSS: /* URL:pol_mss */ case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ case TCP_RACK_MIN_TO: /* URL:min_to */ @@ -26020,7 +24274,6 @@ rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) case TCP_RACK_TLP_USE: /* URL:tlp_use */ case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ - case TCP_RACK_DO_DETECTION: /* URL:detect */ case TCP_NO_PRR: /* URL:noprr */ case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ case TCP_DATA_AFTER_CLOSE: /* no URL */ @@ -26199,20 +24452,34 @@ rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) * when you exit recovery. */ case TCP_RACK_PACING_BETA: + if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) + error = EINVAL; + else if (rack->rc_pacing_cc_set == 0) + optval = rack->r_ctl.rc_saved_beta; + else { + /* + * Reach out into the CC data and report back what + * I have previously set. Yeah it looks hackish but + * we don't want to report the saved values. + */ + if (tp->t_ccv.cc_data) + optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; + else + error = EINVAL; + } break; - /* - * Beta_ecn is the congestion control value for NewReno that influences how - * much of a backoff happens when a ECN mark is detected. It is normally set - * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when - * you exit recovery. Note that classic ECN has a beta of 50, it is only - * ABE Ecn that uses this "less" value, but we do too with pacing :) - */ - + /* + * Beta_ecn is the congestion control value for NewReno that influences how + * much of a backoff happens when a ECN mark is detected. It is normally set + * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when + * you exit recovery. 
Note that classic ECN has a beta of 50, it is only + * ABE Ecn that uses this "less" value, but we do too with pacing :) + */ case TCP_RACK_PACING_BETA_ECN: if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) error = EINVAL; else if (rack->rc_pacing_cc_set == 0) - optval = rack->r_ctl.rc_saved_beta.beta_ecn; + optval = rack->r_ctl.rc_saved_beta_ecn; else { /* * Reach out into the CC data and report back what @@ -26253,12 +24520,6 @@ rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) case TCP_RACK_HI_BETA: optval = rack->rack_hibeta; break; - case TCP_POLICER_MSS: - optval = rack->r_ctl.policer_del_mss; - break; - case TCP_POLICER_DETECT: - optval = rack->r_ctl.saved_policer_val; - break; case TCP_DEFER_OPTIONS: optval = rack->defer_options; break; @@ -26327,7 +24588,7 @@ rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) } break; case TCP_RACK_DO_DETECTION: - optval = rack->do_detection; + error = EINVAL; break; case TCP_RACK_MBUF_QUEUE: /* Now do we use the LRO mbuf-queue feature */ diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index 4a4a8af2bd78..d1c4ba58bf55 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -361,26 +361,15 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct mbuf *m, int has_pkt) int32_t retval, nxt_pkt, tlen, off; int etype = 0; uint16_t drop_hdrlen; - uint8_t iptos, no_vn=0; + uint8_t iptos; inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); NET_EPOCH_ASSERT(); - - if (m) - ifp = m_rcvif(m); - else - ifp = NULL; - if (ifp == NULL) { - /* - * We probably should not work around - * but kassert, since lro alwasy sets rcvif. - */ - no_vn = 1; - goto skip_vnet; - } + KASSERT(m != NULL, ("ctf_process_inbound_raw: m == NULL")); + ifp = m_rcvif(m); + KASSERT(ifp != NULL, ("ctf_process_inbound_raw: ifp == NULL")); CURVNET_SET(ifp->if_vnet); -skip_vnet: tcp_get_usecs(&tv); while (m) { m_save = m->m_nextpkt; @@ -466,19 +455,15 @@ skip_vnet: m_freem(m); m = m_save; } - if (no_vn == 0) { - CURVNET_RESTORE(); - } + CURVNET_RESTORE(); INP_UNLOCK_ASSERT(inp); - return(retval); + return (retval); } skipped_pkt: m = m_save; } - if (no_vn == 0) { - CURVNET_RESTORE(); - } - return(retval); + CURVNET_RESTORE(); + return (0); } int @@ -532,28 +517,19 @@ ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, } void -ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt) +ctf_ack_war_checks(struct tcpcb *tp) { - if ((ts != NULL) && (cnt != NULL) && - (tcp_ack_war_time_window > 0) && - (tcp_ack_war_cnt > 0)) { - /* We are possibly doing ack war prevention */ - uint32_t cts; - - /* - * We use a msec tick here which gives us - * roughly 49 days. We don't need the - * precision of a microsecond timestamp which - * would only give us hours. - */ - cts = tcp_ts_getticks(); - if (TSTMP_LT((*ts), cts)) { - /* Timestamp is in the past */ - *cnt = 0; - *ts = (cts + tcp_ack_war_time_window); + sbintime_t now; + + if ((V_tcp_ack_war_time_window > 0) && (V_tcp_ack_war_cnt > 0)) { + now = getsbinuptime(); + if (tp->t_challenge_ack_end < now) { + tp->t_challenge_ack_cnt = 0; + tp->t_challenge_ack_end = now + + V_tcp_ack_war_time_window * SBT_1MS; } - if (*cnt < tcp_ack_war_cnt) { - *cnt = (*cnt + 1); + if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) { + tp->t_challenge_ack_cnt++; tp->t_flags |= TF_ACKNOW; } else tp->t_flags &= ~TF_ACKNOW; @@ -568,10 +544,9 @@ ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt) * TCB is still valid and locked. 
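The rewritten ctf_ack_war_checks() above counts challenge ACKs against a per-connection window kept in sbintime_t. A simplified user-space model of the same allow/deny decision, using a millisecond clock and invented types rather than the kernel's, looks roughly like this:

#include <stdint.h>

struct ack_war_state {
        uint64_t window_end_ms;
        uint32_t cnt;
};

/*
 * Allow at most "limit" challenge ACKs per "window_ms" milliseconds,
 * resetting the count when the window expires. A disabled limiter
 * (window or limit of zero) is treated here as always-allow.
 */
static int
challenge_ack_allowed(struct ack_war_state *st, uint64_t now_ms,
    uint32_t window_ms, uint32_t limit)
{
        if (window_ms == 0 || limit == 0)
                return (1);
        if (st->window_end_ms < now_ms) {
                st->cnt = 0;
                st->window_end_ms = now_ms + window_ms;
        }
        if (st->cnt < limit) {
                st->cnt++;
                return (1);
        }
        return (0);
}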
*/ int -_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t *tlenp, - int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val, - uint32_t *ts, uint32_t *cnt) +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t *tlenp, + int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val) { int32_t todrop; int32_t thflags; @@ -605,7 +580,7 @@ _ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ - ctf_ack_war_checks(tp, ts, cnt); + ctf_ack_war_checks(tp); todrop = tlen; KMOD_TCPSTAT_INC(tcps_rcvduppack); KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop); @@ -621,7 +596,7 @@ _ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, * ACK now, as the next in-sequence segment * will clear the DSACK block again */ - ctf_ack_war_checks(tp, ts, cnt); + ctf_ack_war_checks(tp); if (tp->t_flags & TF_ACKNOW) tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); @@ -653,10 +628,10 @@ _ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, * ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { - ctf_ack_war_checks(tp, ts, cnt); + ctf_ack_war_checks(tp); KMOD_TCPSTAT_INC(tcps_rcvwinprobe); } else { - __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt); + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else @@ -677,7 +652,7 @@ _ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, * and valid. */ void -__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt) +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence @@ -697,11 +672,11 @@ __ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32 (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return; } else *ret_val = 0; - ctf_ack_war_checks(tp, ts, cnt); + ctf_ack_war_checks(tp); if (m) m_freem(m); } @@ -720,8 +695,8 @@ ctf_do_drop(struct mbuf *m, struct tcpcb *tp) } int -__ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, uint32_t *ts, uint32_t *cnt) +ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp) { /* * RFC5961 Section 3.2 @@ -768,40 +743,8 @@ __ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, dropped = 1; ctf_do_drop(m, tp); } else { - int send_challenge; - KMOD_TCPSTAT_INC(tcps_badrst); - if ((ts != NULL) && (cnt != NULL) && - (tcp_ack_war_time_window > 0) && - (tcp_ack_war_cnt > 0)) { - /* We are possibly preventing an ack-rst war prevention */ - uint32_t cts; - - /* - * We use a msec tick here which gives us - * roughly 49 days. We don't need the - * precision of a microsecond timestamp which - * would only give us hours. - */ - cts = tcp_ts_getticks(); - if (TSTMP_LT((*ts), cts)) { - /* Timestamp is in the past */ - *cnt = 0; - *ts = (cts + tcp_ack_war_time_window); - } - if (*cnt < tcp_ack_war_cnt) { - *cnt = (*cnt + 1); - send_challenge = 1; - } else - send_challenge = 0; - } else - send_challenge = 1; - if (send_challenge) { - /* Send challenge ACK. 
*/ - tcp_respond(tp, mtod(m, void *), th, m, - tp->rcv_nxt, tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; - } + tcp_send_challenge_ack(tp, th, m); } } else { m_freem(m); diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index 9e5fbe675a3a..6a8a056d89b0 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -89,19 +89,15 @@ int ctf_do_queued_segments(struct tcpcb *tp, int have_pkt); uint32_t ctf_outstanding(struct tcpcb *tp); uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); int -_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t *tlenp, - int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val, - uint32_t *ts, uint32_t *cnt); -void ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt); -#define ctf_drop_checks(a, b, c, d, e, f, g, h) _ctf_drop_checks(a, b, c, d, e, f, g, h, NULL, NULL) + int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val); +void ctf_ack_war_checks(struct tcpcb *tp); void -__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, - int32_t *ret_val, uint32_t *ts, uint32_t *cnt); - -#define ctf_do_dropafterack(a, b, c, d, e, f) __ctf_do_dropafterack(a, b, c, d, e, f, NULL, NULL) + int32_t *ret_val); void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, @@ -110,9 +106,8 @@ void ctf_do_drop(struct mbuf *m, struct tcpcb *tp); int -__ctf_process_rst(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, uint32_t *ts, uint32_t *cnt); -#define ctf_process_rst(m, t, s, p) __ctf_process_rst(m, t, s, p, NULL, NULL) +ctf_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp); void ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c index 09e90da88895..b0e300847c4a 100644 --- a/sys/netinet/tcp_stacks/rack_pcm.c +++ b/sys/netinet/tcp_stacks/rack_pcm.c @@ -241,7 +241,7 @@ skip_ack_accounting: for (i=0; i<rack->r_ctl.pcm_i.cnt; i++) { e = &rack->r_ctl.pcm_s[i]; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.flex8 = 1; diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c index e82fcee2ffac..fc9ee8454a1e 100644 --- a/sys/netinet/tcp_stacks/sack_filter.c +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -35,7 +35,13 @@ #include <sys/sockopt.h> #endif #include <netinet/in.h> +#ifdef _KERNEL #include <netinet/in_pcb.h> +#else +struct inpcb { + uint32_t stuff; +}; +#endif #include <netinet/tcp.h> #include <netinet/tcp_var.h> #include <netinet/tcp_seq.h> @@ -86,9 +92,9 @@ uint64_t cnt_used_oldsack = 0; int highest_used=0; int over_written=0; int empty_avail=0; -int no_collapse = 0; FILE *out = NULL; FILE *in = NULL; + #endif #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) @@ -96,6 +102,13 @@ FILE *in = NULL; #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) #ifndef _KERNEL + +static u_int tcp_fixed_maxseg(const struct tcpcb *tp) +{ + /* Lets pretend their are timestamps on for user space */ + return (tp->t_maxseg - 12); +} + static #endif void @@ -118,7 +131,7 @@ sack_filter_prune(struct sack_filter *sf, tcp_seq 
th_ack) /* start with the oldest */ for (i = 0; i < SACK_FILTER_BLOCKS; i++) { if (sack_blk_used(sf, i)) { - if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { + if (SEQ_GEQ(th_ack, sf->sf_blks[i].end)) { /* This block is consumed */ sf->sf_bits = sack_blk_clr(sf, i); sf->sf_used--; @@ -143,23 +156,27 @@ sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) * if part of it is on the board. */ static int32_t -is_sack_on_board(struct sack_filter *sf, struct sackblk *b) +is_sack_on_board(struct sack_filter *sf, struct sackblk *b, int32_t segmax, uint32_t snd_max) { int32_t i, cnt; + int span_cnt = 0; + uint32_t span_start, span_end; + if (SEQ_LT(b->start, sf->sf_ack)) { + /* Behind cum-ack update */ + b->start = sf->sf_ack; + } + if (SEQ_LT(b->end, sf->sf_ack)) { + /* End back behind too */ + b->end = sf->sf_ack; + } + if (b->start == b->end) { + return(1); + } + span_start = b->start; + span_end = b->end; for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { if (sack_blk_used(sf, i)) { - if (SEQ_LT(b->start, sf->sf_ack)) { - /* Behind cum-ack update */ - b->start = sf->sf_ack; - } - if (SEQ_LT(b->end, sf->sf_ack)) { - /* End back behind too */ - b->end = sf->sf_ack; - } - if (b->start == b->end) { - return(1); - } /* Jonathans Rule 1 */ if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && SEQ_GEQ(sf->sf_blks[i].end, b->end)) { @@ -184,6 +201,15 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) * board |---| * sack |---| */ + if ((b->end != snd_max) && + (span_cnt < 2) && + ((b->end - b->start) < segmax)) { + /* + * Too small for us to mess with so we + * pretend its on the board. + */ + return (1); + } goto nxt_blk; } /* Jonathans Rule 3 */ @@ -194,6 +220,16 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) * board |---| * sack |---| */ + if ((b->end != snd_max) && + (sf->sf_blks[i].end != snd_max) && + (span_cnt < 2) && + ((b->end - b->start) < segmax)) { + /* + * Too small for us to mess with so we + * pretend its on the board. + */ + return (1); + } goto nxt_blk; } if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { @@ -207,12 +243,36 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) * sack |--------------| * * up with this one (we have part of it). + * * 1) Update the board block to the new end * and * 2) Update the start of this block to my end. + * + * We only do this if the new piece is large enough. */ + if (((b->end != snd_max) || (sf->sf_blks[i].end == snd_max)) && + (span_cnt == 0) && + ((b->end - sf->sf_blks[i].end) < segmax)) { + /* + * Too small for us to mess with so we + * pretend its on the board. + */ + return (1); + } b->start = sf->sf_blks[i].end; sf->sf_blks[i].end = b->end; + if (span_cnt == 0) { + span_start = sf->sf_blks[i].start; + span_end = sf->sf_blks[i].end; + } else { + if (SEQ_LT(span_start, sf->sf_blks[i].start)) { + span_start = sf->sf_blks[i].start; + } + if (SEQ_GT(span_end, sf->sf_blks[i].end)) { + span_end = sf->sf_blks[i].end; + } + } + span_cnt++; goto nxt_blk; } if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { @@ -224,12 +284,36 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) * <or> * board |----| * sack |----------| + * * 1) Update the board block to the new start * and * 2) Update the start of this block to my end. + * + * We only do this if the new piece is large enough. */ + if (((b->end != snd_max) || (sf->sf_blks[i].end == snd_max)) && + (span_cnt == 0) && + ((sf->sf_blks[i].start - b->start) < segmax)) { + /* + * Too small for us to mess with so we + * pretend its on the board. 
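The segmax and snd_max arguments added above let the filter ignore SACK blocks smaller than one segment unless they end at snd_max, so a tail probe (for example a TLP) still gets through. A minimal illustration of just that size test, with invented names, ignoring sequence wraparound and the span tracking the real code also performs:

#include <stdint.h>

static int
sack_block_too_small(uint32_t start, uint32_t end, uint32_t segmax,
    uint32_t snd_max)
{
        if (end == snd_max)
                return (0);     /* tail block, always worth keeping */
        return ((end - start) < segmax);
}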
+ */ + return (1); + } b->end = sf->sf_blks[i].start; sf->sf_blks[i].start = b->start; + if (span_cnt == 0) { + span_start = sf->sf_blks[i].start; + span_end = sf->sf_blks[i].end; + } else { + if (SEQ_LT(span_start, sf->sf_blks[i].start)) { + span_start = sf->sf_blks[i].start; + } + if (SEQ_GT(span_end, sf->sf_blks[i].end)) { + span_end = sf->sf_blks[i].end; + } + } + span_cnt++; goto nxt_blk; } } @@ -238,46 +322,23 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) i %= SACK_FILTER_BLOCKS; } /* Did we totally consume it in pieces? */ - if (b->start != b->end) - return(0); - else - return(1); -} - -static int32_t -sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) -{ - int32_t num, i; - struct sackblk blkboard[TCP_MAX_SACK]; - /* - * An old sack has arrived. It may contain data - * we do not have. We might not have it since - * we could have had a lost ack <or> we might have the - * entire thing on our current board. We want to prune - * off anything we have. With this function though we - * won't add to the board. - */ - for( i = 0, num = 0; i<numblks; i++ ) { - if (is_sack_on_board(sf, &in[i])) { -#ifndef _KERNEL - cnt_skipped_oldsack++; -#endif - continue; + if (b->start != b->end) { + if ((b->end != snd_max) && + ((b->end - b->start) < segmax) && + ((span_end - span_start) < segmax)) { + /* + * Too small for us to mess with so we + * pretend its on the board. + */ + return (1); } - /* Did not find it (or found only - * a piece of it). Copy it to - * our outgoing board. + return(0); + } else { + /* + * It was all consumed by the board. */ - memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); -#ifndef _KERNEL - cnt_used_oldsack++; -#endif - num++; - } - if (num) { - memcpy(in, blkboard, (num * sizeof(struct sackblk))); + return(1); } - return (num); } /* @@ -303,54 +364,53 @@ sack_move_to_empty(struct sack_filter *sf, uint32_t idx) } static int32_t -sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +sack_filter_run(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack, int32_t segmax, uint32_t snd_max) { struct sackblk blkboard[TCP_MAX_SACK]; - int32_t num, i; + int32_t num, i, room, at; /* * First lets trim the old and possibly * throw any away we have. */ for(i=0, num=0; i<numblks; i++) { - if (is_sack_on_board(sf, &in[i])) + if (is_sack_on_board(sf, &in[i], segmax, snd_max)) continue; memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); num++; } - if (num == 0) + if (num == 0) { return(num); + } - /* Now what we are left with is either - * completely merged on to the board - * from the above steps, or is new - * and need to be added to the board - * with the last one updated to current. - * - * First copy it out, we want to return that - * to our caller for processing. + /* + * Calculate the space we have in the filter table. */ - memcpy(in, blkboard, (num * sizeof(struct sackblk))); - numblks = num; - /* Now go through and add to our board as needed */ - for(i=(num-1); i>=0; i--) { - if (is_sack_on_board(sf, &blkboard[i])) { - continue; + room = SACK_FILTER_BLOCKS - sf->sf_used; + if (room < 1) + return (0); + /* + * Now lets walk through our filtered blkboard (the previous loop + * trimmed off anything on the board we already have so anything + * in blkboard is unique and not seen before) if there is room we copy + * it back out and place a new entry on our board. 
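sack_filter_run() above works in two passes: discard incoming blocks the board already covers, then admit the survivors only while the filter table has free slots. A toy sketch of that shape, with invented types and a caller-supplied coverage test, not the kernel routine:

#include <stdint.h>

#define TOY_MAX_SACK    4       /* stands in for TCP_MAX_SACK */

struct toy_blk {
        uint32_t start;
        uint32_t end;
};

/*
 * Pass 1: keep only blocks not already covered by the board.
 * Pass 2: copy the survivors back out, but no more than "room" of them,
 * where room is how many free entries the filter table still has.
 * Returns the number of blocks the caller should go on to process.
 */
static int
filter_then_admit(struct toy_blk *in, int numblks, int room,
    int (*covered)(const struct toy_blk *))
{
        struct toy_blk fresh[TOY_MAX_SACK];
        int i, num, at;

        for (i = 0, num = 0; i < numblks && num < TOY_MAX_SACK; i++) {
                if (covered(&in[i]))
                        continue;
                fresh[num++] = in[i];
        }
        if (num == 0 || room < 1)
                return (0);
        for (i = 0, at = 0; i < num && room > 0; i++, room--)
                in[at++] = fresh[i];
        return (at);
}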
+ */ + for(i=0, at=0; i<num; i++) { + if (room == 0) { + /* Can't copy out any more, no more room */ + break; } - /* Add this guy its not listed */ + /* Copy it out to the outbound */ + memcpy(&in[at], &blkboard[i], sizeof(struct sackblk)); + at++; + room--; + /* now lets add it to our sack-board */ sf->sf_cur++; sf->sf_cur %= SACK_FILTER_BLOCKS; if ((sack_blk_used(sf, sf->sf_cur)) && (sf->sf_used < SACK_FILTER_BLOCKS)) { sack_move_to_empty(sf, sf->sf_cur); } -#ifndef _KERNEL - if (sack_blk_used(sf, sf->sf_cur)) { - over_written++; - if (sf->sf_used < SACK_FILTER_BLOCKS) - empty_avail++; - } -#endif - memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + memcpy(&sf->sf_blks[sf->sf_cur], &blkboard[i], sizeof(struct sackblk)); if (sack_blk_used(sf, sf->sf_cur) == 0) { sf->sf_used++; #ifndef _KERNEL @@ -360,7 +420,26 @@ sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq sf->sf_bits = sack_blk_set(sf, sf->sf_cur); } } - return(numblks); + return(at); +} + +/* + * Collapse entry src into entry into + * and free up the src entry afterwards. + */ +static void +sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) +{ + if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { + /* src has a lower starting point */ + sf->sf_blks[into].start = sf->sf_blks[src].start; + } + if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { + /* src has a higher ending point */ + sf->sf_blks[into].end = sf->sf_blks[src].end; + } + sf->sf_bits = sack_blk_clr(sf, src); + sf->sf_used--; } /* @@ -415,25 +494,6 @@ sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t return (-1); } -/* - * Collapse entry src into entry into - * and free up the src entry afterwards. - */ -static void -sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) -{ - if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { - /* src has a lower starting point */ - sf->sf_blks[into].start = sf->sf_blks[src].start; - } - if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { - /* src has a higher ending point */ - sf->sf_blks[into].end = sf->sf_blks[src].end; - } - sf->sf_bits = sack_blk_clr(sf, src); - sf->sf_used--; -} - static void sack_board_collapse(struct sack_filter *sf) { @@ -485,9 +545,12 @@ sack_filter_dump(FILE *out, struct sack_filter *sf) for(i=0; i<SACK_FILTER_BLOCKS; i++) { if (sack_blk_used(sf, i)) { - fprintf(out, "Entry:%d start:%u end:%u\n", i, - sf->sf_blks[i].start, - sf->sf_blks[i].end); + fprintf(out, "Entry:%d start:%u end:%u the block is %s\n", + i, + sf->sf_blks[i].start, + sf->sf_blks[i].end, + (sack_blk_used(sf, i) ? 
"USED" : "NOT-USED") + ); } } } @@ -497,10 +560,11 @@ sack_filter_dump(FILE *out, struct sack_filter *sf) static #endif int -sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, +sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) { int32_t i, ret; + int32_t segmax; if (numblks > TCP_MAX_SACK) { #ifdef _KERNEL @@ -510,14 +574,10 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, #endif return(numblks); } -#ifndef _KERNEL - if ((sf->sf_used > 1) && (no_collapse == 0)) - sack_board_collapse(sf); - -#else if (sf->sf_used > 1) sack_board_collapse(sf); -#endif + + segmax = tcp_fixed_maxseg(tp); if ((sf->sf_used == 0) && numblks) { /* * We are brand new add the blocks in @@ -527,7 +587,15 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, int cnt_added = 0; sf->sf_ack = th_ack; - for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { + for(i=0, sf->sf_cur=0; i<numblks; i++) { + if ((in[i].end != tp->snd_max) && + ((in[i].end - in[i].start) < segmax)) { + /* + * We do not accept blocks less than a MSS minus all + * possible options space that is not at max_seg. + */ + continue; + } memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); sf->sf_bits = sack_blk_set(sf, sf->sf_cur); sf->sf_cur++; @@ -548,11 +616,9 @@ sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, sack_filter_prune(sf, th_ack); } if (numblks) { - if (SEQ_GEQ(th_ack, sf->sf_ack)) { - ret = sack_filter_new(sf, in, numblks, th_ack); - } else { - ret = sack_filter_old(sf, in, numblks); - } + ret = sack_filter_run(sf, in, numblks, th_ack, segmax, tp->snd_max); + if (sf->sf_used > 1) + sack_board_collapse(sf); } else ret = 0; return (ret); @@ -625,7 +691,8 @@ main(int argc, char **argv) char buffer[512]; struct sackblk blks[TCP_MAX_SACK]; FILE *err; - tcp_seq th_ack, snd_una, snd_max = 0; + tcp_seq th_ack; + struct tcpcb tp; struct sack_filter sf; int32_t numblks,i; int snd_una_set=0; @@ -638,10 +705,13 @@ main(int argc, char **argv) in = stdin; out = stdout; - while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { + memset(&tp, 0, sizeof(tp)); + tp.t_maxseg = 1460; + + while ((i = getopt(argc, argv, "dIi:o:?hS:")) != -1) { switch (i) { - case 'n': - no_collapse = 1; + case 'S': + tp.t_maxseg = strtol(optarg, NULL, 0); break; case 'd': detailed_dump = 1; @@ -666,7 +736,7 @@ main(int argc, char **argv) default: case '?': case 'h': - fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); + fprintf(stderr, "Use %s [ -i infile -o outfile -I -S maxseg -n -d ]\n", argv[0]); return(0); break; }; @@ -679,28 +749,28 @@ main(int argc, char **argv) while (fgets(buffer, sizeof(buffer), in) != NULL) { sprintf(line_buf[line_buf_at], "%s", buffer); line_buf_at++; - if (strncmp(buffer, "QUIT", 4) == 0) { + if (strncmp(buffer, "quit", 4) == 0) { break; - } else if (strncmp(buffer, "DUMP", 4) == 0) { + } else if (strncmp(buffer, "dump", 4) == 0) { sack_filter_dump(out, &sf); - } else if (strncmp(buffer, "MAX:", 4) == 0) { - snd_max = strtoul(&buffer[4], NULL, 0); - } else if (strncmp(buffer, "COMMIT", 6) == 0) { + } else if (strncmp(buffer, "max:", 4) == 0) { + tp.snd_max = strtoul(&buffer[4], NULL, 0); + } else if (strncmp(buffer, "commit", 6) == 0) { int nn, ii; if (numblks) { uint32_t szof, tot_chg; + printf("Dumping line buffer (lines:%d)\n", line_buf_at); for(ii=0; ii<line_buf_at; ii++) { fprintf(out, "%s", line_buf[ii]); } - fprintf(out, "------------------------------------\n"); - nn = 
sack_filter_blks(&sf, blks, numblks, th_ack); + fprintf(out, "------------------------------------ call sfb() nb:%d\n", numblks); + nn = sack_filter_blks(&tp, &sf, blks, numblks, th_ack); saved += numblks - nn; tot_sack_blks += numblks; - fprintf(out, "ACK:%u\n", sf.sf_ack); for(ii=0, tot_chg=0; ii<nn; ii++) { szof = blks[ii].end - blks[ii].start; tot_chg += szof; - fprintf(out, "SACK:%u:%u [%u]\n", + fprintf(out, "sack:%u:%u [%u]\n", blks[ii].start, blks[ii].end, szof); } @@ -715,7 +785,7 @@ main(int argc, char **argv) memset(line_buf, 0, sizeof(line_buf)); line_buf_at=0; numblks = 0; - } else if (strncmp(buffer, "CHG:", 4) == 0) { + } else if (strncmp(buffer, "chg:", 4) == 0) { sack_chg = strtoul(&buffer[4], NULL, 0); if ((sack_chg != chg_remembered) && (sack_chg > chg_remembered)){ @@ -724,20 +794,21 @@ main(int argc, char **argv) ); } sack_chg = chg_remembered = 0; - } else if (strncmp(buffer, "RXT", 3) == 0) { - sack_filter_clear(&sf, snd_una); - } else if (strncmp(buffer, "ACK:", 4) == 0) { + } else if (strncmp(buffer, "rxt", 3) == 0) { + sack_filter_clear(&sf, tp.snd_una); + } else if (strncmp(buffer, "ack:", 4) == 0) { th_ack = strtoul(&buffer[4], NULL, 0); if (snd_una_set == 0) { - snd_una = th_ack; + tp.snd_una = th_ack; snd_una_set = 1; - } else if (SEQ_GT(th_ack, snd_una)) { - snd_una = th_ack; + } else if (SEQ_GT(th_ack, tp.snd_una)) { + tp.snd_una = th_ack; } - } else if (strncmp(buffer, "EXIT", 4) == 0) { - sack_filter_clear(&sf, snd_una); + sack_filter_blks(&tp, &sf, NULL, 0, th_ack); + } else if (strncmp(buffer, "exit", 4) == 0) { + sack_filter_clear(&sf, tp.snd_una); sack_chg = chg_remembered = 0; - } else if (strncmp(buffer, "SACK:", 5) == 0) { + } else if (strncmp(buffer, "sack:", 5) == 0) { char *end=NULL; uint32_t start; uint32_t endv; @@ -749,8 +820,8 @@ main(int argc, char **argv) fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); continue; } - if (SEQ_GT(endv, snd_max)) - snd_max = endv; + if (SEQ_GT(endv, tp.snd_max)) + tp.snd_max = endv; if (SEQ_LT(endv, start)) { fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); continue; @@ -762,7 +833,7 @@ main(int argc, char **argv) blks[numblks].start = start; blks[numblks].end = endv; numblks++; - } else if (strncmp(buffer, "REJ:n:n", 4) == 0) { + } else if (strncmp(buffer, "rej:n:n", 4) == 0) { struct sackblk in; char *end=NULL; @@ -772,18 +843,63 @@ main(int argc, char **argv) sack_filter_reject(&sf, &in); } else fprintf(out, "Invalid input END:A:B\n"); - } else if (strncmp(buffer, "HELP", 4) == 0) { + } else if (strncmp(buffer, "save", 4) == 0) { + FILE *io; + + io = fopen("sack_setup.bin", "w+"); + if (io != NULL) { + if (fwrite(&sf, sizeof(sf), 1, io) != 1) { + printf("Failed to write out sf data\n"); + unlink("sack_setup.bin"); + goto outwrite; + } + if (fwrite(&tp, sizeof(tp), 1, io) != 1) { + printf("Failed to write out tp data\n"); + unlink("sack_setup.bin"); + } else + printf("Save completed\n"); + outwrite: + fclose(io); + } else { + printf("failed to open sack_setup.bin for writting .. 
sorry\n"); + } + } else if (strncmp(buffer, "restore", 7) == 0) { + FILE *io; + + io = fopen("sack_setup.bin", "r"); + if (io != NULL) { + if (fread(&sf, sizeof(sf), 1, io) != 1) { + printf("Failed to read out sf data\n"); + goto outread; + } + if (fread(&tp, sizeof(tp), 1, io) != 1) { + printf("Failed to read out tp data\n"); + } else { + printf("Restore completed\n"); + sack_filter_dump(out, &sf); + } + outread: + fclose(io); + } else { + printf("can't open sack_setup.bin -- sorry no load\n"); + } + + } else if (strncmp(buffer, "help", 4) == 0) { +help: fprintf(out, "You can input:\n"); - fprintf(out, "SACK:S:E -- to define a sack block\n"); - fprintf(out, "RXT -- to clear the filter without changing the remembered\n"); - fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n"); - fprintf(out, "ACK:N -- To advance the cum-ack to N\n"); - fprintf(out, "MAX:N -- To set send-max to N\n"); - fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n"); - fprintf(out, "DUMP -- To display the current contents of the sack filter\n"); - fprintf(out, "QUIT -- To exit this program\n"); + fprintf(out, "sack:S:E -- to define a sack block\n"); + fprintf(out, "rxt -- to clear the filter without changing the remembered\n"); + fprintf(out, "save -- save current state to sack_setup.bin\n"); + fprintf(out, "restore -- restore state from sack_setup.bin\n"); + fprintf(out, "exit -- To clear the sack filter and start all fresh\n"); + fprintf(out, "ack:N -- To advance the cum-ack to N\n"); + fprintf(out, "max:N -- To set send-max to N\n"); + fprintf(out, "commit -- To apply the sack you built to the filter and dump the filter\n"); + fprintf(out, "dump -- To display the current contents of the sack filter\n"); + fprintf(out, "quit -- To exit this program\n"); } else { fprintf(out, "Command %s unknown\n", buffer); + goto help; } memset(buffer, 0, sizeof(buffer)); } diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h index fe34b1e3ca9b..b12fcf84567c 100644 --- a/sys/netinet/tcp_stacks/sack_filter.h +++ b/sys/netinet/tcp_stacks/sack_filter.h @@ -25,19 +25,84 @@ * SUCH DAMAGE. */ -/* - * Seven entry's is carefully choosen to - * fit in one cache line. We can easily - * change this to 15 (but it gets very - * little extra filtering). To change it - * to be larger than 15 would require either - * sf_bits becoming a uint32_t and then you - * could go to 31.. or change it to a full - * bitstring.. It is really doubtful you - * will get much benefit beyond 7, in testing - * there was a small amount but very very small. +/** + * + * The Sack filter is designed to do two functions, first it trys to reduce + * the processing of sacks. Consider that often times you get something like + * + * ack 1 (sack 100:200) + * ack 1 (sack 100:300) + * ack 1 (sack(100:400) + * + * You really want to process the 100:200 and then on the next sack process + * only 200:300 (the new data) and then finally on the third 300:400. The filter + * removes from your processing routines the already processed sack information so + * that after the filter completes you only have "new" sacks that you have not + * processed. This saves computation time so you do not need to worry about + * previously processed sack information. + * + * The second thing that the sack filter does is help protect against malicious + * attackers that are trying to attack any linked lists (or other data structures) + * that are used in sack processing. 
Consider an attacker sending in sacks for + every other byte of data outstanding. This could in theory drastically split + up any scoreboard you are maintaining and make you search through a very large + linked list (or other structure), eating up CPU. If you split far enough and + fracture your data structure enough, you could potentially be crippled by a malicious + peer. The filter handles this by filtering out sacks that are smaller than an MSS. + We do this because generally a packet (aka MSS) should be kept whole. The only place + we allow a smaller SACK is when the SACK touches the end of our socket buffer. This allows + TLP to still work properly and yet protects us from splitting. The filter also only allows + a set number of splits (defined in SACK_FILTER_BLOCKS). If more than that many sack locations + are being sent, we discard additional ones until the earlier holes are filled up. The maximum + the current filter supports is 15, which we have moved to since we want to be as generous as + possible in allowing for loss. However, in previous testing of the filter it was found + that there was very little benefit from moving from 7 to 15 sack points, though in + that previous set of tests we would just discard earlier information in the filter. Now + that we no longer discard information, and instead drop new sack data, we have raised + the value to the maximum, i.e. 15. To expand beyond 15, one would have to either increase + the size of sf_bits to a uint32_t, which would allow a maximum of 31 splits, or + move to a true bitstring. Doing so, however, further increases your exposure to + sack attacks: the larger the number of splits (filter blocks) that are allowed, + the larger your processing arrays, as well as the filter itself, will grow. + * + * Note that this protection does not prevent an attacker from asking for a 20 byte + * MSS; that protection must be done elsewhere during the negotiation of the connection + * and is done now by simply ignoring sacks from connections with too small an MSS, which + * prevents sack from working and thus makes the connection less efficient but protects + * the system from harm. + * + * We may actually want to consider dropping the size of the array back to 7 to further + * protect the system, which would be a more cautious approach. + * + * TCP Developer information: + * + * Using the sack filter is actually pretty simple. All you do is the normal sorting + * and sanity checks of your sacks, but then after that you call out to sack_filter_blks(), + * passing in the tcpcb, the sack-filter you are using (memory you have allocated), the + * pointer to the sackblk array, and how many sorted valid blocks there are, as well + * as what the new th_ack point is. The filter will return to you the number of + * blocks left after filtering. It will reshape the blocks based on the previous + * sacks you have received and processed. If sack_filter_blks() returns 0 then no + * new sack data is present to be processed. + * + * Whenever you reach the point of snd_una == snd_max, you should call sack_filter_clear() with + * the snd_una point. You also need to call this if you invalidate your sack array for any + * reason (such as RTOs or MTU changes or some other event that makes you think all + * data is now un-acknowledged). You can also call sack_filter_blks(tp, sf, NULL, 0, th_ack) to + * advance the cum-ack point. You can use sack_filter_blks_used(sf) to determine if you have filter blocks as + * well. 
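
As a concrete illustration of the calling pattern just described (this sketch is not part of the change: my_sf, my_parse_and_sort_sack_blocks() and my_handle_sacked_range() are hypothetical stand-ins for a stack's own state and helpers, while the sack_filter_*() calls match the prototypes declared below; blks[] is sized for the at most four SACK blocks an ACK can carry):

	struct sackblk blks[4];
	int i, nblks;

	nblks = my_parse_and_sort_sack_blocks(to, blks);
	nblks = sack_filter_blks(tp, &my_sf, blks, nblks, th_ack);
	for (i = 0; i < nblks; i++)
		my_handle_sacked_range(tp, blks[i].start, blks[i].end);
	if (tp->snd_una == tp->snd_max)
		sack_filter_clear(&my_sf, tp->snd_una);

A return of 0 from sack_filter_blks() simply means the ACK carried no new SACK information, so the scoreboard does not need to be touched.
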
Putting those two calls together, anytime the cum-ack moves forward you probably want to + * do: + * if (sack_filter_blks_used(sf)) + * sack_filter_blks(tp, sf, NULL, 0, th_ack); + * + * If for some reason you have run the sack-filter and something goes wrong (you cannot, for example, + * allocate space to split your sack-array), you can "undo" the data within the sack filter by calling + * sack_filter_reject(sf, in), passing in the list of blocks to be "removed" from the sack-filter. + * You can see an example of this use in bbr.c, though rack.c has never found it needed. + * */ -#define SACK_FILTER_BLOCKS 7 + +#define SACK_FILTER_BLOCKS 15 struct sack_filter { tcp_seq sf_ack; @@ -48,7 +113,7 @@ struct sack_filter { }; #ifdef _KERNEL void sack_filter_clear(struct sack_filter *sf, tcp_seq seq); -int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, +int sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack); void sack_filter_reject(struct sack_filter *sf, struct sackblk *in); static inline uint8_t sack_filter_blks_used(struct sack_filter *sf) diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h index f88efe3c9ef9..10ddd12bda75 100644 --- a/sys/netinet/tcp_stacks/tcp_bbr.h +++ b/sys/netinet/tcp_stacks/tcp_bbr.h @@ -347,8 +347,6 @@ struct bbr_log_sysctl_out { /* * Locking for the rack control block. * a) Locked by INP_WLOCK - * b) Locked by the hpts-mutex - * */ #define BBR_STATE_STARTUP 0x01 #define BBR_STATE_DRAIN 0x02 diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index 708b437b32ed..144b4fabf7eb 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -199,7 +199,6 @@ struct rack_opts_stats { uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_pace_rate_ca; uint64_t tcp_rack_rr; - uint64_t tcp_rack_do_detection; uint64_t tcp_rack_rrr_no_conf_rate; uint64_t tcp_initial_rate; uint64_t tcp_initial_win; @@ -328,8 +327,6 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; /* * Locking for the rack control block. * a) Locked by INP_WLOCK - * b) Locked by the hpts-mutex - * */ #define RACK_GP_HIST 4 /* How much goodput history do we maintain? 
*/ #define RETRAN_CNT_SIZE 16 @@ -436,7 +433,6 @@ struct rack_control { uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */ - uint32_t rack_avg_rec_sends; uint32_t rc_last_output_to; uint32_t rc_went_idle_time; @@ -458,16 +454,11 @@ struct rack_control { uint16_t rack_per_of_gp_rec; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ uint16_t rack_per_of_gp_probertt; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ uint32_t rc_high_rwnd; - uint32_t ack_count; - uint32_t sack_count; - uint32_t sack_noextra_move; - uint32_t sack_moved_extra; struct rack_rtt_sample rack_rs; const struct tcp_hwrate_limit_table *crte; uint32_t rc_agg_early; uint32_t rc_agg_delayed; uint32_t rc_tlp_rxt_last_time; - uint32_t rc_saved_cwnd; uint64_t rc_gp_output_ts; /* chg*/ uint64_t rc_gp_cumack_ts; /* chg*/ struct timeval act_rcv_time; @@ -489,12 +480,6 @@ struct rack_control { int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */ /* Recovery stats */ - uint64_t time_entered_recovery; - uint64_t bytes_acked_in_recovery; - /* Policer Detection */ - uint64_t last_policer_sndbytes; - uint64_t last_policer_snd_rxt_bytes; - uint64_t policer_bw; uint64_t last_sendtime; uint64_t last_gpest; @@ -507,19 +492,9 @@ struct rack_control { uint32_t gp_rnd_thresh; uint32_t ss_hi_fs; uint32_t gate_to_fs; - uint32_t policer_max_seg; - uint32_t pol_bw_comp; - uint16_t policer_rxt_threshold; - uint8_t policer_avg_threshold; - uint8_t policer_med_threshold; uint32_t pcm_max_seg; uint32_t last_pcm_round; uint32_t pcm_idle_rounds; - uint32_t current_policer_bucket; - uint32_t policer_bucket_size; - uint32_t idle_snd_una; - uint32_t ack_for_idle; - uint32_t last_amount_before_rec; uint32_t rc_gp_srtt; /* Current GP srtt */ uint32_t rc_prev_gp_srtt; /* Previous RTT */ @@ -558,22 +533,17 @@ struct rack_control { uint32_t rc_last_timeout_snduna; uint32_t last_tlp_acked_start; uint32_t last_tlp_acked_end; - uint32_t challenge_ack_ts; - uint32_t challenge_ack_cnt; uint32_t rc_min_to; /* Socket option value Lock(a) */ uint32_t rc_pkt_delay; /* Socket option value Lock(a) */ uint32_t persist_lost_ends; - uint32_t ack_during_sd; - uint32_t input_pkt; - uint32_t saved_input_pkt; - uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */ uint32_t cleared_app_ack_seq; uint32_t last_rcv_tstmp_for_rtt; uint32_t last_time_of_arm_rcv; uint32_t rto_ssthresh; - struct newreno rc_saved_beta; /* - * For newreno cc: - * rc_saved_cc are the values we have had + uint32_t rc_saved_beta; + uint32_t rc_saved_beta_ecn; /* + * For newreno cc: rc_saved_beta and + * rc_saved_beta_ecn are the values we have had * set by the user, if pacing is not happening * (i.e. its early and we have not turned on yet * or it was turned off). 
The minute pacing @@ -586,7 +556,6 @@ struct rack_control { uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE]; uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ - uint8_t policer_del_mss; /* How many mss during recovery for policer detection */ uint8_t rack_per_upper_bound_ss; uint8_t rack_per_upper_bound_ca; uint8_t cleared_app_ack; @@ -598,12 +567,9 @@ struct rack_control { uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_rate_sample_method; - uint8_t policer_alt_median; /* Alternate median for policer detection */ - uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */ uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */ uint8_t use_gp_not_last; uint8_t pacing_method; /* If pace_always, what type of pacing */ - uint8_t already_had_a_excess; }; #endif @@ -611,27 +577,6 @@ struct rack_control { #define RACK_DGP_PACING 0x01 #define RACK_REG_PACING 0x02 -/* DGP with no buffer level mitigations */ -#define DGP_LEVEL0 0 - -/* - * DGP with buffer level mitigation where BL:4 caps fillcw and BL:5 - * turns off fillcw. - */ -#define DGP_LEVEL1 1 - -/* - * DGP with buffer level mitigation where BL:3 caps fillcw and BL:4 turns off fillcw - * and BL:5 reduces by 10% - */ -#define DGP_LEVEL2 2 - -/* - * DGP with buffer level mitigation where BL:2 caps fillcw and BL:3 turns off - * fillcw BL:4 reduces by 10% and BL:5 reduces by 20% - */ -#define DGP_LEVEL3 3 - /* Hybrid pacing log defines */ #define HYBRID_LOG_NO_ROOM 0 /* No room for the clients request */ #define HYBRID_LOG_TURNED_OFF 1 /* Turned off hybrid pacing */ @@ -650,12 +595,7 @@ struct rack_control { #define HYBRID_LOG_EXTEND 14 /* We extended the end */ #define HYBRID_LOG_SENT_LOST 15 /* A closing sent/lost report */ -#define LOST_ZERO 1 /* Zero it out */ -#define LOST_ADD 2 /* Add to it */ -#define LOST_SUB 3 /* Sub from it */ - #define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */ -#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */ #define RACK_HYSTART_OFF 0 #define RACK_HYSTART_ON 1 /* hystart++ on */ @@ -672,7 +612,6 @@ struct rack_control { struct tcp_rack { /* First cache line 0x00 */ - TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */ @@ -790,8 +729,7 @@ struct tcp_rack { set_pacing_done_a_iw : 1, use_rack_rr : 1, alloc_limit_reported : 1, - sack_attack_disable : 1, - do_detection : 1, + rack_avail : 2, rc_force_max_seg : 1; uint8_t r_early : 1, r_late : 1, @@ -801,12 +739,9 @@ struct tcp_rack { r_collapse_point_valid : 1, dgp_on : 1; uint16_t rto_from_rec: 1, - avail_bit: 1, + avail_bit: 4, pcm_in_progress: 1, pcm_needed: 1, - policer_detect_on: 1, /* Are we detecting policers? 
*/ - rc_policer_detected : 1, /* We are beiing policed */ - rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */ rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */ rc_gp_rtt_set : 1, rc_gp_dyn_mul : 1, diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 312740ccf599..db415f6bdf03 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -58,7 +58,6 @@ #include <sys/refcount.h> #include <sys/mbuf.h> #include <sys/priv.h> -#include <sys/proc.h> #include <sys/sdt.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -110,9 +109,6 @@ #include <netinet/tcpip.h> #include <netinet/tcp_fastopen.h> #include <netinet/tcp_accounting.h> -#ifdef TCPPCAP -#include <netinet/tcp_pcap.h> -#endif #ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> #endif @@ -139,68 +135,14 @@ VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif -#ifdef TCP_SAD_DETECTION -/* Sack attack detection thresholds and such */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "Sack Attack detection thresholds"); -int32_t tcp_force_detection = 0; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, - CTLFLAG_RW, - &tcp_force_detection, 0, - "Do we force detection even if the INP has it off?"); -int32_t tcp_sad_limit = 10000; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, - CTLFLAG_RW, - &tcp_sad_limit, 10000, - "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); -int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, - CTLFLAG_RW, - &tcp_sack_to_ack_thresh, 700, - "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); -int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, - CTLFLAG_RW, - &tcp_sack_to_move_thresh, 600, - "Percentage of sack moves we must see above (10.1 percent is 101)"); -int32_t tcp_restoral_thresh = 450; /* 45 % (sack:2:ack -25%) (mv:ratio -15%) **/ -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, - CTLFLAG_RW, - &tcp_restoral_thresh, 450, - "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); -int32_t tcp_sad_decay_val = 800; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, - CTLFLAG_RW, - &tcp_sad_decay_val, 800, - "The decay percentage (10.1 percent equals 101 )"); -int32_t tcp_map_minimum = 500; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, - CTLFLAG_RW, - &tcp_map_minimum, 500, - "Number of Map enteries before we start detection"); -int32_t tcp_sad_pacing_interval = 2000; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, - CTLFLAG_RW, - &tcp_sad_pacing_interval, 2000, - "What is the minimum pacing interval for a classified attacker?"); - -int32_t tcp_sad_low_pps = 100; -SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, - CTLFLAG_RW, - &tcp_sad_low_pps, 100, - "What is the input pps that below which we do not decay?"); -#endif -uint32_t tcp_ack_war_time_window = 1000; +VNET_DEFINE(uint32_t, tcp_ack_war_time_window) = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, - CTLFLAG_RW, - &tcp_ack_war_time_window, 1000, - "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); -uint32_t tcp_ack_war_cnt = 5; -SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, - CTLFLAG_RW, - &tcp_ack_war_cnt, 5, - "If the tcp_stack does ack-war 
prevention how many acks can be sent in its time window?"); + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_time_window), 0, + "Time interval in ms used to limit the number (ack_war_cnt) of challenge ACKs sent per TCP connection"); +VNET_DEFINE(uint32_t, tcp_ack_war_cnt) = 5; +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_ack_war_cnt), 0, + "Maximum number of challenge ACKs sent per TCP connection during the time interval (ack_war_timewindow)"); struct rwlock tcp_function_lock; @@ -411,6 +353,7 @@ static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, .tfb_switch_failed = tcp_default_switch_failed, + .tfb_flags = TCP_FUNC_DEFAULT_OK, }; static int tcp_fb_cnt = 0; @@ -446,23 +389,25 @@ static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; - struct tcp_function_block *blk=NULL; + struct tcp_function_block *blk = NULL; + rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } - return(blk); + return (blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { - struct tcp_function_block *rblk=NULL; + struct tcp_function_block *rblk = NULL; struct tcp_function *f; + rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; @@ -485,7 +430,7 @@ find_and_ref_tcp_functions(struct tcp_function_set *fs) if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); - return(blk); + return (blk); } struct tcp_function_block * @@ -498,7 +443,7 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk) if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); - return(rblk); + return (rblk); } /* Find a matching alias for the given tcp_function_block. */ @@ -568,8 +513,7 @@ tcp_switch_back_to_default(struct tcpcb *tp) tfb = NULL; } /* Does the stack accept this connection? 
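
(The tfb_tcp_handoff_ok callback is now mandatory, as enforced by the registration check further down in this change, which is why the NULL test below can go away. As a sketch only, not taken from this change, a stack that always accepts a hand-off would supply something like:

	static int
	example_handoff_ok(struct tcpcb *tp)
	{
		return (0);
	}

where returning zero accepts the connection and any non-zero value rejects it.)
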
*/ - if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && - (*tfb->tfb_tcp_handoff_ok)(tp)) { + if (tfb != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } @@ -603,11 +547,9 @@ tcp_switch_back_to_default(struct tcpcb *tp) /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } - if (tfb->tfb_tcp_handoff_ok != NULL) { - if ((*tfb->tfb_tcp_handoff_ok) (tp)) { - /* The default stack cannot say no */ - panic("Default stack rejects a new session?"); - } + if ((*tfb->tfb_tcp_handoff_ok)(tp)) { + /* The default stack cannot say no */ + panic("Default stack rejects a new session?"); } if (tfb->tfb_tcp_fb_init != NULL && (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { @@ -702,7 +644,7 @@ out: static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { - int error=ENOENT; + int error = ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; @@ -720,7 +662,7 @@ sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) /* Check for error or no change */ if (error != 0 || req->newptr == NULL) - return(error); + return (error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); @@ -729,6 +671,10 @@ sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) error = ENOENT; goto done; } + if ((blk->tfb_flags & TCP_FUNC_DEFAULT_OK) == 0) { + error = EINVAL; + goto done; + } V_tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); @@ -1086,10 +1032,6 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr) /* We don't use the pointer */ *ptr = NULL; - KASSERT(tp->t_state < TCPS_TIME_WAIT, - ("%s: connection %p in unexpected state %d", __func__, tp, - tp->t_state)); - /* Make sure we get no interesting mbuf queuing behavior */ /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); @@ -1106,7 +1048,8 @@ tcp_default_fb_init(struct tcpcb *tp, void **ptr) if (tp->t_rxtshift == 0) tp->t_rxtcur = rexmt; else - TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, + tcp_rexmit_max); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't @@ -1225,80 +1168,83 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { - struct tcp_function *n; + struct tcp_function *f[TCP_FUNCTION_NAME_NUM_MAX]; struct tcp_function_set fs; - int error, i; + int error, i, num_registered; - KASSERT(names != NULL && *num_names > 0, - ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); + KASSERT(*num_names > 0, + ("%s: Called with non-positive length of name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); + if (*num_names > TCP_FUNCTION_NAME_NUM_MAX) { + /* Too many names. */ + *num_names = 0; + return (E2BIG); + } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || + (blk->tfb_tcp_handoff_ok == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { - /* - * These functions are required and you - * need a name. - */ + /* These functions are required and a name is needed. 
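
For reference, a minimal caller of this registration interface could look roughly as follows; this is a sketch only, not part of the change: the function block and the names are hypothetical, the wait argument (M_WAITOK or M_NOWAIT) is passed through to the allocations, and the name count may not exceed TCP_FUNCTION_NAME_NUM_MAX:

	static const char *example_names[] = { "example", "example_alias" };
	int error, num_names;

	num_names = nitems(example_names);
	error = register_tcp_functions_as_names(&example_tcp_fb, M_WAITOK,
	    example_names, &num_names);
	if (error != 0)
		printf("example stack registration failed: %d\n", error);

On failure the routine now removes every name it had already inserted, so the caller never has to clean up a partially registered set.
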
*/ *num_names = 0; return (EINVAL); } - if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { - *num_names = 0; - return (EINVAL); + for (i = 0; i < *num_names; i++) { + f[i] = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); + if (f[i] == NULL) { + while (--i >= 0) + free(f[i], M_TCPFUNCTIONS); + *num_names = 0; + return (ENOMEM); + } } + num_registered = 0; + rw_wlock(&tcp_function_lock); + if (find_tcp_fb_locked(blk, NULL) != NULL) { + /* A TCP function block can only be registered once. */ + error = EALREADY; + goto cleanup; + } + if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { + error = EINVAL; + goto cleanup; + } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { - n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); - if (n == NULL) { - error = ENOMEM; - goto cleanup; - } - n->tf_fb = blk; - (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); - rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ - rw_wunlock(&tcp_function_lock); - free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } - (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); - TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + f[i]->tf_fb = blk; + (void)strlcpy(f[i]->tf_name, names[i], sizeof(f[i]->tf_name)); + TAILQ_INSERT_TAIL(&t_functions, f[i], tf_next); tcp_fb_cnt++; - rw_wunlock(&tcp_function_lock); + num_registered++; } - return(0); + rw_wunlock(&tcp_function_lock); + return (0); cleanup: - /* - * Deregister the names we just added. Because registration failed - * for names[i], we don't need to deregister that name. - */ - *num_names = i; - rw_wlock(&tcp_function_lock); - while (--i >= 0) { - TAILQ_FOREACH(n, &t_functions, tf_next) { - if (!strncmp(n->tf_name, names[i], - TCP_FUNCTION_NAME_LEN_MAX)) { - TAILQ_REMOVE(&t_functions, n, tf_next); - tcp_fb_cnt--; - n->tf_fb = NULL; - free(n, M_TCPFUNCTIONS); - break; - } + /* Remove the entries just added. */ + for (i = 0; i < *num_names; i++) { + if (i < num_registered) { + TAILQ_REMOVE(&t_functions, f[i], tf_next); + tcp_fb_cnt--; } + f[i]->tf_fb = NULL; + free(f[i], M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); + *num_names = num_registered; return (error); } @@ -1432,7 +1378,7 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, } static void -tcp_drain(void) +tcp_drain(void *ctx __unused, int flags __unused) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); @@ -1464,13 +1410,6 @@ tcp_drain(void) #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif -#ifdef TCPPCAP - if (tcp_pcap_aggressive_free) { - /* Free the TCP PCAP queues. 
*/ - tcp_pcap_drain(&(tcpb->t_inpkts)); - tcp_pcap_drain(&(tcpb->t_outpkts)); - } -#endif } } CURVNET_RESTORE(); @@ -1512,6 +1451,8 @@ tcp_vnet_init(void *arg __unused) VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; + V_tcp_msl_local = TCPTV_MSL_LOCAL; + arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); @@ -1530,11 +1471,8 @@ tcp_init(void *arg __unused) tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; - if (tcp_rexmit_initial < 1) - tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; - if (tcp_rexmit_min < 1) - tcp_rexmit_min = 1; + tcp_rexmit_max = TCPTV_REXMTMAX; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; @@ -1549,7 +1487,6 @@ tcp_init(void *arg __unused) /* Initialize the TCP logging data. */ tcp_log_init(); #endif - arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); if (tcp_soreceive_stream) { #ifdef INET @@ -1583,9 +1520,6 @@ tcp_init(void *arg __unused) tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); tcp_dgp_failures = counter_u64_alloc(M_WAITOK); -#ifdef TCPPCAP - tcp_pcap_init(); -#endif hashsize = tcp_tcbhashsize; if (hashsize == 0) { @@ -1640,24 +1574,10 @@ SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); static void tcp_destroy(void *unused __unused) { - int n; #ifdef TCP_HHOOK int error; #endif - /* - * All our processes are gone, all our sockets should be cleaned - * up, which means, we should be past the tcp_discardcb() calls. - * Sleep to let all tcpcb timers really disappear and cleanup. - */ - for (;;) { - INP_INFO_WLOCK(&V_tcbinfo); - n = V_tcbinfo.ipi_count; - INP_INFO_WUNLOCK(&V_tcbinfo); - if (n == 0) - break; - pause("tcpdes", hz / 10); - } tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); @@ -1793,6 +1713,7 @@ tcpip_maketemplate(struct inpcb *inp) * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ + void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) @@ -2160,7 +2081,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; @@ -2226,12 +2147,53 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } /* + * Send a challenge ack (no data, no SACK option), but not more than + * V_tcp_ack_war_cnt per V_tcp_ack_war_time_window (per TCP connection). + */ +void +tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m) +{ + sbintime_t now; + bool send_challenge_ack; + + if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) { + /* ACK war protection is disabled. */ + send_challenge_ack = true; + } else { + /* Start new epoch, if the previous one is already over. */ + now = getsbinuptime(); + if (tp->t_challenge_ack_end < now) { + tp->t_challenge_ack_cnt = 0; + tp->t_challenge_ack_end = now + + V_tcp_ack_war_time_window * SBT_1MS; + } + /* + * Send a challenge ACK, if less than tcp_ack_war_cnt have been + * sent in the current epoch. 
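
For illustration only, not part of the change: with the defaults established above, V_tcp_ack_war_cnt = 5 and V_tcp_ack_war_time_window = 1000, at most five challenge ACKs are sent within any one 1000 ms epoch. The logic below is a plain fixed-window counter; a standalone sketch of the same shape, using hypothetical names and millisecond arithmetic instead of sbintime_t, would be:

	static bool
	challenge_ack_allowed(uint64_t now_ms, uint64_t *epoch_end_ms,
	    uint32_t *cnt, uint32_t limit, uint32_t window_ms)
	{
		if (*epoch_end_ms < now_ms) {
			*cnt = 0;
			*epoch_end_ms = now_ms + window_ms;
		}
		if (*cnt >= limit)
			return (false);
		(*cnt)++;
		return (true);
	}
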
+ */ + if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) { + send_challenge_ack = true; + tp->t_challenge_ack_cnt++; + } else { + send_challenge_ack = false; + } + } + if (send_challenge_ack) { + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + } +} + +/* * Create a new TCP control block, making an empty reassembly queue and hooking * it to the argument protocol control block. The `inp' parameter must have * come from the zone allocator set up by tcpcbstor declaration. + * The caller can provide a pointer to a tcpcb of the listener to inherit the + * TCP function block from the listener. */ struct tcpcb * -tcp_newtcpcb(struct inpcb *inp) +tcp_newtcpcb(struct inpcb *inp, struct tcpcb *listening_tcb) { struct tcpcb *tp = intotcpcb(inp); #ifdef INET6 @@ -2246,17 +2208,38 @@ tcp_newtcpcb(struct inpcb *inp) bzero(&tp->t_start_zero, t_zero_size); /* Initialise cc_var struct for this tcpcb. */ - tp->t_ccv.type = IPPROTO_TCP; - tp->t_ccv.ccvc.tcp = tp; + tp->t_ccv.tp = tp; rw_rlock(&tcp_function_lock); - tp->t_fb = V_tcp_func_set_ptr; + if (listening_tcb != NULL) { + INP_LOCK_ASSERT(tptoinpcb(listening_tcb)); + KASSERT(listening_tcb->t_fb != NULL, + ("tcp_newtcpcb: listening_tcb->t_fb is NULL")); + if (listening_tcb->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) { + rw_runlock(&tcp_function_lock); + return (NULL); + } + tp->t_fb = listening_tcb->t_fb; + } else { + tp->t_fb = V_tcp_func_set_ptr; + } refcount_acquire(&tp->t_fb->tfb_refcnt); + KASSERT((tp->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) == 0, + ("tcp_newtcpcb: using TFB being removed")); rw_runlock(&tcp_function_lock); - /* - * Use the current system default CC algorithm. - */ - cc_attach(tp, CC_DEFAULT_ALGO()); - + CC_LIST_RLOCK(); + if (listening_tcb != NULL) { + if (CC_ALGO(listening_tcb)->flags & CC_MODULE_BEING_REMOVED) { + CC_LIST_RUNLOCK(); + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); + return (NULL); + } + CC_ALGO(tp) = CC_ALGO(listening_tcb); + } else + CC_ALGO(tp) = CC_DEFAULT_ALGO(); + cc_refer(CC_ALGO(tp)); + CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { cc_detach(tp); @@ -2268,6 +2251,10 @@ tcp_newtcpcb(struct inpcb *inp) #ifdef TCP_HHOOK if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(&tp->t_ccv); + CC_DATA(tp) = NULL; + cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); @@ -2289,7 +2276,8 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_hpts_cpu = HPTS_CPU_NONE; tp->t_lro_cpu = HPTS_CPU_NONE; - callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + callout_init_rw(&tp->t_callout, &inp->inp_lock, + CALLOUT_TRYLOCK | CALLOUT_RETURNUNLOCKED); for (int i = 0; i < TT_N; i++) tp->t_timers[i] = SBT_MAX; @@ -2331,12 +2319,6 @@ tcp_newtcpcb(struct inpcb *inp) * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; -#ifdef TCPPCAP - /* - * Init the TCP PCAP queues. - */ - tcp_pcap_tcpcb_init(tp); -#endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. 
*/ tcp_log_tcpcbinit(tp); @@ -2344,6 +2326,13 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(&tp->t_ccv); + CC_DATA(tp) = NULL; + cc_detach(tp); +#ifdef TCP_HHOOK + khelp_destroy_osd(&tp->t_osd); +#endif refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } @@ -2406,11 +2395,6 @@ tcp_discardcb(struct tcpcb *tp) if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif -#ifdef TCPPCAP - /* Free the TCP PCAP queues. */ - tcp_pcap_drain(&(tp->t_inpkts)); - tcp_pcap_drain(&(tp->t_outpkts)); -#endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) @@ -2456,10 +2440,8 @@ tcp_discardcb(struct tcpcb *tp) * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. - * Lets also at least protect against the so being NULL - * as RW stated below. */ - if ((tp->t_rttupdated >= 4) && (so != NULL)) { + if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; uint32_t ssthresh; @@ -2469,9 +2451,6 @@ tcp_discardcb(struct tcpcb *tp) * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. - * - * XXXRW: 'so' may be NULL here, and/or socket buffer may be - * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { @@ -2494,13 +2473,13 @@ tcp_discardcb(struct tcpcb *tp) ); } else ssthresh = 0; - metrics.rmx_ssthresh = ssthresh; + metrics.hc_ssthresh = ssthresh; - metrics.rmx_rtt = tp->t_srtt; - metrics.rmx_rttvar = tp->t_rttvar; - metrics.rmx_cwnd = tp->snd_cwnd; - metrics.rmx_sendpipe = 0; - metrics.rmx_recvpipe = 0; + metrics.hc_rtt = tp->t_srtt; + metrics.hc_rttvar = tp->t_rttvar; + metrics.hc_cwnd = tp->snd_cwnd; + metrics.hc_sendpipe = 0; + metrics.hc_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } @@ -2680,6 +2659,272 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); +#define SND_TAG_STATUS_MAXLEN 128 + +#ifdef KERN_TLS + +static struct sx ktlslist_lock; +SX_SYSINIT(ktlslistlock, &ktlslist_lock, "ktlslist"); +static uint64_t ktls_glob_gen = 1; + +static int +tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) +{ + struct xinpgen xig; + struct inpcb *inp; + struct socket *so; + struct ktls_session *ksr, *kss; + char *buf; + struct xktls_session *xktls; + uint64_t ipi_gencnt; + size_t buflen, len, sz; + u_int cnt; + int error; + bool ek, p; + + sx_assert(&ktlslist_lock, SA_XLOCKED); + if (req->newptr != NULL) + return (EPERM); + + len = 0; + cnt = 0; + ipi_gencnt = V_tcbinfo.ipi_gencnt; + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof(xig); + xig.xig_gen = ktls_glob_gen++; + xig.xig_sogen = so_gencnt; + + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); + while ((inp = inp_next(&inpi)) != NULL) { + if (inp->inp_gencnt > ipi_gencnt || + cr_canseeinpcb(req->td->td_ucred, inp) != 0) + continue; + + so = inp->inp_socket; + if (so != NULL && so->so_gencnt <= xig.xig_sogen) { + p = false; + ek = export_keys && cr_canexport_ktlskeys( + req->td, inp); + ksr = so->so_rcv.sb_tls_info; + if (ksr != NULL) { + ksr->gen = xig.xig_gen; + p = true; + if (ek) { + sz = SIZE_T_MAX; + 
ktls_session_copy_keys(ksr, + NULL, &sz); + len += sz; + } + if (ksr->snd_tag != NULL && + ksr->snd_tag->sw->snd_tag_status_str != + NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = ksr->snd_tag->sw-> + snd_tag_status_str( + ksr->snd_tag, NULL, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; + } + } + kss = so->so_snd.sb_tls_info; + if (kss != NULL) { + kss->gen = xig.xig_gen; + p = true; + if (ek) { + sz = SIZE_T_MAX; + ktls_session_copy_keys(kss, + NULL, &sz); + len += sz; + } + if (kss->snd_tag != NULL && + kss->snd_tag->sw->snd_tag_status_str != + NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw-> + snd_tag_status_str( + kss->snd_tag, NULL, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; + } + } + if (p) { + len += sizeof(*xktls); + len = roundup2(len, __alignof(struct + xktls_session)); + } + } + } + if (req->oldptr == NULL) { + len += 2 * sizeof(xig); + len += 3 * len / 4; + req->oldidx = len; + return (0); + } + + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) + return (error); + + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error != 0) + return (error); + + buflen = roundup2(sizeof(*xktls) + 2 * TLS_MAX_PARAM_SIZE + + 2 * SND_TAG_STATUS_MAXLEN, __alignof(struct xktls_session)); + buf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + struct inpcb_iterator inpi1 = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); + while ((inp = inp_next(&inpi1)) != NULL) { + if (inp->inp_gencnt > ipi_gencnt || + cr_canseeinpcb(req->td->td_ucred, inp) != 0) + continue; + + so = inp->inp_socket; + if (so == NULL) + continue; + + p = false; + ek = export_keys && cr_canexport_ktlskeys(req->td, inp); + ksr = so->so_rcv.sb_tls_info; + kss = so->so_snd.sb_tls_info; + xktls = (struct xktls_session *)buf; + if (ksr != NULL && ksr->gen == xig.xig_gen) { + p = true; + ktls_session_to_xktls_onedir(ksr, ek, &xktls->rcv); + } + if (kss != NULL && kss->gen == xig.xig_gen) { + p = true; + ktls_session_to_xktls_onedir(kss, ek, &xktls->snd); + } + if (!p) + continue; + + xktls->inp_gencnt = inp->inp_gencnt; + xktls->so_pcb = (kvaddr_t)inp; + memcpy(&xktls->coninf, &inp->inp_inc, sizeof(xktls->coninf)); + len = sizeof(*xktls); + if (ksr != NULL && ksr->gen == xig.xig_gen) { + if (ek) { + sz = buflen - len; + ktls_session_copy_keys(ksr, buf + len, &sz); + len += sz; + } else { + xktls->rcv.cipher_key_len = 0; + xktls->rcv.auth_key_len = 0; + } + if (ksr->snd_tag != NULL && + ksr->snd_tag->sw->snd_tag_status_str != NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = ksr->snd_tag->sw->snd_tag_status_str( + ksr->snd_tag, buf + len, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->rcv.drv_st_len = sz; + len += sz; + } + } + } + if (kss != NULL && kss->gen == xig.xig_gen) { + if (ek) { + sz = buflen - len; + ktls_session_copy_keys(kss, buf + len, &sz); + len += sz; + } else { + xktls->snd.cipher_key_len = 0; + xktls->snd.auth_key_len = 0; + } + if (kss->snd_tag != NULL && + kss->snd_tag->sw->snd_tag_status_str != NULL) { + sz = SND_TAG_STATUS_MAXLEN; + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw->snd_tag_status_str( + kss->snd_tag, buf + len, &sz); + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->snd.drv_st_len = sz; + len += sz; + } + } + } + len = roundup2(len, __alignof(*xktls)); + xktls->tsz = len; + xktls->fsz = sizeof(*xktls); + + 
error = SYSCTL_OUT(req, xktls, len); + if (error != 0) { + INP_RUNLOCK(inp); + break; + } + cnt++; + } + + if (error == 0) { + xig.xig_sogen = so_gencnt; + xig.xig_count = cnt; + error = SYSCTL_OUT(req, &xig, sizeof(xig)); + } + + zfree(buf, M_TEMP); + return (error); +} + +static int +tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys) +{ + int repeats, error; + + for (repeats = 0; repeats < 100; repeats++) { + if (sx_xlock_sig(&ktlslist_lock)) + return (EINTR); + error = tcp_ktlslist_locked(oidp, arg1, arg2, req, + export_keys); + sx_xunlock(&ktlslist_lock); + if (error != EDEADLK) + break; + if (sig_intr() != 0) { + error = EINTR; + break; + } + req->oldidx = 0; + } + return (error); +} + +static int +tcp_ktlslist_nokeys(SYSCTL_HANDLER_ARGS) +{ + return (tcp_ktlslist1(oidp, arg1, arg2, req, false)); +} + +static int +tcp_ktlslist_wkeys(SYSCTL_HANDLER_ARGS) +{ + return (tcp_ktlslist1(oidp, arg1, arg2, req, true)); +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST, ktlslist, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, tcp_ktlslist_nokeys, "S,xktls_session", + "List of active kTLS sessions for TCP connections"); +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST_WKEYS, ktlslist_wkeys, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, tcp_ktlslist_wkeys, "S,xktls_session", + "List of active kTLS sessions for TCP connections with keys"); +#endif /* KERN_TLS */ + #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) @@ -2690,6 +2935,8 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) struct inpcb *inp; int error; + if (req->newptr == NULL) + return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); @@ -2732,6 +2979,8 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) int mapped = 0; #endif + if (req->newptr == NULL) + return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); @@ -3318,11 +3567,22 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); /* If the mss is larger than the socket buffer, decrease the mss. */ - if (so->so_snd.sb_hiwat < tp->t_maxseg) + if (so->so_snd.sb_hiwat < tp->t_maxseg) { tp->t_maxseg = so->so_snd.sb_hiwat; - SOCKBUF_UNLOCK(&so->so_snd); + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. 
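
For illustration (a hypothetical consumer, not shown in this change): the flag set below is meant to be honored on the input side before SACK blocks are processed, with a guard along the lines of

	if ((tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) == 0)
		my_process_sack_option(tp, &to, th_ack);

With the default net.inet.tcp.mssdflt of TCP_MSS (536 bytes), anything that forces t_maxseg below that value turns the prohibition on here, and the else branch clears it again once t_maxseg is at least that large.
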
+ */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } + } + SOCK_SENDBUF_UNLOCK(so); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; @@ -3378,6 +3638,9 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; + /* XXXKIB IFCAP2_IPSEC_OFFLOAD_TSO */ + cap->ipsec_tso = (ifp->if_capenable2 & + IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) != 0; } } } @@ -3417,6 +3680,7 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; + cap->ipsec_tso = false; /* XXXKIB */ } } } @@ -3454,8 +3718,19 @@ tcp6_use_min_mtu(struct tcpcb *tp) opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && - tp->t_maxseg > TCP6_MSS) + tp->t_maxseg > TCP6_MSS) { tp->t_maxseg = TCP6_MSS; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } + } } } #endif /* INET6 */ @@ -3507,7 +3782,6 @@ tcp_maxseg(const struct tcpcb *tp) if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } -#undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } @@ -3529,7 +3803,6 @@ tcp_fixed_maxseg(const struct tcpcb *tp) * for cc modules to figure out what the modulo of the * cwnd should be. */ -#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; @@ -3537,23 +3810,22 @@ tcp_fixed_maxseg(const struct tcpcb *tp) optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) - optlen += PAD(TCPOLEN_SIGNATURE); + optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else - optlen = PAD(TCPOLEN_MAXSEG); + optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) - optlen += PAD(TCPOLEN_WINDOW); + optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) - optlen += PAD(TCPOLEN_SIGNATURE); + optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) - optlen += PAD(TCPOLEN_SACK_PERMITTED); + optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } -#undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } @@ -4353,7 +4625,7 @@ tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = val; log.u_bbr.rttProp = req->timestamp; @@ -4408,7 +4680,7 @@ tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) uint64_t time_delta, oldest_delta; int i, oldest, oldest_set = 0, cnt_rm = 0; - for(i = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags != TCP_TRK_TRACK_FLG_USED) { /* @@ -4451,15 +4723,15 @@ tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) { - int i, ret=0; + int i, ret = 0; struct 
tcp_sendfile_track *ent; /* Clean up any old closed end requests that are now completed */ if (tp->t_tcpreq_req == 0) - return(0); + return (0); if (tp->t_tcpreq_closed == 0) - return(0); - for(i = 0; i < MAX_TCP_TRK_REQ; i++) { + return (0); + for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; /* Skip empty ones */ if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) @@ -4482,11 +4754,11 @@ int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point) { if (tp->t_tcpreq_req == 0) - return(-1); + return (-1); if (tp->t_tcpreq_closed == 0) - return(-1); + return (-1); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) - return(-1); + return (-1); if (SEQ_GEQ(ack_point, ent->end_seq)) { return (1); } @@ -4508,7 +4780,7 @@ tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *i /* none open */ return (NULL); } - for(i = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; @@ -4532,7 +4804,7 @@ tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) /* none open */ return (NULL); } - for(i = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_SEARCH, (uint64_t)seq, 0); @@ -4580,7 +4852,7 @@ tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, i (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ)); /* Check to see if this is a duplicate of one not started */ if (tp->t_tcpreq_req) { - for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0) continue; @@ -4595,20 +4867,20 @@ tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, i * a 4xx of some sort and its going to age * out, lets not duplicate it. 
*/ - return(fil); + return (fil); } } } /* Ok if there is no room at the inn we are in trouble */ if (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ) { tcp_trace_point(tp, TCP_TP_REQ_LOG_FAIL); - for(i = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0; i < MAX_TCP_TRK_REQ; i++) { tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_ALLOCFAIL, 0, 0); } return (NULL); } - for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { + for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if (fil->flags == TCP_TRK_TRACK_FLG_EMPTY) { allocated = 1; diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index c21dbbb58e31..80e6b53d10df 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -114,14 +114,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); -VNET_DEFINE_STATIC(int, functions_inherit_listen_socket_stack) = 1; -#define V_functions_inherit_listen_socket_stack \ - VNET(functions_inherit_listen_socket_stack) -SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, - CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(functions_inherit_listen_socket_stack), 0, - "Inherit listen socket's stack"); - #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif @@ -139,17 +131,18 @@ static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); -static struct syncache - *syncookie_lookup(struct in_conninfo *, struct syncache_head *, - struct syncache *, struct tcphdr *, struct tcpopt *, - struct socket *, uint16_t); +static bool syncookie_expand(struct in_conninfo *, + const struct syncache_head *, struct syncache *, + struct tcphdr *, struct tcpopt *, struct socket *, + uint16_t); static void syncache_pause(struct in_conninfo *); static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS -static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso, uint16_t port); +static void syncookie_cmp(struct in_conninfo *, + const struct syncache_head *, struct syncache *, + struct tcphdr *, struct tcpopt *, struct socket *, + uint16_t); #endif /* @@ -215,7 +208,7 @@ sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_syncache.rexmt_limit), 0, - sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI", + sysctl_net_inet_tcp_syncache_rexmtlimit_check, "IU", "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; @@ -237,7 +230,7 @@ syncache_free(struct syncache *sc) { if (sc->sc_ipopts) - (void) m_free(sc->sc_ipopts); + (void)m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC @@ -450,7 +443,7 @@ syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) else TCPT_RANGESET(rexmt, tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits], - tcp_rexmit_min, TCPTV_REXMTMAX); + tcp_rexmit_min, tcp_rexmit_max); sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { @@ -535,10 +528,16 @@ syncache_timer(void *xsch) } NET_EPOCH_ENTER(et); - syncache_respond(sc, NULL, TH_SYN|TH_ACK); + if (syncache_respond(sc, NULL, TH_SYN|TH_ACK) 
== 0) { + syncache_timeout(sc, sch, 0); + TCPSTAT_INC(tcps_sndacks); + TCPSTAT_INC(tcps_sndtotal); + TCPSTAT_INC(tcps_sc_retransmitted); + } else { + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_dropped); + } NET_EPOCH_EXIT(et); - TCPSTAT_INC(tcps_sc_retransmitted); - syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, @@ -696,7 +695,13 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m, "sending challenge ACK\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); - syncache_respond(sc, m, TH_ACK); + if (syncache_respond(sc, m, TH_ACK) == 0) { + TCPSTAT_INC(tcps_sndacks); + TCPSTAT_INC(tcps_sndtotal); + } else { + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_dropped); + } } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -777,7 +782,6 @@ done: static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { - struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; @@ -802,7 +806,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) goto allocfail; } inp = sotoinpcb(so); - if ((tp = tcp_newtcpcb(inp)) == NULL) { + if ((tp = tcp_newtcpcb(inp, sototcpcb(lso))) == NULL) { in_pcbfree(inp); sodealloc(so); goto allocfail; @@ -895,7 +899,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); INP_HASH_WLOCK(&V_tcbinfo); - error = in_pcbconnect(inp, &sin, thread0.td_ucred, false); + error = in_pcbconnect(inp, &sin, thread0.td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) goto abort; @@ -912,37 +916,6 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) tp->t_port = sc->sc_port; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); - blk = sototcpcb(lso)->t_fb; - if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) { - /* - * Our parents t_fb was not the default, - * we need to release our ref on tp->t_fb and - * pickup one on the new entry. - */ - struct tcp_function_block *rblk; - void *ptr = NULL; - - rblk = find_and_ref_tcp_fb(blk); - KASSERT(rblk != NULL, - ("cannot find blk %p out of syncache?", blk)); - - if (rblk->tfb_tcp_fb_init == NULL || - (*rblk->tfb_tcp_fb_init)(tp, &ptr) == 0) { - /* Release the old stack */ - if (tp->t_fb->tfb_tcp_fb_fini != NULL) - (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); - refcount_release(&tp->t_fb->tfb_refcnt); - /* Now set in all the pointers */ - tp->t_fb = rblk; - tp->t_fb_ptr = ptr; - } else { - /* - * Initialization failed. Release the reference count on - * the looked up default stack. 
- */ - refcount_release(&rblk->tfb_refcnt); - } - } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; @@ -1053,6 +1026,7 @@ allocfail: return (NULL); abort: + tcp_discardcb(tp); in_pcbfree(inp); sodealloc(so); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { @@ -1123,6 +1097,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ if (locked && !V_tcp_syncookies) { SCH_UNLOCK(sch); + TCPSTAT_INC(tcps_sc_spurcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", @@ -1132,17 +1107,21 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (locked && !V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); + TCPSTAT_INC(tcps_sc_spurcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (no syncache entry)\n", s, __func__); goto failed; } - bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port); if (locked) SCH_UNLOCK(sch); - if (sc == NULL) { + bzero(&scs, sizeof(scs)); + if (syncookie_expand(inc, sch, &scs, th, to, *lsop, port)) { + sc = &scs; + TCPSTAT_INC(tcps_sc_recvcookie); + } else { + TCPSTAT_INC(tcps_sc_failcookie); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " @@ -1399,10 +1378,9 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, int autoflowlabel = 0; #endif #ifdef MAC - struct label *maclabel; + struct label *maclabel = NULL; #endif struct syncache scs; - struct ucred *cred; uint64_t tfo_response_cookie; unsigned int *tfo_pending = NULL; int tfo_cookie_valid = 0; @@ -1419,7 +1397,6 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); - cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { @@ -1549,7 +1526,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) - (void) m_free(sc->sc_ipopts); + (void)m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* @@ -1565,7 +1542,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ if (sc->sc_flags & SCF_ECN_MASK) { sc->sc_flags &= ~SCF_ECN_MASK; - sc->sc_flags = tcp_ecn_syncache_add(tcp_get_flags(th), iptos); + sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos); } #ifdef MAC /* @@ -1588,56 +1565,54 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); + } else { + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_dropped); } SCH_UNLOCK(sch); goto donenoprobe; } - if (tfo_cookie_valid) { - bzero(&scs, sizeof(scs)); - sc = &scs; - goto skip_alloc; - } - + KASSERT(sc == NULL, ("sc(%p) != NULL", sc)); /* * Skip allocating a syncache entry if we are just going to discard * it later. */ - if (!locked) { + if (!locked || tfo_cookie_valid) { bzero(&scs, sizeof(scs)); sc = &scs; - } else - sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); - if (sc == NULL) { - /* - * The zone allocator couldn't provide more entries. - * Treat this as if the cache was full; drop the oldest - * entry and insert the new one. 
- */ - TCPSTAT_INC(tcps_sc_zonefail); - if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) { - sch->sch_last_overflow = time_uptime; - syncache_drop(sc, sch); - syncache_pause(inc); - } + } else { sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { - if (V_tcp_syncookies) { - bzero(&scs, sizeof(scs)); - sc = &scs; - } else { - KASSERT(locked, - ("%s: bucket unexpectedly unlocked", - __func__)); - SCH_UNLOCK(sch); - if (ipopts) - (void) m_free(ipopts); - goto done; + /* + * The zone allocator couldn't provide more entries. + * Treat this as if the cache was full; drop the oldest + * entry and insert the new one. + */ + TCPSTAT_INC(tcps_sc_zonefail); + sc = TAILQ_LAST(&sch->sch_bucket, sch_head); + if (sc != NULL) { + sch->sch_last_overflow = time_uptime; + syncache_drop(sc, sch); + syncache_pause(inc); + } + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); + if (sc == NULL) { + if (V_tcp_syncookies) { + bzero(&scs, sizeof(scs)); + sc = &scs; + } else { + KASSERT(locked, + ("%s: bucket unexpectedly unlocked", + __func__)); + SCH_UNLOCK(sch); + goto done; + } } } } -skip_alloc: + KASSERT(sc != NULL, ("sc == NULL")); if (!tfo_cookie_valid && tfo_response_cookie_valid) sc->sc_tfo_cookie = &tfo_response_cookie; @@ -1647,9 +1622,21 @@ skip_alloc: #ifdef MAC sc->sc_label = maclabel; #endif - sc->sc_cred = cred; + /* + * sc_cred is only used in syncache_pcblist() to list TCP endpoints in + * TCPS_SYN_RECEIVED state when V_tcp_syncache.see_other is false. + * Therefore, store the credentials and take a reference count only + * when needed: + * - sc is allocated from the zone and not using the on stack instance. + * - the sysctl variable net.inet.tcp.syncache.see_other is false. + * The reference count is decremented when a zone allocated sc is + * freed in syncache_free(). + */ + if (sc != &scs && !V_tcp_syncache.see_other) + sc->sc_cred = crhold(so->so_cred); + else + sc->sc_cred = NULL; sc->sc_port = port; - cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ip_tos = ip_tos; @@ -1759,9 +1746,7 @@ skip_alloc: * Do a standard 3-way handshake. */ if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { - if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) - syncache_free(sc); - else if (sc != &scs) + if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); @@ -1787,12 +1772,13 @@ donenoprobe: tcp_fastopen_decrement_counter(tfo_pending); tfo_expanded: - if (cred != NULL) - crfree(cred); + if (sc == NULL || sc == &scs) { #ifdef MAC - if (sc == &scs) mac_syncache_destroy(&maclabel); #endif + if (ipopts) + (void)m_free(ipopts); + } return (rv); } @@ -2271,8 +2257,8 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) return (iss); } -static struct syncache * -syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, +static bool +syncookie_expand(struct in_conninfo *inc, const struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { @@ -2302,7 +2288,7 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) - return (NULL); + return (false); /* Fill in the syncache values. 
*/ sc->sc_flags = 0; @@ -2362,47 +2348,47 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, sc->sc_port = port; - TCPSTAT_INC(tcps_sc_recvcookie); - return (sc); + return (true); } #ifdef INVARIANTS -static int -syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, +static void +syncookie_cmp(struct in_conninfo *inc, const struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso, uint16_t port) { - struct syncache scs, *scx; + struct syncache scs; char *s; bzero(&scs, sizeof(scs)); - scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port); + if (syncookie_expand(inc, sch, &scs, th, to, lso, port) && + (sc->sc_peer_mss != scs.sc_peer_mss || + sc->sc_requested_r_scale != scs.sc_requested_r_scale || + sc->sc_requested_s_scale != scs.sc_requested_s_scale || + (sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK))) { - if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) - return (0); + if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) + return; - if (scx != NULL) { - if (sc->sc_peer_mss != scx->sc_peer_mss) + if (sc->sc_peer_mss != scs.sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", - s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); + s, __func__, sc->sc_peer_mss, scs.sc_peer_mss); - if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) + if (sc->sc_requested_r_scale != scs.sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, - scx->sc_requested_r_scale); + scs.sc_requested_r_scale); - if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) + if (sc->sc_requested_s_scale != scs.sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, - scx->sc_requested_s_scale); + scs.sc_requested_s_scale); - if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) + if ((sc->sc_flags & SCF_SACK) != (scs.sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); - } - if (s != NULL) free(s, M_TCPLOG); - return (0); + } } #endif /* INVARIANTS */ diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 9f3beebf16af..55e062e35a54 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -67,7 +67,7 @@ struct syncache { u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; u_int16_t sc_flags; -#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE) +#if defined(TCP_OFFLOAD) struct toedev *sc_tod; /* entry added by this TOE */ void *sc_todctx; /* TOE driver context */ #endif @@ -127,7 +127,9 @@ struct tcp_syncache { u_int cache_limit; u_int rexmt_limit; uint32_t hash_secret; +#ifdef VIMAGE struct vnet *vnet; +#endif struct syncookie_secret secret; struct mtx pause_mtx; struct callout pause_co; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 785f68be5621..32ce3001929c 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -74,39 +74,33 @@ #include <netinet/tcpip.h> int tcp_persmin; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT | CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT | CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int 
tcp_keepinit; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, - &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", + CTLTYPE_INT | CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, - &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + CTLTYPE_INT | CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); VNET_DEFINE(int, tcp_msl); @@ -115,21 +109,29 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); +VNET_DEFINE(int, tcp_msl_local); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl_local, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(tcp_msl_local), 0, sysctl_msec_to_ticks, "I", + "Maximum segment lifetime for local communication"); + int tcp_rexmit_initial; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", "Initial Retransmission Timeout"); int tcp_rexmit_min; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); +int tcp_rexmit_max; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_max, CTLTYPE_INT | CTLFLAG_RW, + &tcp_rexmit_max, 0, sysctl_msec_to_ticks, "I", + "Maximum Retransmission Timeout"); + int tcp_rexmit_slop; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT | CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); @@ -144,8 +146,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT | CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); @@ -162,8 +163,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, "Drop TCP options from 3rd and later retransmitted SYN"); int tcp_maxunacktime = TCPTV_MAXUNACKTIME; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, - CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, CTLTYPE_INT | CTLFLAG_RW, &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", "Maximum time (in ms) that a session can linger without making progress"); @@ -629,8 +629,7 @@ tcp_timer_rexmt(struct tcpcb *tp) rexmt = 
tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; - TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max); /* * We enter the path for PLMTUD if connection is established or, if @@ -756,6 +755,16 @@ tcp_timer_rexmt(struct tcpcb *tp) tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not + * process incoming SACK's since we are + * subject to attack in such a case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } TCPSTAT_INC(tcps_pmtud_blackhole_failed); /* * Reset the slow-start flight size as it @@ -800,7 +809,9 @@ tcp_timer_rexmt(struct tcpcb *tp) */ tp->t_rtttime = 0; - cc_cong_signal(tp, NULL, CC_RTO); + /* Do not overwrite the snd_cwnd on SYN retransmissions. */ + if (tp->t_state != TCPS_SYN_SENT) + cc_cong_signal(tp, NULL, CC_RTO); NET_EPOCH_ENTER(et); rv = tcp_output_locked(tp); NET_EPOCH_EXIT(et); @@ -864,12 +875,8 @@ tcp_timer_enter(void *xtp) struct inpcb *inp = tptoinpcb(tp); sbintime_t precision; tt_which which; - bool tp_valid; INP_WLOCK_ASSERT(inp); - MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0); - - curthread->td_pflags |= TDP_INTCPCALLOUT; which = tcp_timer_next(tp, NULL); MPASS(which < TT_N); @@ -877,8 +884,7 @@ tcp_timer_enter(void *xtp) tp->t_precisions[which] = 0; tcp_bblog_timer(tp, which, TT_PROCESSING, 0); - tp_valid = tcp_timersw[which](tp); - if (tp_valid) { + if (tcp_timersw[which](tp)) { tcp_bblog_timer(tp, which, TT_PROCESSED, 0); if ((which = tcp_timer_next(tp, &precision)) != TT_N) { MPASS(tp->t_state > TCPS_CLOSED); @@ -888,8 +894,6 @@ tcp_timer_enter(void *xtp) } INP_WUNLOCK(inp); } - - curthread->td_pflags &= ~TDP_INTCPCALLOUT; } /* @@ -939,35 +943,26 @@ tcp_timer_active(struct tcpcb *tp, tt_which which) /* * Stop all timers associated with tcpcb. - * * Called when tcpcb moves to TCPS_CLOSED. - * - * XXXGL: unfortunately our callout(9) is not able to fully stop a locked - * callout even when only two threads are involved: the callout itself and the - * thread that does callout_stop(). See where softclock_call_cc() swaps the - * callwheel lock to callout lock and then checks cc_exec_cancel(). This is - * the race window. If it happens, the tcp_timer_enter() won't be executed, - * however pcb lock will be locked and released, hence we can't free memory. - * Until callout(9) is improved, just keep retrying. In my profiling I've seen - * such event happening less than 1 time per hour with 20-30 Gbit/s of traffic. */ void tcp_timer_stop(struct tcpcb *tp) { - struct inpcb *inp = tptoinpcb(tp); - INP_WLOCK_ASSERT(inp); - - if (curthread->td_pflags & TDP_INTCPCALLOUT) { - int stopped __diagused; + INP_WLOCK_ASSERT(tptoinpcb(tp)); - stopped = callout_stop(&tp->t_callout); - MPASS(stopped == 0); - for (tt_which i = 0; i < TT_N; i++) - tp->t_timers[i] = SBT_MAX; - } else while(__predict_false(callout_stop(&tp->t_callout) == 0)) { - INP_WUNLOCK(inp); - kern_yield(PRI_UNCHANGED); - INP_WLOCK(inp); - } + /* + * We don't check return value from callout_stop(). There are two + * reasons why it can return 0. First, a legitimate one: we could have + * been called from the callout itself. Second, callout(9) has a bug. 
+ * It can race internally in softclock_call_cc(), when callout has + * already completed, but cc_exec_curr still points at the callout. + */ + (void )callout_stop(&tp->t_callout); + /* + * In case of being called from callout itself, we must make sure that + * we don't reschedule. + */ + for (tt_which i = 0; i < TT_N; i++) + tp->t_timers[i] = SBT_MAX; } diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h index a3ca268417ba..34a0f1375463 100644 --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -32,6 +32,8 @@ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ +#ifdef _KERNEL + /* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments @@ -71,21 +73,22 @@ /* * Time constants. */ -#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_MSL MSEC_2_TICKS(30000) /* max seg lifetime (hah!) */ +#define TCPTV_MSL_LOCAL MSEC_2_TICKS(10) /* max seg lifetime for local comm */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ -#define TCPTV_RTOBASE ( 1*hz) /* assumed RTO if no info */ +#define TCPTV_RTOBASE MSEC_2_TICKS(1000) /* assumed RTO if no info */ -#define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ -#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ +#define TCPTV_PERSMIN MSEC_2_TICKS(5000) /* minimum persist interval */ +#define TCPTV_PERSMAX MSEC_2_TICKS(60000) /* maximum persist interval */ -#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ -#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ -#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEP_INIT MSEC_2_TICKS(75000) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE MSEC_2_TICKS(120*60*1000) /* dflt time before probing */ +#define TCPTV_KEEPINTVL MSEC_2_TICKS(75000) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_MAXUNACKTIME 0 /* max time without making progress */ -#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ +#define TCPTV_FINWAIT2_TIMEOUT MSEC_2_TICKS(60000) /* FIN_WAIT_2 timeout if no receiver */ /* * Minimum retransmit timer is 3 ticks, for algorithmic stability. @@ -107,15 +110,13 @@ * The prior minimum of 1*hz (1 second) badly breaks throughput on any * networks faster then a modem that has minor (e.g. 1%) packet loss. */ -#define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ -#define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ -#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ - -#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ +#define TCPTV_MIN MSEC_2_TICKS(30) /* minimum allowable value */ +#define TCPTV_CPU_VAR MSEC_2_TICKS(200) /* cpu variance allowed (200ms) */ +#define TCPTV_REXMTMAX MSEC_2_TICKS(64000) /* max allowable REXMT value */ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ -#define TCPTV_DELACK ( hz/25 ) /* 40ms timeout */ +#define TCPTV_DELACK MSEC_2_TICKS(40) /* 40ms timeout */ /* * If we exceed this number of retransmits for a single segment, we'll consider @@ -135,8 +136,6 @@ (tv) = (tvmax); \ } while(0) -#ifdef _KERNEL - #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? 
(tp)->t_keepintvl : tcp_keepintvl) @@ -165,6 +164,7 @@ extern int tcp_maxunacktime; /* max time without making progress */ extern int tcp_maxpersistidle; extern int tcp_rexmit_initial; extern int tcp_rexmit_min; +extern int tcp_rexmit_max; extern int tcp_rexmit_slop; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; @@ -184,6 +184,8 @@ VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) VNET_DECLARE(int, tcp_msl); #define V_tcp_msl VNET(tcp_msl) +VNET_DECLARE(int, tcp_msl_local); +#define V_tcp_msl_local VNET(tcp_msl_local) #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 8d77db275310..c095fc8f7765 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -87,12 +87,52 @@ #include <security/mac/mac_framework.h> -VNET_DEFINE_STATIC(bool, nolocaltimewait) = true; +VNET_DEFINE_STATIC(bool, nolocaltimewait) = false; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true, + +static int +sysctl_net_inet_tcp_nolocaltimewait(SYSCTL_HANDLER_ARGS) +{ + int error; + bool new; + + new = V_nolocaltimewait; + error = sysctl_handle_bool(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + V_nolocaltimewait = new; + gone_in(16, "net.inet.tcp.nolocaltimewait is obsolete." + " Use net.inet.tcp.local_msl instead.\n"); + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, nolocaltimewait, + CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_U8, + &VNET_NAME(nolocaltimewait), 0, sysctl_net_inet_tcp_nolocaltimewait, "CU", "Do not create TCP TIME_WAIT state for local connections"); +static u_int +tcp_eff_msl(struct tcpcb *tp) +{ + struct inpcb *inp = tptoinpcb(tp); +#ifdef INET6 + bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; +#endif + + if ( +#ifdef INET6 + isipv6 ? in6_localip(&inp->in6p_faddr) : +#endif +#ifdef INET + in_localip(inp->inp_faddr)) +#else + false) +#endif + return (V_tcp_msl_local); + else + return (V_tcp_msl); +} + /* * Move a TCP connection into TIME_WAIT state. * inp is locked, and is unlocked before returning. @@ -127,7 +167,7 @@ tcp_twstart(struct tcpcb *tp) if (V_nolocaltimewait && ( #ifdef INET6 - isipv6 ? in6_localaddr(&inp->in6p_faddr) : + isipv6 ? in6_localip(&inp->in6p_faddr) : #endif #ifdef INET in_localip(inp->inp_faddr) @@ -140,7 +180,7 @@ tcp_twstart(struct tcpcb *tp) return; } - tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); + tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp)); INP_WUNLOCK(inp); } @@ -283,7 +323,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 
1 : 0); if (seq + 1 == tp->rcv_nxt) - tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); + tcp_timer_activate(tp, TT_2MSL, 2 * tcp_eff_msl(tp)); } /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index abdc2de545e9..687b0d538666 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -95,9 +95,6 @@ #include <netinet/cc/cc.h> #include <netinet/tcp_fastopen.h> #include <netinet/tcp_hpts.h> -#ifdef TCPPCAP -#include <netinet/tcp_pcap.h> -#endif #ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> #endif @@ -149,7 +146,7 @@ tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error) } /* - * TCP attaches to socket via pru_attach(), reserving space, + * TCP attaches to socket via pr_attach(), reserving space, * and an internet control block. */ static int @@ -172,19 +169,13 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) if (error) goto out; inp = sotoinpcb(so); - tp = tcp_newtcpcb(inp); + tp = tcp_newtcpcb(inp, NULL); if (tp == NULL) { error = ENOBUFS; in_pcbfree(inp); goto out; } tp->t_state = TCPS_CLOSED; - /* Can we inherit anything from the listener? */ - if ((so->so_listen != NULL) && - (so->so_listen->so_pcb != NULL) && - (tp->t_fb->tfb_inherit != NULL)) { - (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen)); - } tcp_bblog_pru(tp, PRU_ATTACH, error); INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); @@ -268,7 +259,8 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; } INP_HASH_WLOCK(&V_tcbinfo); - error = in_pcbbind(inp, sinp, td->td_ucred); + error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, + td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: tcp_bblog_pru(tp, PRU_BIND, error); @@ -336,13 +328,14 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; - error = in_pcbbind(inp, &sin, td->td_ucred); + error = in_pcbbind(inp, &sin, 0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif - error = in6_pcbbind(inp, sin6, td->td_ucred); + error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, + td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: if (error != 0) @@ -361,9 +354,10 @@ out: static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { - int error = 0; struct inpcb *inp; struct tcpcb *tp; + int error = 0; + bool already_listening; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); @@ -375,6 +369,7 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) tp = intotcpcb(inp); SOCK_LOCK(so); + already_listening = SOLISTENING(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); @@ -382,7 +377,8 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) } if (inp->inp_lport == 0) { INP_HASH_WLOCK(&V_tcbinfo); - error = in_pcbbind(inp, NULL, td->td_ucred); + error = in_pcbbind(inp, NULL, + V_tcp_bind_all_fibs ? 
0 : INPBIND_FIB, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); } if (error == 0) { @@ -396,7 +392,11 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) solisten_proto_abort(so); } SOCK_UNLOCK(so); + if (already_listening) + goto out; + if (error == 0) + in_pcblisten(inp); if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); @@ -412,10 +412,11 @@ out: static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { - int error = 0; struct inpcb *inp; struct tcpcb *tp; u_char vflagsav; + int error = 0; + bool already_listening; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); @@ -429,6 +430,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) vflagsav = inp->inp_vflag; SOCK_LOCK(so); + already_listening = SOLISTENING(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); @@ -439,7 +441,8 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; - error = in6_pcbbind(inp, NULL, td->td_ucred); + error = in6_pcbbind(inp, NULL, + V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { @@ -453,7 +456,11 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) solisten_proto_abort(so); } SOCK_UNLOCK(so); + if (already_listening) + goto out; + if (error == 0) + in_pcblisten(inp); if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); @@ -584,7 +591,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; } if (SOLISTENING(so)) { - error = EINVAL; + error = EOPNOTSUPP; goto out; } #ifdef INET @@ -691,28 +698,22 @@ tcp_usr_disconnect(struct socket *so) struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; - int error = 0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - NET_EPOCH_EXIT(et); - return (ECONNRESET); - } tp = intotcpcb(inp); if (tp->t_state == TCPS_TIME_WAIT) goto out; tcp_disconnect(tp); out: - tcp_bblog_pru(tp, PRU_DISCONNECT, error); + tcp_bblog_pru(tp, PRU_DISCONNECT, 0); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); - return (error); + return (0); } #ifdef INET @@ -906,8 +907,8 @@ out: /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other - * pru_*() routines, the mbuf chains are our responsibility. We - * must either enqueue them or free them. The other pru_* routines + * pr_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pr_*() routines * generally are caller-frees. */ static int @@ -1131,9 +1132,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? 
*/ - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); if (sbspace(&so->so_snd) < -512) { - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); error = ENOBUFS; goto out; } @@ -1148,7 +1149,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (tp->t_acktime == 0) tp->t_acktime = ticks; sbappendstream_locked(&so->so_snd, m, flags); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { /* @@ -1243,9 +1244,9 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count) } tp = intotcpcb(inp); - SOCKBUF_LOCK(&so->so_snd); + SOCK_SENDBUF_LOCK(so); error = sbready(&so->so_snd, m, count); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_SENDBUF_UNLOCK(so); if (error) { INP_WUNLOCK(inp); return (error); @@ -1418,6 +1419,7 @@ struct protosw tcp_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in_getsockaddr, @@ -1446,6 +1448,7 @@ struct protosw tcp6_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in6_mapped_sockaddr, @@ -1475,9 +1478,11 @@ tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td) (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) != 0)) return (EISCONN); + if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0)) + return (EOPNOTSUPP); INP_HASH_WLOCK(&V_tcbinfo); - error = in_pcbconnect(inp, sin, td->td_ucred, true); + error = in_pcbconnect(inp, sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error != 0) return (error); @@ -1515,8 +1520,11 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td) INP_WLOCK_ASSERT(inp); if (__predict_false((so->so_state & - (SS_ISCONNECTING | SS_ISCONNECTED)) != 0)) + (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | + SS_ISDISCONNECTED)) != 0)) return (EISCONN); + if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0)) + return (EOPNOTSUPP); INP_HASH_WLOCK(&V_tcbinfo); error = in6_pcbconnect(inp, sin6, td->td_ucred, true); @@ -1709,11 +1717,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) default: return (error); } - INP_WLOCK(inp); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } + INP_WLOCK_RECHECK(inp); } else if (sopt->sopt_name == TCP_FUNCTION_BLK) { /* * Protect the TCP option TCP_FUNCTION_BLK so @@ -1728,8 +1732,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); - INP_WLOCK(inp); - tp = intotcpcb(inp); + INP_WLOCK_RECHECK(inp); blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { @@ -1742,32 +1745,17 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) INP_WUNLOCK(inp); return (0); } - if (tp->t_state != TCPS_CLOSED) { - /* - * The user has advanced the state - * past the initial point, we may not - * be able to switch. - */ - if (blk->tfb_tcp_handoff_ok != NULL) { - /* - * Does the stack provide a - * query mechanism, if so it may - * still be possible? 
- */ - error = (*blk->tfb_tcp_handoff_ok)(tp); - } else - error = EINVAL; - if (error) { - refcount_release(&blk->tfb_refcnt); - INP_WUNLOCK(inp); - return(error); - } - } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } + error = (*blk->tfb_tcp_handoff_ok)(tp); + if (error) { + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return (error); + } /* * Ensure the new stack takes ownership with a * clean slate on peak rate threshold. @@ -1983,7 +1971,7 @@ no_mem_needed: tp = intotcpcb(inp); if (ptr != NULL) memset(ptr, 0, mem_sz); - cc_mem.ccvc.tcp = tp; + cc_mem.tp = tp; /* * We once again hold a write lock over the tcb so it's * safe to do these things without ordering concerns. @@ -2203,9 +2191,19 @@ unlock_and_done: INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && - optval + 40 >= V_tcp_minmss) + optval + 40 >= V_tcp_minmss) { tp->t_maxseg = optval; - else + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } + } else error = EINVAL; goto unlock_and_done; @@ -2348,26 +2346,6 @@ unlock_and_done: TP_MAXIDLE(tp)); goto unlock_and_done; -#ifdef TCPPCAP - case TCP_PCAP_OUT: - case TCP_PCAP_IN: - INP_WUNLOCK(inp); - error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); - if (error) - return (error); - - INP_WLOCK_RECHECK(inp); - if (optval >= 0) - tcp_pcap_set_sock_max( - (sopt->sopt_name == TCP_PCAP_OUT) ? - &(tp->t_outpkts) : &(tp->t_inpkts), - optval); - else - error = EINVAL; - goto unlock_and_done; -#endif - case TCP_FASTOPEN: { struct tcp_fastopen tfo_optval; @@ -2598,16 +2576,6 @@ unhold: INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; -#ifdef TCPPCAP - case TCP_PCAP_OUT: - case TCP_PCAP_IN: - optval = tcp_pcap_get_sock_max( - (sopt->sopt_name == TCP_PCAP_OUT) ? - &(tp->t_outpkts) : &(tp->t_inpkts)); - INP_WUNLOCK(inp); - error = sooptcopyout(sopt, &optval, sizeof optval); - break; -#endif case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); @@ -2892,6 +2860,14 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_PREVVALID", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_WAKESOR) { + db_printf("%sTF_WAKESOR", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_GPUTINPROG) { + db_printf("%sTF_GPUTINPROG", comma ? ", " : ""); + comma = 1; + } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; @@ -2912,18 +2888,10 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } - if (t_flags & TF_CONGRECOVERY) { - db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); - comma = 1; - } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } - if (t_flags & TF_WASCRECOVERY) { - db_printf("%sTF_WASCRECOVERY", comma ? ", " : ""); - comma = 1; - } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; @@ -2936,6 +2904,30 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_TOE) { + db_printf("%sTF_TOE", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_CLOSED) { + db_printf("%sTF_CLOSED", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_SENTSYN) { + db_printf("%sTF_SENTSYN", comma ? 
", " : ""); + comma = 1; + } + if (t_flags & TF_LRD) { + db_printf("%sTF_LRD", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_CONGRECOVERY) { + db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_WASCRECOVERY) { + db_printf("%sTF_WASCRECOVERY", comma ? ", " : ""); + comma = 1; + } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; @@ -2984,10 +2976,62 @@ db_print_tflags2(u_int t_flags2) db_printf("%sTF2_ACE_PERMIT", comma ? ", " : ""); comma = 1; } + if (t_flags2 & TF2_HPTS_CPU_SET) { + db_printf("%sTF2_HPTS_CPU_SET", comma ? ", " : ""); + comma = 1; + } if (t_flags2 & TF2_FBYTES_COMPLETE) { db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : ""); comma = 1; } + if (t_flags2 & TF2_ECN_USE_ECT1) { + db_printf("%sTF2_ECN_USE_ECT1", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_TCP_ACCOUNTING) { + db_printf("%sTF2_TCP_ACCOUNTING", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_HPTS_CALLS) { + db_printf("%sTF2_HPTS_CALLS", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_MBUF_L_ACKS) { + db_printf("%sTF2_MBUF_L_ACKS", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_MBUF_ACKCMP) { + db_printf("%sTF2_MBUF_ACKCMP", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_SUPPORTS_MBUFQ) { + db_printf("%sTF2_SUPPORTS_MBUFQ", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_MBUF_QUEUE_READY) { + db_printf("%sTF2_MBUF_QUEUE_READY", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_DONT_SACK_QUEUE) { + db_printf("%sTF2_DONT_SACK_QUEUE", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_CANNOT_DO_ECN) { + db_printf("%sTF2_CANNOT_DO_ECN", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_PROC_SACK_PROHIBIT) { + db_printf("%sTF2_PROC_SACK_PROHIBIT", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_IPSEC_TSO) { + db_printf("%sTF2_IPSEC_TSO", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_NO_ISS_CHECK) { + db_printf("%sTF2_NO_ISS_CHECK", comma ? 
", " : ""); + comma = 1; + } } static void @@ -3007,7 +3051,44 @@ db_print_toobflags(char t_oobflags) } static void -db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) +db_print_bblog_state(int state) +{ + switch (state) { + case TCP_LOG_STATE_RATIO_OFF: + db_printf("TCP_LOG_STATE_RATIO_OFF"); + break; + case TCP_LOG_STATE_CLEAR: + db_printf("TCP_LOG_STATE_CLEAR"); + break; + case TCP_LOG_STATE_OFF: + db_printf("TCP_LOG_STATE_OFF"); + break; + case TCP_LOG_STATE_TAIL: + db_printf("TCP_LOG_STATE_TAIL"); + break; + case TCP_LOG_STATE_HEAD: + db_printf("TCP_LOG_STATE_HEAD"); + break; + case TCP_LOG_STATE_HEAD_AUTO: + db_printf("TCP_LOG_STATE_HEAD_AUTO"); + break; + case TCP_LOG_STATE_CONTINUAL: + db_printf("TCP_LOG_STATE_CONTINUAL"); + break; + case TCP_LOG_STATE_TAIL_AUTO: + db_printf("TCP_LOG_STATE_TAIL_AUTO"); + break; + case TCP_LOG_VIA_BBPOINTS: + db_printf("TCP_LOG_STATE_BBPOINTS"); + break; + default: + db_printf("UNKNOWN(%d)", state); + break; + } +} + +static void +db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog) { db_print_indent(indent); @@ -3117,18 +3198,68 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); + + db_print_indent(indent); + db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name); + + db_print_indent(indent); + db_printf("t_cc.name: %s\n", tp->t_cc->name); + + db_print_indent(indent); + db_printf("_t_logstate: %d (", tp->_t_logstate); + db_print_bblog_state(tp->_t_logstate); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("t_lognum: %d t_loglimit: %d t_logsn: %u\n", + tp->t_lognum, tp->t_loglimit, tp->t_logsn); + + if (show_bblog) { +#ifdef TCP_BLACKBOX + db_print_bblog_entries(&tp->t_logs, indent); +#else + db_print_indent(indent); + db_printf("BBLog not supported\n"); +#endif + } } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; + bool show_bblog; if (!have_addr) { db_printf("usage: show tcpcb <addr>\n"); return; } + show_bblog = strchr(modif, 'b') != NULL; tp = (struct tcpcb *)addr; - db_print_tcpcb(tp, "tcpcb", 0); + db_print_tcpcb(tp, "tcpcb", 0, show_bblog); +} + +DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct inpcb *inp; + bool only_locked, show_bblog; + + only_locked = strchr(modif, 'l') != NULL; + show_bblog = strchr(modif, 'b') != NULL; + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) { + if (only_locked && + inp->inp_lock.rw_lock == RW_UNLOCKED) + continue; + db_print_tcpcb(intotcpcb(inp), "tcpcb", 0, show_bblog); + if (db_pager_quit) + break; + } + CURVNET_RESTORE(); + if (db_pager_quit) + break; + } } #endif diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index a339f52c2ffa..059b2aff689d 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -452,7 +452,6 @@ struct tcpcb { tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ - uint32_t t_maxpeakrate; /* max peak rate set by user, bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ @@ -465,6 +464,11 @@ struct tcpcb { /* TCP Fast Open */ uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */ uint32_t t_end_info_status; /* 
Status flag of end info */ + sbintime_t t_challenge_ack_end; /* End of the challenge ack epoch */ + uint32_t t_challenge_ack_cnt; /* Number of challenge ACKs sent in + * current epoch + */ + unsigned int *t_tfo_pending; /* TFO server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; @@ -495,10 +499,6 @@ struct tcpcb { uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; #endif -#ifdef TCPPCAP - struct mbufq t_inpkts; /* List of saved input packets. */ - struct mbufq t_outpkts; /* List of saved output packets. */ -#endif }; #endif /* _KERNEL || _WANT_TCPCB */ @@ -528,27 +528,16 @@ typedef enum { /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 -/* - * TODO: We yet need to brave plowing in - * to tcp_input() and the pru_usrreq() block. - * Right now these go to the old standards which - * are somewhat ok, but in the long term may - * need to be changed. If we do tackle tcp_input() - * then we need to get rid of the tcp_do_segment() - * function below. - */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ +#define TCP_FUNC_DEFAULT_OK 0x04 /* Can be used as default */ /** - * Adding a tfb_tcp_handoff_ok function allows the socket - * option to change stacks to query you even if the - * connection is in a later stage. You return 0 to - * say you can take over and run your stack, you return - * non-zero (an error number) to say no you can't. - * If the function is undefined you can only change - * in the early states (before connect or listen). + * tfb_tcp_handoff_ok is a mandatory function allowing + * to query a stack, if it can take over a tcpcb. + * You return 0 to say you can take over and run your stack, + * you return non-zero (an error number) to say no you can't. * * tfb_tcp_fb_init is used to allow the new stack to * setup its control block. Among the things it must @@ -637,6 +626,9 @@ struct tcp_function_block { uint8_t tfb_id; }; +/* Maximum number of names each TCP function block can be registered with. 
*/ +#define TCP_FUNCTION_NAME_NUM_MAX 8 + struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; @@ -846,6 +838,9 @@ tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack) #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ +#define TF2_PROC_SACK_PROHIBIT 0x00100000 /* Due to small MSS size do not process sack's */ +#define TF2_IPSEC_TSO 0x00200000 /* IPSEC + TSO supported */ +#define TF2_NO_ISS_CHECK 0x00400000 /* Don't check SEG.ACK against ISS */ /* * Structure to hold TCP options that are only used during segment @@ -883,13 +878,13 @@ struct tcpopt { #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ - uint32_t rmx_mtu; /* MTU for this path */ - uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ - uint32_t rmx_rtt; /* estimated round trip time */ - uint32_t rmx_rttvar; /* estimated rtt variance */ - uint32_t rmx_cwnd; /* congestion window */ - uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ - uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ + uint32_t hc_mtu; /* MTU for this path */ + uint32_t hc_ssthresh; /* outbound gateway buffer limit */ + uint32_t hc_rtt; /* estimated round trip time */ + uint32_t hc_rttvar; /* estimated rtt variance */ + uint32_t hc_cwnd; /* congestion window */ + uint32_t hc_sendpipe; /* outbound delay-bandwidth product */ + uint32_t hc_recvpipe; /* inbound delay-bandwidth product */ }; #ifndef _NETINET_IN_PCB_H_ @@ -932,9 +927,12 @@ struct in_conninfo; + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* - * TCP statistics. - * Many of these should be kept per connection, - * but that's inconvenient at the moment. + * Global (per-VNET) TCP statistics. The below structure represents what we + * export to the userland, but in the kernel we have an array of counter_u64_t + * with as many elements as there are members in the structure. The counters + * shall be increased by TCPSTAT_INC() or KMOD_TCPSTAT_INC(). Adding a new + * counter also requires adding corresponding SDT probes into in_kdtrace.h and + * into in_kdtrace.c. 
*/ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ @@ -1020,6 +1018,8 @@ struct tcpstat { uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ + uint64_t tcps_sc_spurcookie; /* SYN cookie spurious, rejected */ + uint64_t tcps_sc_failcookie; /* SYN cookie failed, rejected */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ @@ -1029,6 +1029,7 @@ struct tcpstat { /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ + uint64_t tcps_sack_rexmits_tso; /* SACK rexmit TSO chunks */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ @@ -1086,15 +1087,17 @@ struct tcpstat { uint64_t tcps_tlpresends; /* number of tlp resends */ uint64_t tcps_tlpresend_bytes; /* number of bytes resent by tlp */ + /* SEG.ACK validation failures */ + uint64_t tcps_rcvghostack; /* received ACK for data never sent */ + uint64_t tcps_rcvacktooold; /* received ACK for data too long ago */ - uint64_t _pad[4]; /* 4 TBD placeholder for STABLE */ + + uint64_t _pad[1]; /* 1 TBD placeholder for STABLE */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL -#define TI_UNLOCKED 1 -#define TI_RLOCKED 2 #include <sys/counter.h> #include <netinet/in_kdtrace.h> @@ -1236,6 +1239,9 @@ struct tcp_function_info { #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ +#define TCPCTL_KTLSLIST 17 /* connections with active ktls + session */ +#define TCPCTL_KTLSLIST_WKEYS 18 /* KTLSLIST with key data exported */ #ifdef _KERNEL #ifdef SYSCTL_DECL @@ -1254,9 +1260,12 @@ VNET_DECLARE(int, tcp_log_in_vain); VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); +VNET_DECLARE(uint32_t, tcp_ack_war_cnt); +VNET_DECLARE(uint32_t, tcp_ack_war_time_window); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); +VNET_DECLARE(int, tcp_bind_all_fibs); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); @@ -1277,6 +1286,7 @@ VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); +VNET_DECLARE(int, tcp_insecure_ack); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); @@ -1290,6 +1300,7 @@ VNET_DECLARE(int, tcp_retries); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); +VNET_DECLARE(int, tcp_sack_tso); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); @@ -1303,9 +1314,12 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) +#define V_tcp_ack_war_cnt VNET(tcp_ack_war_cnt) +#define V_tcp_ack_war_time_window VNET(tcp_ack_war_time_window) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 
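
The run of VNET_DECLARE()/V_ macros above picks up two knobs for challenge-ACK throttling, tcp_ack_war_cnt and tcp_ack_war_time_window, which pair with the t_challenge_ack_end and t_challenge_ack_cnt fields added to struct tcpcb earlier in this hunk and with the newly exported tcp_send_challenge_ack(). As a rough illustration of the windowed-budget idea those names suggest, here is a minimal standalone sketch; the function, constants, and exact policy below are invented for the example and are not the kernel's implementation.

/*
 * Illustrative only: allow at most ack_war_cnt challenge ACKs per
 * ack_war_time_window milliseconds for one connection, resetting the
 * budget when a new epoch starts.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct conn {
	uint64_t challenge_ack_end;	/* end of the current epoch, in ms */
	uint32_t challenge_ack_cnt;	/* challenge ACKs sent in this epoch */
};

static const uint32_t ack_war_cnt = 5;		  /* cf. V_tcp_ack_war_cnt */
static const uint64_t ack_war_time_window = 1000; /* ms, cf. V_tcp_ack_war_time_window */

static bool
may_send_challenge_ack(struct conn *c, uint64_t now_ms)
{
	if (now_ms >= c->challenge_ack_end) {
		/* A new epoch begins: reset the per-connection budget. */
		c->challenge_ack_end = now_ms + ack_war_time_window;
		c->challenge_ack_cnt = 0;
	}
	if (c->challenge_ack_cnt < ack_war_cnt) {
		c->challenge_ack_cnt++;
		return (true);
	}
	return (false);		/* budget exhausted, suppress the ACK */
}

int
main(void)
{
	struct conn c = { 0, 0 };

	for (uint64_t t = 0; t < 2000; t += 100)
		printf("t=%4ums: %s\n", (unsigned)t,
		    may_send_challenge_ack(&c, t) ? "send" : "suppress");
	return (0);
}

Running the sketch shows five ACKs allowed at the start of each one-second window and the rest suppressed, which is the shape of behaviour the new tcpcb fields make possible.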
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +#define V_tcp_bind_all_fibs VNET(tcp_bind_all_fibs) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) @@ -1323,6 +1337,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) +#define V_tcp_insecure_ack VNET(tcp_insecure_ack) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) @@ -1336,6 +1351,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) +#define V_tcp_sack_tso VNET(tcp_sack_tso) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) @@ -1417,19 +1433,6 @@ extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; -#ifdef TCP_SAD_DETECTION -/* Various SACK attack thresholds */ -extern int32_t tcp_force_detection; -extern int32_t tcp_sad_limit; -extern int32_t tcp_sack_to_ack_thresh; -extern int32_t tcp_sack_to_move_thresh; -extern int32_t tcp_restoral_thresh; -extern int32_t tcp_sad_decay_val; -extern int32_t tcp_sad_pacing_interval; -extern int32_t tcp_sad_low_pps; -extern int32_t tcp_map_minimum; -extern int32_t tcp_attack_on_turns_on_logging; -#endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; @@ -1442,6 +1445,7 @@ struct tcp_ifcap { u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; + bool ipsec_tso; }; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); @@ -1454,11 +1458,12 @@ void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct tcpcb * - tcp_newtcpcb(struct inpcb *); + tcp_newtcpcb(struct inpcb *, struct tcpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, uint16_t); +void tcp_send_challenge_ack(struct tcpcb *, struct tcphdr *, struct mbuf *); bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); @@ -1477,10 +1482,10 @@ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif -void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); -uint32_t tcp_hc_getmtu(struct in_conninfo *); -void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); -void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); +void tcp_hc_get(const struct in_conninfo *, struct hc_metrics_lite *); +uint32_t tcp_hc_getmtu(const struct in_conninfo *); +void tcp_hc_updatemtu(const struct in_conninfo *, uint32_t); +void tcp_hc_update(const struct in_conninfo *, struct hc_metrics_lite *); void cc_after_idle(struct tcpcb *tp); extern struct protosw tcp_protosw; /* shared for TOE */ @@ -1497,7 +1502,7 @@ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq 
rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); -void tcp_sack_adjust(struct tcpcb *tp); +int tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t, u_int *); diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 76aadad9a3b9..4203029ff7c3 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -525,7 +525,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err) /* * Temporary failure during offload, take this PCB back. * Detach from the TOE driver and do the rest of what - * TCP's pru_connect would have done if the connection + * TCP's pr_connect() would have done if the connection * wasn't offloaded. */ diff --git a/sys/netinet/toecore.h b/sys/netinet/toecore.h index 612c2fe1caf5..843b261ec162 100644 --- a/sys/netinet/toecore.h +++ b/sys/netinet/toecore.h @@ -66,7 +66,7 @@ struct toedev { void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *); /* - * This is called by the kernel during pru_rcvd for an offloaded TCP + * This is called by the kernel during pr_rcvd() for an offloaded TCP * connection and provides an opportunity for the TOE driver to manage * its rx window and credits. */ diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h index edff456ba70e..010f2210b516 100644 --- a/sys/netinet/udp.h +++ b/sys/netinet/udp.h @@ -44,7 +44,7 @@ struct udphdr { u_short uh_dport; /* destination port */ u_short uh_ulen; /* udp length */ u_short uh_sum; /* udp checksum */ -}; +} __packed; /* * User-settable options (used with setsockopt). diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 9dad79e95b04..dafbaf6dc672 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -105,6 +105,11 @@ * Per RFC 3828, July, 2004. */ +VNET_DEFINE(int, udp_bind_all_fibs) = 1; +SYSCTL_INT(_net_inet_udp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, + &VNET_NAME(udp_bind_all_fibs), 0, + "Bound sockets receive traffic from all FIBs"); + /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed @@ -359,10 +364,12 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) #endif struct inpcb *inp; struct mbuf *n; - int appends = 0; + int appends = 0, fib; MPASS(ip->ip_hl == sizeof(struct ip) >> 2); + fib = M_GETFIB(m); + while ((inp = inp_next(&inpi)) != NULL) { /* * XXXRW: Because we weren't holding either the inpcb @@ -370,6 +377,14 @@ udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) * before, we should probably recheck now that the * inpcb lock is held. */ + + if (V_udp_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum) + /* + * Sockets bound to a specific FIB can only receive + * packets from that FIB. + */ + continue; + /* * Handle socket delivery policy for any-source * and source-specific multicast. 
[RFC3678] @@ -453,7 +468,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) struct sockaddr_in udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; - int cscov_partial, iphlen; + int cscov_partial, iphlen, lookupflags; m = *mp; iphlen = *offp; @@ -544,12 +559,12 @@ udp_input(struct mbuf **mp, int *offp, int proto) char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; - bcopy(ipov, b, sizeof(b)); + memcpy(b, ipov, sizeof(b)); bzero(ipov, sizeof(ipov->ih_x1)); ipov->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); - bcopy(b, ipov, sizeof(b)); + memcpy(ipov, b, sizeof(b)); } if (uh_sum) { UDPSTAT_INC(udps_badsum); @@ -568,14 +583,18 @@ udp_input(struct mbuf **mp, int *offp, int proto) } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) + in_ifnet_broadcast(ip->ip_dst, ifp)) return (udp_multi_input(m, proto, udp_in)); pcbinfo = udp_get_inpcbinfo(proto); /* * Locate pcb for datagram. - * + */ + lookupflags = INPLOOKUP_RLOCKPCB | + (V_udp_bind_all_fibs ? 0 : INPLOOKUP_FIB); + + /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && @@ -589,7 +608,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); + ip->ip_dst, uh->uh_dport, lookupflags, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. @@ -599,8 +618,8 @@ udp_input(struct mbuf **mp, int *offp, int proto) inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : - uh->uh_dport, INPLOOKUP_WILDCARD | - INPLOOKUP_RLOCKPCB, ifp); + uh->uh_dport, INPLOOKUP_WILDCARD | lookupflags, + ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); @@ -608,7 +627,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | - INPLOOKUP_RLOCKPCB, ifp, m); + lookupflags, ifp, m); if (inp == NULL) { if (V_udp_log_in_vain) { char src[INET_ADDRSTRLEN]; @@ -825,6 +844,8 @@ udp_getcred(SYSCTL_HANDLER_ARGS) struct inpcb *inp; int error; + if (req->newptr == NULL) + return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); @@ -1021,6 +1042,8 @@ udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src, struct in6_pktinfo *pktinfo; struct in_addr ia; + NET_EPOCH_ASSERT(); + if ((flags & PRUS_IPV6) == 0) return (0); @@ -1042,18 +1065,14 @@ udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src, /* Validate the interface index if specified. */ if (pktinfo->ipi6_ifindex) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); ifp = ifnet_byindex(pktinfo->ipi6_ifindex); - NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (ENXIO); } else ifp = NULL; if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3]; - if (in_ifhasaddr(ifp, ia) == 0) + if (!in_ifhasaddr(ifp, ia)) return (EADDRNOTAVAIL); } @@ -1116,10 +1135,9 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, sin = (struct sockaddr_in *)addr; /* - * udp_send() may need to temporarily bind or connect the current - * inpcb. As such, we don't know up front whether we will need the - * pcbinfo lock or not. 
Do any work to decide what is needed up - * front before acquiring any locks. + * udp_send() may need to bind the current inpcb. As such, we don't + * know up front whether we will need the pcbinfo lock or not. Do any + * work to decide what is needed up front before acquiring any locks. * * We will need network epoch in either case, to safely lookup into * pcb hash. @@ -1243,7 +1261,7 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, } INP_HASH_WLOCK(pcbinfo); error = in_pcbbind_setup(inp, &src, &laddr.s_addr, &lport, - td->td_ucred); + V_udp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if ((flags & PRUS_IPV6) != 0) inp->inp_vflag = vflagsav; @@ -1273,66 +1291,37 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; - /* - * If a local address or port hasn't yet been selected, or if - * the destination address needs to be rewritten due to using - * a special INADDR_ constant, invoke in_pcbconnect_setup() - * to do the heavy lifting. Once a port is selected, we - * commit the binding back to the socket; we also commit the - * binding of the address if in jail. - * - * If we already have a valid binding and we're not - * requesting a destination address rewrite, use a fast path. + * sendto(2) on unconnected UDP socket results in implicit + * binding to INADDR_ANY and anonymous port. This has two + * side effects: + * 1) after first sendto(2) the socket will receive datagrams + * destined to the selected port. + * 2) subsequent sendto(2) calls will use the same source port. */ - if (inp->inp_laddr.s_addr == INADDR_ANY || - inp->inp_lport == 0 || - sin->sin_addr.s_addr == INADDR_ANY || - sin->sin_addr.s_addr == INADDR_BROADCAST) { - if ((flags & PRUS_IPV6) != 0) { - vflagsav = inp->inp_vflag; - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; - } + if (inp->inp_lport == 0) { + struct sockaddr_in wild = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + }; + INP_HASH_WLOCK(pcbinfo); - error = in_pcbconnect_setup(inp, sin, &laddr.s_addr, - &lport, &faddr.s_addr, &fport, td->td_ucred); - if ((flags & PRUS_IPV6) != 0) - inp->inp_vflag = vflagsav; - if (error) { - INP_HASH_WUNLOCK(pcbinfo); + error = in_pcbbind(inp, &wild, V_udp_bind_all_fibs ? + 0 : INPBIND_FIB, td->td_ucred); + INP_HASH_WUNLOCK(pcbinfo); + if (error) + goto release; + lport = inp->inp_lport; + laddr = inp->inp_laddr; + } + if (laddr.s_addr == INADDR_ANY) { + error = in_pcbladdr(inp, &sin->sin_addr, &laddr, + td->td_ucred); + if (error) goto release; - } - - /* - * XXXRW: Why not commit the port if the address is - * !INADDR_ANY? - */ - /* Commit the local port if newly assigned. */ - if (inp->inp_laddr.s_addr == INADDR_ANY && - inp->inp_lport == 0) { - INP_WLOCK_ASSERT(inp); - /* - * Remember addr if jailed, to prevent - * rebinding. 
- */ - if (prison_flag(td->td_ucred, PR_IP4)) - inp->inp_laddr = laddr; - inp->inp_lport = lport; - error = in_pcbinshash(inp); - INP_HASH_WUNLOCK(pcbinfo); - if (error != 0) { - inp->inp_lport = 0; - error = EAGAIN; - goto release; - } - inp->inp_flags |= INP_ANONPORT; - } else - INP_HASH_WUNLOCK(pcbinfo); - } else { - faddr = sin->sin_addr; - fport = sin->sin_port; } + faddr = sin->sin_addr; + fport = sin->sin_port; } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; @@ -1592,7 +1581,8 @@ udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); - error = in_pcbbind(inp, sinp, td->td_ucred); + error = in_pcbbind(inp, sinp, V_udp_bind_all_fibs ? 0 : INPBIND_FIB, + td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); @@ -1648,7 +1638,7 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) } NET_EPOCH_ENTER(et); INP_HASH_WLOCK(pcbinfo); - error = in_pcbconnect(inp, sin, td->td_ucred, true); + error = in_pcbconnect(inp, sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); NET_EPOCH_EXIT(et); if (error == 0) diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index 0d70bad91df4..3895f365db3c 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -120,7 +120,7 @@ struct udpcb { void *u_tun_ctx; /* Tunneling callback context. */ }; -#define intoudpcb(ip) __containerof((inp), struct udpcb, u_inpcb) +#define intoudpcb(ip) __containerof((ip), struct udpcb, u_inpcb) #define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); @@ -155,13 +155,15 @@ VNET_DECLARE(struct inpcbinfo, ulitecbinfo); extern u_long udp_sendspace; extern u_long udp_recvspace; -VNET_DECLARE(int, udp_cksum); +VNET_DECLARE(int, udp_bind_all_fibs); VNET_DECLARE(int, udp_blackhole); VNET_DECLARE(bool, udp_blackhole_local); +VNET_DECLARE(int, udp_cksum); VNET_DECLARE(int, udp_log_in_vain); -#define V_udp_cksum VNET(udp_cksum) +#define V_udp_bind_all_fibs VNET(udp_bind_all_fibs) #define V_udp_blackhole VNET(udp_blackhole) #define V_udp_blackhole_local VNET(udp_blackhole_local) +#define V_udp_cksum VNET(udp_cksum) #define V_udp_log_in_vain VNET(udp_log_in_vain) VNET_DECLARE(int, zero_checksum_port); |
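
The udp_var.h hunk also quietly corrects intoudpcb(): the old definition expanded to __containerof((inp), ...), referencing whatever token named inp happened to be visible at the call site instead of the macro's own ip parameter. The standalone sketch below, with invented structure names, shows the container-of pattern written against the parameter, which is what the fixed macro now does; it is a simplified userland analogue, not the kernel's __containerof.

/*
 * A container-of macro must compute the enclosing structure from its own
 * argument, so it works no matter what the caller's variable is called.
 */
#include <stddef.h>
#include <stdio.h>

#define	containerof(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct in_pcb_like {
	int	refcount;
};

struct udp_pcb_like {
	struct in_pcb_like u_inpcb;	/* embedded "inpcb"-style member */
	int	u_flags;
};

/* Correct form: uses the macro parameter, as the patched intoudpcb() does. */
#define	intoudpcb_like(ip)	containerof((ip), struct udp_pcb_like, u_inpcb)

int
main(void)
{
	struct udp_pcb_like up = { .u_inpcb = { .refcount = 1 }, .u_flags = 7 };
	struct in_pcb_like *some_inp = &up.u_inpcb;	/* any name works now */

	printf("u_flags via container-of: %d\n",
	    intoudpcb_like(some_inp)->u_flags);
	return (0);
}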