Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/tcp.h         |    9
-rw-r--r--  sys/netinet/tcp_input.c   |    3
-rw-r--r--  sys/netinet/tcp_log_buf.c | 2480
-rw-r--r--  sys/netinet/tcp_log_buf.h |  353
-rw-r--r--  sys/netinet/tcp_output.c  |    8
-rw-r--r--  sys/netinet/tcp_subr.c    |   79
-rw-r--r--  sys/netinet/tcp_timer.c   |    2
-rw-r--r--  sys/netinet/tcp_usrreq.c  |   85
-rw-r--r--  sys/netinet/tcp_var.h     |   24
9 files changed, 3041 insertions(+), 2 deletions(-)
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 774eb348a9b2..c2ad951a4604 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -168,6 +168,12 @@ struct tcphdr {
#define TCP_NOOPT 8 /* don't use TCP options */
#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
#define TCP_INFO 32 /* retrieve tcp_info structure */
+#define TCP_LOG 34 /* configure event logging for connection */
+#define TCP_LOGBUF 35 /* retrieve event log for connection */
+#define TCP_LOGID 36 /* configure log ID to correlate connections */
+#define TCP_LOGDUMP 37 /* dump connection log events to device */
+#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to
+ device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
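The five new option names map onto the machinery added in tcp_log_buf.c below. As a rough userspace sketch only (not part of this commit; the optval types are not spelled out in this hunk, but TCP_LOG appears to take one of the TCP_LOG_STATE_* values from tcp_log_buf.h and TCP_LOGID a NUL-terminated string of at most TCP_LOG_ID_LEN bytes):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <netinet/tcp_log_buf.h>	/* TCP_LOG_STATE_* (assumed location) */
	#include <string.h>

	static int
	enable_bblog(int fd)
	{
		int state = TCP_LOG_STATE_TAIL;
		const char *id = "example-conn-group";	/* hypothetical ID */

		if (setsockopt(fd, IPPROTO_TCP, TCP_LOG, &state,
		    sizeof(state)) == -1)
			return (-1);
		/* Tag the connection so its logs correlate with others. */
		return (setsockopt(fd, IPPROTO_TCP, TCP_LOGID, id,
		    strlen(id) + 1));
	}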
@@ -189,6 +195,9 @@ struct tcphdr {
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
+/* Maximum length of log ID. */
+#define TCP_LOG_ID_LEN 64
+
/*
* The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
* the caller to query certain information about the state of a TCP
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 40655ecc16c9..0724eaa9882d 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$");
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@@ -1592,6 +1593,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* Save segment, if requested. */
tcp_pcap_add(th, m, &(tp->t_inpkts));
#endif
+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
+ tlen, NULL, true);
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
new file mode 100644
index 000000000000..ef45ce1210f5
--- /dev/null
+++ b/sys/netinet/tcp_log_buf.c
@@ -0,0 +1,2480 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016-2018
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/counter.h>
+
+#include <dev/tcp_log/tcp_log_dev.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
+
+/* Default expiry time */
+#define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S)
+
+/* Max interval at which to run the expiry timer */
+#define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S)
+
+bool tcp_log_verbose;
+static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone;
+static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
+static uint32_t tcp_log_version = TCP_LOG_BUF_VER;
+RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
+static struct tcp_log_id_tree tcp_log_id_head;
+static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
+ STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
+static struct mtx tcp_log_expireq_mtx;
+static struct callout tcp_log_expireq_callout;
+static uint64_t tcp_log_auto_ratio = 0;
+static uint64_t tcp_log_auto_ratio_cur = 0;
+static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_HEAD_AUTO;
+static bool tcp_log_auto_all = false;
+
+RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls");
+
+SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
+ 0, "Force verbose logging for TCP traces");
+
+SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
+ CTLFLAG_RW, &tcp_log_session_limit, 0,
+ "Maximum number of events maintained for each TCP session");
+
+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
+ &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
+
+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
+ &tcp_log_zone, "Current number of events maintained for all TCP sessions");
+
+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
+ &tcp_log_bucket_zone, "Maximum number of log IDs");
+
+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
+ &tcp_log_bucket_zone, "Current number of log IDs");
+
+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
+ &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs");
+
+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
+ &tcp_log_node_zone, "Current number of tcpcbs with log IDs");
+
+SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
+ 0, "Version of log formats exported");
+
+SYSCTL_U64(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
+ &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
+
+SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
+ &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO,
+ "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_HEAD_AUTO)");
+
+SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
+ &tcp_log_auto_all, false,
+ "Auto-select from all sessions (rather than just those with IDs)");
+
+#ifdef TCPLOG_DEBUG_COUNTERS
+counter_u64_t tcp_log_queued;
+counter_u64_t tcp_log_que_fail1;
+counter_u64_t tcp_log_que_fail2;
+counter_u64_t tcp_log_que_fail3;
+counter_u64_t tcp_log_que_fail4;
+counter_u64_t tcp_log_que_fail5;
+counter_u64_t tcp_log_que_copyout;
+counter_u64_t tcp_log_que_read;
+counter_u64_t tcp_log_que_freed;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
+ &tcp_log_queued, "Number of entries queued");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
+ &tcp_log_que_fail1, "Number of entries queued but fail 1");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
+ &tcp_log_que_fail2, "Number of entries queued but fail 2");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
+ &tcp_log_que_fail3, "Number of entries queued but fail 3");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
+ &tcp_log_que_fail4, "Number of entries queued but fail 4");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
+ &tcp_log_que_fail5, "Number of entries queued but fail 4");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
+ &tcp_log_que_copyout, "Number of entries copied out");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
+ &tcp_log_que_read, "Number of entries read from the queue");
+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
+ &tcp_log_que_freed, "Number of entries freed after reading");
+#endif
+
+#ifdef INVARIANTS
+#define TCPLOG_DEBUG_RINGBUF
+#endif
+
+struct tcp_log_mem
+{
+ STAILQ_ENTRY(tcp_log_mem) tlm_queue;
+ struct tcp_log_buffer tlm_buf;
+ struct tcp_log_verbose tlm_v;
+#ifdef TCPLOG_DEBUG_RINGBUF
+ volatile int tlm_refcnt;
+#endif
+};
+
+/* 60 bytes for the header, + 16 bytes for padding */
+static uint8_t zerobuf[76];
+
+/*
+ * Lock order:
+ * 1. TCPID_TREE
+ * 2. TCPID_BUCKET
+ * 3. INP
+ *
+ * Rules:
+ * A. You need a lock on the Tree to add/remove buckets.
+ * B. You need a lock on the bucket to add/remove nodes from the bucket.
+ * C. To change information in a node, you need the INP lock if the tln_closed
+ * field is false. Otherwise, you need the bucket lock. (Note that the
+ * tln_closed field can change at any point, so you need to recheck the
+ * entry after acquiring the INP lock.)
+ * D. To remove a node from the bucket, you must have that entry locked,
+ * according to the criteria of Rule C. Also, the node must not be on
+ * the expiry queue.
+ * E. The exception to C is the expiry queue fields, which are locked by
+ * the TCPLOG_EXPIREQ lock.
+ *
+ * Buckets have a reference count. Each node is a reference. Further,
+ * other callers may add reference counts to keep a bucket from disappearing.
+ * You can add a reference as long as you own a lock sufficient to keep the
+ * bucket from disappearing. For example, a common use is:
+ * a. Have a locked INP, but need to lock the TCPID_BUCKET.
+ * b. Add a refcount on the bucket. (Safe because the INP lock prevents
+ * the TCPID_BUCKET from going away.)
+ * c. Drop the INP lock.
+ * d. Acquire a lock on the TCPID_BUCKET.
+ * e. Acquire a lock on the INP.
+ * f. Drop the refcount on the bucket.
+ * (At this point, the bucket may disappear.)
+ *
+ * Expire queue lock:
+ * You can acquire this with either the bucket or INP lock. Don't reverse it.
+ * When the expire code has committed to freeing a node, it resets the expiry
+ * time to SBT_MAX. That is the signal to everyone else that they should
+ * leave that node alone.
+ */
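An illustrative fragment of steps (a) through (f) above, written against the macros defined just below; the rechecks the real code performs after relocking (the bucket may have changed or been freed while the INP lock was dropped) are reduced to comments. See tcp_log_set_id() for the full treatment:

	static void
	bucket_ref_sketch(struct tcpcb *tp)
	{
		struct tcp_log_id_bucket *tlb = tp->t_lib;
		struct inpcb *inp = tp->t_inpcb;
		int tree_locked = TREE_UNLOCKED;

		INP_WLOCK_ASSERT(inp);		/* (a) INP locked, bucket not */
		TCPID_BUCKET_REF(tlb);		/* (b) pin the bucket */
		INP_WUNLOCK(inp);		/* (c) */
		TCPID_BUCKET_LOCK(tlb);		/* (d) bucket first... */
		INP_WLOCK(inp);			/* (e) ...then INP */
		/*
		 * (f) Drop the reference.  tcp_log_unref_bucket() (below)
		 * returns true if ours was the last reference, in which case
		 * it also took the tree write lock and freed the bucket.
		 */
		if (tcp_log_unref_bucket(tlb, &tree_locked, inp))
			tlb = NULL;		/* freed and unlocked */
		else
			TCPID_BUCKET_UNLOCK(tlb);
		/* Caller must still unwind the tree lock if it was taken. */
	}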
+static struct rwlock tcp_id_tree_lock;
+#define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock)
+#define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock)
+#define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock)
+#define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock)
+#define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock)
+#define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
+#define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
+#define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)
+
+#define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
+#define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx))
+#define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx))
+#define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx))
+#define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
+#define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)
+
+#define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt))
+#define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt))
+
+#define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx)
+#define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx)
+
+SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);
+
+struct tcp_log_id_bucket
+{
+ /*
+ * tlb_id must be first. This lets us use strcmp on
+ * (struct tcp_log_id_bucket *) and (char *) interchangeably.
+ */
+ char tlb_id[TCP_LOG_ID_LEN];
+ RB_ENTRY(tcp_log_id_bucket) tlb_rb;
+ struct tcp_log_id_head tlb_head;
+ struct mtx tlb_mtx;
+ volatile u_int tlb_refcnt;
+};
+
+struct tcp_log_id_node
+{
+ SLIST_ENTRY(tcp_log_id_node) tln_list;
+ STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
+ sbintime_t tln_expiretime; /* Locked by the expireq lock */
+
+ /*
+ * If INP is NULL, that means the connection has closed. We've
+ * saved the connection endpoint information and the log entries
+ * in the tln_ie and tln_entries members. We've also saved a pointer
+ * to the enclosing bucket here. If INP is not NULL, the information is
+ * in the PCB and not here.
+ */
+ struct inpcb *tln_inp;
+ struct tcpcb *tln_tp;
+ struct tcp_log_id_bucket *tln_bucket;
+ struct in_endpoints tln_ie;
+ struct tcp_log_stailq tln_entries;
+ int tln_count;
+ volatile int tln_closed;
+ uint8_t tln_af;
+};
+
+enum tree_lock_state {
+ TREE_UNLOCKED = 0,
+ TREE_RLOCKED,
+ TREE_WLOCKED,
+};
+
+/* Do we want to select this session for auto-logging? */
+static __inline bool
+tcp_log_selectauto(void)
+{
+
+ /*
+ * If we are doing auto-capturing, figure out whether we will capture
+ * this session.
+ */
+ if (tcp_log_auto_ratio &&
+ (atomic_fetchadd_64(&tcp_log_auto_ratio_cur, 1) %
+ tcp_log_auto_ratio) == 0)
+ return (true);
+ return (false);
+}
+
+static __inline int
+tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
+{
+ KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
+ KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
+ return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
+}
+
+RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
+
+static __inline void
+tcp_log_id_validate_tree_lock(int tree_locked)
+{
+
+#ifdef INVARIANTS
+ switch (tree_locked) {
+ case TREE_WLOCKED:
+ TCPID_TREE_WLOCK_ASSERT();
+ break;
+ case TREE_RLOCKED:
+ TCPID_TREE_RLOCK_ASSERT();
+ break;
+ case TREE_UNLOCKED:
+ TCPID_TREE_UNLOCK_ASSERT();
+ break;
+ default:
+ kassert_panic("%s:%d: unknown tree lock state", __func__,
+ __LINE__);
+ }
+#endif
+}
+
+static __inline void
+tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
+{
+
+ TCPID_TREE_WLOCK_ASSERT();
+ KASSERT(SLIST_EMPTY(&tlb->tlb_head),
+ ("%s: Attempt to remove non-empty bucket", __func__));
+ if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) {
+#ifdef INVARIANTS
+ kassert_panic("%s:%d: error removing element from tree",
+ __func__, __LINE__);
+#endif
+ }
+ TCPID_BUCKET_LOCK_DESTROY(tlb);
+ uma_zfree(tcp_log_bucket_zone, tlb);
+}
+
+/*
+ * Call with a referenced and locked bucket.
+ * Will return true if the bucket was freed; otherwise, false.
+ * tlb: The bucket to unreference.
+ * tree_locked: A pointer to the state of the tree lock. If the tree lock
+ * state changes, the function will update it.
+ * inp: If not NULL and the function needs to drop the inp lock to relock the
+ * tree, it will do so. (The caller must ensure inp will not become invalid,
+ * probably by holding a reference to it.)
+ */
+static bool
+tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
+ struct inpcb *inp)
+{
+
+ KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
+ KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
+ __func__));
+
+ tcp_log_id_validate_tree_lock(*tree_locked);
+
+ /*
+ * Did we hold the last reference on the tlb? If so, we may need
+ * to free it. (Note that we can realistically only execute the
+ * loop twice: once without a write lock and once with a write
+ * lock.)
+ */
+ while (TCPID_BUCKET_UNREF(tlb)) {
+ /*
+ * We need a write lock on the tree to free this.
+ * If we can upgrade the tree lock, this is "easy". If we
+ * can't upgrade the tree lock, we need to do this the
+ * "hard" way: unwind all our locks and relock everything.
+ * In the meantime, anything could have changed. We even
+ * need to validate that we still need to free the bucket.
+ */
+ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
+ *tree_locked = TREE_WLOCKED;
+ else if (*tree_locked != TREE_WLOCKED) {
+ TCPID_BUCKET_REF(tlb);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ TCPID_BUCKET_UNLOCK(tlb);
+ if (*tree_locked == TREE_RLOCKED)
+ TCPID_TREE_RUNLOCK();
+ TCPID_TREE_WLOCK();
+ *tree_locked = TREE_WLOCKED;
+ TCPID_BUCKET_LOCK(tlb);
+ if (inp != NULL)
+ INP_WLOCK(inp);
+ continue;
+ }
+
+ /*
+ * We have an empty bucket and a write lock on the tree.
+ * Remove the empty bucket.
+ */
+ tcp_log_remove_bucket(tlb);
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * Call with a locked bucket. This function will release the lock on the
+ * bucket before returning.
+ *
+ * The caller is responsible for freeing the tp->t_lin/tln node!
+ *
+ * Note: one of tp or both tlb and tln must be supplied.
+ *
+ * inp: A pointer to the inp. If the function needs to drop the inp lock to
+ * acquire the tree write lock, it will do so. (The caller must ensure inp
+ * will not become invalid, probably by holding a reference to it.)
+ * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
+ * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
+ * tln: A pointer to the node. (optional; ignored if tp is specified)
+ * tree_locked: A pointer to the state of the tree lock. If the tree lock
+ * state changes, the function will update it.
+ *
+ * Will return true if the INP lock was reacquired; otherwise, false.
+ */
+static bool
+tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
+ struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
+ int *tree_locked)
+{
+ int orig_tree_locked;
+
+ KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
+ ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
+ tp, tlb, tln));
+ KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
+ __func__));
+
+ if (tp != NULL) {
+ tlb = tp->t_lib;
+ tln = tp->t_lin;
+ KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
+ KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
+ }
+
+ tcp_log_id_validate_tree_lock(*tree_locked);
+ TCPID_BUCKET_LOCK_ASSERT(tlb);
+
+ /*
+ * Remove the node, clear the log bucket and node from the TCPCB, and
+ * decrement the bucket refcount. In the process, if this is the
+ * last reference, the bucket will be freed.
+ */
+ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
+ if (tp != NULL) {
+ tp->t_lib = NULL;
+ tp->t_lin = NULL;
+ }
+ orig_tree_locked = *tree_locked;
+ if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
+ TCPID_BUCKET_UNLOCK(tlb);
+ return (*tree_locked != orig_tree_locked);
+}
+
+#define RECHECK_INP_CLEAN(cleanup) do { \
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \
+ rv = ECONNRESET; \
+ cleanup; \
+ goto done; \
+ } \
+ tp = intotcpcb(inp); \
+} while (0)
+
+#define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */)
+
+static void
+tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+#ifdef NETFLIX
+ if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
+ (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
+#endif
+}
+
+/*
+ * Set the TCP log ID for a TCPCB.
+ * Called with INPCB locked. Returns with it unlocked.
+ */
+int
+tcp_log_set_id(struct tcpcb *tp, char *id)
+{
+ struct tcp_log_id_bucket *tlb, *tmp_tlb;
+ struct tcp_log_id_node *tln;
+ struct inpcb *inp;
+ int tree_locked, rv;
+ bool bucket_locked;
+
+ tlb = NULL;
+ tln = NULL;
+ inp = tp->t_inpcb;
+ tree_locked = TREE_UNLOCKED;
+ bucket_locked = false;
+
+restart:
+ INP_WLOCK_ASSERT(inp);
+
+ /* See if the ID is unchanged. */
+ if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
+ (tp->t_lib == NULL && *id == 0)) {
+ rv = 0;
+ goto done;
+ }
+
+ /*
+ * If the TCPCB had a previous ID, we need to extricate it from
+ * the previous list.
+ *
+ * Drop the TCPCB lock and lock the tree and the bucket.
+ * Because this is called in the socket context, we (theoretically)
+ * don't need to worry about the INPCB completely going away
+ * while we are gone.
+ */
+ if (tp->t_lib != NULL) {
+ tlb = tp->t_lib;
+ TCPID_BUCKET_REF(tlb);
+ INP_WUNLOCK(inp);
+
+ if (tree_locked == TREE_UNLOCKED) {
+ TCPID_TREE_RLOCK();
+ tree_locked = TREE_RLOCKED;
+ }
+ TCPID_BUCKET_LOCK(tlb);
+ bucket_locked = true;
+ INP_WLOCK(inp);
+
+ /*
+ * Unreference the bucket. If our bucket went away, it is no
+ * longer locked or valid.
+ */
+ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
+ bucket_locked = false;
+ tlb = NULL;
+ }
+
+ /* Validate the INP. */
+ RECHECK_INP();
+
+ /*
+ * Evaluate whether the bucket changed while we were unlocked.
+ *
+ * Possible scenarios here:
+ * 1. Bucket is unchanged and the same one we started with.
+ * 2. The TCPCB no longer has a bucket and our bucket was
+ * freed.
+ * 3. The TCPCB has a new bucket, whether or not ours was freed.
+ * 4. The TCPCB no longer has a bucket and our bucket was
+ * not freed.
+ *
+ * In cases 2-4, we will start over. In case 1, we will
+ * proceed here to remove the bucket.
+ */
+ if (tlb == NULL || tp->t_lib != tlb) {
+ KASSERT(bucket_locked || tlb == NULL,
+ ("%s: bucket_locked (%d) and tlb (%p) are "
+ "inconsistent", __func__, bucket_locked, tlb));
+
+ if (bucket_locked) {
+ TCPID_BUCKET_UNLOCK(tlb);
+ bucket_locked = false;
+ tlb = NULL;
+ }
+ goto restart;
+ }
+
+ /*
+ * Store the (struct tcp_log_id_node) for reuse. Then, remove
+ * it from the bucket. In the process, we may end up relocking.
+ * If so, we need to validate that the INP is still valid, and
+ * the TCPCB entries match what we expect.
+ *
+ * We will clear tlb and change the bucket_locked state just
+ * before calling tcp_log_remove_id_node(), since that function
+ * will unlock the bucket.
+ */
+ if (tln != NULL)
+ uma_zfree(tcp_log_node_zone, tln);
+ tln = tp->t_lin;
+ tlb = NULL;
+ bucket_locked = false;
+ if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
+ RECHECK_INP();
+
+ /*
+ * If the TCPCB moved to a new bucket while we had
+ * dropped the lock, restart.
+ */
+ if (tp->t_lib != NULL || tp->t_lin != NULL)
+ goto restart;
+ }
+
+ /*
+ * Yay! We successfully removed the TCPCB from its old
+ * bucket. Phew!
+ *
+ * On to bigger and better things...
+ */
+ }
+
+ /* At this point, the TCPCB should not be in any bucket. */
+ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));
+
+ /*
+ * If the new ID is not empty, we need to now assign this TCPCB to a
+ * new bucket.
+ */
+ if (*id) {
+ /* Get a new tln, if we don't already have one to reuse. */
+ if (tln == NULL) {
+ tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO);
+ if (tln == NULL) {
+ rv = ENOBUFS;
+ goto done;
+ }
+ tln->tln_inp = inp;
+ tln->tln_tp = tp;
+ }
+
+ /*
+ * Drop the INP lock for a bit. We don't need it, and dropping
+ * it prevents lock order reversals.
+ */
+ INP_WUNLOCK(inp);
+
+ /* Make sure we have at least a read lock on the tree. */
+ tcp_log_id_validate_tree_lock(tree_locked);
+ if (tree_locked == TREE_UNLOCKED) {
+ TCPID_TREE_RLOCK();
+ tree_locked = TREE_RLOCKED;
+ }
+
+refind:
+ /*
+ * Remember how we constructed (struct tcp_log_id_bucket), so we
+ * can safely cast the id to it for the purposes of finding.
+ */
+ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
+ __func__, __LINE__));
+ tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
+ (struct tcp_log_id_bucket *) id);
+
+ /*
+ * If we didn't find a matching bucket, we need to add a new
+ * one. This requires a write lock. But, of course, we will
+ * need to recheck some things when we re-acquire the lock.
+ */
+ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
+ tree_locked = TREE_WLOCKED;
+ if (!TCPID_TREE_UPGRADE()) {
+ TCPID_TREE_RUNLOCK();
+ TCPID_TREE_WLOCK();
+
+ /*
+ * The tree may have changed while we were
+ * unlocked.
+ */
+ goto refind;
+ }
+ }
+
+ /* If we need to add a new bucket, do it now. */
+ if (tmp_tlb == NULL) {
+ /* Allocate new bucket. */
+ tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT);
+ if (tlb == NULL) {
+ rv = ENOBUFS;
+ goto done_noinp;
+ }
+
+ /*
+ * Copy the ID to the bucket.
+ * NB: Don't use strlcpy() unless you are sure
+ * we've always validated NULL termination.
+ *
+ * TODO: When I'm done writing this, see if we
+ * have correctly validated NULL termination and
+ * can use strlcpy(). :-)
+ */
+ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
+ tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';
+
+ /*
+ * Take the refcount for the first node and go ahead
+ * and lock this. Note that we zero the tlb_mtx
+ * structure, since 0xdeadc0de flips the right bits
+ * for the code to think that this mutex has already
+ * been initialized. :-(
+ */
+ SLIST_INIT(&tlb->tlb_head);
+ refcount_init(&tlb->tlb_refcnt, 1);
+ memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
+ TCPID_BUCKET_LOCK_INIT(tlb);
+ TCPID_BUCKET_LOCK(tlb);
+ bucket_locked = true;
+
+#define FREE_NEW_TLB() do { \
+ TCPID_BUCKET_LOCK_DESTROY(tlb); \
+ uma_zfree(tcp_log_bucket_zone, tlb); \
+ bucket_locked = false; \
+ tlb = NULL; \
+} while (0)
+ /*
+ * Relock the INP and make sure we are still
+ * unassigned.
+ */
+ INP_WLOCK(inp);
+ RECHECK_INP_CLEAN(FREE_NEW_TLB());
+ if (tp->t_lib != NULL) {
+ FREE_NEW_TLB();
+ goto restart;
+ }
+
+ /* Add the new bucket to the tree. */
+ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
+ tlb);
+ KASSERT(tmp_tlb == NULL,
+ ("%s: Unexpected conflicting bucket (%p) while "
+ "adding new bucket (%p)", __func__, tmp_tlb, tlb));
+
+ /*
+ * If we found a conflicting bucket, free the new
+ * one we made and fall through to use the existing
+ * bucket.
+ */
+ if (tmp_tlb != NULL) {
+ FREE_NEW_TLB();
+ INP_WUNLOCK(inp);
+ }
+#undef FREE_NEW_TLB
+ }
+
+ /* If we found an existing bucket, use it. */
+ if (tmp_tlb != NULL) {
+ tlb = tmp_tlb;
+ TCPID_BUCKET_LOCK(tlb);
+ bucket_locked = true;
+
+ /*
+ * Relock the INP and make sure we are still
+ * unassigned.
+ */
+ INP_UNLOCK_ASSERT(inp);
+ INP_WLOCK(inp);
+ RECHECK_INP();
+ if (tp->t_lib != NULL) {
+ TCPID_BUCKET_UNLOCK(tlb);
+ tlb = NULL;
+ goto restart;
+ }
+
+ /* Take a reference on the bucket. */
+ TCPID_BUCKET_REF(tlb);
+ }
+
+ tcp_log_grow_tlb(tlb->tlb_id, tp);
+
+ /* Add the new node to the list. */
+ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
+ tp->t_lib = tlb;
+ tp->t_lin = tln;
+ tln = NULL;
+ }
+
+ rv = 0;
+
+done:
+ /* Unlock things, as needed, and return. */
+ INP_WUNLOCK(inp);
+done_noinp:
+ INP_UNLOCK_ASSERT(inp);
+ if (bucket_locked) {
+ TCPID_BUCKET_LOCK_ASSERT(tlb);
+ TCPID_BUCKET_UNLOCK(tlb);
+ } else if (tlb != NULL)
+ TCPID_BUCKET_UNLOCK_ASSERT(tlb);
+ if (tree_locked == TREE_WLOCKED) {
+ TCPID_TREE_WLOCK_ASSERT();
+ TCPID_TREE_WUNLOCK();
+ } else if (tree_locked == TREE_RLOCKED) {
+ TCPID_TREE_RLOCK_ASSERT();
+ TCPID_TREE_RUNLOCK();
+ } else
+ TCPID_TREE_UNLOCK_ASSERT();
+ if (tln != NULL)
+ uma_zfree(tcp_log_node_zone, tln);
+ return (rv);
+}
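A hypothetical sketch of the TCP_LOGID option handler (the real one is in tcp_usrreq.c, which this commit touches but is not shown in this excerpt). The points to note are the copyin bound of TCP_LOG_ID_LEN - 1 and the locking contract: tcp_log_set_id() is entered with the INP write lock held and returns with it released:

	static int
	handle_tcp_logid(struct inpcb *inp, struct sockopt *sopt)
	{
		struct tcpcb *tp;
		char buf[TCP_LOG_ID_LEN];
		int error;

		/* Copy in at most TCP_LOG_ID_LEN - 1 bytes, NUL-terminated. */
		memset(buf, 0, sizeof(buf));
		error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
		if (error)
			return (error);
		INP_WLOCK(inp);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			INP_WUNLOCK(inp);
			return (ECONNRESET);
		}
		tp = intotcpcb(inp);
		return (tcp_log_set_id(tp, buf));	/* unlocks the INP */
	}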
+
+/*
+ * Get the TCP log ID for a TCPCB.
+ * Called with INPCB locked.
+ * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long.
+ * Returns number of bytes copied.
+ */
+size_t
+tcp_log_get_id(struct tcpcb *tp, char *buf)
+{
+ size_t len;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ if (tp->t_lib != NULL) {
+ len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
+ KASSERT(len < TCP_LOG_ID_LEN,
+ ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
+ __func__, __LINE__, len));
+ } else {
+ *buf = '\0';
+ len = 0;
+ }
+ return (len);
+}
+
+/*
+ * Get number of connections with the same log ID.
+ * Log ID is taken from given TCPCB.
+ * Called with INPCB locked.
+ */
+u_int
+tcp_log_get_id_cnt(struct tcpcb *tp)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt);
+}
+
+#ifdef TCPLOG_DEBUG_RINGBUF
+/*
+ * Functions/macros to increment/decrement reference count for a log
+ * entry. This should catch when we do a double-free/double-remove or
+ * a double-add.
+ */
+static inline void
+_tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
+ int line)
+{
+ int refcnt;
+
+ refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
+ if (refcnt != 0)
+ panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
+ func, line, log_entry, refcnt);
+}
+#define tcp_log_entry_refcnt_add(l) \
+ _tcp_log_entry_refcnt_add((l), __func__, __LINE__)
+
+static inline void
+_tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
+ int line)
+{
+ int refcnt;
+
+ refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
+ if (refcnt != 1)
+ panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
+ func, line, log_entry, refcnt);
+}
+#define tcp_log_entry_refcnt_rem(l) \
+ _tcp_log_entry_refcnt_rem((l), __func__, __LINE__)
+
+#else /* !TCPLOG_DEBUG_RINGBUF */
+
+#define tcp_log_entry_refcnt_add(l)
+#define tcp_log_entry_refcnt_rem(l)
+
+#endif
+
+/*
+ * Cleanup after removing a log entry, but only decrement the count if we
+ * are running INVARIANTS.
+ */
+static inline void
+tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
+{
+
+ uma_zfree(tcp_log_zone, log_entry);
+#ifdef INVARIANTS
+ (*count)--;
+ KASSERT(*count >= 0,
+ ("%s: count unexpectedly negative", __func__));
+#endif
+}
+
+static void
+tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
+{
+ struct tcp_log_mem *log_entry;
+
+ /* Free the entries. */
+ while ((log_entry = STAILQ_FIRST(head)) != NULL) {
+ STAILQ_REMOVE_HEAD(head, tlm_queue);
+ tcp_log_entry_refcnt_rem(log_entry);
+ tcp_log_free_log_common(log_entry, count);
+ }
+}
+
+/* Cleanup after removing a log entry. */
+static inline void
+tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
+{
+ uma_zfree(tcp_log_zone, log_entry);
+ tp->t_lognum--;
+ KASSERT(tp->t_lognum >= 0,
+ ("%s: tp->t_lognum unexpectedly negative", __func__));
+}
+
+/* Remove a log entry from the head of a list. */
+static inline void
+tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
+{
+
+ KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs),
+ ("%s: attempt to remove non-HEAD log entry", __func__));
+ STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
+ tcp_log_entry_refcnt_rem(log_entry);
+ tcp_log_remove_log_cleanup(tp, log_entry);
+}
+
+#ifdef TCPLOG_DEBUG_RINGBUF
+/*
+ * Initialize the log entry's reference count, which we want to
+ * survive allocations.
+ */
+static int
+tcp_log_zone_init(void *mem, int size, int flags __unused)
+{
+ struct tcp_log_mem *tlm;
+
+ KASSERT(size >= sizeof(struct tcp_log_mem),
+ ("%s: unexpectedly short (%d) allocation", __func__, size));
+ tlm = (struct tcp_log_mem *)mem;
+ tlm->tlm_refcnt = 0;
+ return (0);
+}
+
+/*
+ * Double check that the refcnt is zero on allocation and return.
+ */
+static int
+tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
+{
+ struct tcp_log_mem *tlm;
+
+ KASSERT(size >= sizeof(struct tcp_log_mem),
+ ("%s: unexpectedly short (%d) allocation", __func__, size));
+ tlm = (struct tcp_log_mem *)mem;
+ if (tlm->tlm_refcnt != 0)
+ panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
+ __func__, __LINE__, tlm, tlm->tlm_refcnt);
+ return (0);
+}
+
+static void
+tcp_log_zone_dtor(void *mem, int size, void *args __unused)
+{
+ struct tcp_log_mem *tlm;
+
+ KASSERT(size >= sizeof(struct tcp_log_mem),
+ ("%s: unexpectedly short (%d) allocation", __func__, size));
+ tlm = (struct tcp_log_mem *)mem;
+ if (tlm->tlm_refcnt != 0)
+ panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
+ __func__, __LINE__, tlm, tlm->tlm_refcnt);
+}
+#endif /* TCPLOG_DEBUG_RINGBUF */
+
+/* Do global initialization. */
+void
+tcp_log_init(void)
+{
+
+ tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
+#ifdef TCPLOG_DEBUG_RINGBUF
+ tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
+#else
+ NULL, NULL, NULL,
+#endif
+ NULL, UMA_ALIGN_PTR, 0);
+ (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
+ tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket",
+ sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ tcp_log_node_zone = uma_zcreate("tcp_log_node",
+ sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+#ifdef TCPLOG_DEBUG_COUNTERS
+ tcp_log_queued = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_read = counter_u64_alloc(M_WAITOK);
+ tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
+#endif
+
+ rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
+ mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
+ callout_init(&tcp_log_expireq_callout, 1);
+}
+
+/* Do per-TCPCB initialization. */
+void
+tcp_log_tcpcbinit(struct tcpcb *tp)
+{
+
+ /* A new TCPCB should start out zero-initialized. */
+ STAILQ_INIT(&tp->t_logs);
+
+ /*
+ * If we are doing auto-capturing, figure out whether we will capture
+ * this session.
+ */
+ if (tcp_log_selectauto()) {
+ tp->t_logstate = tcp_log_auto_mode;
+ tp->t_flags2 |= TF2_LOG_AUTO;
+ }
+}
+
+/* Expire entries on the log-ID expiry queue (callout handler). */
+static void
+tcp_log_expire(void *unused __unused)
+{
+ struct tcp_log_id_bucket *tlb;
+ struct tcp_log_id_node *tln;
+ sbintime_t expiry_limit;
+ int tree_locked;
+
+ TCPLOG_EXPIREQ_LOCK();
+ if (callout_pending(&tcp_log_expireq_callout)) {
+ /* Callout was reset. */
+ TCPLOG_EXPIREQ_UNLOCK();
+ return;
+ }
+
+ /*
+ * Process entries until we reach one that expires too far in the
+ * future. Look one second in the future.
+ */
+ expiry_limit = getsbinuptime() + SBT_1S;
+ tree_locked = TREE_UNLOCKED;
+
+ while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
+ tln->tln_expiretime <= expiry_limit) {
+ if (!callout_active(&tcp_log_expireq_callout)) {
+ /*
+ * Callout was stopped. I guess we should
+ * just quit at this point.
+ */
+ TCPLOG_EXPIREQ_UNLOCK();
+ return;
+ }
+
+ /*
+ * Remove the node from the head of the list and unlock
+ * the list. Change the expiry time to SBT_MAX as a signal
+ * to other threads that we now own this.
+ */
+ STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
+ tln->tln_expiretime = SBT_MAX;
+ TCPLOG_EXPIREQ_UNLOCK();
+
+ /*
+ * Remove the node from the bucket.
+ */
+ tlb = tln->tln_bucket;
+ TCPID_BUCKET_LOCK(tlb);
+ if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
+ tcp_log_id_validate_tree_lock(tree_locked);
+ if (tree_locked == TREE_WLOCKED)
+ TCPID_TREE_WUNLOCK();
+ else
+ TCPID_TREE_RUNLOCK();
+ tree_locked = TREE_UNLOCKED;
+ }
+
+ /* Drop the INP reference. */
+ INP_WLOCK(tln->tln_inp);
+ if (!in_pcbrele_wlocked(tln->tln_inp))
+ INP_WUNLOCK(tln->tln_inp);
+
+ /* Free the log records. */
+ tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
+
+ /* Free the node. */
+ uma_zfree(tcp_log_node_zone, tln);
+
+ /* Relock the expiry queue. */
+ TCPLOG_EXPIREQ_LOCK();
+ }
+
+ /*
+ * We've expired all the entries we can. Do we need to reschedule
+ * ourselves?
+ */
+ callout_deactivate(&tcp_log_expireq_callout);
+ if (tln != NULL) {
+ /*
+ * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
+ * set the next callout to that. (This helps ensure we generally
+ * run the callout no more often than desired.)
+ */
+ expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
+ if (expiry_limit < tln->tln_expiretime)
+ expiry_limit = tln->tln_expiretime;
+ callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
+ SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
+ }
+
+ /* We're done. */
+ TCPLOG_EXPIREQ_UNLOCK();
+ return;
+}
+
+/*
+ * Move log data from the TCPCB to a new node. This will reset the TCPCB log
+ * entries and log count; however, it will not touch other things from the
+ * TCPCB (e.g. t_lin, t_lib).
+ *
+ * NOTE: Must hold a lock on the INP.
+ */
+static void
+tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie;
+ if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6)
+ tln->tln_af = AF_INET6;
+ else
+ tln->tln_af = AF_INET;
+ tln->tln_entries = tp->t_logs;
+ tln->tln_count = tp->t_lognum;
+ tln->tln_bucket = tp->t_lib;
+
+ /* Clear information from the PCB. */
+ STAILQ_INIT(&tp->t_logs);
+ tp->t_lognum = 0;
+}
+
+/* Do per-TCPCB cleanup */
+void
+tcp_log_tcpcbfini(struct tcpcb *tp)
+{
+ struct tcp_log_id_node *tln, *tln_first;
+ struct tcp_log_mem *log_entry;
+ sbintime_t callouttime;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * If we were gathering packets to be automatically dumped, try to do
+ * it now. If this succeeds, the log information in the TCPCB will be
+ * cleared. Otherwise, we'll handle the log information as we do
+ * for other states.
+ */
+ switch (tp->t_logstate) {
+ case TCP_LOG_STATE_HEAD_AUTO:
+ (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
+ M_NOWAIT, false);
+ break;
+ case TCP_LOG_STATE_TAIL_AUTO:
+ (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
+ M_NOWAIT, false);
+ break;
+ case TCP_LOG_STATE_CONTINUAL:
+ (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
+ M_NOWAIT, false);
+ break;
+ }
+
+ /*
+ * There are two ways we could keep logs: per-socket or per-ID. If
+ * we are tracking logs with an ID, then the logs survive the
+ * destruction of the TCPCB.
+ *
+ * If the TCPCB is associated with an ID node, move the logs from the
+ * TCPCB to the ID node. In theory, this is safe, for reasons which I
+ * will now explain for my own benefit when I next need to figure out
+ * this code. :-)
+ *
+ * We own the INP lock. Therefore, no one else can change the contents
+ * of this node (Rule C). Further, no one can remove this node from
+ * the bucket while we hold the lock (Rule D). Basically, no one can
+ * mess with this node. That leaves two states in which we could be:
+ *
+ * 1. Another thread is currently waiting to acquire the INP lock, with
+ * plans to do something with this node. When we drop the INP lock,
+ * they will have a chance to do that. They will recheck the
+ * tln_closed field (see note to Rule C) and then acquire the
+ * bucket lock before proceeding further.
+ *
+ * 2. Another thread will try to acquire a lock at some point in the
+ * future. If they try to acquire a lock before we set the
+ * tln_closed field, they will follow state #1. If they try to
+ * acquire a lock after we set the tln_closed field, they will be
+ * able to make changes to the node, at will, following Rule C.
+ *
+ * Therefore, we currently own this node and can make any changes
+ * we want. But, as soon as we set the tln_closed field to true, we
+ * have effectively dropped our lock on the node. (For this reason, we
+ * also need to make sure our writes are ordered correctly. An atomic
+ * operation with "release" semantics should be sufficient.)
+ */
+
+ if (tp->t_lin != NULL) {
+ /* Copy the relevant information to the log entry. */
+ tln = tp->t_lin;
+ KASSERT(tln->tln_inp == tp->t_inpcb,
+ ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)",
+ __func__, tln->tln_inp, tp->t_inpcb));
+ tcp_log_move_tp_to_node(tp, tln);
+
+ /* Clear information from the PCB. */
+ tp->t_lin = NULL;
+ tp->t_lib = NULL;
+
+ /*
+ * Take a reference on the INP. This ensures that the INP
+ * remains valid while the node is on the expiry queue. This
+ * ensures the INP is valid for other threads that may be
+ * racing to lock this node when we move it to the expire
+ * queue.
+ */
+ in_pcbref(tp->t_inpcb);
+
+ /*
+ * Store the entry on the expiry list. The exact behavior
+ * depends on whether we have entries to keep. If so, we
+ * put the entry at the tail of the list and expire in
+ * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
+ * the entry at the head of the list. (Handling the cleanup
+ * via the expiry timer lets us avoid locking messy-ness here.)
+ */
+ tln->tln_expiretime = getsbinuptime();
+ TCPLOG_EXPIREQ_LOCK();
+ if (tln->tln_count) {
+ tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
+ if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
+ !callout_active(&tcp_log_expireq_callout)) {
+ /*
+ * We are adding the first entry and a callout
+ * is not currently scheduled; therefore, we
+ * need to schedule one.
+ */
+ callout_reset_sbt(&tcp_log_expireq_callout,
+ tln->tln_expiretime, SBT_1S, tcp_log_expire,
+ NULL, C_ABSOLUTE);
+ }
+ STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
+ tln_expireq);
+ } else {
+ callouttime = tln->tln_expiretime +
+ TCP_LOG_EXPIRE_INTVL;
+ tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
+
+ if ((tln_first == NULL ||
+ callouttime < tln_first->tln_expiretime) &&
+ (callout_pending(&tcp_log_expireq_callout) ||
+ !callout_active(&tcp_log_expireq_callout))) {
+ /*
+ * The list is empty, or we want to run the
+ * expire code before the first entry's timer
+ * fires. Also, we are in a case where a callout
+ * is not actively running. We want to reset
+ * the callout to occur sooner.
+ */
+ callout_reset_sbt(&tcp_log_expireq_callout,
+ callouttime, SBT_1S, tcp_log_expire, NULL,
+ C_ABSOLUTE);
+ }
+
+ /*
+ * Insert to the head, or just after the head, as
+ * appropriate. (This might result in small
+ * mis-orderings as a bunch of "expire now" entries
+ * gather at the start of the list, but that should
+ * not produce big problems, since the expire timer
+ * will walk through all of them.)
+ */
+ if (tln_first == NULL ||
+ tln->tln_expiretime < tln_first->tln_expiretime)
+ STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
+ tln_expireq);
+ else
+ STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
+ tln_first, tln, tln_expireq);
+ }
+ TCPLOG_EXPIREQ_UNLOCK();
+
+ /*
+ * We are done messing with the tln. After this point, we
+ * can't touch it. (Note that the "release" semantics should
+ * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
+ * Therefore, they should be unnecessary here. However, it
+ * seems like a good idea to include them anyway, since we
+ * really are releasing a lock here.)
+ */
+ atomic_store_rel_int(&tln->tln_closed, 1);
+ } else {
+ /* Remove log entries. */
+ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
+ tcp_log_remove_log_head(tp, log_entry);
+ KASSERT(tp->t_lognum == 0,
+ ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
+ __func__, tp->t_lognum));
+ }
+
+ /*
+ * Change the log state to off (just in case anything tries to sneak
+ * in a last-minute log).
+ */
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+}
+
+/*
+ * This logs an event for a TCP socket. Normally, this is called via
+ * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
+ * TCP_LOG_EVENT().
+ */
+
+struct tcp_log_buffer *
+tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
+ struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
+ union tcp_log_stackspecific *stackinfo, int th_hostorder,
+ const char *output_caller, const char *func, int line, const struct timeval *itv)
+{
+ struct tcp_log_mem *log_entry;
+ struct tcp_log_buffer *log_buf;
+ int attempt_count = 0;
+ struct tcp_log_verbose *log_verbose;
+ uint32_t logsn;
+
+ KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
+ ("%s called with inconsistent func (%p) and line (%d) arguments",
+ __func__, func, line));
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD ||
+ tp->t_logstate == TCP_LOG_STATE_TAIL ||
+ tp->t_logstate == TCP_LOG_STATE_CONTINUAL ||
+ tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
+ tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO,
+ ("%s called with unexpected tp->t_logstate (%d)", __func__,
+ tp->t_logstate));
+
+ /*
+ * Get the serial number. We do this early so it will
+ * increment even if we end up skipping the log entry for some
+ * reason.
+ */
+ logsn = tp->t_logsn++;
+
+ /*
+ * Can we get a new log entry? If so, increment the lognum counter
+ * here.
+ */
+retry:
+ if (tp->t_lognum < tcp_log_session_limit) {
+ if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
+ tp->t_lognum++;
+ } else
+ log_entry = NULL;
+
+ /* Do we need to try to reuse? */
+ if (log_entry == NULL) {
+ /*
+ * Sacrifice auto-logged sessions without a log ID if
+ * tcp_log_auto_all is false. (If they don't have a log
+ * ID by now, it is probable that either they won't get one
+ * or we are resource-constrained.)
+ */
+ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
+ !tcp_log_auto_all) {
+ if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
+#ifdef INVARIANTS
+ panic("%s:%d: tcp_log_state_change() failed "
+ "to set tp %p to TCP_LOG_STATE_CLEAR",
+ __func__, __LINE__, tp);
+#endif
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+ }
+ return (NULL);
+ }
+ /*
+ * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
+ * the buffers. If successful, deactivate tracing. Otherwise,
+ * leave it active so we will retry.
+ */
+ if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
+ !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
+ M_NOWAIT, false)) {
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+ return (NULL);
+ } else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) &&
+ !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
+ M_NOWAIT, false)) {
+ if (attempt_count == 0) {
+ attempt_count++;
+ goto retry;
+ }
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_fail4, 1);
+#endif
+ return (NULL);
+ } else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO)
+ return (NULL);
+
+ /* If in HEAD state, just deactivate the tracing and return. */
+ if (tp->t_logstate == TCP_LOG_STATE_HEAD) {
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+ return (NULL);
+ }
+
+ /*
+ * Get a buffer to reuse. If that fails, just give up.
+ * (We can't log anything without a buffer in which to
+ * put it.)
+ *
+ * Note that we don't change the t_lognum counter
+ * here. Because we are re-using the buffer, the total
+ * number won't change.
+ */
+ if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
+ return (NULL);
+ STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
+ tcp_log_entry_refcnt_rem(log_entry);
+ }
+
+ KASSERT(log_entry != NULL,
+ ("%s: log_entry unexpectedly NULL", __func__));
+
+ /* Extract the log buffer and verbose buffer pointers. */
+ log_buf = &log_entry->tlm_buf;
+ log_verbose = &log_entry->tlm_v;
+
+ /* Basic entries. */
+ if (itv == NULL)
+ getmicrouptime(&log_buf->tlb_tv);
+ else
+ memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
+ log_buf->tlb_ticks = ticks;
+ log_buf->tlb_sn = logsn;
+ log_buf->tlb_stackid = tp->t_fb->tfb_id;
+ log_buf->tlb_eventid = eventid;
+ log_buf->tlb_eventflags = 0;
+ log_buf->tlb_errno = errornum;
+
+ /* Socket buffers */
+ if (rxbuf != NULL) {
+ log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
+ log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
+ log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
+ log_buf->tlb_rxbuf.tls_sb_spare = 0;
+ }
+ if (txbuf != NULL) {
+ log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
+ log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
+ log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
+ log_buf->tlb_txbuf.tls_sb_spare = 0;
+ }
+ /* Copy values from tp to the log entry. */
+#define COPY_STAT(f) log_buf->tlb_ ## f = tp->f
+#define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f
+ COPY_STAT_T(state);
+ COPY_STAT_T(starttime);
+ COPY_STAT(iss);
+ COPY_STAT_T(flags);
+ COPY_STAT(snd_una);
+ COPY_STAT(snd_max);
+ COPY_STAT(snd_cwnd);
+ COPY_STAT(snd_nxt);
+ COPY_STAT(snd_recover);
+ COPY_STAT(snd_wnd);
+ COPY_STAT(snd_ssthresh);
+ COPY_STAT_T(srtt);
+ COPY_STAT_T(rttvar);
+ COPY_STAT(rcv_up);
+ COPY_STAT(rcv_adv);
+ COPY_STAT(rcv_nxt);
+ COPY_STAT(sack_newdata);
+ COPY_STAT(rcv_wnd);
+ COPY_STAT_T(dupacks);
+ COPY_STAT_T(segqlen);
+ COPY_STAT(snd_numholes);
+ COPY_STAT(snd_scale);
+ COPY_STAT(rcv_scale);
+#undef COPY_STAT
+#undef COPY_STAT_T
+ log_buf->tlb_flex1 = 0;
+ log_buf->tlb_flex2 = 0;
+ /* Copy stack-specific info. */
+ if (stackinfo != NULL) {
+ memcpy(&log_buf->tlb_stackinfo, stackinfo,
+ sizeof(log_buf->tlb_stackinfo));
+ log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
+ }
+
+ /* The packet */
+ log_buf->tlb_len = len;
+ if (th) {
+ int optlen;
+
+ log_buf->tlb_eventflags |= TLB_FLAG_HDR;
+ log_buf->tlb_th = *th;
+ if (th_hostorder)
+ tcp_fields_to_net(&log_buf->tlb_th);
+ optlen = (th->th_off << 2) - sizeof (struct tcphdr);
+ if (optlen > 0)
+ memcpy(log_buf->tlb_opts, th + 1, optlen);
+ }
+
+ /* Verbose information */
+ if (func != NULL) {
+ log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
+ if (output_caller != NULL)
+ strlcpy(log_verbose->tlv_snd_frm, output_caller,
+ TCP_FUNC_LEN);
+ else
+ *log_verbose->tlv_snd_frm = 0;
+ strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
+ log_verbose->tlv_trace_line = line;
+ }
+
+ /* Insert the new log at the tail. */
+ STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
+ tcp_log_entry_refcnt_add(log_entry);
+ return (log_buf);
+}
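The TCP_LOG_EVENT() macro used in the tcp_input.c hunk above presumably expands to this function with the caller's __func__/__LINE__ filled in when verbose logging is enabled (the macro lives in tcp_log_buf.h, not shown here). Called directly, an instrumentation point might look like the following sketch; the post-hoc write to tlb_flex1 is an assumption based on the flex fields being zeroed above for exactly that kind of use:

	static void
	log_input_sketch(struct tcpcb *tp, struct tcphdr *th, struct socket *so,
	    uint32_t tlen)
	{
		struct tcp_log_buffer *lgb;

		if (tp->t_logstate == TCP_LOG_STATE_OFF)
			return;
		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd,
		    TCP_LOG_IN, 0, tlen, NULL, 1 /* th in host order */,
		    NULL, __func__, __LINE__, NULL);
		if (lgb != NULL)
			lgb->tlb_flex1 = tlen;	/* hypothetical extra datum */
	}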
+
+/*
+ * Change the logging state for a TCPCB. Returns 0 on success or an
+ * error code on failure.
+ */
+int
+tcp_log_state_change(struct tcpcb *tp, int state)
+{
+ struct tcp_log_mem *log_entry;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ switch (state) {
+ case TCP_LOG_STATE_CLEAR:
+ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
+ tcp_log_remove_log_head(tp, log_entry);
+ /* Fall through */
+
+ case TCP_LOG_STATE_OFF:
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+ break;
+
+ case TCP_LOG_STATE_TAIL:
+ case TCP_LOG_STATE_HEAD:
+ case TCP_LOG_STATE_CONTINUAL:
+ case TCP_LOG_STATE_HEAD_AUTO:
+ case TCP_LOG_STATE_TAIL_AUTO:
+ tp->t_logstate = state;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ tp->t_flags2 &= ~(TF2_LOG_AUTO);
+
+ return (0);
+}
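A sketch of how the TCP_LOG socket option handler (tcp_usrreq.c, not shown in this excerpt) would plausibly feed this function; an unrecognized state simply falls out as EINVAL above:

	static int
	handle_tcp_log(struct inpcb *inp, struct sockopt *sopt)
	{
		struct tcpcb *tp;
		int error, optval;

		error = sooptcopyin(sopt, &optval, sizeof(optval),
		    sizeof(optval));
		if (error)
			return (error);
		INP_WLOCK(inp);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			INP_WUNLOCK(inp);
			return (ECONNRESET);
		}
		tp = intotcpcb(inp);
		error = tcp_log_state_change(tp, optval);
		INP_WUNLOCK(inp);
		return (error);
	}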
+
+/* If tcp_drain() is called, flush half the log entries. */
+void
+tcp_log_drain(struct tcpcb *tp)
+{
+ struct tcp_log_mem *log_entry, *next;
+ int target, skip;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if ((target = tp->t_lognum / 2) == 0)
+ return;
+
+ /*
+ * If we are logging the "head" packets, we want to discard
+ * from the tail of the queue. Otherwise, we want to discard
+ * from the head.
+ */
+ if (tp->t_logstate == TCP_LOG_STATE_HEAD ||
+ tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
+ skip = tp->t_lognum - target;
+ STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
+ if (!--skip)
+ break;
+ KASSERT(log_entry != NULL,
+ ("%s: skipped through all entries!", __func__));
+ if (log_entry == NULL)
+ return;
+ while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
+ STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
+ tcp_log_entry_refcnt_rem(next);
+ tcp_log_remove_log_cleanup(tp, next);
+#ifdef INVARIANTS
+ target--;
+#endif
+ }
+ KASSERT(target == 0,
+ ("%s: After removing from tail, target was %d", __func__,
+ target));
+ } else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) {
+ (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
+ M_NOWAIT, false);
+ } else {
+ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
+ target--)
+ tcp_log_remove_log_head(tp, log_entry);
+ KASSERT(target <= 0,
+ ("%s: After removing from head, target was %d", __func__,
+ target));
+ KASSERT(tp->t_lognum > 0,
+ ("%s: After removing from head, tp->t_lognum was %d",
+ __func__, tp->t_lognum));
+ KASSERT(log_entry != NULL,
+ ("%s: After removing from head, the tailq was empty",
+ __func__));
+ }
+}
+
+static inline int
+tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
+{
+
+ if (sopt->sopt_td != NULL)
+ return (copyout(src, dst, len));
+ bcopy(src, dst, len);
+ return (0);
+}
+
+static int
+tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
+ struct tcp_log_buffer **end, int count)
+{
+ struct tcp_log_buffer *out_entry;
+ struct tcp_log_mem *log_entry;
+ size_t entrysize;
+ int error;
+#ifdef INVARIANTS
+ int orig_count = count;
+#endif
+
+ /* Copy the data out. */
+ error = 0;
+ out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
+ STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
+ count--;
+ KASSERT(count >= 0,
+ ("%s:%d: Exceeded expected count (%d) processing list %p",
+ __func__, __LINE__, orig_count, log_tailqp));
+
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_copyout, 1);
+#endif
+#if 0
+ struct tcp_log_buffer *lb = &log_entry->tlm_buf;
+ int i;
+
+ printf("lb = %p:\n", lb);
+#define PRINT(f) printf(#f " = %u\n", (unsigned int)lb->f)
+ printf("tlb_tv = {%lu, %lu}\n", lb->tlb_tv.tv_sec, lb->tlb_tv.tv_usec);
+ PRINT(tlb_ticks);
+ PRINT(tlb_sn);
+ PRINT(tlb_stackid);
+ PRINT(tlb_eventid);
+ PRINT(tlb_eventflags);
+ PRINT(tlb_errno);
+ PRINT(tlb_rxbuf.tls_sb_acc);
+ PRINT(tlb_rxbuf.tls_sb_ccc);
+ PRINT(tlb_rxbuf.tls_sb_spare);
+ PRINT(tlb_txbuf.tls_sb_acc);
+ PRINT(tlb_txbuf.tls_sb_ccc);
+ PRINT(tlb_txbuf.tls_sb_spare);
+ PRINT(tlb_state);
+ PRINT(tlb_flags);
+ PRINT(tlb_snd_una);
+ PRINT(tlb_snd_max);
+ PRINT(tlb_snd_cwnd);
+ PRINT(tlb_snd_nxt);
+ PRINT(tlb_snd_recover);
+ PRINT(tlb_snd_wnd);
+ PRINT(tlb_snd_ssthresh);
+ PRINT(tlb_srtt);
+ PRINT(tlb_rttvar);
+ PRINT(tlb_rcv_up);
+ PRINT(tlb_rcv_adv);
+ PRINT(tlb_rcv_nxt);
+ PRINT(tlb_sack_newdata);
+ PRINT(tlb_rcv_wnd);
+ PRINT(tlb_dupacks);
+ PRINT(tlb_segqlen);
+ PRINT(tlb_snd_numholes);
+ PRINT(tlb_snd_scale);
+ PRINT(tlb_rcv_scale);
+ PRINT(tlb_len);
+ printf("hex dump: ");
+ for (i = 0; i < sizeof(struct tcp_log_buffer); i++)
+ printf("%02x", *(((uint8_t *)lb) + i));
+#undef PRINT
+#endif
+ /*
+ * Skip copying out the header if it isn't present.
+ * Instead, copy out zeros (to ensure we don't leak info).
+ * TODO: Make sure we truly do zero everything we don't
+ * explicitly set.
+ */
+ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
+ entrysize = sizeof(struct tcp_log_buffer);
+ else
+ entrysize = offsetof(struct tcp_log_buffer, tlb_th);
+ error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
+ entrysize);
+ if (error)
+ break;
+ if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
+ error = tcp_log_copyout(sopt, zerobuf,
+ ((uint8_t *)out_entry) + entrysize,
+ sizeof(struct tcp_log_buffer) - entrysize);
+ }
+
+ /*
+ * Copy out the verbose bit, if needed. Either way,
+ * increment the output pointer the correct amount.
+ */
+ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
+ error = tcp_log_copyout(sopt, &log_entry->tlm_v,
+ out_entry->tlb_verbose,
+ sizeof(struct tcp_log_verbose));
+ if (error)
+ break;
+ out_entry = (struct tcp_log_buffer *)
+ (((uint8_t *) (out_entry + 1)) +
+ sizeof(struct tcp_log_verbose));
+ } else
+ out_entry++;
+ }
+ *end = out_entry;
+ KASSERT(error || count == 0,
+ ("%s:%d: Less than expected count (%d) processing list %p"
+ " (%d remain)", __func__, __LINE__, orig_count,
+ log_tailqp, count));
+
+ return (error);
+}
+
+/*
+ * Copy out the buffer. Note that we do incremental copying, so
+ * sooptcopyout() won't work. However, the goal is to produce the same
+ * end result as if we copied in the entire user buffer, updated it,
+ * and then used sooptcopyout() to copy it out.
+ *
+ * NOTE: This should be called with a write lock on the PCB; however,
+ * the function will drop it after it extracts the data from the TCPCB.
+ */
+int
+tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
+{
+ struct tcp_log_stailq log_tailq;
+ struct tcp_log_mem *log_entry, *log_next;
+ struct tcp_log_buffer *out_entry;
+ struct inpcb *inp;
+ size_t outsize, entrysize;
+ int error, outnum;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ inp = tp->t_inpcb;
+
+ /*
+ * Determine which log entries will fit in the buffer. As an
+ * optimization, skip this if all the entries will clearly fit
+ * in the buffer. (However, get an exact size if we are using
+ * INVARIANTS.)
+ */
+#ifndef INVARIANTS
+ if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
+ sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
+ log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
+ log_next = NULL;
+ outsize = 0;
+ outnum = tp->t_lognum;
+ } else {
+#endif
+ outsize = outnum = 0;
+ log_entry = NULL;
+ STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
+ entrysize = sizeof(struct tcp_log_buffer);
+ if (log_next->tlm_buf.tlb_eventflags &
+ TLB_FLAG_VERBOSE)
+ entrysize += sizeof(struct tcp_log_verbose);
+ if ((sopt->sopt_valsize - outsize) < entrysize)
+ break;
+ outsize += entrysize;
+ outnum++;
+ log_entry = log_next;
+ }
+ KASSERT(outsize <= sopt->sopt_valsize,
+ ("%s: calculated output size (%zu) greater than available"
+ "space (%zu)", __func__, outsize, sopt->sopt_valsize));
+#ifndef INVARIANTS
+ }
+#endif
+
+ /*
+ * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
+ * is NULL, silently skip the copy. However, in this case, we
+ * will leave the list alone and return. Functionally, this
+ * gives userspace a way to poll for an approximate buffer
+ * size they will need to get the log entries.
+ */
+ if (sopt->sopt_val == NULL) {
+ INP_WUNLOCK(inp);
+ if (outsize == 0) {
+ outsize = outnum * (sizeof(struct tcp_log_buffer) +
+ sizeof(struct tcp_log_verbose));
+ }
+ if (sopt->sopt_valsize > outsize)
+ sopt->sopt_valsize = outsize;
+ return (0);
+ }
+
+ /*
+ * Break apart the list. We'll save the ones we want to copy
+ * out locally and remove them from the TCPCB list. We can
+ * then drop the INPCB lock while we do the copyout.
+ *
+ * There are roughly three cases:
+ * 1. There was nothing to copy out. That's easy: drop the
+ * lock and return.
+ * 2. We are copying out the entire list. Again, that's easy:
+ * move the whole list.
+ * 3. We are copying out a partial list. That's harder. We
+ * need to update the list book-keeping entries.
+ */
+ if (log_entry != NULL && log_next == NULL) {
+ /* Move entire list. */
+ KASSERT(outnum == tp->t_lognum,
+ ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
+ __func__, __LINE__, outnum, tp->t_lognum));
+ log_tailq = tp->t_logs;
+ tp->t_lognum = 0;
+ STAILQ_INIT(&tp->t_logs);
+ } else if (log_entry != NULL) {
+ /* Move partial list. */
+ KASSERT(outnum < tp->t_lognum,
+ ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
+ __func__, __LINE__, outnum, tp->t_lognum));
+ STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
+ STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
+ KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
+ ("%s:%d: tp->t_logs is unexpectedly shorter than expected"
+ "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
+ __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
+ STAILQ_NEXT(log_entry, tlm_queue) = NULL;
+ log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
+ tp->t_lognum -= outnum;
+ } else
+ STAILQ_INIT(&log_tailq);
+
+ /* Drop the PCB lock. */
+ INP_WUNLOCK(inp);
+
+ /* Copy the data out. */
+ error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
+
+ if (error) {
+ /* Restore list */
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
+ tp = intotcpcb(inp);
+
+ /* Merge the two lists. */
+ STAILQ_CONCAT(&log_tailq, &tp->t_logs);
+ tp->t_logs = log_tailq;
+ tp->t_lognum += outnum;
+ }
+ INP_WUNLOCK(inp);
+ } else {
+ /* Sanity check entries */
+ KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) ==
+ outsize, ("%s: Actual output size (%zu) != "
+ "calculated output size (%zu)", __func__,
+ (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
+ outsize));
+
+ /* Free the entries we just copied out. */
+ STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
+ tcp_log_entry_refcnt_rem(log_entry);
+ uma_zfree(tcp_log_zone, log_entry);
+ }
+ }
+
+ sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
+ (caddr_t)sopt->sopt_val);
+ return (error);
+}
+
+static void
+tcp_log_free_queue(struct tcp_log_dev_queue *param)
+{
+ struct tcp_log_dev_log_queue *entry;
+
+ KASSERT(param != NULL, ("%s: called with NULL param", __func__));
+ if (param == NULL)
+ return;
+
+ entry = (struct tcp_log_dev_log_queue *)param;
+
+ /* Free the entries. */
+ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
+
+ /* Free the buffer, if it is allocated. */
+ if (entry->tldl_common.tldq_buf != NULL)
+ free(entry->tldl_common.tldq_buf, M_TCPLOGDEV);
+
+ /* Free the queue entry. */
+ free(entry, M_TCPLOGDEV);
+}
+
+static struct tcp_log_common_header *
+tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
+{
+ struct tcp_log_dev_log_queue *entry;
+ struct tcp_log_header *hdr;
+ uint8_t *end;
+ struct sockopt sopt;
+ int error;
+
+ entry = (struct tcp_log_dev_log_queue *)param;
+
+ /* Take a worst-case guess at space needs. */
+ sopt.sopt_valsize = sizeof(struct tcp_log_header) +
+ entry->tldl_count * (sizeof(struct tcp_log_buffer) +
+ sizeof(struct tcp_log_verbose));
+ hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT);
+ if (hdr == NULL) {
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_fail5, entry->tldl_count);
+#endif
+ return (NULL);
+ }
+ sopt.sopt_val = hdr + 1;
+ sopt.sopt_valsize -= sizeof(struct tcp_log_header);
+ sopt.sopt_td = NULL;
+
+ error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries,
+ (struct tcp_log_buffer **)&end, entry->tldl_count);
+ if (error) {
+ free(hdr, M_TCPLOGDEV);
+ return (NULL);
+ }
+
+ /* Free the entries. */
+ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
+ entry->tldl_count = 0;
+
+ memset(hdr, 0, sizeof(struct tcp_log_header));
+ hdr->tlh_version = TCP_LOG_BUF_VER;
+ hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR;
+ hdr->tlh_length = end - (uint8_t *)hdr;
+ hdr->tlh_ie = entry->tldl_ie;
+ hdr->tlh_af = entry->tldl_af;
+ getboottime(&hdr->tlh_offset);
+ strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN);
+ strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN);
+ return ((struct tcp_log_common_header *)hdr);
+}
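+
+/*
+ * For reference, the buffer produced above is a single
+ * (struct tcp_log_header) followed immediately by the packed records
+ * that tcp_log_logs_to_buf() emits:
+ *
+ *     +---------------------------+
+ *     | struct tcp_log_header     |
+ *     +---------------------------+
+ *     | struct tcp_log_buffer     |  <- first record
+ *     | [struct tcp_log_verbose]  |  <- only if TLB_FLAG_VERBOSE is set
+ *     +---------------------------+
+ *     | ...                       |
+ *     +---------------------------+
+ *
+ * tlh_length covers the header and all records.
+ */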
+
+/*
+ * Queue the tcpcb's log buffer for transmission via the log buffer facility.
+ *
+ * NOTE: This should be called with a write lock on the PCB.
+ *
+ * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
+ * and reacquire the INP lock if it needs to do so.
+ *
+ * If force is false, this will only dump auto-logged sessions if
+ * tcp_log_auto_all is true or if there is a log ID defined for the session.
+ */
+int
+tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
+{
+ struct tcp_log_dev_log_queue *entry;
+ struct inpcb *inp;
+#ifdef TCPLOG_DEBUG_COUNTERS
+ int num_entries;
+#endif
+
+ inp = tp->t_inpcb;
+ INP_WLOCK_ASSERT(inp);
+
+ /* If there are no log entries, there is nothing to do. */
+ if (tp->t_lognum == 0)
+ return (0);
+
+ /* Check for a log ID. */
+ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
+ !tcp_log_auto_all && !force) {
+ struct tcp_log_mem *log_entry;
+
+ /*
+ * We needed a log ID and none was found. Free the log entries
+ * and return success. Also, cancel further logging. If the
+ * session doesn't have a log ID by now, we'll assume it isn't
+ * going to get one.
+ */
+ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
+ tcp_log_remove_log_head(tp, log_entry);
+ KASSERT(tp->t_lognum == 0,
+ ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
+ __func__, tp->t_lognum));
+ tp->t_logstate = TCP_LOG_STATE_OFF;
+ return (0);
+ }
+
+ /*
+ * Allocate memory. If we must wait, we'll need to drop the locks
+ * and reacquire them (and do all the related business that goes
+ * along with that).
+ */
+ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
+ M_NOWAIT);
+ if (entry == NULL && (how & M_NOWAIT)) {
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_fail3, 1);
+#endif
+ return (ENOBUFS);
+ }
+ if (entry == NULL) {
+ INP_WUNLOCK(inp);
+ entry = malloc(sizeof(struct tcp_log_dev_log_queue),
+ M_TCPLOGDEV, M_WAITOK);
+ INP_WLOCK(inp);
+ /*
+ * Note that this check is slightly overly-restrictive in
+ * that the TCB can survive either of these events.
+ * However, there is currently not a good way to ensure
+ * that is the case. So, if we hit this M_WAITOK path, we
+ * may end up dropping some entries. That seems like a
+ * small price to pay for safety.
+ */
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ free(entry, M_TCPLOGDEV);
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_fail2, 1);
+#endif
+ return (ECONNRESET);
+ }
+ tp = intotcpcb(inp);
+ if (tp->t_lognum == 0) {
+ free(entry, M_TCPLOGDEV);
+ return (0);
+ }
+ }
+
+ /* Fill in the unique parts of the queue entry. */
+ if (tp->t_lib != NULL)
+ strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
+ else
+ strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
+ if (reason != NULL)
+ strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
+ else
+ strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
+ entry->tldl_ie = inp->inp_inc.inc_ie;
+ if (inp->inp_inc.inc_flags & INC_ISIPV6)
+ entry->tldl_af = AF_INET6;
+ else
+ entry->tldl_af = AF_INET;
+ entry->tldl_entries = tp->t_logs;
+ entry->tldl_count = tp->t_lognum;
+
+ /* Fill in the common parts of the queue entry. */
+ entry->tldl_common.tldq_buf = NULL;
+ entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
+ entry->tldl_common.tldq_dtor = tcp_log_free_queue;
+
+ /* Clear the log data from the TCPCB. */
+#ifdef TCPLOG_DEBUG_COUNTERS
+ num_entries = tp->t_lognum;
+#endif
+ tp->t_lognum = 0;
+ STAILQ_INIT(&tp->t_logs);
+
+ /* Add the entry. If no one is listening, free the entry. */
+ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
+ tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
+#ifdef TCPLOG_DEBUG_COUNTERS
+ counter_u64_add(tcp_log_que_fail1, num_entries);
+ } else {
+ counter_u64_add(tcp_log_queued, num_entries);
+#endif
+ }
+ return (0);
+}
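+
+/*
+ * Example invocation (mirroring the TCP_LOGDUMP handler in
+ * tcp_usrreq.c): dump this session's buffer with a caller-supplied
+ * reason, sleeping for memory if necessary and forcing the dump even
+ * for an auto-logged session without a log ID:
+ *
+ *     error = tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
+ */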
+
+/*
+ * Queue the log_id_node's log buffers for transmission via the log buffer
+ * facility.
+ *
+ * NOTE: This should be called with the bucket locked and referenced.
+ *
+ * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
+ * and reacquire the bucket lock if it needs to do so. (The caller must
+ * ensure that the tln is no longer on any lists so no one else will mess
+ * with this while the lock is dropped!)
+ */
+static int
+tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
+{
+ struct tcp_log_dev_log_queue *entry;
+ struct tcp_log_id_bucket *tlb;
+
+ tlb = tln->tln_bucket;
+ TCPID_BUCKET_LOCK_ASSERT(tlb);
+ KASSERT(tlb->tlb_refcnt > 0,
+ ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
+ __func__, __LINE__, tln, tlb));
+ KASSERT(tln->tln_closed,
+ ("%s:%d: Called for node with tln_closed==false (tln=%p)",
+ __func__, __LINE__, tln));
+
+ /* If there are no log entries, there is nothing to do. */
+ if (tln->tln_count == 0)
+ return (0);
+
+ /*
+ * Allocate memory. If we must wait, we'll need to drop the locks
+ * and reacquire them (and do all the related business that goes
+ * along with that).
+ */
+ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
+ M_NOWAIT);
+ if (entry == NULL && (how & M_NOWAIT))
+ return (ENOBUFS);
+ if (entry == NULL) {
+ TCPID_BUCKET_UNLOCK(tlb);
+ entry = malloc(sizeof(struct tcp_log_dev_log_queue),
+ M_TCPLOGDEV, M_WAITOK);
+ TCPID_BUCKET_LOCK(tlb);
+ }
+
+ /* Fill in the common parts of the queue entry. */
+ entry->tldl_common.tldq_buf = NULL;
+ entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
+ entry->tldl_common.tldq_dtor = tcp_log_free_queue;
+
+ /* Fill in the unique parts of the queue entry. */
+ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
+ if (reason != NULL)
+ strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
+ else
+ strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
+ entry->tldl_ie = tln->tln_ie;
+ entry->tldl_entries = tln->tln_entries;
+ entry->tldl_count = tln->tln_count;
+ entry->tldl_af = tln->tln_af;
+
+ /* Add the entry. If no one is listening, free the entry. */
+ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
+ tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
+
+ return (0);
+}
+
+/*
+ * Queue the log buffers for all sessions in a bucket for transmission via
+ * the log buffer facility.
+ *
+ * NOTE: This should be called with a locked bucket; however, the function
+ * will drop the lock.
+ */
+#define LOCAL_SAVE 10
+static void
+tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
+{
+ struct tcp_log_id_node local_entries[LOCAL_SAVE];
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
+ int i, num_local_entries, tree_locked;
+ bool expireq_locked;
+
+ TCPID_BUCKET_LOCK_ASSERT(tlb);
+
+ /*
+ * Take a reference on the bucket to keep it from disappearing until
+ * we are done.
+ */
+ TCPID_BUCKET_REF(tlb);
+
+ /*
+ * We'll try to create these without dropping locks. However, we
+ * might very well need to drop locks to get memory. If that's the
+ * case, we'll save up to LOCAL_SAVE (10) entries on the stack and
+ * sacrifice the rest. (Otherwise, we would need to worry about finding
+ * our place again in a potentially changed list. It just doesn't seem
+ * worth the trouble to do that.)
+ */
+ expireq_locked = false;
+ num_local_entries = 0;
+ prev_tln = NULL;
+ tree_locked = TREE_UNLOCKED;
+ SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
+ /*
+ * If this isn't associated with a TCPCB, we can pull it off
+ * the list now. We need to be careful that the expire timer
+ * hasn't already taken ownership (tln_expiretime == SBT_MAX).
+ * If so, we let the expire timer code free the data.
+ */
+ if (cur_tln->tln_closed) {
+no_inp:
+ /*
+ * Get the expireq lock so we can get a consistent
+ * read of tln_expiretime and so we can remove this
+ * from the expireq.
+ */
+ if (!expireq_locked) {
+ TCPLOG_EXPIREQ_LOCK();
+ expireq_locked = true;
+ }
+
+ /*
+ * We ignore entries with tln_expiretime == SBT_MAX.
+ * The expire timer code already owns those.
+ */
+ KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
+ ("%s:%d: node on the expire queue without positive "
+ "expire time", __func__, __LINE__));
+ if (cur_tln->tln_expiretime == SBT_MAX) {
+ prev_tln = cur_tln;
+ continue;
+ }
+
+ /* Remove the entry from the expireq. */
+ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
+ tcp_log_id_node, tln_expireq);
+
+ /* Remove the entry from the bucket. */
+ if (prev_tln != NULL)
+ SLIST_REMOVE_AFTER(prev_tln, tln_list);
+ else
+ SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
+
+ /*
+ * Drop the INP and bucket reference counts. Due to
+ * lock-ordering rules, we need to drop the expire
+ * queue lock.
+ */
+ TCPLOG_EXPIREQ_UNLOCK();
+ expireq_locked = false;
+
+ /* Drop the INP reference. */
+ INP_WLOCK(cur_tln->tln_inp);
+ if (!in_pcbrele_wlocked(cur_tln->tln_inp))
+ INP_WUNLOCK(cur_tln->tln_inp);
+
+ if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
+#ifdef INVARIANTS
+ panic("%s: Bucket refcount unexpectedly 0.",
+ __func__);
+#endif
+ /*
+ * Recover as best we can: free the entry we
+ * own.
+ */
+ tcp_log_free_entries(&cur_tln->tln_entries,
+ &cur_tln->tln_count);
+ uma_zfree(tcp_log_node_zone, cur_tln);
+ goto done;
+ }
+
+ if (tcp_log_dump_node_logbuf(cur_tln, reason,
+ M_NOWAIT)) {
+ /*
+ * If we have space, save the entries locally.
+ * Otherwise, free them.
+ */
+ if (num_local_entries < LOCAL_SAVE) {
+ local_entries[num_local_entries] =
+ *cur_tln;
+ num_local_entries++;
+ } else {
+ tcp_log_free_entries(
+ &cur_tln->tln_entries,
+ &cur_tln->tln_count);
+ }
+ }
+
+ /* No matter what, we are done with the node now. */
+ uma_zfree(tcp_log_node_zone, cur_tln);
+
+ /*
+ * Because we removed this entry from the list, prev_tln
+ * (which tracks the previous entry still on the tlb
+ * list) remains unchanged.
+ */
+ continue;
+ }
+
+ /*
+ * If we get to this point, the session data is still held in
+ * the TCPCB. So, we need to pull the data out of that.
+ *
+ * We will need to drop the expireq lock so we can lock the INP.
+ * We can then try to extract the data the "easy" way. If that
+ * fails, we'll save the log entries for later.
+ */
+ if (expireq_locked) {
+ TCPLOG_EXPIREQ_UNLOCK();
+ expireq_locked = false;
+ }
+
+ /* Lock the INP and then re-check the state. */
+ inp = cur_tln->tln_inp;
+ INP_WLOCK(inp);
+ /*
+ * If we caught this while it was transitioning, the data
+ * might have moved from the TCPCB to the tln (signified by
+ * setting tln_closed to true). If so, treat this like an
+ * inactive connection.
+ */
+ if (cur_tln->tln_closed) {
+ /*
+ * It looks like we may have caught this connection
+ * while it was transitioning from active to inactive.
+ * Treat this like an inactive connection.
+ */
+ INP_WUNLOCK(inp);
+ goto no_inp;
+ }
+
+ /*
+ * Try to dump the data from the tp without dropping the lock.
+ * If this fails, try to save off the data locally.
+ */
+ tp = cur_tln->tln_tp;
+ if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
+ num_local_entries < LOCAL_SAVE) {
+ tcp_log_move_tp_to_node(tp,
+ &local_entries[num_local_entries]);
+ local_entries[num_local_entries].tln_closed = 1;
+ KASSERT(local_entries[num_local_entries].tln_bucket ==
+ tlb, ("%s: %d: bucket mismatch for node %p",
+ __func__, __LINE__, cur_tln));
+ num_local_entries++;
+ }
+
+ INP_WUNLOCK(inp);
+
+ /*
+ * We are going to leave the current tln on the list. It will
+ * become the previous tln.
+ */
+ prev_tln = cur_tln;
+ }
+
+ /* Drop our locks, if any. */
+ KASSERT(tree_locked == TREE_UNLOCKED,
+ ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
+ switch (tree_locked) {
+ case TREE_WLOCKED:
+ TCPID_TREE_WUNLOCK();
+ tree_locked = TREE_UNLOCKED;
+ break;
+ case TREE_RLOCKED:
+ TCPID_TREE_RUNLOCK();
+ tree_locked = TREE_UNLOCKED;
+ break;
+ }
+ if (expireq_locked) {
+ TCPLOG_EXPIREQ_UNLOCK();
+ expireq_locked = false;
+ }
+
+ /*
+ * Try again for any saved entries. tcp_log_dump_node_logbuf() is
+ * guaranteed to free the log entries within the node. And, since
+ * the node itself is on our stack, we don't need to free it.
+ */
+ for (i = 0; i < num_local_entries; i++)
+ tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
+
+ /* Drop our reference. */
+ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
+ TCPID_BUCKET_UNLOCK(tlb);
+
+done:
+ /* Drop our locks, if any. */
+ switch (tree_locked) {
+ case TREE_WLOCKED:
+ TCPID_TREE_WUNLOCK();
+ break;
+ case TREE_RLOCKED:
+ TCPID_TREE_RUNLOCK();
+ break;
+ }
+ if (expireq_locked)
+ TCPLOG_EXPIREQ_UNLOCK();
+}
+#undef LOCAL_SAVE
+
+/*
+ * Queue the log buffers for all sessions in a bucket for transmissions via
+ * the log buffer facility.
+ *
+ * NOTE: This should be called with a locked INP; however, the function
+ * will drop the lock.
+ */
+void
+tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
+{
+ struct tcp_log_id_bucket *tlb;
+ int tree_locked;
+
+ /* Figure out our bucket and lock it. */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ tlb = tp->t_lib;
+ if (tlb == NULL) {
+ /*
+ * No bucket; treat this like a request to dump a single
+ * session's traces.
+ */
+ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
+ INP_WUNLOCK(tp->t_inpcb);
+ return;
+ }
+ TCPID_BUCKET_REF(tlb);
+ INP_WUNLOCK(tp->t_inpcb);
+ TCPID_BUCKET_LOCK(tlb);
+
+ /* If we are the last reference, we have nothing more to do here. */
+ tree_locked = TREE_UNLOCKED;
+ if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
+ switch (tree_locked) {
+ case TREE_WLOCKED:
+ TCPID_TREE_WUNLOCK();
+ break;
+ case TREE_RLOCKED:
+ TCPID_TREE_RUNLOCK();
+ break;
+ }
+ return;
+ }
+
+ /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */
+ tcp_log_dumpbucketlogs(tlb, reason);
+}
+
+/*
+ * Mark the end of a flow with the current stack. A stack can add
+ * stack-specific info to this trace event by overriding this
+ * function (see bbr_log_flowend() for example).
+ */
+void
+tcp_log_flowend(struct tcpcb *tp)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ struct socket *so = tp->t_inpcb->inp_socket;
+ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
+ TCP_LOG_FLOWEND, 0, 0, NULL, false);
+ }
+}
+
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
new file mode 100644
index 000000000000..e45782114688
--- /dev/null
+++ b/sys/netinet/tcp_log_buf.h
@@ -0,0 +1,353 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016-2018
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __tcp_log_buf_h__
+#define __tcp_log_buf_h__
+
+#define TCP_LOG_REASON_LEN 32
+#define TCP_LOG_BUF_VER (6)
+
+/*
+ * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires
+ * 8-byte alignment to work properly on all platforms. Therefore, we will
+ * enforce 8-byte alignment for all the structures that may appear by
+ * themselves (instead of being embedded in another structure) in a data
+ * stream.
+ */
+#define ALIGN_TCP_LOG __aligned(8)
+
+/* Information about the socketbuffer state. */
+struct tcp_log_sockbuf
+{
+ uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */
+ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */
+ uint32_t tls_sb_spare; /* spare */
+};
+
+/* Optional, verbose information that may be appended to an event log. */
+struct tcp_log_verbose
+{
+#define TCP_FUNC_LEN 32
+ char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */
+ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that
+ generated trace */
+ uint32_t tlv_trace_line; /* Line number that generated trace */
+ uint8_t _pad[4];
+} ALIGN_TCP_LOG;
+
+/* Internal RACK state variables. */
+struct tcp_log_rack
+{
+ uint32_t tlr_rack_rtt; /* rc_rack_rtt */
+ uint8_t tlr_state; /* Internal RACK state */
+ uint8_t _pad[3]; /* Padding */
+};
+
+struct tcp_log_bbr {
+ uint64_t cur_del_rate;
+ uint64_t delRate;
+ uint64_t rttProp;
+ uint64_t bw_inuse;
+ uint32_t inflight;
+ uint32_t applimited;
+ uint32_t delivered;
+ uint32_t timeStamp;
+ uint32_t epoch;
+ uint32_t lt_epoch;
+ uint32_t pkts_out;
+ uint32_t flex1;
+ uint32_t flex2;
+ uint32_t flex3;
+ uint32_t flex4;
+ uint32_t flex5;
+ uint32_t flex6;
+ uint32_t lost;
+ uint16_t pacing_gain;
+ uint16_t cwnd_gain;
+ uint16_t flex7;
+ uint8_t bbr_state;
+ uint8_t bbr_substate;
+ uint8_t inpacer;
+ uint8_t ininput;
+ uint8_t use_lt_bw;
+ uint8_t flex8;
+ uint32_t pkt_epoch;
+};
+
+/* Per-stack stack-specific info. */
+union tcp_log_stackspecific
+{
+ struct tcp_log_rack u_rack;
+ struct tcp_log_bbr u_bbr;
+};
+
+struct tcp_log_buffer
+{
+ /* Event basics */
+ struct timeval tlb_tv; /* Timestamp of trace */
+ uint32_t tlb_ticks; /* Timestamp of trace (ticks) */
+ uint32_t tlb_sn; /* Serial number */
+ uint8_t tlb_stackid; /* Stack ID */
+ uint8_t tlb_eventid; /* Event ID */
+ uint16_t tlb_eventflags; /* Flags for the record */
+#define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */
+#define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */
+#define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */
+#define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */
+#define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */
+ int tlb_errno; /* Event error (if any) */
+
+ /* Internal session state */
+ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */
+ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */
+
+ int tlb_state; /* TCPCB t_state */
+ uint32_t tlb_starttime; /* TCPCB t_starttime */
+ uint32_t tlb_iss; /* TCPCB iss */
+ uint32_t tlb_flags; /* TCPCB flags */
+ uint32_t tlb_snd_una; /* TCPCB snd_una */
+ uint32_t tlb_snd_max; /* TCPCB snd_max */
+ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */
+ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */
+ uint32_t tlb_snd_recover;/* TCPCB snd_recover */
+ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */
+ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */
+ uint32_t tlb_srtt; /* TCPCB t_srtt */
+ uint32_t tlb_rttvar; /* TCPCB t_rttvar */
+ uint32_t tlb_rcv_up; /* TCPCB rcv_up */
+ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */
+ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */
+ tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */
+ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */
+ uint32_t tlb_dupacks; /* TCPCB t_dupacks */
+ int tlb_segqlen; /* TCPCB segqlen */
+ int tlb_snd_numholes; /* TCPCB snd_numholes */
+ uint32_t tlb_flex1; /* Event specific information */
+ uint32_t tlb_flex2; /* Event specific information */
+ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */
+ tlb_rcv_scale:4; /* TCPCB rcv_scale */
+ uint8_t _pad[3]; /* Padding */
+
+ /* Per-stack info */
+ union tcp_log_stackspecific tlb_stackinfo;
+#define tlb_rack tlb_stackinfo.u_rack
+
+ /* The packet */
+ uint32_t tlb_len; /* The packet's data length */
+ struct tcphdr tlb_th; /* The TCP header */
+ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */
+
+ /* Verbose information (optional) */
+ struct tcp_log_verbose tlb_verbose[0];
+} ALIGN_TCP_LOG;
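+
+/*
+ * Records in an exported stream are variable-length: each one is a
+ * (struct tcp_log_buffer), followed by a (struct tcp_log_verbose) only
+ * when TLB_FLAG_VERBOSE is set in tlb_eventflags. A minimal reader
+ * sketch (illustrative; "cur" and "end" are hypothetical cursors into
+ * the exported bytes):
+ *
+ *     const struct tcp_log_buffer *tlb;
+ *
+ *     while (cur + sizeof(*tlb) <= end) {
+ *             tlb = (const struct tcp_log_buffer *)cur;
+ *             cur += sizeof(*tlb);
+ *             if (tlb->tlb_eventflags & TLB_FLAG_VERBOSE)
+ *                     cur += sizeof(struct tcp_log_verbose);
+ *             ... process *tlb ...
+ *     }
+ */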
+
+enum tcp_log_events {
+ TCP_LOG_IN = 1, /* Incoming packet 1 */
+ TCP_LOG_OUT, /* Transmit (without other event) 2 */
+ TCP_LOG_RTO, /* Retransmit timeout 3 */
+ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */
+ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
+ TCP_LOG_PRR, /* Doing PRR 6 */
+ TCP_LOG_REORDER,/* Detected reorder 7 */
+ TCP_LOG_PACER, /* Pacer sending a packet 8 */
+ BBR_LOG_BBRUPD, /* We updated BBR info 9 */
+ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
+ BBR_LOG_ACKCLEAR, /* An ack clears all outstanding 11 */
+ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */
+ BBR_LOG_TIMERSTAR, /* Start a timer 13 */
+ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */
+ BBR_LOG_ENTREC, /* Entered recovery 15 */
+ BBR_LOG_EXITREC, /* Exited recovery 16 */
+ BBR_LOG_CWND, /* Cwnd change 17 */
+ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */
+ BBR_LOG_MSGSIZE, /* We received an EMSGSIZE error 19 */
+ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */
+ BBR_LOG_JUSTRET, /* We just returned out of output 21 */
+ BBR_LOG_STATE, /* A BBR state change occurred 22 */
+ BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */
+ BBR_LOG_PERSIST, /* BBR changed to/from persist state 24 */
+ TCP_LOG_FLOWEND, /* End of a flow 25 */
+ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
+ BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
+ BBR_LOG_EXIT_GAIN, /* Pacer exited a gain cycle 28 */
+ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
+ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
+ TCP_LOG_USERSEND, /* User level sends data 31 */
+ UNUSED_32, /* Unused 32 */
+ UNUSED_33, /* Unused 33 */
+ BBR_LOG_TIME_EPOCH, /* A time-based epoch occurred 34 */
+ BBR_LOG_TO_PROCESS, /* A timeout (TO) was processed 35 */
+ BBR_LOG_BBRTSO, /* TSO update 36 */
+ BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
+ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
+ BBR_LOG_PROGRESS, /* Progress timer event 39 */
+ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
+ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
+ BBR_LOG_ENOBUF_JMP, /* We had an ENOBUF jump 42 */
+ BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
+ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
+ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
+ BBR_LOG_REDUCE, /* Old bbr log reduce for 4.1 and earlier 46 */
+ TCP_LOG_RTT, /* An RTT (in microseconds) is sampled and applied to the srtt algorithm 47 */
+ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
+ TCP_LOG_END /* End (keep at end) 49 */
+};
+
+enum tcp_log_states {
+ TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */
+ TCP_LOG_STATE_OFF = 0, /* Pause */
+ TCP_LOG_STATE_TAIL = 1, /* Keep the trailing events */
+ TCP_LOG_STATE_HEAD = 2, /* Keep the leading events */
+ TCP_LOG_STATE_HEAD_AUTO = 3, /* Keep the leading events, and
+ automatically dump them to the
+ device */
+ TCP_LOG_STATE_CONTINUAL = 4, /* Continually dump the data when full */
+ TCP_LOG_STATE_TAIL_AUTO = 5, /* Keep the trailing events, and
+ automatically dump them when the
+ session ends */
+};
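+
+/*
+ * Illustrative sketch (not part of this header): these states are set
+ * from userspace through the TCP_LOG socket option handled in
+ * tcp_default_ctloutput(), e.g.:
+ *
+ *     int state = TCP_LOG_STATE_TAIL;
+ *
+ *     if (setsockopt(fd, IPPROTO_TCP, TCP_LOG, &state,
+ *         sizeof(state)) != 0)
+ *             ... handle the error ...
+ */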
+
+/* Use this if we don't know whether the operation succeeded. */
+#define ERRNO_UNK (-1)
+
+/*
+ * If the user included dev/tcp_log/tcp_log_dev.h, then include our private
+ * headers. Otherwise, there is no reason to pollute all the files with an
+ * additional include.
+ *
+ * This structure is aligned to an 8-byte boundary to match the alignment
+ * requirements of (struct tcp_log_buffer).
+ */
+#ifdef __tcp_log_dev_h__
+struct tcp_log_header {
+ struct tcp_log_common_header tlh_common;
+#define tlh_version tlh_common.tlch_version
+#define tlh_type tlh_common.tlch_type
+#define tlh_length tlh_common.tlch_length
+ struct in_endpoints tlh_ie;
+ struct timeval tlh_offset; /* Uptime -> UTC offset */
+ char tlh_id[TCP_LOG_ID_LEN];
+ char tlh_reason[TCP_LOG_REASON_LEN];
+ uint8_t tlh_af;
+ uint8_t _pad[7];
+} ALIGN_TCP_LOG;
+
+#ifdef _KERNEL
+struct tcp_log_dev_log_queue {
+ struct tcp_log_dev_queue tldl_common;
+ char tldl_id[TCP_LOG_ID_LEN];
+ char tldl_reason[TCP_LOG_REASON_LEN];
+ struct in_endpoints tldl_ie;
+ struct tcp_log_stailq tldl_entries;
+ int tldl_count;
+ uint8_t tldl_af;
+};
+#endif /* _KERNEL */
+#endif /* __tcp_log_dev_h__ */
+
+#ifdef _KERNEL
+
+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
+
+/*
+ * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
+ * tries to record verbose information.
+ */
+#define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
+ do { \
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) \
+ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \
+ errornum, len, stackinfo, th_hostorder, \
+ tp->t_output_caller, __func__, __LINE__, tv); \
+ } while (0)
+
+/*
+ * TCP_LOG_EVENT: This is a macro so we can capture function/line
+ * information when needed.
+ *
+ * Prototype:
+ * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
+ * struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
+ * union tcp_log_stackspecific *stackinfo, int th_hostorder)
+ *
+ * tp is mandatory and must be write locked.
+ * th is optional; if present, it will appear in the record.
+ * rxbuf and txbuf are optional; if present, they will appear in the record.
+ * eventid is mandatory.
+ * errornum is mandatory (it indicates the success or failure of the
+ * operation associated with the event).
+ * len indicates the length of the packet. If no packet, use 0.
+ * stackinfo is optional; if present, it will appear in the record.
+ * th_hostorder indicates whether the TCP header is in host byte order.
+ */
+#ifdef TCP_LOG_FORCEVERBOSE
+#define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE
+#else
+#define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
+ do { \
+ if (tcp_log_verbose) \
+ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \
+ eventid, errornum, len, stackinfo, \
+ th_hostorder, NULL); \
+ else if (tp->t_logstate != TCP_LOG_STATE_OFF) \
+ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \
+ errornum, len, stackinfo, th_hostorder, \
+ NULL, NULL, 0, NULL); \
+ } while (0)
+#endif /* TCP_LOG_FORCEVERBOSE */
+#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
+ do { \
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) \
+ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \
+ errornum, len, stackinfo, th_hostorder, \
+ NULL, NULL, 0, tv); \
+ } while (0)
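+
+/*
+ * Example: the retransmit-timeout path in tcp_timer.c logs an RTO event
+ * with no TCP header, no socket buffers, no packet length, and no
+ * stack-specific info:
+ *
+ *     TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
+ */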
+
+extern bool tcp_log_verbose;
+void tcp_log_drain(struct tcpcb *tp);
+int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force);
+void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason);
+struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
+ struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
+ union tcp_log_stackspecific *stackinfo, int th_hostorder,
+ const char *output_caller, const char *func, int line, const struct timeval *tv);
+size_t tcp_log_get_id(struct tcpcb *tp, char *buf);
+u_int tcp_log_get_id_cnt(struct tcpcb *tp);
+int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp);
+void tcp_log_init(void);
+int tcp_log_set_id(struct tcpcb *tp, char *id);
+int tcp_log_state_change(struct tcpcb *tp, int state);
+void tcp_log_tcpcbinit(struct tcpcb *tp);
+void tcp_log_tcpcbfini(struct tcpcb *tp);
+void tcp_log_flowend(struct tcpcb *tp);
+
+#endif /* _KERNEL */
+#endif /* __tcp_log_buf_h__ */
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index db7557d774d2..90ec964629bc 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -74,6 +74,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@@ -1310,6 +1311,10 @@ send:
}
#endif
+ /* We're getting ready to send; log now. */
+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
+ len, NULL, false);
+
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
@@ -1549,6 +1554,9 @@ timer:
}
if (error) {
+ /* Record the error. */
+ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT,
+ error, 0, NULL, false);
/*
* We know that the packet was lost, so back out the
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 6f2ab0746458..c5d4d657b0e8 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -98,6 +98,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/cc/cc.h>
#ifdef INET6
@@ -426,6 +427,71 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
"list available TCP Function sets");
/*
+ * Exports one (struct tcp_function_id) for each non-alias.
+ */
+static int
+sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
+{
+ int error, cnt;
+ struct tcp_function *f;
+ struct tcp_function_id tfi;
+
+ /*
+ * We don't allow writes.
+ */
+ if (req->newptr != NULL)
+ return (EINVAL);
+
+ /*
+ * Wire the old buffer so we can directly copy the functions to
+ * user space without dropping the lock.
+ */
+ if (req->oldptr != NULL) {
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Walk the list, comparing the name of each function entry with its
+ * function block's name; entries whose names differ are aliases.
+ * If exporting the list, copy out the non-alias entries. Otherwise,
+ * just count them so we can report the required length.
+ */
+ cnt = 0;
+ rw_rlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, tf_next) {
+ if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name,
+ TCP_FUNCTION_NAME_LEN_MAX))
+ continue;
+ if (req->oldptr != NULL) {
+ tfi.tfi_id = f->tf_fb->tfb_id;
+ (void)strncpy(tfi.tfi_name, f->tf_name,
+ TCP_FUNCTION_NAME_LEN_MAX);
+ tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+ error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
+ /*
+ * Don't stop on error, as that is the
+ * mechanism we use to accumulate length
+ * information if the buffer was too short.
+ */
+ } else
+ cnt++;
+ }
+ rw_runlock(&tcp_function_lock);
+ if (req->oldptr == NULL)
+ error = SYSCTL_OUT(req, NULL,
+ (cnt + 1) * sizeof(struct tcp_function_id));
+
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids,
+ CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id",
+ "List TCP function block name-to-ID mappings");
+
+/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
@@ -504,6 +570,8 @@ maketcp_hashsize(int size)
return (hashsize);
}
+static volatile int next_tcp_stack_id = 1;
+
/*
* Register a TCP function block with the name provided in the names
* array. (Note that this function does NOT automatically register
@@ -563,6 +631,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
refcount_init(&blk->tfb_refcnt, 0);
blk->tfb_flags = 0;
+ blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
for (i = 0; i < *num_names; i++) {
n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
if (n == NULL) {
@@ -779,6 +848,8 @@ tcp_init(void)
/* Setup the tcp function block list */
init_tcp_functions();
register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
+ /* Initialize the TCP logging data. */
+ tcp_log_init();
if (tcp_soreceive_stream) {
#ifdef INET
@@ -1360,6 +1431,8 @@ tcp_newtcpcb(struct inpcb *inp)
*/
tcp_pcap_tcpcb_init(tp);
#endif
+ /* Initialize the per-TCPCB log data. */
+ tcp_log_tcpcbinit(tp);
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
@@ -1577,6 +1650,7 @@ tcp_discardcb(struct tcpcb *tp)
inp->inp_ppcb = NULL;
if (tp->t_timers->tt_draincnt == 0) {
/* We own the last reference on tcpcb, let's free it. */
+ tcp_log_tcpcbfini(tp);
TCPSTATES_DEC(tp->t_state);
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@@ -1607,6 +1681,7 @@ tcp_timer_discard(void *ptp)
tp->t_timers->tt_draincnt--;
if (tp->t_timers->tt_draincnt == 0) {
/* We own the last reference on this tcpcb, let's free it. */
+ tcp_log_tcpcbfini(tp);
TCPSTATES_DEC(tp->t_state);
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@@ -1700,6 +1775,7 @@ tcp_drain(void)
if ((tcpb = intotcpcb(inpb)) != NULL) {
tcp_reass_flush(tcpb);
tcp_clean_sackreport(tcpb);
+ tcp_log_drain(tcpb);
#ifdef TCPPCAP
if (tcp_pcap_aggressive_free) {
/* Free the TCP PCAP queues. */
@@ -2856,6 +2932,7 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
xt->t_state = TCPS_TIME_WAIT;
} else {
xt->t_state = tp->t_state;
+ xt->t_logstate = tp->t_logstate;
xt->t_flags = tp->t_flags;
xt->t_sndzerowin = tp->t_sndzerowin;
xt->t_sndrexmitpack = tp->t_sndrexmitpack;
@@ -2879,6 +2956,8 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
TCP_FUNCTION_NAME_LEN_MAX);
+ bzero(xt->xt_logid, TCP_LOG_ID_LEN);
+ (void)tcp_log_get_id(tp, xt->xt_logid);
}
xt->xt_len = sizeof(struct xtcpcb);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index cea7e5a5e5a1..6f07f0314bfe 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
@@ -644,6 +645,7 @@ tcp_timer_rexmt(void * xtp)
KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
tcp_free_sackholes(tp);
+ TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
if (tp->t_fb->tfb_tcp_rexmit_tmr) {
/* The stack has a timer action too. */
(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 67124e5cc143..75e9599e646b 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
@@ -1026,6 +1027,11 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->t_flags &= ~TF_FORCEDATA;
}
}
+ TCP_LOG_EVENT(tp, NULL,
+ &inp->inp_socket->so_rcv,
+ &inp->inp_socket->so_snd,
+ TCP_LOG_USERSEND, error,
+ 0, NULL, false);
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
@@ -1533,6 +1539,15 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
}
+/*
+ * If this assert becomes untrue, we need to change the size of the buf
+ * variable in tcp_default_ctloutput().
+ */
+#ifdef CTASSERT
+CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
+CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
+#endif
+
int
tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
@@ -1540,7 +1555,7 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
u_int ui;
struct tcp_info ti;
struct cc_algo *algo;
- char *pbuf, buf[TCP_CA_NAME_MAX];
+ char *pbuf, buf[TCP_LOG_ID_LEN];
size_t len;
/*
@@ -1822,6 +1837,55 @@ unlock_and_done:
goto unlock_and_done;
}
+ case TCP_LOG:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ error = tcp_log_state_change(tp, optval);
+ goto unlock_and_done;
+
+ case TCP_LOGBUF:
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
+
+ case TCP_LOGID:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
+ if (error)
+ break;
+ buf[sopt->sopt_valsize] = '\0';
+ INP_WLOCK_RECHECK(inp);
+ error = tcp_log_set_id(tp, buf);
+ /* tcp_log_set_id() unlocks the INP. */
+ break;
+
+ case TCP_LOGDUMP:
+ case TCP_LOGDUMPID:
+ INP_WUNLOCK(inp);
+ error =
+ sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
+ if (error)
+ break;
+ buf[sopt->sopt_valsize] = '\0';
+ INP_WLOCK_RECHECK(inp);
+ if (sopt->sopt_name == TCP_LOGDUMP) {
+ error = tcp_log_dump_tp_logbuf(tp, buf,
+ M_WAITOK, true);
+ INP_WUNLOCK(inp);
+ } else {
+ tcp_log_dump_tp_bucket_logbufs(tp, buf);
+ /*
+ * tcp_log_dump_tp_bucket_logbufs() drops the
+ * INP lock.
+ */
+ }
+ break;
+
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1907,6 +1971,25 @@ unlock_and_done:
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
+ case TCP_LOG:
+ optval = tp->t_logstate;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+ case TCP_LOGBUF:
+ /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
+ error = tcp_log_getlogbuf(sopt, tp);
+ break;
+ case TCP_LOGID:
+ len = tcp_log_get_id(tp, buf);
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, buf, len + 1);
+ break;
+ case TCP_LOGDUMP:
+ case TCP_LOGDUMPID:
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 2ccc2ad7b166..f09bd19c5f6e 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -79,6 +79,8 @@ struct sackhint {
uint64_t _pad[1]; /* TBD */
};
+STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
+
/*
* Tcp control block, one per tcp; fields:
* Organized for 16 byte cacheline efficiency.
@@ -189,6 +191,13 @@ struct tcpcb {
u_int t_tsomaxsegcount; /* TSO maximum segment count */
u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
u_int t_flags2; /* More tcpcb flags storage */
+ int t_logstate; /* State of "black box" logging */
+ struct tcp_log_stailq t_logs; /* Log buffer */
+ int t_lognum; /* Number of log entries */
+ uint32_t t_logsn; /* Log "serial number" */
+ struct tcp_log_id_node *t_lin;
+ struct tcp_log_id_bucket *t_lib;
+ const char *t_output_caller; /* Function that called tcp_output */
struct tcp_function_block *t_fb;/* TCP function call block */
void *t_fb_ptr; /* Pointer to t_fb specific data */
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
@@ -267,6 +276,7 @@ struct tcp_function_block {
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
+ uint8_t tfb_id;
};
struct tcp_function {
@@ -339,11 +349,12 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TCPOOB_HADDATA 0x02
/*
- * Flags for PLPMTU handling, t_flags2
+ * Flags for the extended TCP flags field, t_flags2
*/
#define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */
#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
+#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */
/*
* Structure to hold TCP options that are only used during segment
@@ -654,6 +665,7 @@ struct xtcpcb {
size_t xt_len; /* length of this structure */
struct xinpcb xt_inp;
char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */
+ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */
int64_t spare64[8];
int32_t t_state; /* (s,p) */
uint32_t t_flags; /* (s,p) */
@@ -666,14 +678,24 @@ struct xtcpcb {
int32_t tt_keep; /* (s) */
int32_t tt_2msl; /* (s) */
int32_t tt_delack; /* (s) */
+ int32_t t_logstate; /* (s) */
int32_t spare32[32];
} __aligned(8);
+
#ifdef _KERNEL
void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
#endif
#endif
/*
+ * TCP function name-to-id mapping exported to user-land via sysctl(3).
+ */
+struct tcp_function_id {
+ uint8_t tfi_id;
+ char tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
+};
+
+/*
* Identifiers for TCP sysctl nodes
*/
#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */