Diffstat (limited to 'sys/kern/uipc_socket.c')
-rw-r--r--  sys/kern/uipc_socket.c  1170
1 file changed, 983 insertions(+), 187 deletions(-)
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index f0b36fc5595e..6c9eb7139cd1 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -41,18 +41,18 @@
* sodealloc() tears down socket layer state for a socket, called only by
* sofree() and sonewconn(). Socket layer private.
*
- * pru_attach() associates protocol layer state with an allocated socket;
+ * pr_attach() associates protocol layer state with an allocated socket;
* called only once, may fail, aborting socket allocation. This is called
* from socreate() and sonewconn(). Socket layer private.
*
- * pru_detach() disassociates protocol layer state from an attached socket,
- * and will be called exactly once for sockets in which pru_attach() has
- * been successfully called. If pru_attach() returned an error,
- * pru_detach() will not be called. Socket layer private.
+ * pr_detach() disassociates protocol layer state from an attached socket,
+ * and will be called exactly once for sockets in which pr_attach() has
+ * been successfully called. If pr_attach() returned an error,
+ * pr_detach() will not be called. Socket layer private.
*
- * pru_abort() and pru_close() notify the protocol layer that the last
+ * pr_abort() and pr_close() notify the protocol layer that the last
* consumer of a socket is starting to tear down the socket, and that the
- * protocol should terminate the connection. Historically, pru_abort() also
+ * protocol should terminate the connection. Historically, pr_abort() also
* detached protocol state from the socket state, but this is no longer the
* case.
*
@@ -96,8 +96,8 @@
* NOTE: With regard to VNETs the general rule is that callers do not set
* curvnet. Exceptions to this rule include soabort(), sodisconnect(),
* sofree(), sorele(), sonewconn() and sorflush(), which are usually called
- * from a pre-set VNET context. sopoll() currently does not need a VNET
- * context to be set.
+ * from a pre-set VNET context. sopoll_generic() currently does not need a
+ * VNET context to be set.
*/
#include <sys/cdefs.h>
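
The attach/detach contract described above is easy to get wrong, so a compact illustration may help. This is a hedged userspace model of the pairing rule only; all names and signatures are simplified stand-ins, not the kernel's:

#include <errno.h>
#include <stdlib.h>

struct socket_model {
	void *so_pcb;		/* protocol state, owned by the attach hook */
};

static int
pr_attach_model(struct socket_model *so)
{
	so->so_pcb = calloc(1, 64);
	return (so->so_pcb == NULL ? ENOBUFS : 0);
}

static void
pr_detach_model(struct socket_model *so)
{
	free(so->so_pcb);	/* runs only if pr_attach_model() succeeded */
}

static int
socreate_model(struct socket_model **asop)
{
	struct socket_model *so;
	int error;

	so = calloc(1, sizeof(*so));		/* soalloc() */
	if (so == NULL)
		return (ENOBUFS);
	error = pr_attach_model(so);
	if (error != 0) {
		free(so);	/* sodealloc(); pr_detach_model() is NOT called */
		return (error);
	}
	*asop = so;
	return (0);
}

static void
sofree_model(struct socket_model *so)
{
	pr_detach_model(so);	/* exactly once per successful attach */
	free(so);		/* sodealloc() */
}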
@@ -122,6 +122,7 @@
#include <sys/hhook.h>
#include <sys/kernel.h>
#include <sys/khelp.h>
+#include <sys/kthread.h>
#include <sys/ktls.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
@@ -133,7 +134,9 @@
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
+#include <sys/sched.h>
#include <sys/signalvar.h>
+#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
@@ -150,6 +153,7 @@
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
+#include <security/mac/mac_internal.h>
#include <vm/uma.h>
@@ -159,8 +163,17 @@
#include <compat/freebsd32/freebsd32.h>
#endif
+static int soreceive_generic_locked(struct socket *so,
+ struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
+ struct mbuf **controlp, int *flagsp);
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
int flags);
+static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
+ struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
+ struct mbuf **controlp, int flags);
+static int sosend_generic_locked(struct socket *so, struct sockaddr *addr,
+ struct uio *uio, struct mbuf *top, struct mbuf *control,
+ int flags, struct thread *td);
static void so_rdknl_lock(void *);
static void so_rdknl_unlock(void *);
static void so_rdknl_assert_lock(void *, int);
@@ -173,20 +186,18 @@ static int filt_soread(struct knote *kn, long hint);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_soempty(struct knote *kn, long hint);
-static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
-fo_kqfilter_t soo_kqfilter;
-static struct filterops soread_filtops = {
+static const struct filterops soread_filtops = {
.f_isfd = 1,
.f_detach = filt_sordetach,
.f_event = filt_soread,
};
-static struct filterops sowrite_filtops = {
+static const struct filterops sowrite_filtops = {
.f_isfd = 1,
.f_detach = filt_sowdetach,
.f_event = filt_sowrite,
};
-static struct filterops soempty_filtops = {
+static const struct filterops soempty_filtops = {
.f_isfd = 1,
.f_detach = filt_sowdetach,
.f_event = filt_soempty,
@@ -201,8 +212,26 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
VNET_ASSERT(curvnet != NULL, \
("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
+#ifdef SOCKET_HHOOK
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define V_socket_hhh VNET(socket_hhh)
+static inline int hhook_run_socket(struct socket *, void *, int32_t);
+#endif
+
+#ifdef COMPAT_FREEBSD32
+#ifdef __amd64__
+/* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */
+#define __splice32_packed __packed
+#else
+#define __splice32_packed
+#endif
+struct splice32 {
+ int32_t sp_fd;
+ int64_t sp_max;
+ struct timeval32 sp_idle;
+} __splice32_packed;
+#undef __splice32_packed
+#endif
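
The alignment discrepancy that motivates the packing can be pinned down with a compile-time check. A hypothetical test struct, flattening struct timeval32 into two int32_t fields (the layout, not the names, is what matters here):

#include <stdint.h>

struct splice32_model {
	int32_t sp_fd;
	int64_t sp_max;
	int32_t sp_idle_sec;
	int32_t sp_idle_usec;
} __attribute__((packed));

/*
 * The i386 ABI aligns int64_t to 4 bytes, so sp_max follows sp_fd with no
 * padding and the struct occupies 4 + 8 + 4 + 4 == 20 bytes. Without the
 * packed attribute, the amd64 ABI would insert 4 bytes of padding after
 * sp_fd and the two sizes would disagree (24 vs. 20).
 */
_Static_assert(sizeof(struct splice32_model) == 20, "i386-compatible layout");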
/*
* Limit on the number of connections in the listen queue waiting
@@ -210,43 +239,58 @@ VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
* NB: The original sysctl somaxconn is still available but hidden
* to prevent confusion about the actual purpose of this number.
*/
-static u_int somaxconn = SOMAXCONN;
+VNET_DEFINE_STATIC(u_int, somaxconn) = SOMAXCONN;
+#define V_somaxconn VNET(somaxconn)
static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
int error;
- int val;
+ u_int val;
- val = somaxconn;
+ val = V_somaxconn;
error = sysctl_handle_int(oidp, &val, 0, req);
if (error || !req->newptr)
return (error);
/*
* The purpose of the UINT_MAX / 3 limit is so that the formula
- * 3 * so_qlimit / 2
+ * 3 * sol_qlimit / 2
* below will not overflow.
*/
if (val < 1 || val > UINT_MAX / 3)
return (EINVAL);
- somaxconn = val;
+ V_somaxconn = val;
return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
- CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
- sysctl_somaxconn, "I",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int),
+ sysctl_somaxconn, "IU",
"Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
- CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
- sizeof(int), sysctl_somaxconn, "I",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0,
+ sizeof(u_int), sysctl_somaxconn, "IU",
"Maximum listen socket pending connection accept queue size (compat)");
-static int numopensockets;
-SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
- &numopensockets, 0, "Number of open sockets");
+static u_int numopensockets;
+static int
+sysctl_numopensockets(SYSCTL_HANDLER_ARGS)
+{
+ u_int val;
+
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VNET(curvnet))
+ val = curvnet->vnet_sockcnt;
+ else
+#endif
+ val = numopensockets;
+ return (sysctl_handle_int(oidp, &val, 0, req));
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, numopensockets,
+ CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int),
+ sysctl_numopensockets, "IU", "Number of open sockets");
/*
* so_global_mtx protects so_gencnt, numopensockets, and the per-socket
@@ -276,22 +320,364 @@ socket_zone_change(void *tag)
maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}
+static int splice_init_state;
+static struct sx splice_init_lock;
+SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init");
+
+static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0,
+ "Settings relating to the SO_SPLICE socket option");
+
+static bool splice_receive_stream = true;
+SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN,
+ &splice_receive_stream, 0,
+ "Use soreceive_stream() for stream splices");
+
+static uma_zone_t splice_zone;
+static struct proc *splice_proc;
+struct splice_wq {
+ struct mtx mtx;
+ STAILQ_HEAD(, so_splice) head;
+ bool running;
+} __aligned(CACHE_LINE_SIZE);
+static struct splice_wq *splice_wq;
+static uint32_t splice_index = 0;
+
+static void so_splice_timeout(void *arg, int pending);
+static void so_splice_xfer(struct so_splice *s);
+static int so_unsplice(struct socket *so, bool timeout);
+
static void
-socket_hhook_register(int subtype)
+splice_work_thread(void *ctx)
{
+ struct splice_wq *wq = ctx;
+ struct so_splice *s, *s_temp;
+ STAILQ_HEAD(, so_splice) local_head;
+ int cpu;
+
+ cpu = wq - splice_wq;
+ if (bootverbose)
+ printf("starting so_splice worker thread for CPU %d\n", cpu);
+
+ for (;;) {
+ mtx_lock(&wq->mtx);
+ while (STAILQ_EMPTY(&wq->head)) {
+ wq->running = false;
+ mtx_sleep(wq, &wq->mtx, 0, "-", 0);
+ wq->running = true;
+ }
+ STAILQ_INIT(&local_head);
+ STAILQ_CONCAT(&local_head, &wq->head);
+ STAILQ_INIT(&wq->head);
+ mtx_unlock(&wq->mtx);
+ STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) {
+ mtx_lock(&s->mtx);
+ CURVNET_SET(s->src->so_vnet);
+ so_splice_xfer(s);
+ CURVNET_RESTORE();
+ }
+ }
+}
- if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
- &V_socket_hhh[subtype],
- HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
- printf("%s: WARNING: unable to register hook\n", __func__);
+static void
+so_splice_dispatch_async(struct so_splice *sp)
+{
+ struct splice_wq *wq;
+ bool running;
+
+ wq = &splice_wq[sp->wq_index];
+ mtx_lock(&wq->mtx);
+ STAILQ_INSERT_TAIL(&wq->head, sp, next);
+ running = wq->running;
+ mtx_unlock(&wq->mtx);
+ if (!running)
+ wakeup(wq);
+}
+
+void
+so_splice_dispatch(struct so_splice *sp)
+{
+ mtx_assert(&sp->mtx, MA_OWNED);
+
+ if (sp->state != SPLICE_IDLE) {
+ mtx_unlock(&sp->mtx);
+ } else {
+ sp->state = SPLICE_QUEUED;
+ mtx_unlock(&sp->mtx);
+ so_splice_dispatch_async(sp);
+ }
+}
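
The handoff between so_splice_dispatch_async() and splice_work_thread() hinges on the per-queue running flag: the worker clears it before sleeping and a dispatcher issues a wakeup only when the flag is clear, so no wakeup is lost and no redundant one is sent. A minimal userspace analogue of the same pattern with a pthread condition variable (illustrative names, not kernel code):

#include <pthread.h>
#include <sys/queue.h>

struct work {
	STAILQ_ENTRY(work) next;
};

static struct {
	pthread_mutex_t mtx;
	pthread_cond_t cv;
	STAILQ_HEAD(, work) head;
	int running;
} wq = {
	.mtx = PTHREAD_MUTEX_INITIALIZER,
	.cv = PTHREAD_COND_INITIALIZER,
	.head = STAILQ_HEAD_INITIALIZER(wq.head),
};

static void *
worker(void *arg)
{
	struct work *w;

	for (;;) {
		pthread_mutex_lock(&wq.mtx);
		while (STAILQ_EMPTY(&wq.head)) {
			wq.running = 0;		/* safe: mutex still held */
			pthread_cond_wait(&wq.cv, &wq.mtx);
			wq.running = 1;
		}
		w = STAILQ_FIRST(&wq.head);
		STAILQ_REMOVE_HEAD(&wq.head, next);
		pthread_mutex_unlock(&wq.mtx);
		/* ... process w, cf. so_splice_xfer() ... */
	}
	return (NULL);
}

static void
dispatch(struct work *w)
{
	int running;

	pthread_mutex_lock(&wq.mtx);
	STAILQ_INSERT_TAIL(&wq.head, w, next);
	running = wq.running;
	pthread_mutex_unlock(&wq.mtx);
	if (!running)			/* worker is asleep or going to sleep */
		pthread_cond_signal(&wq.cv);
}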
+
+static int
+splice_zinit(void *mem, int size __unused, int flags __unused)
+{
+ struct so_splice *s;
+
+ s = (struct so_splice *)mem;
+ mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF);
+ return (0);
}
static void
-socket_hhook_deregister(int subtype)
+splice_zfini(void *mem, int size)
{
+ struct so_splice *s;
- if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
- printf("%s: WARNING: unable to deregister hook\n", __func__);
+ s = (struct so_splice *)mem;
+ mtx_destroy(&s->mtx);
+}
+
+static int
+splice_init(void)
+{
+ struct thread *td;
+ int error, i, state;
+
+ state = atomic_load_acq_int(&splice_init_state);
+ if (__predict_true(state > 0))
+ return (0);
+ if (state < 0)
+ return (ENXIO);
+ sx_xlock(&splice_init_lock);
+ if (splice_init_state != 0) {
+ sx_xunlock(&splice_init_lock);
+ return (0);
+ }
+
+ splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL,
+ NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0);
+
+ splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP,
+ M_WAITOK | M_ZERO);
+
+ /*
+ * Initialize the workqueues to run the splice work. We create a
+ * work queue for each CPU.
+ */
+ CPU_FOREACH(i) {
+ STAILQ_INIT(&splice_wq[i].head);
+ mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF);
+ }
+
+ /* Start kthreads for each workqueue. */
+ error = 0;
+ CPU_FOREACH(i) {
+ error = kproc_kthread_add(splice_work_thread, &splice_wq[i],
+ &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i);
+ if (error) {
+ printf("Can't add so_splice thread %d error %d\n",
+ i, error);
+ break;
+ }
+
+ /*
+ * It's possible to create loops with SO_SPLICE; ensure that
+ * worker threads aren't able to starve the system too easily.
+ */
+ thread_lock(td);
+ sched_prio(td, PUSER);
+ thread_unlock(td);
+ }
+
+ splice_init_state = error != 0 ? -1 : 1;
+ sx_xunlock(&splice_init_lock);
+
+ return (error);
+}
+
+/*
+ * Lock a pair of sockets' I/O locks for splicing. Avoid blocking while holding
+ * one lock in order to avoid potential deadlocks in case there is some other
+ * code path which acquires more than one I/O lock at a time.
+ */
+static void
+splice_lock_pair(struct socket *so_src, struct socket *so_dst)
+{
+ int error;
+
+ for (;;) {
+ error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR);
+ KASSERT(error == 0,
+ ("%s: failed to lock send I/O lock: %d", __func__, error));
+ error = SOCK_IO_RECV_LOCK(so_src, 0);
+ KASSERT(error == 0 || error == EWOULDBLOCK,
+ ("%s: failed to lock recv I/O lock: %d", __func__, error));
+ if (error == 0)
+ break;
+ SOCK_IO_SEND_UNLOCK(so_dst);
+
+ error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR);
+ KASSERT(error == 0,
+ ("%s: failed to lock recv I/O lock: %d", __func__, error));
+ error = SOCK_IO_SEND_LOCK(so_dst, 0);
+ KASSERT(error == 0 || error == EWOULDBLOCK,
+ ("%s: failed to lock send I/O lock: %d", __func__, error));
+ if (error == 0)
+ break;
+ SOCK_IO_RECV_UNLOCK(so_src);
+ }
+}
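
The same discipline translates directly to plain pthread mutexes: sleep on at most one lock at a time, try-lock the second, and swap which lock is the blocking one on each failed pass. A self-contained sketch under those assumptions:

#include <pthread.h>

/*
 * Acquire both mutexes without ever sleeping while holding one of them,
 * mirroring splice_lock_pair() above: SBL_WAIT | SBL_NOINTR plays the role
 * of the blocking pthread_mutex_lock(), and the flags == 0 attempt that
 * may return EWOULDBLOCK plays the role of pthread_mutex_trylock().
 */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	for (;;) {
		pthread_mutex_lock(a);
		if (pthread_mutex_trylock(b) == 0)
			return;
		pthread_mutex_unlock(a);

		pthread_mutex_lock(b);
		if (pthread_mutex_trylock(a) == 0)
			return;
		pthread_mutex_unlock(b);
	}
}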
+
+static void
+splice_unlock_pair(struct socket *so_src, struct socket *so_dst)
+{
+ SOCK_IO_RECV_UNLOCK(so_src);
+ SOCK_IO_SEND_UNLOCK(so_dst);
+}
+
+/*
+ * Move data from the source to the sink. Assumes that both of the relevant
+ * socket I/O locks are held.
+ */
+static int
+so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max,
+ ssize_t *lenp)
+{
+ struct uio uio;
+ struct mbuf *m;
+ struct sockbuf *sb_src, *sb_dst;
+ ssize_t len;
+ long space;
+ int error, flags;
+
+ SOCK_IO_RECV_ASSERT_LOCKED(so_src);
+ SOCK_IO_SEND_ASSERT_LOCKED(so_dst);
+
+ error = 0;
+ m = NULL;
+ memset(&uio, 0, sizeof(uio));
+
+ sb_src = &so_src->so_rcv;
+ sb_dst = &so_dst->so_snd;
+
+ space = sbspace(sb_dst);
+ if (space < 0)
+ space = 0;
+ len = MIN(max, MIN(space, sbavail(sb_src)));
+ if (len == 0) {
+ SOCK_RECVBUF_LOCK(so_src);
+ if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0)
+ error = EPIPE;
+ SOCK_RECVBUF_UNLOCK(so_src);
+ } else {
+ flags = MSG_DONTWAIT;
+ uio.uio_resid = len;
+ if (splice_receive_stream && sb_src->sb_tls_info == NULL) {
+ error = soreceive_stream_locked(so_src, sb_src, NULL,
+ &uio, &m, NULL, flags);
+ } else {
+ error = soreceive_generic_locked(so_src, NULL,
+ &uio, &m, NULL, &flags);
+ }
+ if (error != 0 && m != NULL) {
+ m_freem(m);
+ m = NULL;
+ }
+ }
+ if (m != NULL) {
+ len -= uio.uio_resid;
+ error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL,
+ MSG_DONTWAIT, curthread);
+ } else if (error == 0) {
+ len = 0;
+ SOCK_SENDBUF_LOCK(so_dst);
+ if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0)
+ error = EPIPE;
+ SOCK_SENDBUF_UNLOCK(so_dst);
+ }
+ if (error == 0)
+ *lenp = len;
+ return (error);
+}
+
+/*
+ * Transfer data from the source to the sink.
+ */
+static void
+so_splice_xfer(struct so_splice *sp)
+{
+ struct socket *so_src, *so_dst;
+ off_t max;
+ ssize_t len;
+ int error;
+
+ mtx_assert(&sp->mtx, MA_OWNED);
+ KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING,
+ ("so_splice_xfer: invalid state %d", sp->state));
+ KASSERT(sp->max != 0, ("so_splice_xfer: max == 0"));
+
+ if (sp->state == SPLICE_CLOSING) {
+ /* Userspace asked us to close the splice. */
+ goto closing;
+ }
+
+ sp->state = SPLICE_RUNNING;
+ so_src = sp->src;
+ so_dst = sp->dst;
+ max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX;
+ if (max < 0)
+ max = 0;
+
+ /*
+ * Lock the sockets in order to block userspace from doing anything
+ * sneaky. If an error occurs or one of the sockets can no longer
+ * transfer data, we will automatically unsplice.
+ */
+ mtx_unlock(&sp->mtx);
+ splice_lock_pair(so_src, so_dst);
+
+ error = so_splice_xfer_data(so_src, so_dst, max, &len);
+
+ mtx_lock(&sp->mtx);
+
+ /*
+ * Update our stats while still holding the socket locks. This
+ * synchronizes with getsockopt(SO_SPLICE), see the comment there.
+ */
+ if (error == 0) {
+ KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len));
+ so_src->so_splice_sent += len;
+ }
+ splice_unlock_pair(so_src, so_dst);
+
+ switch (sp->state) {
+ case SPLICE_CLOSING:
+closing:
+ sp->state = SPLICE_CLOSED;
+ wakeup(sp);
+ mtx_unlock(&sp->mtx);
+ break;
+ case SPLICE_RUNNING:
+ if (error != 0 ||
+ (sp->max > 0 && so_src->so_splice_sent >= sp->max)) {
+ sp->state = SPLICE_EXCEPTION;
+ soref(so_src);
+ mtx_unlock(&sp->mtx);
+ (void)so_unsplice(so_src, false);
+ sorele(so_src);
+ } else {
+ /*
+ * Locklessly check for additional bytes in the source's
+ * receive buffer and queue more work if possible. We
+ * may end up queuing needless work, but that's ok, and
+ * if we race with a thread inserting more data into the
+ * buffer and observe sbavail() == 0, the splice mutex
+ * ensures that splice_push() will queue more work for
+ * us.
+ */
+ if (sbavail(&so_src->so_rcv) > 0 &&
+ sbspace(&so_dst->so_snd) > 0) {
+ sp->state = SPLICE_QUEUED;
+ mtx_unlock(&sp->mtx);
+ so_splice_dispatch_async(sp);
+ } else {
+ sp->state = SPLICE_IDLE;
+ mtx_unlock(&sp->mtx);
+ }
+ }
+ break;
+ default:
+ __assert_unreachable();
+ }
}
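
For orientation, these are the states the transfer path above and so_unsplice() below move a splice through. The enum itself is declared in a header outside this diff; the shape shown here is an assumed reconstruction, and the comments summarize only the transitions visible in this file:

enum splice_state {
	SPLICE_INIT,		/* so_splice() is still wiring up the pair */
	SPLICE_IDLE,		/* nothing to move; waiting for a socket wakeup */
	SPLICE_QUEUED,		/* sitting on a per-CPU worker queue */
	SPLICE_RUNNING,		/* a worker is inside so_splice_xfer() */
	SPLICE_EXCEPTION,	/* error or byte limit hit; being unspliced */
	SPLICE_CLOSING,		/* so_unsplice() asked a busy worker to stop */
	SPLICE_CLOSED,		/* terminal; so_splice_free() may reclaim it */
};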
static void
@@ -307,6 +693,25 @@ socket_init(void *tag)
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
+#ifdef SOCKET_HHOOK
+static void
+socket_hhook_register(int subtype)
+{
+
+ if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
+ &V_socket_hhh[subtype],
+ HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
+ printf("%s: WARNING: unable to register hook\n", __func__);
+}
+
+static void
+socket_hhook_deregister(int subtype)
+{
+
+ if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
+ printf("%s: WARNING: unable to deregister hook\n", __func__);
+}
+
static void
socket_vnet_init(const void *unused __unused)
{
@@ -329,6 +734,7 @@ socket_vnet_uninit(const void *unused __unused)
}
VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
socket_vnet_uninit, NULL);
+#endif /* SOCKET_HHOOK */
/*
* Initialise maxsockets. This SYSINIT must be run after
@@ -423,12 +829,14 @@ soalloc(struct vnet *vnet)
__func__, __LINE__, so));
so->so_vnet = vnet;
#endif
+#ifdef SOCKET_HHOOK
/* We shouldn't need the so_global_mtx */
if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
/* Do we need more comprehensive error returns? */
uma_zfree(socket_zone, so);
return (NULL);
}
+#endif
mtx_lock(&so_global_mtx);
so->so_gencnt = ++so_gencnt;
++numopensockets;
@@ -464,7 +872,9 @@ sodealloc(struct socket *so)
#ifdef MAC
mac_socket_destroy(so);
#endif
+#ifdef SOCKET_HHOOK
hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
+#endif
khelp_destroy_osd(&so->osd);
if (SOLISTENING(so)) {
@@ -559,7 +969,7 @@ socreate(int dom, struct socket **aso, int type, int proto,
}
/*
* Auto-sizing of socket buffers is managed by the protocols and
- * the appropriate flags must be set in the pru_attach function.
+ * the appropriate flags must be set in the pr_attach() method.
*/
CURVNET_SET(so->so_vnet);
error = prp->pr_attach(so, proto, td);
@@ -773,6 +1183,7 @@ solisten_clone(struct socket *head)
so->so_fibnum = head->so_fibnum;
so->so_proto = head->so_proto;
so->so_cred = crhold(head->so_cred);
+#ifdef SOCKET_HHOOK
if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) {
if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) {
sodealloc(so);
@@ -780,6 +1191,7 @@ solisten_clone(struct socket *head)
return (NULL);
}
}
+#endif
#ifdef MAC
mac_socket_newconn(head, so);
#endif
@@ -926,9 +1338,9 @@ sopeeloff(struct socket *head)
__func__, head->so_pcb);
return (NULL);
}
- if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
+ if (so->so_proto->pr_attach(so, 0, NULL)) {
sodealloc(so);
- log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+ log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
__func__, head->so_pcb);
return (NULL);
}
@@ -1083,6 +1495,10 @@ solisten_proto(struct socket *so, int backlog)
sbrcv_timeo = so->so_rcv.sb_timeo;
sbsnd_timeo = so->so_snd.sb_timeo;
+#ifdef MAC
+ mac_socketpeer_label_free(so->so_peerlabel);
+#endif
+
if (!(so->so_proto->pr_flags & PR_SOCKBUF)) {
sbdestroy(so, SO_SND);
sbdestroy(so, SO_RCV);
@@ -1116,8 +1532,8 @@ solisten_proto(struct socket *so, int backlog)
so->so_options |= SO_ACCEPTCONN;
listening:
- if (backlog < 0 || backlog > somaxconn)
- backlog = somaxconn;
+ if (backlog < 0 || backlog > V_somaxconn)
+ backlog = V_somaxconn;
so->sol_qlimit = backlog;
mtx_unlock(&so->so_snd_mtx);
@@ -1203,6 +1619,242 @@ solisten_dequeue(struct socket *head, struct socket **ret, int flags)
return (0);
}
+static struct so_splice *
+so_splice_alloc(off_t max)
+{
+ struct so_splice *sp;
+
+ sp = uma_zalloc(splice_zone, M_WAITOK);
+ sp->src = NULL;
+ sp->dst = NULL;
+ sp->max = max > 0 ? max : -1;
+ do {
+ sp->wq_index = atomic_fetchadd_32(&splice_index, 1) %
+ (mp_maxid + 1);
+ } while (CPU_ABSENT(sp->wq_index));
+ sp->state = SPLICE_INIT;
+ TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout,
+ sp);
+ return (sp);
+}
+
+static void
+so_splice_free(struct so_splice *sp)
+{
+ KASSERT(sp->state == SPLICE_CLOSED,
+ ("so_splice_free: sp %p not closed", sp));
+ uma_zfree(splice_zone, sp);
+}
+
+static void
+so_splice_timeout(void *arg, int pending __unused)
+{
+ struct so_splice *sp;
+
+ sp = arg;
+ (void)so_unsplice(sp->src, true);
+}
+
+/*
+ * Splice the output from so to the input of so2.
+ */
+static int
+so_splice(struct socket *so, struct socket *so2, struct splice *splice)
+{
+ struct so_splice *sp;
+ int error;
+
+ if (splice->sp_max < 0)
+ return (EINVAL);
+ /* Handle only TCP for now; TODO: other streaming protos */
+ if (so->so_proto->pr_protocol != IPPROTO_TCP ||
+ so2->so_proto->pr_protocol != IPPROTO_TCP)
+ return (EPROTONOSUPPORT);
+ if (so->so_vnet != so2->so_vnet)
+ return (EINVAL);
+
+ /* so_splice_xfer() assumes that we're using these implementations. */
+ KASSERT(so->so_proto->pr_sosend == sosend_generic,
+ ("so_splice: sosend not sosend_generic"));
+ KASSERT(so2->so_proto->pr_soreceive == soreceive_generic ||
+ so2->so_proto->pr_soreceive == soreceive_stream,
+ ("so_splice: soreceive not soreceive_generic/stream"));
+
+ sp = so_splice_alloc(splice->sp_max);
+ so->so_splice_sent = 0;
+ sp->src = so;
+ sp->dst = so2;
+
+ error = 0;
+ SOCK_LOCK(so);
+ if (SOLISTENING(so))
+ error = EINVAL;
+ else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
+ error = ENOTCONN;
+ else if (so->so_splice != NULL)
+ error = EBUSY;
+ if (error != 0) {
+ SOCK_UNLOCK(so);
+ uma_zfree(splice_zone, sp);
+ return (error);
+ }
+ SOCK_RECVBUF_LOCK(so);
+ if (so->so_rcv.sb_tls_info != NULL) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_UNLOCK(so);
+ uma_zfree(splice_zone, sp);
+ return (EINVAL);
+ }
+ so->so_rcv.sb_flags |= SB_SPLICED;
+ so->so_splice = sp;
+ soref(so);
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_UNLOCK(so);
+
+ error = 0;
+ SOCK_LOCK(so2);
+ if (SOLISTENING(so2))
+ error = EINVAL;
+ else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
+ error = ENOTCONN;
+ else if (so2->so_splice_back != NULL)
+ error = EBUSY;
+ if (error != 0) {
+ SOCK_UNLOCK(so2);
+ so_unsplice(so, false);
+ return (error);
+ }
+ SOCK_SENDBUF_LOCK(so2);
+ if (so2->so_snd.sb_tls_info != NULL) {
+ SOCK_SENDBUF_UNLOCK(so2);
+ SOCK_UNLOCK(so2);
+ so_unsplice(so, false);
+ return (EINVAL);
+ }
+ so2->so_snd.sb_flags |= SB_SPLICED;
+ so2->so_splice_back = sp;
+ soref(so2);
+ mtx_lock(&sp->mtx);
+ SOCK_SENDBUF_UNLOCK(so2);
+ SOCK_UNLOCK(so2);
+
+ if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) {
+ taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout,
+ tvtosbt(splice->sp_idle), 0, C_PREL(4));
+ }
+
+ /*
+ * Transfer any data already present in the socket buffer.
+ */
+ KASSERT(sp->state == SPLICE_INIT,
+ ("so_splice: splice %p state %d", sp, sp->state));
+ sp->state = SPLICE_QUEUED;
+ so_splice_xfer(sp);
+ return (0);
+}
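
Seen from userspace, all of the above is driven through a single socket option. A hedged usage sketch, assuming two already-connected TCP sockets and the struct splice definition this code copies in (sp_fd, sp_max, sp_idle):

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

/* Splice src into dst: no byte limit (sp_max == 0), 30 s idle timeout. */
static void
splice_pair(int src, int dst)
{
	struct splice sp = {
		.sp_fd = dst,
		.sp_max = 0,
		.sp_idle = { .tv_sec = 30, .tv_usec = 0 },
	};

	if (setsockopt(src, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
		err(1, "setsockopt(SO_SPLICE)");
}

/* A negative sp_fd requests an unsplice, ending up in so_unsplice(). */
static void
unsplice(int src)
{
	struct splice sp = { .sp_fd = -1 };

	if (setsockopt(src, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
		err(1, "setsockopt(SO_SPLICE)");
}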
+
+static int
+so_unsplice(struct socket *so, bool timeout)
+{
+ struct socket *so2;
+ struct so_splice *sp;
+ bool drain, so2rele;
+
+ /*
+ * First unset SB_SPLICED and hide the splice structure so that
+ * wakeup routines will stop enqueuing work. This also ensures that
+ * only a single thread will proceed with the unsplice.
+ */
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ SOCK_UNLOCK(so);
+ return (EINVAL);
+ }
+ SOCK_RECVBUF_LOCK(so);
+ if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_UNLOCK(so);
+ return (ENOTCONN);
+ }
+ sp = so->so_splice;
+ mtx_lock(&sp->mtx);
+ if (sp->state == SPLICE_INIT) {
+ /*
+ * A splice is in the middle of being set up.
+ */
+ mtx_unlock(&sp->mtx);
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_UNLOCK(so);
+ return (ENOTCONN);
+ }
+ mtx_unlock(&sp->mtx);
+ so->so_rcv.sb_flags &= ~SB_SPLICED;
+ so->so_splice = NULL;
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_UNLOCK(so);
+
+ so2 = sp->dst;
+ SOCK_LOCK(so2);
+ KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__));
+ SOCK_SENDBUF_LOCK(so2);
+ KASSERT(sp->state == SPLICE_INIT ||
+ (so2->so_snd.sb_flags & SB_SPLICED) != 0,
+ ("%s: so2 is not spliced", __func__));
+ KASSERT(sp->state == SPLICE_INIT ||
+ so2->so_splice_back == sp,
+ ("%s: so_splice_back != sp", __func__));
+ so2->so_snd.sb_flags &= ~SB_SPLICED;
+ so2rele = so2->so_splice_back != NULL;
+ so2->so_splice_back = NULL;
+ SOCK_SENDBUF_UNLOCK(so2);
+ SOCK_UNLOCK(so2);
+
+ /*
+ * No new work is being enqueued. The worker thread might be
+ * splicing data right now, in which case we want to wait for it to
+ * finish before proceeding.
+ */
+ mtx_lock(&sp->mtx);
+ switch (sp->state) {
+ case SPLICE_QUEUED:
+ case SPLICE_RUNNING:
+ sp->state = SPLICE_CLOSING;
+ while (sp->state == SPLICE_CLOSING)
+ msleep(sp, &sp->mtx, PSOCK, "unsplice", 0);
+ break;
+ case SPLICE_INIT:
+ case SPLICE_IDLE:
+ case SPLICE_EXCEPTION:
+ sp->state = SPLICE_CLOSED;
+ break;
+ default:
+ __assert_unreachable();
+ }
+ if (!timeout) {
+ drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout,
+ NULL) != 0;
+ } else {
+ drain = false;
+ }
+ mtx_unlock(&sp->mtx);
+ if (drain)
+ taskqueue_drain_timeout(taskqueue_thread, &sp->timeout);
+
+ /*
+ * Now we hold the sole reference to the splice structure.
+ * Clean up: signal userspace and release socket references.
+ */
+ sorwakeup(so);
+ CURVNET_SET(so->so_vnet);
+ sorele(so);
+ sowwakeup(so2);
+ if (so2rele)
+ sorele(so2);
+ CURVNET_RESTORE();
+ so_splice_free(sp);
+ return (0);
+}
+
/*
* Free socket upon release of the very last reference.
*/
@@ -1216,6 +1868,12 @@ sofree(struct socket *so)
("%s: so %p has references", __func__, so));
KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
("%s: so %p is on listen queue", __func__, so));
+ KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0,
+ ("%s: so %p rcvbuf is spliced", __func__, so));
+ KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0,
+ ("%s: so %p sndbuf is spliced", __func__, so));
+ KASSERT(so->so_splice == NULL && so->so_splice_back == NULL,
+ ("%s: so %p has spliced data", __func__, so));
SOCK_UNLOCK(so);
@@ -1226,14 +1884,22 @@ sofree(struct socket *so)
if (pr->pr_detach != NULL)
pr->pr_detach(so);
- /*
- * From this point on, we assume that no other references to this
- * socket exist anywhere else in the stack. Therefore, no locks need
- * to be acquired or held.
- */
if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
+ /*
+ * From this point on, we assume that no other references to
+ * this socket exist anywhere else in the stack. Therefore,
+ * no locks need to be acquired or held.
+ */
+#ifdef INVARIANTS
+ SOCK_SENDBUF_LOCK(so);
+ SOCK_RECVBUF_LOCK(so);
+#endif
sbdestroy(so, SO_SND);
sbdestroy(so, SO_RCV);
+#ifdef INVARIANTS
+ SOCK_SENDBUF_UNLOCK(so);
+ SOCK_RECVBUF_UNLOCK(so);
+#endif
}
seldrain(&so->so_rdsel);
seldrain(&so->so_wrsel);
@@ -1389,11 +2055,11 @@ sopeeraddr(struct socket *so, struct sockaddr *sa)
#endif
int error;
- CURVNET_SET(so->so_vnet);
+ CURVNET_ASSERT_SET();
+
error = so->so_proto->pr_peeraddr(so, sa);
KASSERT(sa->sa_len <= len,
("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
- CURVNET_RESTORE();
return (error);
}
@@ -1646,8 +2312,8 @@ out:
* counts if EINTR/ERESTART are returned. Data and control buffers are freed
* on return.
*/
-int
-sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+static int
+sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
long space;
@@ -1663,6 +2329,9 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
tls = NULL;
tls_rtype = TLS_RLTYPE_APP;
#endif
+
+ SOCK_IO_SEND_ASSERT_LOCKED(so);
+
if (uio != NULL)
resid = uio->uio_resid;
else if ((top->m_flags & M_PKTHDR) != 0)
@@ -1692,10 +2361,6 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
if (control != NULL)
clen = control->m_len;
- error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
- if (error)
- goto out;
-
#ifdef KERN_TLS
tls_send_flag = 0;
tls = ktls_hold(so->so_snd.sb_tls_info);
@@ -1718,7 +2383,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
if (resid == 0 && !ktls_permit_empty_frames(tls)) {
error = EINVAL;
- goto release;
+ goto out;
}
}
#endif
@@ -1729,13 +2394,13 @@ restart:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EPIPE;
- goto release;
+ goto out;
}
if (so->so_error) {
error = so->so_error;
so->so_error = 0;
SOCKBUF_UNLOCK(&so->so_snd);
- goto release;
+ goto out;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
/*
@@ -1749,7 +2414,7 @@ restart:
if (!(resid == 0 && clen != 0)) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOTCONN;
- goto release;
+ goto out;
}
} else if (addr == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
@@ -1757,7 +2422,7 @@ restart:
error = ENOTCONN;
else
error = EDESTADDRREQ;
- goto release;
+ goto out;
}
}
space = sbspace(&so->so_snd);
@@ -1767,7 +2432,7 @@ restart:
clen > so->so_snd.sb_hiwat) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EMSGSIZE;
- goto release;
+ goto out;
}
if (space < resid + clen &&
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
@@ -1775,12 +2440,12 @@ restart:
(flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EWOULDBLOCK;
- goto release;
+ goto out;
}
error = sbwait(so, SO_SND);
SOCKBUF_UNLOCK(&so->so_snd);
if (error)
- goto release;
+ goto out;
goto restart;
}
SOCKBUF_UNLOCK(&so->so_snd);
@@ -1825,7 +2490,7 @@ restart:
((flags & MSG_EOR) ? M_EOR : 0));
if (top == NULL) {
error = EFAULT; /* only possible error */
- goto release;
+ goto out;
}
space -= resid - uio->uio_resid;
resid = uio->uio_resid;
@@ -1889,12 +2554,10 @@ restart:
control = NULL;
top = NULL;
if (error)
- goto release;
+ goto out;
} while (resid && space > 0);
} while (resid);
-release:
- SOCK_IO_SEND_UNLOCK(so);
out:
#ifdef KERN_TLS
if (tls != NULL)
@@ -1907,6 +2570,20 @@ out:
return (error);
}
+int
+sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ int error;
+
+ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+ error = sosend_generic_locked(so, addr, uio, top, control, flags, td);
+ SOCK_IO_SEND_UNLOCK(so);
+ return (error);
+}
+
/*
* Send to a socket from a kernel thread.
*
@@ -2060,11 +2737,11 @@ sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
* mbuf **mp0 for use in returning the chain. The uio is then used only for
* the count in uio_resid.
*/
-int
-soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
- struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+static int
+soreceive_generic_locked(struct socket *so, struct sockaddr **psa,
+ struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
- struct mbuf *m, **mp;
+ struct mbuf *m;
int flags, error, offset;
ssize_t len;
struct protosw *pr = so->so_proto;
@@ -2073,25 +2750,15 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
ssize_t orig_resid = uio->uio_resid;
bool report_real_len = false;
- mp = mp0;
- if (psa != NULL)
- *psa = NULL;
- if (controlp != NULL)
- *controlp = NULL;
+ SOCK_IO_RECV_ASSERT_LOCKED(so);
+
+ error = 0;
if (flagsp != NULL) {
report_real_len = *flagsp & MSG_TRUNC;
*flagsp &= ~MSG_TRUNC;
flags = *flagsp &~ MSG_EOR;
} else
flags = 0;
- if (flags & MSG_OOB)
- return (soreceive_rcvoob(so, uio, flags));
- if (mp != NULL)
- *mp = NULL;
-
- error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
- if (error)
- return (error);
restart:
SOCKBUF_LOCK(&so->so_rcv);
@@ -2267,13 +2934,7 @@ dontblock:
while (cm != NULL) {
cmn = cm->m_next;
cm->m_next = NULL;
- if (pr->pr_domain->dom_externalize != NULL) {
- SOCKBUF_UNLOCK(&so->so_rcv);
- VNET_SO_ASSERT(so);
- error = (*pr->pr_domain->dom_externalize)
- (cm, controlp, flags);
- SOCKBUF_LOCK(&so->so_rcv);
- } else if (controlp != NULL)
+ if (controlp != NULL)
*controlp = cm;
else
m_freem(cm);
@@ -2549,74 +3210,56 @@ dontblock:
if (flagsp != NULL)
*flagsp |= flags;
release:
- SOCK_IO_RECV_UNLOCK(so);
return (error);
}
-/*
- * Optimized version of soreceive() for stream (TCP) sockets.
- */
int
-soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
- struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
- int len = 0, error = 0, flags, oresid;
- struct sockbuf *sb;
- struct mbuf *m, *n = NULL;
+ int error, flags;
- /* We only do stream sockets. */
- if (so->so_type != SOCK_STREAM)
- return (EINVAL);
if (psa != NULL)
*psa = NULL;
- if (flagsp != NULL)
- flags = *flagsp &~ MSG_EOR;
- else
- flags = 0;
if (controlp != NULL)
*controlp = NULL;
- if (flags & MSG_OOB)
- return (soreceive_rcvoob(so, uio, flags));
- if (mp0 != NULL)
- *mp0 = NULL;
-
- sb = &so->so_rcv;
-
-#ifdef KERN_TLS
- /*
- * KTLS store TLS records as records with a control message to
- * describe the framing.
- *
- * We check once here before acquiring locks to optimize the
- * common case.
- */
- if (sb->sb_tls_info != NULL)
- return (soreceive_generic(so, psa, uio, mp0, controlp,
- flagsp));
-#endif
+ if (flagsp != NULL) {
+ flags = *flagsp;
+ if ((flags & MSG_OOB) != 0)
+ return (soreceive_rcvoob(so, uio, flags));
+ } else {
+ flags = 0;
+ }
+ if (mp != NULL)
+ *mp = NULL;
- /* Prevent other readers from entering the socket. */
error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
if (error)
return (error);
- SOCKBUF_LOCK(sb);
+ error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+}
-#ifdef KERN_TLS
- if (sb->sb_tls_info != NULL) {
- SOCKBUF_UNLOCK(sb);
- SOCK_IO_RECV_UNLOCK(so);
- return (soreceive_generic(so, psa, uio, mp0, controlp,
- flagsp));
- }
-#endif
+/*
+ * Optimized version of soreceive() for stream (TCP) sockets.
+ */
+static int
+soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
+ struct sockaddr **psa, struct uio *uio, struct mbuf **mp0,
+ struct mbuf **controlp, int flags)
+{
+ int len = 0, error = 0, oresid;
+ struct mbuf *m, *n = NULL;
+
+ SOCK_IO_RECV_ASSERT_LOCKED(so);
/* Easy one, no space to copyout anything. */
- if (uio->uio_resid == 0) {
- error = EINVAL;
- goto out;
- }
+ if (uio->uio_resid == 0)
+ return (EINVAL);
oresid = uio->uio_resid;
+ SOCKBUF_LOCK(sb);
/* We will never ever get anything unless we are or were connected. */
if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
error = ENOTCONN;
@@ -2770,6 +3413,62 @@ out:
SBLASTRECORDCHK(sb);
SBLASTMBUFCHK(sb);
SOCKBUF_UNLOCK(sb);
+ return (error);
+}
+
+int
+soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct sockbuf *sb;
+ int error, flags;
+
+ sb = &so->so_rcv;
+
+ /* We only do stream sockets. */
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (psa != NULL)
+ *psa = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp & ~MSG_EOR;
+ else
+ flags = 0;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp0 != NULL)
+ *mp0 = NULL;
+
+#ifdef KERN_TLS
+ /*
+ * KTLS stores TLS records as records with a control message to
+ * describe the framing.
+ *
+ * We check once here before acquiring locks to optimize the
+ * common case.
+ */
+ if (sb->sb_tls_info != NULL)
+ return (soreceive_generic(so, psa, uio, mp0, controlp,
+ flagsp));
+#endif
+
+ /*
+ * Prevent other threads from reading from the socket. This lock may be
+ * dropped in order to sleep waiting for data to arrive.
+ */
+ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+#ifdef KERN_TLS
+ if (__predict_false(sb->sb_tls_info != NULL)) {
+ SOCK_IO_RECV_UNLOCK(so);
+ return (soreceive_generic(so, psa, uio, mp0, controlp,
+ flagsp));
+ }
+#endif
+ error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags);
SOCK_IO_RECV_UNLOCK(so);
return (error);
}
@@ -2918,10 +3617,7 @@ soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
while (cm != NULL) {
cmn = cm->m_next;
cm->m_next = NULL;
- if (pr->pr_domain->dom_externalize != NULL) {
- error = (*pr->pr_domain->dom_externalize)
- (cm, controlp, flags);
- } else if (controlp != NULL)
+ if (controlp != NULL)
*controlp = cm;
else
m_freem(cm);
@@ -3019,11 +3715,25 @@ sorflush(struct socket *so)
}
+int
+sosetfib(struct socket *so, int fibnum)
+{
+ if (fibnum < 0 || fibnum >= rt_numfibs)
+ return (EINVAL);
+
+ SOCK_LOCK(so);
+ so->so_fibnum = fibnum;
+ SOCK_UNLOCK(so);
+
+ return (0);
+}
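
sosetfib() backs the SO_SETFIB option, whose handling in sosetopt() now defers to pr_ctloutput() (see below), and the new read-only SO_FIB case in sogetopt() reports the stored value. A hedged userspace round trip:

#include <sys/socket.h>
#include <err.h>

static void
set_and_verify_fib(int s, int fib)
{
	int cur;
	socklen_t len = sizeof(cur);

	if (setsockopt(s, SOL_SOCKET, SO_SETFIB, &fib, sizeof(fib)) == -1)
		err(1, "setsockopt(SO_SETFIB)");
	if (getsockopt(s, SOL_SOCKET, SO_FIB, &cur, &len) == -1)
		err(1, "getsockopt(SO_FIB)");
	/* cur == fib unless another thread raced a second SO_SETFIB. */
}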
+
+#ifdef SOCKET_HHOOK
/*
* Wrapper for Socket established helper hook.
* Parameters: socket, context of the hook point, hook id.
*/
-static int inline
+static inline int
hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
{
struct socket_hhook_data hhook_data = {
@@ -3040,6 +3750,7 @@ hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
/* Ugly but needed, since hhooks return void for now */
return (hhook_data.status);
}
+#endif
/*
* Perhaps this routine, and sooptcopyout(), below, ought to come in an
@@ -3106,10 +3817,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
CURVNET_SET(so->so_vnet);
error = 0;
if (sopt->sopt_level != SOL_SOCKET) {
- if (so->so_proto->pr_ctloutput != NULL)
- error = (*so->so_proto->pr_ctloutput)(so, sopt);
- else
- error = ENOPROTOOPT;
+ error = so->so_proto->pr_ctloutput(so, sopt);
} else {
switch (sopt->sopt_name) {
case SO_ACCEPTFILTER:
@@ -3165,21 +3873,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
break;
case SO_SETFIB:
- error = sooptcopyin(sopt, &optval, sizeof optval,
- sizeof optval);
- if (error)
- goto bad;
-
- if (optval < 0 || optval >= rt_numfibs) {
- error = EINVAL;
- goto bad;
- }
- if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
- (so->so_proto->pr_domain->dom_family == PF_INET6) ||
- (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
- so->so_fibnum = optval;
- else
- so->so_fibnum = 0;
+ error = so->so_proto->pr_ctloutput(so, sopt);
break;
case SO_USER_COOKIE:
@@ -3267,16 +3961,74 @@ sosetopt(struct socket *so, struct sockopt *sopt)
so->so_max_pacing_rate = val32;
break;
+ case SO_SPLICE: {
+ struct splice splice;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ struct splice32 splice32;
+
+ error = sooptcopyin(sopt, &splice32,
+ sizeof(splice32), sizeof(splice32));
+ if (error == 0) {
+ splice.sp_fd = splice32.sp_fd;
+ splice.sp_max = splice32.sp_max;
+ CP(splice32.sp_idle, splice.sp_idle,
+ tv_sec);
+ CP(splice32.sp_idle, splice.sp_idle,
+ tv_usec);
+ }
+ } else
+#endif
+ {
+ error = sooptcopyin(sopt, &splice,
+ sizeof(splice), sizeof(splice));
+ }
+ if (error)
+ goto bad;
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_STRUCT))
+ ktrsplice(&splice);
+#endif
+
+ error = splice_init();
+ if (error != 0)
+ goto bad;
+
+ if (splice.sp_fd >= 0) {
+ struct file *fp;
+ struct socket *so2;
+
+ if (!cap_rights_contains(sopt->sopt_rights,
+ &cap_recv_rights)) {
+ error = ENOTCAPABLE;
+ goto bad;
+ }
+ error = getsock(sopt->sopt_td, splice.sp_fd,
+ &cap_send_rights, &fp);
+ if (error != 0)
+ goto bad;
+ so2 = fp->f_data;
+
+ error = so_splice(so, so2, &splice);
+ fdrop(fp, sopt->sopt_td);
+ } else {
+ error = so_unsplice(so, false);
+ }
+ break;
+ }
default:
+#ifdef SOCKET_HHOOK
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
HHOOK_SOCKET_OPT);
else
+#endif
error = ENOPROTOOPT;
break;
}
- if (error == 0 && so->so_proto->pr_ctloutput != NULL)
- (void)(*so->so_proto->pr_ctloutput)(so, sopt);
+ if (error == 0)
+ (void)so->so_proto->pr_ctloutput(so, sopt);
}
bad:
CURVNET_RESTORE();
@@ -3326,10 +4078,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
CURVNET_SET(so->so_vnet);
error = 0;
if (sopt->sopt_level != SOL_SOCKET) {
- if (so->so_proto->pr_ctloutput != NULL)
- error = (*so->so_proto->pr_ctloutput)(so, sopt);
- else
- error = ENOPROTOOPT;
+ error = so->so_proto->pr_ctloutput(so, sopt);
CURVNET_RESTORE();
return (error);
} else {
@@ -3367,6 +4116,12 @@ integer:
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
+ case SO_FIB:
+ SOCK_LOCK(so);
+ optval = so->so_fibnum;
+ SOCK_UNLOCK(so);
+ goto integer;
+
case SO_DOMAIN:
optval = so->so_proto->pr_domain->dom_family;
goto integer;
@@ -3392,23 +4147,31 @@ integer:
goto integer;
case SO_SNDBUF:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
so->so_snd.sb_hiwat;
+ SOCK_UNLOCK(so);
goto integer;
case SO_RCVBUF:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
so->so_rcv.sb_hiwat;
+ SOCK_UNLOCK(so);
goto integer;
case SO_SNDLOWAT:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
so->so_snd.sb_lowat;
+ SOCK_UNLOCK(so);
goto integer;
case SO_RCVLOWAT:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
so->so_rcv.sb_lowat;
+ SOCK_UNLOCK(so);
goto integer;
case SO_SNDTIMEO:
@@ -3465,15 +4228,21 @@ integer:
break;
case SO_LISTENQLIMIT:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_qlimit : 0;
+ SOCK_UNLOCK(so);
goto integer;
case SO_LISTENQLEN:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_qlen : 0;
+ SOCK_UNLOCK(so);
goto integer;
case SO_LISTENINCQLEN:
+ SOCK_LOCK(so);
optval = SOLISTENING(so) ? so->sol_incqlen : 0;
+ SOCK_UNLOCK(so);
goto integer;
case SO_TS_CLOCK:
@@ -3484,18 +4253,45 @@ integer:
optval = so->so_max_pacing_rate;
goto integer;
+ case SO_SPLICE: {
+ off_t n;
+
+ /*
+ * Acquire the I/O lock to serialize with
+ * so_splice_xfer(). This is not required for
+ * correctness, but makes testing simpler: once a byte
+ * has been transmitted to the sink and observed (e.g.,
+ * by reading from the socket to which the sink is
+ * connected), a subsequent getsockopt(SO_SPLICE) will
+ * return an up-to-date value.
+ */
+ error = SOCK_IO_RECV_LOCK(so, SBL_WAIT);
+ if (error != 0)
+ goto bad;
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ n = 0;
+ } else {
+ n = so->so_splice_sent;
+ }
+ SOCK_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ error = sooptcopyout(sopt, &n, sizeof(n));
+ break;
+ }
+
default:
+#ifdef SOCKET_HHOOK
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
HHOOK_SOCKET_OPT);
else
+#endif
error = ENOPROTOOPT;
break;
}
}
-#ifdef MAC
bad:
-#endif
CURVNET_RESTORE();
return (error);
}
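
The counter read under the I/O lock above is the so_splice_sent total maintained by so_splice_xfer(). A hedged sketch of querying it from userspace (note the result is an off_t, not an int):

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

static off_t
splice_bytes_moved(int src)
{
	off_t n;
	socklen_t len = sizeof(n);

	if (getsockopt(src, SOL_SOCKET, SO_SPLICE, &n, &len) == -1)
		err(1, "getsockopt(SO_SPLICE)");
	return (n);
}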
@@ -3624,20 +4420,7 @@ sohasoutofband(struct socket *so)
}
int
-sopoll(struct socket *so, int events, struct ucred *active_cred,
- struct thread *td)
-{
-
- /*
- * We do not need to set or assert curvnet as long as everyone uses
- * sopoll_generic().
- */
- return (so->so_proto->pr_sopoll(so, events, active_cred, td));
-}
-
-int
-sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
- struct thread *td)
+sopoll_generic(struct socket *so, int events, struct thread *td)
{
int revents;
@@ -3658,10 +4441,10 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
SOCK_SENDBUF_LOCK(so);
SOCK_RECVBUF_LOCK(so);
if (events & (POLLIN | POLLRDNORM))
- if (soreadabledata(so))
+ if (soreadabledata(so) && !isspliced(so))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
- if (sowriteable(so))
+ if (sowriteable(so) && !issplicedback(so))
revents |= events & (POLLOUT | POLLWRNORM);
if (events & (POLLPRI | POLLRDBAND))
if (so->so_oobmark ||
@@ -3695,9 +4478,8 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
}
int
-soo_kqfilter(struct file *fp, struct knote *kn)
+sokqfilter_generic(struct socket *so, struct knote *kn)
{
- struct socket *so = kn->kn_fp->f_data;
struct sockbuf *sb;
sb_which which;
struct knlist *knl;
@@ -3769,6 +4551,9 @@ filt_soread(struct knote *kn, long hint)
return (!TAILQ_EMPTY(&so->sol_comp));
}
+ if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
+ return (0);
+
SOCK_RECVBUF_LOCK_ASSERT(so);
kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
@@ -3785,8 +4570,12 @@ filt_soread(struct knote *kn, long hint)
} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
return (1);
+#ifdef SOCKET_HHOOK
/* This hook returning non-zero indicates an event, not error */
return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
+#else
+ return (0);
+#endif
}
static void
@@ -3815,7 +4604,9 @@ filt_sowrite(struct knote *kn, long hint)
SOCK_SENDBUF_LOCK_ASSERT(so);
kn->kn_data = sbspace(&so->so_snd);
+#ifdef SOCKET_HHOOK
hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
+#endif
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
kn->kn_flags |= EV_EOF;
@@ -4257,6 +5048,8 @@ sotoxsocket(struct socket *so, struct xsocket *xso)
xso->so_error = so->so_error;
xso->so_uid = so->so_cred->cr_uid;
xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ SOCK_LOCK(so);
+ xso->so_fibnum = so->so_fibnum;
if (SOLISTENING(so)) {
xso->so_qlen = so->sol_qlen;
xso->so_incqlen = so->sol_incqlen;
@@ -4268,7 +5061,10 @@ sotoxsocket(struct socket *so, struct xsocket *xso)
xso->so_oobmark = so->so_oobmark;
sbtoxsockbuf(&so->so_snd, &xso->so_snd);
sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
+ xso->so_splice_so = (uintptr_t)so->so_splice->dst;
}
+ SOCK_UNLOCK(so);
}
int