author     Robert Watson <rwatson@FreeBSD.org>   2008-09-15 20:46:32 +0000
committer  Robert Watson <rwatson@FreeBSD.org>   2008-09-15 20:46:32 +0000
commit     2af76becb086f727e90248ab836611f5601c9606 (patch)
tree       a1f3872fd2048179526347040088296df2cee990
parent     ea341e943e0727b08f6b464c5ad75dd0446d6077 (diff)
-rw-r--r--  sys/kern/uipc_socket.c       231
-rw-r--r--  sys/netinet/udp_usrreq.c      14
-rw-r--r--  sys/netinet6/udp6_usrreq.c     1
-rw-r--r--  sys/sys/socketvar.h            3
4 files changed, 248 insertions(+), 1 deletion(-)
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index fdde01a2ad67..7689a7bf06ca 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -2,7 +2,7 @@
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California.
* Copyright (c) 2004 The FreeBSD Foundation
- * Copyright (c) 2004-2007 Robert N. M. Watson
+ * Copyright (c) 2004-2008 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -1847,6 +1847,235 @@ release:
return (error);
}
+/*
+ * Optimized version of soreceive() for simple datagram cases from
+ * userspace.  This routine is experimental and, while heavily tested, may
+ * contain errors.
+ */
+int
+soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct mbuf *m, *m2;
+	int flags, len, error;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+
+ /*
+ * For any complicated cases, fall back to the full
+ * soreceive_generic().
+ */
+ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
+ return (soreceive_generic(so, psa, uio, mp0, controlp,
+ flagsp));
+
+ /*
+ * Enforce restrictions on use.
+ */
+ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
+ ("soreceive_dgram: wantrcvd"));
+ KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
+ KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
+ ("soreceive_dgram: SBS_RCVATMARK"));
+ KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
+ ("soreceive_dgram: P_CONNREQUIRED"));
+
+restart:
+ SOCKBUF_LOCK(&so->so_rcv);
+ m = so->so_rcv.sb_mb;
+
+	/*
+	 * If no datagram is queued, block (subject to any timeout) awaiting
+	 * the arrival of one, unless the socket is non-blocking or
+	 * MSG_DONTWAIT/MSG_NBIO was passed.  Datagram protocols are
+	 * PR_ATOMIC, so each record is received in its entirety; the low
+	 * water mark and MSG_WAITALL logic of soreceive_generic() does not
+	 * apply here.
+	 */
+	if (m == NULL) {
+		KASSERT(!so->so_rcv.sb_cc,
+		    ("soreceive_dgram: sb_mb == NULL, sb_cc == %u",
+		    so->so_rcv.sb_cc));
+		if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
+			SOCKBUF_UNLOCK(&so->so_rcv);
+			return (error);
+		}
+		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) ||
+		    uio->uio_resid == 0) {
+			SOCKBUF_UNLOCK(&so->so_rcv);
+			return (0);
+		}
+		if ((so->so_state & SS_NBIO) ||
+		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+			SOCKBUF_UNLOCK(&so->so_rcv);
+			return (EWOULDBLOCK);
+		}
+		SBLASTRECORDCHK(&so->so_rcv);
+		SBLASTMBUFCHK(&so->so_rcv);
+
+		error = sbwait(&so->so_rcv);
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		if (error)
+			return (error);
+		goto restart;
+	}
+ /*
+ * From this point onward, we maintain 'nextrecord' as a cache of the
+ * pointer to the next record in the socket buffer. We must keep the
+ * various socket buffer pointers and local stack versions of the
+ * pointers in sync, pushing out modifications before dropping the
+ * socket buffer mutex, and re-reading them when picking it up.
+ *
+ * Otherwise, we will race with the network stack appending new data
+ * or records onto the socket buffer by using inconsistent/stale
+ * versions of the field, possibly resulting in socket buffer
+ * corruption.
+ *
+ * By holding the high-level sblock(), we prevent simultaneous
+ * readers from pulling off the front of the socket buffer.
+ */
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+	KASSERT(m == so->so_rcv.sb_mb,
+	    ("soreceive_dgram: m != so->so_rcv.sb_mb"));
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ nextrecord = m->m_nextpkt;
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ if (psa != NULL)
+ *psa = sodupsockaddr(mtod(m, struct sockaddr *),
+ M_NOWAIT);
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ }
+ if (m == NULL) {
+ /* XXXRW: Can this happen? */
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (0);
+ }
+	KASSERT(m->m_nextpkt == nextrecord,
+	    ("soreceive_dgram: post-control, nextrecord !sync"));
+	if (nextrecord == NULL) {
+		KASSERT(so->so_rcv.sb_mb == m,
+		    ("soreceive_dgram: post-control, sb_mb!=m"));
+		KASSERT(so->so_rcv.sb_lastrecord == m,
+		    ("soreceive_dgram: post-control, lastrecord!=m"));
+ }
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ KASSERT(m == so->so_rcv.sb_mb, ("soreceive_dgram: m not sb_mb"));
+ KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
+ ("soreceive_dgram: m_nextpkt != nextrecord"));
+
+ /*
+ * Pull 'm' and its chain off the front of the packet queue.
+ */
+ so->so_rcv.sb_mb = NULL;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+
+ /*
+ * Walk 'm's chain and free that many bytes from the socket buffer.
+ */
+ for (m2 = m; m2 != NULL; m2 = m2->m_next)
+ sbfree(&so->so_rcv, m2);
+
+ /*
+ * Do a few last checks before we let go of the lock.
+ */
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+	/*
+	 * Packet to copyout() is now in 'm' and it is disconnected from the
+	 * queue.
+	 *
+	 * Process one or more MT_CONTROL mbufs present before any data mbufs
+	 * in the first mbuf chain of the record.  The MSG_PEEK case was
+	 * diverted to soreceive_generic() above, so we always call into the
+	 * protocol to perform externalization (or free the control mbufs
+	 * when controlp == NULL).
+	 */
+ if (m->m_type == MT_CONTROL) {
+ struct mbuf *cm = NULL, *cmn;
+ struct mbuf **cme = &cm;
+
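+		/*
+		 * Detach the leading MT_CONTROL mbufs from the record and
+		 * collect them on the local 'cm' list, leaving 'm' pointing
+		 * at the first data mbuf (or NULL if there is none).
+		 */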
+ do {
+ m2 = m->m_next;
+ m->m_next = NULL;
+ *cme = m;
+ cme = &(*cme)->m_next;
+ m = m2;
+ } while (m != NULL && m->m_type == MT_CONTROL);
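+		/*
+		 * Hand each control mbuf to the protocol's externalization
+		 * routine if it has one (e.g., unp_externalize() handles
+		 * SCM_RIGHTS for local domain sockets); otherwise link it
+		 * onto *controlp for the caller, or free it when no control
+		 * data was requested.
+		 */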
+ while (cm != NULL) {
+ cmn = cm->m_next;
+ cm->m_next = NULL;
+ if (pr->pr_domain->dom_externalize != NULL) {
+ error = (*pr->pr_domain->dom_externalize)
+ (cm, controlp);
+ } else if (controlp != NULL)
+ *controlp = cm;
+ else
+ m_freem(cm);
+ if (controlp != NULL) {
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ cm = cmn;
+ }
+ }
+
+	KASSERT(m == NULL || m->m_type == MT_DATA,
+	    ("soreceive_dgram: !data"));
+
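+	/*
+	 * Copy the data mbuf chain into the caller's buffer, freeing each
+	 * mbuf as it is consumed; any leftover bytes cause MSG_TRUNC to be
+	 * reported below, since datagram protocols are PR_ATOMIC.
+	 */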
+	while (m != NULL && uio->uio_resid > 0) {
+		len = uio->uio_resid;
+		if (len > m->m_len)
+			len = m->m_len;
+		error = uiomove(mtod(m, char *), (int)len, uio);
+		if (error) {
+			m_freem(m);
+			return (error);
+		}
+		if (len == m->m_len)
+			m = m_free(m);
+		else {
+			/*
+			 * Partial copy: trim the mbuf and keep it, so the
+			 * leftover bytes are reported via MSG_TRUNC below.
+			 */
+			m->m_data += len;
+			m->m_len -= len;
+		}
+	}
+ if (m != NULL && pr->pr_flags & PR_ATOMIC)
+ flags |= MSG_TRUNC;
+ m_freem(m);
+ if (flagsp != NULL)
+ *flagsp |= flags;
+ return (0);
+}
+
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
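
The soreceive() shown in the trailing context above is a thin wrapper: it
dispatches through the protocol's pr_usrreqs switch, which is what allows
the UDP hook below to substitute soreceive_dgram() without touching any
socket-layer callers.  A minimal sketch of that dispatch, consistent with
the FreeBSD implementation of this era (for orientation only, not part of
the patch):

int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	/* Indirect through the per-protocol user-request switch. */
	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
	    controlp, flagsp));
}
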
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index b6a4669aac4f..26f367eebebb 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -80,6 +80,9 @@ __FBSDID("$FreeBSD$");
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#ifdef INET6
+#include <netinet6/udp6_var.h>
+#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -128,6 +131,11 @@ u_long udp_recvspace = 40 * (1024 +
SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
+static int udp_soreceive_dgram;
+SYSCTL_INT(_net_inet_udp, OID_AUTO, soreceive_dgram_enabled,
+ CTLFLAG_RD | CTLFLAG_TUN, &udp_soreceive_dgram, 0,
+ "Use experimental optimized datagram receive");
+
struct inpcbhead udb; /* from udp_var.h */
struct inpcbinfo udbinfo;
@@ -155,6 +163,12 @@ udp_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp;
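+	/*
+	 * Honor the boot-time tunable: point both the UDP and UDP6
+	 * protocol switches at the optimized datagram receive path.
+	 */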
+ TUNABLE_INT_FETCH("net.inet.udp.soreceive_dgram_enabled",
+ &udp_soreceive_dgram);
+ if (udp_soreceive_dgram) {
+ udp_usrreqs.pru_soreceive = soreceive_dgram;
+ udp6_usrreqs.pru_soreceive = soreceive_dgram;
+ }
inp = mem;
INP_LOCK_INIT(inp, "inp", "udpinp");
return (0);
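
Because the sysctl above is declared CTLFLAG_RD | CTLFLAG_TUN and read back
with TUNABLE_INT_FETCH(), the optimized path can only be selected at boot
through a loader tunable rather than toggled at runtime.  A minimal
configuration sketch, assuming only the tunable name visible in the patch:

	# /boot/loader.conf
	net.inet.udp.soreceive_dgram_enabled="1"

After boot, the read-only sysctl of the same name reports whether
soreceive_dgram() was installed for both UDP and UDP6.
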
diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c
index b19b25166c5b..0ebb69f6a44b 100644
--- a/sys/netinet6/udp6_usrreq.c
+++ b/sys/netinet6/udp6_usrreq.c
@@ -1013,6 +1013,7 @@ struct pr_usrreqs udp6_usrreqs = {
.pru_send = udp6_send,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
+ .pru_sosend = sosend_dgram,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp6_close
};
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 024d9e875e27..e70b36bb3e6a 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -333,6 +333,9 @@ int sopoll_generic(struct socket *so, int events,
struct ucred *active_cred, struct thread *td);
int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp);
+int soreceive_dgram(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
+ int *flagsp);
int soreceive_generic(struct socket *so, struct sockaddr **paddr,
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
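
As a closing illustration, the hypothetical userspace program below (not
part of this commit) exercises the datagram receive path: it sends one UDP
datagram over loopback and receives it into a deliberately short buffer, so
the MSG_TRUNC handling at the tail of soreceive_dgram() is hit whenever the
loader tunable has installed the optimized path; with the tunable off,
soreceive_generic() should produce the same visible result.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct msghdr msg;
	struct iovec iov;
	socklen_t len;
	char buf[4];
	ssize_t n;
	int rx, tx;

	if ((rx = socket(AF_INET, SOCK_DGRAM, 0)) == -1 ||
	    (tx = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	/* Bind the receiver to loopback on an ephemeral port. */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (bind(rx, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "bind");
	len = sizeof(sin);
	if (getsockname(rx, (struct sockaddr *)&sin, &len) == -1)
		err(1, "getsockname");

	if (sendto(tx, "hello, datagram", 15, 0,
	    (struct sockaddr *)&sin, sizeof(sin)) != 15)
		err(1, "sendto");

	/* A 4-byte buffer forces a short read and MSG_TRUNC in msg_flags. */
	memset(&msg, 0, sizeof(msg));
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	if ((n = recvmsg(rx, &msg, 0)) == -1)
		err(1, "recvmsg");
	printf("read %zd bytes, MSG_TRUNC %s\n", n,
	    (msg.msg_flags & MSG_TRUNC) ? "set" : "not set");
	return (0);
}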