diff options
author | Robert Watson <rwatson@FreeBSD.org> | 2008-09-15 20:46:32 +0000 |
---|---|---|
committer | Robert Watson <rwatson@FreeBSD.org> | 2008-09-15 20:46:32 +0000 |
commit | 2af76becb086f727e90248ab836611f5601c9606 (patch) | |
tree | a1f3872fd2048179526347040088296df2cee990 | |
parent | ea341e943e0727b08f6b464c5ad75dd0446d6077 (diff) |
Notes
-rw-r--r-- | sys/kern/uipc_socket.c | 231 | ||||
-rw-r--r-- | sys/netinet/udp_usrreq.c | 14 | ||||
-rw-r--r-- | sys/netinet6/udp6_usrreq.c | 1 | ||||
-rw-r--r-- | sys/sys/socketvar.h | 3 |
4 files changed, 248 insertions, 1 deletion
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index fdde01a2ad67..7689a7bf06ca 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -2,7 +2,7 @@ * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation - * Copyright (c) 2004-2007 Robert N. M. Watson + * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -1847,6 +1847,235 @@ release: return (error); } +/* + * Optimized version of soreceive() for simple datagram cases from userspace; + * this is experimental, and while heavily tested, may contain errors. + */ +int +soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct mbuf *m, *m2; + int flags, len, error, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + + if (psa != NULL) + *psa = NULL; + if (controlp != NULL) + *controlp = NULL; + if (flagsp != NULL) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + + /* + * For any complicated cases, fall back to the full + * soreceive_generic(). + */ + if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) + return (soreceive_generic(so, psa, uio, mp0, controlp, + flagsp)); + + /* + * Enforce restrictions on use. + */ + KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, + ("soreceive_dgram: wantrcvd")); + KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); + KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, + ("soreceive_dgram: SBS_RCVATMARK")); + KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, + ("soreceive_dgram: P_CONNREQUIRED")); + +restart: + SOCKBUF_LOCK(&so->so_rcv); + m = so->so_rcv.sb_mb; + + /* + * If we have less data than requested, block awaiting more (subject + * to any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. 
MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning a + * short count if a timeout or signal occurs after we start. + */ + if (m == NULL) { + KASSERT(m != NULL || !so->so_rcv.sb_cc, + ("receive: m == %p so->so_rcv.sb_cc == %u", + m, so->so_rcv.sb_cc)); + if (so->so_error) { + if (m != NULL) + goto dontblock; + error = so->so_error; + so->so_error = 0; + SOCKBUF_UNLOCK(&so->so_rcv); + return (error); + } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + if (m == NULL) { + SOCKBUF_UNLOCK(&so->so_rcv); + return (0); + } else + goto dontblock; + } + if (uio->uio_resid == 0) { + SOCKBUF_UNLOCK(&so->so_rcv); + return (0); + } + if ((so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO))) { + SOCKBUF_UNLOCK(&so->so_rcv); + error = EWOULDBLOCK; + return (error); + } + SBLASTRECORDCHK(&so->so_rcv); + SBLASTMBUFCHK(&so->so_rcv); + + error = sbwait(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_rcv); + if (error) + return (error); + goto restart; + } +dontblock: + /* + * From this point onward, we maintain 'nextrecord' as a cache of the + * pointer to the next record in the socket buffer. We must keep the + * various socket buffer pointers and local stack versions of the + * pointers in sync, pushing out modifications before dropping the + * socket buffer mutex, and re-reading them when picking it up. + * + * Otherwise, we will race with the network stack appending new data + * or records onto the socket buffer by using inconsistent/stale + * versions of the field, possibly resulting in socket buffer + * corruption. + * + * By holding the high-level sblock(), we prevent simultaneous + * readers from pulling off the front of the socket buffer. 
+ */ + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (uio->uio_td) + uio->uio_td->td_ru.ru_msgrcv++; + KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); + SBLASTRECORDCHK(&so->so_rcv); + SBLASTMBUFCHK(&so->so_rcv); + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { + KASSERT(m->m_type == MT_SONAME, + ("m->m_type == %d", m->m_type)); + if (psa != NULL) + *psa = sodupsockaddr(mtod(m, struct sockaddr *), + M_NOWAIT); + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + sockbuf_pushsync(&so->so_rcv, nextrecord); + } + if (m == NULL) { + /* XXXRW: Can this happen? */ + SOCKBUF_UNLOCK(&so->so_rcv); + return (0); + } + KASSERT(m->m_nextpkt == nextrecord, + ("soreceive: post-control, nextrecord !sync")); + if (nextrecord == NULL) { + KASSERT(so->so_rcv.sb_mb == m, + ("soreceive: post-control, sb_mb!=m")); + KASSERT(so->so_rcv.sb_lastrecord == m, + ("soreceive: post-control, lastrecord!=m")); + } + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + SBLASTRECORDCHK(&so->so_rcv); + SBLASTMBUFCHK(&so->so_rcv); + KASSERT(m == so->so_rcv.sb_mb, ("soreceive_dgram: m not sb_mb")); + KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, + ("soreceive_dgram: m_nextpkt != nextrecord")); + + /* + * Pull 'm' and its chain off the front of the packet queue. + */ + so->so_rcv.sb_mb = NULL; + sockbuf_pushsync(&so->so_rcv, nextrecord); + + /* + * Walk 'm's chain and free that many bytes from the socket buffer. + */ + for (m2 = m; m2 != NULL; m2 = m2->m_next) + sbfree(&so->so_rcv, m2); + + /* + * Do a few last checks before we let go of the lock. + */ + SBLASTRECORDCHK(&so->so_rcv); + SBLASTMBUFCHK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_rcv); + + /* + * Packet to copyout() is now in 'm' and it is disconnected from the + * queue. + * + * Process one or more MT_CONTROL mbufs present before any data mbufs + * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we + * just copy the data; if !MSG_PEEK, we call into the protocol to + * perform externalization (or freeing if controlp == NULL). + */ + if (m->m_type == MT_CONTROL) { + struct mbuf *cm = NULL, *cmn; + struct mbuf **cme = &cm; + + do { + m2 = m->m_next; + m->m_next = NULL; + *cme = m; + cme = &(*cme)->m_next; + m = m2; + } while (m != NULL && m->m_type == MT_CONTROL); + while (cm != NULL) { + cmn = cm->m_next; + cm->m_next = NULL; + if (pr->pr_domain->dom_externalize != NULL) { + error = (*pr->pr_domain->dom_externalize) + (cm, controlp); + } else if (controlp != NULL) + *controlp = cm; + else + m_freem(cm); + if (controlp != NULL) { + while (*controlp != NULL) + controlp = &(*controlp)->m_next; + } + cm = cmn; + } + } + + KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data")); + + offset = 0; + while (m != NULL && uio->uio_resid > 0) { + len = uio->uio_resid; + if (len > m->m_len) + len = m->m_len; + error = uiomove(mtod(m, char *), (int)len, uio); + if (error) { + m_freem(m); + return (error); + } + m = m_free(m); + } + if (m != NULL && pr->pr_flags & PR_ATOMIC) + flags |= MSG_TRUNC; + m_freem(m); + if (flagsp != NULL) + *flagsp |= flags; + return (0); +} + int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index b6a4669aac4f..26f367eebebb 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -80,6 +80,9 @@ __FBSDID("$FreeBSD$"); #endif #include <netinet/udp.h> #include <netinet/udp_var.h> +#ifdef INET6 +#include <netinet6/udp6_var.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -128,6 +131,11 @@ u_long udp_recvspace = 40 * (1024 + SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); +static int udp_soreceive_dgram; +SYSCTL_INT(_net_inet_udp, OID_AUTO, soreceive_dgram_enabled, + CTLFLAG_RD | 
CTLFLAG_TUN, &udp_soreceive_dgram, 0, + "Use experimental optimized datagram receive"); + struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; @@ -155,6 +163,12 @@ udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; + TUNABLE_INT_FETCH("net.inet.udp.soreceive_dgram_enabled", + &udp_soreceive_dgram); + if (udp_soreceive_dgram) { + udp_usrreqs.pru_soreceive = soreceive_dgram; + udp6_usrreqs.pru_soreceive = soreceive_dgram; + } inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index b19b25166c5b..0ebb69f6a44b 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -1013,6 +1013,7 @@ struct pr_usrreqs udp6_usrreqs = { .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, + .pru_sosend = sosend_dgram, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close }; diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 024d9e875e27..e70b36bb3e6a 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -333,6 +333,9 @@ int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); +int soreceive_dgram(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, + int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); |