Diffstat (limited to 'sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c')
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c	729
1 file changed, 552 insertions, 177 deletions
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
index a3dd692def86e..6edeacd0c49ca 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -38,14 +38,18 @@ __FBSDID("$FreeBSD$");
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
+#include <sys/condvar.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
+#include <sys/file.h>
 
 #include <machine/bus.h>
+#include <machine/cpu.h>
 
 #include <net/if.h>
 #include <net/route.h>
@@ -56,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_var.h>
 
+#include <dev/cxgb/cxgb_config.h>
 #include <dev/cxgb/cxgb_osdep.h>
 #include <dev/cxgb/sys/mbufq.h>
@@ -72,6 +77,7 @@ __FBSDID("$FreeBSD$");
 #include <dev/cxgb/common/cxgb_ctl_defs.h>
 #include <dev/cxgb/cxgb_l2t.h>
 #include <dev/cxgb/cxgb_offload.h>
+
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
@@ -85,6 +91,7 @@ __FBSDID("$FreeBSD$");
 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+#include <dev/cxgb/ulp/tom/cxgb_vm.h>
 
 static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
     struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -94,13 +101,11 @@ static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
     int *flagsp);
 
-#ifdef notyet
-#define VM_HOLD_WRITEABLE 0x1
-static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
-    int *count, int flags);
-#endif
-static void vm_fault_unhold_pages(vm_page_t *m, int count);
 #define TMP_IOV_MAX 16
+#ifndef PG_FRAME
+#define PG_FRAME ~PAGE_MASK
+#endif
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ?
M_NOWAIT : M_WAITOK) void t3_init_socket_ops(void) @@ -110,20 +115,8 @@ t3_init_socket_ops(void) prp = pffindtype(AF_INET, SOCK_STREAM); pru_sosend = prp->pr_usrreqs->pru_sosend; pru_soreceive = prp->pr_usrreqs->pru_soreceive; -#ifdef TCP_USRREQS_OVERLOAD - tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen; - tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect; - tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close; - tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown; - tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd; -#endif } - struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; @@ -182,21 +175,172 @@ iov_adj(struct iovec **iov, int *iovcnt, size_t count) } } - static void -cxgb_zero_copy_free(void *cl, void *arg) {} +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} + static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uintptr_t)iov->iov_base; + end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end &= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, npages); + + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; - return (EINVAL); + return (0); +} + +/* + * Returns whether a connection should enable DDP. 
This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + + DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", + toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), + toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); + + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} + +static inline int +is_ddp(const struct mbuf *m) +{ + return (m->m_flags & M_DDP); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); +} + +static int +m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + int curlen, startlen, resid_init, err = 0; + caddr_t buf; + + DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n", + m, offset, len); + + startlen = len; + resid_init = uio->uio_resid; + while (m && len) { + buf = mtod(m, caddr_t); + curlen = m->m_len; + if (offset && (offset < curlen)) { + curlen -= offset; + buf += offset; + offset = 0; + } else if (offset) { + offset -= curlen; + m = m->m_next; + continue; + } + err = uiomove(buf, min(len, curlen), uio); + if (err) { + printf("uiomove returned %d\n", err); + return (err); + } + + len -= min(len, curlen); + m = m->m_next; + } + DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", + startlen - len, resid_init, uio->uio_resid); + return (err); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. 
+ */ +static inline int +copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + struct iovec *to = uio->uio_iov; + int err; + + + if (__predict_true(!is_ddp(m))) { /* RX_DATA */ + return m_uiomove(m, offset, len, uio); + } if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base = ((caddr_t)to->iov_base) + len; + uio->uio_iov = to; + uio->uio_resid -= len; + return (0); + } + err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ + return (err); } static void -cxgb_wait_dma_completion(struct toepcb *tp) +cxgb_wait_dma_completion(struct toepcb *toep) { + struct mtx *lock; + lock = &toep->tp_tp->t_inpcb->inp_mtx; + INP_LOCK(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); } static int @@ -234,7 +378,13 @@ cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) mi_collapse_sge(mi, segs); *m = m0; - + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else @@ -305,10 +455,11 @@ sendmore: } uio->uio_resid -= m->m_pkthdr.len; sent += m->m_pkthdr.len; - sbappend_locked(&so->so_snd, m); + sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } + /* * Wait for pending I/O to be DMA'd to the card * @@ -357,7 +508,7 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); - if ((uio->uio_resid > zcopy_thres) && + if (uio && (uio->uio_resid > zcopy_thres) && (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) { rv = t3_sosend(so, uio); @@ -368,36 +519,378 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, return pru_sosend(so, addr, uio, top, control, flags, td); } +/* + * Following replacement or removal of the first mbuf on the first mbuf chain + * of a socket buffer, push necessary state changes back into the socket + * buffer so that other consumers see the values consistently. 'nextrecord' + * is the callers locally stored value of the original value of + * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. + * NOTE: 'nextrecord' may be NULL. + */ +static __inline void +sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + SOCKBUF_LOCK_ASSERT(sb); + /* + * First, update for the new value of nextrecord. If necessary, make + * it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + * addition of a second clause that takes care of the case where + * sb_mb has been updated, but remains the last record. 
+ */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +#define IS_NONBLOCKING(so) ((so)->so_state & SS_NBIO) + static int -t3_soreceive(struct socket *so, struct uio *uio) +t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) { -#ifdef notyet - int i, rv, count, hold_resid, sent, iovcnt; - struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m; - struct uio uiotmp; + uint32_t offset; + int err, flags, avail, len, copied, copied_unacked; + int target; /* Read at least this many bytes */ + int user_ddp_ok; + struct ddp_state *p; + struct inpcb *inp = sotoinpcb(so); + + avail = offset = copied = copied_unacked = 0; + flags = flagsp ? (*flagsp &~ MSG_EOR) : 0; + err = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + p = &toep->tp_ddp_state; + + if (err) + return (err); + SOCKBUF_LOCK(&so->so_rcv); + p->user_ddp_pending = 0; +restart: + len = uio->uio_resid; + m = so->so_rcv.sb_mb; + target = (flags & MSG_WAITALL) ? len : so->so_rcv.sb_lowat; + user_ddp_ok = p->ubuf_ddp_ready; + p->cancel_ubuf = 0; + + if (len == 0) + goto done; +#if 0 + while (m && m->m_len == 0) { + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } +#endif + if (m) + goto got_mbuf; + + /* empty receive queue */ + if (copied >= target && (so->so_rcv.sb_mb == NULL) && + !p->user_ddp_pending) + goto done; + + if (copied) { + if (so->so_error || tp->t_state == TCPS_CLOSED || + (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) + goto done; + } else { + if (so->so_state & SS_NOFDREF) + goto done; + if (so->so_error) { + err = so->so_error; + so->so_error = 0; + goto done; + } + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + goto done; + if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) + goto done; + if (tp->t_state == TCPS_CLOSED) { + err = ENOTCONN; + goto done; + } + } + if (so->so_rcv.sb_mb && !p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + SOCKBUF_LOCK(&so->so_rcv); + copied_unacked = 0; + goto restart; + } + if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + } + if (p->kbuf[0] && (p->kbuf_posted == 0)) { + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } + if (p->user_ddp_pending) { + /* One shot at DDP if we already have enough data */ + if (copied >= target) + user_ddp_ok = 0; + + DPRINTF("sbwaiting 1\n"); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; +//for timers to work await_ddp_completion(sk, flags, &timeo); + } else if (copied >= target) + goto done; + else { + if (copied_unacked) { + int i = 0; + + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + if (mp_ncpus > 1) + while (i++ < 200 && so->so_rcv.sb_mb == NULL) + cpu_spinwait(); + SOCKBUF_LOCK(&so->so_rcv); + } + + if (so->so_rcv.sb_mb) + goto restart; + DPRINTF("sbwaiting 2 copied=%d target=%d avail=%d so=%p mb=%p cc=%d\n", copied, target, avail, so, + so->so_rcv.sb_mb, so->so_rcv.sb_cc); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; + } + goto restart; +got_mbuf: + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) 
|| !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d", + m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); + if (m->m_pkthdr.len == 0) { + if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) + panic("empty mbuf and NOCOPY not set\n"); + CTR0(KTR_TOM, "ddp done notification"); + p->user_ddp_pending = 0; + sbdroprecord_locked(&so->so_rcv); + goto done; + } + + offset = toep->tp_copied_seq + copied_unacked - m->m_seq; + DPRINTF("m=%p copied_seq=0x%x copied_unacked=%d m_seq=0x%x offset=%d pktlen=%d is_ddp(m)=%d\n", + m, toep->tp_copied_seq, copied_unacked, m->m_seq, offset, m->m_pkthdr.len, !!is_ddp(m)); + + if (offset >= m->m_pkthdr.len) + panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x seq 0x%x " + "pktlen %d ddp flags 0x%x", offset, toep->tp_copied_seq + copied_unacked, m->m_seq, + m->m_pkthdr.len, m->m_ddp_flags); + + avail = m->m_pkthdr.len - offset; + if (len < avail) { + if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) + panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); + avail = len; + } + CTR4(KTR_TOM, "t3_soreceive: m_len=%u offset=%u len=%u m_seq=0%08x", m->m_pkthdr.len, offset, len, m->m_seq); + +#ifdef URGENT_DATA_SUPPORTED /* - * Events requiring iteration: - * - number of pages exceeds max hold pages for process or system - * - number of pages exceeds maximum sg entries for a single WR - * - * We're limited to holding 128 pages at once - and we're limited to - * 34 SG entries per work request, but each SG entry can be any number - * of contiguous pages - * + * Check if the data we are preparing to copy contains urgent + * data. Either stop short of urgent data or skip it if it's + * first and we are not delivering urgent data inline. + */ + if (__predict_false(toep->tp_urg_data)) { + uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; + + if (urg_offset < avail) { + if (urg_offset) { + /* stop short of the urgent data */ + avail = urg_offset; + } else if ((so->so_options & SO_OOBINLINE) == 0) { + /* First byte is urgent, skip */ + toep->tp_copied_seq++; + offset++; + avail--; + if (!avail) + goto skip_copy; + } + } + } +#endif + if (is_ddp_psh(m) || offset) { + user_ddp_ok = 0; +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); +#endif + } + + if (user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); + } else + DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", + user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, + p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); + + /* + * If MSG_TRUNC is specified the data is discarded. 
+ * XXX need to check pr_atomic */ + KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); + if (__predict_true(!(flags & MSG_TRUNC))) { + int resid = uio->uio_resid; + + SOCKBUF_UNLOCK(&so->so_rcv); + if ((err = copy_data(m, offset, avail, uio))) { + if (err) + err = EFAULT; + goto done_unlocked; + } + SOCKBUF_LOCK(&so->so_rcv); + if (avail != (resid - uio->uio_resid)) + printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", + avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); + } + + copied += avail; + copied_unacked += avail; + len -= avail; + +#ifdef URGENT_DATA_SUPPORTED +skip_copy: + if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) + tp->urg_data = 0; +#endif + /* + * If the buffer is fully consumed free it. If it's a DDP + * buffer also handle any events it indicates. + */ + if (avail + offset >= m->m_pkthdr.len) { + unsigned int fl = m->m_ddp_flags; + int exitnow, got_psh = 0, nomoredata = 0; + int count; + struct mbuf *nextrecord; + + if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { + if (is_ddp_psh(m) && p->user_ddp_pending) + got_psh = 1; + + if (fl & DDP_BF_NOCOPY) + p->user_ddp_pending = 0; + else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { + p->kbuf_posted--; + nomoredata = 1; + } else { + p->kbuf_posted--; + p->ubuf_ddp_ready = 1; + } + } - uiotmp = *uio; - iovcnt = uio->uio_iovcnt; - iov = uio->uio_iov; - sent = 0; - re; -#endif - return (0); + nextrecord = m->m_nextpkt; + count = m->m_pkthdr.len; + while (count > 0) { + count -= m->m_len; + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + sockbuf_pushsync(&so->so_rcv, nextrecord); +#if 0 + sbdrop_locked(&so->so_rcv, m->m_pkthdr.len); +#endif + exitnow = got_psh || nomoredata; + if ((so->so_rcv.sb_mb == NULL) && exitnow) + goto done; + if (copied_unacked > (so->so_rcv.sb_hiwat >> 2)) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + SOCKBUF_LOCK(&so->so_rcv); + } + } + if (len > 0) + goto restart; + + done: + /* + * If we can still receive decide what to do in preparation for the + * next receive. Note that RCV_SHUTDOWN is set if the connection + * transitioned to CLOSE but not if it was in that state to begin with. 
+ */ + if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { + if (p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_rcv); + user_ddp_ok = 0; + t3_cancel_ubuf(toep); + if (so->so_rcv.sb_mb) { + if (copied < 0) + copied = 0; + if (len > 0) + goto restart; + } + p->user_ddp_pending = 0; + } + if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), + "chelsio_recvmsg: about to exit, repost kbuf"); +#endif + + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { + CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); + if (!t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so), + ddp_copy_limit), 0, IS_NONBLOCKING(so))) + p->kbuf_posted = 1; + } + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " + "kbuf_posted %d user_ddp_pending %u", + copied, len, buffers_freed, p ? p->kbuf_posted : -1, + p->user_ddp_pending); +#endif + SOCKBUF_UNLOCK(&so->so_rcv); +done_unlocked: + if (copied_unacked) { + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + } + sbunlock(&so->so_rcv); + + return (err); } static int @@ -405,9 +898,11 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct toedev *tdev; - int rv, zcopy_thres, zcopy_enabled; + int rv, zcopy_thres, zcopy_enabled, flags; struct tcpcb *tp = sototcpcb(so); + flags = flagsp ? *flagsp &~ MSG_EOR : 0; + /* * In order to use DMA direct from userspace the following * conditions must be met: @@ -421,150 +916,30 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, * - iovcnt is 1 * */ - if (tp->t_flags & TF_TOE) { + + if ((tp->t_flags & TF_TOE) && uio && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) + && (uio->uio_iovcnt == 1) && (mp0 == NULL)) { tdev = TOE_DEV(so); zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); zcopy_enabled = TOM_TUNABLE(tdev, ddp); if ((uio->uio_resid > zcopy_thres) && - (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) + (uio->uio_iovcnt == 1) && zcopy_enabled) { - rv = t3_soreceive(so, uio); + rv = t3_soreceive(so, flagsp, uio); if (rv != EAGAIN) return (rv); - } - } - + else + printf("returned EAGAIN\n"); + } + } else if ((tp->t_flags & TF_TOE) && uio && mp0 == NULL) + printf("skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", + flags, uio->uio_iovcnt, so->so_rcv.sb_state); return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); } - void t3_install_socket_ops(struct socket *so) { so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; } - -/* - * This routine takes a user address range and does the following: - * - validate that the user has access to those pages (flags indicates read or write) - if not fail - * - validate that count is enough to hold range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - * - return number of pages in count - */ -#ifdef notyet -static int -vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) -{ - - vm_offset_t start, va; - vm_paddr_t pa; - int pageslen, faults, rv; - - struct thread *td; - vm_map_t map; - pmap_t pmap; - vm_page_t m, *pages; - vm_prot_t prot; - - start = 
addr & ~PAGE_MASK;
-	pageslen = roundup2(addr + len, PAGE_SIZE);
-	if (*count < (pageslen >> PAGE_SHIFT))
-		return (EFBIG);
-
-	*count = pageslen >> PAGE_SHIFT;
-	/*
-	 * Check that virtual address range is legal
-	 * This check is somewhat bogus as on some architectures kernel
-	 * and user do not share VA - however, it appears that all FreeBSD
-	 * architectures define it
-	 */
-	if (addr + len > VM_MAXUSER_ADDRESS)
-		return (EFAULT);
-
-	td = curthread;
-	map = &td->td_proc->p_vmspace->vm_map;
-	pmap = &td->td_proc->p_vmspace->vm_pmap;
-	pages = mp;
-
-	prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
-	bzero(pages, sizeof(vm_page_t *) * (*count));
-retry:
-
-	/*
-	 * First optimistically assume that all pages are resident (and R/W if for write)
-	 * if so just mark pages as held (and dirty if for write) and return
-	 */
-	vm_page_lock_queues();
-	for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
-		/*
-		 * Assure that we only hold the page once
-		 */
-		if (*pages == NULL) {
-			/*
-			 * page queue mutex is recursable so this is OK
-			 * it would be really nice if we had an unlocked version of this so
-			 * we were only acquiring the pmap lock 1 time as opposed to potentially
-			 * many dozens of times
-			 */
-			m = pmap_extract_and_hold(pmap, va, prot);
-			if (m == NULL) {
-				faults++;
-				continue;
-			}
-			*pages = m;
-			if (flags & VM_HOLD_WRITEABLE)
-				vm_page_dirty(m);
-		}
-	}
-	vm_page_unlock_queues();
-
-	if (faults == 0)
-		return (0);
-	/*
-	 * Pages either have insufficient permissions or are not present
-	 * trigger a fault where neccessary
-	 *
-	 */
-	for (va = start; va < pageslen; va += PAGE_SIZE) {
-		m = NULL;
-		pa = pmap_extract(pmap, va);
-		rv = 0;
-		if (pa)
-			m = PHYS_TO_VM_PAGE(pa);
-		if (flags & VM_HOLD_WRITEABLE) {
-			if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
-				rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
-		} else if (m == NULL)
-			rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
-		if (rv)
-			goto error;
-	}
-	goto retry;
-
-error:
-	vm_page_lock_queues();
-	for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
-		if (*pages)
-			vm_page_unhold(*pages);
-	vm_page_unlock_queues();
-	return (EFAULT);
-}
-#endif
-
-static void
-vm_fault_unhold_pages(vm_page_t *mp, int count)
-{
-
-	KASSERT(count >= 0, ("negative count %d", count));
-	vm_page_lock_queues();
-	while (count--) {
-		vm_page_unhold(*mp);
-		mp++;
-	}
-	vm_page_unlock_queues();
-}
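
The iovec-to-page arithmetic that the new cxgb_hold_iovec_pages() performs (mask the base down with PG_FRAME, round the end up by PAGE_MASK, shift by PAGE_SHIFT) can be exercised in isolation. The sketch below is a user-space illustration only: the EX_* constants and the iov_npages() helper are invented for the example and assume 4 KB pages, whereas the kernel code uses the machine-dependent PAGE_* macros.

/*
 * Illustrative user-space sketch of the iovec -> page-count arithmetic used
 * by cxgb_hold_iovec_pages().  The EX_* constants are local stand-ins for
 * the kernel's PAGE_SHIFT/PAGE_MASK/PG_FRAME and assume 4 KB pages.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define EX_PG_FRAME	(~EX_PAGE_MASK)

/* Number of pages spanned by a single iovec. */
static int
iov_npages(const struct iovec *iov)
{
	uint64_t start, end;

	start = (uintptr_t)iov->iov_base;
	end = (uintptr_t)iov->iov_base + iov->iov_len;
	start &= EX_PG_FRAME;			/* round base down to a page boundary */
	end = (end + EX_PAGE_MASK) & EX_PG_FRAME;	/* round end up to a page boundary */
	return ((end - start) >> EX_PAGE_SHIFT);
}

int
main(void)
{
	/* A 10000-byte buffer starting 100 bytes into a page spans 3 pages. */
	struct iovec iov = {
		.iov_base = (void *)(uintptr_t)0x1000064,
		.iov_len = 10000
	};

	printf("pages spanned: %d\n", iov_npages(&iov));
	return (0);
}

The same rounding is what bounds the per-iovec page count against the caller-supplied maximum before the pages are faulted in and held.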
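
For context on when the rewritten cxgb_soreceive() will even attempt the DDP path: it requires TF_TOE, a non-NULL uio with a single iovec, mp0 == NULL, none of MSG_WAITALL/MSG_OOB/MSG_PEEK/MSG_DONTWAIT set, and a residual larger than the ddp_thres tunable on a blocking socket. The user-space loop below is a hypothetical example of a call pattern that satisfies those checks; the 256 KB buffer size and the placeholder socket descriptor are assumptions for illustration, not values taken from the driver.

/*
 * Hypothetical receive pattern matching the conditions cxgb_soreceive()
 * checks before handing the read to t3_soreceive(): one large contiguous
 * buffer per call, flags == 0, and a blocking TCP socket.  The buffer size
 * is an arbitrary example, not a driver requirement.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>

#define RECV_BUF_LEN	(256 * 1024)	/* assumed to exceed the ddp_thres tunable */

static ssize_t
read_bulk(int fd, char *buf, size_t len)
{
	size_t done = 0;
	ssize_t n;

	while (done < len) {
		/* flags == 0: no MSG_PEEK/MSG_OOB/MSG_WAITALL/MSG_DONTWAIT */
		n = recv(fd, buf + done, len - done, 0);
		if (n <= 0)
			return (n < 0 ? -1 : (ssize_t)done);
		done += (size_t)n;
	}
	return ((ssize_t)done);
}

int
main(void)
{
	char *buf;
	int fd = 0;	/* placeholder: a connected TCP socket in a real program */

	if ((buf = malloc(RECV_BUF_LEN)) == NULL)
		return (1);
	if (read_bulk(fd, buf, RECV_BUF_LEN) < 0)
		perror("recv");
	free(buf);
	return (0);
}

Note that looping on short reads rather than passing MSG_WAITALL matters here, since MSG_WAITALL is one of the flags that sends the request down the regular pru_soreceive() path instead.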