| field     | value                                         |                           |
|-----------|-----------------------------------------------|---------------------------|
| author    | Luigi Rizzo <luigi@FreeBSD.org>               | 2014-02-15 04:53:04 +0000 |
| committer | Luigi Rizzo <luigi@FreeBSD.org>               | 2014-02-15 04:53:04 +0000 |
| commit    | f0ea3689a9c1c27067145ed902811149e78cc4fa      |                           |
| tree      | 5f40d56905d46741e85cd83a0278b12363e3e2a7 /sys |                           |
| parent    | 53bf5ef829d5fd312db3851ce6cb589173b744e1      |                           |
Diffstat (limited to 'sys')
| file                                | lines changed |
|-------------------------------------|---------------|
| sys/conf/files                      | 4             |
| sys/dev/netmap/netmap.c             | 495           |
| sys/dev/netmap/netmap_freebsd.c     | 265           |
| sys/dev/netmap/netmap_generic.c     | 41            |
| sys/dev/netmap/netmap_kern.h        | 227           |
| sys/dev/netmap/netmap_mem2.c        | 382           |
| sys/dev/netmap/netmap_mem2.h        | 14            |
| sys/dev/netmap/netmap_offloadings.c | 401           |
| sys/dev/netmap/netmap_pipe.c        | 711           |
| sys/dev/netmap/netmap_vale.c        | 281           |
| sys/modules/netmap/Makefile         | 2             |
| sys/net/netmap.h                    | 163           |
| sys/net/netmap_user.h               | 354           |

13 files changed, 2770 insertions(+), 570 deletions(-)
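Among other things, this change replaces the old NETMAP_HW_RING/NETMAP_SW_RING encoding in nr_ringid with an explicit nr_flags field (NR_REG_ALL_NIC, NR_REG_SW, NR_REG_NIC_SW, NR_REG_ONE_NIC, NR_REG_PIPE_MASTER/SLAVE), adds netmap pipes, per-kring nm_sync callbacks, kqueue support on FreeBSD, and per-ring interrupt mitigation for the generic adapter. The sketch below is not part of the patch; it only illustrates, under the assumption that the modified sys/net/netmap.h is installed, how a userspace client might register a single hardware ring pair through the new nr_flags interface. Error handling is deliberately minimal.

```c
/*
 * Hypothetical usage sketch (not from this commit): register one hardware
 * ring pair of an interface using the nr_flags field introduced here.
 * Names (struct nmreq, NR_REG_ONE_NIC, NIOCREGIF, ...) follow the modified
 * sys/net/netmap.h.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/netmap.h>

int
open_one_ring(const char *ifname, unsigned int ring)
{
	struct nmreq req;
	int fd = open("/dev/netmap", O_RDWR);

	if (fd < 0)
		return -1;
	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, ifname, sizeof(req.nr_name) - 1);
	req.nr_flags = NR_REG_ONE_NIC;	/* replaces the old NETMAP_HW_RING bit */
	req.nr_ringid = ring;		/* ring index, masked by NETMAP_RING_MASK */
	if (ioctl(fd, NIOCREGIF, &req) < 0) {
		perror("NIOCREGIF");
		close(fd);
		return -1;
	}
	/* nr_offset is where the netmap_if sits inside the region to mmap() */
	printf("memsize %u, offset %u\n", req.nr_memsize, req.nr_offset);
	return fd;
}
```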
diff --git a/sys/conf/files b/sys/conf/files index 1f20111572fe..c61030225e84 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1875,8 +1875,10 @@ dev/ncv/ncr53c500_pccard.c optional ncv pccard dev/netmap/netmap.c optional netmap dev/netmap/netmap_freebsd.c optional netmap dev/netmap/netmap_generic.c optional netmap -dev/netmap/netmap_mbq.c optional netmap +dev/netmap/netmap_mbq.c optional netmap dev/netmap/netmap_mem2.c optional netmap +dev/netmap/netmap_offloadings.c optional netmap +dev/netmap/netmap_pipe.c optional netmap dev/netmap/netmap_vale.c optional netmap # compile-with "${NORMAL_C} -Wconversion -Wextra" dev/nge/if_nge.c optional nge diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index fdd368a346fe..de88fb58fc8c 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -156,9 +156,11 @@ ports attached to the switch) /* reduce conditional code */ -#define init_waitqueue_head(x) // only needed in linux - +// linux API, use for the knlist in FreeBSD +#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) +void freebsd_selwakeup(struct selinfo *si, int pri); +#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) #elif defined(linux) @@ -231,6 +233,7 @@ static int netmap_admode = NETMAP_ADMODE_BEST; int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ int netmap_generic_ringsize = 1024; /* Generic ringsize. */ +int netmap_generic_rings = 1; /* number of queues in generic. */ SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); @@ -238,6 +241,7 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); NMG_LOCK_T netmap_global_lock; @@ -270,28 +274,30 @@ netmap_set_all_rings(struct ifnet *ifp, int stopped) { struct netmap_adapter *na; int i; + u_int ntx, nrx; if (!(ifp->if_capenable & IFCAP_NETMAP)) return; na = NA(ifp); - for (i = 0; i <= na->num_tx_rings; i++) { + ntx = netmap_real_tx_rings(na); + nrx = netmap_real_rx_rings(na); + + for (i = 0; i < ntx; i++) { if (stopped) netmap_disable_ring(na->tx_rings + i); else na->tx_rings[i].nkr_stopped = 0; - na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | - (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); } - for (i = 0; i <= na->num_rx_rings; i++) { + for (i = 0; i < nrx; i++) { if (stopped) netmap_disable_ring(na->rx_rings + i); else na->rx_rings[i].nkr_stopped = 0; - na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | - (i == na->num_rx_rings ? 
NAF_GLOBAL_NOTIFY: 0)); + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); } } @@ -426,14 +432,73 @@ netmap_update_config(struct netmap_adapter *na) return 1; } +static int +netmap_txsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_txsync(na, kring->ring_id, flags); +} + +static int +netmap_rxsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_rxsync(na, kring->ring_id, flags); +} + +static int +netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) +{ + (void)flags; + netmap_txsync_to_host(kring->na); + return 0; +} + +static int +netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) +{ + (void)flags; + netmap_rxsync_from_host(kring->na, NULL, NULL); + return 0; +} + + +/* create the krings array and initialize the fields common to all adapters. + * The array layout is this: + * + * +----------+ + * na->tx_rings ----->| | \ + * | | } na->num_tx_ring + * | | / + * +----------+ + * | | host tx kring + * na->rx_rings ----> +----------+ + * | | \ + * | | } na->num_rx_rings + * | | / + * +----------+ + * | | host rx kring + * +----------+ + * na->tailroom ----->| | \ + * | | } tailroom bytes + * | | / + * +----------+ + * + * Note: for compatibility, host krings are created even when not needed. + * The tailroom space is currently used by vale ports for allocating leases. + */ int -netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) +netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { u_int i, len, ndesc; struct netmap_kring *kring; + u_int ntx, nrx; + + /* account for the (possibly fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; - // XXX additional space for extra rings ? len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); @@ -454,12 +519,19 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring->na = na; kring->ring_id = i; kring->nkr_num_slots = ndesc; + if (i < na->num_tx_rings) { + kring->nm_sync = netmap_txsync_compat; // XXX + } else if (i == na->num_tx_rings) { + kring->nm_sync = netmap_txsync_to_host_compat; + } /* * IMPORTANT: Always keep one slot empty. 
*/ kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = ndesc - 1; snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); + ND("ktx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -471,9 +543,16 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring->na = na; kring->ring_id = i; kring->nkr_num_slots = ndesc; + if (i < na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_compat; // XXX + } else if (i == na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_from_host_compat; + } kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = 0; snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); + ND("krx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -486,17 +565,15 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail } -/* XXX check boundaries */ +/* undo the actions performed by netmap_krings_create */ void netmap_krings_delete(struct netmap_adapter *na) { - int i; + struct netmap_kring *kring = na->tx_rings; - for (i = 0; i < na->num_tx_rings + 1; i++) { - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - mtx_destroy(&na->rx_rings[i].q_lock); + /* we rely on the krings layout described above */ + for ( ; kring != na->tailroom; kring++) { + mtx_destroy(&kring->q_lock); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; @@ -677,6 +754,20 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) netmap_mem_if_delete(na, nifp); } +static __inline int +nm_tx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_txqlast - priv->np_txqfirst > 1)); +} + +static __inline int +nm_rx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_rxqlast - priv->np_rxqfirst > 1)); +} + /* * returns 1 if this is the last instance and we can free priv @@ -702,6 +793,10 @@ netmap_dtor_locked(struct netmap_priv_d *priv) priv->np_nifp = NULL; netmap_drop_memory_locked(priv); if (priv->np_na) { + if (nm_tx_si_user(priv)) + na->tx_si_users--; + if (nm_rx_si_user(priv)) + na->rx_si_users--; netmap_adapter_put(na); priv->np_na = NULL; } @@ -864,22 +959,8 @@ netmap_txsync_to_host(struct netmap_adapter *na) struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; u_int const lim = kring->nkr_num_slots - 1; - u_int const head = nm_txsync_prologue(kring); + u_int const head = kring->rhead; struct mbq q; - int error; - - error = nm_kr_tryget(kring); - if (error) { - if (error == NM_KR_BUSY) - D("ring %p busy (user error)", kring); - return; - } - if (head > lim) { - D("invalid ring index in stack TX kring %p", kring); - netmap_ring_reinit(kring); - nm_kr_put(kring); - return; - } /* Take packets from hwcur to head and pass them up. 
* force head = cur since netmap_grab_packets() stops at head @@ -896,7 +977,6 @@ netmap_txsync_to_host(struct netmap_adapter *na) kring->nr_hwtail -= lim + 1; nm_txsync_finalize(kring); - nm_kr_put(kring); netmap_send_up(na->ifp, &q); } @@ -921,27 +1001,15 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; - u_int const head = nm_rxsync_prologue(kring); + u_int const head = kring->rhead; int ret = 0; struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ - - if (head > lim) { - netmap_ring_reinit(kring); - return EINVAL; - } - - if (kring->nkr_stopped) /* check a first time without lock */ - return EBUSY; + (void)td; mtx_lock(&q->lock); - if (kring->nkr_stopped) { /* check again with lock held */ - ret = EBUSY; - goto unlock_out; - } - /* First part: import newly received packets */ n = mbq_len(q); if (n) { /* grab packets from the queue */ @@ -982,8 +1050,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); -unlock_out: - mtx_unlock(&q->lock); return ret; } @@ -1107,19 +1173,26 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - struct ifnet *ifp; + struct ifnet *ifp = NULL; int error = 0; - struct netmap_adapter *ret; + struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ /* first try to see if this is a bridge port. */ NMG_LOCK_ASSERT(); + error = netmap_get_pipe_na(nmr, na, create); + if (error || *na != NULL) + return error; + error = netmap_get_bdg_na(nmr, na, create); - if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ + if (error) return error; + if (*na != NULL) /* valid match in netmap_get_bdg_na() */ + goto pipes; + ifp = ifunit_ref(nmr->nr_name); if (ifp == NULL) { return ENXIO; @@ -1129,18 +1202,23 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) if (error) goto out; - if (ret != NULL) { - /* Users cannot use the NIC attached to a bridge directly */ - if (NETMAP_OWNED_BY_KERN(ret)) { - error = EBUSY; - goto out; - } - error = 0; - *na = ret; - netmap_adapter_get(ret); + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EBUSY; + goto out; } + *na = ret; + netmap_adapter_get(ret); + +pipes: + error = netmap_pipe_alloc(*na, nmr); + out: - if_rele(ifp); + if (error && ret != NULL) + netmap_adapter_put(ret); + + if (ifp) + if_rele(ifp); return error; } @@ -1365,45 +1443,88 @@ netmap_ring_reinit(struct netmap_kring *kring) * for all rings is the same as a single ring. 
*/ static int -netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { struct netmap_adapter *na = priv->np_na; - struct ifnet *ifp = na->ifp; - u_int i = ringid & NETMAP_RING_MASK; - /* initially (np_qfirst == np_qlast) we don't want to lock */ - u_int lim = na->num_rx_rings; + u_int j, i = ringid & NETMAP_RING_MASK; + u_int reg = flags & NR_REG_MASK; - if (na->num_tx_rings > lim) - lim = na->num_tx_rings; - if ( (ringid & NETMAP_HW_RING) && i >= lim) { - D("invalid ring id %d", i); - return (EINVAL); + if (reg == NR_REG_DEFAULT) { + /* convert from old ringid to flags */ + if (ringid & NETMAP_SW_RING) { + reg = NR_REG_SW; + } else if (ringid & NETMAP_HW_RING) { + reg = NR_REG_ONE_NIC; + } else { + reg = NR_REG_ALL_NIC; + } + D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - priv->np_ringid = ringid; - if (ringid & NETMAP_SW_RING) { - priv->np_qfirst = NETMAP_SW_RING; - priv->np_qlast = 0; - } else if (ringid & NETMAP_HW_RING) { - priv->np_qfirst = i; - priv->np_qlast = i + 1; - } else { - priv->np_qfirst = 0; - priv->np_qlast = NETMAP_HW_RING ; + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: + priv->np_txqfirst = 0; + priv->np_txqlast = na->num_tx_rings; + priv->np_rxqfirst = 0; + priv->np_rxqlast = na->num_rx_rings; + ND("%s %d %d", "ALL/PIPE", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } + priv->np_txqfirst = (reg == NR_REG_SW ? + na->num_tx_rings : 0); + priv->np_txqlast = na->num_tx_rings + 1; + priv->np_rxqfirst = (reg == NR_REG_SW ? + na->num_rx_rings : 0); + priv->np_rxqlast = na->num_rx_rings + 1; + ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = i; + if (j >= na->num_tx_rings) + j = 0; + priv->np_txqfirst = j; + priv->np_txqlast = j + 1; + j = i; + if (j >= na->num_rx_rings) + j = 0; + priv->np_rxqfirst = j; + priv->np_rxqlast = j + 1; + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (netmap_verbose) { - if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", NM_IFPNAME(ifp)); - else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), - priv->np_qfirst); - else - D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); - } + priv->np_flags = (flags & ~NR_REG_MASK) | reg; + if (nm_tx_si_user(priv)) + na->tx_si_users++; + if (nm_rx_si_user(priv)) + na->rx_si_users++; + if (netmap_verbose) { + D("%s: tx [%d,%d) rx [%d,%d) id %d", + NM_IFPNAME(na->ifp), + priv->np_txqfirst, + priv->np_txqlast, + priv->np_rxqfirst, + priv->np_rxqlast, + i); + } return 0; } - /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. 
@@ -1411,7 +1532,7 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) */ struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, - uint16_t ringid, int *err) + uint16_t ringid, uint32_t flags, int *err) { struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; @@ -1421,7 +1542,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); priv->np_na = na; /* store the reference */ - error = netmap_set_ringid(priv, ringid); + error = netmap_set_ringid(priv, ringid, flags); if (error) goto out; /* ensure allocators are ready */ @@ -1501,26 +1622,12 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; int error; - u_int i, lim; + u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ -#ifdef linux -#define devfs_get_cdevpriv(pp) \ - ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ - (*pp ? 0 : ENOENT); }) - -/* devfs_set_cdevpriv cannot fail on linux */ -#define devfs_set_cdevpriv(p, fn) \ - ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) - - -#define devfs_clear_cdevpriv() do { \ - netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ - } while (0) -#endif /* linux */ if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ @@ -1530,6 +1637,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_name, nmr->nr_version, NETMAP_API); nmr->nr_version = NETMAP_API; + } + if (nmr->nr_version < NETMAP_MIN_API || + nmr->nr_version > NETMAP_MAX_API) { return EINVAL; } } @@ -1564,7 +1674,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmd = na->nm_mem; /* get memory allocator */ } - error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) break; if (na == NULL) /* only memory info */ @@ -1576,8 +1687,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - if (memflags & NETMAP_MEM_PRIVATE) - nmr->nr_ringid |= NETMAP_PRIV_MEM; netmap_adapter_put(na); } while (0); NMG_UNLOCK(); @@ -1587,7 +1696,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH - || i == NETMAP_BDG_OFFSET) { + || i == NETMAP_BDG_VNET_HDR) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -1602,7 +1711,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, u_int memflags; if (priv->np_na != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); + error = EBUSY; break; } /* find the interface and a reference */ @@ -1615,27 +1724,39 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EBUSY; break; } - nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { /* reg. 
failed, release priv and ref */ netmap_adapter_put(na); priv->np_nifp = NULL; break; } + priv->np_td = td; // XXX kqueue, debugging only /* return the offset of the netmap_if object */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) { netmap_adapter_put(na); break; } if (memflags & NETMAP_MEM_PRIVATE) { - nmr->nr_ringid |= NETMAP_PRIV_MEM; *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; } + priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; + priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; + + if (nmr->nr_arg3) { + D("requested %d extra buffers", nmr->nr_arg3); + nmr->nr_arg3 = netmap_extra_alloc(na, + &nifp->ni_bufs_head, nmr->nr_arg3); + D("got %d extra buffers", nmr->nr_arg3); + } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); } while (0); NMG_UNLOCK(); @@ -1666,21 +1787,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ - if (cmd == NIOCTXSYNC) - netmap_txsync_to_host(na); - else - netmap_rxsync_from_host(na, NULL, NULL); - break; + if (cmd == NIOCTXSYNC) { + krings = na->tx_rings; + qfirst = priv->np_txqfirst; + qlast = priv->np_txqlast; + } else { + krings = na->rx_rings; + qfirst = priv->np_rxqfirst; + qlast = priv->np_rxqlast; } - /* find the last ring to scan */ - lim = priv->np_qlast; - if (lim == NETMAP_HW_RING) - lim = (cmd == NIOCTXSYNC) ? - na->num_tx_rings : na->num_rx_rings; - krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; - for (i = priv->np_qfirst; i < lim; i++) { + for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; if (nm_kr_tryget(kring)) { error = EBUSY; @@ -1694,14 +1811,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else { - na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + kring->nm_sync(kring, NAF_FORCE_RECLAIM); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(na, i, NAF_FORCE_READ); + kring->nm_sync(kring, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } nm_kr_put(kring); @@ -1772,9 +1889,9 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx; struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ + int is_kevent = 0; /* * In order to avoid nested locks, we need to "double check" @@ -1786,7 +1903,19 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) (void)pwait; mbq_init(&q); - if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + /* + * XXX kevent has curthread->tp_fop == NULL, + * so devfs_get_cdevpriv() fails. We circumvent this by passing + * priv as the first argument, which is also useful to avoid + * the selrecord() which are not necessary in that case. 
+ */ + if (devfs_get_cdevpriv((void **)&priv) != 0) { + is_kevent = 1; + if (netmap_verbose) + D("called from kevent"); + priv = (struct netmap_priv_d *)dev; + } + if (priv == NULL) return POLLERR; if (priv->np_nifp == NULL) { @@ -1811,28 +1940,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - lim_tx = na->num_tx_rings; - lim_rx = na->num_rx_rings; - - if (priv->np_qfirst == NETMAP_SW_RING) { - // XXX locking ? - /* handle the host stack ring */ - if (priv->np_txpoll || want_tx) { - /* push any packets up, then we are always ready */ - netmap_txsync_to_host(na); - revents |= want_tx; - } - if (want_rx) { - kring = &na->rx_rings[lim_rx]; - /* XXX replace with rxprologue etc. */ - if (nm_ring_empty(kring->ring)) - netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; - } - return (revents); - } - /* * check_all_{tx|rx} are set if the card has more than one queue AND @@ -1847,19 +1954,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * there are pending packets to send. The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ - check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); - check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); - - if (priv->np_qlast != NETMAP_HW_RING) { - lim_tx = lim_rx = priv->np_qlast; - } + check_all_tx = nm_tx_si_user(priv); + check_all_rx = nm_rx_si_user(priv); /* * We start with a lock free round which is cheap if we have * slots available. If this fails, then lock and call the sync * routines. */ - for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { + for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { kring = &na->rx_rings[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { @@ -1867,7 +1970,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_rx = 0; /* also breaks the loop */ } } - for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { + for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { kring = &na->tx_rings[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { @@ -1891,7 +1994,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst; i < lim_tx; i++) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { int found = 0; kring = &na->tx_rings[i]; @@ -1906,7 +2009,7 @@ flush_tx: netmap_ring_reinit(kring); revents |= POLLERR; } else { - if (na->nm_txsync(na, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; } @@ -1921,12 +2024,12 @@ flush_tx: if (found) { /* notify other listeners */ revents |= want_tx; want_tx = 0; - na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, i, NR_TX, 0); } } - if (want_tx && retry_tx) { + if (want_tx && retry_tx && !is_kevent) { selrecord(td, check_all_tx ? 
- &na->tx_si : &na->tx_rings[priv->np_qfirst].si); + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); retry_tx = 0; goto flush_tx; } @@ -1940,7 +2043,7 @@ flush_tx: int send_down = 0; /* transparent mode */ /* two rounds here to for race avoidance */ do_retry_rx: - for (i = priv->np_qfirst; i < lim_rx; i++) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { int found = 0; kring = &na->rx_rings[i]; @@ -1962,7 +2065,7 @@ do_retry_rx: netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(na, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { @@ -1974,24 +2077,26 @@ do_retry_rx: if (found) { revents |= want_rx; retry_rx = 0; - na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, i, NR_RX, 0); } } /* transparent mode XXX only during first pass ? */ - kring = &na->rx_rings[lim_rx]; - if (check_all_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { - /* XXX fix to use kring fields */ - if (nm_ring_empty(kring->ring)) - send_down = netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; + if (na->na_flags & NAF_HOST_RINGS) { + kring = &na->rx_rings[na->num_rx_rings]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } } - if (retry_rx) + if (retry_rx && !is_kevent) selrecord(td, check_all_rx ? - &na->rx_si : &na->rx_rings[priv->np_qfirst].si); + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) @@ -2032,14 +2137,14 @@ netmap_notify(struct netmap_adapter *na, u_int n_ring, if (tx == NR_TX) { kring = na->tx_rings + n_ring; - selwakeuppri(&kring->si, PI_NET); - if (flags & NAF_GLOBAL_NOTIFY) - selwakeuppri(&na->tx_si, PI_NET); + OS_selwakeup(&kring->si, PI_NET); + if (na->tx_si_users > 0) + OS_selwakeup(&na->tx_si, PI_NET); } else { kring = na->rx_rings + n_ring; - selwakeuppri(&kring->si, PI_NET); - if (flags & NAF_GLOBAL_NOTIFY) - selwakeuppri(&na->rx_si, PI_NET); + OS_selwakeup(&kring->si, PI_NET); + if (na->rx_si_users > 0) + OS_selwakeup(&na->rx_si, PI_NET); } return 0; } @@ -2090,6 +2195,7 @@ netmap_detach_common(struct netmap_adapter *na) D("freeing leftover tx_rings"); na->nm_krings_delete(na); } + netmap_pipe_dealloc(na); if (na->na_flags & NAF_MEM_OWNER) netmap_mem_private_delete(na->nm_mem); bzero(na, sizeof(*na)); @@ -2120,6 +2226,7 @@ netmap_attach(struct netmap_adapter *arg) if (hwna == NULL) goto fail; hwna->up = *arg; + hwna->up.na_flags |= NAF_HOST_RINGS; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; @@ -2177,12 +2284,10 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) return 1; } - int netmap_hw_krings_create(struct netmap_adapter *na) { - int ret = netmap_krings_create(na, - na->num_tx_rings + 1, na->num_rx_rings + 1, 0); + int ret = netmap_krings_create(na, 0); if (ret == 0) { /* initialize the mbq for the sw rx ring */ mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); @@ -2370,7 +2475,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. 
*/ - na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, n, tx, 0); return kring->ring->slot; } @@ -2405,15 +2510,13 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) return; // not a physical queue kring = na->rx_rings + q; kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? - na->nm_notify(na, q, NR_RX, - (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + na->nm_notify(na, q, NR_RX, 0); *work_done = 1; /* do not fire napi again */ } else { /* TX path */ if (q >= na->num_tx_rings) return; // not a physical queue kring = na->tx_rings + q; - na->nm_notify(na, q, NR_TX, - (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + na->nm_notify(na, q, NR_TX, 0); } } diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 6716168526dc..a8e287c6ddd8 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -29,8 +29,10 @@ #include <sys/module.h> #include <sys/errno.h> #include <sys/param.h> /* defines used in kernel.h */ +#include <sys/poll.h> /* POLLIN, POLLOUT */ #include <sys/kernel.h> /* types used in module initialization */ #include <sys/conf.h> /* DEV_MODULE */ +#include <sys/endian.h> #include <sys/rwlock.h> @@ -49,6 +51,8 @@ #include <net/if.h> #include <net/if_var.h> #include <machine/bus.h> /* bus_dmamap_* */ +#include <netinet/in.h> /* in6_cksum_pseudo() */ +#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */ #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> @@ -57,6 +61,73 @@ /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + uint16_t *words = (uint16_t *)data; + int nw = len / 2; + int i; + + for (i = 0; i < nw; i++) + cur_sum += be16toh(words[i]); + + if (len & 1) + cur_sum += (data[len-1] << 8); + + return cur_sum; +} + +/* Fold a raw checksum: 'cur_sum' is in host byte order, while the + * return value is in network byte order. + */ +uint16_t nm_csum_fold(rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + while (cur_sum >> 16) + cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); + + return htobe16((~cur_sum) & 0xFFFF); +} + +uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +{ +#if 0 + return in_cksum_hdr((void *)iph); +#else + return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); +#endif +} + +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check) +{ + uint16_t pseudolen = datalen + iph->protocol; + + /* Compute and insert the pseudo-header cheksum. */ + *check = in_pseudo(iph->saddr, iph->daddr, + htobe16(pseudolen)); + /* Compute the checksum on TCP/UDP header + payload + * (includes the pseudo-header). + */ + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +} + +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check) +{ +#ifdef INET6 + *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +#else + static int notsupported = 0; + if (!notsupported) { + notsupported = 1; + D("inet6 segmentation not supported"); + } +#endif +} + + /* * Intercept the rx routine in the standard device driver. 
* Second argument is non-zero to intercept, 0 to restore @@ -91,10 +162,7 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept) * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. - * - * actually we also need to redirect the if_transmit ? - * - * XXX see if FreeBSD has such a mechanism + * On freebsd we just intercept if_transmit. */ void netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) @@ -111,7 +179,8 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) } -/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success +/* + * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. @@ -162,38 +231,39 @@ void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called"); - *txq = 1; - *rxq = 1; + *txq = netmap_generic_rings; + *rxq = netmap_generic_rings; } -void netmap_mitigation_init(struct netmap_generic_adapter *na) +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na) { ND("called"); - na->mit_pending = 0; + mit->mit_pending = 0; + mit->mit_na = na; } -void netmap_mitigation_start(struct netmap_generic_adapter *na) +void netmap_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } -void netmap_mitigation_restart(struct netmap_generic_adapter *na) +void netmap_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } -int netmap_mitigation_active(struct netmap_generic_adapter *na) +int netmap_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; } -void netmap_mitigation_cleanup(struct netmap_generic_adapter *na) +void netmap_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -216,8 +286,10 @@ netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; - D("handle %p size %jd prot %d foff %jd", - handle, (intmax_t)size, prot, (intmax_t)foff); + + if (netmap_verbose) + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); dev_ref(vmh->dev); return 0; } @@ -229,7 +301,9 @@ netmap_dev_pager_dtor(void *handle) struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; - D("handle %p", handle); + + if (netmap_verbose) + D("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); @@ -302,8 +376,9 @@ netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, struct netmap_priv_d *priv; vm_object_t obj; - D("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); + if (netmap_verbose) + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); @@ -383,6 +458,157 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) return 0; } +/******************** kqueue support ****************/ + +/* + * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * We use a non-zero argument to distinguish the call from the one + * in kevent_scan() which instead also needs to run netmap_poll(). + * The knote uses a global mutex for the time being. 
We might + * try to reuse the one in the si, but it is not allocated + * permanently so it might be a bit tricky. + * + * The *kqfilter function registers one or another f_event + * depending on read or write mode. + * In the call to f_event() td_fpop is NULL so any child function + * calling devfs_get_cdevpriv() would fail - and we need it in + * netmap_poll(). As a workaround we store priv into kn->kn_hook + * and pass it as first argument to netmap_poll(), which then + * uses the failure to tell that we are called from f_event() + * and do not need the selrecord(). + */ + +void freebsd_selwakeup(struct selinfo *si, int pri); + +void +freebsd_selwakeup(struct selinfo *si, int pri) +{ + if (netmap_verbose) + D("on knote %p", &si->si_note); + selwakeuppri(si, pri); + /* use a non-zero hint to tell the notification from the + * call done in kqueue_scan() which uses 0 + */ + KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */); +} + +static void +netmap_knrdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_rxsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +static void +netmap_knwdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_txsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +/* + * callback from notifies (generated externally) and our + * calls to kevent(). The former we just return 1 (ready) + * since we do not know better. + * In the latter we call netmap_poll and return 0/1 accordingly. + */ +static int +netmap_knrw(struct knote *kn, long hint, int events) +{ + struct netmap_priv_d *priv; + int revents; + + if (hint != 0) { + ND(5, "call from notify"); + return 1; /* assume we are ready */ + } + priv = kn->kn_hook; + /* the notification may come from an external thread, + * in which case we do not want to run the netmap_poll + * This should be filtered above, but check just in case. + */ + if (curthread != priv->np_td) { /* should not happen */ + RD(5, "curthread changed %p %p", curthread, priv->np_td); + return 1; + } else { + revents = netmap_poll((void *)priv, events, curthread); + return (events & revents) ? 1 : 0; + } +} + +static int +netmap_knread(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLIN); +} + +static int +netmap_knwrite(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLOUT); +} + +static struct filterops netmap_rfiltops = { + .f_isfd = 1, + .f_detach = netmap_knrdetach, + .f_event = netmap_knread, +}; + +static struct filterops netmap_wfiltops = { + .f_isfd = 1, + .f_detach = netmap_knwdetach, + .f_event = netmap_knwrite, +}; + + +/* + * This is called when a thread invokes kevent() to record + * a change in the configuration of the kqueue(). + * The 'priv' should be the same as in the netmap device. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct netmap_priv_d *priv; + int error; + struct netmap_adapter *na; + struct selinfo *si; + int ev = kn->kn_filter; + + if (ev != EVFILT_READ && ev != EVFILT_WRITE) { + D("bad filter request %d", ev); + return 1; + } + error = devfs_get_cdevpriv((void**)&priv); + if (error) { + D("device not yet setup"); + return 1; + } + na = priv->np_na; + if (na == NULL) { + D("no netmap adapter for this file descriptor"); + return 1; + } + /* the si is indicated in the priv */ + si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; + // XXX lock(priv) ? 
+ kn->kn_fop = (ev == EVFILT_WRITE) ? + &netmap_wfiltops : &netmap_rfiltops; + kn->kn_hook = priv; + knlist_add(&si->si_note, kn, 1); + // XXX unlock(priv) + ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", + na, na->ifp->if_xname, curthread, priv, kn, + priv->np_nifp, + kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); + return 0; +} struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, @@ -391,9 +617,10 @@ struct cdevsw netmap_cdevsw = { .d_mmap_single = netmap_mmap_single, .d_ioctl = netmap_ioctl, .d_poll = netmap_poll, + .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; - +/*--- end of kqueue support ----*/ /* * Kernel entry point. diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index e695fcbd29f8..63253b6b0693 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #define rtnl_lock() D("rtnl_lock called"); #define rtnl_unlock() D("rtnl_unlock called"); #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() /* @@ -222,6 +223,17 @@ generic_netmap_register(struct netmap_adapter *na, int enable) #endif /* REG_RESET */ if (enable) { /* Enable netmap mode. */ + /* Init the mitigation support. */ + gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!gna->mit) { + D("mitigation allocation failed"); + error = ENOMEM; + goto out; + } + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_init(&gna->mit[r], na); + /* Initialize the rx queue, as generic_rx_handler() can * be called as soon as netmap_catch_rx() returns. */ @@ -229,9 +241,6 @@ generic_netmap_register(struct netmap_adapter *na, int enable) mbq_safe_init(&na->rx_rings[r].rx_queue); } - /* Init the mitigation timer. */ - netmap_mitigation_init(gna); - /* * Preallocate packet buffers for the tx rings. */ @@ -306,7 +315,9 @@ generic_netmap_register(struct netmap_adapter *na, int enable) mbq_safe_destroy(&na->rx_rings[r].rx_queue); } - netmap_mitigation_cleanup(gna); + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_cleanup(&gna->mit[r]); + free(gna->mit, M_DEVBUF); for (r=0; r<na->num_tx_rings; r++) { for (i=0; i<na->num_tx_desc; i++) { @@ -344,10 +355,12 @@ free_tx_pools: free(na->tx_rings[r].tx_pool, M_DEVBUF); na->tx_rings[r].tx_pool = NULL; } - netmap_mitigation_cleanup(gna); for (r=0; r<na->num_rx_rings; r++) { + netmap_mitigation_cleanup(&gna->mit[r]); mbq_safe_destroy(&na->rx_rings[r].rx_queue); } + free(gna->mit, M_DEVBUF); +out: return error; } @@ -557,12 +570,11 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); + IFRATE(rate_ctx.new.txpkt ++); } /* Update hwcur to the next slot to transmit. */ kring->nr_hwcur = nm_i; /* not head, we could break early */ - - IFRATE(rate_ctx.new.txpkt += ntx); } /* @@ -600,7 +612,11 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m) struct netmap_adapter *na = NA(ifp); struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; u_int work_done; - u_int rr = 0; // receive ring number + u_int rr = MBUF_RXQ(m); // receive ring number + + if (rr >= na->num_rx_rings) { + rr = rr % na->num_rx_rings; // XXX expensive... 
+ } /* limit the size of the queue */ if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { @@ -617,13 +633,13 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m) /* same as send combining, filter notification if there is a * pending timer, otherwise pass it up and start a timer. */ - if (likely(netmap_mitigation_active(gna))) { + if (likely(netmap_mitigation_active(&gna->mit[rr]))) { /* Record that there is some pending work. */ - gna->mit_pending = 1; + gna->mit[rr].mit_pending = 1; } else { netmap_generic_irq(na->ifp, rr, &work_done); IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(gna); + netmap_mitigation_start(&gna->mit[rr]); } } } @@ -682,7 +698,6 @@ generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) ring->slot[nm_i].flags = slot_flags; m_freem(m); nm_i = nm_next(nm_i, lim); - n++; } if (n) { kring->nr_hwtail = nm_i; @@ -772,7 +787,7 @@ generic_netmap_attach(struct ifnet *ifp) /* when using generic, IFCAP_NETMAP is set so we force * NAF_SKIP_INTR to use the regular interrupt handler */ - na->na_flags = NAF_SKIP_INTR; + na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", ifp->num_tx_queues, ifp->real_num_tx_queues, diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 668e083e0b96..ddcb0e3185a6 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -35,6 +35,7 @@ #define _NET_NETMAP_KERN_H_ #define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES #if defined(__FreeBSD__) @@ -267,11 +268,11 @@ struct netmap_kring { volatile int nkr_stopped; // XXX what for ? - /* support for adapters without native netmap support. + /* Support for adapters without native netmap support. * On tx rings we preallocate an array of tx buffers * (same size as the netmap ring), on rx rings we - * store incoming packets in a queue. - * XXX who writes to the rx queue ? + * store incoming mbufs in a queue that is drained by + * a rxsync. */ struct mbuf **tx_pool; // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ @@ -280,6 +281,13 @@ struct netmap_kring { uint32_t ring_id; /* debugging */ char name[64]; /* diagnostic */ + int (*nm_sync)(struct netmap_kring *kring, int flags); + +#ifdef WITH_PIPES + struct netmap_kring *pipe; + struct netmap_ring *save_ring; +#endif /* WITH_PIPES */ + } __attribute__((__aligned__(64))); @@ -388,6 +396,7 @@ struct netmap_adapter { * emulated. Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ +#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ @@ -411,6 +420,9 @@ struct netmap_adapter { NM_SELINFO_T tx_si, rx_si; /* global wait queues */ + /* count users of the global wait queues */ + int tx_si_users, rx_si_users; + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ @@ -438,9 +450,11 @@ struct netmap_adapter { * * nm_config() returns configuration information from the OS * - * nm_krings_create() XXX + * nm_krings_create() create and init the krings array + * (the array layout must conform to the description + * found above the definition of netmap_krings_create) * - * nm_krings_delete() XXX + * nm_krings_delete() cleanup and delete the kring array * * nm_notify() is used to act after data have become available. 
* For hw devices this is typically a selwakeup(), @@ -464,7 +478,6 @@ struct netmap_adapter { void (*nm_krings_delete)(struct netmap_adapter *); int (*nm_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); -#define NAF_GLOBAL_NOTIFY 4 #define NAF_DISABLE_NOTIFY 8 /* standard refcount to control the lifetime of the adapter @@ -484,6 +497,12 @@ struct netmap_adapter { * from userspace */ void *na_private; + +#ifdef WITH_PIPES + struct netmap_pipe_adapter **na_pipes; + int na_next_pipe; + int na_max_pipes; +#endif /* WITH_PIPES */ }; @@ -514,7 +533,10 @@ struct netmap_vp_adapter { /* VALE software port */ struct nm_bridge *na_bdg; int retry; - u_int offset; /* Offset of ethernet header for each packet. */ + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + /* Maximum Frame Size, used in bdg_mismatch_datapath() */ + u_int mfs; }; @@ -524,6 +546,12 @@ struct netmap_hw_adapter { /* physical device */ struct net_device_ops nm_ndo; // XXX linux only }; +/* Mitigation support. */ +struct nm_generic_mit { + struct hrtimer mit_timer; + int mit_pending; + struct netmap_adapter *mit_na; /* backpointer */ +}; struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; @@ -534,18 +562,29 @@ struct netmap_generic_adapter { /* emulated device */ /* generic netmap adapters support: * a net_device_ops struct overrides ndo_select_queue(), * save_if_input saves the if_input hook (FreeBSD), - * mit_timer and mit_pending implement rx interrupt mitigation, + * mit implements rx interrupt mitigation, */ struct net_device_ops generic_ndo; void (*save_if_input)(struct ifnet *, struct mbuf *); - struct hrtimer mit_timer; - int mit_pending; + struct nm_generic_mit *mit; #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif }; +static __inline int +netmap_real_tx_rings(struct netmap_adapter *na) +{ + return na->num_tx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + +static __inline int +netmap_real_rx_rings(struct netmap_adapter *na) +{ + return na->num_rx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + #ifdef WITH_VALE /* @@ -614,6 +653,25 @@ struct netmap_bwrap_adapter { #endif /* WITH_VALE */ +#ifdef WITH_PIPES + +#define NM_MAXPIPES 64 /* max number of pipes per adapter */ + +struct netmap_pipe_adapter { + struct netmap_adapter up; + + u_int id; /* pipe identifier */ + int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ + + struct netmap_adapter *parent; /* adapter that owns the memory */ + struct netmap_pipe_adapter *peer; /* the other end of the pipe */ + int peer_ref; /* 1 iff we are holding a ref to the peer */ + + u_int parent_slot; /* index in the parent pipe array */ +}; + +#endif /* WITH_PIPES */ + /* return slots reserved to rx clients; used in drivers */ static inline uint32_t @@ -767,9 +825,8 @@ uint32_t nm_rxsync_prologue(struct netmap_kring *); static inline void nm_txsync_finalize(struct netmap_kring *kring) { - /* update ring head/tail to what the kernel knows */ + /* update ring tail to what the kernel knows */ kring->ring->tail = kring->rtail = kring->nr_hwtail; - kring->ring->head = kring->rhead = kring->nr_hwcur; /* note, head/rhead/hwcur might be behind cur/rcur * if no carrier @@ -819,14 +876,14 @@ nm_rxsync_finalize(struct netmap_kring *kring) * Support routines to be used with the VALE switch */ int netmap_update_config(struct netmap_adapter *na); -int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom); +int netmap_krings_create(struct netmap_adapter *na, 
u_int tailroom); void netmap_krings_delete(struct netmap_adapter *na); int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, - uint16_t ringid, int *err); + uint16_t ringid, uint32_t flags, int *err); @@ -868,6 +925,20 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); #define netmap_bdg_ctl(_1, _2) EINVAL #endif /* !WITH_VALE */ +#ifdef WITH_PIPES +/* max number of pipes per device */ +#define NM_MAXPIPES 64 /* XXX how many? */ +/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */ +int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr); +void netmap_pipe_dealloc(struct netmap_adapter *); +int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +#else /* !WITH_PIPES */ +#define NM_MAXPIPES 0 +#define netmap_pipe_alloc(_1, _2) EOPNOTSUPP +#define netmap_pipe_dealloc(_1) +#define netmap_get_pipe_na(_1, _2, _3) 0 +#endif + /* Various prototypes */ int netmap_poll(struct cdev *dev, int events, struct thread *td); int netmap_init(void); @@ -938,6 +1009,7 @@ enum { /* verbose flags */ extern int netmap_txsync_retry; extern int netmap_generic_mit; extern int netmap_generic_ringsize; +extern int netmap_generic_rings; /* * NA returns a pointer to the struct netmap adapter from the ifp, @@ -1160,13 +1232,21 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + uint32_t np_flags; /* from the ioctl */ + u_int np_txqfirst, np_txqlast; /* range of tx rings to scan */ + u_int np_rxqfirst, np_rxqlast; /* range of rx rings to scan */ + uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ /* np_refcount is only used on FreeBSD */ int np_refcount; /* use with NMG_LOCK held */ + + /* pointers to the selinfo to be used for selrecord. + * Either the local or the global one depending on the + * number of rings. + */ + NM_SELINFO_T *np_rxsi, *np_txsi; + struct thread *np_td; /* kqueue, just debugging */ }; @@ -1188,10 +1268,113 @@ void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. */ -void netmap_mitigation_init(struct netmap_generic_adapter *na); -void netmap_mitigation_start(struct netmap_generic_adapter *na); -void netmap_mitigation_restart(struct netmap_generic_adapter *na); -int netmap_mitigation_active(struct netmap_generic_adapter *na); -void netmap_mitigation_cleanup(struct netmap_generic_adapter *na); +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na); +void netmap_mitigation_start(struct nm_generic_mit *mit); +void netmap_mitigation_restart(struct nm_generic_mit *mit); +int netmap_mitigation_active(struct nm_generic_mit *mit); +void netmap_mitigation_cleanup(struct nm_generic_mit *mit); + + + +/* Shared declarations for the VALE switch. */ + +/* + * Each transmit queue accumulates a batch of packets into + * a structure before forwarding. Packets to the same + * destination are put in a list using ft_next as a link field. + * ft_frags and ft_next are valid only on the first fragment. 
+ */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *ft_buf; /* netmap or indirect buffer */ + uint8_t ft_frags; /* how many fragments (only on 1st frag) */ + uint8_t _ft_port; /* dst port (unused) */ + uint16_t ft_flags; /* flags, e.g. indirect */ + uint16_t ft_len; /* src fragment len */ + uint16_t ft_next; /* next packet to same destination */ +}; + +/* struct 'virtio_net_hdr' from linux. */ +struct nm_vnet_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; +}; + +#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ + +/* Private definitions for IPv4, IPv6, UDP and TCP headers. */ + +struct nm_iphdr { + uint8_t version_ihl; + uint8_t tos; + uint16_t tot_len; + uint16_t id; + uint16_t frag_off; + uint8_t ttl; + uint8_t protocol; + uint16_t check; + uint32_t saddr; + uint32_t daddr; + /*The options start here. */ +}; + +struct nm_tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint8_t doff; /* Data offset + Reserved */ + uint8_t flags; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; +}; + +struct nm_udphdr { + uint16_t source; + uint16_t dest; + uint16_t len; + uint16_t check; +}; + +struct nm_ipv6hdr { + uint8_t priority_version; + uint8_t flow_lbl[3]; + + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; + + uint8_t saddr[16]; + uint8_t daddr[16]; +}; + +/* Type used to store a checksum (in host byte order) that hasn't been + * folded yet. + */ +#define rawsum_t uint32_t + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_csum_ipv4(struct nm_iphdr *iph); +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check); +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check); +uint16_t nm_csum_fold(rawsum_t cur_sum); + +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany); #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index 55f598518434..5491845090e7 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -82,6 +82,21 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; +struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { + [NETMAP_IF_POOL] = { + .size = 1024, + .num = 1, + }, + [NETMAP_RING_POOL] = { + .size = 5*PAGE_SIZE, + .num = 4, + }, + [NETMAP_BUF_POOL] = { + .size = 2048, + .num = 4098, + }, +}; + /* * nm_mem is the memory allocator used for all physical interfaces @@ -118,9 +133,16 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ .config = netmap_mem_global_config, .finalize = netmap_mem_global_finalize, .deref = netmap_mem_global_deref, + + .nm_id = 1, + + .prev = &nm_mem, + .next = &nm_mem, }; +struct netmap_mem_d *netmap_last_mem_d = &nm_mem; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ @@ -135,7 +157,7 @@ const struct netmap_mem_d nm_blueprint = { .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 1, - .nummax = 10, + .nummax = 100, }, [NETMAP_RING_POOL] = { .name = "%s_ring", @@ -172,13 +194,67 @@ const struct netmap_mem_d nm_blueprint = { SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ + CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ + "Default size of private netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ + CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ + "Default number of private netmap " STRINGIFY(name) "s") SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + nm_memid_t id; + struct netmap_mem_d *scan = netmap_last_mem_d; + int error = ENOMEM; + + NMA_LOCK(&nm_mem); + + do { + /* we rely on unsigned wrap around */ + id = scan->nm_id + 1; + if (id == 0) /* reserve 0 as error value */ + id = 1; + scan = scan->next; + if (id != scan->nm_id) { + nmd->nm_id = id; + nmd->prev = scan->prev; + nmd->next = scan; + scan->prev->next = nmd; + scan->prev = nmd; + netmap_last_mem_d = nmd; + error = 0; + break; + } + } while (scan != netmap_last_mem_d); + + NMA_UNLOCK(&nm_mem); + return error; +} + +static void +nm_mem_release_id(struct netmap_mem_d *nmd) +{ + NMA_LOCK(&nm_mem); + + nmd->prev->next = nmd->next; + nmd->next->prev = nmd->prev; + + if (netmap_last_mem_d == nmd) + netmap_last_mem_d = nmd->prev; + + nmd->prev = nmd->next = NULL; + + NMA_UNLOCK(&nm_mem); +} + + /* * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. @@ -216,7 +292,8 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) } int -netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, + nm_memid_t *id) { int error = 0; NMA_LOCK(nmd); @@ -234,6 +311,7 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) } } *memflags = nmd->flags; + *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; @@ -343,21 +421,34 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ /* - * free by index, not by address. This is slow, but is only used - * for a small number of objects (rings, nifp) + * free by index, not by address. + * XXX should we also cleanup the content ? 
*/ -static void +static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { + uint32_t *ptr, mask; + if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); - return; + return 1; + } + ptr = &p->bitmap[j / 32]; + mask = (1 << (j % 32)); + if (*ptr & mask) { + D("ouch, double free on buffer %d", j); + return 1; + } else { + *ptr |= mask; + p->objfree++; + return 0; } - p->bitmap[j / 32] |= (1 << (j % 32)); - p->objfree++; - return; } +/* + * free by address. This is slow but is only used for a few + * objects (rings, nifp) + */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { @@ -388,9 +479,63 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) +#if 0 // XXX unused /* Return the index associated to the given packet buffer */ #define netmap_buf_index(n, v) \ (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) +#endif + +/* + * allocate extra buffers in a linked list. + * returns the actual number. + */ +uint32_t +netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) +{ + struct netmap_mem_d *nmd = na->nm_mem; + uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ + + NMA_LOCK(nmd); + + *head = 0; /* default, 'null' index ie empty list */ + for (i = 0 ; i < n; i++) { + uint32_t cur = *head; /* save current head */ + uint32_t *p = netmap_buf_malloc(nmd, &pos, head); + if (p == NULL) { + D("no more buffers after %d of %d", i, n); + *head = cur; /* restore */ + break; + } + RD(5, "allocate buffer %d -> %d", *head, cur); + *p = cur; /* link to previous head */ + } + + NMA_UNLOCK(nmd); + + return i; +} + +static void +netmap_extra_free(struct netmap_adapter *na, uint32_t head) +{ + struct lut_entry *lut = na->na_lut; + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + uint32_t i, cur, *buf; + + D("freeing the extra list"); + for (i = 0; head >=2 && head < p->objtotal; i++) { + cur = head; + buf = lut[head].vaddr; + head = *buf; + *buf = 0; + if (netmap_obj_free(p, cur)) + break; + } + if (head != 0) + D("breaking with head %d", head); + D("freed %d buffers", i); +} /* Return nonzero on error */ @@ -425,6 +570,19 @@ cleanup: return (ENOMEM); } +static void +netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) +{ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i; + + for (i = 0; i < n; i++) { + slot[i].buf_idx = index; + slot[i].len = p->_objsize; + slot[i].flags = 0; + } +} + static void netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) @@ -438,6 +596,18 @@ netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) netmap_obj_free(p, i); } + +static void +netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) +{ + u_int i; + + for (i = 0; i < n; i++) { + if (slot[i].buf_idx > 2) + netmap_free_buf(nmd, slot[i].buf_idx); + } +} + static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { @@ -677,7 +847,9 @@ static void netmap_mem_reset_all(struct netmap_mem_d *nmd) { int i; - D("resetting %p", nmd); + + if (netmap_verbose) + D("resetting %p", nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } @@ -703,12 +875,14 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - 
nmd->pools[NETMAP_IF_POOL].memtotal >> 10, - nmd->pools[NETMAP_RING_POOL].memtotal >> 10, - nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + if (netmap_verbose) + D("interfaces %d KB, rings %d KB, buffers %d MB", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); - D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + if (netmap_verbose) + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); return 0; @@ -724,10 +898,13 @@ netmap_mem_private_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; - D("deleting %p", nmd); + if (netmap_verbose) + D("deleting %p", nmd); if (nmd->refcount > 0) D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); - D("done deleting %p", nmd); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, M_DEVBUF); } @@ -762,35 +939,70 @@ netmap_mem_private_deref(struct netmap_mem_d *nmd) NMA_UNLOCK(nmd); } + +/* + * allocator for private memory + */ struct netmap_mem_d * -netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd) +netmap_mem_private_new(const char *name, u_int txr, u_int txd, + u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr) { struct netmap_mem_d *d = NULL; struct netmap_obj_params p[NETMAP_POOLS_NR]; - int i; - u_int maxd; + int i, err; + u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), M_DEVBUF, M_NOWAIT | M_ZERO); - if (d == NULL) - return NULL; + if (d == NULL) { + err = ENOMEM; + goto error; + } *d = nm_blueprint; - /* XXX the rest of the code assumes the stack rings are alwasy present */ + err = nm_mem_assign_id(d); + if (err) + goto error; + + /* account for the fake host rings */ txr++; rxr++; - p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) + - sizeof(ssize_t) * (txr + rxr); - p[NETMAP_IF_POOL].num = 2; + + /* copy the min values */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + p[i] = netmap_min_priv_params[i]; + } + + /* possibly increase them to fit user request */ + v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); + if (p[NETMAP_IF_POOL].size < v) + p[NETMAP_IF_POOL].size = v; + v = 2 + 4 * npipes; + if (p[NETMAP_IF_POOL].num < v) + p[NETMAP_IF_POOL].num = v; maxd = (txd > rxd) ? txd : rxd; - p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) + - sizeof(struct netmap_slot) * maxd; - p[NETMAP_RING_POOL].num = txr + rxr; - p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */ - p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2); + v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; + if (p[NETMAP_RING_POOL].size < v) + p[NETMAP_RING_POOL].size = v; + /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) + * and two rx rings (again, 1 normal and 1 fake host) + */ + v = txr + rxr + 8 * npipes; + if (p[NETMAP_RING_POOL].num < v) + p[NETMAP_RING_POOL].num = v; + /* for each pipe we only need the buffers for the 4 "real" rings. + * On the other end, the pipe ring dimension may be different from + * the parent port ring dimension. 
As a compromise, we allocate twice the + * space actually needed if the pipe rings were the same size as the parent rings + */ + v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; + /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ + if (p[NETMAP_BUF_POOL].num < v) + p[NETMAP_BUF_POOL].num = v; - D("req if %d*%d ring %d*%d buf %d*%d", + if (netmap_verbose) + D("req if %d*%d ring %d*%d buf %d*%d", p[NETMAP_IF_POOL].num, p[NETMAP_IF_POOL].size, p[NETMAP_RING_POOL].num, @@ -802,8 +1014,9 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, nm_blueprint.pools[i].name, name); - if (netmap_config_obj_allocator(&d->pools[i], - p[i].num, p[i].size)) + err = netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size); + if (err) goto error; } @@ -814,6 +1027,8 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int return d; error: netmap_mem_private_delete(d); + if (perr) + *perr = err; return NULL; } @@ -917,20 +1132,25 @@ netmap_mem_fini(void) static void netmap_free_rings(struct netmap_adapter *na) { - u_int i; + struct netmap_kring *kring; + struct netmap_ring *ring; if (!na->tx_rings) return; - for (i = 0; i < na->num_tx_rings + 1; i++) { - if (na->tx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; - } + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - for (i = 0; i < na->num_rx_rings + 1; i++) { - if (na->rx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; - } + for (/* cont'd from above */; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } } @@ -938,6 +1158,8 @@ netmap_free_rings(struct netmap_adapter *na) * * Allocate netmap rings and buffers for this card * The rings are contiguous, but have variable size. + * The kring array must follow the layout described + * in netmap_krings_create(). 
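+ * For reference, that layout is roughly:
+ *
+ *   tx_rings[0 .. num_tx_rings-1]  hardware tx krings
+ *   tx_rings[num_tx_rings]         host tx kring (fake if !NAF_HOST_RINGS)
+ *   rx_rings[0 .. num_rx_rings-1]  hardware rx krings
+ *   rx_rings[num_rx_rings]         host rx kring (fake if !NAF_HOST_RINGS)
+ *   tailroom                       end of the kring array
+ *
+ * which is why the loops below scan [tx_rings, rx_rings) and then
+ * [rx_rings, tailroom), and why index i == num_{tx,rx}_rings selects
+ * the (possibly fake) host ring.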
*/ int netmap_mem_rings_create(struct netmap_adapter *na) @@ -945,10 +1167,16 @@ netmap_mem_rings_create(struct netmap_adapter *na) struct netmap_ring *ring; u_int len, ndesc; struct netmap_kring *kring; + u_int i; NMA_LOCK(na->nm_mem); - for (kring = na->tx_rings; kring != na->rx_rings; kring++) { /* Transmit rings */ + /* transmit rings */ + for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->tx_rings); + continue; /* already created by somebody else */ + } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); @@ -971,14 +1199,27 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); ND("initializing slots for txring"); - if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring"); - goto cleanup; + if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); + goto cleanup; + } + } else { + /* this is a fake tx ring, set all indices to 0 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } } - for ( ; kring != na->tailroom; kring++) { /* Receive rings */ + /* receive rings */ + for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->rx_rings); + continue; /* already created by somebody else */ + } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); @@ -1001,10 +1242,18 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); ND("initializing slots for rxring %p", ring); - if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring"); - goto cleanup; + if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); + goto cleanup; + } + } else { + /* this is a fake rx ring, set all indices to 1 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1); } } @@ -1024,20 +1273,8 @@ void netmap_mem_rings_delete(struct netmap_adapter *na) { /* last instance, release bufs and rings */ - u_int i, lim; - struct netmap_kring *kring; - struct netmap_ring *ring; - NMA_LOCK(na->nm_mem); - for (kring = na->tx_rings; kring != na->tailroom; kring++) { - ring = kring->ring; - if (ring == NULL) - continue; - lim = kring->nkr_num_slots; - for (i = 0; i < lim; i++) - netmap_free_buf(na->nm_mem, ring->slot[i].buf_idx); - } netmap_free_rings(na); NMA_UNLOCK(na->nm_mem); @@ -1059,16 +1296,12 @@ netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) ssize_t base; /* handy for relative offsets between rings and nifp */ u_int i, len, ntx, nrx; - /* - * verify whether virtual port need the stack ring - */ - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ + /* account for the (eventually fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = 
na->num_rx_rings + 1; /* * the descriptor is followed inline by an array of offsets * to the tx and rx rings in the shared memory region. - * For virtual rx rings we also allocate an array of - * pointers to assign to nkr_leases. */ NMA_LOCK(na->nm_mem); @@ -1112,7 +1345,8 @@ netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) /* nothing to do */ return; NMA_LOCK(na->nm_mem); - + if (nifp->ni_bufs_head) + netmap_extra_free(na, nifp->ni_bufs_head); netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index 8e6c58cbc4ee..e83616a5195f 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -160,6 +160,7 @@ typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); +typedef uint16_t nm_memid_t; /* We implement two kinds of netmap_mem_d structures: * @@ -192,6 +193,11 @@ struct netmap_mem_d { netmap_mem_config_t config; netmap_mem_finalize_t finalize; netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; }; extern struct netmap_mem_d nm_mem; @@ -206,14 +212,16 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *); -int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); -struct netmap_mem_d* - netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd); +struct netmap_mem_d* netmap_mem_private_new(const char *name, + u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, + int* error); void netmap_mem_private_delete(struct netmap_mem_d *); #define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) +uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); #endif diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c new file mode 100644 index 000000000000..a776a2424577 --- /dev/null +++ b/sys/dev/netmap/netmap_offloadings.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/sockio.h> +#include <sys/socketvar.h> /* struct socket */ +#include <sys/socket.h> /* sockaddrs */ +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> + + + +/* This routine is called by bdg_mismatch_datapath() when it finishes + * accumulating bytes for a segment, in order to fix some fields in the + * segment headers (which still contain the same content as the header + * of the original GSO packet). 'buf' points to the beginning (e.g. + * the ethernet header) of the segment, and 'len' is its length. + */ +static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, + u_int segmented_bytes, u_int last_segment, + u_int tcp, u_int iphlen) +{ + struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + uint16_t *check = NULL; + uint8_t *check_data = NULL; + + if (iphlen == 20) { + /* Set the IPv4 "Total Length" field. */ + iph->tot_len = htobe16(len-14); + ND("ip total length %u", be16toh(ip->tot_len)); + + /* Set the IPv4 "Identification" field. */ + iph->id = htobe16(be16toh(iph->id) + idx); + ND("ip identification %u", be16toh(iph->id)); + + /* Compute and insert the IPv4 header checksum. */ + iph->check = 0; + iph->check = nm_csum_ipv4(iph); + ND("IP csum %x", be16toh(iph->check)); + } else {/* if (iphlen == 40) */ + /* Set the IPv6 "Payload Len" field. */ + ip6h->payload_len = htobe16(len-14-iphlen); + } + + if (tcp) { + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + + /* Set the TCP sequence number. */ + tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); + ND("tcp seq %u", be32toh(tcph->seq)); + + /* Zero the PSH and FIN TCP flags if this is not the last + segment. */ + if (!last_segment) + tcph->flags &= ~(0x8 | 0x1); + ND("last_segment %u", last_segment); + + check = &tcph->check; + check_data = (uint8_t *)tcph; + } else { /* UDP */ + struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + + /* Set the UDP 'Length' field. */ + udph->len = htobe16(len-14-iphlen); + + check = &udph->check; + check_data = (uint8_t *)udph; + } + + /* Compute and insert TCP/UDP checksum. */ + *check = 0; + if (iphlen == 20) + nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + else + nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + + ND("TCP/UDP csum %x", be16toh(*check)); +} + + +/* The VALE mismatch datapath implementation. 
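+ * It is invoked by nm_bdg_flush() when the source and destination
+ * ports have different virt_hdr_len.  It rebuilds the destination
+ * virtio-net header and, when the source port uses the virtio-net
+ * offloadings while the destination does not, performs the GSO
+ * segmentation and/or checksum computation in software while copying,
+ * possibly consuming several destination slots; *j and *howmany are
+ * updated accordingly.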
*/ +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany) +{ + struct netmap_slot *slot = NULL; + struct nm_vnet_hdr *vh = NULL; + /* Number of source slots to process. */ + u_int frags = ft_p->ft_frags; + struct nm_bdg_fwd *ft_end = ft_p + frags; + + /* Source and destination pointers. */ + uint8_t *dst, *src; + size_t src_len, dst_len; + + u_int j_start = *j; + u_int dst_slots = 0; + + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->virt_hdr_len && !dst_na->virt_hdr_len) { + vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + } + + /* Init source and dest pointers. */ + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + dst_len = src_len; + + /* We are processing the first input slot and there is a mismatch + * between source and destination virt_hdr_len (SHL and DHL). + * When the a client is using virtio-net headers, the header length + * can be: + * - 10: the header corresponds to the struct nm_vnet_hdr + * - 12: the first 10 bytes correspond to the struct + * virtio_net_hdr, and the last 2 bytes store the + * "mergeable buffers" info, which is an optional + * hint that can be zeroed for compability + * + * The destination header is therefore built according to the + * following table: + * + * SHL | DHL | destination header + * ----------------------------- + * 0 | 10 | zero + * 0 | 12 | zero + * 10 | 0 | doesn't exist + * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero + * 12 | 0 | doesn't exist + * 12 | 10 | copied from the first 10 bytes of source header + */ + bzero(dst, dst_na->virt_hdr_len); + if (na->virt_hdr_len && dst_na->virt_hdr_len) + memcpy(dst, src, sizeof(struct nm_vnet_hdr)); + /* Skip the virtio-net headers. */ + src += na->virt_hdr_len; + src_len -= na->virt_hdr_len; + dst += dst_na->virt_hdr_len; + dst_len = dst_na->virt_hdr_len + src_len; + + /* Here it could be dst_len == 0 (which implies src_len == 0), + * so we avoid passing a zero length fragment. + */ + if (dst_len == 0) { + ft_p++; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + dst_len = src_len; + } + + if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + u_int gso_bytes = 0; + /* Length of the GSO packet header. */ + u_int gso_hdr_len = 0; + /* Pointer to the GSO packet header. Assume it is in a single fragment. */ + uint8_t *gso_hdr = NULL; + /* Index of the current segment. */ + u_int gso_idx = 0; + /* Payload data bytes segmented so far (e.g. TCP data bytes). */ + u_int segmented_bytes = 0; + /* Length of the IP header (20 if IPv4, 40 if IPv6). */ + u_int iphlen = 0; + /* Is this a TCP or an UDP GSO packet? */ + u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) + == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; + + /* Segment the GSO packet contained into the input slots (frags). */ + while (ft_p != ft_end) { + size_t copy; + + /* Grab the GSO header if we don't have it. */ + if (!gso_hdr) { + uint16_t ethertype; + + gso_hdr = src; + + /* Look at the 'Ethertype' field to see if this packet + * is IPv4 or IPv6. + */ + ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); + if (ethertype == 0x0800) + iphlen = 20; + else /* if (ethertype == 0x86DD) */ + iphlen = 40; + ND(3, "type=%04x", ethertype); + + /* Compute gso_hdr_len. For TCP we need to read the + * content of the 'Data Offset' field. 
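+ * The offset is stored in the high nibble of tcph->doff and is
+ * expressed in 32-bit words, hence the 4*(tcph->doff >> 4) below;
+ * for UDP the header is always 8 bytes.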
+ */ + if (tcp) { + struct nm_tcphdr *tcph = + (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + + gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); + } else + gso_hdr_len = 14 + iphlen + 8; /* UDP */ + + ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, + dst_na->mfs); + + /* Advance source pointers. */ + src += gso_hdr_len; + src_len -= gso_hdr_len; + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + continue; + } + } + + /* Fill in the header of the current segment. */ + if (gso_bytes == 0) { + memcpy(dst, gso_hdr, gso_hdr_len); + gso_bytes = gso_hdr_len; + } + + /* Fill in data and update source and dest pointers. */ + copy = src_len; + if (gso_bytes + copy > dst_na->mfs) + copy = dst_na->mfs - gso_bytes; + memcpy(dst + gso_bytes, src, copy); + gso_bytes += copy; + src += copy; + src_len -= copy; + + /* A segment is complete or we have processed all the + the GSO payload bytes. */ + if (gso_bytes >= dst_na->mfs || + (src_len == 0 && ft_p + 1 == ft_end)) { + /* After raw segmentation, we must fix some header + * fields and compute checksums, in a protocol dependent + * way. */ + gso_fix_segment(dst, gso_bytes, gso_idx, + segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end, + tcp, iphlen); + + ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); + slot->len = gso_bytes; + slot->flags = 0; + segmented_bytes += gso_bytes - gso_hdr_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + gso_bytes = 0; + gso_idx++; + } + + /* Next input slot. */ + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + } + } + ND(3, "%d bytes segmented", segmented_bytes); + + } else { + /* Address of a checksum field into a destination slot. */ + uint16_t *check = NULL; + /* Accumulator for an unfolded checksum. */ + rawsum_t csum = 0; + + /* Process a non-GSO packet. */ + + /* Init 'check' if necessary. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (unlikely(vh->csum_offset + vh->csum_start > src_len)) + D("invalid checksum request"); + else + check = (uint16_t *)(dst + vh->csum_start + + vh->csum_offset); + } + + while (ft_p != ft_end) { + /* Init/update the packet checksum if needed. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (!dst_slots) + csum = nm_csum_raw(src + vh->csum_start, + src_len - vh->csum_start, 0); + else + csum = nm_csum_raw(src, src_len, csum); + } + + /* Round to a multiple of 64 */ + src_len = (src_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, src_len)) { + /* Invalid user pointer, pretend len is 0. */ + dst_len = 0; + } + } else { + memcpy(dst, src, (int)src_len); + } + slot->len = dst_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + /* Next source slot. */ + ft_p++; + src = ft_p->ft_buf; + dst_len = src_len = ft_p->ft_len; + + } + + /* Finalize (fold) the checksum if needed. */ + if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + *check = nm_csum_fold(csum); + } + ND(3, "using %u dst_slots", dst_slots); + + /* A second pass on the desitations slots to set the slot flags, + * using the right number of destination slots. 
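+ * As in the non-mismatch path, the number of destination slots used
+ * is stored in the high byte of slot->flags, and NS_MOREFRAG is set
+ * on every slot but the last one.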
+ */ + while (j_start != *j) { + slot = &ring->slot[j_start]; + slot->flags = (dst_slots << 8)| NS_MOREFRAG; + j_start = nm_next(j_start, lim); + } + /* Clear NS_MOREFRAG flag on last entry. */ + slot->flags = (dst_slots << 8); + } + + /* Update howmany. */ + if (unlikely(dst_slots > *howmany)) { + dst_slots = *howmany; + D("Slot allocation error: Should never happen"); + } + *howmany -= dst_slots; +} diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c new file mode 100644 index 000000000000..f8f29fa1770a --- /dev/null +++ b/sys/dev/netmap/netmap_pipe.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/socket.h> /* sockaddrs */ +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/refcount.h> + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#ifdef WITH_PIPES + +#define NM_PIPE_MAXSLOTS 4096 + +int netmap_default_pipes = 0; /* default number of pipes for each nic */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); + +/* allocate the pipe array in the parent adapter */ +int +netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr) +{ + size_t len; + int mode = nmr->nr_flags & NR_REG_MASK; + u_int npipes; + + if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) { + /* this is for our parent, not for us */ + return 0; + } + + /* TODO: we can resize the array if the new + * request can accomodate the already existing pipes + */ + if (na->na_pipes) { + nmr->nr_arg1 = na->na_max_pipes; + return 0; + } + + npipes = nmr->nr_arg1; + if (npipes == 0) + npipes = netmap_default_pipes; + nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL); + + if (npipes == 0) { + /* really zero, nothing to alloc */ + goto out; + } + + len = sizeof(struct netmap_pipe_adapter *) * npipes; + na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->na_pipes == NULL) + return ENOMEM; + + na->na_max_pipes = npipes; + na->na_next_pipe = 0; + +out: + nmr->nr_arg1 = npipes; + + return 0; +} + +/* deallocate the parent array in the parent adapter */ +void +netmap_pipe_dealloc(struct netmap_adapter *na) +{ + if (na->na_pipes) { + ND("freeing pipes for %s", NM_IFPNAME(na->ifp)); + free(na->na_pipes, M_DEVBUF); + na->na_pipes = NULL; + na->na_max_pipes = 0; + na->na_next_pipe = 0; + } +} + +/* find a pipe endpoint with the given id among the parent's pipes */ +static struct netmap_pipe_adapter * +netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id) +{ + int i; + struct netmap_pipe_adapter *na; + + for (i = 0; i < parent->na_next_pipe; i++) { + na = parent->na_pipes[i]; + if (na->id == pipe_id) { + return na; + } + } + return NULL; +} + +/* add a new pipe endpoint to the parent array */ +static int +netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + if (parent->na_next_pipe >= parent->na_max_pipes) { + D("%s: no space left for pipes", NM_IFPNAME(parent->ifp)); + return ENOMEM; + } + + parent->na_pipes[parent->na_next_pipe] = na; + na->parent_slot = parent->na_next_pipe; + parent->na_next_pipe++; + return 0; +} + +/* remove the given pipe endpoint from the parent array */ +static void +netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + u_int n; + n = --parent->na_next_pipe; + if (n != na->parent_slot) { + parent->na_pipes[na->parent_slot] = + parent->na_pipes[n]; + } + parent->na_pipes[n] = NULL; +} + 
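+/* Pipe transmit path: packets are handed to the peer by swapping
+ * slots (and hence buffers) between the local tx ring and the peer's
+ * rx ring (accessed through save_ring, since the peer's visible ring
+ * pointer may be hidden), so no data copy takes place.  The new slots
+ * in [nr_hwcur, rhead) on the tx side are exchanged with free slots
+ * past nr_hwtail on the rx side, limited by the space available
+ * there, and the peer is then woken up via its nm_notify callback.
+ */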
+static int +netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *txkring = na->tx_rings + ring_nr, + *rxkring = txkring->pipe; + u_int limit; /* slots to transfer */ + u_int j, k, lim_tx = txkring->nkr_num_slots - 1, + lim_rx = rxkring->nkr_num_slots - 1; + int m, busy; + + ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name); + ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail); + + j = rxkring->nr_hwtail; /* RX */ + k = txkring->nr_hwcur; /* TX */ + m = txkring->rhead - txkring->nr_hwcur; /* new slots */ + if (m < 0) + m += txkring->nkr_num_slots; + limit = m; + m = rxkring->nkr_num_slots - 1; /* max avail space on destination */ + busy = j - rxkring->nr_hwcur; /* busy slots */ + if (busy < 0) + busy += txkring->nkr_num_slots; + m -= busy; /* subtract busy slots */ + ND(2, "m %d limit %d", m, limit); + if (m < limit) + limit = m; + + if (limit == 0) { + /* either the rxring is full, or nothing to send */ + nm_txsync_finalize(txkring); /* actually useless */ + return 0; + } + + while (limit-- > 0) { + struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *ts = &txkring->ring->slot[k]; + struct netmap_slot tmp; + + /* swap the slots */ + tmp = *rs; + *rs = *ts; + *ts = tmp; + + /* no need to report the buffer change */ + + j = nm_next(j, lim_rx); + k = nm_next(k, lim_tx); + } + + wmb(); /* make sure the slots are updated before publishing them */ + rxkring->nr_hwtail = j; + txkring->nr_hwcur = k; + txkring->nr_hwtail = nm_prev(k, lim_tx); + + nm_txsync_finalize(txkring); + ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail, j); + + wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */ + rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0); + + return 0; +} + +static int +netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *rxkring = na->rx_rings + ring_nr, + *txkring = rxkring->pipe; + uint32_t oldhwcur = rxkring->nr_hwcur; + + ND("%s %x <- %s", rxkring->name, flags, txkring->name); + rxkring->nr_hwcur = rxkring->rhead; /* recover user-relased slots */ + ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail, + rxkring->rcur, rxkring->rhead, rxkring->rtail); + rmb(); /* paired with the first wmb() in txsync */ + nm_rxsync_finalize(rxkring); + + if (oldhwcur != rxkring->nr_hwcur) { + /* we have released some slots, notify the other end */ + wmb(); /* make sure nr_hwcur is updated before notifying */ + txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0); + } + return 0; +} + +/* Pipe endpoints are created and destroyed together, so that endopoints do not + * have to check for the existence of their peer at each ?xsync. + * + * To play well with the existing netmap infrastructure (refcounts etc.), we + * adopt the following strategy: + * + * 1) The first endpoint that is created also creates the other endpoint and + * grabs a reference to it. + * + * state A) user1 --> endpoint1 --> endpoint2 + * + * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives + * its reference to the user: + * + * state B) user1 --> endpoint1 endpoint2 <--- user2 + * + * 3) Assume that, starting from state B endpoint2 is closed. 
In the unregister + * callback endpoint2 notes that endpoint1 is still active and adds a reference + * from endpoint1 to itself. When user2 then releases her own reference, + * endpoint2 is not destroyed and we are back to state A. A symmetrical state + * would be reached if endpoint1 were released instead. + * + * 4) If, starting from state A, endpoint1 is closed, the destructor notes that + * it owns a reference to endpoint2 and releases it. + * + * Something similar goes on for the creation and destruction of the krings. + */ + + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. We have to create both sets + * of krings. + * + * 2) state is + * + * usr1 --> e1 --> e2 + * + * and we are e2. e1 is certainly registered and our + * krings already exist, but they may be hidden. + */ +static int +netmap_pipe_krings_create(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int error = 0; + if (pna->peer_ref) { + int i; + + /* case 1) above */ + D("%p: case 1, create everything", na); + error = netmap_krings_create(na, 0); + if (error) + goto err; + + /* we also create all the rings, since we need to + * update the save_ring pointers. + * netmap_mem_rings_create (called by our caller) + * will not create the rings again + */ + + error = netmap_mem_rings_create(na); + if (error) + goto del_krings1; + + /* update our hidden ring pointers */ + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].save_ring = na->tx_rings[i].ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].save_ring = na->rx_rings[i].ring; + + /* now, create krings and rings of the other end */ + error = netmap_krings_create(ona, 0); + if (error) + goto del_rings1; + + error = netmap_mem_rings_create(ona); + if (error) + goto del_krings2; + + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].save_ring = ona->tx_rings[i].ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].save_ring = ona->rx_rings[i].ring; + + /* cross link the krings */ + for (i = 0; i < na->num_tx_rings; i++) { + na->tx_rings[i].pipe = pna->peer->up.rx_rings + i; + na->rx_rings[i].pipe = pna->peer->up.tx_rings + i; + pna->peer->up.tx_rings[i].pipe = na->rx_rings + i; + pna->peer->up.rx_rings[i].pipe = na->tx_rings + i; + } + } else { + int i; + /* case 2) above */ + /* recover the hidden rings */ + ND("%p: case 2, hidden rings", na); + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].ring = na->tx_rings[i].save_ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].ring = na->rx_rings[i].save_ring; + } + return 0; + +del_krings2: + netmap_krings_delete(ona); +del_rings1: + netmap_mem_rings_delete(na); +del_krings1: + netmap_krings_delete(na); +err: + return error; +} + +/* netmap_pipe_reg. + * + * There are two cases on registration (onoff==1) + * + * 1.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do. + * + * 1.b) state is + * + * usr1 --> e1 --> e2 <-- usr2 + * + * and we are e2. Drop the ref e1 is holding. + * + * There are two additional cases on unregister (onoff==0) + * + * 2.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do, e2 will + * be cleaned up by the destructor of e1. + * + * 2.b) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. Add a ref from the + * other end and hide our rings. 
+ */ +static int +netmap_pipe_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct ifnet *ifp = na->ifp; + ND("%p: onoff %d", na, onoff); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + if (pna->peer_ref) { + ND("%p: case 1.a or 2.a, nothing to do", na); + return 0; + } + if (onoff) { + ND("%p: case 1.b, drop peer", na); + pna->peer->peer_ref = 0; + netmap_adapter_put(na); + } else { + int i; + ND("%p: case 2.b, grab peer", na); + netmap_adapter_get(na); + pna->peer->peer_ref = 1; + /* hide our rings from netmap_mem_rings_delete */ + for (i = 0; i < na->num_tx_rings + 1; i++) { + na->tx_rings[i].ring = NULL; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + na->rx_rings[i].ring = NULL; + } + } + return 0; +} + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1 (e2 is not registered, so krings_delete cannot be + * called on it); + * + * 2) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. + * + * In the former case we have to also delete the krings of e2; + * in the latter case we do nothing (note that our krings + * have already been hidden in the unregister callback). + */ +static void +netmap_pipe_krings_delete(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona; /* na of the other end */ + int i; + + if (!pna->peer_ref) { + ND("%p: case 2, kept alive by peer", na); + return; + } + /* case 1) above */ + ND("%p: case 1, deleting everyhing", na); + netmap_krings_delete(na); /* also zeroes tx_rings etc. */ + /* restore the ring to be deleted on the peer */ + ona = &pna->peer->up; + if (ona->tx_rings == NULL) { + /* already deleted, we must be on an + * cleanup-after-error path */ + return; + } + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].ring = ona->tx_rings[i].save_ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].ring = ona->rx_rings[i].save_ring; + netmap_mem_rings_delete(ona); + netmap_krings_delete(ona); +} + + +static void +netmap_pipe_dtor(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + ND("%p", na); + if (pna->peer_ref) { + ND("%p: clean up peer", na); + pna->peer_ref = 0; + netmap_adapter_put(&pna->peer->up); + } + if (pna->role == NR_REG_PIPE_MASTER) + netmap_pipe_remove(pna->parent, pna); + netmap_adapter_put(pna->parent); + free(na->ifp, M_DEVBUF); + na->ifp = NULL; + pna->parent = NULL; +} + +int +netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + struct nmreq pnmr; + struct netmap_adapter *pna; /* parent adapter */ + struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp, *ifp2; + u_int pipe_id; + int role = nmr->nr_flags & NR_REG_MASK; + int error; + + ND("flags %x", nmr->nr_flags); + + if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) { + ND("not a pipe"); + return 0; + } + role = nmr->nr_flags & NR_REG_MASK; + + /* first, try to find the parent adapter */ + bzero(&pnmr, sizeof(pnmr)); + memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); + /* pass to parent the requested number of pipes */ + pnmr.nr_arg1 = nmr->nr_arg1; + error = netmap_get_na(&pnmr, &pna, create); + if (error) { + ND("parent lookup failed: %d", error); + return error; + } + ND("found parent: %s", NM_IFPNAME(pna->ifp)); + + if 
(NETMAP_OWNED_BY_KERN(pna)) { + ND("parent busy"); + error = EBUSY; + goto put_out; + } + + /* next, lookup the pipe id in the parent list */ + req = NULL; + pipe_id = nmr->nr_ringid & NETMAP_RING_MASK; + mna = netmap_pipe_find(pna, pipe_id); + if (mna) { + if (mna->role == role) { + ND("found %d directly at %d", pipe_id, mna->parent_slot); + req = mna; + } else { + ND("found %d indirectly at %d", pipe_id, mna->parent_slot); + req = mna->peer; + } + /* the pipe we have found already holds a ref to the parent, + * so we need to drop the one we got from netmap_get_na() + */ + netmap_adapter_put(pna); + goto found; + } + ND("pipe %d not found, create %d", pipe_id, create); + if (!create) { + error = ENODEV; + goto put_out; + } + /* we create both master and slave. + * The endpoint we were asked for holds a reference to + * the other one. + */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto put_out; + } + strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp)); + + mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (mna == NULL) { + error = ENOMEM; + goto free_ifp; + } + mna->up.ifp = ifp; + + mna->id = pipe_id; + mna->role = NR_REG_PIPE_MASTER; + mna->parent = pna; + + mna->up.nm_txsync = netmap_pipe_txsync; + mna->up.nm_rxsync = netmap_pipe_rxsync; + mna->up.nm_register = netmap_pipe_reg; + mna->up.nm_dtor = netmap_pipe_dtor; + mna->up.nm_krings_create = netmap_pipe_krings_create; + mna->up.nm_krings_delete = netmap_pipe_krings_delete; + mna->up.nm_mem = pna->nm_mem; + mna->up.na_lut = pna->na_lut; + mna->up.na_lut_objtotal = pna->na_lut_objtotal; + + mna->up.num_tx_rings = 1; + mna->up.num_rx_rings = 1; + mna->up.num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + mna->up.num_rx_desc = nmr->nr_rx_slots; + nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + error = netmap_attach_common(&mna->up); + if (error) + goto free_ifp; + /* register the master with the parent */ + error = netmap_pipe_add(pna, mna); + if (error) + goto free_mna; + + /* create the slave */ + ifp2 = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto free_mna; + } + strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp)); + + sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (sna == NULL) { + error = ENOMEM; + goto free_ifp2; + } + /* most fields are the same, copy from master and then fix */ + *sna = *mna; + sna->up.ifp = ifp2; + sna->role = NR_REG_PIPE_SLAVE; + error = netmap_attach_common(&sna->up); + if (error) + goto free_sna; + + /* join the two endpoints */ + mna->peer = sna; + sna->peer = mna; + + /* we already have a reference to the parent, but we + * need another one for the other endpoint we created + */ + netmap_adapter_get(pna); + + if (role == NR_REG_PIPE_MASTER) { + req = mna; + mna->peer_ref = 1; + netmap_adapter_get(&sna->up); + } else { + req = sna; + sna->peer_ref = 1; + netmap_adapter_get(&mna->up); + } + ND("created master %p and slave %p", mna, sna); +found: + + ND("pipe %d %s at %p", pipe_id, + (req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req); + *na = &req->up; + netmap_adapter_get(*na); + + /* write the configuration back */ + nmr->nr_tx_rings = req->up.num_tx_rings; + nmr->nr_rx_rings = req->up.num_rx_rings; + nmr->nr_tx_slots = req->up.num_tx_desc; + nmr->nr_rx_slots = req->up.num_rx_desc; + + /* keep the reference to the parent. 
+ * It will be released by the req destructor + */ + + return 0; + +free_sna: + free(sna, M_DEVBUF); +free_ifp2: + free(ifp2, M_DEVBUF); +free_mna: + free(mna, M_DEVBUF); +free_ifp: + free(ifp, M_DEVBUF); +put_out: + netmap_adapter_put(pna); + return error; +} + + +#endif /* WITH_PIPES */ diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index 13a725378c28..34e39126e525 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -164,21 +164,6 @@ static int netmap_bwrap_register(struct netmap_adapter *, int onoff); int kern_netmap_regif(struct nmreq *nmr); /* - * Each transmit queue accumulates a batch of packets into - * a structure before forwarding. Packets to the same - * destination are put in a list using ft_next as a link field. - * ft_frags and ft_next are valid only on the first fragment. - */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; /* netmap or indirect buffer */ - uint8_t ft_frags; /* how many fragments (only on 1st frag) */ - uint8_t _ft_port; /* dst port (unused) */ - uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src fragment len */ - uint16_t ft_next; /* next packet to same destination */ -}; - -/* * For each output interface, nm_bdg_q is used to construct a list. * bq_len is the number of output buffers (we can have coalescing * during the copy). @@ -381,7 +366,7 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) l += sizeof(struct nm_bdg_q) * num_dstq; l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; - nrings = na->num_tx_rings + 1; + nrings = netmap_real_tx_rings(na); kring = na->tx_rings; for (i = 0; i < nrings; i++) { struct nm_bdg_fwd *ft; @@ -421,7 +406,8 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) acquire BDG_WLOCK() and copy back the array. */ - D("detach %d and %d (lim %d)", hw, sw, lim); + if (netmap_verbose) + D("detach %d and %d (lim %d)", hw, sw, lim); /* make a copy of the list of active ports, update it, * and then copy back within BDG_WLOCK(). */ @@ -675,7 +661,7 @@ nm_bdg_attach(struct nmreq *nmr) goto unref_exit; } - nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error); + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { goto unref_exit; } @@ -855,15 +841,23 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) NMG_UNLOCK(); break; - case NETMAP_BDG_OFFSET: + case NETMAP_BDG_VNET_HDR: + /* Valid lengths for the virtio-net header are 0 (no header), + 10 and 12. 
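+ 10 corresponds to sizeof(struct nm_vnet_hdr); 12 additionally
+ carries the 2-byte "mergeable buffers" field used by some
+ virtio-net implementations.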
*/ + if (nmr->nr_arg1 != 0 && + nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && + nmr->nr_arg1 != 12) { + error = EINVAL; + break; + } NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; - if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET) - nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET; - vpna->offset = nmr->nr_arg1; - D("Using offset %d for %p", vpna->offset, vpna); + vpna->virt_hdr_len = nmr->nr_arg1; + if (vpna->virt_hdr_len) + vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); netmap_adapter_put(na); } NMG_UNLOCK(); @@ -877,26 +871,20 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) return error; } - static int netmap_vp_krings_create(struct netmap_adapter *na) { - u_int ntx, nrx, tailroom; + u_int tailroom; int error, i; uint32_t *leases; - - /* XXX vps do not need host rings, - * but we crash if we don't have one - */ - ntx = na->num_tx_rings + 1; - nrx = na->num_rx_rings + 1; + u_int nrx = netmap_real_rx_rings(na); /* * Leases are attached to RX rings on vale ports */ tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; - error = netmap_krings_create(na, ntx, nrx, tailroom); + error = netmap_krings_create(na, tailroom); if (error) return error; @@ -1212,16 +1200,16 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, u_int len = ft[i].ft_len; ND("slot %d frags %d", i, ft[i].ft_frags); - /* Drop the packet if the offset is not into the first + /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. */ - if (unlikely(na->offset > len)) + if (unlikely(na->virt_hdr_len > len)) continue; - if (len == na->offset) { + if (len == na->virt_hdr_len) { buf = ft[i+1].ft_buf; len = ft[i+1].ft_len; } else { - buf += na->offset; - len -= na->offset; + buf += na->virt_hdr_len; + len -= na->virt_hdr_len; } dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na); if (netmap_verbose > 255) @@ -1280,13 +1268,13 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na; struct netmap_kring *kring; struct netmap_ring *ring; - u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next; + u_int dst_nr, lim, j, d_i, next, brd_next; u_int needed, howmany; int retry = netmap_txsync_retry; struct nm_bdg_q *d; uint32_t my_start = 0, lease_idx = 0; int nrings; - int offset_mismatch; + int virt_hdr_mismatch = 0; d_i = dsts[i]; ND("second pass %d port %d", i, d_i); @@ -1311,8 +1299,6 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, goto cleanup; } - offset_mismatch = (dst_na->offset != na->offset); - /* there is at least one either unicast or broadcast packet */ brd_next = brddst->bq_head; next = d->bq_head; @@ -1325,6 +1311,29 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ needed = d->bq_len + brddst->bq_len; + if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { + /* There is a virtio-net header/offloadings mismatch between + * source and destination. The slower mismatch datapath will + * be used to cope with all the mismatches. + */ + virt_hdr_mismatch = 1; + if (dst_na->mfs < na->mfs) { + /* We may need to do segmentation offloadings, and so + * we may need a number of destination slots greater + * than the number of input slots ('needed'). 
+ * We look for the smallest integer 'x' which satisfies: + * needed * na->mfs + x * H <= x * na->mfs + * where 'H' is the length of the longest header that may + * be replicated in the segmentation process (e.g. for + * TCPv4 we must account for ethernet header, IP header + * and TCPv4 header). + */ + needed = (needed * na->mfs) / + (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; + ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); + } + } + ND(5, "pass 2 dst %d is %x %s", i, d_i, is_vp ? "virtual" : "nic/host"); dst_nr = d_i & (NM_BDG_MAXRINGS-1); @@ -1337,6 +1346,10 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, retry: + if (dst_na->retry && retry) { + /* try to get some free slot from the previous run */ + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + } /* reserve the buffers in the queue and an entry * to report completion, and drop lock. * XXX this might become a helper function. @@ -1346,9 +1359,6 @@ retry: mtx_unlock(&kring->q_lock); goto cleanup; } - if (dst_na->retry) { - dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); - } my_start = j = kring->nkr_hwlease; howmany = nm_kr_space(kring, 1); if (needed < howmany) @@ -1365,7 +1375,6 @@ retry: struct netmap_slot *slot; struct nm_bdg_fwd *ft_p, *ft_end; u_int cnt; - int fix_mismatch = offset_mismatch; /* find the queue from which we pick next packet. * NM_FT_NULL is always higher than valid indexes @@ -1383,58 +1392,43 @@ retry: cnt = ft_p->ft_frags; // cnt > 0 if (unlikely(cnt > howmany)) break; /* no more space */ - howmany -= cnt; if (netmap_verbose && cnt > 1) RD(5, "rx %d frags to %d", cnt, j); ft_end = ft_p + cnt; - do { - char *dst, *src = ft_p->ft_buf; - size_t copy_len = ft_p->ft_len, dst_len = copy_len; - - slot = &ring->slot[j]; - dst = BDG_NMB(&dst_na->up, slot); + if (unlikely(virt_hdr_mismatch)) { + bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); + } else { + howmany -= cnt; + do { + char *dst, *src = ft_p->ft_buf; + size_t copy_len = ft_p->ft_len, dst_len = copy_len; - if (unlikely(fix_mismatch)) { - /* We are processing the first fragment - * and there is a mismatch between source - * and destination offsets. Create a zeroed - * header for the destination, independently - * of the source header length and content. - */ - src += na->offset; - copy_len -= na->offset; - bzero(dst, dst_na->offset); - dst += dst_na->offset; - dst_len = dst_na->offset + copy_len; - /* fix the first fragment only */ - fix_mismatch = 0; - /* Here it could be copy_len == dst_len == 0, - * and so a zero length fragment is passed. 
- */ - } + slot = &ring->slot[j]; + dst = BDG_NMB(&dst_na->up, slot); - ND("send [%d] %d(%d) bytes at %s:%d", - i, (int)copy_len, (int)dst_len, - NM_IFPNAME(dst_ifp), j); - /* round to a multiple of 64 */ - copy_len = (copy_len + 63) & ~63; + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); + /* round to a multiple of 64 */ + copy_len = (copy_len + 63) & ~63; - if (ft_p->ft_flags & NS_INDIRECT) { - if (copyin(src, dst, copy_len)) { - // invalid user pointer, pretend len is 0 - dst_len = 0; - } - } else { - //memcpy(dst, src, copy_len); - pkt_copy(src, dst, (int)copy_len); - } - slot->len = dst_len; - slot->flags = (cnt << 8)| NS_MOREFRAG; - j = nm_next(j, lim); - ft_p++; - sent++; - } while (ft_p != ft_end); - slot->flags = (cnt << 8); /* clear flag on last entry */ + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, copy_len)) { + // invalid user pointer, pretend len is 0 + dst_len = 0; + } + } else { + //memcpy(dst, src, copy_len); + pkt_copy(src, dst, (int)copy_len); + } + slot->len = dst_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + needed--; + ft_p++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ + } /* are we done ? */ if (next == NM_FT_NULL && brd_next == NM_FT_NULL) break; @@ -1484,9 +1478,9 @@ retry: */ if (likely(j != my_start)) { kring->nr_hwtail = j; - dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); still_locked = 0; mtx_unlock(&kring->q_lock); + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); if (dst_na->retry && retry--) goto retry; } @@ -1615,6 +1609,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) struct netmap_vp_adapter *vpna; struct netmap_adapter *na; int error; + u_int npipes = 0; vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); if (vpna == NULL) @@ -1636,8 +1631,23 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) na->num_tx_desc = nmr->nr_tx_slots; nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, NULL); + /* validate number of pipes. We want at least 1, + * but probably can do with some more. + * So let's use 2 as default (when 0 is supplied) + */ + npipes = nmr->nr_arg1; + nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); + nmr->nr_arg1 = npipes; /* write back */ + /* validate extra bufs */ + nm_bound_var(&nmr->nr_arg3, 0, 0, + 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; - vpna->offset = 0; + vpna->virt_hdr_len = 0; + vpna->mfs = 1514; + /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
+ vpna->mfs = netmap_buf_size; */ + if (netmap_verbose) + D("max frame size %u", vpna->mfs); na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; na->nm_txsync = bdg_netmap_txsync; @@ -1648,14 +1658,21 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) na->nm_krings_delete = netmap_vp_krings_delete; na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc); + na->num_rx_rings, na->num_rx_desc, + nmr->nr_arg3, npipes, &error); + if (na->nm_mem == NULL) + goto err; /* other nmd fields are set in the common routine */ error = netmap_attach_common(na); - if (error) { - free(vpna, M_DEVBUF); - return error; - } + if (error) + goto err; return 0; + +err: + if (na->nm_mem != NULL) + netmap_mem_private_delete(na->nm_mem); + free(vpna, M_DEVBUF); + return error; } @@ -1763,19 +1780,17 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, ring->cur = kring->rcur; ring->tail = kring->rtail; - /* simulate a user wakeup on the rx ring */ if (is_host_ring) { - netmap_rxsync_from_host(na, NULL, NULL); vpna = hostna; ring_nr = 0; - } else { - /* fetch packets that have arrived. - * XXX maybe do this in a loop ? - */ - error = na->nm_rxsync(na, ring_nr, 0); - if (error) - goto put_out; - } + } + /* simulate a user wakeup on the rx ring */ + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? + */ + error = kring->nm_sync(kring, 0); + if (error) + goto put_out; if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { D("how strange, interrupt with no packets on %s", NM_IFPNAME(ifp)); @@ -1801,7 +1816,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, ring->tail = kring->rtail; /* another call to actually release the buffers */ if (!is_host_ring) { - error = na->nm_rxsync(na, ring_nr, 0); + error = kring->nm_sync(kring, 0); } else { /* mark all packets as released, as in the * second part of netmap_rxsync_from_host() @@ -1842,11 +1857,11 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. 
*/ - for (i = 0; i <= na->num_rx_rings; i++) { + for (i = 0; i < na->num_rx_rings + 1; i++) { hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; hwna->tx_rings[i].ring = na->rx_rings[i].ring; } - for (i = 0; i <= na->num_tx_rings; i++) { + for (i = 0; i < na->num_tx_rings + 1; i++) { hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots; hwna->rx_rings[i].ring = na->tx_rings[i].ring; } @@ -1914,8 +1929,10 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) return error; } - hostna->tx_rings = na->tx_rings + na->num_tx_rings; - hostna->rx_rings = na->rx_rings + na->num_rx_rings; + if (na->na_flags & NAF_HOST_RINGS) { + hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->rx_rings = na->rx_rings + na->num_rx_rings; + } return 0; } @@ -1957,6 +1974,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) return 0; + mtx_lock(&kring->q_lock); /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(na, ring_n, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", @@ -1972,12 +1990,8 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f */ /* set tail to what the hw expects */ ring->tail = hw_kring->rtail; - if (ring_n == na->num_rx_rings) { - netmap_txsync_to_host(hwna); - } else { - nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? - error = hwna->nm_txsync(hwna, ring_n, flags); - } + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? + error = hw_kring->nm_sync(hw_kring, flags); /* fourth step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ @@ -1991,7 +2005,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); - + mtx_unlock(&kring->q_lock); return error; } @@ -2047,18 +2061,21 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) bna->hwna = hwna; netmap_adapter_get(hwna); hwna->na_private = bna; /* weak reference */ - - hostna = &bna->host.up; - hostna->ifp = hwna->ifp; - hostna->num_tx_rings = 1; - hostna->num_tx_desc = hwna->num_rx_desc; - hostna->num_rx_rings = 1; - hostna->num_rx_desc = hwna->num_tx_desc; - // hostna->nm_txsync = netmap_bwrap_host_txsync; - // hostna->nm_rxsync = netmap_bwrap_host_rxsync; - hostna->nm_notify = netmap_bwrap_host_notify; - hostna->nm_mem = na->nm_mem; - hostna->na_private = bna; + + if (hwna->na_flags & NAF_HOST_RINGS) { + na->na_flags |= NAF_HOST_RINGS; + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + } ND("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index aea844bde1ce..647cd103600f 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -14,5 +14,7 @@ SRCS += netmap_generic.c SRCS += netmap_mbq.c netmap_mbq.h SRCS += netmap_vale.c SRCS += netmap_freebsd.c +SRCS += netmap_offloadings.c +SRCS += netmap_pipe.c .include 
<bsd.kmod.mk> diff --git a/sys/net/netmap.h b/sys/net/netmap.h index a5ee9b55edc9..f0b4c56d4e39 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -39,8 +39,10 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ -#define NETMAP_API 10 /* current API version */ +#define NETMAP_API 11 /* current API version */ +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 /* * Some fields should be cache-aligned to reduce contention. * The alignment is architecture and OS dependent, but rather than @@ -73,20 +75,21 @@ +===============+ / | buf_idx, len | slot[1] | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | | txring_ofs[1] | +---------------+ - (tx+1+extra_tx entries) (num_slots entries) + (tx+1 entries) (num_slots entries) | txring_ofs[t] | | buf_idx, len | slot[n-1] +---------------+ | flags, ptr | | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (rx+1+extra_rx entries) + (rx+1 entries) | rxring_ofs[r] | +---------------+ - * For each "interface" (NIC, host stack, VALE switch port) attached to a - * file descriptor, the mmap()ed region contains a (logically readonly) + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. + * * There is one netmap_ring per physical NIC ring, plus one tx/rx ring - * pair attached to the host stack (this pair is unused for VALE ports). + * pair attached to the host stack (this pair is unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. @@ -98,7 +101,42 @@ * is provided for user-supplied buffers in the tx path. * * In user space, the buffer address is computed as - * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. + * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions. 
+ * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side */ /* @@ -284,8 +322,8 @@ struct netmap_if { const uint32_t ni_tx_rings; /* number of HW tx rings */ const uint32_t ni_rx_rings; /* number of HW rx rings */ - const uint32_t ni_extra_tx_rings; - const uint32_t ni_extra_rx_rings; + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; /* * The following array contains the offset of each netmap ring * from this structure, in the following order: @@ -321,6 +359,7 @@ struct netmap_if { * * The actual argument (struct nmreq) has a number of options to request * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: * * nr_name (in) * The name of the port (em0, valeXXX:YYY, etc.) @@ -337,6 +376,13 @@ struct netmap_if { * * nr_ringid (in) * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. + * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: * 0 (default) binds all physical rings * NETMAP_HW_RING | ring number binds a single ring pair * NETMAP_SW_RING binds only the host tx/rx rings @@ -345,8 +391,41 @@ struct netmap_if { * packets on tx rings only if POLLOUT is set. * The default is to push any pending packet. * - * NETMAP_PRIV_MEM is set on return for ports that use private - * memory regions and cannot use buffer swapping. + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports the same region zero-copy is possible. + * + * nr_arg3 (in/out) number of extra buffers to be allocated. 
+ * + * * * nr_cmd (in) if non-zero indicates a special command: * NETMAP_BDG_ATTACH and nr_name = vale*:ifname @@ -362,17 +441,33 @@ struct netmap_if { * NETMAP_BDG_LIST * list the configuration of VALE switches. * - * NETMAP_BDG_OFFSET XXX ? - * Set the offset of data in packets. Used with VALE - * switches where the clients use the vhost header. + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. * - * nr_arg1, nr_arg2 (in/out) command specific + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * * */ /* - * struct nmreq overlays a struct ifreq + * struct nmreq overlays a struct ifreq (just the name) + * + * On input, nr_ringid indicates which rings we are requesting, + * with the low flags for the specific ring number. + * selection FLAGS RING INDEX + * + * all the NIC rings 0x0000 - + * only HOST ring 0x2000 ring index + * single NIC ring 0x4000 - + * all the NIC+HOST rings 0x6000 - + * one pipe ring, master 0x8000 ring index + * *** INVALID 0xA000 + * one pipe ring, slave 0xC000 ring index + * *** INVALID 0xE000 + * */ struct nmreq { char nr_name[IFNAMSIZ]; @@ -383,27 +478,47 @@ struct nmreq { uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* process the sw ring */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the ring number */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ -#define NETMAP_BDG_OFFSET 5 /* set the port offset */ +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ - uint16_t nr_arg1; + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ -#define NETMAP_BDG_MAX_OFFSET 12 uint16_t nr_arg2; - uint32_t spare2[3]; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; +}; + +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, }; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 /* diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 1bb337cf0ef7..9c3a4c1e5949 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -66,6 +66,7 @@ #define _NET_NETMAP_USER_H_ #include <stdint.h> +#include <sys/socket.h> /* apple needs sockaddr */ #include <net/if.h> /* IFNAMSIZ */ #ifndef likely @@ -104,12 +105,12 @@ nm_ring_next(struct netmap_ring *r, uint32_t i) /* * Return 1 if we have pending transmissions in the tx ring. 
- * When everything is complete ring->cur = ring->tail + 1 (modulo ring size) + * When everything is complete ring->head = ring->tail + 1 (modulo ring size) */ static inline int nm_tx_pending(struct netmap_ring *r) { - return nm_ring_next(r, r->tail) != r->cur; + return nm_ring_next(r, r->tail) != r->head; } @@ -142,13 +143,41 @@ nm_ring_space(struct netmap_ring *ring) #include <signal.h> #include <stdlib.h> -struct nm_hdr_t { /* same as pcap_pkthdr */ +#ifndef ND /* debug macros */ +/* debug support */ +#define ND(_fmt, ...) do {} while(0) +#define D(_fmt, ...) \ + do { \ + struct timeval t0; \ + gettimeofday(&t0, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", \ + (int)(t0.tv_sec % 1000), (int)t0.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) \ + do { \ + static int t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (t0 != __xxts.tv_sec) { \ + t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +struct nm_pkthdr { /* same as pcap_pkthdr */ struct timeval ts; uint32_t caplen; uint32_t len; }; -struct nm_stat_t { // pcap_stat +struct nm_stat { /* same as pcap_stat */ u_int ps_recv; u_int ps_drop; u_int ps_ifdrop; @@ -159,19 +188,29 @@ struct nm_stat_t { // pcap_stat #define NM_ERRBUF_SIZE 512 -struct nm_desc_t { - struct nm_desc_t *self; +struct nm_desc { + struct nm_desc *self; /* point to self if netmap. */ int fd; void *mem; int memsize; - struct netmap_if *nifp; + int done_mmap; /* set if mem is the result of mmap */ + struct netmap_if * const nifp; uint16_t first_tx_ring, last_tx_ring, cur_tx_ring; uint16_t first_rx_ring, last_rx_ring, cur_rx_ring; struct nmreq req; /* also contains the nr_name = ifname */ - struct nm_hdr_t hdr; - - struct netmap_ring *tx, *rx; /* shortcuts to base hw/sw rings */ + struct nm_pkthdr hdr; + /* + * The memory contains netmap_if, rings and then buffers. + * Given a pointer (e.g. to nm_inject) we can compare with + * mem/buf_start/buf_end to tell if it is a buffer or + * some other descriptor in our region. + * We also store a pointer to some ring as it helps in the + * translation from buffer indexes to addresses. + */ + struct netmap_ring * const some_ring; + void * const buf_start; + void * const buf_end; /* parameters from pcap_open_live */ int snaplen; int promisc; @@ -183,7 +222,7 @@ struct nm_desc_t { uint32_t if_reqcap; uint32_t if_curcap; - struct nm_stat_t st; + struct nm_stat st; char msg[NM_ERRBUF_SIZE]; }; @@ -191,8 +230,8 @@ struct nm_desc_t { * when the descriptor is open correctly, d->self == d * Eventually we should also use some magic number. */ -#define P2NMD(p) ((struct nm_desc_t *)(p)) -#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d)) +#define P2NMD(p) ((struct nm_desc *)(p)) +#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d)) #define NETMAP_FD(d) (P2NMD(d)->fd) @@ -205,7 +244,7 @@ struct nm_desc_t { * XXX only for multiples of 64 bytes, non overlapped. */ static inline void -pkt_copy(const void *_src, void *_dst, int l) +nm_pkt_copy(const void *_src, void *_dst, int l) { const uint64_t *src = (const uint64_t *)_src; uint64_t *dst = (uint64_t *)_dst; @@ -230,7 +269,7 @@ pkt_copy(const void *_src, void *_dst, int l) /* * The callback, invoked on each received packet. 
Same as libpcap */ -typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); +typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); /* *--- the pcap-like API --- @@ -238,21 +277,49 @@ typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); * nm_open() opens a file descriptor, binds to a port and maps memory. * * ifname (netmap:foo or vale:foo) is the port name - * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc. - * ring_no only used if NETMAP_HW_RING is specified, is interpreted - * as a string or integer indicating the ring number - * ring_flags is stored in all ring flags (e.g. for transparent mode) - * to open. If successful, t opens the fd and maps the memory. + * a suffix can indicate the follwing: + * ^ bind the host (sw) ring pair + * * bind host and NIC ring pairs (transparent) + * -NN bind individual NIC ring pair + * {NN bind master side of pipe NN + * }NN bind slave side of pipe NN + * + * req provides the initial values of nmreq before parsing ifname. + * Remember that the ifname parsing will override the ring + * number in nm_ringid, and part of nm_flags; + * flags special functions, normally 0 + * indicates which fields of *arg are significant + * arg special functions, normally NULL + * if passed a netmap_desc with mem != NULL, + * use that memory instead of mmap. */ -static struct nm_desc_t *nm_open(const char *ifname, - const char *ring_no, int flags, int ring_flags); +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, + uint64_t flags, const struct nm_desc *arg); + +/* + * nm_open can import some fields from the parent descriptor. + * These flags control which ones. + * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL, + * which set the initial value for these flags. + * Note that the 16 low bits of the flags are reserved for data + * that may go into the nmreq. + */ +enum { + NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */ + NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */ + NM_OPEN_ARG1 = 0x100000, + NM_OPEN_ARG2 = 0x200000, + NM_OPEN_ARG3 = 0x400000, + NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */ +}; + /* * nm_close() closes and restores the port to its previous state */ -static int nm_close(struct nm_desc_t *); +static int nm_close(struct nm_desc *); /* * nm_inject() is the same as pcap_inject() @@ -260,111 +327,226 @@ static int nm_close(struct nm_desc_t *); * nm_nextpkt() is the same as pcap_next() */ -static int nm_inject(struct nm_desc_t *, const void *, size_t); -static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); -static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *); +static int nm_inject(struct nm_desc *, const void *, size_t); +static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); /* * Try to open, return descriptor if successful, NULL otherwise. * An invalid netmap name will return errno = 0; + * You can pass a pointer to a pre-filled nm_desc to add special + * parameters. Flags is used as follows + * NM_OPEN_NO_MMAP use the memory from arg, only + * if the nr_arg2 (memory block) matches. 
+ * NM_OPEN_ARG1 use req.nr_arg1 from arg + * NM_OPEN_ARG2 use req.nr_arg2 from arg + * NM_OPEN_RING_CFG user ring config from arg */ -static struct nm_desc_t * -nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags) +static struct nm_desc * +nm_open(const char *ifname, const struct nmreq *req, + uint64_t new_flags, const struct nm_desc *arg) { - struct nm_desc_t *d; - u_int n, namelen; - char *port = NULL; + struct nm_desc *d = NULL; + const struct nm_desc *parent = arg; + u_int namelen; + uint32_t nr_ringid = 0, nr_flags; + const char *port = NULL; + const char *errmsg = NULL; if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { - errno = 0; /* name not recognised */ + errno = 0; /* name not recognised, not an error */ return NULL; } if (ifname[0] == 'n') ifname += 7; - port = strchr(ifname, '-'); - if (!port) { - namelen = strlen(ifname); - } else { - namelen = port - ifname; - flags &= ~(NETMAP_SW_RING | NETMAP_HW_RING | NETMAP_RING_MASK); - if (port[1] == 's') - flags |= NETMAP_SW_RING; - else - ring_name = port; + /* scan for a separator */ + for (port = ifname; *port && !index("-*^{}", *port); port++) + ; + namelen = port - ifname; + if (namelen >= sizeof(d->req.nr_name)) { + errmsg = "name too long"; + goto fail; + } + switch (*port) { + default: /* '\0', no suffix */ + nr_flags = NR_REG_ALL_NIC; + break; + case '-': /* one NIC */ + nr_flags = NR_REG_ONE_NIC; + nr_ringid = atoi(port + 1); + break; + case '*': /* NIC and SW, ignore port */ + nr_flags = NR_REG_NIC_SW; + if (port[1]) { + errmsg = "invalid port for nic+sw"; + goto fail; + } + break; + case '^': /* only sw ring */ + nr_flags = NR_REG_SW; + if (port[1]) { + errmsg = "invalid port for sw ring"; + goto fail; + } + break; + case '{': + nr_flags = NR_REG_PIPE_MASTER; + nr_ringid = atoi(port + 1); + break; + case '}': + nr_flags = NR_REG_PIPE_SLAVE; + nr_ringid = atoi(port + 1); + break; } - if (namelen >= sizeof(d->req.nr_name)) - namelen = sizeof(d->req.nr_name) - 1; - d = (struct nm_desc_t *)calloc(1, sizeof(*d)); + if (nr_ringid >= NETMAP_RING_MASK) { + errmsg = "invalid ringid"; + goto fail; + } + /* add the *XPOLL flags */ + nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL); + + d = (struct nm_desc *)calloc(1, sizeof(*d)); if (d == NULL) { + errmsg = "nm_desc alloc failure"; errno = ENOMEM; return NULL; } d->self = d; /* set this early so nm_close() works */ d->fd = open("/dev/netmap", O_RDWR); - if (d->fd < 0) + if (d->fd < 0) { + errmsg = "cannot open /dev/netmap"; goto fail; - - if (flags & NETMAP_SW_RING) { - d->req.nr_ringid = NETMAP_SW_RING; - } else { - u_int r; - if (flags & NETMAP_HW_RING) /* interpret ring as int */ - r = (uintptr_t)ring_name; - else /* interpret ring as numeric string */ - r = ring_name ? atoi(ring_name) : ~0; - r = (r < NETMAP_RING_MASK) ? (r | NETMAP_HW_RING) : 0; - d->req.nr_ringid = r; /* set the ring */ } - d->req.nr_ringid |= (flags & ~NETMAP_RING_MASK); + + if (req) + d->req = *req; d->req.nr_version = NETMAP_API; + d->req.nr_ringid &= ~NETMAP_RING_MASK; + + /* these fields are overridden by ifname and flags processing */ + d->req.nr_ringid |= nr_ringid; + d->req.nr_flags = nr_flags; memcpy(d->req.nr_name, ifname, namelen); d->req.nr_name[namelen] = '\0'; + /* optionally import info from parent */ + if (IS_NETMAP_DESC(parent) && new_flags) { + if (new_flags & NM_OPEN_ARG1) + D("overriding ARG1 %d", parent->req.nr_arg1); + d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ? 
+ parent->req.nr_arg1 : 4; + if (new_flags & NM_OPEN_ARG2) + D("overriding ARG2 %d", parent->req.nr_arg2); + d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ? + parent->req.nr_arg2 : 0; + if (new_flags & NM_OPEN_ARG3) + D("overriding ARG3 %d", parent->req.nr_arg3); + d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ? + parent->req.nr_arg3 : 0; + if (new_flags & NM_OPEN_RING_CFG) { + D("overriding RING_CFG"); + d->req.nr_tx_slots = parent->req.nr_tx_slots; + d->req.nr_rx_slots = parent->req.nr_rx_slots; + d->req.nr_tx_rings = parent->req.nr_tx_rings; + d->req.nr_rx_rings = parent->req.nr_rx_rings; + } + if (new_flags & NM_OPEN_IFNAME) { + D("overriding ifname %s ringid 0x%x flags 0x%x", + parent->req.nr_name, parent->req.nr_ringid, + parent->req.nr_flags); + memcpy(d->req.nr_name, parent->req.nr_name, + sizeof(d->req.nr_name)); + d->req.nr_ringid = parent->req.nr_ringid; + d->req.nr_flags = parent->req.nr_flags; + } + } if (ioctl(d->fd, NIOCREGIF, &d->req)) { + errmsg = "NIOCREGIF failed"; goto fail; } - d->memsize = d->req.nr_memsize; - d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, - d->fd, 0); - if (d->mem == NULL) - goto fail; - d->nifp = NETMAP_IF(d->mem, d->req.nr_offset); - if (d->req.nr_ringid & NETMAP_SW_RING) { + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == NULL) { + errmsg = "mmap failed"; + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + if (nr_flags == NR_REG_SW) { /* host stack */ d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings; d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings; - } else if (d->req.nr_ringid & NETMAP_HW_RING) { - /* XXX check validity */ - d->first_tx_ring = d->last_tx_ring = - d->first_rx_ring = d->last_rx_ring = - d->req.nr_ringid & NETMAP_RING_MASK; - } else { - d->first_tx_ring = d->last_rx_ring = 0; + } else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */ + d->first_tx_ring = 0; + d->first_rx_ring = 0; d->last_tx_ring = d->req.nr_tx_rings - 1; d->last_rx_ring = d->req.nr_rx_rings - 1; + } else if (nr_flags == NR_REG_NIC_SW) { + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings; + d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ONE_NIC) { + /* XXX check validity */ + d->first_tx_ring = d->last_tx_ring = + d->first_rx_ring = d->last_rx_ring = nr_ringid; + } else { /* pipes */ + d->first_tx_ring = d->last_tx_ring = 0; + d->first_rx_ring = d->last_rx_ring = 0; } - d->tx = NETMAP_TXRING(d->nifp, 0); - d->rx = NETMAP_RXRING(d->nifp, 0); - d->cur_tx_ring = d->first_tx_ring; - d->cur_rx_ring = d->first_rx_ring; - for (n = d->first_tx_ring; n <= d->last_tx_ring; n++) { - d->tx[n].flags |= ring_flags; + +#ifdef DEBUG_NETMAP_USER + { /* debugging code */ + int i; + + D("%s tx %d .. %d %d rx %d .. 
%d %d", ifname, + d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, + d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); } - for (n = d->first_rx_ring; n <= d->last_rx_ring; n++) { - d->rx[n].flags |= ring_flags; + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); } + } +#endif /* debugging */ + + d->cur_tx_ring = d->first_tx_ring; + d->cur_rx_ring = d->first_rx_ring; return d; fail: nm_close(d); + if (errmsg) + D("%s %s", errmsg, ifname); errno = EINVAL; return NULL; } static int -nm_close(struct nm_desc_t *d) +nm_close(struct nm_desc *d) { /* * ugly trick to avoid unused warnings @@ -375,7 +557,7 @@ nm_close(struct nm_desc_t *d) if (d == NULL || d->self != d) return EINVAL; - if (d->mem) + if (d->done_mmap && d->mem) munmap(d->mem, d->memsize); if (d->fd != -1) close(d->fd); @@ -389,7 +571,7 @@ nm_close(struct nm_desc_t *d) * Same prototype as pcap_inject(), only need to cast. */ static int -nm_inject(struct nm_desc_t *d, const void *buf, size_t size) +nm_inject(struct nm_desc *d, const void *buf, size_t size) { u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; @@ -408,7 +590,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size) i = ring->cur; idx = ring->slot[i].buf_idx; ring->slot[i].len = size; - pkt_copy(buf, NETMAP_BUF(ring, idx), size); + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); d->cur_tx_ring = ri; ring->head = ring->cur = nm_ring_next(ring, i); return size; @@ -421,7 +603,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size) * Same prototype as pcap_dispatch(), only need to cast. */ static int -nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) +nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) { int n = d->last_rx_ring - d->first_rx_ring + 1; int c, got = 0, ri = d->cur_rx_ring; @@ -457,7 +639,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) } static u_char * -nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr) +nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) { int ri = d->cur_rx_ring; |

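The netmap.h hunks above extend struct nmreq with nr_flags, nr_arg1, nr_arg2 and nr_arg3. A minimal sketch of a raw NIOCREGIF request using those fields, binding all NIC ring pairs and asking for 16 spare buffers; the interface name "em0" and the buffer count are illustrative, and error handling is reduced to the essentials.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>	/* needed before net/if.h on some systems */
#include <net/if.h>		/* IFNAMSIZ for struct nmreq */
#include <net/netmap.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Bind all NIC ring pairs of "em0" and request 16 extra buffers. */
int
open_all_nic_rings(void)
{
	struct nmreq req;
	int fd = open("/dev/netmap", O_RDWR);

	if (fd < 0)
		return -1;
	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;	/* 11 after this change */
	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
	req.nr_flags = NR_REG_ALL_NIC;	/* preferred over the old nr_ringid-only scheme */
	req.nr_arg3 = 16;		/* extra spare buffers requested */
	if (ioctl(fd, NIOCREGIF, &req) == -1) {
		close(fd);
		return -1;
	}
	/* req.nr_arg3 now reports how many buffers were actually granted;
	 * after mmap()ing nr_memsize, nifp->ni_bufs_head indexes the first one. */
	return fd;
}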