aboutsummaryrefslogtreecommitdiff
path: root/sys/dev/netmap/netmap_generic.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/dev/netmap/netmap_generic.c')
-rw-r--r--sys/dev/netmap/netmap_generic.c936
1 files changed, 669 insertions, 267 deletions
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 85a6a9f76ea2f..5cef4a29110a5 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -1,5 +1,7 @@
/*
- * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2016 Vincenzo Maffione
+ * Copyright (C) 2013-2016 Luigi Rizzo
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -83,25 +85,25 @@ __FBSDID("$FreeBSD$");
#define rtnl_lock() ND("rtnl_lock called")
#define rtnl_unlock() ND("rtnl_unlock called")
-#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid)
#define smp_mb()
/*
* FreeBSD mbuf allocator/deallocator in emulation mode:
- *
+ */
+#if __FreeBSD_version < 1100000
+
+/*
+ * For older versions of FreeBSD:
+ *
* We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE
* so that the destructor, if invoked, will not free the packet.
- * In principle we should set the destructor only on demand,
+ * In principle we should set the destructor only on demand,
* but since there might be a race we better do it on allocation.
* As a consequence, we also need to set the destructor or we
* would leak buffers.
*/
-/*
- * mbuf wrappers
- */
-
/* mbuf destructor, also need to change the type to EXT_EXTREF,
* add an M_NOFREE flag, and then clear the flag and
* chain into uma_zfree(zone_pack, mf)
@@ -112,35 +114,93 @@ __FBSDID("$FreeBSD$");
(m)->m_ext.ext_type = EXT_EXTREF; \
} while (0)
-static void
-netmap_default_mbuf_destructor(struct mbuf *m)
+static int
+void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2)
{
/* restore original mbuf */
m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1;
m->m_ext.ext_arg1 = NULL;
m->m_ext.ext_type = EXT_PACKET;
m->m_ext.ext_free = NULL;
- if (GET_MBUF_REFCNT(m) == 0)
+ if (MBUF_REFCNT(m) == 0)
SET_MBUF_REFCNT(m, 1);
uma_zfree(zone_pack, m);
+
+ return 0;
}
static inline struct mbuf *
-netmap_get_mbuf(int len)
+nm_os_get_mbuf(struct ifnet *ifp, int len)
{
struct mbuf *m;
+
+ (void)ifp;
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m) {
- m->m_flags |= M_NOFREE; /* XXXNP: Almost certainly incorrect. */
+ /* m_getcl() (mb_ctor_mbuf) has an assert that checks that
+ * M_NOFREE flag is not specified as third argument,
+ * so we have to set M_NOFREE after m_getcl(). */
+ m->m_flags |= M_NOFREE;
m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save
- m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor;
+ m->m_ext.ext_free = (void *)void_mbuf_dtor;
m->m_ext.ext_type = EXT_EXTREF;
- ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m));
+ ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m));
}
return m;
}
+#else /* __FreeBSD_version >= 1100000 */
+
+/*
+ * Newer versions of FreeBSD, using a straightforward scheme.
+ *
+ * We allocate mbufs with m_gethdr(), since the mbuf header is needed
+ * by the driver. We also attach a customly-provided external storage,
+ * which in this case is a netmap buffer. When calling m_extadd(), however
+ * we pass a NULL address, since the real address (and length) will be
+ * filled in by nm_os_generic_xmit_frame() right before calling
+ * if_transmit().
+ *
+ * The dtor function does nothing, however we need it since mb_free_ext()
+ * has a KASSERT(), checking that the mbuf dtor function is not NULL.
+ */
+
+#define SET_MBUF_DESTRUCTOR(m, fn) do { \
+ (m)->m_ext.ext_free = (void *)fn; \
+} while (0)
+
+static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { }
+
+static inline struct mbuf *
+nm_os_get_mbuf(struct ifnet *ifp, int len)
+{
+ struct mbuf *m;
+
+ (void)ifp;
+ (void)len;
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ return m;
+ }
+
+ m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor,
+ NULL, NULL, 0, EXT_NET_DRV);
+
+ return m;
+}
+
+#endif /* __FreeBSD_version >= 1100000 */
+#elif defined _WIN32
+
+#include "win_glue.h"
+
+#define rtnl_lock() ND("rtnl_lock called")
+#define rtnl_unlock() ND("rtnl_unlock called")
+#define MBUF_TXQ(m) 0//((m)->m_pkthdr.flowid)
+#define MBUF_RXQ(m) 0//((m)->m_pkthdr.flowid)
+#define smp_mb() //XXX: to be correctly defined
#else /* linux */
@@ -150,7 +210,12 @@ netmap_get_mbuf(int len)
#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */
#include <linux/hrtimer.h>
-//#define REG_RESET
+static inline struct mbuf *
+nm_os_get_mbuf(struct ifnet *ifp, int len)
+{
+ return alloc_skb(ifp->needed_headroom + len +
+ ifp->needed_tailroom, GFP_ATOMIC);
+}
#endif /* linux */
@@ -161,8 +226,21 @@ netmap_get_mbuf(int len)
#include <dev/netmap/netmap_mem2.h>
+#define for_each_kring_n(_i, _k, _karr, _n) \
+ for (_k=_karr, _i = 0; _i < _n; (_k)++, (_i)++)
+
+#define for_each_tx_kring(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings)
+#define for_each_tx_kring_h(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1)
+
+#define for_each_rx_kring(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings)
+#define for_each_rx_kring_h(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1)
-/* ======================== usage stats =========================== */
+
+/* ======================== PERFORMANCE STATISTICS =========================== */
#ifdef RATE_GENERIC
#define IFRATE(x) x
@@ -170,6 +248,8 @@ struct rate_stats {
unsigned long txpkt;
unsigned long txsync;
unsigned long txirq;
+ unsigned long txrepl;
+ unsigned long txdrop;
unsigned long rxpkt;
unsigned long rxirq;
unsigned long rxsync;
@@ -194,6 +274,8 @@ static void rate_callback(unsigned long arg)
RATE_PRINTK(txpkt);
RATE_PRINTK(txsync);
RATE_PRINTK(txirq);
+ RATE_PRINTK(txrepl);
+ RATE_PRINTK(txdrop);
RATE_PRINTK(rxpkt);
RATE_PRINTK(rxsync);
RATE_PRINTK(rxirq);
@@ -230,94 +312,222 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi)
* the poller threads. Differently from netmap_rx_irq(), we check
* only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq.
*/
-static void
-netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
+void
+netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
{
- struct netmap_adapter *na = NA(ifp);
if (unlikely(!nm_netmap_on(na)))
return;
- netmap_common_irq(ifp, q, work_done);
+ netmap_common_irq(na, q, work_done);
+#ifdef RATE_GENERIC
+ if (work_done)
+ rate_ctx.new.rxirq++;
+ else
+ rate_ctx.new.txirq++;
+#endif /* RATE_GENERIC */
}
+static int
+generic_netmap_unregister(struct netmap_adapter *na)
+{
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct netmap_kring *kring = NULL;
+ int i, r;
+
+ if (na->active_fds == 0) {
+ D("Generic adapter %p goes off", na);
+ rtnl_lock();
+
+ na->na_flags &= ~NAF_NETMAP_ON;
+
+ /* Release packet steering control. */
+ nm_os_catch_tx(gna, 0);
+
+ /* Stop intercepting packets on the RX path. */
+ nm_os_catch_rx(gna, 0);
+
+ rtnl_unlock();
+ }
+
+ for_each_rx_kring_h(r, kring, na) {
+ if (nm_kring_pending_off(kring)) {
+ D("RX ring %d of generic adapter %p goes off", r, na);
+ kring->nr_mode = NKR_NETMAP_OFF;
+ }
+ }
+ for_each_tx_kring_h(r, kring, na) {
+ if (nm_kring_pending_off(kring)) {
+ kring->nr_mode = NKR_NETMAP_OFF;
+ D("TX ring %d of generic adapter %p goes off", r, na);
+ }
+ }
+
+ for_each_rx_kring(r, kring, na) {
+ /* Free the mbufs still pending in the RX queues,
+ * that did not end up into the corresponding netmap
+ * RX rings. */
+ mbq_safe_purge(&kring->rx_queue);
+ nm_os_mitigation_cleanup(&gna->mit[r]);
+ }
+
+ /* Decrement reference counter for the mbufs in the
+ * TX pools. These mbufs can be still pending in drivers,
+ * (e.g. this happens with virtio-net driver, which
+ * does lazy reclaiming of transmitted mbufs). */
+ for_each_tx_kring(r, kring, na) {
+ /* We must remove the destructor on the TX event,
+ * because the destructor invokes netmap code, and
+ * the netmap module may disappear before the
+ * TX event is consumed. */
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event) {
+ SET_MBUF_DESTRUCTOR(kring->tx_event, NULL);
+ }
+ kring->tx_event = NULL;
+ mtx_unlock_spin(&kring->tx_event_lock);
+ }
+
+ if (na->active_fds == 0) {
+ free(gna->mit, M_DEVBUF);
+
+ for_each_rx_kring(r, kring, na) {
+ mbq_safe_fini(&kring->rx_queue);
+ }
+
+ for_each_tx_kring(r, kring, na) {
+ mtx_destroy(&kring->tx_event_lock);
+ if (kring->tx_pool == NULL) {
+ continue;
+ }
+
+ for (i=0; i<na->num_tx_desc; i++) {
+ if (kring->tx_pool[i]) {
+ m_freem(kring->tx_pool[i]);
+ }
+ }
+ free(kring->tx_pool, M_DEVBUF);
+ kring->tx_pool = NULL;
+ }
+
+#ifdef RATE_GENERIC
+ if (--rate_ctx.refcount == 0) {
+ D("del_timer()");
+ del_timer(&rate_ctx.timer);
+ }
+#endif
+ }
+
+ return 0;
+}
/* Enable/disable netmap mode for a generic network interface. */
static int
generic_netmap_register(struct netmap_adapter *na, int enable)
{
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
- struct mbuf *m;
+ struct netmap_kring *kring = NULL;
int error;
int i, r;
- if (!na)
+ if (!na) {
return EINVAL;
+ }
-#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_stop(ifp);
- if (error) {
- return error;
+ if (!enable) {
+ /* This is actually an unregif. */
+ return generic_netmap_unregister(na);
}
-#endif /* REG_RESET */
- if (enable) { /* Enable netmap mode. */
- /* Init the mitigation support on all the rx queues. */
+ if (na->active_fds == 0) {
+ D("Generic adapter %p goes on", na);
+ /* Do all memory allocations when (na->active_fds == 0), to
+ * simplify error management. */
+
+ /* Allocate memory for mitigation support on all the rx queues. */
gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit),
- M_DEVBUF, M_NOWAIT | M_ZERO);
+ M_DEVBUF, M_NOWAIT | M_ZERO);
if (!gna->mit) {
D("mitigation allocation failed");
error = ENOMEM;
goto out;
}
- for (r=0; r<na->num_rx_rings; r++)
- netmap_mitigation_init(&gna->mit[r], r, na);
- /* Initialize the rx queue, as generic_rx_handler() can
- * be called as soon as netmap_catch_rx() returns.
- */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_init(&na->rx_rings[r].rx_queue);
+ for_each_rx_kring(r, kring, na) {
+ /* Init mitigation support. */
+ nm_os_mitigation_init(&gna->mit[r], r, na);
+
+ /* Initialize the rx queue, as generic_rx_handler() can
+ * be called as soon as nm_os_catch_rx() returns.
+ */
+ mbq_safe_init(&kring->rx_queue);
}
/*
- * Preallocate packet buffers for the tx rings.
+ * Prepare mbuf pools (parallel to the tx rings), for packet
+ * transmission. Don't preallocate the mbufs here, it's simpler
+ * to leave this task to txsync.
*/
- for (r=0; r<na->num_tx_rings; r++)
- na->tx_rings[r].tx_pool = NULL;
- for (r=0; r<na->num_tx_rings; r++) {
- na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!na->tx_rings[r].tx_pool) {
+ for_each_tx_kring(r, kring, na) {
+ kring->tx_pool = NULL;
+ }
+ for_each_tx_kring(r, kring, na) {
+ kring->tx_pool =
+ malloc(na->num_tx_desc * sizeof(struct mbuf *),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!kring->tx_pool) {
D("tx_pool allocation failed");
error = ENOMEM;
goto free_tx_pools;
}
- for (i=0; i<na->num_tx_desc; i++)
- na->tx_rings[r].tx_pool[i] = NULL;
- for (i=0; i<na->num_tx_desc; i++) {
- m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
- if (!m) {
- D("tx_pool[%d] allocation failed", i);
- error = ENOMEM;
- goto free_tx_pools;
- }
- na->tx_rings[r].tx_pool[i] = m;
- }
+ mtx_init(&kring->tx_event_lock, "tx_event_lock",
+ NULL, MTX_SPIN);
}
+ }
+
+ for_each_rx_kring_h(r, kring, na) {
+ if (nm_kring_pending_on(kring)) {
+ D("RX ring %d of generic adapter %p goes on", r, na);
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+
+ }
+ for_each_tx_kring_h(r, kring, na) {
+ if (nm_kring_pending_on(kring)) {
+ D("TX ring %d of generic adapter %p goes on", r, na);
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+ }
+
+ for_each_tx_kring(r, kring, na) {
+ /* Initialize tx_pool and tx_event. */
+ for (i=0; i<na->num_tx_desc; i++) {
+ kring->tx_pool[i] = NULL;
+ }
+
+ kring->tx_event = NULL;
+ }
+
+ if (na->active_fds == 0) {
rtnl_lock();
+
/* Prepare to intercept incoming traffic. */
- error = netmap_catch_rx(gna, 1);
+ error = nm_os_catch_rx(gna, 1);
if (error) {
- D("netdev_rx_handler_register() failed (%d)", error);
+ D("nm_os_catch_rx(1) failed (%d)", error);
goto register_handler;
}
- na->na_flags |= NAF_NETMAP_ON;
/* Make netmap control the packet steering. */
- netmap_catch_tx(gna, 1);
+ error = nm_os_catch_tx(gna, 1);
+ if (error) {
+ D("nm_os_catch_tx(1) failed (%d)", error);
+ goto catch_rx;
+ }
rtnl_unlock();
+ na->na_flags |= NAF_NETMAP_ON;
+
#ifdef RATE_GENERIC
if (rate_ctx.refcount == 0) {
D("setup_timer()");
@@ -329,73 +539,26 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
}
rate_ctx.refcount++;
#endif /* RATE */
-
- } else if (na->tx_rings[0].tx_pool) {
- /* Disable netmap mode. We enter here only if the previous
- generic_netmap_register(na, 1) was successful.
- If it was not, na->tx_rings[0].tx_pool was set to NULL by the
- error handling code below. */
- rtnl_lock();
-
- na->na_flags &= ~NAF_NETMAP_ON;
-
- /* Release packet steering control. */
- netmap_catch_tx(gna, 0);
-
- /* Do not intercept packets on the rx path. */
- netmap_catch_rx(gna, 0);
-
- rtnl_unlock();
-
- /* Free the mbufs going to the netmap rings */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_purge(&na->rx_rings[r].rx_queue);
- mbq_safe_destroy(&na->rx_rings[r].rx_queue);
- }
-
- for (r=0; r<na->num_rx_rings; r++)
- netmap_mitigation_cleanup(&gna->mit[r]);
- free(gna->mit, M_DEVBUF);
-
- for (r=0; r<na->num_tx_rings; r++) {
- for (i=0; i<na->num_tx_desc; i++) {
- m_freem(na->tx_rings[r].tx_pool[i]);
- }
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- }
-
-#ifdef RATE_GENERIC
- if (--rate_ctx.refcount == 0) {
- D("del_timer()");
- del_timer(&rate_ctx.timer);
- }
-#endif
- }
-
-#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_open(ifp);
- if (error) {
- goto free_tx_pools;
}
-#endif
return 0;
+ /* Here (na->active_fds == 0) holds. */
+catch_rx:
+ nm_os_catch_rx(gna, 0);
register_handler:
rtnl_unlock();
free_tx_pools:
- for (r=0; r<na->num_tx_rings; r++) {
- if (na->tx_rings[r].tx_pool == NULL)
+ for_each_tx_kring(r, kring, na) {
+ mtx_destroy(&kring->tx_event_lock);
+ if (kring->tx_pool == NULL) {
continue;
- for (i=0; i<na->num_tx_desc; i++)
- if (na->tx_rings[r].tx_pool[i])
- m_freem(na->tx_rings[r].tx_pool[i]);
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- na->tx_rings[r].tx_pool = NULL;
+ }
+ free(kring->tx_pool, M_DEVBUF);
+ kring->tx_pool = NULL;
}
- for (r=0; r<na->num_rx_rings; r++) {
- netmap_mitigation_cleanup(&gna->mit[r]);
- mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+ for_each_rx_kring(r, kring, na) {
+ mbq_safe_fini(&kring->rx_queue);
}
free(gna->mit, M_DEVBUF);
out:
@@ -411,13 +574,58 @@ out:
static void
generic_mbuf_destructor(struct mbuf *m)
{
- netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+ struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m));
+ struct netmap_kring *kring;
+ unsigned int r = MBUF_TXQ(m);
+ unsigned int r_orig = r;
+
+ if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) {
+ D("Error: no netmap adapter on device %p",
+ GEN_TX_MBUF_IFP(m));
+ return;
+ }
+
+ /*
+ * First, clear the event mbuf.
+ * In principle, the event 'm' should match the one stored
+ * on ring 'r'. However we check it explicitely to stay
+ * safe against lower layers (qdisc, driver, etc.) changing
+ * MBUF_TXQ(m) under our feet. If the match is not found
+ * on 'r', we try to see if it belongs to some other ring.
+ */
+ for (;;) {
+ bool match = false;
+
+ kring = &na->tx_rings[r];
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event == m) {
+ kring->tx_event = NULL;
+ match = true;
+ }
+ mtx_unlock_spin(&kring->tx_event_lock);
+
+ if (match) {
+ if (r != r_orig) {
+ RD(1, "event %p migrated: ring %u --> %u",
+ m, r_orig, r);
+ }
+ break;
+ }
+
+ if (++r == na->num_tx_rings) r = 0;
+
+ if (r == r_orig) {
+ RD(1, "Cannot match event %p", m);
+ return;
+ }
+ }
+
+ /* Second, wake up clients. They will reclaim the event through
+ * txsync. */
+ netmap_generic_irq(na, r, NULL);
#ifdef __FreeBSD__
- if (netmap_verbose)
- RD(5, "Tx irq (%p) queue %d index %d" , m, MBUF_TXQ(m), (int)(uintptr_t)m->m_ext.ext_arg1);
- netmap_default_mbuf_destructor(m);
-#endif /* __FreeBSD__ */
- IFRATE(rate_ctx.new.txirq++);
+ void_mbuf_dtor(m, NULL, NULL);
+#endif
}
extern int netmap_adaptive_io;
@@ -428,7 +636,7 @@ extern int netmap_adaptive_io;
* nr_hwcur is the first unsent buffer.
*/
static u_int
-generic_netmap_tx_clean(struct netmap_kring *kring)
+generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc)
{
u_int const lim = kring->nkr_num_slots - 1;
u_int nm_i = nm_next(kring->nr_hwtail, lim);
@@ -436,20 +644,50 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
u_int n = 0;
struct mbuf **tx_pool = kring->tx_pool;
+ ND("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail);
+
while (nm_i != hwcur) { /* buffers not completed */
struct mbuf *m = tx_pool[nm_i];
- if (unlikely(m == NULL)) {
- /* this is done, try to replenish the entry */
- tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na));
+ if (txqdisc) {
+ if (m == NULL) {
+ /* Nothing to do, this is going
+ * to be replenished. */
+ RD(3, "Is this happening?");
+
+ } else if (MBUF_QUEUED(m)) {
+ break; /* Not dequeued yet. */
+
+ } else if (MBUF_REFCNT(m) != 1) {
+ /* This mbuf has been dequeued but is still busy
+ * (refcount is 2).
+ * Leave it to the driver and replenish. */
+ m_freem(m);
+ tx_pool[nm_i] = NULL;
+ }
+
+ } else {
if (unlikely(m == NULL)) {
- D("mbuf allocation failed, XXX error");
- // XXX how do we proceed ? break ?
- return -ENOMEM;
+ int event_consumed;
+
+ /* This slot was used to place an event. */
+ mtx_lock_spin(&kring->tx_event_lock);
+ event_consumed = (kring->tx_event == NULL);
+ mtx_unlock_spin(&kring->tx_event_lock);
+ if (!event_consumed) {
+ /* The event has not been consumed yet,
+ * still busy in the driver. */
+ break;
+ }
+ /* The event has been consumed, we can go
+ * ahead. */
+
+ } else if (MBUF_REFCNT(m) != 1) {
+ /* This mbuf is still busy: its refcnt is 2. */
+ break;
}
- } else if (GET_MBUF_REFCNT(m) != 1) {
- break; /* This mbuf is still busy: its refcnt is 2. */
}
+
n++;
nm_i = nm_next(nm_i, lim);
#if 0 /* rate adaptation */
@@ -476,23 +714,17 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
return n;
}
-
-/*
- * We have pending packets in the driver between nr_hwtail +1 and hwcur.
- * Compute a position in the middle, to be used to generate
- * a notification.
- */
+/* Compute a slot index in the middle between inf and sup. */
static inline u_int
-generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
+ring_middle(u_int inf, u_int sup, u_int lim)
{
- u_int n = kring->nkr_num_slots;
- u_int ntc = nm_next(kring->nr_hwtail, n-1);
+ u_int n = lim + 1;
u_int e;
- if (hwcur >= ntc) {
- e = (hwcur + ntc) / 2;
+ if (sup >= inf) {
+ e = (sup + inf) / 2;
} else { /* wrap around */
- e = (hwcur + n + ntc) / 2;
+ e = (sup + n + inf) / 2;
if (e >= n) {
e -= n;
}
@@ -506,35 +738,59 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
return e;
}
-/*
- * We have pending packets in the driver between nr_hwtail+1 and hwcur.
- * Schedule a notification approximately in the middle of the two.
- * There is a race but this is only called within txsync which does
- * a double check.
- */
static void
generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
{
+ u_int lim = kring->nkr_num_slots - 1;
struct mbuf *m;
u_int e;
+ u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */
- if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) {
+ if (ntc == hwcur) {
return; /* all buffers are free */
}
- e = generic_tx_event_middle(kring, hwcur);
+
+ /*
+ * We have pending packets in the driver between hwtail+1
+ * and hwcur, and we have to chose one of these slot to
+ * generate a notification.
+ * There is a race but this is only called within txsync which
+ * does a double check.
+ */
+#if 0
+ /* Choose a slot in the middle, so that we don't risk ending
+ * up in a situation where the client continuously wake up,
+ * fills one or a few TX slots and go to sleep again. */
+ e = ring_middle(ntc, hwcur, lim);
+#else
+ /* Choose the first pending slot, to be safe against driver
+ * reordering mbuf transmissions. */
+ e = ntc;
+#endif
m = kring->tx_pool[e];
- ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? GET_MBUF_REFCNT(m) : -2 );
if (m == NULL) {
- /* This can happen if there is already an event on the netmap
- slot 'e': There is nothing to do. */
+ /* An event is already in place. */
return;
}
- kring->tx_pool[e] = NULL;
+
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event) {
+ /* An event is already in place. */
+ mtx_unlock_spin(&kring->tx_event_lock);
+ return;
+ }
+
SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+ kring->tx_event = m;
+ mtx_unlock_spin(&kring->tx_event_lock);
+
+ kring->tx_pool[e] = NULL;
+
+ ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 );
- // XXX wmb() ?
- /* Decrement the refcount an free it if we have the last one. */
+ /* Decrement the refcount. This will free it if we lose the race
+ * with the driver. */
m_freem(m);
smp_mb();
}
@@ -551,6 +807,7 @@ static int
generic_netmap_txsync(struct netmap_kring *kring, int flags)
{
struct netmap_adapter *na = kring->na;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
struct ifnet *ifp = na->ifp;
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */ // j
@@ -560,8 +817,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
IFRATE(rate_ctx.new.txsync++);
- // TODO: handle the case of mbuf allocation failure
-
rmb();
/*
@@ -569,72 +824,121 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
+ struct nm_os_gen_arg a;
+ u_int event = -1;
+
+ if (gna->txqdisc && nm_kr_txempty(kring)) {
+ /* In txqdisc mode, we ask for a delayed notification,
+ * but only when cur == hwtail, which means that the
+ * client is going to block. */
+ event = ring_middle(nm_i, head, lim);
+ ND(3, "Place txqdisc event (hwcur=%u,event=%u,"
+ "head=%u,hwtail=%u)", nm_i, event, head,
+ kring->nr_hwtail);
+ }
+
+ a.ifp = ifp;
+ a.ring_nr = ring_nr;
+ a.head = a.tail = NULL;
+
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
void *addr = NMB(na, slot);
-
/* device-specific */
struct mbuf *m;
int tx_ret;
NM_CHECK_ADDR_LEN(na, addr, len);
- /* Tale a mbuf from the tx pool and copy in the user packet. */
+ /* Tale a mbuf from the tx pool (replenishing the pool
+ * entry if necessary) and copy in the user packet. */
m = kring->tx_pool[nm_i];
- if (unlikely(!m)) {
- RD(5, "This should never happen");
- kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
- if (unlikely(m == NULL)) {
- D("mbuf allocation failed");
+ if (unlikely(m == NULL)) {
+ kring->tx_pool[nm_i] = m =
+ nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na));
+ if (m == NULL) {
+ RD(2, "Failed to replenish mbuf");
+ /* Here we could schedule a timer which
+ * retries to replenish after a while,
+ * and notifies the client when it
+ * manages to replenish some slots. In
+ * any case we break early to avoid
+ * crashes. */
break;
}
+ IFRATE(rate_ctx.new.txrepl++);
}
- /* XXX we should ask notifications when NS_REPORT is set,
- * or roughly every half frame. We can optimize this
- * by lazily requesting notifications only when a
- * transmission fails. Probably the best way is to
- * break on failures and set notifications when
- * ring->cur == ring->tail || nm_i != cur
+
+ a.m = m;
+ a.addr = addr;
+ a.len = len;
+ a.qevent = (nm_i == event);
+ /* When not in txqdisc mode, we should ask
+ * notifications when NS_REPORT is set, or roughly
+ * every half ring. To optimize this, we set a
+ * notification event when the client runs out of
+ * TX ring space, or when transmission fails. In
+ * the latter case we also break early.
*/
- tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+ tx_ret = nm_os_generic_xmit_frame(&a);
if (unlikely(tx_ret)) {
- ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
- tx_ret, nm_i, head, kring->nr_hwtail);
- /*
- * No room for this mbuf in the device driver.
- * Request a notification FOR A PREVIOUS MBUF,
- * then call generic_netmap_tx_clean(kring) to do the
- * double check and see if we can free more buffers.
- * If there is space continue, else break;
- * NOTE: the double check is necessary if the problem
- * occurs in the txsync call after selrecord().
- * Also, we need some way to tell the caller that not
- * all buffers were queued onto the device (this was
- * not a problem with native netmap driver where space
- * is preallocated). The bridge has a similar problem
- * and we solve it there by dropping the excess packets.
- */
- generic_set_tx_event(kring, nm_i);
- if (generic_netmap_tx_clean(kring)) { /* space now available */
- continue;
- } else {
- break;
+ if (!gna->txqdisc) {
+ /*
+ * No room for this mbuf in the device driver.
+ * Request a notification FOR A PREVIOUS MBUF,
+ * then call generic_netmap_tx_clean(kring) to do the
+ * double check and see if we can free more buffers.
+ * If there is space continue, else break;
+ * NOTE: the double check is necessary if the problem
+ * occurs in the txsync call after selrecord().
+ * Also, we need some way to tell the caller that not
+ * all buffers were queued onto the device (this was
+ * not a problem with native netmap driver where space
+ * is preallocated). The bridge has a similar problem
+ * and we solve it there by dropping the excess packets.
+ */
+ generic_set_tx_event(kring, nm_i);
+ if (generic_netmap_tx_clean(kring, gna->txqdisc)) {
+ /* space now available */
+ continue;
+ } else {
+ break;
+ }
}
+
+ /* In txqdisc mode, the netmap-aware qdisc
+ * queue has the same length as the number of
+ * netmap slots (N). Since tail is advanced
+ * only when packets are dequeued, qdisc
+ * queue overrun cannot happen, so
+ * nm_os_generic_xmit_frame() did not fail
+ * because of that.
+ * However, packets can be dropped because
+ * carrier is off, or because our qdisc is
+ * being deactivated, or possibly for other
+ * reasons. In these cases, we just let the
+ * packet to be dropped. */
+ IFRATE(rate_ctx.new.txdrop++);
}
+
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
nm_i = nm_next(nm_i, lim);
- IFRATE(rate_ctx.new.txpkt ++);
+ IFRATE(rate_ctx.new.txpkt++);
}
-
- /* Update hwcur to the next slot to transmit. */
- kring->nr_hwcur = nm_i; /* not head, we could break early */
+ if (a.head != NULL) {
+ a.addr = NULL;
+ nm_os_generic_xmit_frame(&a);
+ }
+ /* Update hwcur to the next slot to transmit. Here nm_i
+ * is not necessarily head, we could break early. */
+ kring->nr_hwcur = nm_i;
}
/*
* Second, reclaim completed buffers
*/
- if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+ if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) {
/* No more available slots? Set a notification event
* on a netmap slot that will be cleaned in the future.
* No doublecheck is performed, since txsync() will be
@@ -642,58 +946,74 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
*/
generic_set_tx_event(kring, nm_i);
}
- ND("tx #%d, hwtail = %d", n, kring->nr_hwtail);
- generic_netmap_tx_clean(kring);
+ generic_netmap_tx_clean(kring, gna->txqdisc);
return 0;
}
/*
- * This handler is registered (through netmap_catch_rx())
+ * This handler is registered (through nm_os_catch_rx())
* within the attached network interface
* in the RX subsystem, so that every mbuf passed up by
* the driver can be stolen to the network stack.
* Stolen packets are put in a queue where the
* generic_netmap_rxsync() callback can extract them.
+ * Returns 1 if the packet was stolen, 0 otherwise.
*/
-void
+int
generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
{
struct netmap_adapter *na = NA(ifp);
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct netmap_kring *kring;
u_int work_done;
- u_int rr = MBUF_RXQ(m); // receive ring number
+ u_int r = MBUF_RXQ(m); /* receive ring number */
- if (rr >= na->num_rx_rings) {
- rr = rr % na->num_rx_rings; // XXX expensive...
+ if (r >= na->num_rx_rings) {
+ r = r % na->num_rx_rings;
+ }
+
+ kring = &na->rx_rings[r];
+
+ if (kring->nr_mode == NKR_NETMAP_OFF) {
+ /* We must not intercept this mbuf. */
+ return 0;
}
/* limit the size of the queue */
- if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+ if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) {
+ /* This may happen when GRO/LRO features are enabled for
+ * the NIC driver when the generic adapter does not
+ * support RX scatter-gather. */
+ RD(2, "Warning: driver pushed up big packet "
+ "(size=%d)", (int)MBUF_LEN(m));
+ m_freem(m);
+ } else if (unlikely(mbq_len(&kring->rx_queue) > 1024)) {
m_freem(m);
} else {
- mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+ mbq_safe_enqueue(&kring->rx_queue, m);
}
if (netmap_generic_mit < 32768) {
/* no rx mitigation, pass notification up */
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
+ netmap_generic_irq(na, r, &work_done);
} else {
/* same as send combining, filter notification if there is a
* pending timer, otherwise pass it up and start a timer.
*/
- if (likely(netmap_mitigation_active(&gna->mit[rr]))) {
+ if (likely(nm_os_mitigation_active(&gna->mit[r]))) {
/* Record that there is some pending work. */
- gna->mit[rr].mit_pending = 1;
+ gna->mit[r].mit_pending = 1;
} else {
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
- netmap_mitigation_start(&gna->mit[rr]);
+ netmap_generic_irq(na, r, &work_done);
+ nm_os_mitigation_start(&gna->mit[r]);
}
}
+
+ /* We have intercepted the mbuf. */
+ return 1;
}
/*
@@ -713,54 +1033,23 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags)
u_int const head = kring->rhead;
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+ /* Adapter-specific variables. */
+ uint16_t slot_flags = kring->nkr_slot_flags;
+ u_int nm_buf_len = NETMAP_BUF_SIZE(na);
+ struct mbq tmpq;
+ struct mbuf *m;
+ int avail; /* in bytes */
+ int mlen;
+ int copy;
+
if (head > lim)
return netmap_ring_reinit(kring);
- /*
- * First part: import newly received packets.
- */
- if (netmap_no_pendintr || force_update) {
- /* extract buffers from the rx queue, stop at most one
- * slot before nr_hwcur (stop_i)
- */
- uint16_t slot_flags = kring->nkr_slot_flags;
- u_int stop_i = nm_prev(kring->nr_hwcur, lim);
-
- nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
- for (n = 0; nm_i != stop_i; n++) {
- int len;
- void *addr = NMB(na, &ring->slot[nm_i]);
- struct mbuf *m;
-
- /* we only check the address here on generic rx rings */
- if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
- return netmap_ring_reinit(kring);
- }
- /*
- * Call the locked version of the function.
- * XXX Ideally we could grab a batch of mbufs at once
- * and save some locking overhead.
- */
- m = mbq_safe_dequeue(&kring->rx_queue);
- if (!m) /* no more data */
- break;
- len = MBUF_LEN(m);
- m_copydata(m, 0, len, addr);
- ring->slot[nm_i].len = len;
- ring->slot[nm_i].flags = slot_flags;
- m_freem(m);
- nm_i = nm_next(nm_i, lim);
- }
- if (n) {
- kring->nr_hwtail = nm_i;
- IFRATE(rate_ctx.new.rxpkt += n);
- }
- kring->nr_kflags &= ~NKR_PENDINTR;
- }
+ IFRATE(rate_ctx.new.rxsync++);
- // XXX should we invert the order ?
/*
- * Second part: skip past packets that userspace has released.
+ * First part: skip past packets that userspace has released.
+ * This can possibly make room for the second part.
*/
nm_i = kring->nr_hwcur;
if (nm_i != head) {
@@ -773,7 +1062,106 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags)
}
kring->nr_hwcur = head;
}
- IFRATE(rate_ctx.new.rxsync++);
+
+ /*
+ * Second part: import newly received packets.
+ */
+ if (!netmap_no_pendintr && !force_update) {
+ return 0;
+ }
+
+ nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */
+
+ /* Compute the available space (in bytes) in this netmap ring.
+ * The first slot that is not considered in is the one before
+ * nr_hwcur. */
+
+ avail = nm_prev(kring->nr_hwcur, lim) - nm_i;
+ if (avail < 0)
+ avail += lim + 1;
+ avail *= nm_buf_len;
+
+ /* First pass: While holding the lock on the RX mbuf queue,
+ * extract as many mbufs as they fit the available space,
+ * and put them in a temporary queue.
+ * To avoid performing a per-mbuf division (mlen / nm_buf_len) to
+ * to update avail, we do the update in a while loop that we
+ * also use to set the RX slots, but without performing the copy. */
+ mbq_init(&tmpq);
+ mbq_lock(&kring->rx_queue);
+ for (n = 0;; n++) {
+ m = mbq_peek(&kring->rx_queue);
+ if (!m) {
+ /* No more packets from the driver. */
+ break;
+ }
+
+ mlen = MBUF_LEN(m);
+ if (mlen > avail) {
+ /* No more space in the ring. */
+ break;
+ }
+
+ mbq_dequeue(&kring->rx_queue);
+
+ while (mlen) {
+ copy = nm_buf_len;
+ if (mlen < copy) {
+ copy = mlen;
+ }
+ mlen -= copy;
+ avail -= nm_buf_len;
+
+ ring->slot[nm_i].len = copy;
+ ring->slot[nm_i].flags = slot_flags | (mlen ? NS_MOREFRAG : 0);
+ nm_i = nm_next(nm_i, lim);
+ }
+
+ mbq_enqueue(&tmpq, m);
+ }
+ mbq_unlock(&kring->rx_queue);
+
+ /* Second pass: Drain the temporary queue, going over the used RX slots,
+ * and perform the copy out of the RX queue lock. */
+ nm_i = kring->nr_hwtail;
+
+ for (;;) {
+ void *nmaddr;
+ int ofs = 0;
+ int morefrag;
+
+ m = mbq_dequeue(&tmpq);
+ if (!m) {
+ break;
+ }
+
+ do {
+ nmaddr = NMB(na, &ring->slot[nm_i]);
+ /* We only check the address here on generic rx rings. */
+ if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
+ m_freem(m);
+ mbq_purge(&tmpq);
+ mbq_fini(&tmpq);
+ return netmap_ring_reinit(kring);
+ }
+
+ copy = ring->slot[nm_i].len;
+ m_copydata(m, ofs, copy, nmaddr);
+ ofs += copy;
+ morefrag = ring->slot[nm_i].flags & NS_MOREFRAG;
+ nm_i = nm_next(nm_i, lim);
+ } while (morefrag);
+
+ m_freem(m);
+ }
+
+ mbq_fini(&tmpq);
+
+ if (n) {
+ kring->nr_hwtail = nm_i;
+ IFRATE(rate_ctx.new.rxpkt += n);
+ }
+ kring->nr_kflags &= ~NKR_PENDINTR;
return 0;
}
@@ -787,9 +1175,8 @@ generic_netmap_dtor(struct netmap_adapter *na)
if (prev_na != NULL) {
D("Released generic NA %p", gna);
- if_rele(ifp);
netmap_adapter_put(prev_na);
- if (na->ifp == NULL) {
+ if (nm_iszombie(na)) {
/*
* The driver has been removed without releasing
* the reference so we need to do it here.
@@ -797,9 +1184,13 @@ generic_netmap_dtor(struct netmap_adapter *na)
netmap_adapter_put(prev_na);
}
}
- WNA(ifp) = prev_na;
- D("Restored native NA %p", prev_na);
+ NM_ATTACH_NA(ifp, prev_na);
+ /*
+ * netmap_detach_common(), that it's called after this function,
+ * overrides WNA(ifp) if na->ifp is not NULL.
+ */
na->ifp = NULL;
+ D("Restored native NA %p", prev_na);
}
/*
@@ -823,7 +1214,7 @@ generic_netmap_attach(struct ifnet *ifp)
num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
- generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */
+ nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */
ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc);
if (num_tx_desc == 0 || num_rx_desc == 0) {
D("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc);
@@ -855,12 +1246,23 @@ generic_netmap_attach(struct ifnet *ifp)
ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
ifp->num_rx_queues, ifp->real_num_rx_queues);
- generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
+ nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
retval = netmap_attach_common(na);
if (retval) {
free(gna, M_DEVBUF);
+ return retval;
}
+ gna->prev = NA(ifp); /* save old na */
+ if (gna->prev != NULL) {
+ netmap_adapter_get(gna->prev);
+ }
+ NM_ATTACH_NA(ifp, na);
+
+ nm_os_generic_set_features(gna);
+
+ D("Created generic NA %p (prev %p)", gna, gna->prev);
+
return retval;
}