aboutsummaryrefslogtreecommitdiff
path: root/sys/dev/netmap
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2016-10-21 16:29:40 +0000
committerDimitry Andric <dim@FreeBSD.org>2016-10-21 16:29:40 +0000
commit5763f79695f9b1ffacce55a8594cb7be08c3f31c (patch)
tree67ab054bf023a10a12dd0403739ccb46ab122278 /sys/dev/netmap
parenta0e610c43975ca0ec0bfc7d1df88d8b7a3cb871c (diff)
parent4b83a776069610210759fa1ddf89e67cc7b8a9a1 (diff)
Notes
Diffstat (limited to 'sys/dev/netmap')
-rw-r--r--sys/dev/netmap/if_ixl_netmap.h4
-rw-r--r--sys/dev/netmap/if_lem_netmap.h205
-rw-r--r--sys/dev/netmap/if_ptnet.c2283
-rw-r--r--sys/dev/netmap/if_vtnet_netmap.h4
-rw-r--r--sys/dev/netmap/ixgbe_netmap.h21
-rw-r--r--sys/dev/netmap/netmap.c1253
-rw-r--r--sys/dev/netmap/netmap_freebsd.c767
-rw-r--r--sys/dev/netmap/netmap_generic.c953
-rw-r--r--sys/dev/netmap/netmap_kern.h660
-rw-r--r--sys/dev/netmap/netmap_mbq.c9
-rw-r--r--sys/dev/netmap/netmap_mbq.h18
-rw-r--r--sys/dev/netmap/netmap_mem2.c930
-rw-r--r--sys/dev/netmap/netmap_mem2.h20
-rw-r--r--sys/dev/netmap/netmap_monitor.c118
-rw-r--r--sys/dev/netmap/netmap_offloadings.c260
-rw-r--r--sys/dev/netmap/netmap_pipe.c158
-rw-r--r--sys/dev/netmap/netmap_pt.c1438
-rw-r--r--sys/dev/netmap/netmap_vale.c667
18 files changed, 8133 insertions, 1635 deletions
diff --git a/sys/dev/netmap/if_ixl_netmap.h b/sys/dev/netmap/if_ixl_netmap.h
index 2c7f9be541b3..223dc06e36ab 100644
--- a/sys/dev/netmap/if_ixl_netmap.h
+++ b/sys/dev/netmap/if_ixl_netmap.h
@@ -59,7 +59,7 @@ extern int ixl_rx_miss, ixl_rx_miss_bufs, ixl_crcstrip;
/*
* device-specific sysctl variables:
*
- * ixl_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
+ * ixl_crcstrip: 0: NIC keeps CRC in rx frames, 1: NIC strips it (default).
* During regular operations the CRC is stripped, but on some
* hardware reception of frames not multiple of 64 is slower,
* so using crcstrip=0 helps in benchmarks.
@@ -73,7 +73,7 @@ SYSCTL_DECL(_dev_netmap);
*/
#if 0
SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_crcstrip,
- CTLFLAG_RW, &ixl_crcstrip, 1, "strip CRC on rx frames");
+ CTLFLAG_RW, &ixl_crcstrip, 1, "NIC strips CRC on rx frames");
#endif
SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_rx_miss,
CTLFLAG_RW, &ixl_rx_miss, 0, "potentially missed rx intr");
diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h
index 0ec9b1346609..91c637a8b3f8 100644
--- a/sys/dev/netmap/if_lem_netmap.h
+++ b/sys/dev/netmap/if_lem_netmap.h
@@ -35,12 +35,8 @@
#include <net/netmap.h>
#include <sys/selinfo.h>
-#include <vm/vm.h>
-#include <vm/pmap.h> /* vtophys ? */
#include <dev/netmap/netmap_kern.h>
-extern int netmap_adaptive_io;
-
/*
* Register/unregister. We are already under netmap lock.
*/
@@ -81,6 +77,22 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff)
}
+static void
+lem_netmap_intr(struct netmap_adapter *na, int onoff)
+{
+ struct ifnet *ifp = na->ifp;
+ struct adapter *adapter = ifp->if_softc;
+
+ EM_CORE_LOCK(adapter);
+ if (onoff) {
+ lem_enable_intr(adapter);
+ } else {
+ lem_disable_intr(adapter);
+ }
+ EM_CORE_UNLOCK(adapter);
+}
+
+
/*
* Reconcile kernel and user view of the transmit ring.
*/
@@ -99,10 +111,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
-#ifdef NIC_PARAVIRT
- struct paravirt_csb *csb = adapter->csb;
- uint64_t *csbd = (uint64_t *)(csb + 1);
-#endif /* NIC_PARAVIRT */
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -113,19 +121,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags)
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
-#ifdef NIC_PARAVIRT
- int do_kick = 0;
- uint64_t t = 0; // timestamp
- int n = head - nm_i;
- if (n < 0)
- n += lim + 1;
- if (csb) {
- t = rdtsc(); /* last timestamp */
- csbd[16] += t - csbd[0]; /* total Wg */
- csbd[17] += n; /* Wg count */
- csbd[0] = t;
- }
-#endif /* NIC_PARAVIRT */
nic_i = netmap_idx_k2n(kring, nm_i);
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
@@ -166,38 +161,8 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags)
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-#ifdef NIC_PARAVIRT
- /* set unconditionally, then also kick if needed */
- if (csb) {
- t = rdtsc();
- if (csb->host_need_txkick == 2) {
- /* can compute an update of delta */
- int64_t delta = t - csbd[3];
- if (delta < 0)
- delta = -delta;
- if (csbd[8] == 0 || delta < csbd[8]) {
- csbd[8] = delta;
- csbd[9]++;
- }
- csbd[10]++;
- }
- csb->guest_tdt = nic_i;
- csbd[18] += t - csbd[0]; // total wp
- csbd[19] += n;
- }
- if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
- do_kick = 1;
- if (do_kick)
-#endif /* NIC_PARAVIRT */
/* (re)start the tx unit up to slot nic_i (excluded) */
E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
-#ifdef NIC_PARAVIRT
- if (do_kick) {
- uint64_t t1 = rdtsc();
- csbd[20] += t1 - t; // total Np
- csbd[21]++;
- }
-#endif /* NIC_PARAVIRT */
}
/*
@@ -206,93 +171,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags)
if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
kring->last_reclaim = ticks;
/* record completed transmissions using TDH */
-#ifdef NIC_PARAVIRT
- /* host updates tdh unconditionally, and we have
- * no side effects on reads, so we can read from there
- * instead of exiting.
- */
- if (csb) {
- static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
- u_int x = adapter->next_tx_to_clean;
- csbd[19]++; // XXX count reclaims
- nic_i = csb->host_tdh;
- if (csb->guest_csb_on) {
- if (nic_i == x) {
- bad++;
- csbd[24]++; // failed reclaims
- /* no progress, request kick and retry */
- csb->guest_need_txkick = 1;
- mb(); // XXX barrier
- nic_i = csb->host_tdh;
- } else {
- good++;
- }
- if (nic_i != x) {
- csb->guest_need_txkick = 2;
- if (nic_i == csb->guest_tdt)
- drain++;
- else
- nodrain++;
-#if 1
- if (netmap_adaptive_io) {
- /* new mechanism: last half ring (or so)
- * released one slot at a time.
- * This effectively makes the system spin.
- *
- * Take next_to_clean + 1 as a reference.
- * tdh must be ahead or equal
- * On entry, the logical order is
- * x < tdh = nic_i
- * We first push tdh up to avoid wraps.
- * The limit is tdh-ll (half ring).
- * if tdh-256 < x we report x;
- * else we report tdh-256
- */
- u_int tdh = nic_i;
- u_int ll = csbd[15];
- u_int delta = lim/8;
- if (netmap_adaptive_io == 2 || ll > delta)
- csbd[15] = ll = delta;
- else if (netmap_adaptive_io == 1 && ll > 1) {
- csbd[15]--;
- }
-
- if (nic_i >= kring->nkr_num_slots) {
- RD(5, "bad nic_i %d on input", nic_i);
- }
- x = nm_next(x, lim);
- if (tdh < x)
- tdh += lim + 1;
- if (tdh <= x + ll) {
- nic_i = x;
- csbd[25]++; //report n + 1;
- } else {
- tdh = nic_i;
- if (tdh < ll)
- tdh += lim + 1;
- nic_i = tdh - ll;
- csbd[26]++; // report tdh - ll
- }
- }
-#endif
- } else {
- /* we stop, count whether we are idle or not */
- int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
- csbd[27+ csb->host_need_txkick]++;
- if (netmap_adaptive_io == 1) {
- if (bh_active && csbd[15] > 1)
- csbd[15]--;
- else if (!bh_active && csbd[15] < lim/2)
- csbd[15]++;
- }
- bad--;
- fail++;
- }
- }
- RD(1, "drain %d nodrain %d good %d retry %d fail %d",
- drain, nodrain, good, bad, fail);
- } else
-#endif /* !NIC_PARAVIRT */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
@@ -324,21 +202,10 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
-#ifdef NIC_PARAVIRT
- struct paravirt_csb *csb = adapter->csb;
- uint32_t csb_mode = csb && csb->guest_csb_on;
- uint32_t do_host_rxkick = 0;
-#endif /* NIC_PARAVIRT */
if (head > lim)
return netmap_ring_reinit(kring);
-#ifdef NIC_PARAVIRT
- if (csb_mode) {
- force_update = 1;
- csb->guest_need_rxkick = 0;
- }
-#endif /* NIC_PARAVIRT */
/* XXX check sync modes */
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -357,23 +224,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags)
uint32_t staterr = le32toh(curr->status);
int len;
-#ifdef NIC_PARAVIRT
- if (csb_mode) {
- if ((staterr & E1000_RXD_STAT_DD) == 0) {
- /* don't bother to retry if more than 1 pkt */
- if (n > 1)
- break;
- csb->guest_need_rxkick = 1;
- wmb();
- staterr = le32toh(curr->status);
- if ((staterr & E1000_RXD_STAT_DD) == 0) {
- break;
- } else { /* we are good */
- csb->guest_need_rxkick = 0;
- }
- }
- } else
-#endif /* NIC_PARAVIRT */
if ((staterr & E1000_RXD_STAT_DD) == 0)
break;
len = le16toh(curr->length) - 4; // CRC
@@ -390,18 +240,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags)
nic_i = nm_next(nic_i, lim);
}
if (n) { /* update the state variables */
-#ifdef NIC_PARAVIRT
- if (csb_mode) {
- if (n > 1) {
- /* leave one spare buffer so we avoid rxkicks */
- nm_i = nm_prev(nm_i, lim);
- nic_i = nm_prev(nic_i, lim);
- n--;
- } else {
- csb->guest_need_rxkick = 1;
- }
- }
-#endif /* NIC_PARAVIRT */
ND("%d new packets at nic %d nm %d tail %d",
n,
adapter->next_rx_desc_to_check,
@@ -440,10 +278,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags)
curr->status = 0;
bus_dmamap_sync(adapter->rxtag, rxbuf->map,
BUS_DMASYNC_PREREAD);
-#ifdef NIC_PARAVIRT
- if (csb_mode && csb->host_rxkick_at == nic_i)
- do_host_rxkick = 1;
-#endif /* NIC_PARAVIRT */
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
@@ -455,12 +289,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags)
* so move nic_i back by one unit
*/
nic_i = nm_prev(nic_i, lim);
-#ifdef NIC_PARAVIRT
- /* set unconditionally, then also kick if needed */
- if (csb)
- csb->guest_rdt = nic_i;
- if (!csb_mode || do_host_rxkick)
-#endif /* NIC_PARAVIRT */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i);
}
@@ -486,6 +314,7 @@ lem_netmap_attach(struct adapter *adapter)
na.nm_rxsync = lem_netmap_rxsync;
na.nm_register = lem_netmap_reg;
na.num_tx_rings = na.num_rx_rings = 1;
+ na.nm_intr = lem_netmap_intr;
netmap_attach(&na);
}
diff --git a/sys/dev/netmap/if_ptnet.c b/sys/dev/netmap/if_ptnet.c
new file mode 100644
index 000000000000..90a90e984a5a
--- /dev/null
+++ b/sys/dev/netmap/if_ptnet.c
@@ -0,0 +1,2283 @@
+/*-
+ * Copyright (c) 2016, Vincenzo Maffione
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* Driver for ptnet paravirtualized network device. */
+
+#include <sys/cdefs.h>
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/taskqueue.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <machine/smp.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_arp.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_media.h>
+#include <net/if_vlan_var.h>
+#include <net/bpf.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <netinet/sctp.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/selinfo.h>
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <net/netmap_virt.h>
+#include <dev/netmap/netmap_mem2.h>
+#include <dev/virtio/network/virtio_net.h>
+
+#ifndef PTNET_CSB_ALLOC
+#error "No support for on-device CSB"
+#endif
+
+#ifndef INET
+#error "INET not defined, cannot support offloadings"
+#endif
+
+#if __FreeBSD_version >= 1100000
+static uint64_t ptnet_get_counter(if_t, ift_counter);
+#else
+typedef struct ifnet *if_t;
+#define if_getsoftc(_ifp) (_ifp)->if_softc
+#endif
+
+//#define PTNETMAP_STATS
+//#define DEBUG
+#ifdef DEBUG
+#define DBG(x) x
+#else /* !DEBUG */
+#define DBG(x)
+#endif /* !DEBUG */
+
+extern int ptnet_vnet_hdr; /* Tunable parameter */
+
+struct ptnet_softc;
+
+struct ptnet_queue_stats {
+ uint64_t packets; /* if_[io]packets */
+ uint64_t bytes; /* if_[io]bytes */
+ uint64_t errors; /* if_[io]errors */
+ uint64_t iqdrops; /* if_iqdrops */
+ uint64_t mcasts; /* if_[io]mcasts */
+#ifdef PTNETMAP_STATS
+ uint64_t intrs;
+ uint64_t kicks;
+#endif /* PTNETMAP_STATS */
+};
+
+struct ptnet_queue {
+ struct ptnet_softc *sc;
+ struct resource *irq;
+ void *cookie;
+ int kring_id;
+ struct ptnet_ring *ptring;
+ unsigned int kick;
+ struct mtx lock;
+ struct buf_ring *bufring; /* for TX queues */
+ struct ptnet_queue_stats stats;
+#ifdef PTNETMAP_STATS
+ struct ptnet_queue_stats last_stats;
+#endif /* PTNETMAP_STATS */
+ struct taskqueue *taskq;
+ struct task task;
+ char lock_name[16];
+};
+
+#define PTNET_Q_LOCK(_pq) mtx_lock(&(_pq)->lock)
+#define PTNET_Q_TRYLOCK(_pq) mtx_trylock(&(_pq)->lock)
+#define PTNET_Q_UNLOCK(_pq) mtx_unlock(&(_pq)->lock)
+
+struct ptnet_softc {
+ device_t dev;
+ if_t ifp;
+ struct ifmedia media;
+ struct mtx lock;
+ char lock_name[16];
+ char hwaddr[ETHER_ADDR_LEN];
+
+ /* Mirror of PTFEAT register. */
+ uint32_t ptfeatures;
+ unsigned int vnet_hdr_len;
+
+ /* PCI BARs support. */
+ struct resource *iomem;
+ struct resource *msix_mem;
+
+ unsigned int num_rings;
+ unsigned int num_tx_rings;
+ struct ptnet_queue *queues;
+ struct ptnet_queue *rxqueues;
+ struct ptnet_csb *csb;
+
+ unsigned int min_tx_space;
+
+ struct netmap_pt_guest_adapter *ptna;
+
+ struct callout tick;
+#ifdef PTNETMAP_STATS
+ struct timeval last_ts;
+#endif /* PTNETMAP_STATS */
+};
+
+#define PTNET_CORE_LOCK(_sc) mtx_lock(&(_sc)->lock)
+#define PTNET_CORE_UNLOCK(_sc) mtx_unlock(&(_sc)->lock)
+
+static int ptnet_probe(device_t);
+static int ptnet_attach(device_t);
+static int ptnet_detach(device_t);
+static int ptnet_suspend(device_t);
+static int ptnet_resume(device_t);
+static int ptnet_shutdown(device_t);
+
+static void ptnet_init(void *opaque);
+static int ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data);
+static int ptnet_init_locked(struct ptnet_softc *sc);
+static int ptnet_stop(struct ptnet_softc *sc);
+static int ptnet_transmit(if_t ifp, struct mbuf *m);
+static int ptnet_drain_transmit_queue(struct ptnet_queue *pq,
+ unsigned int budget,
+ bool may_resched);
+static void ptnet_qflush(if_t ifp);
+static void ptnet_tx_task(void *context, int pending);
+
+static int ptnet_media_change(if_t ifp);
+static void ptnet_media_status(if_t ifp, struct ifmediareq *ifmr);
+#ifdef PTNETMAP_STATS
+static void ptnet_tick(void *opaque);
+#endif
+
+static int ptnet_irqs_init(struct ptnet_softc *sc);
+static void ptnet_irqs_fini(struct ptnet_softc *sc);
+
+static uint32_t ptnet_nm_ptctl(if_t ifp, uint32_t cmd);
+static int ptnet_nm_config(struct netmap_adapter *na, unsigned *txr,
+ unsigned *txd, unsigned *rxr, unsigned *rxd);
+static void ptnet_update_vnet_hdr(struct ptnet_softc *sc);
+static int ptnet_nm_register(struct netmap_adapter *na, int onoff);
+static int ptnet_nm_txsync(struct netmap_kring *kring, int flags);
+static int ptnet_nm_rxsync(struct netmap_kring *kring, int flags);
+
+static void ptnet_tx_intr(void *opaque);
+static void ptnet_rx_intr(void *opaque);
+
+static unsigned ptnet_rx_discard(struct netmap_kring *kring,
+ unsigned int head);
+static int ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget,
+ bool may_resched);
+static void ptnet_rx_task(void *context, int pending);
+
+#ifdef DEVICE_POLLING
+static poll_handler_t ptnet_poll;
+#endif
+
+static device_method_t ptnet_methods[] = {
+ DEVMETHOD(device_probe, ptnet_probe),
+ DEVMETHOD(device_attach, ptnet_attach),
+ DEVMETHOD(device_detach, ptnet_detach),
+ DEVMETHOD(device_suspend, ptnet_suspend),
+ DEVMETHOD(device_resume, ptnet_resume),
+ DEVMETHOD(device_shutdown, ptnet_shutdown),
+ DEVMETHOD_END
+};
+
+static driver_t ptnet_driver = {
+ "ptnet",
+ ptnet_methods,
+ sizeof(struct ptnet_softc)
+};
+
+/* We use (SI_ORDER_MIDDLE+2) here, see DEV_MODULE_ORDERED() invocation. */
+static devclass_t ptnet_devclass;
+DRIVER_MODULE_ORDERED(ptnet, pci, ptnet_driver, ptnet_devclass,
+ NULL, NULL, SI_ORDER_MIDDLE + 2);
+
+static int
+ptnet_probe(device_t dev)
+{
+ if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID ||
+ pci_get_device(dev) != PTNETMAP_PCI_NETIF_ID) {
+ return (ENXIO);
+ }
+
+ device_set_desc(dev, "ptnet network adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static inline void ptnet_kick(struct ptnet_queue *pq)
+{
+#ifdef PTNETMAP_STATS
+ pq->stats.kicks ++;
+#endif /* PTNETMAP_STATS */
+ bus_write_4(pq->sc->iomem, pq->kick, 0);
+}
+
+#define PTNET_BUF_RING_SIZE 4096
+#define PTNET_RX_BUDGET 512
+#define PTNET_RX_BATCH 1
+#define PTNET_TX_BUDGET 512
+#define PTNET_TX_BATCH 64
+#define PTNET_HDR_SIZE sizeof(struct virtio_net_hdr_mrg_rxbuf)
+#define PTNET_MAX_PKT_SIZE 65536
+
+#define PTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP)
+#define PTNET_CSUM_OFFLOAD_IPV6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 |\
+ CSUM_SCTP_IPV6)
+#define PTNET_ALL_OFFLOAD (CSUM_TSO | PTNET_CSUM_OFFLOAD |\
+ PTNET_CSUM_OFFLOAD_IPV6)
+
+static int
+ptnet_attach(device_t dev)
+{
+ uint32_t ptfeatures = PTNETMAP_F_BASE;
+ unsigned int num_rx_rings, num_tx_rings;
+ struct netmap_adapter na_arg;
+ unsigned int nifp_offset;
+ struct ptnet_softc *sc;
+ if_t ifp;
+ uint32_t macreg;
+ int err, rid;
+ int i;
+
+ sc = device_get_softc(dev);
+ sc->dev = dev;
+
+ /* Setup PCI resources. */
+ pci_enable_busmaster(dev);
+
+ rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR);
+ sc->iomem = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
+ RF_ACTIVE);
+ if (sc->iomem == NULL) {
+ device_printf(dev, "Failed to map I/O BAR\n");
+ return (ENXIO);
+ }
+
+ /* Check if we are supported by the hypervisor. If not,
+ * bail out immediately. */
+ if (ptnet_vnet_hdr) {
+ ptfeatures |= PTNETMAP_F_VNET_HDR;
+ }
+ bus_write_4(sc->iomem, PTNET_IO_PTFEAT, ptfeatures); /* wanted */
+ ptfeatures = bus_read_4(sc->iomem, PTNET_IO_PTFEAT); /* acked */
+ if (!(ptfeatures & PTNETMAP_F_BASE)) {
+ device_printf(dev, "Hypervisor does not support netmap "
+ "passthorugh\n");
+ err = ENXIO;
+ goto err_path;
+ }
+ sc->ptfeatures = ptfeatures;
+
+ /* Allocate CSB and carry out CSB allocation protocol (CSBBAH first,
+ * then CSBBAL). */
+ sc->csb = malloc(sizeof(struct ptnet_csb), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (sc->csb == NULL) {
+ device_printf(dev, "Failed to allocate CSB\n");
+ err = ENOMEM;
+ goto err_path;
+ }
+
+ {
+ /*
+ * We use uint64_t rather than vm_paddr_t since we
+ * need 64 bit addresses even on 32 bit platforms.
+ */
+ uint64_t paddr = vtophys(sc->csb);
+
+ bus_write_4(sc->iomem, PTNET_IO_CSBBAH,
+ (paddr >> 32) & 0xffffffff);
+ bus_write_4(sc->iomem, PTNET_IO_CSBBAL, paddr & 0xffffffff);
+ }
+
+ num_tx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS);
+ num_rx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS);
+ sc->num_rings = num_tx_rings + num_rx_rings;
+ sc->num_tx_rings = num_tx_rings;
+
+ /* Allocate and initialize per-queue data structures. */
+ sc->queues = malloc(sizeof(struct ptnet_queue) * sc->num_rings,
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->queues == NULL) {
+ err = ENOMEM;
+ goto err_path;
+ }
+ sc->rxqueues = sc->queues + num_tx_rings;
+
+ for (i = 0; i < sc->num_rings; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ pq->sc = sc;
+ pq->kring_id = i;
+ pq->kick = PTNET_IO_KICK_BASE + 4 * i;
+ pq->ptring = sc->csb->rings + i;
+ snprintf(pq->lock_name, sizeof(pq->lock_name), "%s-%d",
+ device_get_nameunit(dev), i);
+ mtx_init(&pq->lock, pq->lock_name, NULL, MTX_DEF);
+ if (i >= num_tx_rings) {
+ /* RX queue: fix kring_id. */
+ pq->kring_id -= num_tx_rings;
+ } else {
+ /* TX queue: allocate buf_ring. */
+ pq->bufring = buf_ring_alloc(PTNET_BUF_RING_SIZE,
+ M_DEVBUF, M_NOWAIT, &pq->lock);
+ if (pq->bufring == NULL) {
+ err = ENOMEM;
+ goto err_path;
+ }
+ }
+ }
+
+ sc->min_tx_space = 64; /* Safe initial value. */
+
+ err = ptnet_irqs_init(sc);
+ if (err) {
+ goto err_path;
+ }
+
+ /* Setup Ethernet interface. */
+ sc->ifp = ifp = if_alloc(IFT_ETHER);
+ if (ifp == NULL) {
+ device_printf(dev, "Failed to allocate ifnet\n");
+ err = ENOMEM;
+ goto err_path;
+ }
+
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+ ifp->if_baudrate = IF_Gbps(10);
+ ifp->if_softc = sc;
+ ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
+ ifp->if_init = ptnet_init;
+ ifp->if_ioctl = ptnet_ioctl;
+#if __FreeBSD_version >= 1100000
+ ifp->if_get_counter = ptnet_get_counter;
+#endif
+ ifp->if_transmit = ptnet_transmit;
+ ifp->if_qflush = ptnet_qflush;
+
+ ifmedia_init(&sc->media, IFM_IMASK, ptnet_media_change,
+ ptnet_media_status);
+ ifmedia_add(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL);
+ ifmedia_set(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX);
+
+ macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_HI);
+ sc->hwaddr[0] = (macreg >> 8) & 0xff;
+ sc->hwaddr[1] = macreg & 0xff;
+ macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_LO);
+ sc->hwaddr[2] = (macreg >> 24) & 0xff;
+ sc->hwaddr[3] = (macreg >> 16) & 0xff;
+ sc->hwaddr[4] = (macreg >> 8) & 0xff;
+ sc->hwaddr[5] = macreg & 0xff;
+
+ ether_ifattach(ifp, sc->hwaddr);
+
+ ifp->if_hdrlen = sizeof(struct ether_vlan_header);
+ ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU;
+
+ if (sc->ptfeatures & PTNETMAP_F_VNET_HDR) {
+ /* Similarly to what the vtnet driver does, we can emulate
+ * VLAN offloadings by inserting and removing the 802.1Q
+ * header during transmit and receive. We are then able
+ * to do checksum offloading of VLAN frames. */
+ ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6
+ | IFCAP_VLAN_HWCSUM
+ | IFCAP_TSO | IFCAP_LRO
+ | IFCAP_VLAN_HWTSO
+ | IFCAP_VLAN_HWTAGGING;
+ }
+
+ ifp->if_capenable = ifp->if_capabilities;
+#ifdef DEVICE_POLLING
+ /* Don't enable polling by default. */
+ ifp->if_capabilities |= IFCAP_POLLING;
+#endif
+ snprintf(sc->lock_name, sizeof(sc->lock_name),
+ "%s", device_get_nameunit(dev));
+ mtx_init(&sc->lock, sc->lock_name, "ptnet core lock", MTX_DEF);
+ callout_init_mtx(&sc->tick, &sc->lock, 0);
+
+ /* Prepare a netmap_adapter struct instance to do netmap_attach(). */
+ nifp_offset = bus_read_4(sc->iomem, PTNET_IO_NIFP_OFS);
+ memset(&na_arg, 0, sizeof(na_arg));
+ na_arg.ifp = ifp;
+ na_arg.num_tx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS);
+ na_arg.num_rx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS);
+ na_arg.num_tx_rings = num_tx_rings;
+ na_arg.num_rx_rings = num_rx_rings;
+ na_arg.nm_config = ptnet_nm_config;
+ na_arg.nm_krings_create = ptnet_nm_krings_create;
+ na_arg.nm_krings_delete = ptnet_nm_krings_delete;
+ na_arg.nm_dtor = ptnet_nm_dtor;
+ na_arg.nm_register = ptnet_nm_register;
+ na_arg.nm_txsync = ptnet_nm_txsync;
+ na_arg.nm_rxsync = ptnet_nm_rxsync;
+
+ netmap_pt_guest_attach(&na_arg, sc->csb, nifp_offset, ptnet_nm_ptctl);
+
+ /* Now a netmap adapter for this ifp has been allocated, and it
+ * can be accessed through NA(ifp). We also have to initialize the CSB
+ * pointer. */
+ sc->ptna = (struct netmap_pt_guest_adapter *)NA(ifp);
+
+ /* If virtio-net header was negotiated, set the virt_hdr_len field in
+ * the netmap adapter, to inform users that this netmap adapter requires
+ * the application to deal with the headers. */
+ ptnet_update_vnet_hdr(sc);
+
+ device_printf(dev, "%s() completed\n", __func__);
+
+ return (0);
+
+err_path:
+ ptnet_detach(dev);
+ return err;
+}
+
+static int
+ptnet_detach(device_t dev)
+{
+ struct ptnet_softc *sc = device_get_softc(dev);
+ int i;
+
+#ifdef DEVICE_POLLING
+ if (sc->ifp->if_capenable & IFCAP_POLLING) {
+ ether_poll_deregister(sc->ifp);
+ }
+#endif
+ callout_drain(&sc->tick);
+
+ if (sc->queues) {
+ /* Drain taskqueues before calling if_detach. */
+ for (i = 0; i < sc->num_rings; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ if (pq->taskq) {
+ taskqueue_drain(pq->taskq, &pq->task);
+ }
+ }
+ }
+
+ if (sc->ifp) {
+ ether_ifdetach(sc->ifp);
+
+ /* Uninitialize netmap adapters for this device. */
+ netmap_detach(sc->ifp);
+
+ ifmedia_removeall(&sc->media);
+ if_free(sc->ifp);
+ sc->ifp = NULL;
+ }
+
+ ptnet_irqs_fini(sc);
+
+ if (sc->csb) {
+ bus_write_4(sc->iomem, PTNET_IO_CSBBAH, 0);
+ bus_write_4(sc->iomem, PTNET_IO_CSBBAL, 0);
+ free(sc->csb, M_DEVBUF);
+ sc->csb = NULL;
+ }
+
+ if (sc->queues) {
+ for (i = 0; i < sc->num_rings; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ if (mtx_initialized(&pq->lock)) {
+ mtx_destroy(&pq->lock);
+ }
+ if (pq->bufring != NULL) {
+ buf_ring_free(pq->bufring, M_DEVBUF);
+ }
+ }
+ free(sc->queues, M_DEVBUF);
+ sc->queues = NULL;
+ }
+
+ if (sc->iomem) {
+ bus_release_resource(dev, SYS_RES_IOPORT,
+ PCIR_BAR(PTNETMAP_IO_PCI_BAR), sc->iomem);
+ sc->iomem = NULL;
+ }
+
+ mtx_destroy(&sc->lock);
+
+ device_printf(dev, "%s() completed\n", __func__);
+
+ return (0);
+}
+
+static int
+ptnet_suspend(device_t dev)
+{
+ struct ptnet_softc *sc;
+
+ sc = device_get_softc(dev);
+ (void)sc;
+
+ return (0);
+}
+
+static int
+ptnet_resume(device_t dev)
+{
+ struct ptnet_softc *sc;
+
+ sc = device_get_softc(dev);
+ (void)sc;
+
+ return (0);
+}
+
+static int
+ptnet_shutdown(device_t dev)
+{
+ /*
+ * Suspend already does all of what we need to
+ * do here; we just never expect to be resumed.
+ */
+ return (ptnet_suspend(dev));
+}
+
+static int
+ptnet_irqs_init(struct ptnet_softc *sc)
+{
+ int rid = PCIR_BAR(PTNETMAP_MSIX_PCI_BAR);
+ int nvecs = sc->num_rings;
+ device_t dev = sc->dev;
+ int err = ENOSPC;
+ int cpu_cur;
+ int i;
+
+ if (pci_find_cap(dev, PCIY_MSIX, NULL) != 0) {
+ device_printf(dev, "Could not find MSI-X capability\n");
+ return (ENXIO);
+ }
+
+ sc->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
+ &rid, RF_ACTIVE);
+ if (sc->msix_mem == NULL) {
+ device_printf(dev, "Failed to allocate MSIX PCI BAR\n");
+ return (ENXIO);
+ }
+
+ if (pci_msix_count(dev) < nvecs) {
+ device_printf(dev, "Not enough MSI-X vectors\n");
+ goto err_path;
+ }
+
+ err = pci_alloc_msix(dev, &nvecs);
+ if (err) {
+ device_printf(dev, "Failed to allocate MSI-X vectors\n");
+ goto err_path;
+ }
+
+ for (i = 0; i < nvecs; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ rid = i + 1;
+ pq->irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
+ RF_ACTIVE);
+ if (pq->irq == NULL) {
+ device_printf(dev, "Failed to allocate interrupt "
+ "for queue #%d\n", i);
+ err = ENOSPC;
+ goto err_path;
+ }
+ }
+
+ cpu_cur = CPU_FIRST();
+ for (i = 0; i < nvecs; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+ void (*handler)(void *) = ptnet_tx_intr;
+
+ if (i >= sc->num_tx_rings) {
+ handler = ptnet_rx_intr;
+ }
+ err = bus_setup_intr(dev, pq->irq, INTR_TYPE_NET | INTR_MPSAFE,
+ NULL /* intr_filter */, handler,
+ pq, &pq->cookie);
+ if (err) {
+ device_printf(dev, "Failed to register intr handler "
+ "for queue #%d\n", i);
+ goto err_path;
+ }
+
+ bus_describe_intr(dev, pq->irq, pq->cookie, "q%d", i);
+#if 0
+ bus_bind_intr(sc->dev, pq->irq, cpu_cur);
+#endif
+ cpu_cur = CPU_NEXT(cpu_cur);
+ }
+
+ device_printf(dev, "Allocated %d MSI-X vectors\n", nvecs);
+
+ cpu_cur = CPU_FIRST();
+ for (i = 0; i < nvecs; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+ static void (*handler)(void *context, int pending);
+
+ handler = (i < sc->num_tx_rings) ? ptnet_tx_task : ptnet_rx_task;
+
+ TASK_INIT(&pq->task, 0, handler, pq);
+ pq->taskq = taskqueue_create_fast("ptnet_queue", M_NOWAIT,
+ taskqueue_thread_enqueue, &pq->taskq);
+ taskqueue_start_threads(&pq->taskq, 1, PI_NET, "%s-pq-%d",
+ device_get_nameunit(sc->dev), cpu_cur);
+ cpu_cur = CPU_NEXT(cpu_cur);
+ }
+
+ return 0;
+err_path:
+ ptnet_irqs_fini(sc);
+ return err;
+}
+
+static void
+ptnet_irqs_fini(struct ptnet_softc *sc)
+{
+ device_t dev = sc->dev;
+ int i;
+
+ for (i = 0; i < sc->num_rings; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ if (pq->taskq) {
+ taskqueue_free(pq->taskq);
+ pq->taskq = NULL;
+ }
+
+ if (pq->cookie) {
+ bus_teardown_intr(dev, pq->irq, pq->cookie);
+ pq->cookie = NULL;
+ }
+
+ if (pq->irq) {
+ bus_release_resource(dev, SYS_RES_IRQ, i + 1, pq->irq);
+ pq->irq = NULL;
+ }
+ }
+
+ if (sc->msix_mem) {
+ pci_release_msi(dev);
+
+ bus_release_resource(dev, SYS_RES_MEMORY,
+ PCIR_BAR(PTNETMAP_MSIX_PCI_BAR),
+ sc->msix_mem);
+ sc->msix_mem = NULL;
+ }
+}
+
+static void
+ptnet_init(void *opaque)
+{
+ struct ptnet_softc *sc = opaque;
+
+ PTNET_CORE_LOCK(sc);
+ ptnet_init_locked(sc);
+ PTNET_CORE_UNLOCK(sc);
+}
+
+static int
+ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data)
+{
+ struct ptnet_softc *sc = if_getsoftc(ifp);
+ device_t dev = sc->dev;
+ struct ifreq *ifr = (struct ifreq *)data;
+ int mask, err = 0;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS:
+ device_printf(dev, "SIOCSIFFLAGS %x\n", ifp->if_flags);
+ PTNET_CORE_LOCK(sc);
+ if (ifp->if_flags & IFF_UP) {
+ /* Network stack wants the iff to be up. */
+ err = ptnet_init_locked(sc);
+ } else {
+ /* Network stack wants the iff to be down. */
+ err = ptnet_stop(sc);
+ }
+ /* We don't need to do nothing to support IFF_PROMISC,
+ * since that is managed by the backend port. */
+ PTNET_CORE_UNLOCK(sc);
+ break;
+
+ case SIOCSIFCAP:
+ device_printf(dev, "SIOCSIFCAP %x %x\n",
+ ifr->ifr_reqcap, ifp->if_capenable);
+ mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+#ifdef DEVICE_POLLING
+ if (mask & IFCAP_POLLING) {
+ struct ptnet_queue *pq;
+ int i;
+
+ if (ifr->ifr_reqcap & IFCAP_POLLING) {
+ err = ether_poll_register(ptnet_poll, ifp);
+ if (err) {
+ break;
+ }
+ /* Stop queues and sync with taskqueues. */
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ for (i = 0; i < sc->num_rings; i++) {
+ pq = sc-> queues + i;
+ /* Make sure the worker sees the
+ * IFF_DRV_RUNNING down. */
+ PTNET_Q_LOCK(pq);
+ pq->ptring->guest_need_kick = 0;
+ PTNET_Q_UNLOCK(pq);
+ /* Wait for rescheduling to finish. */
+ if (pq->taskq) {
+ taskqueue_drain(pq->taskq,
+ &pq->task);
+ }
+ }
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ } else {
+ err = ether_poll_deregister(ifp);
+ for (i = 0; i < sc->num_rings; i++) {
+ pq = sc-> queues + i;
+ PTNET_Q_LOCK(pq);
+ pq->ptring->guest_need_kick = 1;
+ PTNET_Q_UNLOCK(pq);
+ }
+ }
+ }
+#endif /* DEVICE_POLLING */
+ ifp->if_capenable = ifr->ifr_reqcap;
+ break;
+
+ case SIOCSIFMTU:
+ /* We support any reasonable MTU. */
+ if (ifr->ifr_mtu < ETHERMIN ||
+ ifr->ifr_mtu > PTNET_MAX_PKT_SIZE) {
+ err = EINVAL;
+ } else {
+ PTNET_CORE_LOCK(sc);
+ ifp->if_mtu = ifr->ifr_mtu;
+ PTNET_CORE_UNLOCK(sc);
+ }
+ break;
+
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ err = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
+ break;
+
+ default:
+ err = ether_ioctl(ifp, cmd, data);
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * Bring the interface up. Must be called with the core lock held.
+ *
+ * Translates if_capenable into if_hwassist flags, finalizes the netmap
+ * memory allocator, creates krings/rings on the first backend
+ * registration, and registers the passed-through netmap adapter.
+ * On failure, the steps already performed are undone in reverse order
+ * through the goto ladder. Returns 0 on success or an errno value.
+ */
+static int
+ptnet_init_locked(struct ptnet_softc *sc)
+{
+	if_t ifp = sc->ifp;
+	struct netmap_adapter *na_dr = &sc->ptna->dr.up;
+	struct netmap_adapter *na_nm = &sc->ptna->hwup.up;
+	unsigned int nm_buf_size;
+	int ret;
+
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+		return 0; /* nothing to do */
+	}
+
+	device_printf(sc->dev, "%s\n", __func__);
+
+	/* Translate offload capabilities according to if_capenable. */
+	ifp->if_hwassist = 0;
+	if (ifp->if_capenable & IFCAP_TXCSUM)
+		ifp->if_hwassist |= PTNET_CSUM_OFFLOAD;
+	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+		ifp->if_hwassist |= PTNET_CSUM_OFFLOAD_IPV6;
+	if (ifp->if_capenable & IFCAP_TSO4)
+		ifp->if_hwassist |= CSUM_IP_TSO;
+	if (ifp->if_capenable & IFCAP_TSO6)
+		ifp->if_hwassist |= CSUM_IP6_TSO;
+
+	/*
+	 * Prepare the interface for netmap mode access.
+	 */
+	netmap_update_config(na_dr);
+
+	ret = netmap_mem_finalize(na_dr->nm_mem, na_dr);
+	if (ret) {
+		device_printf(sc->dev, "netmap_mem_finalize() failed\n");
+		return ret;
+	}
+
+	/* First interface registered on the backend: create the krings
+	 * and the netmap rings, and grab the buffer lookup table. */
+	if (sc->ptna->backend_regifs == 0) {
+		ret = ptnet_nm_krings_create(na_nm);
+		if (ret) {
+			device_printf(sc->dev, "ptnet_nm_krings_create() "
+					       "failed\n");
+			goto err_mem_finalize;
+		}
+
+		ret = netmap_mem_rings_create(na_dr);
+		if (ret) {
+			device_printf(sc->dev, "netmap_mem_rings_create() "
+					       "failed\n");
+			goto err_rings_create;
+		}
+
+		ret = netmap_mem_get_lut(na_dr->nm_mem, &na_dr->na_lut);
+		if (ret) {
+			device_printf(sc->dev, "netmap_mem_get_lut() "
+					       "failed\n");
+			goto err_get_lut;
+		}
+	}
+
+	ret = ptnet_nm_register(na_dr, 1 /* on */);
+	if (ret) {
+		goto err_register;
+	}
+
+	nm_buf_size = NETMAP_BUF_SIZE(na_dr);
+
+	/* Number of slots needed to fit a maximum-sized packet,
+	 * with some slack (vnet header, rounding). */
+	KASSERT(nm_buf_size > 0, ("Invalid netmap buffer size"));
+	sc->min_tx_space = PTNET_MAX_PKT_SIZE / nm_buf_size + 2;
+	device_printf(sc->dev, "%s: min_tx_space = %u\n", __func__,
+		      sc->min_tx_space);
+#ifdef PTNETMAP_STATS
+	callout_reset(&sc->tick, hz, ptnet_tick, sc);
+#endif
+
+	ifp->if_drv_flags |= IFF_DRV_RUNNING;
+
+	return 0;
+
+	/* Unwind in reverse order of the setup steps above. */
+err_register:
+	memset(&na_dr->na_lut, 0, sizeof(na_dr->na_lut));
+err_get_lut:
+	netmap_mem_rings_delete(na_dr);
+err_rings_create:
+	ptnet_nm_krings_delete(na_nm);
+err_mem_finalize:
+	netmap_mem_deref(na_dr->nm_mem, na_dr);
+
+	return ret;
+}
+
+/*
+ * Bring the interface down and release the netmap resources acquired
+ * by ptnet_init_locked(). To be called under core lock.
+ * Always returns 0.
+ */
+static int
+ptnet_stop(struct ptnet_softc *sc)
+{
+	if_t ifp = sc->ifp;
+	struct netmap_adapter *na_dr = &sc->ptna->dr.up;
+	struct netmap_adapter *na_nm = &sc->ptna->hwup.up;
+	int i;
+
+	device_printf(sc->dev, "%s\n", __func__);
+
+	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+		return 0; /* nothing to do */
+	}
+
+	/* Clear the driver-ready flag, and synchronize with all the queues,
+	 * so that after this loop we are sure nobody is working anymore with
+	 * the device. This scheme is taken from the vtnet driver. */
+	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+	callout_stop(&sc->tick);
+	for (i = 0; i < sc->num_rings; i++) {
+		/* Empty lock/unlock acts as a barrier: any worker that
+		 * held the queue lock has finished and will observe
+		 * IFF_DRV_RUNNING cleared. */
+		PTNET_Q_LOCK(sc->queues + i);
+		PTNET_Q_UNLOCK(sc->queues + i);
+	}
+
+	ptnet_nm_register(na_dr, 0 /* off */);
+
+	/* Last interface on the backend: tear down rings and krings. */
+	if (sc->ptna->backend_regifs == 0) {
+		netmap_mem_rings_delete(na_dr);
+		ptnet_nm_krings_delete(na_nm);
+	}
+	netmap_mem_deref(na_dr->nm_mem, na_dr);
+
+	return 0;
+}
+
+/* if_qflush method: drop every packet still queued in the per-queue
+ * software bufrings, then run the generic interface flush. */
+static void
+ptnet_qflush(if_t ifp)
+{
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+	int qi;
+
+	for (qi = 0; qi < sc->num_rings; qi++) {
+		struct ptnet_queue *pq = sc->queues + qi;
+
+		PTNET_Q_LOCK(pq);
+		if (pq->bufring != NULL) {
+			struct mbuf *m;
+
+			while ((m = buf_ring_dequeue_sc(pq->bufring)) != NULL) {
+				m_freem(m);
+			}
+		}
+		PTNET_Q_UNLOCK(pq);
+	}
+
+	if_qflush(ifp);
+}
+
+/* ifmedia change callback: only Ethernet media is supported, so
+ * reject any other media type and otherwise do nothing. */
+static int
+ptnet_media_change(if_t ifp)
+{
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+
+	if (IFM_TYPE(sc->media.ifm_media) != IFM_ETHER)
+		return EINVAL;
+
+	return 0;
+}
+
+#if __FreeBSD_version >= 1100000
+/* if_get_counter method: fold the per-queue counters into a TX
+ * aggregate and an RX aggregate, then report the one requested.
+ * Queues [0, num_tx_rings) are TX queues, the remaining are RX. */
+static uint64_t
+ptnet_get_counter(if_t ifp, ift_counter cnt)
+{
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+	struct ptnet_queue_stats txstats, rxstats;
+	int i;
+
+	memset(&txstats, 0, sizeof(txstats));
+	memset(&rxstats, 0, sizeof(rxstats));
+	for (i = 0; i < sc->num_rings; i++) {
+		struct ptnet_queue *pq = sc->queues + i;
+		struct ptnet_queue_stats *acc =
+			(i < sc->num_tx_rings) ? &txstats : &rxstats;
+
+		acc->packets += pq->stats.packets;
+		acc->bytes += pq->stats.bytes;
+		acc->errors += pq->stats.errors;
+		acc->iqdrops += pq->stats.iqdrops;
+		acc->mcasts += pq->stats.mcasts;
+	}
+
+	switch (cnt) {
+	case IFCOUNTER_IPACKETS:
+		return (rxstats.packets);
+	case IFCOUNTER_IQDROPS:
+		return (rxstats.iqdrops);
+	case IFCOUNTER_IERRORS:
+		return (rxstats.errors);
+	case IFCOUNTER_OPACKETS:
+		return (txstats.packets);
+	case IFCOUNTER_OBYTES:
+		return (txstats.bytes);
+	case IFCOUNTER_OMCASTS:
+		return (txstats.mcasts);
+	default:
+		return (if_get_counter_default(ifp, cnt));
+	}
+}
+#endif
+
+
+#ifdef PTNETMAP_STATS
+/*
+ * Periodic statistics callout, armed by ptnet_init_locked() and
+ * stopped by ptnet_stop(). Called under core lock. For each queue it
+ * prints the packet/kick/interrupt deltas accumulated since the last
+ * tick, then reschedules itself one second (hz ticks) later.
+ */
+static void
+ptnet_tick(void *opaque)
+{
+	struct ptnet_softc *sc = opaque;
+	int i;
+
+	for (i = 0; i < sc->num_rings; i++) {
+		struct ptnet_queue *pq = sc->queues + i;
+		struct ptnet_queue_stats cur = pq->stats;
+		struct timeval now;
+		unsigned int delta;
+
+		/* Elapsed time since the previous tick, in milliseconds. */
+		microtime(&now);
+		delta = now.tv_usec - sc->last_ts.tv_usec +
+			(now.tv_sec - sc->last_ts.tv_sec) * 1000000;
+		delta /= 1000; /* in milliseconds */
+
+		if (delta == 0)
+			continue;
+
+		device_printf(sc->dev, "#%d[%u ms]:pkts %lu, kicks %lu, "
+			      "intr %lu\n", i, delta,
+			      (cur.packets - pq->last_stats.packets),
+			      (cur.kicks - pq->last_stats.kicks),
+			      (cur.intrs - pq->last_stats.intrs));
+		pq->last_stats = cur;
+	}
+	microtime(&sc->last_ts);
+	callout_schedule(&sc->tick, hz);
+}
+#endif /* PTNETMAP_STATS */
+
+/* ifmedia status callback. The backend netmap port is always open in
+ * netmap mode, so the link is reported as unconditionally valid and
+ * active, with a fixed 10GBASE-T full-duplex media type. */
+static void
+ptnet_media_status(if_t ifp, struct ifmediareq *ifmr)
+{
+	ifmr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
+	ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE;
+}
+
+/* Issue a ptnetmap control command through the PTCTL I/O register,
+ * then read its outcome back from the PTSTS register. */
+static uint32_t
+ptnet_nm_ptctl(if_t ifp, uint32_t cmd)
+{
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+	int result;
+
+	bus_write_4(sc->iomem, PTNET_IO_PTCTL, cmd);
+	result = bus_read_4(sc->iomem, PTNET_IO_PTSTS);
+	device_printf(sc->dev, "PTCTL %u, ret %u\n", cmd, result);
+
+	return result;
+}
+
+/* nm_config callback: report the ring geometry (number of rings and
+ * slots per ring) advertised by the device I/O registers. */
+static int
+ptnet_nm_config(struct netmap_adapter *na, unsigned *txr, unsigned *txd,
+		unsigned *rxr, unsigned *rxd)
+{
+	struct ptnet_softc *sc = if_getsoftc(na->ifp);
+
+	*txr = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS);
+	*txd = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS);
+	*rxr = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS);
+	*rxd = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS);
+
+	device_printf(sc->dev, "txr %u, rxr %u, txd %u, rxd %u\n",
+		      *txr, *rxr, *txd, *rxd);
+
+	return 0;
+}
+
+/*
+ * Re-initialize all kring and netmap ring pointers (head/cur/tail and
+ * the hw view) from the per-ring CSB shared with the host. Queues
+ * [0, num_tx_rings) map to TX krings, the remaining ones to RX krings.
+ *
+ * Fix: the ND() debug statements used a variable `t' that is not
+ * declared in this function (leftover from a for_rx_tx() variant), so
+ * they would not compile when ND is enabled; the bogus argument is
+ * dropped.
+ */
+static void
+ptnet_sync_from_csb(struct ptnet_softc *sc, struct netmap_adapter *na)
+{
+	int i;
+
+	/* Sync krings from the host, reading from
+	 * CSB. */
+	for (i = 0; i < sc->num_rings; i++) {
+		struct ptnet_ring *ptring = sc->queues[i].ptring;
+		struct netmap_kring *kring;
+
+		if (i < na->num_tx_rings) {
+			kring = na->tx_rings + i;
+		} else {
+			kring = na->rx_rings + i - na->num_tx_rings;
+		}
+		kring->rhead = kring->ring->head = ptring->head;
+		kring->rcur = kring->ring->cur = ptring->cur;
+		kring->nr_hwcur = ptring->hwcur;
+		kring->nr_hwtail = kring->rtail =
+			kring->ring->tail = ptring->hwtail;
+
+		ND("%d: csb {hc %u h %u c %u ht %u}", i,
+		   ptring->hwcur, ptring->head, ptring->cur,
+		   ptring->hwtail);
+		ND("%d: kring {hc %u rh %u rc %u h %u c %u ht %u rt %u t %u}",
+		   i, kring->nr_hwcur, kring->rhead, kring->rcur,
+		   kring->ring->head, kring->ring->cur, kring->nr_hwtail,
+		   kring->rtail, kring->ring->tail);
+	}
+}
+
+/* Negotiate the virtio-net header length with the device: write the
+ * length we want (PTNET_HDR_SIZE or 0, depending on the ptnet_vnet_hdr
+ * tunable), then read back the length actually in use and mirror it
+ * into the softc and the passed-through netmap adapter. */
+static void
+ptnet_update_vnet_hdr(struct ptnet_softc *sc)
+{
+	unsigned int req_hdr_len;
+
+	req_hdr_len = ptnet_vnet_hdr ? PTNET_HDR_SIZE : 0;
+	bus_write_4(sc->iomem, PTNET_IO_VNET_HDR_LEN, req_hdr_len);
+	sc->vnet_hdr_len = bus_read_4(sc->iomem, PTNET_IO_VNET_HDR_LEN);
+	sc->ptna->hwup.up.virt_hdr_len = sc->vnet_hdr_len;
+}
+
+/*
+ * nm_register callback, shared by the passed-through netmap adapter
+ * (native == true) and by the adapter used by the network stack.
+ * backend_regifs counts how many interfaces are currently registered
+ * on the backend: REGIF/UNREGIF PTCTL commands are only issued for
+ * the first registration and the last unregistration. The order of
+ * the CSB sync relative to the PTCTL commands is significant.
+ * Returns 0 on success or an errno value.
+ */
+static int
+ptnet_nm_register(struct netmap_adapter *na, int onoff)
+{
+	/* device-specific */
+	if_t ifp = na->ifp;
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+	int native = (na == &sc->ptna->hwup.up);
+	struct ptnet_queue *pq;
+	enum txrx t;
+	int ret = 0;
+	int i;
+
+	if (!onoff) {
+		sc->ptna->backend_regifs--;
+	}
+
+	/* If this is the last netmap client, guest interrupt enable flags may
+	 * be in arbitrary state. Since these flags are going to be used also
+	 * by the netdevice driver, we have to make sure to start with
+	 * notifications enabled. Also, schedule NAPI to flush pending packets
+	 * in the RX rings, since we will not receive further interrupts
+	 * until these will be processed. */
+	if (native && !onoff && na->active_fds == 0) {
+		D("Exit netmap mode, re-enable interrupts");
+		for (i = 0; i < sc->num_rings; i++) {
+			pq = sc->queues + i;
+			pq->ptring->guest_need_kick = 1;
+		}
+	}
+
+	if (onoff) {
+		if (sc->ptna->backend_regifs == 0) {
+			/* Initialize notification enable fields in the CSB. */
+			for (i = 0; i < sc->num_rings; i++) {
+				pq = sc->queues + i;
+				pq->ptring->host_need_kick = 1;
+				/* RX queues start with guest notifications
+				 * on (unless polling is enabled); TX queues
+				 * start with them off. */
+				pq->ptring->guest_need_kick =
+					(!(ifp->if_capenable & IFCAP_POLLING)
+						&& i >= sc->num_tx_rings);
+			}
+
+			/* Set the virtio-net header length. */
+			ptnet_update_vnet_hdr(sc);
+
+			/* Make sure the host adapter passed through is ready
+			 * for txsync/rxsync. */
+			ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_REGIF);
+			if (ret) {
+				return ret;
+			}
+		}
+
+		/* Sync from CSB must be done after REGIF PTCTL. Skip this
+		 * step only if this is a netmap client and it is not the
+		 * first one. */
+		if ((!native && sc->ptna->backend_regifs == 0) ||
+				(native && na->active_fds == 0)) {
+			ptnet_sync_from_csb(sc, na);
+		}
+
+		/* If not native, don't call nm_set_native_flags, since we don't want
+		 * to replace if_transmit method, nor set NAF_NETMAP_ON */
+		if (native) {
+			/* Turn on every kring with a pending on-request
+			 * (note: <= includes the host rings). */
+			for_rx_tx(t) {
+				for (i = 0; i <= nma_get_nrings(na, t); i++) {
+					struct netmap_kring *kring = &NMR(na, t)[i];
+
+					if (nm_kring_pending_on(kring)) {
+						kring->nr_mode = NKR_NETMAP_ON;
+					}
+				}
+			}
+			nm_set_native_flags(na);
+		}
+
+	} else {
+		if (native) {
+			nm_clear_native_flags(na);
+			for_rx_tx(t) {
+				for (i = 0; i <= nma_get_nrings(na, t); i++) {
+					struct netmap_kring *kring = &NMR(na, t)[i];
+
+					if (nm_kring_pending_off(kring)) {
+						kring->nr_mode = NKR_NETMAP_OFF;
+					}
+				}
+			}
+		}
+
+		/* Sync from CSB must be done before UNREGIF PTCTL, on the last
+		 * netmap client. */
+		if (native && na->active_fds == 0) {
+			ptnet_sync_from_csb(sc, na);
+		}
+
+		if (sc->ptna->backend_regifs == 0) {
+			ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_UNREGIF);
+		}
+	}
+
+	if (onoff) {
+		sc->ptna->backend_regifs++;
+	}
+
+	return ret;
+}
+
+/* nm_txsync callback: run the guest-side txsync on the CSB, and kick
+ * the host only when it has requested a notification. */
+static int
+ptnet_nm_txsync(struct netmap_kring *kring, int flags)
+{
+	struct ptnet_softc *sc = if_getsoftc(kring->na->ifp);
+	struct ptnet_queue *pq = sc->queues + kring->ring_id;
+
+	if (netmap_pt_guest_txsync(pq->ptring, kring, flags)) {
+		ptnet_kick(pq);
+	}
+
+	return 0;
+}
+
+/* nm_rxsync callback: run the guest-side rxsync on the CSB, and kick
+ * the host only when it has requested a notification. */
+static int
+ptnet_nm_rxsync(struct netmap_kring *kring, int flags)
+{
+	struct ptnet_softc *sc = if_getsoftc(kring->na->ifp);
+	struct ptnet_queue *pq = sc->rxqueues + kring->ring_id;
+
+	if (netmap_pt_guest_rxsync(pq->ptring, kring, flags)) {
+		ptnet_kick(pq);
+	}
+
+	return 0;
+}
+
+/*
+ * TX interrupt handler. The interrupt is first offered to netmap via
+ * netmap_tx_irq(); if netmap consumes it (return != NM_IRQ_PASS)
+ * nothing else is done, otherwise the per-queue taskqueue is scheduled
+ * to drain pending transmissions.
+ */
+static void
+ptnet_tx_intr(void *opaque)
+{
+	struct ptnet_queue *pq = opaque;
+	struct ptnet_softc *sc = pq->sc;
+
+	DBG(device_printf(sc->dev, "Tx interrupt #%d\n", pq->kring_id));
+#ifdef PTNETMAP_STATS
+	pq->stats.intrs ++;
+#endif /* PTNETMAP_STATS */
+
+	if (netmap_tx_irq(sc->ifp, pq->kring_id) != NM_IRQ_PASS) {
+		return;
+	}
+
+	/* Schedule the taskqueue to process the pending transmission
+	 * requests. However, vtnet, if_em and if_igb just call
+	 * ptnet_transmit() here, at least when using MSI-X interrupts.
+	 * The if_em driver, instead, schedules the taskqueue when using
+	 * legacy interrupts. */
+	taskqueue_enqueue(pq->taskq, &pq->task);
+}
+
+/*
+ * RX interrupt handler. The interrupt is first offered to netmap via
+ * netmap_rx_irq(); if netmap consumes it (return != NM_IRQ_PASS)
+ * nothing else is done, otherwise receive processing is run directly
+ * in the interrupt context, bounded by PTNET_RX_BUDGET.
+ */
+static void
+ptnet_rx_intr(void *opaque)
+{
+	struct ptnet_queue *pq = opaque;
+	struct ptnet_softc *sc = pq->sc;
+	unsigned int unused;
+
+	DBG(device_printf(sc->dev, "Rx interrupt #%d\n", pq->kring_id));
+#ifdef PTNETMAP_STATS
+	pq->stats.intrs ++;
+#endif /* PTNETMAP_STATS */
+
+	if (netmap_rx_irq(sc->ifp, pq->kring_id, &unused) != NM_IRQ_PASS) {
+		return;
+	}
+
+	/* Like vtnet, if_igb and if_em drivers when using MSI-X interrupts,
+	 * receive-side processing is executed directly in the interrupt
+	 * service routine. Alternatively, we may schedule the taskqueue. */
+	ptnet_rx_eof(pq, PTNET_RX_BUDGET, true);
+}
+
+/* The following offloadings-related functions are taken from the vtnet
+ * driver, but the same functionality is required for the ptnet driver.
+ * As a temporary solution, I copied this code from vtnet and I started
+ * to generalize it (taking away driver-specific statistic accounting),
+ * making as little modifications as possible.
+ * In the future we need to share these functions between vtnet and ptnet.
+ */
+/*
+ * Parse the Ethernet and L3 headers of mbuf m to locate the L4 header.
+ * On success, *etype is the ethertype (after any outer VLAN tag),
+ * *proto the L4 protocol number (or the last IPv6 header found), and
+ * *start the byte offset of the L4 header within the packet.
+ * Returns 0 on success, EINVAL for unsupported ethertypes.
+ */
+static int
+ptnet_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
+{
+	struct ether_vlan_header *evh;
+	int offset;
+
+	evh = mtod(m, struct ether_vlan_header *);
+	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		/* BMV: We should handle nested VLAN tags too. */
+		*etype = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else {
+		*etype = ntohs(evh->evl_encap_proto);
+		offset = sizeof(struct ether_header);
+	}
+
+	switch (*etype) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip, iphdr;
+		/* The IP header may not be contiguous in the first mbuf;
+		 * copy it out to a stack buffer in that case. */
+		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
+			m_copydata(m, offset, sizeof(struct ip),
+			    (caddr_t) &iphdr);
+			ip = &iphdr;
+		} else
+			ip = (struct ip *)(m->m_data + offset);
+		*proto = ip->ip_p;
+		*start = offset + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		*proto = -1;
+		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
+		/* Assert the network stack sent us a valid packet. */
+		KASSERT(*start > offset,
+		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
+		    *start, offset, *proto));
+		break;
+#endif
+	default:
+		/* Here we should increment the tx_csum_bad_ethtype counter. */
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Fill in the TSO fields (hdr_len, gso_size, gso_type) of the
+ * virtio-net header for a TCP segment whose TCP header starts at
+ * `offset'. If the segment carries TH_CWR and ECN was not negotiated
+ * with the host (allow_ecn == false), the packet is rejected with
+ * ENOTSUP and a rate-limited warning is printed.
+ * Returns 0 on success.
+ */
+static int
+ptnet_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
+		     int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
+{
+	/* Shared rate-limit state for the ECN warning below. */
+	static struct timeval lastecn;
+	static int curecn;
+	struct tcphdr *tcp, tcphdr;
+
+	/* The TCP header may not be contiguous in the first mbuf;
+	 * copy it out to a stack buffer in that case. */
+	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
+		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
+		tcp = &tcphdr;
+	} else
+		tcp = (struct tcphdr *)(m->m_data + offset);
+
+	hdr->hdr_len = offset + (tcp->th_off << 2);
+	hdr->gso_size = m->m_pkthdr.tso_segsz;
+	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
+	    VIRTIO_NET_HDR_GSO_TCPV6;
+
+	if (tcp->th_flags & TH_CWR) {
+		/*
+		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
+		 * ECN support is not on a per-interface basis, but globally via
+		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
+		 */
+		if (!allow_ecn) {
+			if (ppsratecheck(&lastecn, &curecn, 1))
+				if_printf(ifp,
+				    "TSO with ECN not negotiated with host\n");
+			return (ENOTSUP);
+		}
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+	}
+
+	/* Here we should increment tx_tso counter. */
+
+	return (0);
+}
+
+/*
+ * Fill in the virtio-net header `hdr' for an outgoing packet,
+ * according to the mbuf's csum_flags (checksum offload and/or TSO).
+ * Returns the (possibly unchanged) mbuf on success; on failure the
+ * mbuf is freed and NULL is returned.
+ */
+static struct mbuf *
+ptnet_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
+		 struct virtio_net_hdr *hdr)
+{
+	int flags, etype, csum_start, proto, error;
+
+	flags = m->m_pkthdr.csum_flags;
+
+	error = ptnet_tx_offload_ctx(m, &etype, &proto, &csum_start);
+	if (error)
+		goto drop;
+
+	if ((etype == ETHERTYPE_IP && flags & PTNET_CSUM_OFFLOAD) ||
+	    (etype == ETHERTYPE_IPV6 && flags & PTNET_CSUM_OFFLOAD_IPV6)) {
+		/*
+		 * We could compare the IP protocol vs the CSUM_ flag too,
+		 * but that really should not be necessary.
+		 */
+		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		hdr->csum_start = csum_start;
+		hdr->csum_offset = m->m_pkthdr.csum_data;
+		/* Here we should increment the tx_csum counter. */
+	}
+
+	if (flags & CSUM_TSO) {
+		if (__predict_false(proto != IPPROTO_TCP)) {
+			/* Likely failed to correctly parse the mbuf.
+			 * Here we should increment the tx_tso_not_tcp
+			 * counter. */
+			goto drop;
+		}
+
+		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
+		    ("%s: mbuf %p TSO without checksum offload %#x",
+		    __func__, m, flags));
+
+		error = ptnet_tx_offload_tso(ifp, m, etype, csum_start,
+					     allow_ecn, hdr);
+		if (error)
+			goto drop;
+	}
+
+	return (m);
+
+drop:
+	m_freem(m);
+	return (NULL);
+}
+
+/*
+ * Move the 802.1Q VLAN id of a received frame into the mbuf packet
+ * header (M_VLANTAG) and strip the 4-byte VLAN encapsulation from the
+ * frame data, shifting the MAC addresses forward over it.
+ */
+static void
+ptnet_vlan_tag_remove(struct mbuf *m)
+{
+	struct ether_vlan_header *evh;
+
+	evh = mtod(m, struct ether_vlan_header *);
+	m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag);
+	m->m_flags |= M_VLANTAG;
+
+	/* Strip the 802.1Q header. */
+	bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN,
+	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
+	m_adj(m, ETHER_VLAN_ENCAP_LEN);
+}
+
+/*
+ * Use the checksum offset in the VirtIO header to set the
+ * correct CSUM_* flags.
+ */
+/*
+ * Set the mbuf CSUM_* receive flags from the checksum offsets carried
+ * in the virtio-net header (VIRTIO_NET_HDR_F_NEEDS_CSUM case).
+ * Returns 0 if the flags were set, 1 if the offsets look invalid or
+ * the ethertype is unsupported.
+ */
+static int
+ptnet_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
+			struct virtio_net_hdr *hdr)
+{
+#if defined(INET) || defined(INET6)
+	int offset = hdr->csum_start + hdr->csum_offset;
+#endif
+
+	/* Only do a basic sanity check on the offset. */
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP:
+		if (__predict_false(offset < ip_start + sizeof(struct ip)))
+			return (1);
+		break;
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
+			return (1);
+		break;
+#endif
+	default:
+		/* Here we should increment the rx_csum_bad_ethtype counter. */
+		return (1);
+	}
+
+	/*
+	 * Use the offset to determine the appropriate CSUM_* flags. This is
+	 * a bit dirty, but we can get by with it since the checksum offsets
+	 * happen to be different. We assume the host host does not do IPv4
+	 * header checksum offloading.
+	 */
+	switch (hdr->csum_offset) {
+	case offsetof(struct udphdr, uh_sum):
+	case offsetof(struct tcphdr, th_sum):
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case offsetof(struct sctphdr, checksum):
+		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+		break;
+	default:
+		/* Here we should increment the rx_csum_bad_offset counter. */
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Set the mbuf CSUM_* receive flags by parsing the packet headers to
+ * find the L4 protocol (used when the virtio-net header does not carry
+ * VIRTIO_NET_HDR_F_NEEDS_CSUM). Returns 0 if the flags were set (or
+ * the protocol is one FreeBSD cannot offload, in which case the stack
+ * recomputes the checksum), 1 if the headers could not be parsed.
+ */
+static int
+ptnet_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
+		       struct virtio_net_hdr *hdr)
+{
+	int offset, proto;
+
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip;
+		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
+			return (1);
+		ip = (struct ip *)(m->m_data + ip_start);
+		proto = ip->ip_p;
+		offset = ip_start + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(m->m_len < ip_start +
+		    sizeof(struct ip6_hdr)))
+			return (1);
+		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
+		if (__predict_false(offset < 0))
+			return (1);
+		break;
+#endif
+	default:
+		/* Here we should increment the rx_csum_bad_ethtype counter. */
+		return (1);
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_UDP:
+		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_SCTP:
+		if (__predict_false(m->m_len < offset + sizeof(struct sctphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+		break;
+	default:
+		/*
+		 * For the remaining protocols, FreeBSD does not support
+		 * checksum offloading, so the checksum will be recomputed.
+		 */
+#if 0
+		if_printf(ifp, "cksum offload of unsupported "
+		    "protocol eth_type=%#x proto=%d csum_start=%d "
+		    "csum_offset=%d\n", __func__, eth_type, proto,
+		    hdr->csum_start, hdr->csum_offset);
+#endif
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Set the appropriate CSUM_* flags. Unfortunately, the information
+ * provided is not directly useful to us. The VirtIO header gives the
+ * offset of the checksum, which is all Linux needs, but this is not
+ * how FreeBSD does things. We are forced to peek inside the packet
+ * a bit.
+ *
+ * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
+ * could accept the offsets and let the stack figure it out.
+ */
+static int
+ptnet_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct ether_header *eh;
+	uint16_t etype;
+	int l2len;
+
+	/* Find the L3 ethertype, looking through a possible outer
+	 * 802.1Q tag. BMV: We should handle nested VLAN tags too. */
+	eh = mtod(m, struct ether_header *);
+	etype = ntohs(eh->ether_type);
+	if (etype == ETHERTYPE_VLAN) {
+		struct ether_vlan_header *evh;
+
+		evh = mtod(m, struct ether_vlan_header *);
+		etype = ntohs(evh->evl_proto);
+		l2len = sizeof(struct ether_vlan_header);
+	} else {
+		l2len = sizeof(struct ether_header);
+	}
+
+	/* Prefer the checksum offsets supplied by the host; fall back
+	 * to parsing the packet when they are not provided. */
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+		return (ptnet_rx_csum_by_offset(m, etype, l2len, hdr));
+
+	return (ptnet_rx_csum_by_parse(m, etype, l2len, hdr));
+}
+/* End of offloading-related functions to be shared with vtnet. */
+
+/*
+ * Refresh the kring's view of hwcur/hwtail from the CSB shared with
+ * the host, and publish the new tail to the userspace-visible ring.
+ */
+static inline void
+ptnet_sync_tail(struct ptnet_ring *ptring, struct netmap_kring *kring)
+{
+	struct netmap_ring *ring = kring->ring;
+
+	/* Update hwcur and hwtail as known by the host. */
+        ptnetmap_guest_read_kring_csb(ptring, kring);
+
+	/* nm_sync_finalize */
+	ring->tail = kring->rtail = kring->nr_hwtail;
+}
+
+/*
+ * Publish a new head/cur position to the host through the CSB, after
+ * updating the netmap ring and the kring views, and kick the host if
+ * it asked to be notified.
+ */
+static void
+ptnet_ring_update(struct ptnet_queue *pq, struct netmap_kring *kring,
+		  unsigned int head, unsigned int sync_flags)
+{
+	struct netmap_ring *ring = kring->ring;
+	struct ptnet_ring *ptring = pq->ptring;
+
+	/* Some packets have been pushed to the netmap ring. We have
+	 * to tell the host to process the new packets, updating cur
+	 * and head in the CSB. */
+	ring->head = ring->cur = head;
+
+	/* Mimic nm_txsync_prologue/nm_rxsync_prologue. */
+	kring->rcur = kring->rhead = head;
+
+	ptnetmap_guest_write_kring_csb(ptring, kring->rcur, kring->rhead);
+
+	/* Kick the host if needed. */
+	if (NM_ACCESS_ONCE(ptring->host_need_kick)) {
+		ptring->sync_flags = sync_flags;
+		ptnet_kick(pq);
+	}
+}
+
+/* True iff fewer than _min free TX slots are available between head _h
+ * and the known ring tail of kring _k. Note that the arguments are
+ * evaluated more than once: pass side-effect-free expressions only. */
+#define PTNET_TX_NOSPACE(_h, _k, _min)	\
+	((((_h) < (_k)->rtail) ? 0 : (_k)->nkr_num_slots) + \
+		(_k)->rtail - (_h)) < (_min)
+
+/* Drain the software bufring of TX queue pq, copying at most `budget'
+ * packets into the netmap TX ring (prepending a virtio-net header when
+ * negotiated) and notifying the host in batches of PTNET_TX_BATCH.
+ * Returns the number of packets processed; 0 if the queue lock was
+ * contended (work deferred to the taskqueue when may_resched is true);
+ * ENETDOWN if the interface is not running.
+ * This function may be called by the network stack, or by
+ * by the taskqueue thread. */
+static int
+ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget,
+			   bool may_resched)
+{
+	struct ptnet_softc *sc = pq->sc;
+	bool have_vnet_hdr = sc->vnet_hdr_len;
+	struct netmap_adapter *na = &sc->ptna->dr.up;
+	if_t ifp = sc->ifp;
+	unsigned int batch_count = 0;
+	struct ptnet_ring *ptring;
+	struct netmap_kring *kring;
+	struct netmap_ring *ring;
+	struct netmap_slot *slot;
+	unsigned int count = 0;
+	unsigned int minspace;
+	unsigned int head;
+	unsigned int lim;
+	struct mbuf *mhead;
+	struct mbuf *mf;
+	int nmbuf_bytes;
+	uint8_t *nmbuf;
+
+	if (!PTNET_Q_TRYLOCK(pq)) {
+		/* We failed to acquire the lock, schedule the taskqueue. */
+		RD(1, "Deferring TX work");
+		if (may_resched) {
+			taskqueue_enqueue(pq->taskq, &pq->task);
+		}
+
+		return 0;
+	}
+
+	if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
+		PTNET_Q_UNLOCK(pq);
+		RD(1, "Interface is down");
+		return ENETDOWN;
+	}
+
+	ptring = pq->ptring;
+	kring = na->tx_rings + pq->kring_id;
+	ring = kring->ring;
+	lim = kring->nkr_num_slots - 1;
+	head = ring->head;
+	minspace = sc->min_tx_space;
+
+	while (count < budget) {
+		if (PTNET_TX_NOSPACE(head, kring, minspace)) {
+			/* We ran out of slot, let's see if the host has
+			 * freed up some, by reading hwcur and hwtail from
+			 * the CSB. */
+			ptnet_sync_tail(ptring, kring);
+
+			if (PTNET_TX_NOSPACE(head, kring, minspace)) {
+				/* Still no slots available. Reactivate the
+				 * interrupts so that we can be notified
+				 * when some free slots are made available by
+				 * the host. */
+				ptring->guest_need_kick = 1;
+
+				/* Double-check: the tail must be re-read
+				 * after enabling notifications, to close
+				 * the race with the host freeing slots
+				 * right before the enable was visible. */
+				ptnet_sync_tail(ptring, kring);
+				if (likely(PTNET_TX_NOSPACE(head, kring,
+							    minspace))) {
+					break;
+				}
+
+				RD(1, "Found more slots by doublecheck");
+				/* More slots were freed before reactivating
+				 * the interrupts. */
+				ptring->guest_need_kick = 0;
+			}
+		}
+
+		mhead = drbr_peek(ifp, pq->bufring);
+		if (!mhead) {
+			break;
+		}
+
+		/* Initialize transmission state variables. */
+		slot = ring->slot + head;
+		nmbuf = NMB(na, slot);
+		nmbuf_bytes = 0;
+
+		/* If needed, prepare the virtio-net header at the beginning
+		 * of the first slot. */
+		if (have_vnet_hdr) {
+			struct virtio_net_hdr *vh =
+					(struct virtio_net_hdr *)nmbuf;
+
+			/* For performance, we could replace this memset() with
+			 * two 8-bytes-wide writes. */
+			memset(nmbuf, 0, PTNET_HDR_SIZE);
+			if (mhead->m_pkthdr.csum_flags & PTNET_ALL_OFFLOAD) {
+				mhead = ptnet_tx_offload(ifp, mhead, false,
+							 vh);
+				if (unlikely(!mhead)) {
+					/* Packet dropped because errors
+					 * occurred while preparing the vnet
+					 * header. Let's go ahead with the next
+					 * packet. */
+					pq->stats.errors ++;
+					drbr_advance(ifp, pq->bufring);
+					continue;
+				}
+			}
+			ND(1, "%s: [csum_flags %lX] vnet hdr: flags %x "
+			      "csum_start %u csum_ofs %u hdr_len = %u "
+			      "gso_size %u gso_type %x", __func__,
+			      mhead->m_pkthdr.csum_flags, vh->flags,
+			      vh->csum_start, vh->csum_offset, vh->hdr_len,
+			      vh->gso_size, vh->gso_type);
+
+			nmbuf += PTNET_HDR_SIZE;
+			nmbuf_bytes += PTNET_HDR_SIZE;
+		}
+
+		/* Copy the mbuf chain into netmap slots, spilling into
+		 * further slots (marked NS_MOREFRAG) when a slot fills. */
+		for (mf = mhead; mf; mf = mf->m_next) {
+			uint8_t *mdata = mf->m_data;
+			int mlen = mf->m_len;
+
+			for (;;) {
+				int copy = NETMAP_BUF_SIZE(na) - nmbuf_bytes;
+
+				if (mlen < copy) {
+					copy = mlen;
+				}
+				memcpy(nmbuf, mdata, copy);
+
+				mdata += copy;
+				mlen -= copy;
+				nmbuf += copy;
+				nmbuf_bytes += copy;
+
+				if (!mlen) {
+					break;
+				}
+
+				slot->len = nmbuf_bytes;
+				slot->flags = NS_MOREFRAG;
+
+				head = nm_next(head, lim);
+				KASSERT(head != ring->tail,
+					("Unexpectedly run out of TX space"));
+				slot = ring->slot + head;
+				nmbuf = NMB(na, slot);
+				nmbuf_bytes = 0;
+			}
+		}
+
+		/* Complete last slot and update head. */
+		slot->len = nmbuf_bytes;
+		slot->flags = 0;
+		head = nm_next(head, lim);
+
+		/* Consume the packet just processed. */
+		drbr_advance(ifp, pq->bufring);
+
+		/* Copy the packet to listeners. */
+		ETHER_BPF_MTAP(ifp, mhead);
+
+		pq->stats.packets ++;
+		pq->stats.bytes += mhead->m_pkthdr.len;
+		if (mhead->m_flags & M_MCAST) {
+			pq->stats.mcasts ++;
+		}
+
+		m_freem(mhead);
+
+		count ++;
+		if (++batch_count == PTNET_TX_BATCH) {
+			/* Publish a batch of packets to the host. */
+			ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM);
+			batch_count = 0;
+		}
+	}
+
+	if (batch_count) {
+		/* Publish the final, partial batch. */
+		ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM);
+	}
+
+	if (count >= budget && may_resched) {
+		DBG(RD(1, "out of budget: resched, %d mbufs pending\n",
+					drbr_inuse(ifp, pq->bufring)));
+		taskqueue_enqueue(pq->taskq, &pq->task);
+	}
+
+	PTNET_Q_UNLOCK(pq);
+
+	return count;
+}
+
+/*
+ * if_transmit method: enqueue the mbuf on the bufring of the TX queue
+ * selected by the flow id (or by the current CPU when no flow id is
+ * available) and, unless polling is enabled, immediately try to drain
+ * that queue. Returns 0 on success or an errno value (e.g. ENOBUFS
+ * when the bufring is full).
+ */
+static int
+ptnet_transmit(if_t ifp, struct mbuf *m)
+{
+	struct ptnet_softc *sc = if_getsoftc(ifp);
+	struct ptnet_queue *pq;
+	unsigned int queue_idx;
+	int err;
+
+	DBG(device_printf(sc->dev, "transmit %p\n", m));
+
+	/* Insert 802.1Q header if needed. */
+	if (m->m_flags & M_VLANTAG) {
+		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
+		if (m == NULL) {
+			return ENOBUFS;
+		}
+		m->m_flags &= ~M_VLANTAG;
+	}
+
+	/* Get the flow-id if available. */
+	queue_idx = (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) ?
+		    m->m_pkthdr.flowid : curcpu;
+
+	if (unlikely(queue_idx >= sc->num_tx_rings)) {
+		queue_idx %= sc->num_tx_rings;
+	}
+
+	pq = sc->queues + queue_idx;
+
+	err = drbr_enqueue(ifp, pq->bufring, m);
+	if (err) {
+		/* ENOBUFS when the bufring is full */
+		RD(1, "%s: drbr_enqueue() failed %d\n",
+			__func__, err);
+		pq->stats.errors ++;
+		return err;
+	}
+
+	if (ifp->if_capenable & IFCAP_POLLING) {
+		/* If polling is on, the transmit queues will be
+		 * drained by the poller. */
+		return 0;
+	}
+
+	err = ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true);
+
+	return (err < 0) ? err : 0;
+}
+
+/*
+ * Skip all the slots of the (possibly multi-fragment) packet starting
+ * at `head', following the NS_MOREFRAG chain until the last fragment
+ * or the ring tail. Returns the index of the first slot past the
+ * discarded packet.
+ */
+static unsigned int
+ptnet_rx_discard(struct netmap_kring *kring, unsigned int head)
+{
+	struct netmap_ring *ring = kring->ring;
+	struct netmap_slot *slot = ring->slot + head;
+
+	for (;;) {
+		head = nm_next(head, kring->nkr_num_slots - 1);
+		if (!(slot->flags & NS_MOREFRAG) || head == ring->tail) {
+			break;
+		}
+		slot = ring->slot + head;
+	}
+
+	return head;
+}
+
+/*
+ * Append nmbuf_len bytes from netmap buffer `nmbuf' to the mbuf chain
+ * whose current tail is `mtail', allocating additional MCLBYTES
+ * clusters as the tail fills up. Returns the new chain tail, or NULL
+ * if a cluster allocation failed (the chain itself is left intact).
+ */
+static inline struct mbuf *
+ptnet_rx_slot(struct mbuf *mtail, uint8_t *nmbuf, unsigned int nmbuf_len)
+{
+	uint8_t *mdata = mtod(mtail, uint8_t *) + mtail->m_len;
+
+	do {
+		unsigned int copy;
+
+		/* Current cluster is full: chain a fresh one. */
+		if (mtail->m_len == MCLBYTES) {
+			struct mbuf *mf;
+
+			mf = m_getcl(M_NOWAIT, MT_DATA, 0);
+			if (unlikely(!mf)) {
+				return NULL;
+			}
+
+			mtail->m_next = mf;
+			mtail = mf;
+			mdata = mtod(mtail, uint8_t *);
+			mtail->m_len = 0;
+		}
+
+		copy = MCLBYTES - mtail->m_len;
+		if (nmbuf_len < copy) {
+			copy = nmbuf_len;
+		}
+
+		memcpy(mdata, nmbuf, copy);
+
+		nmbuf += copy;
+		nmbuf_len -= copy;
+		mdata += copy;
+		mtail->m_len += copy;
+	} while (nmbuf_len);
+
+	return mtail;
+}
+
+static int
+ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget, bool may_resched)
+{
+ struct ptnet_softc *sc = pq->sc;
+ bool have_vnet_hdr = sc->vnet_hdr_len;
+ struct ptnet_ring *ptring = pq->ptring;
+ struct netmap_adapter *na = &sc->ptna->dr.up;
+ struct netmap_kring *kring = na->rx_rings + pq->kring_id;
+ struct netmap_ring *ring = kring->ring;
+ unsigned int const lim = kring->nkr_num_slots - 1;
+ unsigned int head = ring->head;
+ unsigned int batch_count = 0;
+ if_t ifp = sc->ifp;
+ unsigned int count = 0;
+
+ PTNET_Q_LOCK(pq);
+
+ if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
+ goto unlock;
+ }
+
+ kring->nr_kflags &= ~NKR_PENDINTR;
+
+ while (count < budget) {
+ unsigned int prev_head = head;
+ struct mbuf *mhead, *mtail;
+ struct virtio_net_hdr *vh;
+ struct netmap_slot *slot;
+ unsigned int nmbuf_len;
+ uint8_t *nmbuf;
+host_sync:
+ if (head == ring->tail) {
+ /* We ran out of slot, let's see if the host has
+ * added some, by reading hwcur and hwtail from
+ * the CSB. */
+ ptnet_sync_tail(ptring, kring);
+
+ if (head == ring->tail) {
+ /* Still no slots available. Reactivate
+ * interrupts as they were disabled by the
+ * host thread right before issuing the
+ * last interrupt. */
+ ptring->guest_need_kick = 1;
+
+ /* Double-check. */
+ ptnet_sync_tail(ptring, kring);
+ if (likely(head == ring->tail)) {
+ break;
+ }
+ ptring->guest_need_kick = 0;
+ }
+ }
+
+ /* Initialize ring state variables, possibly grabbing the
+ * virtio-net header. */
+ slot = ring->slot + head;
+ nmbuf = NMB(na, slot);
+ nmbuf_len = slot->len;
+
+ vh = (struct virtio_net_hdr *)nmbuf;
+ if (have_vnet_hdr) {
+ if (unlikely(nmbuf_len < PTNET_HDR_SIZE)) {
+ /* There is no good reason why host should
+ * put the header in multiple netmap slots.
+ * If this is the case, discard. */
+ RD(1, "Fragmented vnet-hdr: dropping");
+ head = ptnet_rx_discard(kring, head);
+ pq->stats.iqdrops ++;
+ goto skip;
+ }
+ ND(1, "%s: vnet hdr: flags %x csum_start %u "
+ "csum_ofs %u hdr_len = %u gso_size %u "
+ "gso_type %x", __func__, vh->flags,
+ vh->csum_start, vh->csum_offset, vh->hdr_len,
+ vh->gso_size, vh->gso_type);
+ nmbuf += PTNET_HDR_SIZE;
+ nmbuf_len -= PTNET_HDR_SIZE;
+ }
+
+ /* Allocate the head of a new mbuf chain.
+ * We use m_getcl() to allocate an mbuf with standard cluster
+ * size (MCLBYTES). In the future we could use m_getjcl()
+ * to choose different sizes. */
+ mhead = mtail = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (unlikely(mhead == NULL)) {
+ device_printf(sc->dev, "%s: failed to allocate mbuf "
+ "head\n", __func__);
+ pq->stats.errors ++;
+ break;
+ }
+
+ /* Initialize the mbuf state variables. */
+ mhead->m_pkthdr.len = nmbuf_len;
+ mtail->m_len = 0;
+
+ /* Scan all the netmap slots containing the current packet. */
+ for (;;) {
+ DBG(device_printf(sc->dev, "%s: h %u t %u rcv frag "
+ "len %u, flags %u\n", __func__,
+ head, ring->tail, slot->len,
+ slot->flags));
+
+ mtail = ptnet_rx_slot(mtail, nmbuf, nmbuf_len);
+ if (unlikely(!mtail)) {
+ /* Ouch. We ran out of memory while processing
+ * a packet. We have to restore the previous
+ * head position, free the mbuf chain, and
+ * schedule the taskqueue to give the packet
+ * another chance. */
+ device_printf(sc->dev, "%s: failed to allocate"
+ " mbuf frag, reset head %u --> %u\n",
+ __func__, head, prev_head);
+ head = prev_head;
+ m_freem(mhead);
+ pq->stats.errors ++;
+ if (may_resched) {
+ taskqueue_enqueue(pq->taskq,
+ &pq->task);
+ }
+ goto escape;
+ }
+
+ /* We have to increment head irrespective of the
+ * NS_MOREFRAG being set or not. */
+ head = nm_next(head, lim);
+
+ if (!(slot->flags & NS_MOREFRAG)) {
+ break;
+ }
+
+ if (unlikely(head == ring->tail)) {
+ /* The very last slot prepared by the host has
+ * the NS_MOREFRAG set. Drop it and continue
+ * the outer cycle (to do the double-check). */
+ RD(1, "Incomplete packet: dropping");
+ m_freem(mhead);
+ pq->stats.iqdrops ++;
+ goto host_sync;
+ }
+
+ slot = ring->slot + head;
+ nmbuf = NMB(na, slot);
+ nmbuf_len = slot->len;
+ mhead->m_pkthdr.len += nmbuf_len;
+ }
+
+ mhead->m_pkthdr.rcvif = ifp;
+ mhead->m_pkthdr.csum_flags = 0;
+
+ /* Store the queue idx in the packet header. */
+ mhead->m_pkthdr.flowid = pq->kring_id;
+ M_HASHTYPE_SET(mhead, M_HASHTYPE_OPAQUE);
+
+ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) {
+ struct ether_header *eh;
+
+ eh = mtod(mhead, struct ether_header *);
+ if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
+ ptnet_vlan_tag_remove(mhead);
+ /*
+ * With the 802.1Q header removed, update the
+ * checksum starting location accordingly.
+ */
+ if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+ vh->csum_start -= ETHER_VLAN_ENCAP_LEN;
+ }
+ }
+
+ if (have_vnet_hdr && (vh->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM
+ | VIRTIO_NET_HDR_F_DATA_VALID))) {
+ if (unlikely(ptnet_rx_csum(mhead, vh))) {
+ m_freem(mhead);
+ RD(1, "Csum offload error: dropping");
+ pq->stats.iqdrops ++;
+ goto skip;
+ }
+ }
+
+ pq->stats.packets ++;
+ pq->stats.bytes += mhead->m_pkthdr.len;
+
+ PTNET_Q_UNLOCK(pq);
+ (*ifp->if_input)(ifp, mhead);
+ PTNET_Q_LOCK(pq);
+
+ if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
+ /* The interface has gone down while we didn't
+ * have the lock. Stop any processing and exit. */
+ goto unlock;
+ }
+skip:
+ count ++;
+ if (++batch_count == PTNET_RX_BATCH) {
+ /* Some packets have been pushed to the network stack.
+ * We need to update the CSB to tell the host about the new
+ * ring->cur and ring->head (RX buffer refill). */
+ ptnet_ring_update(pq, kring, head, NAF_FORCE_READ);
+ batch_count = 0;
+ }
+ }
+escape:
+ if (batch_count) {
+ ptnet_ring_update(pq, kring, head, NAF_FORCE_READ);
+
+ }
+
+ if (count >= budget && may_resched) {
+ /* If we ran out of budget or the double-check found new
+ * slots to process, schedule the taskqueue. */
+ DBG(RD(1, "out of budget: resched h %u t %u\n",
+ head, ring->tail));
+ taskqueue_enqueue(pq->taskq, &pq->task);
+ }
+unlock:
+ PTNET_Q_UNLOCK(pq);
+
+ return count;
+}
+
+static void
+ptnet_rx_task(void *context, int pending)
+{
+ struct ptnet_queue *pq = context;
+
+ DBG(RD(1, "%s: pq #%u\n", __func__, pq->kring_id));
+ ptnet_rx_eof(pq, PTNET_RX_BUDGET, true);
+}
+
+static void
+ptnet_tx_task(void *context, int pending)
+{
+ struct ptnet_queue *pq = context;
+
+ DBG(RD(1, "%s: pq #%u\n", __func__, pq->kring_id));
+ ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true);
+}
+
+#ifdef DEVICE_POLLING
+/* We don't need to handle POLL_AND_CHECK_STATUS and POLL_ONLY
+ * differently, since we don't have an Interrupt Status Register. */
+static int
+ptnet_poll(if_t ifp, enum poll_cmd cmd, int budget)
+{
+ struct ptnet_softc *sc = if_getsoftc(ifp);
+ unsigned int queue_budget;
+ unsigned int count = 0;
+ bool borrow = false;
+ int i;
+
+ KASSERT(sc->num_rings > 0, ("Found no queues in while polling ptnet"));
+ queue_budget = MAX(budget / sc->num_rings, 1);
+ RD(1, "Per-queue budget is %d", queue_budget);
+
+ while (budget) {
+ unsigned int rcnt = 0;
+
+ for (i = 0; i < sc->num_rings; i++) {
+ struct ptnet_queue *pq = sc->queues + i;
+
+ if (borrow) {
+ queue_budget = MIN(queue_budget, budget);
+ if (queue_budget == 0) {
+ break;
+ }
+ }
+
+ if (i < sc->num_tx_rings) {
+ rcnt += ptnet_drain_transmit_queue(pq,
+ queue_budget, false);
+ } else {
+ rcnt += ptnet_rx_eof(pq, queue_budget,
+ false);
+ }
+ }
+
+ if (!rcnt) {
+ /* A scan of the queues gave no result, we can
+ * stop here. */
+ break;
+ }
+
+ if (rcnt > budget) {
+ /* This may happen when initial budget < sc->num_rings,
+ * since one packet budget is given to each queue
+ * anyway. Just pretend we didn't eat "so much". */
+ rcnt = budget;
+ }
+ count += rcnt;
+ budget -= rcnt;
+ borrow = true;
+ }
+
+
+ return count;
+}
+#endif /* DEVICE_POLLING */
diff --git a/sys/dev/netmap/if_vtnet_netmap.h b/sys/dev/netmap/if_vtnet_netmap.h
index 791cee56bcee..4bed0e718dd4 100644
--- a/sys/dev/netmap/if_vtnet_netmap.h
+++ b/sys/dev/netmap/if_vtnet_netmap.h
@@ -127,7 +127,7 @@ vtnet_netmap_txsync(struct netmap_kring *kring, int flags)
* First part: process new packets to send.
*/
rmb();
-
+
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
struct sglist *sg = txq->vtntx_sg;
@@ -182,7 +182,7 @@ vtnet_netmap_txsync(struct netmap_kring *kring, int flags)
virtqueue_enable_intr(vq); // like postpone with 0
}
-
+
/* Free used slots. We only consider our own used buffers, recognized
* by the token we passed to virtqueue_add_outbuf.
*/
diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h
index 0f34e7218503..7986c9965173 100644
--- a/sys/dev/netmap/ixgbe_netmap.h
+++ b/sys/dev/netmap/ixgbe_netmap.h
@@ -53,7 +53,7 @@ void ixgbe_netmap_attach(struct adapter *adapter);
/*
* device-specific sysctl variables:
*
- * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
+ * ix_crcstrip: 0: NIC keeps CRC in rx frames (default), 1: NIC strips it.
* During regular operations the CRC is stripped, but on some
* hardware reception of frames not multiple of 64 is slower,
* so using crcstrip=0 helps in benchmarks.
@@ -65,7 +65,7 @@ SYSCTL_DECL(_dev_netmap);
static int ix_rx_miss, ix_rx_miss_bufs;
int ix_crcstrip;
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip,
- CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames");
+ CTLFLAG_RW, &ix_crcstrip, 0, "NIC strips CRC on rx frames");
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss,
CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr");
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs,
@@ -109,6 +109,20 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff)
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc);
}
+static void
+ixgbe_netmap_intr(struct netmap_adapter *na, int onoff)
+{
+ struct ifnet *ifp = na->ifp;
+ struct adapter *adapter = ifp->if_softc;
+
+ IXGBE_CORE_LOCK(adapter);
+ if (onoff) {
+ ixgbe_enable_intr(adapter); // XXX maybe ixgbe_stop ?
+ } else {
+ ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ?
+ }
+ IXGBE_CORE_UNLOCK(adapter);
+}
/*
* Register/unregister. We are already under netmap lock.
@@ -311,7 +325,7 @@ ixgbe_netmap_txsync(struct netmap_kring *kring, int flags)
* good way.
*/
nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_IS_VF(adapter) ?
- IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id));
+ IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -486,6 +500,7 @@ ixgbe_netmap_attach(struct adapter *adapter)
na.nm_rxsync = ixgbe_netmap_rxsync;
na.nm_register = ixgbe_netmap_reg;
na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
+ na.nm_intr = ixgbe_netmap_intr;
netmap_attach(&na);
}
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index aff757bdadfe..46aca2eab5e2 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -1,5 +1,9 @@
/*
- * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi
+ * Copyright (C) 2011-2016 Luigi Rizzo
+ * Copyright (C) 2011-2016 Giuseppe Lettieri
+ * Copyright (C) 2011-2016 Vincenzo Maffione
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -133,13 +137,12 @@ ports attached to the switch)
* > select()able file descriptor on which events are reported.
*
* Internally, we allocate a netmap_priv_d structure, that will be
- * initialized on ioctl(NIOCREGIF).
+ * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
+ * structure for each open().
*
* os-specific:
- * FreeBSD: netmap_open (netmap_freebsd.c). The priv is
- * per-thread.
- * linux: linux_netmap_open (netmap_linux.c). The priv is
- * per-open.
+ * FreeBSD: see netmap_open() (netmap_freebsd.c)
+ * linux: see linux_netmap_open() (netmap_linux.c)
*
* > 2. on each descriptor, the process issues an ioctl() to identify
* > the interface that should report events to the file descriptor.
@@ -299,18 +302,17 @@ ports attached to the switch)
* netmap_transmit()
* na->nm_notify == netmap_notify()
* 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
- * kring->nm_sync() == netmap_rxsync_from_host_compat
+ * kring->nm_sync() == netmap_rxsync_from_host
* netmap_rxsync_from_host(na, NULL, NULL)
* - tx to host stack
* ioctl(NIOCTXSYNC)/netmap_poll() in process context
- * kring->nm_sync() == netmap_txsync_to_host_compat
+ * kring->nm_sync() == netmap_txsync_to_host
* netmap_txsync_to_host(na)
- * NM_SEND_UP()
- * FreeBSD: na->if_input() == ?? XXX
+ * nm_os_send_up()
+ * FreeBSD: na->if_input() == ether_input()
* linux: netif_rx() with NM_MAGIC_PRIORITY_RX
*
*
- *
* -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
*
* na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
@@ -319,10 +321,11 @@ ports attached to the switch)
* concurrently:
* 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
* kring->nm_sync() == generic_netmap_txsync()
- * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
- * generic_ndo_start_xmit()
- * orig. dev. start_xmit
- * FreeBSD: na->if_transmit() == orig. dev if_transmit
+ * nm_os_generic_xmit_frame()
+ * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
+ * ifp->ndo_start_xmit == generic_ndo_start_xmit()
+ * gna->save_start_xmit == orig. dev. start_xmit
+ * FreeBSD: na->if_transmit() == orig. dev if_transmit
* 2) generic_mbuf_destructor()
* na->nm_notify() == netmap_notify()
* - rx from netmap userspace:
@@ -333,24 +336,15 @@ ports attached to the switch)
* generic_rx_handler()
* mbq_safe_enqueue()
* na->nm_notify() == netmap_notify()
- * - rx from host stack:
- * concurrently:
+ * - rx from host stack
+ * FreeBSD: same as native
+ * Linux: same as native except:
* 1) host stack
- * linux: generic_ndo_start_xmit()
- * netmap_transmit()
- * FreeBSD: ifp->if_input() == netmap_transmit
- * both:
- * na->nm_notify() == netmap_notify()
- * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
- * kring->nm_sync() == netmap_rxsync_from_host_compat
- * netmap_rxsync_from_host(na, NULL, NULL)
- * - tx to host stack:
- * ioctl(NIOCTXSYNC)/netmap_poll() in process context
- * kring->nm_sync() == netmap_txsync_to_host_compat
- * netmap_txsync_to_host(na)
- * NM_SEND_UP()
- * FreeBSD: na->if_input() == ??? XXX
- * linux: netif_rx() with NM_MAGIC_PRIORITY_RX
+ * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
+ * ifp->ndo_start_xmit == generic_ndo_start_xmit()
+ * netmap_transmit()
+ * na->nm_notify() == netmap_notify()
+ * - tx to host stack (same as native):
*
*
* -= VALE =-
@@ -371,7 +365,7 @@ ports attached to the switch)
* from host stack:
* netmap_transmit()
* na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
- * kring->nm_sync() == netmap_rxsync_from_host_compat()
+ * kring->nm_sync() == netmap_rxsync_from_host()
* netmap_vp_txsync()
*
* - system device with generic support:
@@ -384,7 +378,7 @@ ports attached to the switch)
* from host stack:
* netmap_transmit()
* na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
- * kring->nm_sync() == netmap_rxsync_from_host_compat()
+ * kring->nm_sync() == netmap_rxsync_from_host()
* netmap_vp_txsync()
*
* (all cases) --> nm_bdg_flush()
@@ -407,7 +401,7 @@ ports attached to the switch)
* netmap_vp_rxsync()
* to host stack:
* netmap_vp_rxsync()
- * kring->nm_sync() == netmap_txsync_to_host_compat
+ * kring->nm_sync() == netmap_txsync_to_host
* netmap_vp_rxsync_locked()
*
* - system device with generic adapter:
@@ -418,7 +412,7 @@ ports attached to the switch)
* netmap_vp_rxsync()
* to host stack:
* netmap_vp_rxsync()
- * kring->nm_sync() == netmap_txsync_to_host_compat
+ * kring->nm_sync() == netmap_txsync_to_host
* netmap_vp_rxsync()
*
*/
@@ -455,29 +449,19 @@ ports attached to the switch)
#include <sys/refcount.h>
-/* reduce conditional code */
-// linux API, use for the knlist in FreeBSD
-/* use a private mutex for the knlist */
-#define init_waitqueue_head(x) do { \
- struct mtx *m = &(x)->m; \
- mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); \
- knlist_init_mtx(&(x)->si.si_note, m); \
- } while (0)
-
-#define OS_selrecord(a, b) selrecord(a, &((b)->si))
-#define OS_selwakeup(a, b) freebsd_selwakeup(a, b)
-
#elif defined(linux)
#include "bsd_glue.h"
-
-
#elif defined(__APPLE__)
#warning OSX support is only partial
#include "osx_glue.h"
+#elif defined (_WIN32)
+
+#include "win_glue.h"
+
#else
#error Unsupported platform
@@ -492,47 +476,69 @@ ports attached to the switch)
#include <dev/netmap/netmap_mem2.h>
-MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
-
/* user-controlled variables */
int netmap_verbose;
static int netmap_no_timestamp; /* don't timestamp on rxsync */
-
-SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
-SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
- CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
-SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
- CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
int netmap_mitigate = 1;
-SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
int netmap_no_pendintr = 1;
-SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
- CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
int netmap_txsync_retry = 2;
-SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
- &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
-
-int netmap_adaptive_io = 0;
-SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
- &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
-
int netmap_flags = 0; /* debug flags */
-int netmap_fwd = 0; /* force transparent mode */
+static int netmap_fwd = 0; /* force transparent mode */
/*
* netmap_admode selects the netmap mode to use.
* Invalid values are reset to NETMAP_ADMODE_BEST
*/
-enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */
+enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */
NETMAP_ADMODE_NATIVE, /* either native or none */
NETMAP_ADMODE_GENERIC, /* force generic */
NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;
-int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */
-int netmap_generic_ringsize = 1024; /* Generic ringsize. */
-int netmap_generic_rings = 1; /* number of queues in generic. */
+/* netmap_generic_mit controls mitigation of RX notifications for
+ * the generic netmap adapter. The value is a time interval in
+ * nanoseconds. */
+int netmap_generic_mit = 100*1000;
+
+/* We use by default netmap-aware qdiscs with generic netmap adapters,
+ * even if there can be a little performance hit with hardware NICs.
+ * However, using the qdisc is the safer approach, for two reasons:
+ * 1) it prevents non-fifo qdiscs to break the TX notification
+ * scheme, which is based on mbuf destructors when txqdisc is
+ * not used.
+ * 2) it makes it possible to transmit over software devices that
+ * change skb->dev, like bridge, veth, ...
+ *
+ * Anyway users looking for the best performance should
+ * use native adapters.
+ */
+int netmap_generic_txqdisc = 1;
+
+/* Default number of slots and queues for generic adapters. */
+int netmap_generic_ringsize = 1024;
+int netmap_generic_rings = 1;
+
+/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
+int ptnet_vnet_hdr = 1;
+
+/*
+ * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
+ * in some other operating systems
+ */
+SYSBEGIN(main_init);
+
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
+SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
+ CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
+SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
+ CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
+SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
+ CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
+SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
+ &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
@@ -540,19 +546,24 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , "");
+SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , "");
+
+SYSEND;
NMG_LOCK_T netmap_global_lock;
-int netmap_use_count = 0; /* number of active netmap instances */
/*
* mark the ring as stopped, and run through the locks
* to make sure other users get to see it.
+ * stopped must be either NM_KR_STOPPED (for unbounded stop)
+ * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
*/
static void
-netmap_disable_ring(struct netmap_kring *kr)
+netmap_disable_ring(struct netmap_kring *kr, int stopped)
{
- kr->nkr_stopped = 1;
- nm_kr_get(kr);
+ nm_kr_stop(kr, stopped);
+ // XXX check if nm_kr_stop is sufficient
mtx_lock(&kr->q_lock);
mtx_unlock(&kr->q_lock);
nm_kr_put(kr);
@@ -563,7 +574,7 @@ void
netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
{
if (stopped)
- netmap_disable_ring(NMR(na, t) + ring_id);
+ netmap_disable_ring(NMR(na, t) + ring_id, stopped);
else
NMR(na, t)[ring_id].nkr_stopped = 0;
}
@@ -590,13 +601,14 @@ netmap_set_all_rings(struct netmap_adapter *na, int stopped)
* Convenience function used in drivers. Waits for current txsync()s/rxsync()s
* to finish and prevents any new one from starting. Call this before turning
* netmap mode off, or before removing the hardware rings (e.g., on module
- * onload). As a rule of thumb for linux drivers, this should be placed near
- * each napi_disable().
+ * unload).
*/
void
netmap_disable_all_rings(struct ifnet *ifp)
{
- netmap_set_all_rings(NA(ifp), 1 /* stopped */);
+ if (NM_NA_VALID(ifp)) {
+ netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
+ }
}
/*
@@ -607,9 +619,34 @@ netmap_disable_all_rings(struct ifnet *ifp)
void
netmap_enable_all_rings(struct ifnet *ifp)
{
- netmap_set_all_rings(NA(ifp), 0 /* enabled */);
+ if (NM_NA_VALID(ifp)) {
+ netmap_set_all_rings(NA(ifp), 0 /* enabled */);
+ }
+}
+
+void
+netmap_make_zombie(struct ifnet *ifp)
+{
+ if (NM_NA_VALID(ifp)) {
+ struct netmap_adapter *na = NA(ifp);
+ netmap_set_all_rings(na, NM_KR_LOCKED);
+ na->na_flags |= NAF_ZOMBIE;
+ netmap_set_all_rings(na, 0);
+ }
}
+void
+netmap_undo_zombie(struct ifnet *ifp)
+{
+ if (NM_NA_VALID(ifp)) {
+ struct netmap_adapter *na = NA(ifp);
+ if (na->na_flags & NAF_ZOMBIE) {
+ netmap_set_all_rings(na, NM_KR_LOCKED);
+ na->na_flags &= ~NAF_ZOMBIE;
+ netmap_set_all_rings(na, 0);
+ }
+ }
+}
/*
* generic bound_checking function
@@ -727,28 +764,9 @@ netmap_update_config(struct netmap_adapter *na)
return 1;
}
-static void netmap_txsync_to_host(struct netmap_adapter *na);
-static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
-
-/* kring->nm_sync callback for the host tx ring */
-static int
-netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
-{
- (void)flags; /* unused */
- netmap_txsync_to_host(kring->na);
- return 0;
-}
-
-/* kring->nm_sync callback for the host rx ring */
-static int
-netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
-{
- (void)flags; /* unused */
- netmap_rxsync_from_host(kring->na, NULL, NULL);
- return 0;
-}
-
-
+/* nm_sync callbacks for the host rings */
+static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
+static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
/* create the krings array and initialize the fields common to all adapters.
* The array layout is this:
@@ -809,12 +827,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
kring->ring_id = i;
kring->tx = t;
kring->nkr_num_slots = ndesc;
+ kring->nr_mode = NKR_NETMAP_OFF;
+ kring->nr_pending_mode = NKR_NETMAP_OFF;
if (i < nma_get_nrings(na, t)) {
kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
- } else if (i == na->num_tx_rings) {
+ } else {
kring->nm_sync = (t == NR_TX ?
- netmap_txsync_to_host_compat :
- netmap_rxsync_from_host_compat);
+ netmap_txsync_to_host:
+ netmap_rxsync_from_host);
}
kring->nm_notify = na->nm_notify;
kring->rhead = kring->rcur = kring->nr_hwcur = 0;
@@ -822,14 +842,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
* IMPORTANT: Always keep one slot empty.
*/
kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
- snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
+ snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
nm_txrx2str(t), i);
ND("ktx %s h %d c %d t %d",
kring->name, kring->rhead, kring->rcur, kring->rtail);
mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
- init_waitqueue_head(&kring->si);
+ nm_os_selinfo_init(&kring->si);
}
- init_waitqueue_head(&na->si[t]);
+ nm_os_selinfo_init(&na->si[t]);
}
na->tailroom = na->rx_rings + n[NR_RX];
@@ -838,19 +858,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
}
-#ifdef __FreeBSD__
-static void
-netmap_knlist_destroy(NM_SELINFO_T *si)
-{
- /* XXX kqueue(9) needed; these will mirror knlist_init. */
- knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
- knlist_destroy(&si->si.si_note);
- /* now we don't need the mutex anymore */
- mtx_destroy(&si->m);
-}
-#endif /* __FreeBSD__ */
-
-
/* undo the actions performed by netmap_krings_create */
/* call with NMG_LOCK held */
void
@@ -860,12 +867,12 @@ netmap_krings_delete(struct netmap_adapter *na)
enum txrx t;
for_rx_tx(t)
- netmap_knlist_destroy(&na->si[t]);
+ nm_os_selinfo_uninit(&na->si[t]);
/* we rely on the krings layout described above */
for ( ; kring != na->tailroom; kring++) {
mtx_destroy(&kring->q_lock);
- netmap_knlist_destroy(&kring->si);
+ nm_os_selinfo_uninit(&kring->si);
}
free(na->tx_rings, M_DEVBUF);
na->tx_rings = na->rx_rings = na->tailroom = NULL;
@@ -878,14 +885,14 @@ netmap_krings_delete(struct netmap_adapter *na)
* them first.
*/
/* call with NMG_LOCK held */
-static void
+void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
ND("destroy sw mbq with len %d", mbq_len(q));
mbq_purge(q);
- mbq_safe_destroy(q);
+ mbq_safe_fini(q);
netmap_krings_delete(na);
}
@@ -898,29 +905,38 @@ netmap_hw_krings_delete(struct netmap_adapter *na)
*/
/* call with NMG_LOCK held */
static void netmap_unset_ringid(struct netmap_priv_d *);
-static void netmap_rel_exclusive(struct netmap_priv_d *);
-static void
+static void netmap_krings_put(struct netmap_priv_d *);
+void
netmap_do_unregif(struct netmap_priv_d *priv)
{
struct netmap_adapter *na = priv->np_na;
NMG_LOCK_ASSERT();
na->active_fds--;
- /* release exclusive use if it was requested on regif */
- netmap_rel_exclusive(priv);
- if (na->active_fds <= 0) { /* last instance */
-
- if (netmap_verbose)
- D("deleting last instance for %s", na->name);
+ /* unset nr_pending_mode and possibly release exclusive mode */
+ netmap_krings_put(priv);
#ifdef WITH_MONITOR
+ /* XXX check whether we have to do something with monitor
+ * when rings change nr_mode. */
+ if (na->active_fds <= 0) {
/* walk through all the rings and tell any monitor
* that the port is going to exit netmap mode
*/
netmap_monitor_stop(na);
+ }
#endif
+
+ if (na->active_fds <= 0 || nm_kring_pending(priv)) {
+ na->nm_register(na, 0);
+ }
+
+ /* delete rings and buffers that are no longer needed */
+ netmap_mem_rings_delete(na);
+
+ if (na->active_fds <= 0) { /* last instance */
/*
- * (TO CHECK) This function is only called
+ * (TO CHECK) We enter here
* when the last reference to this file descriptor goes
* away. This means we cannot have any pending poll()
* or interrupt routine operating on the structure.
@@ -933,16 +949,16 @@ netmap_do_unregif(struct netmap_priv_d *priv)
* happens if the close() occurs while a concurrent
* syscall is running.
*/
- na->nm_register(na, 0); /* off, clear flags */
- /* Wake up any sleeping threads. netmap_poll will
- * then return POLLERR
- * XXX The wake up now must happen during *_down(), when
- * we order all activities to stop. -gl
- */
- /* delete rings and buffers */
- netmap_mem_rings_delete(na);
+ if (netmap_verbose)
+ D("deleting last instance for %s", na->name);
+
+ if (nm_netmap_on(na)) {
+ D("BUG: netmap on while going to delete the krings");
+ }
+
na->nm_krings_delete(na);
}
+
/* possibily decrement counter of tx_si/rx_si users */
netmap_unset_ringid(priv);
/* delete the nifp */
@@ -962,6 +978,20 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t)
(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
}
+struct netmap_priv_d*
+netmap_priv_new(void)
+{
+ struct netmap_priv_d *priv;
+
+ priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (priv == NULL)
+ return NULL;
+ priv->np_refs = 1;
+ nm_os_get_module();
+ return priv;
+}
+
/*
* Destructor of the netmap_priv_d, called when the fd is closed
* Action: undo all the things done by NIOCREGIF,
@@ -971,22 +1001,22 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t)
*
*/
/* call with NMG_LOCK held */
-int
-netmap_dtor_locked(struct netmap_priv_d *priv)
+void
+netmap_priv_delete(struct netmap_priv_d *priv)
{
struct netmap_adapter *na = priv->np_na;
/* number of active references to this fd */
if (--priv->np_refs > 0) {
- return 0;
+ return;
}
- netmap_use_count--;
- if (!na) {
- return 1; //XXX is it correct?
+ nm_os_put_module();
+ if (na) {
+ netmap_do_unregif(priv);
}
- netmap_do_unregif(priv);
- netmap_adapter_put(na);
- return 1;
+ netmap_unget_na(na, priv->np_ifp);
+ bzero(priv, sizeof(*priv)); /* for safety */
+ free(priv, M_DEVBUF);
}
@@ -995,15 +1025,10 @@ void
netmap_dtor(void *data)
{
struct netmap_priv_d *priv = data;
- int last_instance;
NMG_LOCK();
- last_instance = netmap_dtor_locked(priv);
+ netmap_priv_delete(priv);
NMG_UNLOCK();
- if (last_instance) {
- bzero(priv, sizeof(*priv)); /* for safety */
- free(priv, M_DEVBUF);
- }
}
@@ -1036,14 +1061,19 @@ static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
struct mbuf *m;
+ struct mbuf *head = NULL, *prev = NULL;
/* send packets up, outside the lock */
while ((m = mbq_dequeue(q)) != NULL) {
if (netmap_verbose & NM_VERB_HOST)
D("sending up pkt %p size %d", m, MBUF_LEN(m));
- NM_SEND_UP(dst, m);
+ prev = nm_os_send_up(dst, m, prev);
+ if (head == NULL)
+ head = prev;
}
- mbq_destroy(q);
+ if (head)
+ nm_os_send_up(dst, NULL, head);
+ mbq_fini(q);
}
@@ -1081,6 +1111,27 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
}
}
+static inline int
+_nm_may_forward(struct netmap_kring *kring)
+{
+ return ((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
+ kring->na->na_flags & NAF_HOST_RINGS &&
+ kring->tx == NR_RX);
+}
+
+static inline int
+nm_may_forward_up(struct netmap_kring *kring)
+{
+ return _nm_may_forward(kring) &&
+ kring->ring_id != kring->na->num_rx_rings;
+}
+
+static inline int
+nm_may_forward_down(struct netmap_kring *kring)
+{
+ return _nm_may_forward(kring) &&
+ kring->ring_id == kring->na->num_rx_rings;
+}
/*
* Send to the NIC rings packets marked NS_FORWARD between
@@ -1107,7 +1158,7 @@ netmap_sw_to_nic(struct netmap_adapter *na)
for (; rxcur != head && !nm_ring_empty(rdst);
rxcur = nm_next(rxcur, src_lim) ) {
struct netmap_slot *src, *dst, tmp;
- u_int dst_cur = rdst->cur;
+ u_int dst_head = rdst->head;
src = &rxslot[rxcur];
if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
@@ -1115,7 +1166,7 @@ netmap_sw_to_nic(struct netmap_adapter *na)
sent++;
- dst = &rdst->slot[dst_cur];
+ dst = &rdst->slot[dst_head];
tmp = *src;
@@ -1126,7 +1177,7 @@ netmap_sw_to_nic(struct netmap_adapter *na)
dst->len = tmp.len;
dst->flags = NS_BUF_CHANGED;
- rdst->cur = nm_next(dst_cur, dst_lim);
+ rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
}
/* if (sent) XXX txsync ? */
}
@@ -1140,10 +1191,10 @@ netmap_sw_to_nic(struct netmap_adapter *na)
* can be among multiple user threads erroneously calling
* this routine concurrently.
*/
-static void
-netmap_txsync_to_host(struct netmap_adapter *na)
+static int
+netmap_txsync_to_host(struct netmap_kring *kring, int flags)
{
- struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
+ struct netmap_adapter *na = kring->na;
u_int const lim = kring->nkr_num_slots - 1;
u_int const head = kring->rhead;
struct mbq q;
@@ -1162,6 +1213,7 @@ netmap_txsync_to_host(struct netmap_adapter *na)
kring->nr_hwtail -= lim + 1;
netmap_send_up(na->ifp, &q);
+ return 0;
}
@@ -1171,17 +1223,15 @@ netmap_txsync_to_host(struct netmap_adapter *na)
* We protect access to the kring using kring->rx_queue.lock
*
* This routine also does the selrecord if called from the poll handler
- * (we know because td != NULL).
+ * (we know because sr != NULL).
*
- * NOTE: on linux, selrecord() is defined as a macro and uses pwait
- * as an additional hidden argument.
* returns the number of packets delivered to tx queues in
* transparent mode, or a negative value if error
*/
static int
-netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
+netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
{
- struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
+ struct netmap_adapter *na = kring->na;
struct netmap_ring *ring = kring->ring;
u_int nm_i, n;
u_int const lim = kring->nkr_num_slots - 1;
@@ -1189,9 +1239,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
int ret = 0;
struct mbq *q = &kring->rx_queue, fq;
- (void)pwait; /* disable unused warnings */
- (void)td;
-
mbq_init(&fq); /* fq holds packets to be freed */
mbq_lock(q);
@@ -1226,19 +1273,20 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
*/
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* something was released */
- if (netmap_fwd || kring->ring->flags & NR_FORWARD)
+ if (nm_may_forward_down(kring)) {
ret = netmap_sw_to_nic(na);
+ if (ret > 0) {
+ kring->nr_kflags |= NR_FORWARD;
+ ret = 0;
+ }
+ }
kring->nr_hwcur = head;
}
- /* access copies of cur,tail in the kring */
- if (kring->rcur == kring->rtail && td) /* no bufs available */
- OS_selrecord(td, &kring->si);
-
mbq_unlock(q);
mbq_purge(&fq);
- mbq_destroy(&fq);
+ mbq_fini(&fq);
return ret;
}
@@ -1267,17 +1315,14 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
* 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC
*
*/
-
+static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
/* generic support */
int i = netmap_admode; /* Take a snapshot. */
struct netmap_adapter *prev_na;
-#ifdef WITH_GENERIC
- struct netmap_generic_adapter *gna;
int error = 0;
-#endif
*na = NULL; /* default */
@@ -1285,7 +1330,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
i = netmap_admode = NETMAP_ADMODE_BEST;
- if (NETMAP_CAPABLE(ifp)) {
+ if (NM_NA_VALID(ifp)) {
prev_na = NA(ifp);
/* If an adapter already exists, return it if
* there are active file descriptors or if
@@ -1310,10 +1355,9 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
/* If there isn't native support and netmap is not allowed
* to use generic adapters, we cannot satisfy the request.
*/
- if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
+ if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
return EOPNOTSUPP;
-#ifdef WITH_GENERIC
/* Otherwise, create a generic adapter and return it,
* saving the previously used netmap adapter, if any.
*
@@ -1328,25 +1372,12 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
* the branches above. This ensures that we never override
* a generic adapter with another generic adapter.
*/
- prev_na = NA(ifp);
error = generic_netmap_attach(ifp);
if (error)
return error;
*na = NA(ifp);
- gna = (struct netmap_generic_adapter*)NA(ifp);
- gna->prev = prev_na; /* save old na */
- if (prev_na != NULL) {
- ifunit_ref(ifp->if_xname);
- // XXX add a refcount ?
- netmap_adapter_get(prev_na);
- }
- ND("Created generic NA %p (prev %p)", gna, gna->prev);
-
return 0;
-#else /* !WITH_GENERIC */
- return EOPNOTSUPP;
-#endif
}
@@ -1364,21 +1395,22 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
* could not be allocated.
* If successful, hold a reference to the netmap adapter.
*
- * No reference is kept on the real interface, which may then
- * disappear at any time.
+ * If the interface specified by nmr is a system one, also keep
+ * a reference to it and return a valid *ifp.
*/
int
-netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na,
+ struct ifnet **ifp, int create)
{
- struct ifnet *ifp = NULL;
int error = 0;
struct netmap_adapter *ret = NULL;
*na = NULL; /* default return value */
+ *ifp = NULL;
NMG_LOCK_ASSERT();
- /* we cascade through all possible types of netmap adapter.
+ /* We cascade through all possible types of netmap adapter.
* All netmap_get_*_na() functions return an error and an na,
* with the following combinations:
*
@@ -1389,6 +1421,11 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
* !0 !NULL impossible
*/
+ /* try to see if this is a ptnetmap port */
+ error = netmap_get_pt_host_na(nmr, na, create);
+ if (error || *na != NULL)
+ return error;
+
/* try to see if this is a monitor port */
error = netmap_get_monitor_na(nmr, na, create);
if (error || *na != NULL)
@@ -1413,12 +1450,12 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
* This may still be a tap, a veth/epair, or even a
* persistent VALE port.
*/
- ifp = ifunit_ref(nmr->nr_name);
- if (ifp == NULL) {
+ *ifp = ifunit_ref(nmr->nr_name);
+ if (*ifp == NULL) {
return ENXIO;
}
- error = netmap_get_hw_na(ifp, &ret);
+ error = netmap_get_hw_na(*ifp, &ret);
if (error)
goto out;
@@ -1426,15 +1463,42 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
netmap_adapter_get(ret);
out:
- if (error && ret != NULL)
- netmap_adapter_put(ret);
-
- if (ifp)
- if_rele(ifp); /* allow live unloading of drivers modules */
+ if (error) {
+ if (ret)
+ netmap_adapter_put(ret);
+ if (*ifp) {
+ if_rele(*ifp);
+ *ifp = NULL;
+ }
+ }
return error;
}
+/* undo netmap_get_na() */
+void
+netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
+{
+ if (ifp)
+ if_rele(ifp);
+ if (na)
+ netmap_adapter_put(na);
+}
+
+
+#define NM_FAIL_ON(t) do { \
+ if (unlikely(t)) { \
+ RD(5, "%s: fail '" #t "' " \
+ "h %d c %d t %d " \
+ "rh %d rc %d rt %d " \
+ "hc %d ht %d", \
+ kring->name, \
+ head, cur, ring->tail, \
+ kring->rhead, kring->rcur, kring->rtail, \
+ kring->nr_hwcur, kring->nr_hwtail); \
+ return kring->nkr_num_slots; \
+ } \
+} while (0)
/*
* validate parameters on entry for *_txsync()
@@ -1449,11 +1513,9 @@ out:
*
* hwcur, rhead, rtail and hwtail are reliable
*/
-static u_int
-nm_txsync_prologue(struct netmap_kring *kring)
+u_int
+nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
-#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; }
- struct netmap_ring *ring = kring->ring;
u_int head = ring->head; /* read only once */
u_int cur = ring->cur; /* read only once */
u_int n = kring->nkr_num_slots;
@@ -1463,54 +1525,44 @@ nm_txsync_prologue(struct netmap_kring *kring)
kring->nr_hwcur, kring->nr_hwtail,
ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
- if (kring->nr_hwcur >= n || kring->rhead >= n ||
- kring->rtail >= n || kring->nr_hwtail >= n)
- goto error;
+ NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
+ kring->rtail >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
/*
- * user sanity checks. We only use 'cur',
- * A, B, ... are possible positions for cur:
+ * user sanity checks. We only use head,
+ * A, B, ... are possible positions for head:
*
- * 0 A cur B tail C n-1
- * 0 D tail E cur F n-1
+ * 0 A rhead B rtail C n-1
+ * 0 D rtail E rhead F n-1
*
* B, F, D are valid. A, C, E are wrong
*/
if (kring->rtail >= kring->rhead) {
/* want rhead <= head <= rtail */
- NM_ASSERT(head < kring->rhead || head > kring->rtail);
+ NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
/* and also head <= cur <= rtail */
- NM_ASSERT(cur < head || cur > kring->rtail);
+ NM_FAIL_ON(cur < head || cur > kring->rtail);
} else { /* here rtail < rhead */
/* we need head outside rtail .. rhead */
- NM_ASSERT(head > kring->rtail && head < kring->rhead);
+ NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
/* two cases now: head <= rtail or head >= rhead */
if (head <= kring->rtail) {
/* want head <= cur <= rtail */
- NM_ASSERT(cur < head || cur > kring->rtail);
+ NM_FAIL_ON(cur < head || cur > kring->rtail);
} else { /* head >= rhead */
/* cur must be outside rtail..head */
- NM_ASSERT(cur > kring->rtail && cur < head);
+ NM_FAIL_ON(cur > kring->rtail && cur < head);
}
}
if (ring->tail != kring->rtail) {
- RD(5, "tail overwritten was %d need %d",
+ RD(5, "%s tail overwritten was %d need %d", kring->name,
ring->tail, kring->rtail);
ring->tail = kring->rtail;
}
kring->rhead = head;
kring->rcur = cur;
return head;
-
-error:
- RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d",
- kring->name,
- head, cur, ring->tail,
- kring->rhead, kring->rcur, kring->rtail,
- kring->nr_hwcur, kring->nr_hwtail);
- return n;
-#undef NM_ASSERT
}
@@ -1525,10 +1577,9 @@ error:
* hwcur and hwtail are reliable.
*
*/
-static u_int
-nm_rxsync_prologue(struct netmap_kring *kring)
+u_int
+nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
- struct netmap_ring *ring = kring->ring;
uint32_t const n = kring->nkr_num_slots;
uint32_t head, cur;
@@ -1546,30 +1597,24 @@ nm_rxsync_prologue(struct netmap_kring *kring)
cur = kring->rcur = ring->cur; /* read only once */
head = kring->rhead = ring->head; /* read only once */
#if 1 /* kernel sanity checks */
- if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
- goto error;
+ NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
/* user sanity checks */
if (kring->nr_hwtail >= kring->nr_hwcur) {
/* want hwcur <= rhead <= hwtail */
- if (head < kring->nr_hwcur || head > kring->nr_hwtail)
- goto error;
+ NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
/* and also rhead <= rcur <= hwtail */
- if (cur < head || cur > kring->nr_hwtail)
- goto error;
+ NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
} else {
/* we need rhead outside hwtail..hwcur */
- if (head < kring->nr_hwcur && head > kring->nr_hwtail)
- goto error;
+ NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
/* two cases now: head <= hwtail or head >= hwcur */
if (head <= kring->nr_hwtail) {
/* want head <= cur <= hwtail */
- if (cur < head || cur > kring->nr_hwtail)
- goto error;
+ NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
} else {
/* cur must be outside hwtail..head */
- if (cur < head && cur > kring->nr_hwtail)
- goto error;
+ NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
}
}
if (ring->tail != kring->rtail) {
@@ -1579,13 +1624,6 @@ nm_rxsync_prologue(struct netmap_kring *kring)
ring->tail = kring->rtail;
}
return head;
-
-error:
- RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
- kring->nr_hwcur,
- kring->rcur, kring->nr_hwtail,
- kring->rhead, kring->rcur, ring->tail);
- return n;
}
@@ -1659,6 +1697,7 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags
struct netmap_adapter *na = priv->np_na;
u_int j, i = ringid & NETMAP_RING_MASK;
u_int reg = flags & NR_REG_MASK;
+ int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
enum txrx t;
if (reg == NR_REG_DEFAULT) {
@@ -1672,48 +1711,58 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags
}
D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
}
- switch (reg) {
- case NR_REG_ALL_NIC:
- case NR_REG_PIPE_MASTER:
- case NR_REG_PIPE_SLAVE:
- for_rx_tx(t) {
+
+ if ((flags & NR_PTNETMAP_HOST) && (reg != NR_REG_ALL_NIC ||
+ flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
+ D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
+ return EINVAL;
+ }
+
+ for_rx_tx(t) {
+ if (flags & excluded_direction[t]) {
+ priv->np_qfirst[t] = priv->np_qlast[t] = 0;
+ continue;
+ }
+ switch (reg) {
+ case NR_REG_ALL_NIC:
+ case NR_REG_PIPE_MASTER:
+ case NR_REG_PIPE_SLAVE:
priv->np_qfirst[t] = 0;
priv->np_qlast[t] = nma_get_nrings(na, t);
- }
- ND("%s %d %d", "ALL/PIPE",
- priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
- break;
- case NR_REG_SW:
- case NR_REG_NIC_SW:
- if (!(na->na_flags & NAF_HOST_RINGS)) {
- D("host rings not supported");
- return EINVAL;
- }
- for_rx_tx(t) {
+ ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
+ priv->np_qfirst[t], priv->np_qlast[t]);
+ break;
+ case NR_REG_SW:
+ case NR_REG_NIC_SW:
+ if (!(na->na_flags & NAF_HOST_RINGS)) {
+ D("host rings not supported");
+ return EINVAL;
+ }
priv->np_qfirst[t] = (reg == NR_REG_SW ?
nma_get_nrings(na, t) : 0);
priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
- }
- ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
- priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
- break;
- case NR_REG_ONE_NIC:
- if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
- D("invalid ring id %d", i);
- return EINVAL;
- }
- for_rx_tx(t) {
+ ND("%s: %s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
+ nm_txrx2str(t),
+ priv->np_qfirst[t], priv->np_qlast[t]);
+ break;
+ case NR_REG_ONE_NIC:
+ if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
+ D("invalid ring id %d", i);
+ return EINVAL;
+ }
/* if not enough rings, use the first one */
j = i;
if (j >= nma_get_nrings(na, t))
j = 0;
priv->np_qfirst[t] = j;
priv->np_qlast[t] = j + 1;
+ ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
+ priv->np_qfirst[t], priv->np_qlast[t]);
+ break;
+ default:
+ D("invalid regif type %d", reg);
+ return EINVAL;
}
- break;
- default:
- D("invalid regif type %d", reg);
- return EINVAL;
}
priv->np_flags = (flags & ~NR_REG_MASK) | reg;
@@ -1776,11 +1825,12 @@ netmap_unset_ringid(struct netmap_priv_d *priv)
}
-/* check that the rings we want to bind are not exclusively owned by a previous
- * bind. If exclusive ownership has been requested, we also mark the rings.
+/* Set the nr_pending_mode for the requested rings.
+ * If requested, also try to get exclusive access to the rings, provided
+ * the rings we want to bind are not exclusively owned by a previous bind.
*/
static int
-netmap_get_exclusive(struct netmap_priv_d *priv)
+netmap_krings_get(struct netmap_priv_d *priv)
{
struct netmap_adapter *na = priv->np_na;
u_int i;
@@ -1811,16 +1861,16 @@ netmap_get_exclusive(struct netmap_priv_d *priv)
}
}
- /* second round: increment usage cound and possibly
- * mark as exclusive
+ /* second round: increment usage count (possibly marking them
+ * as exclusive) and set the nr_pending_mode
*/
-
for_rx_tx(t) {
for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
kring = &NMR(na, t)[i];
kring->users++;
if (excl)
kring->nr_kflags |= NKR_EXCLUSIVE;
+ kring->nr_pending_mode = NKR_NETMAP_ON;
}
}
@@ -1828,9 +1878,11 @@ netmap_get_exclusive(struct netmap_priv_d *priv)
}
-/* undo netmap_get_ownership() */
+/* Undo netmap_krings_get(). This is done by clearing the exclusive mode
+ * if was asked on regif, and unset the nr_pending_mode if we are the
+ * last users of the involved rings. */
static void
-netmap_rel_exclusive(struct netmap_priv_d *priv)
+netmap_krings_put(struct netmap_priv_d *priv)
{
struct netmap_adapter *na = priv->np_na;
u_int i;
@@ -1852,6 +1904,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv)
if (excl)
kring->nr_kflags &= ~NKR_EXCLUSIVE;
kring->users--;
+ if (kring->users == 0)
+ kring->nr_pending_mode = NKR_NETMAP_OFF;
}
}
}
@@ -1899,9 +1953,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv)
* (put the adapter in netmap mode)
*
* This may be one of the following:
- * (XXX these should be either all *_register or all *_reg 2014-03-15)
*
- * * netmap_hw_register (hw ports)
+ * * netmap_hw_reg (hw ports)
* checks that the ifp is still there, then calls
* the hardware specific callback;
*
@@ -1919,7 +1972,7 @@ netmap_rel_exclusive(struct netmap_priv_d *priv)
* intercept the sync callbacks of the monitored
* rings
*
- * * netmap_bwrap_register (bwraps)
+ * * netmap_bwrap_reg (bwraps)
* cross-link the bwrap and hwna rings,
* forward the request to the hwna, override
* the hwna notify callback (to get the frames
@@ -1948,7 +2001,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
if (na->active_fds == 0) {
/*
* If this is the first registration of the adapter,
- * also create the netmap rings and their in-kernel view,
+ * create the in-kernel view of the netmap rings,
* the netmap krings.
*/
@@ -1960,39 +2013,48 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
if (error)
goto err_drop_mem;
- /* create all missing netmap rings */
- error = netmap_mem_rings_create(na);
- if (error)
- goto err_del_krings;
}
- /* now the kring must exist and we can check whether some
- * previous bind has exclusive ownership on them
+ /* now the krings must exist and we can check whether some
+ * previous bind has exclusive ownership on them, and set
+ * nr_pending_mode
*/
- error = netmap_get_exclusive(priv);
+ error = netmap_krings_get(priv);
if (error)
- goto err_del_rings;
+ goto err_del_krings;
+
+ /* create all needed missing netmap rings */
+ error = netmap_mem_rings_create(na);
+ if (error)
+ goto err_rel_excl;
/* in all cases, create a new netmap if */
nifp = netmap_mem_if_new(na);
if (nifp == NULL) {
error = ENOMEM;
- goto err_rel_excl;
+ goto err_del_rings;
}
- na->active_fds++;
- if (!nm_netmap_on(na)) {
- /* Netmap not active, set the card in netmap mode
- * and make it use the shared buffers.
- */
+ if (na->active_fds == 0) {
/* cache the allocator info in the na */
- netmap_mem_get_lut(na->nm_mem, &na->na_lut);
- ND("%p->na_lut == %p", na, na->na_lut.lut);
- error = na->nm_register(na, 1); /* mode on */
- if (error)
+ error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
+ if (error)
goto err_del_if;
+ ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
+ na->na_lut.objsize);
+ }
+
+ if (nm_kring_pending(priv)) {
+ /* Some kring is switching mode, tell the adapter to
+ * react on this. */
+ error = na->nm_register(na, 1);
+ if (error)
+ goto err_put_lut;
}
+ /* Commit the reference. */
+ na->active_fds++;
+
/*
* advertise that the interface is ready by setting np_nifp.
* The barrier is needed because readers (poll, *SYNC and mmap)
@@ -2003,15 +2065,15 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
return 0;
+err_put_lut:
+ if (na->active_fds == 0)
+ memset(&na->na_lut, 0, sizeof(na->na_lut));
err_del_if:
- memset(&na->na_lut, 0, sizeof(na->na_lut));
- na->active_fds--;
netmap_mem_if_delete(na, nifp);
err_rel_excl:
- netmap_rel_exclusive(priv);
+ netmap_krings_put(priv);
err_del_rings:
- if (na->active_fds == 0)
- netmap_mem_rings_delete(na);
+ netmap_mem_rings_delete(na);
err_del_krings:
if (na->active_fds == 0)
na->nm_krings_delete(na);
@@ -2024,41 +2086,23 @@ err:
/*
- * update kring and ring at the end of txsync.
+ * update kring and ring at the end of rxsync/txsync.
*/
static inline void
-nm_txsync_finalize(struct netmap_kring *kring)
+nm_sync_finalize(struct netmap_kring *kring)
{
- /* update ring tail to what the kernel knows */
+ /*
+ * Update ring tail to what the kernel knows
+ * After txsync: head/rhead/hwcur might be behind cur/rcur
+ * if no carrier.
+ */
kring->ring->tail = kring->rtail = kring->nr_hwtail;
- /* note, head/rhead/hwcur might be behind cur/rcur
- * if no carrier
- */
ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
kring->name, kring->nr_hwcur, kring->nr_hwtail,
kring->rhead, kring->rcur, kring->rtail);
}
-
-/*
- * update kring and ring at the end of rxsync
- */
-static inline void
-nm_rxsync_finalize(struct netmap_kring *kring)
-{
- /* tell userspace that there might be new packets */
- //struct netmap_ring *ring = kring->ring;
- ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail,
- kring->nr_hwtail);
- kring->ring->tail = kring->rtail = kring->nr_hwtail;
- /* make a copy of the state for next round */
- kring->rhead = kring->ring->head;
- kring->rcur = kring->ring->cur;
-}
-
-
-
/*
* ioctl(2) support for the "netmap" device.
*
@@ -2072,21 +2116,17 @@ nm_rxsync_finalize(struct netmap_kring *kring)
* Return 0 on success, errno otherwise.
*/
int
-netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
- int fflag, struct thread *td)
+netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td)
{
- struct netmap_priv_d *priv = NULL;
struct nmreq *nmr = (struct nmreq *) data;
struct netmap_adapter *na = NULL;
- int error;
+ struct ifnet *ifp = NULL;
+ int error = 0;
u_int i, qfirst, qlast;
struct netmap_if *nifp;
struct netmap_kring *krings;
enum txrx t;
- (void)dev; /* UNUSED */
- (void)fflag; /* UNUSED */
-
if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
/* truncate name */
nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
@@ -2101,15 +2141,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
return EINVAL;
}
}
- CURVNET_SET(TD_TO_VNET(td));
-
- error = devfs_get_cdevpriv((void **)&priv);
- if (error) {
- CURVNET_RESTORE();
- /* XXX ENOENT should be impossible, since the priv
- * is now created in the open */
- return (error == ENOENT ? ENXIO : error);
- }
switch (cmd) {
case NIOCGINFO: /* return capabilities etc */
@@ -2125,10 +2156,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
u_int memflags;
if (nmr->nr_name[0] != '\0') {
+
/* get a refcount */
- error = netmap_get_na(nmr, &na, 1 /* create */);
- if (error)
+ error = netmap_get_na(nmr, &na, &ifp, 1 /* create */);
+ if (error) {
+ na = NULL;
+ ifp = NULL;
break;
+ }
nmd = na->nm_mem; /* get memory allocator */
}
@@ -2145,8 +2180,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
nmr->nr_tx_rings = na->num_tx_rings;
nmr->nr_rx_slots = na->num_rx_desc;
nmr->nr_tx_slots = na->num_tx_desc;
- netmap_adapter_put(na);
} while (0);
+ netmap_unget_na(na, ifp);
NMG_UNLOCK();
break;
@@ -2156,9 +2191,25 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
|| i == NETMAP_BDG_VNET_HDR
|| i == NETMAP_BDG_NEWIF
- || i == NETMAP_BDG_DELIF) {
+ || i == NETMAP_BDG_DELIF
+ || i == NETMAP_BDG_POLLING_ON
+ || i == NETMAP_BDG_POLLING_OFF) {
error = netmap_bdg_ctl(nmr, NULL);
break;
+ } else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) {
+ error = ptnetmap_ctl(nmr, priv->np_na);
+ break;
+ } else if (i == NETMAP_VNET_HDR_GET) {
+ struct ifnet *ifp;
+
+ NMG_LOCK();
+ error = netmap_get_na(nmr, &na, &ifp, 0);
+ if (na && !error) {
+ nmr->nr_arg1 = na->virt_hdr_len;
+ }
+ netmap_unget_na(na, ifp);
+ NMG_UNLOCK();
+ break;
} else if (i != 0) {
D("nr_cmd must be 0 not %d", i);
error = EINVAL;
@@ -2169,23 +2220,32 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
NMG_LOCK();
do {
u_int memflags;
+ struct ifnet *ifp;
if (priv->np_nifp != NULL) { /* thread already registered */
error = EBUSY;
break;
}
/* find the interface and a reference */
- error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
+ error = netmap_get_na(nmr, &na, &ifp,
+ 1 /* create */); /* keep reference */
if (error)
break;
if (NETMAP_OWNED_BY_KERN(na)) {
- netmap_adapter_put(na);
+ netmap_unget_na(na, ifp);
error = EBUSY;
break;
}
+
+ if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) {
+ netmap_unget_na(na, ifp);
+ error = EIO;
+ break;
+ }
+
error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
if (error) { /* reg. failed, release priv and ref */
- netmap_adapter_put(na);
+ netmap_unget_na(na, ifp);
break;
}
nifp = priv->np_nifp;
@@ -2200,7 +2260,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
&nmr->nr_arg2);
if (error) {
netmap_do_unregif(priv);
- netmap_adapter_put(na);
+ netmap_unget_na(na, ifp);
break;
}
if (memflags & NETMAP_MEM_PRIVATE) {
@@ -2212,12 +2272,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
}
if (nmr->nr_arg3) {
- D("requested %d extra buffers", nmr->nr_arg3);
+ if (netmap_verbose)
+ D("requested %d extra buffers", nmr->nr_arg3);
nmr->nr_arg3 = netmap_extra_alloc(na,
&nifp->ni_bufs_head, nmr->nr_arg3);
- D("got %d extra buffers", nmr->nr_arg3);
+ if (netmap_verbose)
+ D("got %d extra buffers", nmr->nr_arg3);
}
nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
+
+ /* store ifp reference so that priv destructor may release it */
+ priv->np_ifp = ifp;
} while (0);
NMG_UNLOCK();
break;
@@ -2240,11 +2305,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
break;
}
- if (!nm_netmap_on(na)) {
- error = ENXIO;
- break;
- }
-
t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
krings = NMR(na, t);
qfirst = priv->np_qfirst[t];
@@ -2252,31 +2312,34 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
for (i = qfirst; i < qlast; i++) {
struct netmap_kring *kring = krings + i;
- if (nm_kr_tryget(kring)) {
- error = EBUSY;
- goto out;
+ struct netmap_ring *ring = kring->ring;
+
+ if (unlikely(nm_kr_tryget(kring, 1, &error))) {
+ error = (error ? EIO : 0);
+ continue;
}
+
if (cmd == NIOCTXSYNC) {
if (netmap_verbose & NM_VERB_TXSYNC)
D("pre txsync ring %d cur %d hwcur %d",
- i, kring->ring->cur,
+ i, ring->cur,
kring->nr_hwcur);
- if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
+ if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
netmap_ring_reinit(kring);
} else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) {
- nm_txsync_finalize(kring);
+ nm_sync_finalize(kring);
}
if (netmap_verbose & NM_VERB_TXSYNC)
D("post txsync ring %d cur %d hwcur %d",
- i, kring->ring->cur,
+ i, ring->cur,
kring->nr_hwcur);
} else {
- if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
+ if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
netmap_ring_reinit(kring);
} else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) {
- nm_rxsync_finalize(kring);
+ nm_sync_finalize(kring);
}
- microtime(&na->rx_rings[i].ring->ts);
+ microtime(&ring->ts);
}
nm_kr_put(kring);
}
@@ -2323,9 +2386,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
error = EOPNOTSUPP;
#endif /* linux */
}
-out:
- CURVNET_RESTORE();
return (error);
}
@@ -2345,17 +2406,15 @@ out:
* hidden argument.
*/
int
-netmap_poll(struct cdev *dev, int events, struct thread *td)
+netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
{
- struct netmap_priv_d *priv = NULL;
struct netmap_adapter *na;
struct netmap_kring *kring;
+ struct netmap_ring *ring;
u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
#define want_tx want[NR_TX]
#define want_rx want[NR_RX]
struct mbq q; /* packets from hw queues to host stack */
- void *pwait = dev; /* linux compatibility */
- int is_kevent = 0;
enum txrx t;
/*
@@ -2365,23 +2424,13 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
*/
int retry_tx = 1, retry_rx = 1;
- (void)pwait;
- mbq_init(&q);
-
- /*
- * XXX kevent has curthread->tp_fop == NULL,
- * so devfs_get_cdevpriv() fails. We circumvent this by passing
- * priv as the first argument, which is also useful to avoid
- * the selrecord() which are not necessary in that case.
+ /* transparent mode: send_down is 1 if we have found some
+ * packets to forward during the rx scan and we have not
+ * sent them down to the nic yet
*/
- if (devfs_get_cdevpriv((void **)&priv) != 0) {
- is_kevent = 1;
- if (netmap_verbose)
- D("called from kevent");
- priv = (struct netmap_priv_d *)dev;
- }
- if (priv == NULL)
- return POLLERR;
+ int send_down = 0;
+
+ mbq_init(&q);
if (priv->np_nifp == NULL) {
D("No if registered");
@@ -2399,7 +2448,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
want_tx = events & (POLLOUT | POLLWRNORM);
want_rx = events & (POLLIN | POLLRDNORM);
-
/*
* check_all_{tx|rx} are set if the card has more than one queue AND
* the file descriptor is bound to all of them. If so, we sleep on
@@ -2421,6 +2469,32 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* slots available. If this fails, then lock and call the sync
* routines.
*/
+#if 1 /* new code- call rx if any of the ring needs to release or read buffers */
+ if (want_tx) {
+ t = NR_TX;
+ for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
+ kring = &NMR(na, t)[i];
+ /* XXX compare ring->cur and kring->tail */
+ if (!nm_ring_empty(kring->ring)) {
+ revents |= want[t];
+ want[t] = 0; /* also breaks the loop */
+ }
+ }
+ }
+ if (want_rx) {
+ want_rx = 0; /* look for a reason to run the handlers */
+ t = NR_RX;
+ for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
+ kring = &NMR(na, t)[i];
+ if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
+ || kring->rhead != kring->ring->head /* release buffers */) {
+ want_rx = 1;
+ }
+ }
+ if (!want_rx)
+ revents |= events & (POLLIN | POLLRDNORM); /* we have data */
+ }
+#else /* old code */
for_rx_tx(t) {
for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
kring = &NMR(na, t)[i];
@@ -2431,6 +2505,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
}
}
}
+#endif /* old code */
/*
* If we want to push packets out (priv->np_txpoll) or
@@ -2447,32 +2522,26 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* used to skip rings with no pending transmissions.
*/
flush_tx:
- for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_RX]; i++) {
+ for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
int found = 0;
kring = &na->tx_rings[i];
- if (!want_tx && kring->ring->cur == kring->nr_hwcur)
+ ring = kring->ring;
+
+ if (!send_down && !want_tx && ring->cur == kring->nr_hwcur)
continue;
- /* only one thread does txsync */
- if (nm_kr_tryget(kring)) {
- /* either busy or stopped
- * XXX if the ring is stopped, sleeping would
- * be better. In current code, however, we only
- * stop the rings for brief intervals (2014-03-14)
- */
- if (netmap_verbose)
- RD(2, "%p lost race on txring %d, ok",
- priv, i);
+
+ if (nm_kr_tryget(kring, 1, &revents))
continue;
- }
- if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
+
+ if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
netmap_ring_reinit(kring);
revents |= POLLERR;
} else {
if (kring->nm_sync(kring, 0))
revents |= POLLERR;
else
- nm_txsync_finalize(kring);
+ nm_sync_finalize(kring);
}
/*
@@ -2489,8 +2558,10 @@ flush_tx:
kring->nm_notify(kring, 0);
}
}
- if (want_tx && retry_tx && !is_kevent) {
- OS_selrecord(td, check_all_tx ?
+ /* if there were any packet to forward we must have handled them by now */
+ send_down = 0;
+ if (want_tx && retry_tx && sr) {
+ nm_os_selrecord(sr, check_all_tx ?
&na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
retry_tx = 0;
goto flush_tx;
@@ -2502,22 +2573,18 @@ flush_tx:
* Do it on all rings because otherwise we starve.
*/
if (want_rx) {
- int send_down = 0; /* transparent mode */
/* two rounds here for race avoidance */
do_retry_rx:
for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
int found = 0;
kring = &na->rx_rings[i];
+ ring = kring->ring;
- if (nm_kr_tryget(kring)) {
- if (netmap_verbose)
- RD(2, "%p lost race on rxring %d, ok",
- priv, i);
+ if (unlikely(nm_kr_tryget(kring, 1, &revents)))
continue;
- }
- if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
+ if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
netmap_ring_reinit(kring);
revents |= POLLERR;
}
@@ -2526,22 +2593,22 @@ do_retry_rx:
/*
* transparent mode support: collect packets
* from the rxring(s).
- * XXX NR_FORWARD should only be read on
- * physical or NIC ports
*/
- if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
+ if (nm_may_forward_up(kring)) {
ND(10, "forwarding some buffers up %d to %d",
- kring->nr_hwcur, kring->ring->cur);
+ kring->nr_hwcur, ring->cur);
netmap_grab_packets(kring, &q, netmap_fwd);
}
+ kring->nr_kflags &= ~NR_FORWARD;
if (kring->nm_sync(kring, 0))
revents |= POLLERR;
else
- nm_rxsync_finalize(kring);
+ nm_sync_finalize(kring);
+ send_down |= (kring->nr_kflags & NR_FORWARD); /* host ring only */
if (netmap_no_timestamp == 0 ||
- kring->ring->flags & NR_TIMESTAMP) {
- microtime(&kring->ring->ts);
+ ring->flags & NR_TIMESTAMP) {
+ microtime(&ring->ts);
}
found = kring->rcur != kring->rtail;
nm_kr_put(kring);
@@ -2552,22 +2619,10 @@ do_retry_rx:
}
}
- /* transparent mode XXX only during first pass ? */
- if (na->na_flags & NAF_HOST_RINGS) {
- kring = &na->rx_rings[na->num_rx_rings];
- if (check_all_rx
- && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
- /* XXX fix to use kring fields */
- if (nm_ring_empty(kring->ring))
- send_down = netmap_rxsync_from_host(na, td, dev);
- if (!nm_ring_empty(kring->ring))
- revents |= want_rx;
- }
- }
-
- if (retry_rx && !is_kevent)
- OS_selrecord(td, check_all_rx ?
+ if (retry_rx && sr) {
+ nm_os_selrecord(sr, check_all_rx ?
&na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
+ }
if (send_down > 0 || retry_rx) {
retry_rx = 0;
if (send_down)
@@ -2582,15 +2637,14 @@ do_retry_rx:
* kring->nr_hwcur and ring->head
* are passed to the other endpoint.
*
- * In this mode we also scan the sw rxring, which in
- * turn passes packets up.
- *
- * XXX Transparent mode at the moment requires to bind all
+ * Transparent mode requires to bind all
* rings to a single file descriptor.
*/
- if (q.head && na->ifp != NULL)
+ if (q.head && !nm_kr_tryget(&na->tx_rings[na->num_tx_rings], 1, &revents)) {
netmap_send_up(na->ifp, &q);
+ nm_kr_put(&na->tx_rings[na->num_tx_rings]);
+ }
return (revents);
#undef want_tx
@@ -2600,8 +2654,6 @@ do_retry_rx:
/*-------------------- driver support routines -------------------*/
-static int netmap_hw_krings_create(struct netmap_adapter *);
-
/* default notify callback */
static int
netmap_notify(struct netmap_kring *kring, int flags)
@@ -2609,51 +2661,51 @@ netmap_notify(struct netmap_kring *kring, int flags)
struct netmap_adapter *na = kring->na;
enum txrx t = kring->tx;
- OS_selwakeup(&kring->si, PI_NET);
+ nm_os_selwakeup(&kring->si);
/* optimization: avoid a wake up on the global
* queue if nobody has registered for more
* than one ring
*/
if (na->si_users[t] > 0)
- OS_selwakeup(&na->si[t], PI_NET);
+ nm_os_selwakeup(&na->si[t]);
- return 0;
+ return NM_IRQ_COMPLETED;
}
+#if 0
+static int
+netmap_notify(struct netmap_adapter *na, u_int n_ring,
+enum txrx tx, int flags)
+{
+ if (tx == NR_TX) {
+ KeSetEvent(notes->TX_EVENT, 0, FALSE);
+ }
+ else
+ {
+ KeSetEvent(notes->RX_EVENT, 0, FALSE);
+ }
+ return 0;
+}
+#endif
/* called by all routines that create netmap_adapters.
- * Attach na to the ifp (if any) and provide defaults
- * for optional callbacks. Defaults assume that we
- * are creating an hardware netmap_adapter.
+ * provide some defaults and get a reference to the
+ * memory allocator
*/
int
netmap_attach_common(struct netmap_adapter *na)
{
- struct ifnet *ifp = na->ifp;
-
if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
D("%s: invalid rings tx %d rx %d",
na->name, na->num_tx_rings, na->num_rx_rings);
return EINVAL;
}
- /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
- * pipes, monitors). For bwrap we actually have a non-null ifp for
- * use by the external modules, but that is set after this
- * function has been called.
- * XXX this is ugly, maybe split this function in two (2014-03-14)
- */
- if (ifp != NULL) {
- WNA(ifp) = na;
- /* the following is only needed for na that use the host port.
- * XXX do we have something similar for linux ?
- */
#ifdef __FreeBSD__
- na->if_input = ifp->if_input; /* for netmap_send_up */
-#endif /* __FreeBSD__ */
-
- NETMAP_SET_CAPABLE(ifp);
+ if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
+ na->if_input = na->ifp->if_input; /* for netmap_send_up */
}
+#endif /* __FreeBSD__ */
if (na->nm_krings_create == NULL) {
/* we assume that we have been called by a driver,
* since other port types all provide their own
@@ -2677,6 +2729,7 @@ netmap_attach_common(struct netmap_adapter *na)
*/
na->nm_bdg_attach = netmap_bwrap_attach;
#endif
+
return 0;
}
@@ -2685,9 +2738,6 @@ netmap_attach_common(struct netmap_adapter *na)
void
netmap_detach_common(struct netmap_adapter *na)
{
- if (na->ifp != NULL)
- WNA(na->ifp) = NULL; /* XXX do we need this? */
-
if (na->tx_rings) { /* XXX should not happen */
D("freeing leftover tx_rings");
na->nm_krings_delete(na);
@@ -2699,31 +2749,52 @@ netmap_detach_common(struct netmap_adapter *na)
free(na, M_DEVBUF);
}
-/* Wrapper for the register callback provided hardware drivers.
- * na->ifp == NULL means the driver module has been
+/* Wrapper for the register callback provided netmap-enabled
+ * hardware drivers.
+ * nm_iszombie(na) means that the driver module has been
* unloaded, so we cannot call into it.
- * Note that module unloading, in our patched linux drivers,
- * happens under NMG_LOCK and after having stopped all the
- * nic rings (see netmap_detach). This provides sufficient
- * protection for the other driver-provied callbacks
- * (i.e., nm_config and nm_*xsync), that therefore don't need
- * to wrapped.
+ * nm_os_ifnet_lock() must guarantee mutual exclusion with
+ * module unloading.
*/
static int
-netmap_hw_register(struct netmap_adapter *na, int onoff)
+netmap_hw_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_hw_adapter *hwna =
(struct netmap_hw_adapter*)na;
+ int error = 0;
+
+ nm_os_ifnet_lock();
+
+ if (nm_iszombie(na)) {
+ if (onoff) {
+ error = ENXIO;
+ } else if (na != NULL) {
+ na->na_flags &= ~NAF_NETMAP_ON;
+ }
+ goto out;
+ }
+
+ error = hwna->nm_hw_register(na, onoff);
+
+out:
+ nm_os_ifnet_unlock();
+
+ return error;
+}
- if (na->ifp == NULL)
- return onoff ? ENXIO : 0;
+static void
+netmap_hw_dtor(struct netmap_adapter *na)
+{
+ if (nm_iszombie(na) || na->ifp == NULL)
+ return;
- return hwna->nm_hw_register(na, onoff);
+ WNA(na->ifp) = NULL;
}
/*
- * Initialize a ``netmap_adapter`` object created by driver on attach.
+ * Allocate a ``netmap_adapter`` object, and initialize it from the
+ * 'arg' passed by the driver on attach.
* We allocate a block of memory with room for a struct netmap_adapter
* plus two sets of N+2 struct netmap_kring (where N is the number
* of hardware rings):
@@ -2732,29 +2803,31 @@ netmap_hw_register(struct netmap_adapter *na, int onoff)
* kring N+1 is only used for the selinfo for all queues. // XXX still true ?
* Return 0 on success, ENOMEM otherwise.
*/
-int
-netmap_attach(struct netmap_adapter *arg)
+static int
+_netmap_attach(struct netmap_adapter *arg, size_t size)
{
struct netmap_hw_adapter *hwna = NULL;
- // XXX when is arg == NULL ?
- struct ifnet *ifp = arg ? arg->ifp : NULL;
+ struct ifnet *ifp = NULL;
- if (arg == NULL || ifp == NULL)
+ if (arg == NULL || arg->ifp == NULL)
goto fail;
- hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ ifp = arg->ifp;
+ hwna = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
if (hwna == NULL)
goto fail;
hwna->up = *arg;
hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
hwna->nm_hw_register = hwna->up.nm_register;
- hwna->up.nm_register = netmap_hw_register;
+ hwna->up.nm_register = netmap_hw_reg;
if (netmap_attach_common(&hwna->up)) {
free(hwna, M_DEVBUF);
goto fail;
}
netmap_adapter_get(&hwna->up);
+ NM_ATTACH_NA(ifp, &hwna->up);
+
#ifdef linux
if (ifp->netdev_ops) {
/* prepare a clone of the netdev ops */
@@ -2762,7 +2835,7 @@ netmap_attach(struct netmap_adapter *arg)
hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
#else
hwna->nm_ndo = *ifp->netdev_ops;
-#endif
+#endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */
}
hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
if (ifp->ethtool_ops) {
@@ -2771,11 +2844,14 @@ netmap_attach(struct netmap_adapter *arg)
hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
#ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
hwna->nm_eto.set_channels = linux_netmap_set_channels;
-#endif
+#endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */
if (arg->nm_config == NULL) {
hwna->up.nm_config = netmap_linux_config;
}
#endif /* linux */
+ if (arg->nm_dtor == NULL) {
+ hwna->up.nm_dtor = netmap_hw_dtor;
+ }
if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
hwna->up.num_tx_rings, hwna->up.num_tx_desc,
@@ -2784,12 +2860,57 @@ netmap_attach(struct netmap_adapter *arg)
fail:
D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
- if (ifp)
- netmap_detach(ifp);
return (hwna ? EINVAL : ENOMEM);
}
+int
+netmap_attach(struct netmap_adapter *arg)
+{
+ return _netmap_attach(arg, sizeof(struct netmap_hw_adapter));
+}
+
+
+#ifdef WITH_PTNETMAP_GUEST
+int
+netmap_pt_guest_attach(struct netmap_adapter *arg,
+ void *csb,
+ unsigned int nifp_offset,
+ nm_pt_guest_ptctl_t ptctl)
+{
+ struct netmap_pt_guest_adapter *ptna;
+ struct ifnet *ifp = arg ? arg->ifp : NULL;
+ int error;
+
+ /* get allocator */
+ arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, ptctl);
+ if (arg->nm_mem == NULL)
+ return ENOMEM;
+ arg->na_flags |= NAF_MEM_OWNER;
+ error = _netmap_attach(arg, sizeof(struct netmap_pt_guest_adapter));
+ if (error)
+ return error;
+
+ /* get the netmap_pt_guest_adapter */
+ ptna = (struct netmap_pt_guest_adapter *) NA(ifp);
+ ptna->csb = csb;
+
+ /* Initialize a separate pass-through netmap adapter that is going to
+ * be used by the ptnet driver only, and so never exposed to netmap
+ * applications. We only need a subset of the available fields. */
+ memset(&ptna->dr, 0, sizeof(ptna->dr));
+ ptna->dr.up.ifp = ifp;
+ ptna->dr.up.nm_mem = ptna->hwup.up.nm_mem;
+ netmap_mem_get(ptna->dr.up.nm_mem);
+ ptna->dr.up.nm_config = ptna->hwup.up.nm_config;
+
+ ptna->backend_regifs = 0;
+
+ return 0;
+}
+#endif /* WITH_PTNETMAP_GUEST */
+
+
void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
@@ -2841,28 +2962,29 @@ void
netmap_detach(struct ifnet *ifp)
{
struct netmap_adapter *na = NA(ifp);
- int skip;
if (!na)
return;
- skip = 0;
NMG_LOCK();
- netmap_disable_all_rings(ifp);
- na->ifp = NULL;
- na->na_flags &= ~NAF_NETMAP_ON;
+ netmap_set_all_rings(na, NM_KR_LOCKED);
+ na->na_flags |= NAF_ZOMBIE;
/*
* if the netmap adapter is not native, somebody
* changed it, so we can not release it here.
- * The NULL na->ifp will notify the new owner that
+ * The NAF_ZOMBIE flag will notify the new owner that
* the driver is gone.
*/
if (na->na_flags & NAF_NATIVE) {
- skip = netmap_adapter_put(na);
+ netmap_adapter_put(na);
}
- /* give them a chance to notice */
- if (skip == 0)
- netmap_enable_all_rings(ifp);
+ /* give active users a chance to notice that NAF_ZOMBIE has been
+ * turned on, so that they can stop and return an error to userspace.
+ * Note that this becomes a NOP if there are no active users and,
+ * therefore, the put() above has deleted the na, since now NA(ifp) is
+ * NULL.
+ */
+ netmap_enable_all_rings(ifp);
NMG_UNLOCK();
}
@@ -2883,9 +3005,10 @@ int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct netmap_adapter *na = NA(ifp);
- struct netmap_kring *kring;
+ struct netmap_kring *kring, *tx_kring;
u_int len = MBUF_LEN(m);
u_int error = ENOBUFS;
+ unsigned int txr;
struct mbq *q;
int space;
@@ -2900,6 +3023,16 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
goto done;
}
+ txr = MBUF_TXQ(m);
+ if (txr >= na->num_tx_rings) {
+ txr %= na->num_tx_rings;
+ }
+ tx_kring = &NMR(na, NR_TX)[txr];
+
+ if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
+ return MBUF_TRANSMIT(na, ifp, m);
+ }
+
q = &kring->rx_queue;
// XXX reconsider long packets if we handle fragments
@@ -2909,6 +3042,11 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
goto done;
}
+ if (nm_os_mbuf_has_offld(m)) {
+ RD(1, "%s drop mbuf requiring offloadings", na->name);
+ goto done;
+ }
+
/* protect against rxsync_from_host(), netmap_sw_to_nic()
* and maybe other instances of netmap_transmit (the latter
* not possible on Linux).
@@ -2951,6 +3089,8 @@ done:
* netmap_reset() is called by the driver routines when reinitializing
* a ring. The driver is in charge of locking to protect the kring.
* If native netmap mode is not set just return NULL.
+ * If native netmap mode is set, in particular, we have to set nr_mode to
+ * NKR_NETMAP_ON.
*/
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
@@ -2975,13 +3115,26 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
if (tx == NR_TX) {
if (n >= na->num_tx_rings)
return NULL;
+
kring = na->tx_rings + n;
+
+ if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
+ kring->nr_mode = NKR_NETMAP_OFF;
+ return NULL;
+ }
+
// XXX check whether we should use hwcur or rcur
new_hwofs = kring->nr_hwcur - new_cur;
} else {
if (n >= na->num_rx_rings)
return NULL;
kring = na->rx_rings + n;
+
+ if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
+ kring->nr_mode = NKR_NETMAP_OFF;
+ return NULL;
+ }
+
new_hwofs = kring->nr_hwtail - new_cur;
}
lim = kring->nkr_num_slots - 1;
@@ -3018,6 +3171,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
* We do the wakeup here, but the ring is not yet reconfigured.
* However, we are under lock so there are no races.
*/
+ kring->nr_mode = NKR_NETMAP_ON;
kring->nm_notify(kring, 0);
return kring->ring->slot;
}
@@ -3037,10 +3191,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
* - for a nic connected to a switch, call the proper forwarding routine
* (see netmap_bwrap_intr_notify)
*/
-void
-netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
+int
+netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
{
- struct netmap_adapter *na = NA(ifp);
struct netmap_kring *kring;
enum txrx t = (work_done ? NR_RX : NR_TX);
@@ -3051,15 +3204,20 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
}
if (q >= nma_get_nrings(na, t))
- return; // not a physical queue
+ return NM_IRQ_PASS; // not a physical queue
kring = NMR(na, t) + q;
+ if (kring->nr_mode == NKR_NETMAP_OFF) {
+ return NM_IRQ_PASS;
+ }
+
if (t == NR_RX) {
kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ?
*work_done = 1; /* do not fire napi again */
}
- kring->nm_notify(kring, 0);
+
+ return kring->nm_notify(kring, 0);
}
@@ -3067,17 +3225,17 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
* Default functions to handle rx/tx interrupts from a physical device.
* "work_done" is non-null on the RX path, NULL for the TX path.
*
- * If the card is not in netmap mode, simply return 0,
+ * If the card is not in netmap mode, simply return NM_IRQ_PASS,
* so that the caller proceeds with regular processing.
- * Otherwise call netmap_common_irq() and return 1.
+ * Otherwise call netmap_common_irq().
*
* If the card is connected to a netmap file descriptor,
* do a selwakeup on the individual queue, plus one on the global one
* if needed (multiqueue card _and_ there are multiqueue listeners),
- * and return 1.
 + * and return NM_IRQ_COMPLETED.
*
* Finally, if called on rx from an interface connected to a switch,
- * calls the proper forwarding routine, and return 1.
+ * calls the proper forwarding routine.
*/
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
@@ -3091,15 +3249,14 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
* nm_native_on() here.
*/
if (!nm_netmap_on(na))
- return 0;
+ return NM_IRQ_PASS;
if (na->na_flags & NAF_SKIP_INTR) {
ND("use regular interrupt");
- return 0;
+ return NM_IRQ_PASS;
}
- netmap_common_irq(ifp, q, work_done);
- return 1;
+ return netmap_common_irq(na, q, work_done);
}
@@ -3120,9 +3277,11 @@ extern struct cdevsw netmap_cdevsw;
void
netmap_fini(void)
{
- netmap_uninit_bridges();
if (netmap_dev)
destroy_dev(netmap_dev);
+ /* we assume that there are no longer netmap users */
+ nm_os_ifnet_fini();
+ netmap_uninit_bridges();
netmap_mem_fini();
NMG_LOCK_DESTROY();
printf("netmap: unloaded module.\n");
@@ -3155,9 +3314,13 @@ netmap_init(void)
goto fail;
#ifdef __FreeBSD__
- nm_vi_init_index();
+ nm_os_vi_init_index();
#endif
+ error = nm_os_ifnet_init();
+ if (error)
+ goto fail;
+
printf("netmap: loaded module\n");
return (0);
fail:
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index 8490ae85670b..d83f21e255ec 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -27,14 +27,15 @@
#include "opt_inet.h"
#include "opt_inet6.h"
-#include <sys/types.h>
+#include <sys/param.h>
#include <sys/module.h>
#include <sys/errno.h>
-#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/jail.h>
#include <sys/poll.h> /* POLLIN, POLLOUT */
#include <sys/kernel.h> /* types used in module initialization */
-#include <sys/conf.h> /* DEV_MODULE */
+#include <sys/conf.h> /* DEV_MODULE_ORDERED */
#include <sys/endian.h>
+#include <sys/syscallsubr.h> /* kern_ioctl() */
#include <sys/rwlock.h>
@@ -50,6 +51,11 @@
#include <sys/malloc.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
+#include <sys/kthread.h> /* kthread_add() */
+#include <sys/proc.h> /* PROC_LOCK() */
+#include <sys/unistd.h> /* RFNOWAIT */
+#include <sys/sched.h> /* sched_bind() */
+#include <sys/smp.h> /* mp_maxid */
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h> /* IFT_ETHER */
@@ -61,13 +67,94 @@
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
+#include <net/netmap_virt.h>
#include <dev/netmap/netmap_mem2.h>
/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
+void nm_os_selinfo_init(NM_SELINFO_T *si) {
+ struct mtx *m = &si->m;
+ mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);
+ knlist_init_mtx(&si->si.si_note, m);
+}
+
+void
+nm_os_selinfo_uninit(NM_SELINFO_T *si)
+{
+ /* XXX kqueue(9) needed; these will mirror knlist_init. */
+ knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
+ knlist_destroy(&si->si.si_note);
+ /* now we don't need the mutex anymore */
+ mtx_destroy(&si->m);
+}
+
+void
+nm_os_ifnet_lock(void)
+{
+ IFNET_WLOCK();
+}
+
+void
+nm_os_ifnet_unlock(void)
+{
+ IFNET_WUNLOCK();
+}
+
+static int netmap_use_count = 0;
+
+void
+nm_os_get_module(void)
+{
+ netmap_use_count++;
+}
+
+void
+nm_os_put_module(void)
+{
+ netmap_use_count--;
+}
+
+static void
+netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp)
+{
+ netmap_undo_zombie(ifp);
+}
+
+static void
+netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp)
+{
+ netmap_make_zombie(ifp);
+}
+
+static eventhandler_tag nm_ifnet_ah_tag;
+static eventhandler_tag nm_ifnet_dh_tag;
+
+int
+nm_os_ifnet_init(void)
+{
+ nm_ifnet_ah_tag =
+ EVENTHANDLER_REGISTER(ifnet_arrival_event,
+ netmap_ifnet_arrival_handler,
+ NULL, EVENTHANDLER_PRI_ANY);
+ nm_ifnet_dh_tag =
+ EVENTHANDLER_REGISTER(ifnet_departure_event,
+ netmap_ifnet_departure_handler,
+ NULL, EVENTHANDLER_PRI_ANY);
+ return 0;
+}
+
+void
+nm_os_ifnet_fini(void)
+{
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
+ nm_ifnet_ah_tag);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+ nm_ifnet_dh_tag);
+}
+
rawsum_t
-nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
+nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
{
/* TODO XXX please use the FreeBSD implementation for this. */
uint16_t *words = (uint16_t *)data;
@@ -87,7 +174,7 @@ nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
* return value is in network byte order.
*/
uint16_t
-nm_csum_fold(rawsum_t cur_sum)
+nm_os_csum_fold(rawsum_t cur_sum)
{
/* TODO XXX please use the FreeBSD implementation for this. */
while (cur_sum >> 16)
@@ -96,17 +183,17 @@ nm_csum_fold(rawsum_t cur_sum)
return htobe16((~cur_sum) & 0xFFFF);
}
-uint16_t nm_csum_ipv4(struct nm_iphdr *iph)
+uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph)
{
#if 0
return in_cksum_hdr((void *)iph);
#else
- return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0));
+ return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0));
#endif
}
void
-nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET
@@ -118,7 +205,7 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
/* Compute the checksum on TCP/UDP header + payload
* (includes the pseudo-header).
*/
- *check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
@@ -129,12 +216,12 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
}
void
-nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET6
*check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
- *check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
@@ -144,13 +231,41 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
#endif
}
+/* on FreeBSD we send up one packet at a time */
+void *
+nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev)
+{
+
+ NA(ifp)->if_input(ifp, m);
+ return NULL;
+}
+
+int
+nm_os_mbuf_has_offld(struct mbuf *m)
+{
+ return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP |
+ CSUM_TCP_IPV6 | CSUM_UDP_IPV6 |
+ CSUM_SCTP_IPV6 | CSUM_TSO);
+}
+
+static void
+freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
+{
+ struct netmap_generic_adapter *gna =
+ (struct netmap_generic_adapter *)NA(ifp);
+ int stolen = generic_rx_handler(ifp, m);
+
+ if (!stolen) {
+ gna->save_if_input(ifp, m);
+ }
+}
/*
* Intercept the rx routine in the standard device driver.
* Second argument is non-zero to intercept, 0 to restore
*/
int
-netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept)
+nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = na->ifp;
@@ -161,7 +276,7 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept)
return EINVAL; /* already set */
}
gna->save_if_input = ifp->if_input;
- ifp->if_input = generic_rx_handler;
+ ifp->if_input = freebsd_generic_rx_handler;
} else {
if (!gna->save_if_input){
D("cannot restore");
@@ -181,18 +296,20 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept)
* Second argument is non-zero to intercept, 0 to restore.
* On freebsd we just intercept if_transmit.
*/
-void
-netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
+int
+nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = netmap_generic_getifp(gna);
- if (enable) {
+ if (intercept) {
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
} else {
ifp->if_transmit = na->if_transmit;
}
+
+ return 0;
}
@@ -213,40 +330,44 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
*
*/
int
-generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
- void *addr, u_int len, u_int ring_nr)
+nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
{
int ret;
+ u_int len = a->len;
+ struct ifnet *ifp = a->ifp;
+ struct mbuf *m = a->m;
+#if __FreeBSD_version < 1100000
/*
- * The mbuf should be a cluster from our special pool,
- * so we do not need to do an m_copyback but just copy
- * (and eventually, just reference the netmap buffer)
+ * Old FreeBSD versions. The mbuf has a cluster attached,
+ * we need to copy from the cluster to the netmap buffer.
*/
-
- if (GET_MBUF_REFCNT(m) != 1) {
- D("invalid refcnt %d for %p",
- GET_MBUF_REFCNT(m), m);
+ if (MBUF_REFCNT(m) != 1) {
+ D("invalid refcnt %d for %p", MBUF_REFCNT(m), m);
panic("in generic_xmit_frame");
}
- // XXX the ext_size check is unnecessary if we link the netmap buf
if (m->m_ext.ext_size < len) {
RD(5, "size %d < len %d", m->m_ext.ext_size, len);
len = m->m_ext.ext_size;
}
- if (0) { /* XXX seems to have negligible benefits */
- m->m_ext.ext_buf = m->m_data = addr;
- } else {
- bcopy(addr, m->m_data, len);
- }
+ bcopy(a->addr, m->m_data, len);
+#else /* __FreeBSD_version >= 1100000 */
+ /* New FreeBSD versions. Link the external storage to
+ * the netmap buffer, so that no copy is necessary. */
+ m->m_ext.ext_buf = m->m_data = a->addr;
+ m->m_ext.ext_size = len;
+#endif /* __FreeBSD_version >= 1100000 */
+
m->m_len = m->m_pkthdr.len = len;
- // inc refcount. All ours, we could skip the atomic
- atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1);
+
+ /* mbuf refcnt is not contended, no need to use atomic
+ * (a memory barrier is enough). */
+ SET_MBUF_REFCNT(m, 2);
M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
- m->m_pkthdr.flowid = ring_nr;
+ m->m_pkthdr.flowid = a->ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
ret = NA(ifp)->if_transmit(ifp, m);
- return ret;
+ return ret ? -1 : 0;
}
@@ -263,7 +384,7 @@ netmap_getna(if_t ifp)
* way to extract the info from the ifp
*/
int
-generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
+nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
{
D("called, in tx %d rx %d", *tx, *rx);
return 0;
@@ -271,16 +392,23 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
void
-generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
+nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
D("called, in txq %d rxq %d", *txq, *rxq);
*txq = netmap_generic_rings;
*rxq = netmap_generic_rings;
}
+void
+nm_os_generic_set_features(struct netmap_generic_adapter *gna)
+{
+
+ gna->rxsg = 1; /* Supported through m_copydata. */
+ gna->txqdisc = 0; /* Not supported. */
+}
void
-netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
+nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
{
ND("called");
mit->mit_pending = 0;
@@ -290,21 +418,21 @@ netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapte
void
-netmap_mitigation_start(struct nm_generic_mit *mit)
+nm_os_mitigation_start(struct nm_generic_mit *mit)
{
ND("called");
}
void
-netmap_mitigation_restart(struct nm_generic_mit *mit)
+nm_os_mitigation_restart(struct nm_generic_mit *mit)
{
ND("called");
}
int
-netmap_mitigation_active(struct nm_generic_mit *mit)
+nm_os_mitigation_active(struct nm_generic_mit *mit)
{
ND("called");
return 0;
@@ -312,7 +440,7 @@ netmap_mitigation_active(struct nm_generic_mit *mit)
void
-netmap_mitigation_cleanup(struct nm_generic_mit *mit)
+nm_os_mitigation_cleanup(struct nm_generic_mit *mit)
{
ND("called");
}
@@ -342,7 +470,7 @@ static struct {
} nm_vi_indices;
void
-nm_vi_init_index(void)
+nm_os_vi_init_index(void)
{
int i;
for (i = 0; i < NM_VI_MAX; i++)
@@ -398,7 +526,7 @@ nm_vi_free_index(uint8_t val)
* increment this refcount on if_attach().
*/
int
-nm_vi_persist(const char *name, struct ifnet **ret)
+nm_os_vi_persist(const char *name, struct ifnet **ret)
{
struct ifnet *ifp;
u_short macaddr_hi;
@@ -438,15 +566,221 @@ nm_vi_persist(const char *name, struct ifnet **ret)
*ret = ifp;
return 0;
}
+
/* unregister from the system and drop the final refcount */
void
-nm_vi_detach(struct ifnet *ifp)
+nm_os_vi_detach(struct ifnet *ifp)
{
nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]);
ether_ifdetach(ifp);
if_free(ifp);
}
+/* ======================== PTNETMAP SUPPORT ========================== */
+
+#ifdef WITH_PTNETMAP_GUEST
+#include <sys/bus.h>
+#include <sys/rman.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <machine/resource.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+/*
+ * ptnetmap memory device (memdev) for freebsd guest,
+ * used to expose host netmap memory to the guest through a PCI BAR.
+ */
+
+/*
+ * ptnetmap memdev private data structure
+ */
+struct ptnetmap_memdev {
+ device_t dev;
+ struct resource *pci_io;
+ struct resource *pci_mem;
+ struct netmap_mem_d *nm_mem;
+};
+
+static int ptn_memdev_probe(device_t);
+static int ptn_memdev_attach(device_t);
+static int ptn_memdev_detach(device_t);
+static int ptn_memdev_shutdown(device_t);
+
+static device_method_t ptn_memdev_methods[] = {
+ DEVMETHOD(device_probe, ptn_memdev_probe),
+ DEVMETHOD(device_attach, ptn_memdev_attach),
+ DEVMETHOD(device_detach, ptn_memdev_detach),
+ DEVMETHOD(device_shutdown, ptn_memdev_shutdown),
+ DEVMETHOD_END
+};
+
+static driver_t ptn_memdev_driver = {
+ PTNETMAP_MEMDEV_NAME,
+ ptn_memdev_methods,
+ sizeof(struct ptnetmap_memdev),
+};
+
+/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation
+ * below. */
+static devclass_t ptnetmap_devclass;
+DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass,
+ NULL, NULL, SI_ORDER_MIDDLE + 1);
+
+/*
+ * I/O port read/write wrappers.
+ * Some are not used, so we keep them commented out until needed
+ */
+#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg))
+#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg))
+#if 0
+#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg))
+#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val))
+#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val))
+#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val))
+#endif /* unused */
+
+/*
+ * Map host netmap memory through PCI-BAR in the guest OS,
+ * returning physical (nm_paddr) and virtual (nm_addr) addresses
+ * of the netmap memory mapped in the guest.
+ */
+int
+nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr,
+ void **nm_addr)
+{
+ uint32_t mem_size;
+ int rid;
+
+ D("ptn_memdev_driver iomap");
+
+ rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR);
+ mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE);
+
+ /* map memory allocator */
+ ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY,
+ &rid, 0, ~0, mem_size, RF_ACTIVE);
+ if (ptn_dev->pci_mem == NULL) {
+ *nm_paddr = 0;
+ *nm_addr = 0;
+ return ENOMEM;
+ }
+
+ *nm_paddr = rman_get_start(ptn_dev->pci_mem);
+ *nm_addr = rman_get_virtual(ptn_dev->pci_mem);
+
+ D("=== BAR %d start %lx len %lx mem_size %x ===",
+ PTNETMAP_MEM_PCI_BAR,
+ (unsigned long)(*nm_paddr),
+ (unsigned long)rman_get_size(ptn_dev->pci_mem),
+ mem_size);
+ return (0);
+}
+
+/* Unmap host netmap memory. */
+void
+nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev)
+{
+ D("ptn_memdev_driver iounmap");
+
+ if (ptn_dev->pci_mem) {
+ bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY,
+ PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem);
+ ptn_dev->pci_mem = NULL;
+ }
+}
+
+/* Device identification routine, return BUS_PROBE_DEFAULT on success,
+ * positive on failure */
+static int
+ptn_memdev_probe(device_t dev)
+{
+ char desc[256];
+
+ if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID)
+ return (ENXIO);
+ if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID)
+ return (ENXIO);
+
+ snprintf(desc, sizeof(desc), "%s PCI adapter",
+ PTNETMAP_MEMDEV_NAME);
+ device_set_desc_copy(dev, desc);
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+/* Device initialization routine. */
+static int
+ptn_memdev_attach(device_t dev)
+{
+ struct ptnetmap_memdev *ptn_dev;
+ int rid;
+ uint16_t mem_id;
+
+ D("ptn_memdev_driver attach");
+
+ ptn_dev = device_get_softc(dev);
+ ptn_dev->dev = dev;
+
+ pci_enable_busmaster(dev);
+
+ rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR);
+ ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
+ RF_ACTIVE);
+ if (ptn_dev->pci_io == NULL) {
+ device_printf(dev, "cannot map I/O space\n");
+ return (ENXIO);
+ }
+
+ mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID);
+
+ /* create guest allocator */
+ ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id);
+ if (ptn_dev->nm_mem == NULL) {
+ ptn_memdev_detach(dev);
+ return (ENOMEM);
+ }
+ netmap_mem_get(ptn_dev->nm_mem);
+
+ D("ptn_memdev_driver probe OK - host_id: %d", mem_id);
+
+ return (0);
+}
+
+/* Device removal routine. */
+static int
+ptn_memdev_detach(device_t dev)
+{
+ struct ptnetmap_memdev *ptn_dev;
+
+ D("ptn_memdev_driver detach");
+ ptn_dev = device_get_softc(dev);
+
+ if (ptn_dev->nm_mem) {
+ netmap_mem_put(ptn_dev->nm_mem);
+ ptn_dev->nm_mem = NULL;
+ }
+ if (ptn_dev->pci_mem) {
+ bus_release_resource(dev, SYS_RES_MEMORY,
+ PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem);
+ ptn_dev->pci_mem = NULL;
+ }
+ if (ptn_dev->pci_io) {
+ bus_release_resource(dev, SYS_RES_IOPORT,
+ PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io);
+ ptn_dev->pci_io = NULL;
+ }
+
+ return (0);
+}
+
+static int
+ptn_memdev_shutdown(device_t dev)
+{
+ D("ptn_memdev_driver shutdown");
+ return bus_generic_shutdown(dev);
+}
+
+#endif /* WITH_PTNETMAP_GUEST */
+
/*
* In order to track whether pages are still mapped, we hook into
* the standard cdev_pager and intercept the constructor and
@@ -606,7 +940,7 @@ err_unlock:
* the device (/dev/netmap) so we cannot do anything useful.
* To track close() on individual file descriptors we pass netmap_dtor() to
* devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor
- * when the last fd pointing to the device is closed.
+ * when the last fd pointing to the device is closed.
*
* Note that FreeBSD does not even munmap() on close() so we also have
* to track mmap() ourselves, and postpone the call to
@@ -634,26 +968,275 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
(void)devtype;
(void)td;
- priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
- M_NOWAIT | M_ZERO);
- if (priv == NULL)
- return ENOMEM;
- priv->np_refs = 1;
+ NMG_LOCK();
+ priv = netmap_priv_new();
+ if (priv == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
error = devfs_set_cdevpriv(priv, netmap_dtor);
if (error) {
- free(priv, M_DEVBUF);
- } else {
- NMG_LOCK();
- netmap_use_count++;
- NMG_UNLOCK();
+ netmap_priv_delete(priv);
}
+out:
+ NMG_UNLOCK();
return error;
}
+/******************** kthread wrapper ****************/
+#include <sys/sysproto.h>
+u_int
+nm_os_ncpus(void)
+{
+ return mp_maxid + 1;
+}
+
+struct nm_kthread_ctx {
+	struct thread *user_td;		/* user-space thread (the kthread creator), used to send ioctls */
+ /* notification to guest (interrupt) */
+ int irq_fd; /* ioctl fd */
+ struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */
+
+ /* notification from guest */
+ void *ioevent_file; /* tsleep() argument */
+
+ /* worker function and parameter */
+ nm_kthread_worker_fn_t worker_fn;
+ void *worker_private;
+
+ struct nm_kthread *nmk;
+
+ /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */
+ long type;
+};
+
+struct nm_kthread {
+ struct thread *worker;
+ struct mtx worker_lock;
+ uint64_t scheduled; /* pending wake_up request */
+ struct nm_kthread_ctx worker_ctx;
+ int run; /* used to stop kthread */
+ int attach_user; /* kthread attached to user_process */
+ int affinity;
+};
+
+void inline
+nm_os_kthread_wakeup_worker(struct nm_kthread *nmk)
+{
+ /*
+	 * There may be a race between the FE and BE,
+	 * which both call this function, and the worker
+	 * kthread, which reads nmk->scheduled.
+	 *
+	 * The counter value itself is not important;
+	 * what matters is that it has changed since the
+	 * last time the kthread saw it.
+ */
+ mtx_lock(&nmk->worker_lock);
+ nmk->scheduled++;
+ if (nmk->worker_ctx.ioevent_file) {
+ wakeup(nmk->worker_ctx.ioevent_file);
+ }
+ mtx_unlock(&nmk->worker_lock);
+}
+
+void inline
+nm_os_kthread_send_irq(struct nm_kthread *nmk)
+{
+ struct nm_kthread_ctx *ctx = &nmk->worker_ctx;
+ int err;
+
+ if (ctx->user_td && ctx->irq_fd > 0) {
+ err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix);
+ if (err) {
+ D("kern_ioctl error: %d ioctl parameters: fd %d com %ju data %p",
+ err, ctx->irq_fd, (uintmax_t)ctx->irq_ioctl.com, &ctx->irq_ioctl.data);
+ }
+ }
+}
+
+static void
+nm_kthread_worker(void *data)
+{
+ struct nm_kthread *nmk = data;
+ struct nm_kthread_ctx *ctx = &nmk->worker_ctx;
+ uint64_t old_scheduled = nmk->scheduled;
+
+ if (nmk->affinity >= 0) {
+ thread_lock(curthread);
+ sched_bind(curthread, nmk->affinity);
+ thread_unlock(curthread);
+ }
+
+ while (nmk->run) {
+ /*
+ * check if the parent process dies
+ * (when kthread is attached to user process)
+ */
+ if (ctx->user_td) {
+ PROC_LOCK(curproc);
+ thread_suspend_check(0);
+ PROC_UNLOCK(curproc);
+ } else {
+ kthread_suspend_check();
+ }
+
+ /*
+ * if ioevent_file is not defined, we don't have notification
+ * mechanism and we continually execute worker_fn()
+ */
+ if (!ctx->ioevent_file) {
+ ctx->worker_fn(ctx->worker_private); /* worker body */
+ } else {
+ /* checks if there is a pending notification */
+ mtx_lock(&nmk->worker_lock);
+ if (likely(nmk->scheduled != old_scheduled)) {
+ old_scheduled = nmk->scheduled;
+ mtx_unlock(&nmk->worker_lock);
+
+ ctx->worker_fn(ctx->worker_private); /* worker body */
+
+ continue;
+ } else if (nmk->run) {
+ /* wait on event with one second timeout */
+ msleep_spin(ctx->ioevent_file, &nmk->worker_lock,
+ "nmk_ev", hz);
+ nmk->scheduled++;
+ }
+ mtx_unlock(&nmk->worker_lock);
+ }
+ }
+
+ kthread_exit();
+}
+
+static int
+nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg)
+{
+ /* send irq through ioctl to bhyve (vmm.ko) */
+ if (cfg->event.irqfd) {
+ nmk->worker_ctx.irq_fd = cfg->event.irqfd;
+ nmk->worker_ctx.irq_ioctl = cfg->event.ioctl;
+ }
+	/* ring.ioeventfd contains the channel on which to tsleep() to wait for events */
+ if (cfg->event.ioeventfd) {
+ nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd;
+ }
+
+ return 0;
+}
+
+static void
+nm_kthread_close_files(struct nm_kthread *nmk)
+{
+ nmk->worker_ctx.irq_fd = 0;
+ nmk->worker_ctx.ioevent_file = NULL;
+}
+
+void
+nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity)
+{
+ nmk->affinity = affinity;
+}
+
+struct nm_kthread *
+nm_os_kthread_create(struct nm_kthread_cfg *cfg)
+{
+ struct nm_kthread *nmk = NULL;
+ int error;
+
+ nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!nmk)
+ return NULL;
+
+ mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN);
+ nmk->worker_ctx.worker_fn = cfg->worker_fn;
+ nmk->worker_ctx.worker_private = cfg->worker_private;
+ nmk->worker_ctx.type = cfg->type;
+ nmk->affinity = -1;
+
+ /* attach kthread to user process (ptnetmap) */
+ nmk->attach_user = cfg->attach_user;
+
+ /* open event fd */
+ error = nm_kthread_open_files(nmk, cfg);
+ if (error)
+ goto err;
+
+ return nmk;
+err:
+ free(nmk, M_DEVBUF);
+ return NULL;
+}
+
+int
+nm_os_kthread_start(struct nm_kthread *nmk)
+{
+ struct proc *p = NULL;
+ int error = 0;
+
+ if (nmk->worker) {
+ return EBUSY;
+ }
+
+ /* check if we want to attach kthread to user process */
+ if (nmk->attach_user) {
+ nmk->worker_ctx.user_td = curthread;
+ p = curthread->td_proc;
+ }
+
+ /* enable kthread main loop */
+ nmk->run = 1;
+ /* create kthread */
+ if((error = kthread_add(nm_kthread_worker, nmk, p,
+ &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld",
+ nmk->worker_ctx.type))) {
+ goto err;
+ }
+
+ D("nm_kthread started td 0x%p", nmk->worker);
+
+ return 0;
+err:
+ D("nm_kthread start failed err %d", error);
+ nmk->worker = NULL;
+ return error;
+}
+
+void
+nm_os_kthread_stop(struct nm_kthread *nmk)
+{
+ if (!nmk->worker) {
+ return;
+ }
+	/* tell the kthread to exit from its main loop */
+ nmk->run = 0;
+
+ /* wake up kthread if it sleeps */
+ kthread_resume(nmk->worker);
+ nm_os_kthread_wakeup_worker(nmk);
+
+ nmk->worker = NULL;
+}
+
+void
+nm_os_kthread_delete(struct nm_kthread *nmk)
+{
+ if (!nmk)
+ return;
+ if (nmk->worker) {
+ nm_os_kthread_stop(nmk);
+ }
+
+ nm_kthread_close_files(nmk);
+
+ free(nmk, M_DEVBUF);
+}
+
/******************** kqueue support ****************/
/*
- * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED.
+ * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED.
* We use a non-zero argument to distinguish the call from the one
* in kevent_scan() which instead also needs to run netmap_poll().
* The knote uses a global mutex for the time being. We might
@@ -672,17 +1255,23 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
void
-freebsd_selwakeup(struct nm_selinfo *si, int pri)
+nm_os_selwakeup(struct nm_selinfo *si)
{
if (netmap_verbose)
D("on knote %p", &si->si.si_note);
- selwakeuppri(&si->si, pri);
+ selwakeuppri(&si->si, PI_NET);
/* use a non-zero hint to tell the notification from the
* call done in kqueue_scan() which uses 0
*/
KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */);
}
+void
+nm_os_selrecord(struct thread *td, struct nm_selinfo *si)
+{
+ selrecord(td, &si->si);
+}
+
static void
netmap_knrdetach(struct knote *kn)
{
@@ -728,7 +1317,7 @@ netmap_knrw(struct knote *kn, long hint, int events)
RD(5, "curthread changed %p %p", curthread, priv->np_td);
return 1;
} else {
- revents = netmap_poll((void *)priv, events, curthread);
+ revents = netmap_poll(priv, events, NULL);
return (events & revents) ? 1 : 0;
}
}
@@ -801,13 +1390,47 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn)
return 0;
}
+static int
+freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td)
+{
+ struct netmap_priv_d *priv;
+ if (devfs_get_cdevpriv((void **)&priv)) {
+ return POLLERR;
+ }
+ return netmap_poll(priv, events, td);
+}
+
+static int
+freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
+ int ffla __unused, struct thread *td)
+{
+ int error;
+ struct netmap_priv_d *priv;
+
+ CURVNET_SET(TD_TO_VNET(td));
+ error = devfs_get_cdevpriv((void **)&priv);
+ if (error) {
+ /* XXX ENOENT should be impossible, since the priv
+ * is now created in the open */
+ if (error == ENOENT)
+ error = ENXIO;
+ goto out;
+ }
+ error = netmap_ioctl(priv, cmd, data, td);
+out:
+ CURVNET_RESTORE();
+
+ return error;
+}
+
+extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */
struct cdevsw netmap_cdevsw = {
.d_version = D_VERSION,
.d_name = "netmap",
.d_open = netmap_open,
.d_mmap_single = netmap_mmap_single,
- .d_ioctl = netmap_ioctl,
- .d_poll = netmap_poll,
+ .d_ioctl = freebsd_netmap_ioctl,
+ .d_poll = freebsd_netmap_poll,
.d_kqfilter = netmap_kqfilter,
.d_close = netmap_close,
};
@@ -852,6 +1475,24 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg)
return (error);
}
-
+#ifdef DEV_MODULE_ORDERED
+/*
+ * The netmap module contains three drivers: (i) the netmap character device
+ * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI
+ * device driver. The attach() routines of both (ii) and (iii) need the
+ * lock of the global allocator, and such lock is initialized in netmap_init(),
+ * which is part of (i).
+ * Therefore, we make sure that (i) is loaded before (ii) and (iii), using
+ * the 'order' parameter of driver declaration macros. For (i), we specify
+ * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED
+ * macros for (ii) and (iii).
+ */
+DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE);
+#else /* !DEV_MODULE_ORDERED */
DEV_MODULE(netmap, netmap_loader, NULL);
+#endif /* DEV_MODULE_ORDERED */
+MODULE_DEPEND(netmap, pci, 1, 1, 1);
MODULE_VERSION(netmap, 1);
+/* reduce conditional code */
+// linux API, use for the knlist in FreeBSD
+/* use a private mutex for the knlist */
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 85a6a9f76ea2..87072069fbcf 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -1,5 +1,7 @@
/*
- * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2016 Vincenzo Maffione
+ * Copyright (C) 2013-2016 Luigi Rizzo
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -83,25 +85,25 @@ __FBSDID("$FreeBSD$");
#define rtnl_lock() ND("rtnl_lock called")
#define rtnl_unlock() ND("rtnl_unlock called")
-#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid)
#define smp_mb()
/*
* FreeBSD mbuf allocator/deallocator in emulation mode:
+ */
+#if __FreeBSD_version < 1100000
+
+/*
+ * For older versions of FreeBSD:
*
* We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE
* so that the destructor, if invoked, will not free the packet.
- * In principle we should set the destructor only on demand,
+ * In principle we should set the destructor only on demand,
* but since there might be a race we better do it on allocation.
* As a consequence, we also need to set the destructor or we
* would leak buffers.
*/
-/*
- * mbuf wrappers
- */
-
/* mbuf destructor, also need to change the type to EXT_EXTREF,
* add an M_NOFREE flag, and then clear the flag and
* chain into uma_zfree(zone_pack, mf)
@@ -112,35 +114,93 @@ __FBSDID("$FreeBSD$");
(m)->m_ext.ext_type = EXT_EXTREF; \
} while (0)
-static void
-netmap_default_mbuf_destructor(struct mbuf *m)
+static int
+void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2)
{
/* restore original mbuf */
m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1;
m->m_ext.ext_arg1 = NULL;
m->m_ext.ext_type = EXT_PACKET;
m->m_ext.ext_free = NULL;
- if (GET_MBUF_REFCNT(m) == 0)
+ if (MBUF_REFCNT(m) == 0)
SET_MBUF_REFCNT(m, 1);
uma_zfree(zone_pack, m);
+
+ return 0;
}
static inline struct mbuf *
-netmap_get_mbuf(int len)
+nm_os_get_mbuf(struct ifnet *ifp, int len)
{
struct mbuf *m;
+
+ (void)ifp;
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m) {
- m->m_flags |= M_NOFREE; /* XXXNP: Almost certainly incorrect. */
+ /* m_getcl() (mb_ctor_mbuf) has an assert that checks that
+ * M_NOFREE flag is not specified as third argument,
+ * so we have to set M_NOFREE after m_getcl(). */
+ m->m_flags |= M_NOFREE;
m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save
- m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor;
+ m->m_ext.ext_free = (void *)void_mbuf_dtor;
m->m_ext.ext_type = EXT_EXTREF;
- ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m));
+ ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m));
+ }
+ return m;
+}
+
+#else /* __FreeBSD_version >= 1100000 */
+
+/*
+ * Newer versions of FreeBSD, using a straightforward scheme.
+ *
+ * We allocate mbufs with m_gethdr(), since the mbuf header is needed
+ * by the driver. We also attach a customly-provided external storage,
+ * which in this case is a netmap buffer. When calling m_extadd(), however
+ * we pass a NULL address, since the real address (and length) will be
+ * filled in by nm_os_generic_xmit_frame() right before calling
+ * if_transmit().
+ *
+ * The dtor function does nothing, however we need it since mb_free_ext()
+ * has a KASSERT(), checking that the mbuf dtor function is not NULL.
+ */
+
+#define SET_MBUF_DESTRUCTOR(m, fn) do { \
+ (m)->m_ext.ext_free = (void *)fn; \
+} while (0)
+
+static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { }
+
+static inline struct mbuf *
+nm_os_get_mbuf(struct ifnet *ifp, int len)
+{
+ struct mbuf *m;
+
+ (void)ifp;
+ (void)len;
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ return m;
}
+
+ m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor,
+ NULL, NULL, 0, EXT_NET_DRV);
+
return m;
}
+#endif /* __FreeBSD_version >= 1100000 */
+
+#elif defined _WIN32
+
+#include "win_glue.h"
+#define rtnl_lock() ND("rtnl_lock called")
+#define rtnl_unlock() ND("rtnl_unlock called")
+#define MBUF_TXQ(m) 0//((m)->m_pkthdr.flowid)
+#define MBUF_RXQ(m) 0//((m)->m_pkthdr.flowid)
+#define smp_mb() //XXX: to be correctly defined
#else /* linux */
@@ -150,7 +210,12 @@ netmap_get_mbuf(int len)
#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */
#include <linux/hrtimer.h>
-//#define REG_RESET
+static inline struct mbuf *
+nm_os_get_mbuf(struct ifnet *ifp, int len)
+{
+ return alloc_skb(ifp->needed_headroom + len +
+ ifp->needed_tailroom, GFP_ATOMIC);
+}
#endif /* linux */
@@ -161,8 +226,21 @@ netmap_get_mbuf(int len)
#include <dev/netmap/netmap_mem2.h>
+#define for_each_kring_n(_i, _k, _karr, _n) \
+ for (_k=_karr, _i = 0; _i < _n; (_k)++, (_i)++)
-/* ======================== usage stats =========================== */
+#define for_each_tx_kring(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings)
+#define for_each_tx_kring_h(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1)
+
+#define for_each_rx_kring(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings)
+#define for_each_rx_kring_h(_i, _k, _na) \
+ for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1)
+
+
+/* ======================== PERFORMANCE STATISTICS =========================== */
#ifdef RATE_GENERIC
#define IFRATE(x) x
@@ -170,6 +248,8 @@ struct rate_stats {
unsigned long txpkt;
unsigned long txsync;
unsigned long txirq;
+ unsigned long txrepl;
+ unsigned long txdrop;
unsigned long rxpkt;
unsigned long rxirq;
unsigned long rxsync;
@@ -194,6 +274,8 @@ static void rate_callback(unsigned long arg)
RATE_PRINTK(txpkt);
RATE_PRINTK(txsync);
RATE_PRINTK(txirq);
+ RATE_PRINTK(txrepl);
+ RATE_PRINTK(txdrop);
RATE_PRINTK(rxpkt);
RATE_PRINTK(rxsync);
RATE_PRINTK(rxirq);
@@ -230,94 +312,222 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi)
* the poller threads. Differently from netmap_rx_irq(), we check
* only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq.
*/
-static void
-netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
+void
+netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
{
- struct netmap_adapter *na = NA(ifp);
if (unlikely(!nm_netmap_on(na)))
return;
- netmap_common_irq(ifp, q, work_done);
+ netmap_common_irq(na, q, work_done);
+#ifdef RATE_GENERIC
+ if (work_done)
+ rate_ctx.new.rxirq++;
+ else
+ rate_ctx.new.txirq++;
+#endif /* RATE_GENERIC */
}
+static int
+generic_netmap_unregister(struct netmap_adapter *na)
+{
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct netmap_kring *kring = NULL;
+ int i, r;
+
+ if (na->active_fds == 0) {
+ D("Generic adapter %p goes off", na);
+ rtnl_lock();
+
+ na->na_flags &= ~NAF_NETMAP_ON;
+
+ /* Release packet steering control. */
+ nm_os_catch_tx(gna, 0);
+
+ /* Stop intercepting packets on the RX path. */
+ nm_os_catch_rx(gna, 0);
+
+ rtnl_unlock();
+ }
+
+ for_each_rx_kring_h(r, kring, na) {
+ if (nm_kring_pending_off(kring)) {
+ D("RX ring %d of generic adapter %p goes off", r, na);
+ kring->nr_mode = NKR_NETMAP_OFF;
+ }
+ }
+ for_each_tx_kring_h(r, kring, na) {
+ if (nm_kring_pending_off(kring)) {
+ kring->nr_mode = NKR_NETMAP_OFF;
+ D("TX ring %d of generic adapter %p goes off", r, na);
+ }
+ }
+
+ for_each_rx_kring(r, kring, na) {
+ /* Free the mbufs still pending in the RX queues,
+ * that did not end up into the corresponding netmap
+ * RX rings. */
+ mbq_safe_purge(&kring->rx_queue);
+ nm_os_mitigation_cleanup(&gna->mit[r]);
+ }
+
+ /* Decrement reference counter for the mbufs in the
+ * TX pools. These mbufs can be still pending in drivers,
+ * (e.g. this happens with virtio-net driver, which
+ * does lazy reclaiming of transmitted mbufs). */
+ for_each_tx_kring(r, kring, na) {
+ /* We must remove the destructor on the TX event,
+ * because the destructor invokes netmap code, and
+ * the netmap module may disappear before the
+ * TX event is consumed. */
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event) {
+ SET_MBUF_DESTRUCTOR(kring->tx_event, NULL);
+ }
+ kring->tx_event = NULL;
+ mtx_unlock_spin(&kring->tx_event_lock);
+ }
+
+ if (na->active_fds == 0) {
+ free(gna->mit, M_DEVBUF);
+
+ for_each_rx_kring(r, kring, na) {
+ mbq_safe_fini(&kring->rx_queue);
+ }
+
+ for_each_tx_kring(r, kring, na) {
+ mtx_destroy(&kring->tx_event_lock);
+ if (kring->tx_pool == NULL) {
+ continue;
+ }
+
+ for (i=0; i<na->num_tx_desc; i++) {
+ if (kring->tx_pool[i]) {
+ m_freem(kring->tx_pool[i]);
+ }
+ }
+ free(kring->tx_pool, M_DEVBUF);
+ kring->tx_pool = NULL;
+ }
+
+#ifdef RATE_GENERIC
+ if (--rate_ctx.refcount == 0) {
+ D("del_timer()");
+ del_timer(&rate_ctx.timer);
+ }
+#endif
+ }
+
+ return 0;
+}
/* Enable/disable netmap mode for a generic network interface. */
static int
generic_netmap_register(struct netmap_adapter *na, int enable)
{
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
- struct mbuf *m;
+ struct netmap_kring *kring = NULL;
int error;
int i, r;
- if (!na)
+ if (!na) {
return EINVAL;
+ }
-#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_stop(ifp);
- if (error) {
- return error;
+ if (!enable) {
+ /* This is actually an unregif. */
+ return generic_netmap_unregister(na);
}
-#endif /* REG_RESET */
- if (enable) { /* Enable netmap mode. */
- /* Init the mitigation support on all the rx queues. */
+ if (na->active_fds == 0) {
+ D("Generic adapter %p goes on", na);
+ /* Do all memory allocations when (na->active_fds == 0), to
+ * simplify error management. */
+
+ /* Allocate memory for mitigation support on all the rx queues. */
gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit),
- M_DEVBUF, M_NOWAIT | M_ZERO);
+ M_DEVBUF, M_NOWAIT | M_ZERO);
if (!gna->mit) {
D("mitigation allocation failed");
error = ENOMEM;
goto out;
}
- for (r=0; r<na->num_rx_rings; r++)
- netmap_mitigation_init(&gna->mit[r], r, na);
- /* Initialize the rx queue, as generic_rx_handler() can
- * be called as soon as netmap_catch_rx() returns.
- */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_init(&na->rx_rings[r].rx_queue);
+ for_each_rx_kring(r, kring, na) {
+ /* Init mitigation support. */
+ nm_os_mitigation_init(&gna->mit[r], r, na);
+
+ /* Initialize the rx queue, as generic_rx_handler() can
+ * be called as soon as nm_os_catch_rx() returns.
+ */
+ mbq_safe_init(&kring->rx_queue);
}
/*
- * Preallocate packet buffers for the tx rings.
+ * Prepare mbuf pools (parallel to the tx rings), for packet
+ * transmission. Don't preallocate the mbufs here, it's simpler
+ * to leave this task to txsync.
*/
- for (r=0; r<na->num_tx_rings; r++)
- na->tx_rings[r].tx_pool = NULL;
- for (r=0; r<na->num_tx_rings; r++) {
- na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!na->tx_rings[r].tx_pool) {
+ for_each_tx_kring(r, kring, na) {
+ kring->tx_pool = NULL;
+ }
+ for_each_tx_kring(r, kring, na) {
+ kring->tx_pool =
+ malloc(na->num_tx_desc * sizeof(struct mbuf *),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!kring->tx_pool) {
D("tx_pool allocation failed");
error = ENOMEM;
goto free_tx_pools;
}
- for (i=0; i<na->num_tx_desc; i++)
- na->tx_rings[r].tx_pool[i] = NULL;
- for (i=0; i<na->num_tx_desc; i++) {
- m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
- if (!m) {
- D("tx_pool[%d] allocation failed", i);
- error = ENOMEM;
- goto free_tx_pools;
- }
- na->tx_rings[r].tx_pool[i] = m;
- }
+ mtx_init(&kring->tx_event_lock, "tx_event_lock",
+ NULL, MTX_SPIN);
}
+ }
+
+ for_each_rx_kring_h(r, kring, na) {
+ if (nm_kring_pending_on(kring)) {
+ D("RX ring %d of generic adapter %p goes on", r, na);
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+
+ }
+ for_each_tx_kring_h(r, kring, na) {
+ if (nm_kring_pending_on(kring)) {
+ D("TX ring %d of generic adapter %p goes on", r, na);
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+ }
+
+ for_each_tx_kring(r, kring, na) {
+ /* Initialize tx_pool and tx_event. */
+ for (i=0; i<na->num_tx_desc; i++) {
+ kring->tx_pool[i] = NULL;
+ }
+
+ kring->tx_event = NULL;
+ }
+
+ if (na->active_fds == 0) {
rtnl_lock();
+
/* Prepare to intercept incoming traffic. */
- error = netmap_catch_rx(gna, 1);
+ error = nm_os_catch_rx(gna, 1);
if (error) {
- D("netdev_rx_handler_register() failed (%d)", error);
+ D("nm_os_catch_rx(1) failed (%d)", error);
goto register_handler;
}
- na->na_flags |= NAF_NETMAP_ON;
/* Make netmap control the packet steering. */
- netmap_catch_tx(gna, 1);
+ error = nm_os_catch_tx(gna, 1);
+ if (error) {
+ D("nm_os_catch_tx(1) failed (%d)", error);
+ goto catch_rx;
+ }
rtnl_unlock();
+ na->na_flags |= NAF_NETMAP_ON;
+
#ifdef RATE_GENERIC
if (rate_ctx.refcount == 0) {
D("setup_timer()");
@@ -329,73 +539,26 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
}
rate_ctx.refcount++;
#endif /* RATE */
-
- } else if (na->tx_rings[0].tx_pool) {
- /* Disable netmap mode. We enter here only if the previous
- generic_netmap_register(na, 1) was successful.
- If it was not, na->tx_rings[0].tx_pool was set to NULL by the
- error handling code below. */
- rtnl_lock();
-
- na->na_flags &= ~NAF_NETMAP_ON;
-
- /* Release packet steering control. */
- netmap_catch_tx(gna, 0);
-
- /* Do not intercept packets on the rx path. */
- netmap_catch_rx(gna, 0);
-
- rtnl_unlock();
-
- /* Free the mbufs going to the netmap rings */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_purge(&na->rx_rings[r].rx_queue);
- mbq_safe_destroy(&na->rx_rings[r].rx_queue);
- }
-
- for (r=0; r<na->num_rx_rings; r++)
- netmap_mitigation_cleanup(&gna->mit[r]);
- free(gna->mit, M_DEVBUF);
-
- for (r=0; r<na->num_tx_rings; r++) {
- for (i=0; i<na->num_tx_desc; i++) {
- m_freem(na->tx_rings[r].tx_pool[i]);
- }
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- }
-
-#ifdef RATE_GENERIC
- if (--rate_ctx.refcount == 0) {
- D("del_timer()");
- del_timer(&rate_ctx.timer);
- }
-#endif
}
-#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_open(ifp);
- if (error) {
- goto free_tx_pools;
- }
-#endif
-
return 0;
+ /* Here (na->active_fds == 0) holds. */
+catch_rx:
+ nm_os_catch_rx(gna, 0);
register_handler:
rtnl_unlock();
free_tx_pools:
- for (r=0; r<na->num_tx_rings; r++) {
- if (na->tx_rings[r].tx_pool == NULL)
+ for_each_tx_kring(r, kring, na) {
+ mtx_destroy(&kring->tx_event_lock);
+ if (kring->tx_pool == NULL) {
continue;
- for (i=0; i<na->num_tx_desc; i++)
- if (na->tx_rings[r].tx_pool[i])
- m_freem(na->tx_rings[r].tx_pool[i]);
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- na->tx_rings[r].tx_pool = NULL;
+ }
+ free(kring->tx_pool, M_DEVBUF);
+ kring->tx_pool = NULL;
}
- for (r=0; r<na->num_rx_rings; r++) {
- netmap_mitigation_cleanup(&gna->mit[r]);
- mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+ for_each_rx_kring(r, kring, na) {
+ mbq_safe_fini(&kring->rx_queue);
}
free(gna->mit, M_DEVBUF);
out:
@@ -411,24 +574,67 @@ out:
static void
generic_mbuf_destructor(struct mbuf *m)
{
- netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+ struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m));
+ struct netmap_kring *kring;
+ unsigned int r = MBUF_TXQ(m);
+ unsigned int r_orig = r;
+
+ if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) {
+ D("Error: no netmap adapter on device %p",
+ GEN_TX_MBUF_IFP(m));
+ return;
+ }
+
+ /*
+ * First, clear the event mbuf.
+ * In principle, the event 'm' should match the one stored
+	 * on ring 'r'. However we check it explicitly to stay
+ * safe against lower layers (qdisc, driver, etc.) changing
+ * MBUF_TXQ(m) under our feet. If the match is not found
+ * on 'r', we try to see if it belongs to some other ring.
+ */
+ for (;;) {
+ bool match = false;
+
+ kring = &na->tx_rings[r];
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event == m) {
+ kring->tx_event = NULL;
+ match = true;
+ }
+ mtx_unlock_spin(&kring->tx_event_lock);
+
+ if (match) {
+ if (r != r_orig) {
+ RD(1, "event %p migrated: ring %u --> %u",
+ m, r_orig, r);
+ }
+ break;
+ }
+
+ if (++r == na->num_tx_rings) r = 0;
+
+ if (r == r_orig) {
+ RD(1, "Cannot match event %p", m);
+ return;
+ }
+ }
+
+ /* Second, wake up clients. They will reclaim the event through
+ * txsync. */
+ netmap_generic_irq(na, r, NULL);
#ifdef __FreeBSD__
- if (netmap_verbose)
- RD(5, "Tx irq (%p) queue %d index %d" , m, MBUF_TXQ(m), (int)(uintptr_t)m->m_ext.ext_arg1);
- netmap_default_mbuf_destructor(m);
-#endif /* __FreeBSD__ */
- IFRATE(rate_ctx.new.txirq++);
+ void_mbuf_dtor(m, NULL, NULL);
+#endif
}
-extern int netmap_adaptive_io;
-
/* Record completed transmissions and update hwtail.
*
* The oldest tx buffer not yet completed is at nr_hwtail + 1,
* nr_hwcur is the first unsent buffer.
*/
static u_int
-generic_netmap_tx_clean(struct netmap_kring *kring)
+generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc)
{
u_int const lim = kring->nkr_num_slots - 1;
u_int nm_i = nm_next(kring->nr_hwtail, lim);
@@ -436,39 +642,52 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
u_int n = 0;
struct mbuf **tx_pool = kring->tx_pool;
+ ND("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail);
+
while (nm_i != hwcur) { /* buffers not completed */
struct mbuf *m = tx_pool[nm_i];
- if (unlikely(m == NULL)) {
- /* this is done, try to replenish the entry */
- tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na));
+ if (txqdisc) {
+ if (m == NULL) {
+ /* Nothing to do, this is going
+ * to be replenished. */
+ RD(3, "Is this happening?");
+
+ } else if (MBUF_QUEUED(m)) {
+ break; /* Not dequeued yet. */
+
+ } else if (MBUF_REFCNT(m) != 1) {
+ /* This mbuf has been dequeued but is still busy
+ * (refcount is 2).
+ * Leave it to the driver and replenish. */
+ m_freem(m);
+ tx_pool[nm_i] = NULL;
+ }
+
+ } else {
if (unlikely(m == NULL)) {
- D("mbuf allocation failed, XXX error");
- // XXX how do we proceed ? break ?
- return -ENOMEM;
+ int event_consumed;
+
+ /* This slot was used to place an event. */
+ mtx_lock_spin(&kring->tx_event_lock);
+ event_consumed = (kring->tx_event == NULL);
+ mtx_unlock_spin(&kring->tx_event_lock);
+ if (!event_consumed) {
+ /* The event has not been consumed yet,
+ * still busy in the driver. */
+ break;
+ }
+ /* The event has been consumed, we can go
+ * ahead. */
+
+ } else if (MBUF_REFCNT(m) != 1) {
+ /* This mbuf is still busy: its refcnt is 2. */
+ break;
}
- } else if (GET_MBUF_REFCNT(m) != 1) {
- break; /* This mbuf is still busy: its refcnt is 2. */
}
+
n++;
nm_i = nm_next(nm_i, lim);
-#if 0 /* rate adaptation */
- if (netmap_adaptive_io > 1) {
- if (n >= netmap_adaptive_io)
- break;
- } else if (netmap_adaptive_io) {
- /* if hwcur - nm_i < lim/8 do an early break
- * so we prevent the sender from stalling. See CVT.
- */
- if (hwcur >= nm_i) {
- if (hwcur - nm_i < lim/2)
- break;
- } else {
- if (hwcur + lim + 1 - nm_i < lim/2)
- break;
- }
- }
-#endif
}
kring->nr_hwtail = nm_prev(nm_i, lim);
ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
@@ -476,23 +695,17 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
return n;
}
-
-/*
- * We have pending packets in the driver between nr_hwtail +1 and hwcur.
- * Compute a position in the middle, to be used to generate
- * a notification.
- */
+/* Compute a slot index in the middle between inf and sup. */
static inline u_int
-generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
+ring_middle(u_int inf, u_int sup, u_int lim)
{
- u_int n = kring->nkr_num_slots;
- u_int ntc = nm_next(kring->nr_hwtail, n-1);
+ u_int n = lim + 1;
u_int e;
- if (hwcur >= ntc) {
- e = (hwcur + ntc) / 2;
+ if (sup >= inf) {
+ e = (sup + inf) / 2;
} else { /* wrap around */
- e = (hwcur + n + ntc) / 2;
+ e = (sup + n + inf) / 2;
if (e >= n) {
e -= n;
}
@@ -506,35 +719,59 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
return e;
}
-/*
- * We have pending packets in the driver between nr_hwtail+1 and hwcur.
- * Schedule a notification approximately in the middle of the two.
- * There is a race but this is only called within txsync which does
- * a double check.
- */
static void
generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
{
+ u_int lim = kring->nkr_num_slots - 1;
struct mbuf *m;
u_int e;
+ u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */
- if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) {
+ if (ntc == hwcur) {
return; /* all buffers are free */
}
- e = generic_tx_event_middle(kring, hwcur);
+
+ /*
+ * We have pending packets in the driver between hwtail+1
+	 * and hwcur, and we have to choose one of these slots to
+ * generate a notification.
+ * There is a race but this is only called within txsync which
+ * does a double check.
+ */
+#if 0
+ /* Choose a slot in the middle, so that we don't risk ending
+ * up in a situation where the client continuously wake up,
+ * fills one or a few TX slots and go to sleep again. */
+ e = ring_middle(ntc, hwcur, lim);
+#else
+ /* Choose the first pending slot, to be safe against driver
+ * reordering mbuf transmissions. */
+ e = ntc;
+#endif
m = kring->tx_pool[e];
- ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? GET_MBUF_REFCNT(m) : -2 );
if (m == NULL) {
- /* This can happen if there is already an event on the netmap
- slot 'e': There is nothing to do. */
+ /* An event is already in place. */
return;
}
- kring->tx_pool[e] = NULL;
+
+ mtx_lock_spin(&kring->tx_event_lock);
+ if (kring->tx_event) {
+ /* An event is already in place. */
+ mtx_unlock_spin(&kring->tx_event_lock);
+ return;
+ }
+
SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+ kring->tx_event = m;
+ mtx_unlock_spin(&kring->tx_event_lock);
- // XXX wmb() ?
- /* Decrement the refcount an free it if we have the last one. */
+ kring->tx_pool[e] = NULL;
+
+ ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 );
+
+ /* Decrement the refcount. This will free it if we lose the race
+ * with the driver. */
m_freem(m);
smp_mb();
}
@@ -551,6 +788,7 @@ static int
generic_netmap_txsync(struct netmap_kring *kring, int flags)
{
struct netmap_adapter *na = kring->na;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
struct ifnet *ifp = na->ifp;
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */ // j
@@ -560,8 +798,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
IFRATE(rate_ctx.new.txsync++);
- // TODO: handle the case of mbuf allocation failure
-
rmb();
/*
@@ -569,72 +805,121 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
+ struct nm_os_gen_arg a;
+ u_int event = -1;
+
+ if (gna->txqdisc && nm_kr_txempty(kring)) {
+ /* In txqdisc mode, we ask for a delayed notification,
+ * but only when cur == hwtail, which means that the
+ * client is going to block. */
+ event = ring_middle(nm_i, head, lim);
+ ND(3, "Place txqdisc event (hwcur=%u,event=%u,"
+ "head=%u,hwtail=%u)", nm_i, event, head,
+ kring->nr_hwtail);
+ }
+
+ a.ifp = ifp;
+ a.ring_nr = ring_nr;
+ a.head = a.tail = NULL;
+
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
void *addr = NMB(na, slot);
-
/* device-specific */
struct mbuf *m;
int tx_ret;
NM_CHECK_ADDR_LEN(na, addr, len);
- /* Tale a mbuf from the tx pool and copy in the user packet. */
+ /* Tale a mbuf from the tx pool (replenishing the pool
+ * entry if necessary) and copy in the user packet. */
m = kring->tx_pool[nm_i];
- if (unlikely(!m)) {
- RD(5, "This should never happen");
- kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
- if (unlikely(m == NULL)) {
- D("mbuf allocation failed");
+ if (unlikely(m == NULL)) {
+ kring->tx_pool[nm_i] = m =
+ nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na));
+ if (m == NULL) {
+ RD(2, "Failed to replenish mbuf");
+ /* Here we could schedule a timer which
+ * retries to replenish after a while,
+ * and notifies the client when it
+ * manages to replenish some slots. In
+ * any case we break early to avoid
+ * crashes. */
break;
}
+ IFRATE(rate_ctx.new.txrepl++);
}
- /* XXX we should ask notifications when NS_REPORT is set,
- * or roughly every half frame. We can optimize this
- * by lazily requesting notifications only when a
- * transmission fails. Probably the best way is to
- * break on failures and set notifications when
- * ring->cur == ring->tail || nm_i != cur
+
+ a.m = m;
+ a.addr = addr;
+ a.len = len;
+ a.qevent = (nm_i == event);
+ /* When not in txqdisc mode, we should ask
+ * notifications when NS_REPORT is set, or roughly
+ * every half ring. To optimize this, we set a
+ * notification event when the client runs out of
+ * TX ring space, or when transmission fails. In
+ * the latter case we also break early.
*/
- tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+ tx_ret = nm_os_generic_xmit_frame(&a);
if (unlikely(tx_ret)) {
- ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
- tx_ret, nm_i, head, kring->nr_hwtail);
- /*
- * No room for this mbuf in the device driver.
- * Request a notification FOR A PREVIOUS MBUF,
- * then call generic_netmap_tx_clean(kring) to do the
- * double check and see if we can free more buffers.
- * If there is space continue, else break;
- * NOTE: the double check is necessary if the problem
- * occurs in the txsync call after selrecord().
- * Also, we need some way to tell the caller that not
- * all buffers were queued onto the device (this was
- * not a problem with native netmap driver where space
- * is preallocated). The bridge has a similar problem
- * and we solve it there by dropping the excess packets.
- */
- generic_set_tx_event(kring, nm_i);
- if (generic_netmap_tx_clean(kring)) { /* space now available */
- continue;
- } else {
- break;
+ if (!gna->txqdisc) {
+ /*
+ * No room for this mbuf in the device driver.
+ * Request a notification FOR A PREVIOUS MBUF,
+ * then call generic_netmap_tx_clean(kring) to do the
+ * double check and see if we can free more buffers.
+ * If there is space continue, else break;
+ * NOTE: the double check is necessary if the problem
+ * occurs in the txsync call after selrecord().
+ * Also, we need some way to tell the caller that not
+ * all buffers were queued onto the device (this was
+ * not a problem with native netmap driver where space
+ * is preallocated). The bridge has a similar problem
+ * and we solve it there by dropping the excess packets.
+ */
+ generic_set_tx_event(kring, nm_i);
+ if (generic_netmap_tx_clean(kring, gna->txqdisc)) {
+ /* space now available */
+ continue;
+ } else {
+ break;
+ }
}
+
+ /* In txqdisc mode, the netmap-aware qdisc
+ * queue has the same length as the number of
+ * netmap slots (N). Since tail is advanced
+ * only when packets are dequeued, qdisc
+ * queue overrun cannot happen, so
+ * nm_os_generic_xmit_frame() did not fail
+ * because of that.
+ * However, packets can be dropped because
+ * carrier is off, or because our qdisc is
+ * being deactivated, or possibly for other
+ * reasons. In these cases, we just let the
+ * packet to be dropped. */
+ IFRATE(rate_ctx.new.txdrop++);
}
+
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
nm_i = nm_next(nm_i, lim);
- IFRATE(rate_ctx.new.txpkt ++);
+ IFRATE(rate_ctx.new.txpkt++);
}
-
- /* Update hwcur to the next slot to transmit. */
- kring->nr_hwcur = nm_i; /* not head, we could break early */
+ if (a.head != NULL) {
+ a.addr = NULL;
+ nm_os_generic_xmit_frame(&a);
+ }
+ /* Update hwcur to the next slot to transmit. Here nm_i
+ * is not necessarily head, we could break early. */
+ kring->nr_hwcur = nm_i;
}
/*
* Second, reclaim completed buffers
*/
- if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+ if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) {
/* No more available slots? Set a notification event
* on a netmap slot that will be cleaned in the future.
* No doublecheck is performed, since txsync() will be
@@ -642,58 +927,74 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
*/
generic_set_tx_event(kring, nm_i);
}
- ND("tx #%d, hwtail = %d", n, kring->nr_hwtail);
- generic_netmap_tx_clean(kring);
+ generic_netmap_tx_clean(kring, gna->txqdisc);
return 0;
}
/*
- * This handler is registered (through netmap_catch_rx())
+ * This handler is registered (through nm_os_catch_rx())
* within the attached network interface
* in the RX subsystem, so that every mbuf passed up by
* the driver can be stolen to the network stack.
* Stolen packets are put in a queue where the
* generic_netmap_rxsync() callback can extract them.
+ * Returns 1 if the packet was stolen, 0 otherwise.
*/
-void
+int
generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
{
struct netmap_adapter *na = NA(ifp);
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct netmap_kring *kring;
u_int work_done;
- u_int rr = MBUF_RXQ(m); // receive ring number
+ u_int r = MBUF_RXQ(m); /* receive ring number */
+
+ if (r >= na->num_rx_rings) {
+ r = r % na->num_rx_rings;
+ }
+
+ kring = &na->rx_rings[r];
- if (rr >= na->num_rx_rings) {
- rr = rr % na->num_rx_rings; // XXX expensive...
+ if (kring->nr_mode == NKR_NETMAP_OFF) {
+ /* We must not intercept this mbuf. */
+ return 0;
}
/* limit the size of the queue */
- if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+ if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) {
+ /* This may happen when GRO/LRO features are enabled for
+ * the NIC driver when the generic adapter does not
+ * support RX scatter-gather. */
+ RD(2, "Warning: driver pushed up big packet "
+ "(size=%d)", (int)MBUF_LEN(m));
+ m_freem(m);
+ } else if (unlikely(mbq_len(&kring->rx_queue) > 1024)) {
m_freem(m);
} else {
- mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+ mbq_safe_enqueue(&kring->rx_queue, m);
}
if (netmap_generic_mit < 32768) {
/* no rx mitigation, pass notification up */
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
+ netmap_generic_irq(na, r, &work_done);
} else {
/* same as send combining, filter notification if there is a
* pending timer, otherwise pass it up and start a timer.
*/
- if (likely(netmap_mitigation_active(&gna->mit[rr]))) {
+ if (likely(nm_os_mitigation_active(&gna->mit[r]))) {
/* Record that there is some pending work. */
- gna->mit[rr].mit_pending = 1;
+ gna->mit[r].mit_pending = 1;
} else {
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
- netmap_mitigation_start(&gna->mit[rr]);
+ netmap_generic_irq(na, r, &work_done);
+ nm_os_mitigation_start(&gna->mit[r]);
}
}
+
+ /* We have intercepted the mbuf. */
+ return 1;
}
/*
@@ -713,54 +1014,23 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags)
u_int const head = kring->rhead;
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+ /* Adapter-specific variables. */
+ uint16_t slot_flags = kring->nkr_slot_flags;
+ u_int nm_buf_len = NETMAP_BUF_SIZE(na);
+ struct mbq tmpq;
+ struct mbuf *m;
+ int avail; /* in bytes */
+ int mlen;
+ int copy;
+
if (head > lim)
return netmap_ring_reinit(kring);
- /*
- * First part: import newly received packets.
- */
- if (netmap_no_pendintr || force_update) {
- /* extract buffers from the rx queue, stop at most one
- * slot before nr_hwcur (stop_i)
- */
- uint16_t slot_flags = kring->nkr_slot_flags;
- u_int stop_i = nm_prev(kring->nr_hwcur, lim);
-
- nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
- for (n = 0; nm_i != stop_i; n++) {
- int len;
- void *addr = NMB(na, &ring->slot[nm_i]);
- struct mbuf *m;
-
- /* we only check the address here on generic rx rings */
- if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
- return netmap_ring_reinit(kring);
- }
- /*
- * Call the locked version of the function.
- * XXX Ideally we could grab a batch of mbufs at once
- * and save some locking overhead.
- */
- m = mbq_safe_dequeue(&kring->rx_queue);
- if (!m) /* no more data */
- break;
- len = MBUF_LEN(m);
- m_copydata(m, 0, len, addr);
- ring->slot[nm_i].len = len;
- ring->slot[nm_i].flags = slot_flags;
- m_freem(m);
- nm_i = nm_next(nm_i, lim);
- }
- if (n) {
- kring->nr_hwtail = nm_i;
- IFRATE(rate_ctx.new.rxpkt += n);
- }
- kring->nr_kflags &= ~NKR_PENDINTR;
- }
+ IFRATE(rate_ctx.new.rxsync++);
- // XXX should we invert the order ?
/*
- * Second part: skip past packets that userspace has released.
+ * First part: skip past packets that userspace has released.
+ * This can possibly make room for the second part.
*/
nm_i = kring->nr_hwcur;
if (nm_i != head) {
@@ -773,7 +1043,106 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags)
}
kring->nr_hwcur = head;
}
- IFRATE(rate_ctx.new.rxsync++);
+
+ /*
+ * Second part: import newly received packets.
+ */
+ if (!netmap_no_pendintr && !force_update) {
+ return 0;
+ }
+
+ nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */
+
+ /* Compute the available space (in bytes) in this netmap ring.
+ * The first slot that is not considered in is the one before
+ * nr_hwcur. */
+
+ avail = nm_prev(kring->nr_hwcur, lim) - nm_i;
+ if (avail < 0)
+ avail += lim + 1;
+ avail *= nm_buf_len;
+
+ /* First pass: While holding the lock on the RX mbuf queue,
+ * extract as many mbufs as they fit the available space,
+ * and put them in a temporary queue.
+ * To avoid performing a per-mbuf division (mlen / nm_buf_len) to
+ * to update avail, we do the update in a while loop that we
+ * also use to set the RX slots, but without performing the copy. */
+ mbq_init(&tmpq);
+ mbq_lock(&kring->rx_queue);
+ for (n = 0;; n++) {
+ m = mbq_peek(&kring->rx_queue);
+ if (!m) {
+ /* No more packets from the driver. */
+ break;
+ }
+
+ mlen = MBUF_LEN(m);
+ if (mlen > avail) {
+ /* No more space in the ring. */
+ break;
+ }
+
+ mbq_dequeue(&kring->rx_queue);
+
+ while (mlen) {
+ copy = nm_buf_len;
+ if (mlen < copy) {
+ copy = mlen;
+ }
+ mlen -= copy;
+ avail -= nm_buf_len;
+
+ ring->slot[nm_i].len = copy;
+ ring->slot[nm_i].flags = slot_flags | (mlen ? NS_MOREFRAG : 0);
+ nm_i = nm_next(nm_i, lim);
+ }
+
+ mbq_enqueue(&tmpq, m);
+ }
+ mbq_unlock(&kring->rx_queue);
+
+ /* Second pass: Drain the temporary queue, going over the used RX slots,
+ * and perform the copy out of the RX queue lock. */
+ nm_i = kring->nr_hwtail;
+
+ for (;;) {
+ void *nmaddr;
+ int ofs = 0;
+ int morefrag;
+
+ m = mbq_dequeue(&tmpq);
+ if (!m) {
+ break;
+ }
+
+ do {
+ nmaddr = NMB(na, &ring->slot[nm_i]);
+ /* We only check the address here on generic rx rings. */
+ if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
+ m_freem(m);
+ mbq_purge(&tmpq);
+ mbq_fini(&tmpq);
+ return netmap_ring_reinit(kring);
+ }
+
+ copy = ring->slot[nm_i].len;
+ m_copydata(m, ofs, copy, nmaddr);
+ ofs += copy;
+ morefrag = ring->slot[nm_i].flags & NS_MOREFRAG;
+ nm_i = nm_next(nm_i, lim);
+ } while (morefrag);
+
+ m_freem(m);
+ }
+
+ mbq_fini(&tmpq);
+
+ if (n) {
+ kring->nr_hwtail = nm_i;
+ IFRATE(rate_ctx.new.rxpkt += n);
+ }
+ kring->nr_kflags &= ~NKR_PENDINTR;
return 0;
}
@@ -787,9 +1156,8 @@ generic_netmap_dtor(struct netmap_adapter *na)
if (prev_na != NULL) {
D("Released generic NA %p", gna);
- if_rele(ifp);
netmap_adapter_put(prev_na);
- if (na->ifp == NULL) {
+ if (nm_iszombie(na)) {
/*
* The driver has been removed without releasing
* the reference so we need to do it here.
@@ -797,9 +1165,13 @@ generic_netmap_dtor(struct netmap_adapter *na)
netmap_adapter_put(prev_na);
}
}
- WNA(ifp) = prev_na;
- D("Restored native NA %p", prev_na);
+ NM_ATTACH_NA(ifp, prev_na);
+ /*
+ * netmap_detach_common(), that it's called after this function,
+ * overrides WNA(ifp) if na->ifp is not NULL.
+ */
na->ifp = NULL;
+ D("Restored native NA %p", prev_na);
}
/*
@@ -823,7 +1195,7 @@ generic_netmap_attach(struct ifnet *ifp)
num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
- generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */
+ nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */
ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc);
if (num_tx_desc == 0 || num_rx_desc == 0) {
D("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc);
@@ -855,12 +1227,23 @@ generic_netmap_attach(struct ifnet *ifp)
ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
ifp->num_rx_queues, ifp->real_num_rx_queues);
- generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
+ nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
retval = netmap_attach_common(na);
if (retval) {
free(gna, M_DEVBUF);
+ return retval;
}
+ gna->prev = NA(ifp); /* save old na */
+ if (gna->prev != NULL) {
+ netmap_adapter_get(gna->prev);
+ }
+ NM_ATTACH_NA(ifp, na);
+
+ nm_os_generic_set_features(gna);
+
+ D("Created generic NA %p (prev %p)", gna, gna->prev);
+
return retval;
}
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 4aead85285fd..28e69d7ab093 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -1,6 +1,7 @@
/*
- * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
- * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo
+ * Copyright (C) 2013-2016 Universita` di Pisa
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -48,16 +49,26 @@
#if defined(CONFIG_NETMAP_GENERIC)
#define WITH_GENERIC
#endif
-#if defined(CONFIG_NETMAP_V1000)
-#define WITH_V1000
+#if defined(CONFIG_NETMAP_PTNETMAP_GUEST)
+#define WITH_PTNETMAP_GUEST
+#endif
+#if defined(CONFIG_NETMAP_PTNETMAP_HOST)
+#define WITH_PTNETMAP_HOST
#endif
-#else /* not linux */
+#elif defined (_WIN32)
+#define WITH_VALE // comment out to disable VALE support
+#define WITH_PIPES
+#define WITH_MONITOR
+#define WITH_GENERIC
+#else /* neither linux nor windows */
#define WITH_VALE // comment out to disable VALE support
#define WITH_PIPES
#define WITH_MONITOR
#define WITH_GENERIC
+#define WITH_PTNETMAP_HOST /* ptnetmap host support */
+#define WITH_PTNETMAP_GUEST /* ptnetmap guest support */
#endif
@@ -66,6 +77,7 @@
#define likely(x) __builtin_expect((long)!!(x), 1L)
#define unlikely(x) __builtin_expect((long)!!(x), 0L)
+#define __user
#define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */
@@ -77,9 +89,11 @@
#define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED)
#define NM_SELINFO_T struct nm_selinfo
+#define NM_SELRECORD_T struct thread
#define MBUF_LEN(m) ((m)->m_pkthdr.len)
-#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif)
-#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m)
+#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
+#define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m))
+#define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif)
#define NM_ATOMIC_T volatile int // XXX ?
/* atomic operations */
@@ -98,23 +112,20 @@ struct netmap_adapter *netmap_getna(if_t ifp);
#endif
#if __FreeBSD_version >= 1100027
-#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? *((m)->m_ext.ext_cnt) : -1)
-#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x
-#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt)
+#define MBUF_REFCNT(m) ((m)->m_ext.ext_count)
+#define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x
#else
-#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1)
+#define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1)
#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x
-#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt)
#endif
-MALLOC_DECLARE(M_NETMAP);
+#define MBUF_QUEUED(m) 1
struct nm_selinfo {
struct selinfo si;
struct mtx m;
};
-void freebsd_selwakeup(struct nm_selinfo *si, int pri);
// XXX linux struct, not used in FreeBSD
struct net_device_ops {
@@ -131,12 +142,16 @@ struct hrtimer {
#define NM_LOCK_T safe_spinlock_t // see bsd_glue.h
#define NM_SELINFO_T wait_queue_head_t
#define MBUF_LEN(m) ((m)->len)
-#define MBUF_IFP(m) ((m)->dev)
-#define NM_SEND_UP(ifp, m) \
- do { \
- m->priority = NM_MAGIC_PRIORITY_RX; \
- netif_rx(m); \
- } while (0)
+#define MBUF_TRANSMIT(na, ifp, m) \
+ ({ \
+ /* Avoid infinite recursion with generic. */ \
+ m->priority = NM_MAGIC_PRIORITY_TX; \
+ (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \
+ 0; \
+ })
+
+/* See explanation in nm_os_generic_xmit_frame. */
+#define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg)
#define NM_ATOMIC_T volatile long unsigned int
@@ -159,7 +174,51 @@ struct hrtimer {
#define NM_LOCK_T IOLock *
#define NM_SELINFO_T struct selinfo
#define MBUF_LEN(m) ((m)->m_pkthdr.len)
-#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m)
+
+#elif defined (_WIN32)
+#include "../../../WINDOWS/win_glue.h"
+
+#define NM_SELRECORD_T IO_STACK_LOCATION
+#define NM_SELINFO_T win_SELINFO // see win_glue.h
+#define NM_LOCK_T win_spinlock_t // see win_glue.h
+#define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */
+
+#define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m);
+#define NM_MTX_DESTROY(m) do { (void)(m); } while (0)
+#define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m))
+#define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m))
+#define NM_MTX_ASSERT(m) assert(&m.Count>0)
+
+//These linknames are for the NDIS driver
+#define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS"
+#define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS"
+
+//Definition of internal driver-to-driver ioctl codes
+#define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180)
+#define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195)
+
+//Empty data structures are not permitted by MSVC compiler
+//XXX_ale, try to solve this problem
+struct net_device_ops{
+ char data[1];
+};
+typedef struct ethtool_ops{
+ char data[1];
+};
+typedef struct hrtimer{
+ KTIMER timer;
+ BOOLEAN active;
+ KDPC deferred_proc;
+};
+
+/* MSVC does not have likely/unlikely support */
+#ifdef _MSC_VER
+#define likely(x) (x)
+#define unlikely(x) (x)
+#else
+#define likely(x) __builtin_expect((long)!!(x), 1L)
+#define unlikely(x) __builtin_expect((long)!!(x), 0L)
+#endif //_MSC_VER
#else
@@ -167,6 +226,13 @@ struct hrtimer {
#endif /* end - platform-specific code */
+#ifndef _WIN32 /* support for emulated sysctl */
+#define SYSBEGIN(x)
+#define SYSEND
+#endif /* _WIN32 */
+
+#define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
#define NMG_LOCK_T NM_MTX_T
#define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock)
#define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock)
@@ -201,8 +267,36 @@ struct nm_bdg_fwd;
struct nm_bridge;
struct netmap_priv_d;
+/* os-specific NM_SELINFO_T initialzation/destruction functions */
+void nm_os_selinfo_init(NM_SELINFO_T *);
+void nm_os_selinfo_uninit(NM_SELINFO_T *);
+
const char *nm_dump_buf(char *p, int len, int lim, char *dst);
+void nm_os_selwakeup(NM_SELINFO_T *si);
+void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si);
+
+int nm_os_ifnet_init(void);
+void nm_os_ifnet_fini(void);
+void nm_os_ifnet_lock(void);
+void nm_os_ifnet_unlock(void);
+
+void nm_os_get_module(void);
+void nm_os_put_module(void);
+
+void netmap_make_zombie(struct ifnet *);
+void netmap_undo_zombie(struct ifnet *);
+
+/* passes a packet up to the host stack.
+ * If the packet is sent (or dropped) immediately it returns NULL,
+ * otherwise it links the packet to prev and returns m.
+ * In this case, a final call with m=NULL and prev != NULL will send up
+ * the entire chain to the host stack.
+ */
+void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev);
+
+int nm_os_mbuf_has_offld(struct mbuf *m);
+
#include "netmap_mbq.h"
extern NMG_LOCK_T netmap_global_lock;
@@ -299,6 +393,19 @@ struct netmap_kring {
uint32_t nr_kflags; /* private driver flags */
#define NKR_PENDINTR 0x1 // Pending interrupt.
#define NKR_EXCLUSIVE 0x2 /* exclusive binding */
+#define NKR_FORWARD 0x4 /* (host ring only) there are
+ packets to forward
+ */
+#define NKR_NEEDRING 0x8 /* ring needed even if users==0
+ * (used internally by pipes and
+ * by ptnetmap host ports)
+ */
+
+ uint32_t nr_mode;
+ uint32_t nr_pending_mode;
+#define NKR_NETMAP_OFF 0x0
+#define NKR_NETMAP_ON 0x1
+
uint32_t nkr_num_slots;
/*
@@ -344,13 +451,14 @@ struct netmap_kring {
* store incoming mbufs in a queue that is drained by
* a rxsync.
*/
- struct mbuf **tx_pool;
- // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
- struct mbq rx_queue; /* intercepted rx mbufs. */
+ struct mbuf **tx_pool;
+ struct mbuf *tx_event; /* TX event used as a notification */
+ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */
+ struct mbq rx_queue; /* intercepted rx mbufs. */
uint32_t users; /* existing bindings for this ring */
- uint32_t ring_id; /* debugging */
+ uint32_t ring_id; /* kring identifier */
enum txrx tx; /* kind of ring (tx or rx) */
char name[64]; /* diagnostic */
@@ -372,9 +480,6 @@ struct netmap_kring {
struct netmap_kring *pipe; /* if this is a pipe ring,
* pointer to the other end
*/
- struct netmap_ring *save_ring; /* pointer to hidden rings
- * (see netmap_pipe.c for details)
- */
#endif /* WITH_PIPES */
#ifdef WITH_VALE
@@ -397,8 +502,28 @@ struct netmap_kring {
uint32_t mon_tail; /* last seen slot on rx */
uint32_t mon_pos; /* index of this ring in the monitored ring array */
#endif
-} __attribute__((__aligned__(64)));
+}
+#ifdef _WIN32
+__declspec(align(64));
+#else
+__attribute__((__aligned__(64)));
+#endif
+/* return 1 iff the kring needs to be turned on */
+static inline int
+nm_kring_pending_on(struct netmap_kring *kring)
+{
+ return kring->nr_pending_mode == NKR_NETMAP_ON &&
+ kring->nr_mode == NKR_NETMAP_OFF;
+}
+
+/* return 1 iff the kring needs to be turned off */
+static inline int
+nm_kring_pending_off(struct netmap_kring *kring)
+{
+ return kring->nr_pending_mode == NKR_NETMAP_OFF &&
+ kring->nr_mode == NKR_NETMAP_ON;
+}
/* return the next index, with wraparound */
static inline uint32_t
@@ -514,6 +639,8 @@ struct netmap_adapter {
*/
#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */
#define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */
+#define NAF_PTNETMAP_HOST 256 /* the adapter supports ptnetmap in the host */
+#define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */
#define NAF_BUSY (1U<<31) /* the adapter is used internally and
* cannot be registered from userspace
*/
@@ -592,10 +719,14 @@ struct netmap_adapter {
* For hw devices this is typically a selwakeup(),
* but for NIC/host ports attached to a switch (or vice-versa)
* we also need to invoke the 'txsync' code downstream.
+ * This callback pointer is actually used only to initialize
+ * kring->nm_notify.
+ * Return values are the same as for netmap_rx_irq().
*/
void (*nm_dtor)(struct netmap_adapter *);
int (*nm_register)(struct netmap_adapter *, int onoff);
+ void (*nm_intr)(struct netmap_adapter *, int onoff);
int (*nm_txsync)(struct netmap_kring *kring, int flags);
int (*nm_rxsync)(struct netmap_kring *kring, int flags);
@@ -640,14 +771,14 @@ struct netmap_adapter {
/* memory allocator (opaque)
* We also cache a pointer to the lut_entry for translating
- * buffer addresses, and the total number of buffers.
+ * buffer addresses, the total number of buffers and the buffer size.
*/
struct netmap_mem_d *nm_mem;
struct netmap_lut na_lut;
/* additional information attached to this adapter
* by other netmap subsystems. Currently used by
- * bwrap and LINUX/v1000.
+ * bwrap, LINUX/v1000 and ptnetmap
*/
void *na_private;
@@ -656,6 +787,9 @@ struct netmap_adapter {
int na_next_pipe; /* next free slot in the array */
int na_max_pipes; /* size of the array */
+ /* Offset of ethernet header for each packet. */
+ u_int virt_hdr_len;
+
char name[64];
};
@@ -721,8 +855,6 @@ struct netmap_vp_adapter { /* VALE software port */
struct nm_bridge *na_bdg;
int retry;
- /* Offset of ethernet header for each packet. */
- u_int virt_hdr_len;
/* Maximum Frame Size, used in bdg_mismatch_datapath() */
u_int mfs;
/* Last source MAC on this port */
@@ -767,6 +899,13 @@ struct netmap_generic_adapter { /* emulated device */
#ifdef linux
netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
#endif
+ /* Is the adapter able to use multiple RX slots to scatter
+ * each packet pushed up by the driver? */
+ int rxsg;
+
+ /* Is the transmission path controlled by a netmap-aware
+ * device queue (i.e. qdisc on linux)? */
+ int txqdisc;
};
#endif /* WITH_GENERIC */
@@ -777,7 +916,7 @@ netmap_real_rings(struct netmap_adapter *na, enum txrx t)
}
#ifdef WITH_VALE
-
+struct nm_bdg_polling_state;
/*
* Bridge wrapper for non VALE ports attached to a VALE switch.
*
@@ -827,9 +966,6 @@ struct netmap_bwrap_adapter {
struct netmap_vp_adapter host; /* for host rings */
struct netmap_adapter *hwna; /* the underlying device */
- /* backup of the hwna memory allocator */
- struct netmap_mem_d *save_nmd;
-
/*
* When we attach a physical interface to the bridge, we
* allow the controlling process to terminate, so we need
@@ -838,10 +974,10 @@ struct netmap_bwrap_adapter {
* are attached to a bridge.
*/
struct netmap_priv_d *na_kpriv;
+ struct nm_bdg_polling_state *na_polling_state;
};
int netmap_bwrap_attach(const char *name, struct netmap_adapter *);
-
#endif /* WITH_VALE */
#ifdef WITH_PIPES
@@ -876,56 +1012,122 @@ nm_kr_rxspace(struct netmap_kring *k)
return space;
}
+/* return slots reserved to tx clients */
+#define nm_kr_txspace(_k) nm_kr_rxspace(_k)
-/* True if no space in the tx ring. only valid after txsync_prologue */
+
+/* True if no space in the tx ring, only valid after txsync_prologue */
static inline int
nm_kr_txempty(struct netmap_kring *kring)
{
return kring->rcur == kring->nr_hwtail;
}
+/* True if no more completed slots in the rx ring, only valid after
+ * rxsync_prologue */
+#define nm_kr_rxempty(_k) nm_kr_txempty(_k)
/*
* protect against multiple threads using the same ring.
- * also check that the ring has not been stopped.
- * We only care for 0 or !=0 as a return code.
+ * also check that the ring has not been stopped or locked
*/
-#define NM_KR_BUSY 1
-#define NM_KR_STOPPED 2
+#define NM_KR_BUSY 1 /* some other thread is syncing the ring */
+#define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */
+#define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */
+/* release the previously acquired right to use the *sync() methods of the ring */
static __inline void nm_kr_put(struct netmap_kring *kr)
{
NM_ATOMIC_CLEAR(&kr->nr_busy);
}
-static __inline int nm_kr_tryget(struct netmap_kring *kr)
+/* true if the ifp that backed the adapter has disappeared (e.g., the
+ * driver has been unloaded)
+ */
+static inline int nm_iszombie(struct netmap_adapter *na);
+
+/* try to obtain exclusive right to issue the *sync() operations on the ring.
+ * The right is obtained and must be later relinquished via nm_kr_put() if and
+ * only if nm_kr_tryget() returns 0.
+ * If can_sleep is 1 there are only two other possible outcomes:
+ * - the function returns NM_KR_BUSY
+ * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr
+ * (if non-null)
+ * In both cases the caller will typically skip the ring, possibly collecting
+ * errors along the way.
+ * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep.
+ * In the latter case, the function may also return NM_KR_LOCKED and leave *perr
+ * untouched: ideally, the caller should try again at a later time.
+ */
+static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr)
{
+ int busy = 1, stopped;
/* check a first time without taking the lock
* to avoid starvation for nm_kr_get()
*/
- if (unlikely(kr->nkr_stopped)) {
- ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
- return NM_KR_STOPPED;
+retry:
+ stopped = kr->nkr_stopped;
+ if (unlikely(stopped)) {
+ goto stop;
}
- if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
- return NM_KR_BUSY;
- /* check a second time with lock held */
- if (unlikely(kr->nkr_stopped)) {
- ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
+ busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy);
+ /* we should not return NM_KR_BUSY if the ring was
+ * actually stopped, so check another time after
+ * the barrier provided by the atomic operation
+ */
+ stopped = kr->nkr_stopped;
+ if (unlikely(stopped)) {
+ goto stop;
+ }
+
+ if (unlikely(nm_iszombie(kr->na))) {
+ stopped = NM_KR_STOPPED;
+ goto stop;
+ }
+
+ return unlikely(busy) ? NM_KR_BUSY : 0;
+
+stop:
+ if (!busy)
nm_kr_put(kr);
- return NM_KR_STOPPED;
+ if (stopped == NM_KR_STOPPED) {
+/* if POLLERR is defined we want to use it to simplify netmap_poll().
+ * Otherwise, any non-zero value will do.
+ */
+#ifdef POLLERR
+#define NM_POLLERR POLLERR
+#else
+#define NM_POLLERR 1
+#endif /* POLLERR */
+ if (perr)
+ *perr |= NM_POLLERR;
+#undef NM_POLLERR
+ } else if (can_sleep) {
+ tsleep(kr, 0, "NM_KR_TRYGET", 4);
+ goto retry;
}
- return 0;
+ return stopped;
}
-static __inline void nm_kr_get(struct netmap_kring *kr)
+/* put the ring in the 'stopped' state and wait for the current user (if any) to
+ * notice. stopped must be either NM_KR_STOPPED or NM_KR_LOCKED
+ */
+static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped)
{
+ kr->nkr_stopped = stopped;
while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
tsleep(kr, 0, "NM_KR_GET", 4);
}
+/* restart a ring after a stop */
+static __inline void nm_kr_start(struct netmap_kring *kr)
+{
+ kr->nkr_stopped = 0;
+ nm_kr_put(kr);
+}
+
/*
* The following functions are used by individual drivers to
@@ -953,10 +1155,26 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na,
enum txrx tx, u_int n, u_int new_cur);
int netmap_ring_reinit(struct netmap_kring *);
+/* Return codes for netmap_*x_irq. */
+enum {
+ /* Driver should do normal interrupt processing, e.g. because
+ * the interface is not in netmap mode. */
+ NM_IRQ_PASS = 0,
+ /* Port is in netmap mode, and the interrupt work has been
+ * completed. The driver does not have to notify netmap
+ * again before the next interrupt. */
+ NM_IRQ_COMPLETED = -1,
+ /* Port is in netmap mode, but the interrupt work has not been
+ * completed. The driver has to make sure netmap will be
+ * notified again soon, even if no more interrupts come (e.g.
+ * on Linux the driver should not call napi_complete()). */
+ NM_IRQ_RESCHED = -2,
+};
+
/* default functions to handle rx/tx interrupts */
int netmap_rx_irq(struct ifnet *, u_int, u_int *);
#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
-void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
+int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done);
#ifdef WITH_VALE
@@ -986,35 +1204,74 @@ nm_native_on(struct netmap_adapter *na)
return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE);
}
+static inline int
+nm_iszombie(struct netmap_adapter *na)
+{
+ return na == NULL || (na->na_flags & NAF_ZOMBIE);
+}
+
+static inline void
+nm_update_hostrings_mode(struct netmap_adapter *na)
+{
+ /* Process nr_mode and nr_pending_mode for host rings. */
+ na->tx_rings[na->num_tx_rings].nr_mode =
+ na->tx_rings[na->num_tx_rings].nr_pending_mode;
+ na->rx_rings[na->num_rx_rings].nr_mode =
+ na->rx_rings[na->num_rx_rings].nr_pending_mode;
+}
+
/* set/clear native flags and if_transmit/netdev_ops */
static inline void
nm_set_native_flags(struct netmap_adapter *na)
{
struct ifnet *ifp = na->ifp;
+ /* We do the setup for intercepting packets only if we are the
+ * first user of this adapter. */
+ if (na->active_fds > 0) {
+ return;
+ }
+
na->na_flags |= NAF_NETMAP_ON;
#ifdef IFCAP_NETMAP /* or FreeBSD ? */
ifp->if_capenable |= IFCAP_NETMAP;
#endif
-#ifdef __FreeBSD__
+#if defined (__FreeBSD__)
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
+#elif defined (_WIN32)
+ (void)ifp; /* prevent a warning */
+ //XXX_ale can we just comment those?
+ //na->if_transmit = ifp->if_transmit;
+ //ifp->if_transmit = netmap_transmit;
#else
na->if_transmit = (void *)ifp->netdev_ops;
ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo;
((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops;
ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto;
#endif
+ nm_update_hostrings_mode(na);
}
-
static inline void
nm_clear_native_flags(struct netmap_adapter *na)
{
struct ifnet *ifp = na->ifp;
-#ifdef __FreeBSD__
+ /* We undo the setup for intercepting packets only if we are the
+ * last user of this adapter. */
+ if (na->active_fds > 0) {
+ return;
+ }
+
+ nm_update_hostrings_mode(na);
+
+#if defined(__FreeBSD__)
ifp->if_transmit = na->if_transmit;
+#elif defined(_WIN32)
+ (void)ifp; /* prevent a warning */
+ //XXX_ale can we just comment those?
+ //ifp->if_transmit = na->if_transmit;
#else
ifp->netdev_ops = (void *)na->if_transmit;
ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool;
@@ -1025,6 +1282,28 @@ nm_clear_native_flags(struct netmap_adapter *na)
#endif
}
+/*
+ * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap
+ * kthreads.
+ * We need netmap_ring* parameter, because in ptnetmap it is decoupled
+ * from host kring.
+ * The user-space ring pointers (head/cur/tail) are shared through
+ * CSB between host and guest.
+ */
+
+/*
+ * validates parameters in the ring/kring, returns a value for head
+ * If any error, returns ring_size to force a reinit.
+ */
+uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *);
+
+
+/*
+ * validates parameters in the ring/kring, returns a value for head
+ * If any error, returns the ring size (lim) to force a reinit.
+ */
+uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *);
+
/* check/fix address and len in tx rings */
#if 1 /* debug version */
@@ -1080,6 +1359,9 @@ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom);
*/
void netmap_krings_delete(struct netmap_adapter *na);
+int netmap_hw_krings_create(struct netmap_adapter *na);
+void netmap_hw_krings_delete(struct netmap_adapter *na);
+
/* set the stopped/enabled status of ring
* When stopping, they also wait for all current activity on the ring to
* terminate. The status change is then notified using the na nm_notify
@@ -1088,16 +1370,18 @@ void netmap_krings_delete(struct netmap_adapter *na);
void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped);
/* set the stopped/enabled status of all rings of the adapter. */
void netmap_set_all_rings(struct netmap_adapter *, int stopped);
-/* convenience wrappers for netmap_set_all_rings, used in drivers */
+/* convenience wrappers for netmap_set_all_rings */
void netmap_disable_all_rings(struct ifnet *);
void netmap_enable_all_rings(struct ifnet *);
int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
uint16_t ringid, uint32_t flags);
-
+void netmap_do_unregif(struct netmap_priv_d *priv);
u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg);
-int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na,
+ struct ifnet **ifp, int create);
+void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp);
int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);
@@ -1124,12 +1408,11 @@ struct netmap_bdg_ops {
u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
struct netmap_vp_adapter *);
+#define NM_BRIDGES 8 /* number of bridges */
#define NM_BDG_MAXPORTS 254 /* up to 254 */
#define NM_BDG_BROADCAST NM_BDG_MAXPORTS
#define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1)
-#define NM_NAME "vale" /* prefix for bridge port name */
-
/* these are redefined in case of no VALE support */
int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
struct nm_bridge *netmap_init_bridges2(u_int);
@@ -1181,14 +1464,13 @@ void netmap_bns_getbridges(struct nm_bridge **, u_int *);
#endif
/* Various prototypes */
-int netmap_poll(struct cdev *dev, int events, struct thread *td);
+int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td);
int netmap_init(void);
void netmap_fini(void);
int netmap_get_memory(struct netmap_priv_d* p);
void netmap_dtor(void *data);
-int netmap_dtor_locked(struct netmap_priv_d *priv);
-int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td);
+int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *);
/* netmap_adapter creation/destruction */
@@ -1228,11 +1510,11 @@ int netmap_adapter_put(struct netmap_adapter *na);
/*
* module variables
*/
-#define NETMAP_BUF_BASE(na) ((na)->na_lut.lut[0].vaddr)
-#define NETMAP_BUF_SIZE(na) ((na)->na_lut.objsize)
-extern int netmap_mitigate; // XXX not really used
+#define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr)
+#define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize)
extern int netmap_no_pendintr;
-extern int netmap_verbose; // XXX debugging
+extern int netmap_mitigate;
+extern int netmap_verbose; /* for debugging */
enum { /* verbose flags */
NM_VERB_ON = 1, /* generic verbose */
NM_VERB_HOST = 0x2, /* verbose host stack */
@@ -1245,10 +1527,11 @@ enum { /* verbose flags */
};
extern int netmap_txsync_retry;
+extern int netmap_flags;
extern int netmap_generic_mit;
extern int netmap_generic_ringsize;
extern int netmap_generic_rings;
-extern int netmap_use_count;
+extern int netmap_generic_txqdisc;
/*
* NA returns a pointer to the struct netmap adapter from the ifp,
@@ -1257,37 +1540,27 @@ extern int netmap_use_count;
#define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp))
/*
- * Macros to determine if an interface is netmap capable or netmap enabled.
- * See the magic field in struct netmap_adapter.
- */
-#ifdef __FreeBSD__
-/*
- * on FreeBSD just use if_capabilities and if_capenable.
- */
-#define NETMAP_CAPABLE(ifp) (NA(ifp) && \
- (ifp)->if_capabilities & IFCAP_NETMAP )
-
-#define NETMAP_SET_CAPABLE(ifp) \
- (ifp)->if_capabilities |= IFCAP_NETMAP
-
-#else /* linux */
-
-/*
- * on linux:
- * we check if NA(ifp) is set and its first element has a related
+ * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we
+ * overload another pointer in the netdev.
+ *
+ * We check if NA(ifp) is set and its first element has a related
* magic value. The capenable is within the struct netmap_adapter.
*/
#define NETMAP_MAGIC 0x52697a7a
-#define NETMAP_CAPABLE(ifp) (NA(ifp) && \
+#define NM_NA_VALID(ifp) (NA(ifp) && \
((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC )
-#define NETMAP_SET_CAPABLE(ifp) \
- NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC
+#define NM_ATTACH_NA(ifp, na) do { \
+ WNA(ifp) = na; \
+ if (NA(ifp)) \
+ NA(ifp)->magic = \
+ ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \
+} while(0)
-#endif /* linux */
+#define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor)
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__)
/* Assigns the device IOMMU domain to an allocator.
* Returns -ENOMEM in case the domain is different */
@@ -1331,6 +1604,8 @@ netmap_reload_map(struct netmap_adapter *na,
}
}
+#elif defined(_WIN32)
+
#else /* linux */
int nm_iommu_group_id(bus_dma_tag_t dev);
@@ -1341,8 +1616,8 @@ netmap_load_map(struct netmap_adapter *na,
bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
if (0 && map) {
- *map = dma_map_single(na->pdev, buf, na->na_lut.objsize,
- DMA_BIDIRECTIONAL);
+ *map = dma_map_single(na->pdev, buf, NETMAP_BUF_SIZE(na),
+ DMA_BIDIRECTIONAL);
}
}
@@ -1350,11 +1625,11 @@ static inline void
netmap_unload_map(struct netmap_adapter *na,
bus_dma_tag_t tag, bus_dmamap_t map)
{
- u_int sz = na->na_lut.objsize;
+ u_int sz = NETMAP_BUF_SIZE(na);
if (*map) {
dma_unmap_single(na->pdev, *map, sz,
- DMA_BIDIRECTIONAL);
+ DMA_BIDIRECTIONAL);
}
}
@@ -1362,7 +1637,7 @@ static inline void
netmap_reload_map(struct netmap_adapter *na,
bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
- u_int sz = na->na_lut.objsize;
+ u_int sz = NETMAP_BUF_SIZE(na);
if (*map) {
dma_unmap_single(na->pdev, *map, sz,
@@ -1473,7 +1748,11 @@ PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp)
struct lut_entry *lut = na->na_lut.lut;
void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr;
+#ifndef _WIN32
*pp = (i >= na->na_lut.objtotal) ? lut[0].paddr : lut[i].paddr;
+#else
+ *pp = (i >= na->na_lut.objtotal) ? (uint64_t)lut[0].paddr.QuadPart : (uint64_t)lut[i].paddr.QuadPart;
+#endif
return ret;
}
@@ -1497,8 +1776,9 @@ struct netmap_priv_d {
struct netmap_if * volatile np_nifp; /* netmap if descriptor. */
struct netmap_adapter *np_na;
+ struct ifnet *np_ifp;
uint32_t np_flags; /* from the ioctl */
- u_int np_qfirst[NR_TXRX],
+ u_int np_qfirst[NR_TXRX],
np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */
uint16_t np_txpoll; /* XXX and also np_rxpoll ? */
@@ -1512,6 +1792,26 @@ struct netmap_priv_d {
struct thread *np_td; /* kqueue, just debugging */
};
+struct netmap_priv_d *netmap_priv_new(void);
+void netmap_priv_delete(struct netmap_priv_d *);
+
+static inline int nm_kring_pending(struct netmap_priv_d *np)
+{
+ struct netmap_adapter *na = np->np_na;
+ enum txrx t;
+ int i;
+
+ for_rx_tx(t) {
+ for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+ if (kring->nr_mode != kring->nr_pending_mode) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
#ifdef WITH_MONITOR
struct netmap_monitor_adapter {
@@ -1530,13 +1830,36 @@ struct netmap_monitor_adapter {
* native netmap support.
*/
int generic_netmap_attach(struct ifnet *ifp);
+int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
+
+int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept);
+int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept);
+
+/*
+ * the generic transmit routine is passed a structure to optionally
+ * build a queue of descriptors, in an OS-specific way.
+ * The payload is at addr, if non-null, and the routine should send or queue
+ * the packet, returning 0 if successful, 1 on failure.
+ *
+ * At the end, if head is non-null, there will be an additional call
+ * to the function with addr = NULL; this should tell the OS-specific
+ * routine to send the queue and free any resources. Failure is ignored.
+ */
+struct nm_os_gen_arg {
+ struct ifnet *ifp;
+ void *m; /* os-specific mbuf-like object */
+ void *head, *tail; /* tailq, if the OS-specific routine needs to build one */
+ void *addr; /* payload of current packet */
+ u_int len; /* packet length */
+ u_int ring_nr; /* ring number */
+ u_int qevent; /* in txqdisc mode, place an event on this mbuf */
+};
+
+int nm_os_generic_xmit_frame(struct nm_os_gen_arg *);
+int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
+void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
+void nm_os_generic_set_features(struct netmap_generic_adapter *gna);
-int netmap_catch_rx(struct netmap_generic_adapter *na, int intercept);
-void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);;
-void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
-int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
-int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
-void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
static inline struct ifnet*
netmap_generic_getifp(struct netmap_generic_adapter *gna)
{
@@ -1546,6 +1869,8 @@ netmap_generic_getifp(struct netmap_generic_adapter *gna)
return gna->up.up.ifp;
}
+void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done);
+
//#define RATE_GENERIC /* Enables communication statistics for generic. */
#ifdef RATE_GENERIC
void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi);
@@ -1558,16 +1883,16 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi);
* to reduce the number of interrupt requests/selwakeup
* to clients on incoming packets.
*/
-void netmap_mitigation_init(struct nm_generic_mit *mit, int idx,
+void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx,
struct netmap_adapter *na);
-void netmap_mitigation_start(struct nm_generic_mit *mit);
-void netmap_mitigation_restart(struct nm_generic_mit *mit);
-int netmap_mitigation_active(struct nm_generic_mit *mit);
-void netmap_mitigation_cleanup(struct nm_generic_mit *mit);
+void nm_os_mitigation_start(struct nm_generic_mit *mit);
+void nm_os_mitigation_restart(struct nm_generic_mit *mit);
+int nm_os_mitigation_active(struct nm_generic_mit *mit);
+void nm_os_mitigation_cleanup(struct nm_generic_mit *mit);
+#else /* !WITH_GENERIC */
+#define generic_netmap_attach(ifp) (EOPNOTSUPP)
#endif /* WITH_GENERIC */
-
-
/* Shared declarations for the VALE switch. */
/*
@@ -1656,22 +1981,111 @@ struct nm_ipv6hdr {
*/
#define rawsum_t uint32_t
-rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum);
-uint16_t nm_csum_ipv4(struct nm_iphdr *iph);
-void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum);
+uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph);
+void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
size_t datalen, uint16_t *check);
-void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
size_t datalen, uint16_t *check);
-uint16_t nm_csum_fold(rawsum_t cur_sum);
+uint16_t nm_os_csum_fold(rawsum_t cur_sum);
void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
struct netmap_vp_adapter *dst_na,
- struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
+ const struct nm_bdg_fwd *ft_p,
+ struct netmap_ring *dst_ring,
u_int *j, u_int lim, u_int *howmany);
/* persistent virtual port routines */
-int nm_vi_persist(const char *, struct ifnet **);
-void nm_vi_detach(struct ifnet *);
-void nm_vi_init_index(void);
+int nm_os_vi_persist(const char *, struct ifnet **);
+void nm_os_vi_detach(struct ifnet *);
+void nm_os_vi_init_index(void);
+
+/*
+ * kernel thread routines
+ */
+struct nm_kthread; /* OS-specific kthread - opaque */
+typedef void (*nm_kthread_worker_fn_t)(void *data);
+
+/* kthread configuration */
+struct nm_kthread_cfg {
+ long type; /* kthread type/identifier */
+ struct ptnet_ring_cfg event; /* event/ioctl fd */
+ nm_kthread_worker_fn_t worker_fn; /* worker function */
+ void *worker_private;/* worker parameter */
+ int attach_user; /* attach kthread to user process */
+};
+/* kthread configuration */
+struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg);
+int nm_os_kthread_start(struct nm_kthread *);
+void nm_os_kthread_stop(struct nm_kthread *);
+void nm_os_kthread_delete(struct nm_kthread *);
+void nm_os_kthread_wakeup_worker(struct nm_kthread *nmk);
+void nm_os_kthread_send_irq(struct nm_kthread *);
+void nm_os_kthread_set_affinity(struct nm_kthread *, int);
+u_int nm_os_ncpus(void);
+
+#ifdef WITH_PTNETMAP_HOST
+/*
+ * netmap adapter for host ptnetmap ports
+ */
+struct netmap_pt_host_adapter {
+ struct netmap_adapter up;
+
+ struct netmap_adapter *parent;
+ int (*parent_nm_notify)(struct netmap_kring *kring, int flags);
+ void *ptns;
+};
+/* ptnetmap HOST routines */
+int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na);
+static inline int
+nm_ptnetmap_host_on(struct netmap_adapter *na)
+{
+ return na && na->na_flags & NAF_PTNETMAP_HOST;
+}
+#else /* !WITH_PTNETMAP_HOST */
+#define netmap_get_pt_host_na(nmr, _2, _3) \
+ ((nmr)->nr_flags & (NR_PTNETMAP_HOST) ? EOPNOTSUPP : 0)
+#define ptnetmap_ctl(_1, _2) EINVAL
+#define nm_ptnetmap_host_on(_1) EINVAL
+#endif /* !WITH_PTNETMAP_HOST */
+
+#ifdef WITH_PTNETMAP_GUEST
+/* ptnetmap GUEST routines */
+
+typedef uint32_t (*nm_pt_guest_ptctl_t)(struct ifnet *, uint32_t);
+
+/*
+ * netmap adapter for guest ptnetmap ports
+ */
+struct netmap_pt_guest_adapter {
+ /* The netmap adapter to be used by netmap applications.
+ * This field must be the first, to allow upcast. */
+ struct netmap_hw_adapter hwup;
+
+ /* The netmap adapter to be used by the driver. */
+ struct netmap_hw_adapter dr;
+
+ void *csb;
+
+ /* Reference counter to track users of backend netmap port: the
+ * network stack and netmap clients.
+ * Used to decide when we need (de)allocate krings/rings and
+ * start (stop) ptnetmap kthreads. */
+ int backend_regifs;
+
+};
+
+int netmap_pt_guest_attach(struct netmap_adapter *, void *,
+ unsigned int, nm_pt_guest_ptctl_t);
+struct ptnet_ring;
+bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring,
+ int flags);
+bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring,
+ int flags);
+int ptnet_nm_krings_create(struct netmap_adapter *na);
+void ptnet_nm_krings_delete(struct netmap_adapter *na);
+void ptnet_nm_dtor(struct netmap_adapter *na);
+#endif /* WITH_PTNETMAP_GUEST */
#endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c
index 503f5a13aa95..3eb971b74561 100644
--- a/sys/dev/netmap/netmap_mbq.c
+++ b/sys/dev/netmap/netmap_mbq.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
+ * Copyright (C) 2013-2014 Vincenzo Maffione
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,6 +31,8 @@
#ifdef linux
#include "bsd_glue.h"
+#elif defined (_WIN32)
+#include "win_glue.h"
#else /* __FreeBSD__ */
#include <sys/param.h>
#include <sys/lock.h>
@@ -152,12 +155,12 @@ void mbq_safe_purge(struct mbq *q)
}
-void mbq_safe_destroy(struct mbq *q)
+void mbq_safe_fini(struct mbq *q)
{
mtx_destroy(&q->lock);
}
-void mbq_destroy(struct mbq *q)
+void mbq_fini(struct mbq *q)
{
}
diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h
index 455ca8a2c3ac..9dafa8b1149b 100644
--- a/sys/dev/netmap/netmap_mbq.h
+++ b/sys/dev/netmap/netmap_mbq.h
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
+ * Copyright (C) 2013-2014 Vincenzo Maffione
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -40,6 +41,8 @@
/* XXX probably rely on a previous definition of SPINLOCK_T */
#ifdef linux
#define SPINLOCK_T safe_spinlock_t
+#elif defined (_WIN32)
+#define SPINLOCK_T win_spinlock_t
#else
#define SPINLOCK_T struct mtx
#endif
@@ -52,16 +55,21 @@ struct mbq {
SPINLOCK_T lock;
};
-/* XXX "destroy" does not match "init" as a name.
- * We should also clarify whether init can be used while
+/* We should clarify whether init can be used while
* holding a lock, and whether mbq_safe_destroy() is a NOP.
*/
void mbq_init(struct mbq *q);
-void mbq_destroy(struct mbq *q);
+void mbq_fini(struct mbq *q);
void mbq_enqueue(struct mbq *q, struct mbuf *m);
struct mbuf *mbq_dequeue(struct mbq *q);
void mbq_purge(struct mbq *q);
+static inline struct mbuf *
+mbq_peek(struct mbq *q)
+{
+ return q->head ? q->head : NULL;
+}
+
static inline void
mbq_lock(struct mbq *q)
{
@@ -76,7 +84,7 @@ mbq_unlock(struct mbq *q)
void mbq_safe_init(struct mbq *q);
-void mbq_safe_destroy(struct mbq *q);
+void mbq_safe_fini(struct mbq *q);
void mbq_safe_enqueue(struct mbq *q, struct mbuf *m);
struct mbuf *mbq_safe_dequeue(struct mbq *q);
void mbq_safe_purge(struct mbq *q);
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index fd0c06bb8b57..bb0f9c8b6f39 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -1,5 +1,8 @@
/*
- * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2012-2014 Matteo Landi
+ * Copyright (C) 2012-2016 Luigi Rizzo
+ * Copyright (C) 2012-2016 Giuseppe Lettieri
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -37,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/malloc.h>
+#include <sys/kernel.h> /* MALLOC_DEFINE */
#include <sys/proc.h>
#include <vm/vm.h> /* vtophys */
#include <vm/pmap.h> /* vtophys */
@@ -48,13 +52,26 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <machine/bus.h> /* bus_dmamap_* */
+/* M_NETMAP only used in here */
+MALLOC_DECLARE(M_NETMAP);
+MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
+
#endif /* __FreeBSD__ */
+#ifdef _WIN32
+#include <win_glue.h>
+#endif
+
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
+#include <net/netmap_virt.h>
#include "netmap_mem2.h"
-#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */
+#ifdef _WIN32_USE_SMALL_GENERIC_DEVICES_MEMORY
+#define NETMAP_BUF_MAX_NUM 8*4096 /* if too big takes too much time to allocate */
+#else
+#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */
+#endif
#define NETMAP_POOL_MAX_NAMSZ 32
@@ -111,7 +128,7 @@ struct netmap_obj_pool {
struct netmap_mem_ops {
- void (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*);
+ int (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*);
int (*nmd_get_info)(struct netmap_mem_d *, u_int *size,
u_int *memflags, uint16_t *id);
@@ -130,6 +147,39 @@ struct netmap_mem_ops {
typedef uint16_t nm_memid_t;
+/*
+ * Shared info for netmap allocator
+ *
+ * Each allocator contains this structure as the first netmap_if.
+ * In this way, we can share the same details about the allocator
+ * with the VM.
+ * Used in ptnetmap.
+ */
+struct netmap_mem_shared_info {
+#ifndef _WIN32
+ struct netmap_if up; /* ends with a 0-sized array, which VSC does not like */
+#else /* !_WIN32 */
+ char up[sizeof(struct netmap_if)];
+#endif /* !_WIN32 */
+ uint64_t features;
+#define NMS_FEAT_BUF_POOL 0x0001
+#define NMS_FEAT_MEMSIZE 0x0002
+
+ uint32_t buf_pool_offset;
+ uint32_t buf_pool_objtotal;
+ uint32_t buf_pool_objsize;
+ uint32_t totalsize;
+};
+
+#define NMS_NAME "nms_info"
+#define NMS_VERSION 1
+static const struct netmap_if nms_if_blueprint = {
+ .ni_name = NMS_NAME,
+ .ni_version = NMS_VERSION,
+ .ni_tx_rings = 0,
+ .ni_rx_rings = 0
+};
+
struct netmap_mem_d {
NMA_LOCK_T nm_mtx; /* protect the allocator */
u_int nm_totalsize; /* shorthand */
@@ -151,6 +201,9 @@ struct netmap_mem_d {
struct netmap_mem_ops *ops;
};
+/*
+ * XXX need to fix the case of t0 == void
+ */
#define NMD_DEFCB(t0, name) \
t0 \
netmap_mem_##name(struct netmap_mem_d *nmd) \
@@ -186,7 +239,7 @@ netmap_mem_##name(struct netmap_adapter *na, t1 a1) \
return na->nm_mem->ops->nmd_##name(na, a1); \
}
-NMD_DEFCB1(void, get_lut, struct netmap_lut *);
+NMD_DEFCB1(int, get_lut, struct netmap_lut *);
NMD_DEFCB3(int, get_info, u_int *, u_int *, uint16_t *);
NMD_DEFCB1(vm_paddr_t, ofstophys, vm_ooffset_t);
static int netmap_mem_config(struct netmap_mem_d *);
@@ -201,7 +254,7 @@ NMD_DEFNACB(void, rings_delete);
static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *);
static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *);
-static int nm_mem_assign_group(struct netmap_mem_d *, device_t);
+static int nm_mem_assign_group(struct netmap_mem_d *, struct device *);
#define NMA_LOCK_INIT(n) NM_MTX_INIT((n)->nm_mtx)
#define NMA_LOCK_DESTROY(n) NM_MTX_DESTROY((n)->nm_mtx)
@@ -248,7 +301,9 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na)
if (nm_mem_assign_group(nmd, na->pdev) < 0) {
return ENOMEM;
} else {
- nmd->ops->nmd_finalize(nmd);
+ NMA_LOCK(nmd);
+ nmd->lasterr = nmd->ops->nmd_finalize(nmd);
+ NMA_UNLOCK(nmd);
}
if (!nmd->lasterr && na->pdev)
@@ -257,26 +312,83 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na)
return nmd->lasterr;
}
+static int netmap_mem_init_shared_info(struct netmap_mem_d *nmd);
+
void
netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na)
{
NMA_LOCK(nmd);
netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na);
+ if (nmd->active == 1) {
+ u_int i;
+
+ /*
+ * Reset the allocator when it falls out of use so that any
+ * pool resources leaked by unclean application exits are
+ * reclaimed.
+ */
+ for (i = 0; i < NETMAP_POOLS_NR; i++) {
+ struct netmap_obj_pool *p;
+ u_int j;
+
+ p = &nmd->pools[i];
+ p->objfree = p->objtotal;
+ /*
+ * Reproduce the net effect of the M_ZERO malloc()
+ * and marking of free entries in the bitmap that
+ * occur in finalize_obj_allocator()
+ */
+ memset(p->bitmap,
+ '\0',
+ sizeof(uint32_t) * ((p->objtotal + 31) / 32));
+
+ /*
+ * Set all the bits in the bitmap that have
+ * corresponding buffers to 1 to indicate they are
+ * free.
+ */
+ for (j = 0; j < p->objtotal; j++) {
+ if (p->lut[j].vaddr != NULL) {
+ p->bitmap[ (j>>5) ] |= ( 1 << (j & 31) );
+ }
+ }
+ }
+
+ /*
+ * Per netmap_mem_finalize_all(),
+ * buffers 0 and 1 are reserved
+ */
+ nmd->pools[NETMAP_BUF_POOL].objfree -= 2;
+ if (nmd->pools[NETMAP_BUF_POOL].bitmap) {
+ /* XXX This check is a workaround that prevents a
+ * NULL pointer crash which currently happens only
+ * with ptnetmap guests. Also,
+ * netmap_mem_init_shared_info must not be called
+ * by ptnetmap guest. */
+ nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3;
+
+ /* expose info to the ptnetmap guest */
+ netmap_mem_init_shared_info(nmd);
+ }
+ }
+ nmd->ops->nmd_deref(nmd);
+
NMA_UNLOCK(nmd);
- return nmd->ops->nmd_deref(nmd);
}
/* accessor functions */
-static void
+static int
netmap_mem2_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut)
{
lut->lut = nmd->pools[NETMAP_BUF_POOL].lut;
lut->objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal;
lut->objsize = nmd->pools[NETMAP_BUF_POOL]._objsize;
+
+ return 0;
}
-struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
+static struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
[NETMAP_IF_POOL] = {
.size = 1024,
.num = 100,
@@ -291,10 +403,10 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
},
};
-struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = {
+static struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = {
[NETMAP_IF_POOL] = {
.size = 1024,
- .num = 1,
+ .num = 2,
},
[NETMAP_RING_POOL] = {
.size = 5*PAGE_SIZE,
@@ -348,11 +460,12 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. */
};
-struct netmap_mem_d *netmap_last_mem_d = &nm_mem;
+static struct netmap_mem_d *netmap_last_mem_d = &nm_mem;
/* blueprint for the private memory allocators */
extern struct netmap_mem_ops netmap_mem_private_ops; /* forward */
-const struct netmap_mem_d nm_blueprint = {
+/* XXX clang is not happy about using name as a print format */
+static const struct netmap_mem_d nm_blueprint = {
.pools = {
[NETMAP_IF_POOL] = {
.name = "%s_if",
@@ -388,6 +501,7 @@ const struct netmap_mem_d nm_blueprint = {
#define DECLARE_SYSCTLS(id, name) \
+ SYSBEGIN(mem2_ ## name); \
SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \
CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \
SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \
@@ -401,22 +515,22 @@ const struct netmap_mem_d nm_blueprint = {
"Default size of private netmap " STRINGIFY(name) "s"); \
SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \
CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \
- "Default number of private netmap " STRINGIFY(name) "s")
+ "Default number of private netmap " STRINGIFY(name) "s"); \
+ SYSEND
SYSCTL_DECL(_dev_netmap);
DECLARE_SYSCTLS(NETMAP_IF_POOL, if);
DECLARE_SYSCTLS(NETMAP_RING_POOL, ring);
DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf);
+/* call with NMA_LOCK(&nm_mem) held */
static int
-nm_mem_assign_id(struct netmap_mem_d *nmd)
+nm_mem_assign_id_locked(struct netmap_mem_d *nmd)
{
nm_memid_t id;
struct netmap_mem_d *scan = netmap_last_mem_d;
int error = ENOMEM;
- NMA_LOCK(&nm_mem);
-
do {
/* we rely on unsigned wrap around */
id = scan->nm_id + 1;
@@ -435,10 +549,22 @@ nm_mem_assign_id(struct netmap_mem_d *nmd)
}
} while (scan != netmap_last_mem_d);
- NMA_UNLOCK(&nm_mem);
return error;
}
+/* call with NMA_LOCK(&nm_mem) *not* held */
+static int
+nm_mem_assign_id(struct netmap_mem_d *nmd)
+{
+ int ret;
+
+ NMA_LOCK(&nm_mem);
+ ret = nm_mem_assign_id_locked(nmd);
+ NMA_UNLOCK(&nm_mem);
+
+ return ret;
+}
+
static void
nm_mem_release_id(struct netmap_mem_d *nmd)
{
@@ -456,7 +582,7 @@ nm_mem_release_id(struct netmap_mem_d *nmd)
}
static int
-nm_mem_assign_group(struct netmap_mem_d *nmd, device_t dev)
+nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev)
{
int err = 0, id;
id = nm_iommu_group_id(dev);
@@ -494,8 +620,13 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset)
if (offset >= p[i].memtotal)
continue;
// now lookup the cluster's address
+#ifndef _WIN32
pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) +
offset % p[i]._objsize;
+#else
+ pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr);
+ pa.QuadPart += offset % p[i]._objsize;
+#endif
NMA_UNLOCK(nmd);
return pa;
}
@@ -508,7 +639,110 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset)
+ p[NETMAP_RING_POOL].memtotal
+ p[NETMAP_BUF_POOL].memtotal);
NMA_UNLOCK(nmd);
+#ifndef _WIN32
return 0; // XXX bad address
+#else
+ vm_paddr_t res;
+ res.QuadPart = 0;
+ return res;
+#endif
+}
+
+#ifdef _WIN32
+
+/*
+ * win32_build_virtual_memory_for_userspace
+ *
+ * This function get all the object making part of the pools and maps
+ * a contiguous virtual memory space for the userspace
+ * It works this way
+ * 1 - allocate a Memory Descriptor List wide as the sum
+ * of the memory needed for the pools
+ * 2 - cycle all the objects in every pool and for every object do
+ *
+ * 2a - cycle all the objects in every pool, get the list
+ * of the physical address descriptors
+ * 2b - calculate the offset in the array of page descriptors in the
+ * main MDL
+ * 2c - copy the descriptors of the object in the main MDL
+ *
+ * 3 - return the resulting MDL that needs to be mapped in userland
+ *
+ * In this way we will have an MDL that describes all the memory for the
+ * objects in a single object
+*/
+
+PMDL
+win32_build_user_vm_map(struct netmap_mem_d* nmd)
+{
+ int i, j;
+ u_int memsize, memflags, ofs = 0;
+ PMDL mainMdl, tempMdl;
+
+ if (netmap_mem_get_info(nmd, &memsize, &memflags, NULL)) {
+ D("memory not finalised yet");
+ return NULL;
+ }
+
+ mainMdl = IoAllocateMdl(NULL, memsize, FALSE, FALSE, NULL);
+ if (mainMdl == NULL) {
+ D("failed to allocate mdl");
+ return NULL;
+ }
+
+ NMA_LOCK(nmd);
+ for (i = 0; i < NETMAP_POOLS_NR; i++) {
+ struct netmap_obj_pool *p = &nmd->pools[i];
+ int clsz = p->_clustsize;
+ int clobjs = p->_clustentries; /* objects per cluster */
+ int mdl_len = sizeof(PFN_NUMBER) * BYTES_TO_PAGES(clsz);
+ PPFN_NUMBER pSrc, pDst;
+
+ /* each pool has a different cluster size so we need to reallocate */
+ tempMdl = IoAllocateMdl(p->lut[0].vaddr, clsz, FALSE, FALSE, NULL);
+ if (tempMdl == NULL) {
+ NMA_UNLOCK(nmd);
+ D("fail to allocate tempMdl");
+ IoFreeMdl(mainMdl);
+ return NULL;
+ }
+ pSrc = MmGetMdlPfnArray(tempMdl);
+ /* create one entry per cluster, the lut[] has one entry per object */
+ for (j = 0; j < p->numclusters; j++, ofs += clsz) {
+ pDst = &MmGetMdlPfnArray(mainMdl)[BYTES_TO_PAGES(ofs)];
+ MmInitializeMdl(tempMdl, p->lut[j*clobjs].vaddr, clsz);
+ MmBuildMdlForNonPagedPool(tempMdl); /* compute physical page addresses */
+ RtlCopyMemory(pDst, pSrc, mdl_len); /* copy the page descriptors */
+ mainMdl->MdlFlags = tempMdl->MdlFlags; /* XXX what is in here ? */
+ }
+ IoFreeMdl(tempMdl);
+ }
+ NMA_UNLOCK(nmd);
+ return mainMdl;
+}
+
+#endif /* _WIN32 */
+
+/*
+ * helper function for OS-specific mmap routines (currently only windows).
+ * Given an nmd and a pool index, returns the cluster size and number of clusters.
+ * Returns 0 if memory is finalised and the pool is valid, otherwise 1.
+ * It should be called under NMA_LOCK(nmd) otherwise the underlying info can change.
+ */
+
+int
+netmap_mem2_get_pool_info(struct netmap_mem_d* nmd, u_int pool, u_int *clustsize, u_int *numclusters)
+{
+ if (!nmd || !clustsize || !numclusters || pool >= NETMAP_POOLS_NR)
+ return 1; /* invalid arguments */
+ // NMA_LOCK_ASSERT(nmd);
+ if (!(nmd->flags & NETMAP_MEM_FINALIZED)) {
+ *clustsize = *numclusters = 0;
+ return 1; /* not ready yet */
+ }
+ *clustsize = nmd->pools[pool]._clustsize;
+ *numclusters = nmd->pools[pool].numclusters;
+ return 0; /* success */
}
static int
@@ -578,12 +812,6 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr)
((n)->pools[NETMAP_IF_POOL].memtotal + \
netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v)))
-#define netmap_buf_offset(n, v) \
- ((n)->pools[NETMAP_IF_POOL].memtotal + \
- (n)->pools[NETMAP_RING_POOL].memtotal + \
- netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)))
-
-
static ssize_t
netmap_mem2_if_offset(struct netmap_mem_d *nmd, const void *addr)
{
@@ -602,7 +830,7 @@ static void *
netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index)
{
uint32_t i = 0; /* index in the bitmap */
- uint32_t mask, j; /* slot counter */
+ uint32_t mask, j = 0; /* slot counter */
void *vaddr = NULL;
if (len > p->_objsize) {
@@ -636,7 +864,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_
if (index)
*index = i * 32 + j;
}
- ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr);
+ ND("%s allocator: allocated object @ [%d][%d]: vaddr %p",p->name, i, j, vaddr);
if (start)
*start = i;
@@ -733,7 +961,7 @@ netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n)
*head = cur; /* restore */
break;
}
- RD(5, "allocate buffer %d -> %d", *head, cur);
+ ND(5, "allocate buffer %d -> %d", *head, cur);
*p = cur; /* link to previous head */
}
@@ -750,7 +978,7 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head)
struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL];
uint32_t i, cur, *buf;
- D("freeing the extra list");
+ ND("freeing the extra list");
for (i = 0; head >=2 && head < p->objtotal; i++) {
cur = head;
buf = lut[head].vaddr;
@@ -761,7 +989,8 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head)
}
if (head != 0)
D("breaking with head %d", head);
- D("freed %d buffers", i);
+ if (netmap_verbose)
+ D("freed %d buffers", i);
}
@@ -846,7 +1075,6 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p)
p->bitmap = NULL;
if (p->lut) {
u_int i;
- size_t sz = p->_clustsize;
/*
* Free each cluster allocated in
@@ -856,7 +1084,7 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p)
*/
for (i = 0; i < p->objtotal; i += p->_clustentries) {
if (p->lut[i].vaddr)
- contigfree(p->lut[i].vaddr, sz, M_NETMAP);
+ contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP);
}
bzero(p->lut, sizeof(struct lut_entry) * p->objtotal);
#ifdef linux
@@ -973,6 +1201,18 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
return 0;
}
+static struct lut_entry *
+nm_alloc_lut(u_int nobj)
+{
+ size_t n = sizeof(struct lut_entry) * nobj;
+ struct lut_entry *lut;
+#ifdef linux
+ lut = vmalloc(n);
+#else
+ lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO);
+#endif
+ return lut;
+}
/* call with NMA_LOCK held */
static int
@@ -985,14 +1225,9 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p)
p->numclusters = p->_numclusters;
p->objtotal = p->_objtotal;
- n = sizeof(struct lut_entry) * p->objtotal;
-#ifdef linux
- p->lut = vmalloc(n);
-#else
- p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO);
-#endif
+ p->lut = nm_alloc_lut(p->objtotal);
if (p->lut == NULL) {
- D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name);
+ D("Unable to create lookup table for '%s'", p->name);
goto clean;
}
@@ -1015,6 +1250,13 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p)
int lim = i + p->_clustentries;
char *clust;
+ /*
+ * XXX Note, we only need contigmalloc() for buffers attached
+ * to native interfaces. In all other cases (nifp, netmap rings
+ * and even buffers for VALE ports or emulated interfaces) we
+ * can live with standard malloc, because the hardware will not
+ * access the pages directly.
+ */
clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO,
(size_t)0, -1UL, PAGE_SIZE, 0);
if (clust == NULL) {
@@ -1108,10 +1350,15 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na)
if (na->pdev == NULL)
return 0;
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__)
(void)i;
(void)lim;
D("unsupported on FreeBSD");
+
+#elif defined(_WIN32)
+ (void)i;
+ (void)lim;
+ D("unsupported on Windows"); //XXX_ale, really?
#else /* linux */
for (i = 2; i < lim; i++) {
netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr);
@@ -1124,8 +1371,10 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na)
static int
netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na)
{
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__)
D("unsupported on FreeBSD");
+#elif defined(_WIN32)
+ D("unsupported on Windows"); //XXX_ale, really?
#else /* linux */
int i, lim = p->_objtotal;
@@ -1142,6 +1391,30 @@ netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na)
}
static int
+netmap_mem_init_shared_info(struct netmap_mem_d *nmd)
+{
+ struct netmap_mem_shared_info *nms_info;
+ ssize_t base;
+
+ /* Use the first slot in IF_POOL */
+ nms_info = netmap_if_malloc(nmd, sizeof(*nms_info));
+ if (nms_info == NULL) {
+ return ENOMEM;
+ }
+
+ base = netmap_if_offset(nmd, nms_info);
+
+ memcpy(&nms_info->up, &nms_if_blueprint, sizeof(nms_if_blueprint));
+ nms_info->buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + nmd->pools[NETMAP_RING_POOL].memtotal;
+ nms_info->buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal;
+ nms_info->buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize;
+ nms_info->totalsize = nmd->nm_totalsize;
+ nms_info->features = NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE;
+
+ return 0;
+}
+
+static int
netmap_mem_finalize_all(struct netmap_mem_d *nmd)
{
int i;
@@ -1160,6 +1433,11 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd)
nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3;
nmd->flags |= NETMAP_MEM_FINALIZED;
+ /* expose info to the ptnetmap guest */
+ nmd->lasterr = netmap_mem_init_shared_info(nmd);
+ if (nmd->lasterr)
+ goto error;
+
if (netmap_verbose)
D("interfaces %d KB, rings %d KB, buffers %d MB",
nmd->pools[NETMAP_IF_POOL].memtotal >> 10,
@@ -1207,10 +1485,9 @@ static int
netmap_mem_private_finalize(struct netmap_mem_d *nmd)
{
int err;
- NMA_LOCK(nmd);
- nmd->active++;
err = netmap_mem_finalize_all(nmd);
- NMA_UNLOCK(nmd);
+ if (!err)
+ nmd->active++;
return err;
}
@@ -1218,10 +1495,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd)
static void
netmap_mem_private_deref(struct netmap_mem_d *nmd)
{
- NMA_LOCK(nmd);
if (--nmd->active <= 0)
netmap_mem_reset_all(nmd);
- NMA_UNLOCK(nmd);
}
@@ -1238,7 +1513,7 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd,
u_int v, maxd;
d = malloc(sizeof(struct netmap_mem_d),
- M_DEVBUF, M_NOWAIT | M_ZERO);
+ M_DEVBUF, M_NOWAIT | M_ZERO);
if (d == NULL) {
err = ENOMEM;
goto error;
@@ -1357,10 +1632,10 @@ static int
netmap_mem_global_finalize(struct netmap_mem_d *nmd)
{
int err;
-
+
/* update configuration if changed */
if (netmap_mem_global_config(nmd))
- goto out;
+ return nmd->lasterr;
nmd->active++;
@@ -1417,13 +1692,17 @@ netmap_free_rings(struct netmap_adapter *na)
for_rx_tx(t) {
u_int i;
- for (i = 0; i < netmap_real_rings(na, t); i++) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
struct netmap_kring *kring = &NMR(na, t)[i];
struct netmap_ring *ring = kring->ring;
- if (ring == NULL)
+ if (ring == NULL || kring->users > 0 || (kring->nr_kflags & NKR_NEEDRING)) {
+ ND("skipping ring %s (ring %p, users %d)",
+ kring->name, ring, kring->users);
continue;
- netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots);
+ }
+ if (i != nma_get_nrings(na, t) || na->na_flags & NAF_HOST_RINGS)
+ netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots);
netmap_ring_free(na->nm_mem, ring);
kring->ring = NULL;
}
@@ -1452,9 +1731,10 @@ netmap_mem2_rings_create(struct netmap_adapter *na)
struct netmap_ring *ring = kring->ring;
u_int len, ndesc;
- if (ring) {
- ND("%s already created", kring->name);
- continue; /* already created by somebody else */
+		if (ring || (!kring->users && !(kring->nr_kflags & NKR_NEEDRING))) {
+			/* unneeded, or already created by somebody else */
+			ND("skipping ring %s", kring->name);
+			continue;
}
ndesc = kring->nkr_num_slots;
len = sizeof(struct netmap_ring) +
@@ -1569,10 +1849,22 @@ netmap_mem2_if_new(struct netmap_adapter *na)
*/
base = netmap_if_offset(na->nm_mem, nifp);
for (i = 0; i < n[NR_TX]; i++) {
+ if (na->tx_rings[i].ring == NULL) {
+ // XXX maybe use the offset of an error ring,
+ // like we do for buffers?
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = 0;
+ continue;
+ }
*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base;
}
for (i = 0; i < n[NR_RX]; i++) {
+ if (na->rx_rings[i].ring == NULL) {
+ // XXX maybe use the offset of an error ring,
+ // like we do for buffers?
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = 0;
+ continue;
+ }
*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] =
netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base;
}
@@ -1636,3 +1928,531 @@ struct netmap_mem_ops netmap_mem_private_ops = {
.nmd_rings_create = netmap_mem2_rings_create,
.nmd_rings_delete = netmap_mem2_rings_delete
};
+
+#ifdef WITH_PTNETMAP_GUEST
+struct mem_pt_if {
+ struct mem_pt_if *next;
+ struct ifnet *ifp;
+ unsigned int nifp_offset;
+ nm_pt_guest_ptctl_t ptctl;
+};
+
+/* Netmap allocator for ptnetmap guests. */
+struct netmap_mem_ptg {
+ struct netmap_mem_d up;
+
+ vm_paddr_t nm_paddr; /* physical address in the guest */
+ void *nm_addr; /* virtual address in the guest */
+ struct netmap_lut buf_lut; /* lookup table for BUF pool in the guest */
+ nm_memid_t nm_host_id; /* allocator identifier in the host */
+ struct ptnetmap_memdev *ptn_dev;
+ struct mem_pt_if *pt_ifs; /* list of interfaces in passthrough */
+};
+
+/* Link a passthrough interface to a passthrough netmap allocator. */
+static int
+netmap_mem_pt_guest_ifp_add(struct netmap_mem_d *nmd, struct ifnet *ifp,
+ unsigned int nifp_offset,
+ nm_pt_guest_ptctl_t ptctl)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ struct mem_pt_if *ptif = malloc(sizeof(*ptif), M_NETMAP,
+ M_NOWAIT | M_ZERO);
+
+ if (!ptif) {
+ return ENOMEM;
+ }
+
+ NMA_LOCK(nmd);
+
+ ptif->ifp = ifp;
+ ptif->nifp_offset = nifp_offset;
+ ptif->ptctl = ptctl;
+
+ if (ptnmd->pt_ifs) {
+ ptif->next = ptnmd->pt_ifs;
+ }
+ ptnmd->pt_ifs = ptif;
+
+ NMA_UNLOCK(nmd);
+
+ D("added (ifp=%p,nifp_offset=%u)", ptif->ifp, ptif->nifp_offset);
+
+ return 0;
+}
+
+/* Called with NMA_LOCK(nmd) held. */
+static struct mem_pt_if *
+netmap_mem_pt_guest_ifp_lookup(struct netmap_mem_d *nmd, struct ifnet *ifp)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ struct mem_pt_if *curr;
+
+ for (curr = ptnmd->pt_ifs; curr; curr = curr->next) {
+ if (curr->ifp == ifp) {
+ return curr;
+ }
+ }
+
+ return NULL;
+}
+
+/* Unlink a passthrough interface from a passthrough netmap allocator. */
+int
+netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *nmd, struct ifnet *ifp)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ struct mem_pt_if *prev = NULL;
+ struct mem_pt_if *curr;
+ int ret = -1;
+
+ NMA_LOCK(nmd);
+
+ for (curr = ptnmd->pt_ifs; curr; curr = curr->next) {
+ if (curr->ifp == ifp) {
+ if (prev) {
+ prev->next = curr->next;
+ } else {
+ ptnmd->pt_ifs = curr->next;
+ }
+ D("removed (ifp=%p,nifp_offset=%u)",
+ curr->ifp, curr->nifp_offset);
+ free(curr, M_NETMAP);
+ ret = 0;
+ break;
+ }
+ prev = curr;
+ }
+
+ NMA_UNLOCK(nmd);
+
+ return ret;
+}
+
+/* Read allocator info from the first netmap_if (only on finalize) */
+static int
+netmap_mem_pt_guest_read_shared_info(struct netmap_mem_d *nmd)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ struct netmap_mem_shared_info *nms_info;
+ uint32_t bufsize;
+ uint32_t nbuffers;
+ char *vaddr;
+ vm_paddr_t paddr;
+ int i;
+
+ nms_info = (struct netmap_mem_shared_info *)ptnmd->nm_addr;
+ if (strncmp(nms_info->up.ni_name, NMS_NAME, sizeof(NMS_NAME)) != 0) {
+ D("error, the first slot does not contain shared info");
+ return EINVAL;
+ }
+	/* check that the shared info advertises the features we rely on */
+ if ((nms_info->features & (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) !=
+ (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) {
+ D("error, the shared info does not contain BUF_POOL and MEMSIZE");
+ return EINVAL;
+ }
+
+ bufsize = nms_info->buf_pool_objsize;
+ nbuffers = nms_info->buf_pool_objtotal;
+
+ /* allocate the lut */
+ if (ptnmd->buf_lut.lut == NULL) {
+ D("allocating lut");
+ ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers);
+ if (ptnmd->buf_lut.lut == NULL) {
+ D("lut allocation failed");
+ return ENOMEM;
+ }
+ }
+
+ /* we have physically contiguous memory mapped through PCI BAR */
+ vaddr = (char *)(ptnmd->nm_addr) + nms_info->buf_pool_offset;
+ paddr = ptnmd->nm_paddr + nms_info->buf_pool_offset;
+
+ for (i = 0; i < nbuffers; i++) {
+ ptnmd->buf_lut.lut[i].vaddr = vaddr;
+ ptnmd->buf_lut.lut[i].paddr = paddr;
+ vaddr += bufsize;
+ paddr += bufsize;
+ }
+
+ ptnmd->buf_lut.objtotal = nbuffers;
+ ptnmd->buf_lut.objsize = bufsize;
+
+ nmd->nm_totalsize = nms_info->totalsize;
+
+ return 0;
+}
+
+static int
+netmap_mem_pt_guest_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+
+ if (!(nmd->flags & NETMAP_MEM_FINALIZED)) {
+ return EINVAL;
+ }
+
+ *lut = ptnmd->buf_lut;
+ return 0;
+}
+
+static int
+netmap_mem_pt_guest_get_info(struct netmap_mem_d *nmd, u_int *size,
+ u_int *memflags, uint16_t *id)
+{
+ int error = 0;
+
+ NMA_LOCK(nmd);
+
+ error = nmd->ops->nmd_config(nmd);
+ if (error)
+ goto out;
+
+ if (size)
+ *size = nmd->nm_totalsize;
+ if (memflags)
+ *memflags = nmd->flags;
+ if (id)
+ *id = nmd->nm_id;
+
+out:
+ NMA_UNLOCK(nmd);
+
+ return error;
+}
+
+static vm_paddr_t
+netmap_mem_pt_guest_ofstophys(struct netmap_mem_d *nmd, vm_ooffset_t off)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ vm_paddr_t paddr;
+ /* if the offset is valid, just return csb->base_addr + off */
+ paddr = (vm_paddr_t)(ptnmd->nm_paddr + off);
+ ND("off %lx padr %lx", off, (unsigned long)paddr);
+ return paddr;
+}
+
+static int
+netmap_mem_pt_guest_config(struct netmap_mem_d *nmd)
+{
+ /* nothing to do, we are configured on creation
+ * and configuration never changes thereafter
+ */
+ return 0;
+}
+
+static int
+netmap_mem_pt_guest_finalize(struct netmap_mem_d *nmd)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+ int error = 0;
+
+ nmd->active++;
+
+ if (nmd->flags & NETMAP_MEM_FINALIZED)
+ goto out;
+
+ if (ptnmd->ptn_dev == NULL) {
+ D("ptnetmap memdev not attached");
+ error = ENOMEM;
+ goto err;
+ }
+ /* map memory through ptnetmap-memdev BAR */
+ error = nm_os_pt_memdev_iomap(ptnmd->ptn_dev, &ptnmd->nm_paddr,
+ &ptnmd->nm_addr);
+ if (error)
+ goto err;
+
+	/* read the allocator info and create the lut */
+ error = netmap_mem_pt_guest_read_shared_info(nmd);
+ if (error)
+ goto err;
+
+ nmd->flags |= NETMAP_MEM_FINALIZED;
+out:
+ return 0;
+err:
+ nmd->active--;
+ return error;
+}
+
+static void
+netmap_mem_pt_guest_deref(struct netmap_mem_d *nmd)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+
+ nmd->active--;
+ if (nmd->active <= 0 &&
+ (nmd->flags & NETMAP_MEM_FINALIZED)) {
+ nmd->flags &= ~NETMAP_MEM_FINALIZED;
+ /* unmap ptnetmap-memdev memory */
+ if (ptnmd->ptn_dev) {
+ nm_os_pt_memdev_iounmap(ptnmd->ptn_dev);
+ }
+ ptnmd->nm_addr = 0;
+ ptnmd->nm_paddr = 0;
+ }
+}
+
+static ssize_t
+netmap_mem_pt_guest_if_offset(struct netmap_mem_d *nmd, const void *vaddr)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd;
+
+ return (const char *)(vaddr) - (char *)(ptnmd->nm_addr);
+}
+
+static void
+netmap_mem_pt_guest_delete(struct netmap_mem_d *nmd)
+{
+ if (nmd == NULL)
+ return;
+ if (netmap_verbose)
+ D("deleting %p", nmd);
+ if (nmd->active > 0)
+ D("bug: deleting mem allocator with active=%d!", nmd->active);
+ nm_mem_release_id(nmd);
+ if (netmap_verbose)
+ D("done deleting %p", nmd);
+ NMA_LOCK_DESTROY(nmd);
+ free(nmd, M_DEVBUF);
+}
+
+static struct netmap_if *
+netmap_mem_pt_guest_if_new(struct netmap_adapter *na)
+{
+	struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem;
+	struct mem_pt_if *ptif;
+	struct netmap_if *nifp = NULL;
+
+	NMA_LOCK(na->nm_mem);
+	/* the nifp lives in host memory mapped through the memdev BAR */
+	ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp);
+	if (ptif == NULL) {
+		D("Error: interface %p is not in passthrough", na->ifp);
+		goto out;
+	}
+	nifp = (struct netmap_if *)((char *)(ptnmd->nm_addr) +
+			ptif->nifp_offset);
+out:
+	/* always release the allocator lock, also on the error path */
+	NMA_UNLOCK(na->nm_mem);
+	return nifp;
+}
+
+static void
+netmap_mem_pt_guest_if_delete(struct netmap_adapter *na, struct netmap_if *nifp)
+{
+ struct mem_pt_if *ptif;
+
+ NMA_LOCK(na->nm_mem);
+
+ ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp);
+ if (ptif == NULL) {
+ D("Error: interface %p is not in passthrough", na->ifp);
+ goto out;
+ }
+
+ ptif->ptctl(na->ifp, PTNETMAP_PTCTL_IFDELETE);
+out:
+ NMA_UNLOCK(na->nm_mem);
+}
+
+static int
+netmap_mem_pt_guest_rings_create(struct netmap_adapter *na)
+{
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem;
+ struct mem_pt_if *ptif;
+ struct netmap_if *nifp;
+ int i, error = -1;
+
+ NMA_LOCK(na->nm_mem);
+
+ ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp);
+ if (ptif == NULL) {
+ D("Error: interface %p is not in passthrough", na->ifp);
+ goto out;
+ }
+
+
+ /* point each kring to the corresponding backend ring */
+ nifp = (struct netmap_if *)((char *)ptnmd->nm_addr + ptif->nifp_offset);
+ for (i = 0; i <= na->num_tx_rings; i++) {
+ struct netmap_kring *kring = na->tx_rings + i;
+ if (kring->ring)
+ continue;
+ kring->ring = (struct netmap_ring *)
+ ((char *)nifp + nifp->ring_ofs[i]);
+ }
+ for (i = 0; i <= na->num_rx_rings; i++) {
+ struct netmap_kring *kring = na->rx_rings + i;
+ if (kring->ring)
+ continue;
+ kring->ring = (struct netmap_ring *)
+ ((char *)nifp +
+ nifp->ring_ofs[i + na->num_tx_rings + 1]);
+ }
+
+ //error = ptif->ptctl->nm_ptctl(ifp, PTNETMAP_PTCTL_RINGSCREATE);
+ error = 0;
+out:
+ NMA_UNLOCK(na->nm_mem);
+
+ return error;
+}
+
+static void
+netmap_mem_pt_guest_rings_delete(struct netmap_adapter *na)
+{
+ /* TODO: remove?? */
+#if 0
+ struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem;
+ struct mem_pt_if *ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem,
+ na->ifp);
+#endif
+}
+
+static struct netmap_mem_ops netmap_mem_pt_guest_ops = {
+ .nmd_get_lut = netmap_mem_pt_guest_get_lut,
+ .nmd_get_info = netmap_mem_pt_guest_get_info,
+ .nmd_ofstophys = netmap_mem_pt_guest_ofstophys,
+ .nmd_config = netmap_mem_pt_guest_config,
+ .nmd_finalize = netmap_mem_pt_guest_finalize,
+ .nmd_deref = netmap_mem_pt_guest_deref,
+ .nmd_if_offset = netmap_mem_pt_guest_if_offset,
+ .nmd_delete = netmap_mem_pt_guest_delete,
+ .nmd_if_new = netmap_mem_pt_guest_if_new,
+ .nmd_if_delete = netmap_mem_pt_guest_if_delete,
+ .nmd_rings_create = netmap_mem_pt_guest_rings_create,
+ .nmd_rings_delete = netmap_mem_pt_guest_rings_delete
+};
+
+/* Called with NMA_LOCK(&nm_mem) held. */
+static struct netmap_mem_d *
+netmap_mem_pt_guest_find_hostid(nm_memid_t host_id)
+{
+ struct netmap_mem_d *mem = NULL;
+ struct netmap_mem_d *scan = netmap_last_mem_d;
+
+ do {
+ /* find ptnetmap allocator through host ID */
+ if (scan->ops->nmd_deref == netmap_mem_pt_guest_deref &&
+ ((struct netmap_mem_ptg *)(scan))->nm_host_id == host_id) {
+ mem = scan;
+ break;
+ }
+ scan = scan->next;
+ } while (scan != netmap_last_mem_d);
+
+ return mem;
+}
+
+/* Called with NMA_LOCK(&nm_mem) held. */
+static struct netmap_mem_d *
+netmap_mem_pt_guest_create(nm_memid_t host_id)
+{
+ struct netmap_mem_ptg *ptnmd;
+ int err = 0;
+
+ ptnmd = malloc(sizeof(struct netmap_mem_ptg),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (ptnmd == NULL) {
+ err = ENOMEM;
+ goto error;
+ }
+
+ ptnmd->up.ops = &netmap_mem_pt_guest_ops;
+ ptnmd->nm_host_id = host_id;
+ ptnmd->pt_ifs = NULL;
+
+ /* Assign new id in the guest (We have the lock) */
+ err = nm_mem_assign_id_locked(&ptnmd->up);
+ if (err)
+ goto error;
+
+ ptnmd->up.flags &= ~NETMAP_MEM_FINALIZED;
+ ptnmd->up.flags |= NETMAP_MEM_IO;
+
+ NMA_LOCK_INIT(&ptnmd->up);
+
+ return &ptnmd->up;
+error:
+ netmap_mem_pt_guest_delete(&ptnmd->up);
+ return NULL;
+}
+
+/*
+ * find host id in guest allocators and create guest allocator
+ * if it is not there
+ */
+static struct netmap_mem_d *
+netmap_mem_pt_guest_get(nm_memid_t host_id)
+{
+ struct netmap_mem_d *nmd;
+
+ NMA_LOCK(&nm_mem);
+ nmd = netmap_mem_pt_guest_find_hostid(host_id);
+ if (nmd == NULL) {
+ nmd = netmap_mem_pt_guest_create(host_id);
+ }
+ NMA_UNLOCK(&nm_mem);
+
+ return nmd;
+}
+
+/*
+ * The guest allocator can be created by ptnetmap_memdev (during the device
+ * attach) or by ptnetmap device (e1000/virtio), during the netmap_attach.
+ *
+ * The order is not important (we have different order in LINUX and FreeBSD).
+ * The first one, creates the device, and the second one simply attaches it.
+ */
+
+/* Called when ptnetmap_memdev is attaching, to attach a new allocator in
+ * the guest */
+struct netmap_mem_d *
+netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t host_id)
+{
+ struct netmap_mem_d *nmd;
+ struct netmap_mem_ptg *ptnmd;
+
+ nmd = netmap_mem_pt_guest_get(host_id);
+
+ /* assign this device to the guest allocator */
+ if (nmd) {
+ ptnmd = (struct netmap_mem_ptg *)nmd;
+ ptnmd->ptn_dev = ptn_dev;
+ }
+
+ return nmd;
+}
+
+/* Called when ptnetmap device (virtio/e1000) is attaching */
+struct netmap_mem_d *
+netmap_mem_pt_guest_new(struct ifnet *ifp,
+ unsigned int nifp_offset,
+ nm_pt_guest_ptctl_t ptctl)
+{
+ struct netmap_mem_d *nmd;
+ nm_memid_t host_id;
+
+ if (ifp == NULL || ptctl == NULL) {
+ return NULL;
+ }
+
+ /* Get the host id allocator. */
+ host_id = ptctl(ifp, PTNETMAP_PTCTL_HOSTMEMID);
+
+ nmd = netmap_mem_pt_guest_get(host_id);
+
+ if (nmd) {
+ netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset,
+ ptctl);
+ }
+
+ return nmd;
+}
+
+#endif /* WITH_PTNETMAP_GUEST */
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
index ef0ff96d8e7f..7f4c5e9e9624 100644
--- a/sys/dev/netmap/netmap_mem2.h
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -1,5 +1,8 @@
/*
- * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2012-2014 Matteo Landi
+ * Copyright (C) 2012-2016 Luigi Rizzo
+ * Copyright (C) 2012-2016 Giuseppe Lettieri
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -117,8 +120,11 @@
extern struct netmap_mem_d nm_mem;
-void netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *);
+int netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *);
vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t);
+#ifdef _WIN32
+PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd);
+#endif
int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *);
int netmap_mem_init(void);
void netmap_mem_fini(void);
@@ -127,6 +133,7 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *);
int netmap_mem_rings_create(struct netmap_adapter *);
void netmap_mem_rings_delete(struct netmap_adapter *);
void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *);
+int netmap_mem2_get_pool_info(struct netmap_mem_d *, u_int, u_int *, u_int *);
int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id);
ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr);
struct netmap_mem_d* netmap_mem_private_new(const char *name,
@@ -157,6 +164,15 @@ void netmap_mem_put(struct netmap_mem_d *);
#endif /* !NM_DEBUG_PUTGET */
+#ifdef WITH_PTNETMAP_GUEST
+struct netmap_mem_d* netmap_mem_pt_guest_new(struct ifnet *,
+ unsigned int nifp_offset,
+ nm_pt_guest_ptctl_t);
+struct ptnetmap_memdev;
+struct netmap_mem_d* netmap_mem_pt_guest_attach(struct ptnetmap_memdev *, uint16_t);
+int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *, struct ifnet *);
+#endif /* WITH_PTNETMAP_GUEST */
+
#define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */
#define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */
diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c
index c303952417ff..bf6e23f5546e 100644
--- a/sys/dev/netmap/netmap_monitor.c
+++ b/sys/dev/netmap/netmap_monitor.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2014-2016 Giuseppe Lettieri
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -101,6 +102,8 @@
#warning OSX support is only partial
#include "osx_glue.h"
+#elif defined(_WIN32)
+#include "win_glue.h"
#else
#error Unsupported platform
@@ -151,13 +154,17 @@ netmap_monitor_rxsync(struct netmap_kring *kring, int flags)
}
/* nm_krings_create callbacks for monitors.
- * We could use the default netmap_hw_krings_zmon, but
- * we don't need the mbq.
*/
static int
netmap_monitor_krings_create(struct netmap_adapter *na)
{
- return netmap_krings_create(na, 0);
+ int error = netmap_krings_create(na, 0);
+ if (error)
+ return error;
+ /* override the host rings callbacks */
+ na->tx_rings[na->num_tx_rings].nm_sync = netmap_monitor_txsync;
+ na->rx_rings[na->num_rx_rings].nm_sync = netmap_monitor_rxsync;
+ return 0;
}
/* nm_krings_delete callback for monitors */
@@ -184,9 +191,13 @@ nm_monitor_alloc(struct netmap_kring *kring, u_int n)
if (n <= kring->max_monitors)
/* we already have more entries that requested */
return 0;
-
+
len = sizeof(struct netmap_kring *) * n;
+#ifndef _WIN32
nm = realloc(kring->monitors, len, M_DEVBUF, M_NOWAIT | M_ZERO);
+#else
+ nm = realloc(kring->monitors, len, sizeof(struct netmap_kring *)*kring->max_monitors);
+#endif
if (nm == NULL)
return ENOMEM;
@@ -229,10 +240,10 @@ static int netmap_monitor_parent_notify(struct netmap_kring *, int);
static int
netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int zcopy)
{
- int error = 0;
+ int error = NM_IRQ_COMPLETED;
/* sinchronize with concurrently running nm_sync()s */
- nm_kr_get(kring);
+ nm_kr_stop(kring, NM_KR_LOCKED);
/* make sure the monitor array exists and is big enough */
error = nm_monitor_alloc(kring, kring->n_monitors + 1);
if (error)
@@ -242,7 +253,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int
kring->n_monitors++;
if (kring->n_monitors == 1) {
/* this is the first monitor, intercept callbacks */
- D("%s: intercept callbacks on %s", mkring->name, kring->name);
+ ND("%s: intercept callbacks on %s", mkring->name, kring->name);
kring->mon_sync = kring->nm_sync;
/* zcopy monitors do not override nm_notify(), but
* we save the original one regardless, so that
@@ -265,7 +276,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int
}
out:
- nm_kr_put(kring);
+ nm_kr_start(kring);
return error;
}
@@ -277,7 +288,7 @@ static void
netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring)
{
/* sinchronize with concurrently running nm_sync()s */
- nm_kr_get(kring);
+ nm_kr_stop(kring, NM_KR_LOCKED);
kring->n_monitors--;
if (mkring->mon_pos != kring->n_monitors) {
kring->monitors[mkring->mon_pos] = kring->monitors[kring->n_monitors];
@@ -286,18 +297,18 @@ netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring)
kring->monitors[kring->n_monitors] = NULL;
if (kring->n_monitors == 0) {
/* this was the last monitor, restore callbacks and delete monitor array */
- D("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync);
+ ND("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync);
kring->nm_sync = kring->mon_sync;
kring->mon_sync = NULL;
if (kring->tx == NR_RX) {
- D("%s: restoring notify on %s: %p",
+ ND("%s: restoring notify on %s: %p",
mkring->name, kring->name, kring->mon_notify);
kring->nm_notify = kring->mon_notify;
kring->mon_notify = NULL;
}
nm_monitor_dealloc(kring);
}
- nm_kr_put(kring);
+ nm_kr_start(kring);
}
@@ -316,7 +327,7 @@ netmap_monitor_stop(struct netmap_adapter *na)
for_rx_tx(t) {
u_int i;
- for (i = 0; i < nma_get_nrings(na, t); i++) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
struct netmap_kring *kring = &NMR(na, t)[i];
u_int j;
@@ -360,23 +371,32 @@ netmap_monitor_reg_common(struct netmap_adapter *na, int onoff, int zmon)
for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
kring = &NMR(pna, t)[i];
mkring = &na->rx_rings[i];
- netmap_monitor_add(mkring, kring, zmon);
+ if (nm_kring_pending_on(mkring)) {
+ netmap_monitor_add(mkring, kring, zmon);
+ mkring->nr_mode = NKR_NETMAP_ON;
+ }
}
}
}
na->na_flags |= NAF_NETMAP_ON;
} else {
- if (pna == NULL) {
- D("%s: parent left netmap mode, nothing to restore", na->name);
- return 0;
- }
- na->na_flags &= ~NAF_NETMAP_ON;
+ if (na->active_fds == 0)
+ na->na_flags &= ~NAF_NETMAP_ON;
for_rx_tx(t) {
if (mna->flags & nm_txrx2flag(t)) {
for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
- kring = &NMR(pna, t)[i];
mkring = &na->rx_rings[i];
- netmap_monitor_del(mkring, kring);
+ if (nm_kring_pending_off(mkring)) {
+ mkring->nr_mode = NKR_NETMAP_OFF;
+ /* we cannot access the parent krings if the parent
+ * has left netmap mode. This is signaled by a NULL
+ * pna pointer
+ */
+ if (pna) {
+ kring = &NMR(pna, t)[i];
+ netmap_monitor_del(mkring, kring);
+ }
+ }
}
}
}
@@ -386,7 +406,7 @@ netmap_monitor_reg_common(struct netmap_adapter *na, int onoff, int zmon)
/*
****************************************************************
- * functions specific for zero-copy monitors
+ * functions specific for zero-copy monitors
****************************************************************
*/
@@ -534,7 +554,7 @@ netmap_zmon_dtor(struct netmap_adapter *na)
/*
****************************************************************
- * functions specific for copy monitors
+ * functions specific for copy monitors
****************************************************************
*/
@@ -652,17 +672,27 @@ netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags)
static int
netmap_monitor_parent_notify(struct netmap_kring *kring, int flags)
{
+ int (*notify)(struct netmap_kring*, int);
ND(5, "%s %x", kring->name, flags);
/* ?xsync callbacks have tryget called by their callers
* (NIOCREGIF and poll()), but here we have to call it
* by ourself
*/
- if (nm_kr_tryget(kring))
- goto out;
- netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ);
+ if (nm_kr_tryget(kring, 0, NULL)) {
+ /* in all cases, just skip the sync */
+ return NM_IRQ_COMPLETED;
+ }
+ if (kring->n_monitors > 0) {
+ netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ);
+ notify = kring->mon_notify;
+ } else {
+ /* we are no longer monitoring this ring, so both
+ * mon_sync and mon_notify are NULL
+ */
+ notify = kring->nm_notify;
+ }
nm_kr_put(kring);
-out:
- return kring->mon_notify(kring, flags);
+ return notify(kring, flags);
}
@@ -691,18 +721,25 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
struct nmreq pnmr;
struct netmap_adapter *pna; /* parent adapter */
struct netmap_monitor_adapter *mna;
+ struct ifnet *ifp = NULL;
int i, error;
enum txrx t;
int zcopy = (nmr->nr_flags & NR_ZCOPY_MON);
char monsuff[10] = "";
if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) {
+ if (nmr->nr_flags & NR_ZCOPY_MON) {
+ /* the flag makes no sense unless you are
+ * creating a monitor
+ */
+ return EINVAL;
+ }
ND("not a monitor");
return 0;
}
/* this is a request for a monitor adapter */
- D("flags %x", nmr->nr_flags);
+ ND("flags %x", nmr->nr_flags);
mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (mna == NULL) {
@@ -716,13 +753,14 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
* except other monitors.
*/
memcpy(&pnmr, nmr, sizeof(pnmr));
- pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX);
- error = netmap_get_na(&pnmr, &pna, create);
+ pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX | NR_ZCOPY_MON);
+ error = netmap_get_na(&pnmr, &pna, &ifp, create);
if (error) {
D("parent lookup failed: %d", error);
+ free(mna, M_DEVBUF);
return error;
}
- D("found parent: %s", pna->name);
+ ND("found parent: %s", pna->name);
if (!nm_netmap_on(pna)) {
/* parent not in netmap mode */
@@ -829,19 +867,17 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
*na = &mna->up;
netmap_adapter_get(*na);
- /* write the configuration back */
- nmr->nr_tx_rings = mna->up.num_tx_rings;
- nmr->nr_rx_rings = mna->up.num_rx_rings;
- nmr->nr_tx_slots = mna->up.num_tx_desc;
- nmr->nr_rx_slots = mna->up.num_rx_desc;
-
/* keep the reference to the parent */
- D("monitor ok");
+ ND("monitor ok");
+
+ /* drop the reference to the ifp, if any */
+ if (ifp)
+ if_rele(ifp);
return 0;
put_out:
- netmap_adapter_put(pna);
+ netmap_unget_na(pna, ifp);
free(mna, M_DEVBUF);
return error;
}
diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c
index dadc1dcbc14c..f8da672ffa53 100644
--- a/sys/dev/netmap/netmap_offloadings.c
+++ b/sys/dev/netmap/netmap_offloadings.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2014 Vincenzo Maffione. All rights reserved.
+ * Copyright (C) 2014-2015 Vincenzo Maffione
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,9 +32,9 @@
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h> /* defines used in kernel.h */
-#include <sys/malloc.h> /* types used in module initialization */
#include <sys/kernel.h> /* types used in module initialization */
#include <sys/sockio.h>
+#include <sys/malloc.h>
#include <sys/socketvar.h> /* struct socket */
#include <sys/socket.h> /* sockaddrs */
#include <net/if.h>
@@ -64,21 +65,21 @@
/* This routine is called by bdg_mismatch_datapath() when it finishes
* accumulating bytes for a segment, in order to fix some fields in the
* segment headers (which still contain the same content as the header
- * of the original GSO packet). 'buf' points to the beginning (e.g.
- * the ethernet header) of the segment, and 'len' is its length.
+ * of the original GSO packet). 'pkt' points to the beginning of the IP
+ * header of the segment, while 'len' is the length of the IP packet.
*/
-static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
- u_int segmented_bytes, u_int last_segment,
- u_int tcp, u_int iphlen)
+static void
+gso_fix_segment(uint8_t *pkt, size_t len, u_int ipv4, u_int iphlen, u_int tcp,
+ u_int idx, u_int segmented_bytes, u_int last_segment)
{
- struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14);
- struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14);
+ struct nm_iphdr *iph = (struct nm_iphdr *)(pkt);
+ struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(pkt);
uint16_t *check = NULL;
uint8_t *check_data = NULL;
- if (iphlen == 20) {
+ if (ipv4) {
/* Set the IPv4 "Total Length" field. */
- iph->tot_len = htobe16(len-14);
+ iph->tot_len = htobe16(len);
ND("ip total length %u", be16toh(ip->tot_len));
/* Set the IPv4 "Identification" field. */
@@ -87,15 +88,15 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
/* Compute and insert the IPv4 header checksum. */
iph->check = 0;
- iph->check = nm_csum_ipv4(iph);
+ iph->check = nm_os_csum_ipv4(iph);
ND("IP csum %x", be16toh(iph->check));
- } else {/* if (iphlen == 40) */
+ } else {
/* Set the IPv6 "Payload Len" field. */
- ip6h->payload_len = htobe16(len-14-iphlen);
+ ip6h->payload_len = htobe16(len-iphlen);
}
if (tcp) {
- struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen);
+ struct nm_tcphdr *tcph = (struct nm_tcphdr *)(pkt + iphlen);
/* Set the TCP sequence number. */
tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes);
@@ -110,10 +111,10 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
check = &tcph->check;
check_data = (uint8_t *)tcph;
} else { /* UDP */
- struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen);
+ struct nm_udphdr *udph = (struct nm_udphdr *)(pkt + iphlen);
/* Set the UDP 'Length' field. */
- udph->len = htobe16(len-14-iphlen);
+ udph->len = htobe16(len-iphlen);
check = &udph->check;
check_data = (uint8_t *)udph;
@@ -121,48 +122,80 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
/* Compute and insert TCP/UDP checksum. */
*check = 0;
- if (iphlen == 20)
- nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check);
+ if (ipv4)
+ nm_os_csum_tcpudp_ipv4(iph, check_data, len-iphlen, check);
else
- nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check);
+ nm_os_csum_tcpudp_ipv6(ip6h, check_data, len-iphlen, check);
ND("TCP/UDP csum %x", be16toh(*check));
}
+static int
+vnet_hdr_is_bad(struct nm_vnet_hdr *vh)
+{
+ uint8_t gso_type = vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
+
+ return (
+ (gso_type != VIRTIO_NET_HDR_GSO_NONE &&
+ gso_type != VIRTIO_NET_HDR_GSO_TCPV4 &&
+ gso_type != VIRTIO_NET_HDR_GSO_UDP &&
+ gso_type != VIRTIO_NET_HDR_GSO_TCPV6)
+ ||
+ (vh->flags & ~(VIRTIO_NET_HDR_F_NEEDS_CSUM
+ | VIRTIO_NET_HDR_F_DATA_VALID))
+ );
+}
/* The VALE mismatch datapath implementation. */
-void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
- struct netmap_vp_adapter *dst_na,
- struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
- u_int *j, u_int lim, u_int *howmany)
+void
+bdg_mismatch_datapath(struct netmap_vp_adapter *na,
+ struct netmap_vp_adapter *dst_na,
+ const struct nm_bdg_fwd *ft_p,
+ struct netmap_ring *dst_ring,
+ u_int *j, u_int lim, u_int *howmany)
{
- struct netmap_slot *slot = NULL;
+ struct netmap_slot *dst_slot = NULL;
struct nm_vnet_hdr *vh = NULL;
- /* Number of source slots to process. */
- u_int frags = ft_p->ft_frags;
- struct nm_bdg_fwd *ft_end = ft_p + frags;
+ const struct nm_bdg_fwd *ft_end = ft_p + ft_p->ft_frags;
/* Source and destination pointers. */
uint8_t *dst, *src;
size_t src_len, dst_len;
+ /* Indices and counters for the destination ring. */
u_int j_start = *j;
+ u_int j_cur = j_start;
u_int dst_slots = 0;
- /* If the source port uses the offloadings, while destination doesn't,
- * we grab the source virtio-net header and do the offloadings here.
- */
- if (na->virt_hdr_len && !dst_na->virt_hdr_len) {
- vh = (struct nm_vnet_hdr *)ft_p->ft_buf;
+ if (unlikely(ft_p == ft_end)) {
+ RD(3, "No source slots to process");
+ return;
}
/* Init source and dest pointers. */
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
- slot = &ring->slot[*j];
- dst = NMB(&dst_na->up, slot);
+ dst_slot = &dst_ring->slot[j_cur];
+ dst = NMB(&dst_na->up, dst_slot);
dst_len = src_len;
+ /* If the source port uses the offloadings, while destination doesn't,
+ * we grab the source virtio-net header and do the offloadings here.
+ */
+ if (na->up.virt_hdr_len && !dst_na->up.virt_hdr_len) {
+ vh = (struct nm_vnet_hdr *)src;
+ /* Initial sanity check on the source virtio-net header. If
+ * something seems wrong, just drop the packet. */
+ if (src_len < na->up.virt_hdr_len) {
+ RD(3, "Short src vnet header, dropping");
+ return;
+ }
+ if (vnet_hdr_is_bad(vh)) {
+ RD(3, "Bad src vnet header, dropping");
+ return;
+ }
+ }
+
/* We are processing the first input slot and there is a mismatch
* between source and destination virt_hdr_len (SHL and DHL).
* When the a client is using virtio-net headers, the header length
@@ -185,14 +218,14 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
* 12 | 0 | doesn't exist
* 12 | 10 | copied from the first 10 bytes of source header
*/
- bzero(dst, dst_na->virt_hdr_len);
- if (na->virt_hdr_len && dst_na->virt_hdr_len)
+ bzero(dst, dst_na->up.virt_hdr_len);
+ if (na->up.virt_hdr_len && dst_na->up.virt_hdr_len)
memcpy(dst, src, sizeof(struct nm_vnet_hdr));
/* Skip the virtio-net headers. */
- src += na->virt_hdr_len;
- src_len -= na->virt_hdr_len;
- dst += dst_na->virt_hdr_len;
- dst_len = dst_na->virt_hdr_len + src_len;
+ src += na->up.virt_hdr_len;
+ src_len -= na->up.virt_hdr_len;
+ dst += dst_na->up.virt_hdr_len;
+ dst_len = dst_na->up.virt_hdr_len + src_len;
/* Here it could be dst_len == 0 (which implies src_len == 0),
* so we avoid passing a zero length fragment.
@@ -214,16 +247,27 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
u_int gso_idx = 0;
/* Payload data bytes segmented so far (e.g. TCP data bytes). */
u_int segmented_bytes = 0;
+ /* Is this an IPv4 or IPv6 GSO packet? */
+ u_int ipv4 = 0;
/* Length of the IP header (20 if IPv4, 40 if IPv6). */
u_int iphlen = 0;
+ /* Length of the Ethernet header (18 if 802.1q, otherwise 14). */
+ u_int ethhlen = 14;
/* Is this a TCP or an UDP GSO packet? */
u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN)
== VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1;
/* Segment the GSO packet contained into the input slots (frags). */
- while (ft_p != ft_end) {
+ for (;;) {
size_t copy;
+ if (dst_slots >= *howmany) {
+ /* We still have work to do, but we've run out of
+ * dst slots, so we have to drop the packet. */
+ RD(3, "Not enough slots, dropping GSO packet");
+ return;
+ }
+
/* Grab the GSO header if we don't have it. */
if (!gso_hdr) {
uint16_t ethertype;
@@ -231,28 +275,75 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
gso_hdr = src;
/* Look at the 'Ethertype' field to see if this packet
- * is IPv4 or IPv6.
- */
- ethertype = be16toh(*((uint16_t *)(gso_hdr + 12)));
- if (ethertype == 0x0800)
- iphlen = 20;
- else /* if (ethertype == 0x86DD) */
- iphlen = 40;
+ * is IPv4 or IPv6, taking into account VLAN
+ * encapsulation. */
+ for (;;) {
+ if (src_len < ethhlen) {
+ RD(3, "Short GSO fragment [eth], dropping");
+ return;
+ }
+ ethertype = be16toh(*((uint16_t *)
+ (gso_hdr + ethhlen - 2)));
+ if (ethertype != 0x8100) /* not 802.1q */
+ break;
+ ethhlen += 4;
+ }
+ switch (ethertype) {
+ case 0x0800: /* IPv4 */
+ {
+ struct nm_iphdr *iph = (struct nm_iphdr *)
+ (gso_hdr + ethhlen);
+
+ if (src_len < ethhlen + 20) {
+ RD(3, "Short GSO fragment "
+ "[IPv4], dropping");
+ return;
+ }
+ ipv4 = 1;
+ iphlen = 4 * (iph->version_ihl & 0x0F);
+ break;
+ }
+ case 0x86DD: /* IPv6 */
+ ipv4 = 0;
+ iphlen = 40;
+ break;
+ default:
+ RD(3, "Unsupported ethertype, "
+ "dropping GSO packet");
+ return;
+ }
ND(3, "type=%04x", ethertype);
+ if (src_len < ethhlen + iphlen) {
+ RD(3, "Short GSO fragment [IP], dropping");
+ return;
+ }
+
/* Compute gso_hdr_len. For TCP we need to read the
* content of the 'Data Offset' field.
*/
if (tcp) {
- struct nm_tcphdr *tcph =
- (struct nm_tcphdr *)&gso_hdr[14+iphlen];
+ struct nm_tcphdr *tcph = (struct nm_tcphdr *)
+ (gso_hdr + ethhlen + iphlen);
- gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4);
- } else
- gso_hdr_len = 14 + iphlen + 8; /* UDP */
+ if (src_len < ethhlen + iphlen + 20) {
+ RD(3, "Short GSO fragment "
+ "[TCP], dropping");
+ return;
+ }
+ gso_hdr_len = ethhlen + iphlen +
+ 4 * (tcph->doff >> 4);
+ } else {
+ gso_hdr_len = ethhlen + iphlen + 8; /* UDP */
+ }
+
+ if (src_len < gso_hdr_len) {
+ RD(3, "Short GSO fragment [TCP/UDP], dropping");
+ return;
+ }
ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len,
- dst_na->mfs);
+ dst_na->mfs);
/* Advance source pointers. */
src += gso_hdr_len;
@@ -263,7 +354,6 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
break;
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
- continue;
}
}
@@ -289,25 +379,24 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
/* After raw segmentation, we must fix some header
* fields and compute checksums, in a protocol dependent
* way. */
- gso_fix_segment(dst, gso_bytes, gso_idx,
- segmented_bytes,
- src_len == 0 && ft_p + 1 == ft_end,
- tcp, iphlen);
+ gso_fix_segment(dst + ethhlen, gso_bytes - ethhlen,
+ ipv4, iphlen, tcp,
+ gso_idx, segmented_bytes,
+ src_len == 0 && ft_p + 1 == ft_end);
ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes);
- slot->len = gso_bytes;
- slot->flags = 0;
- segmented_bytes += gso_bytes - gso_hdr_len;
-
+ dst_slot->len = gso_bytes;
+ dst_slot->flags = 0;
dst_slots++;
-
- /* Next destination slot. */
- *j = nm_next(*j, lim);
- slot = &ring->slot[*j];
- dst = NMB(&dst_na->up, slot);
+ segmented_bytes += gso_bytes - gso_hdr_len;
gso_bytes = 0;
gso_idx++;
+
+ /* Next destination slot. */
+ j_cur = nm_next(j_cur, lim);
+ dst_slot = &dst_ring->slot[j_cur];
+ dst = NMB(&dst_na->up, dst_slot);
}
/* Next input slot. */
@@ -342,10 +431,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
/* Init/update the packet checksum if needed. */
if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
if (!dst_slots)
- csum = nm_csum_raw(src + vh->csum_start,
+ csum = nm_os_csum_raw(src + vh->csum_start,
src_len - vh->csum_start, 0);
else
- csum = nm_csum_raw(src, src_len, csum);
+ csum = nm_os_csum_raw(src, src_len, csum);
}
/* Round to a multiple of 64 */
@@ -359,44 +448,43 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
} else {
memcpy(dst, src, (int)src_len);
}
- slot->len = dst_len;
-
+ dst_slot->len = dst_len;
dst_slots++;
/* Next destination slot. */
- *j = nm_next(*j, lim);
- slot = &ring->slot[*j];
- dst = NMB(&dst_na->up, slot);
+ j_cur = nm_next(j_cur, lim);
+ dst_slot = &dst_ring->slot[j_cur];
+ dst = NMB(&dst_na->up, dst_slot);
/* Next source slot. */
ft_p++;
src = ft_p->ft_buf;
dst_len = src_len = ft_p->ft_len;
-
}
/* Finalize (fold) the checksum if needed. */
if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
- *check = nm_csum_fold(csum);
+ *check = nm_os_csum_fold(csum);
}
ND(3, "using %u dst_slots", dst_slots);
- /* A second pass on the desitations slots to set the slot flags,
+ /* A second pass on the destination slots to set the slot flags,
* using the right number of destination slots.
*/
- while (j_start != *j) {
- slot = &ring->slot[j_start];
- slot->flags = (dst_slots << 8)| NS_MOREFRAG;
+ while (j_start != j_cur) {
+ dst_slot = &dst_ring->slot[j_start];
+ dst_slot->flags = (dst_slots << 8)| NS_MOREFRAG;
j_start = nm_next(j_start, lim);
}
/* Clear NS_MOREFRAG flag on last entry. */
- slot->flags = (dst_slots << 8);
+ dst_slot->flags = (dst_slots << 8);
}
- /* Update howmany. */
+ /* Update howmany and j. This is to commit the use of
+ * those slots in the destination ring. */
if (unlikely(dst_slots > *howmany)) {
- dst_slots = *howmany;
- D("Slot allocation error: Should never happen");
+ D("Slot allocation error: This is a bug");
}
+ *j = j_cur;
*howmany -= dst_slots;
}
diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c
index 67e840248c88..f00f73f8b9b2 100644
--- a/sys/dev/netmap/netmap_pipe.c
+++ b/sys/dev/netmap/netmap_pipe.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2014-2016 Giuseppe Lettieri
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -54,6 +55,9 @@
#warning OSX support is only partial
#include "osx_glue.h"
+#elif defined(_WIN32)
+#include "win_glue.h"
+
#else
#error Unsupported platform
@@ -72,9 +76,11 @@
#define NM_PIPE_MAXSLOTS 4096
-int netmap_default_pipes = 0; /* ignored, kept for compatibility */
+static int netmap_default_pipes = 0; /* ignored, kept for compatibility */
+SYSBEGIN(vars_pipes);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , "");
+SYSEND;
/* allocate the pipe array in the parent adapter */
static int
@@ -86,12 +92,16 @@ nm_pipe_alloc(struct netmap_adapter *na, u_int npipes)
if (npipes <= na->na_max_pipes)
/* we already have more entries that requested */
return 0;
-
+
if (npipes < na->na_next_pipe || npipes > NM_MAXPIPES)
return EINVAL;
len = sizeof(struct netmap_pipe_adapter *) * npipes;
+#ifndef _WIN32
npa = realloc(na->na_pipes, len, M_DEVBUF, M_NOWAIT | M_ZERO);
+#else
+ npa = realloc(na->na_pipes, len, sizeof(struct netmap_pipe_adapter *)*na->na_max_pipes);
+#endif
if (npa == NULL)
return ENOMEM;
@@ -199,7 +209,7 @@ netmap_pipe_txsync(struct netmap_kring *txkring, int flags)
}
while (limit-- > 0) {
- struct netmap_slot *rs = &rxkring->save_ring->slot[j];
+ struct netmap_slot *rs = &rxkring->ring->slot[j];
struct netmap_slot *ts = &txkring->ring->slot[k];
struct netmap_slot tmp;
@@ -295,7 +305,7 @@ netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags)
* usr1 --> e1 --> e2
*
* and we are e2. e1 is certainly registered and our
- * krings already exist, but they may be hidden.
+ * krings already exist. Nothing to do.
*/
static int
netmap_pipe_krings_create(struct netmap_adapter *na)
@@ -310,65 +320,28 @@ netmap_pipe_krings_create(struct netmap_adapter *na)
int i;
/* case 1) above */
- ND("%p: case 1, create everything", na);
+ D("%p: case 1, create both ends", na);
error = netmap_krings_create(na, 0);
if (error)
goto err;
- /* we also create all the rings, since we need to
- * update the save_ring pointers.
- * netmap_mem_rings_create (called by our caller)
- * will not create the rings again
- */
-
- error = netmap_mem_rings_create(na);
- if (error)
- goto del_krings1;
-
- /* update our hidden ring pointers */
- for_rx_tx(t) {
- for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
- NMR(na, t)[i].save_ring = NMR(na, t)[i].ring;
- }
-
- /* now, create krings and rings of the other end */
+ /* create the krings of the other end */
error = netmap_krings_create(ona, 0);
if (error)
- goto del_rings1;
-
- error = netmap_mem_rings_create(ona);
- if (error)
- goto del_krings2;
-
- for_rx_tx(t) {
- for (i = 0; i < nma_get_nrings(ona, t) + 1; i++)
- NMR(ona, t)[i].save_ring = NMR(ona, t)[i].ring;
- }
+ goto del_krings1;
/* cross link the krings */
for_rx_tx(t) {
- enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
+ enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
for (i = 0; i < nma_get_nrings(na, t); i++) {
NMR(na, t)[i].pipe = NMR(&pna->peer->up, r) + i;
NMR(&pna->peer->up, r)[i].pipe = NMR(na, t) + i;
}
}
- } else {
- int i;
- /* case 2) above */
- /* recover the hidden rings */
- ND("%p: case 2, hidden rings", na);
- for_rx_tx(t) {
- for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
- NMR(na, t)[i].ring = NMR(na, t)[i].save_ring;
- }
+
}
return 0;
-del_krings2:
- netmap_krings_delete(ona);
-del_rings1:
- netmap_mem_rings_delete(na);
del_krings1:
netmap_krings_delete(na);
err:
@@ -383,7 +356,8 @@ err:
*
* usr1 --> e1 --> e2
*
- * and we are e1. Nothing special to do.
+ * and we are e1. Create the needed rings of the
+ * other end.
*
* 1.b) state is
*
@@ -412,14 +386,65 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
+ struct netmap_adapter *ona = &pna->peer->up;
+ int i, error = 0;
enum txrx t;
ND("%p: onoff %d", na, onoff);
if (onoff) {
- na->na_flags |= NAF_NETMAP_ON;
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+
+ if (nm_kring_pending_on(kring)) {
+ /* mark the partner ring as needed */
+ kring->pipe->nr_kflags |= NKR_NEEDRING;
+ }
+ }
+ }
+
+ /* create all missing needed rings on the other end */
+ error = netmap_mem_rings_create(ona);
+ if (error)
+ return error;
+
+ /* In case of no error we put our rings in netmap mode */
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+
+ if (nm_kring_pending_on(kring)) {
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+ }
+ }
+ if (na->active_fds == 0)
+ na->na_flags |= NAF_NETMAP_ON;
} else {
- na->na_flags &= ~NAF_NETMAP_ON;
+ if (na->active_fds == 0)
+ na->na_flags &= ~NAF_NETMAP_ON;
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+
+ if (nm_kring_pending_off(kring)) {
+ kring->nr_mode = NKR_NETMAP_OFF;
+ /* mark the peer ring as no longer needed by us
+ * (it may still be kept if sombody else is using it)
+ */
+ kring->pipe->nr_kflags &= ~NKR_NEEDRING;
+ }
+ }
+ }
+ /* delete all the peer rings that are no longer needed */
+ netmap_mem_rings_delete(ona);
}
+
+ if (na->active_fds) {
+ D("active_fds %d", na->active_fds);
+ return 0;
+ }
+
if (pna->peer_ref) {
ND("%p: case 1.a or 2.a, nothing to do", na);
return 0;
@@ -429,18 +454,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff)
pna->peer->peer_ref = 0;
netmap_adapter_put(na);
} else {
- int i;
ND("%p: case 2.b, grab peer", na);
netmap_adapter_get(na);
pna->peer->peer_ref = 1;
- /* hide our rings from netmap_mem_rings_delete */
- for_rx_tx(t) {
- for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
- NMR(na, t)[i].ring = NULL;
- }
- }
}
- return 0;
+ return error;
}
/* netmap_pipe_krings_delete.
@@ -470,8 +488,6 @@ netmap_pipe_krings_delete(struct netmap_adapter *na)
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
struct netmap_adapter *ona; /* na of the other end */
- int i;
- enum txrx t;
if (!pna->peer_ref) {
ND("%p: case 2, kept alive by peer", na);
@@ -480,18 +496,12 @@ netmap_pipe_krings_delete(struct netmap_adapter *na)
/* case 1) above */
ND("%p: case 1, deleting everyhing", na);
netmap_krings_delete(na); /* also zeroes tx_rings etc. */
- /* restore the ring to be deleted on the peer */
ona = &pna->peer->up;
if (ona->tx_rings == NULL) {
/* already deleted, we must be on an
* cleanup-after-error path */
return;
}
- for_rx_tx(t) {
- for (i = 0; i < nma_get_nrings(ona, t) + 1; i++)
- NMR(ona, t)[i].ring = NMR(ona, t)[i].save_ring;
- }
- netmap_mem_rings_delete(ona);
netmap_krings_delete(ona);
}
@@ -519,6 +529,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
struct nmreq pnmr;
struct netmap_adapter *pna; /* parent adapter */
struct netmap_pipe_adapter *mna, *sna, *req;
+ struct ifnet *ifp = NULL;
u_int pipe_id;
int role = nmr->nr_flags & NR_REG_MASK;
int error;
@@ -536,7 +547,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
/* pass to parent the requested number of pipes */
pnmr.nr_arg1 = nmr->nr_arg1;
- error = netmap_get_na(&pnmr, &pna, create);
+ error = netmap_get_na(&pnmr, &pna, &ifp, create);
if (error) {
ND("parent lookup failed: %d", error);
return error;
@@ -652,16 +663,15 @@ found:
*na = &req->up;
netmap_adapter_get(*na);
- /* write the configuration back */
- nmr->nr_tx_rings = req->up.num_tx_rings;
- nmr->nr_rx_rings = req->up.num_rx_rings;
- nmr->nr_tx_slots = req->up.num_tx_desc;
- nmr->nr_rx_slots = req->up.num_rx_desc;
-
/* keep the reference to the parent.
* It will be released by the req destructor
*/
+ /* drop the ifp reference, if any */
+ if (ifp) {
+ if_rele(ifp);
+ }
+
return 0;
free_sna:
@@ -671,7 +681,7 @@ unregister_mna:
free_mna:
free(mna, M_DEVBUF);
put_out:
- netmap_adapter_put(pna);
+ netmap_unget_na(pna, ifp);
return error;
}
diff --git a/sys/dev/netmap/netmap_pt.c b/sys/dev/netmap/netmap_pt.c
new file mode 100644
index 000000000000..56434a236145
--- /dev/null
+++ b/sys/dev/netmap/netmap_pt.c
@@ -0,0 +1,1438 @@
+/*
+ * Copyright (C) 2015 Stefano Garzarella
+ * Copyright (C) 2016 Vincenzo Maffione
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * common headers
+ */
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/selinfo.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h>
+
+//#define usleep_range(_1, _2)
+#define usleep_range(_1, _2) \
+ pause_sbt("ptnetmap-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)
+
+#elif defined(linux)
+#include <bsd_glue.h>
+#endif
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <net/netmap_virt.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#ifdef WITH_PTNETMAP_HOST
+
+/* RX cycle without receive any packets */
+#define PTN_RX_DRY_CYCLES_MAX 10
+
+/* Limit Batch TX to half ring.
+ * Currently disabled, since it does not manage NS_MOREFRAG, which
+ * results in random drops in the VALE txsync. */
+//#define PTN_TX_BATCH_LIM(_n) ((_n >> 1))
+
+//#define BUSY_WAIT
+
+#define NETMAP_PT_DEBUG /* Enables communication debugging. */
+#ifdef NETMAP_PT_DEBUG
+#define DBG(x) x
+#else
+#define DBG(x)
+#endif
+
+
+#undef RATE
+//#define RATE /* Enables communication statistics. */
+#ifdef RATE
+#define IFRATE(x) x
+struct rate_batch_stats {
+ unsigned long sync;
+ unsigned long sync_dry;
+ unsigned long pkt;
+};
+
+struct rate_stats {
+ unsigned long gtxk; /* Guest --> Host Tx kicks. */
+ unsigned long grxk; /* Guest --> Host Rx kicks. */
+ unsigned long htxk; /* Host --> Guest Tx kicks. */
+ unsigned long hrxk; /* Host --> Guest Rx Kicks. */
+ unsigned long btxwu; /* Backend Tx wake-up. */
+ unsigned long brxwu; /* Backend Rx wake-up. */
+ struct rate_batch_stats txbs;
+ struct rate_batch_stats rxbs;
+};
+
+struct rate_context {
+ struct timer_list timer;
+ struct rate_stats new;
+ struct rate_stats old;
+};
+
+#define RATE_PERIOD 2
+static void
+rate_callback(unsigned long arg)
+{
+ struct rate_context * ctx = (struct rate_context *)arg;
+ struct rate_stats cur = ctx->new;
+ struct rate_batch_stats *txbs = &cur.txbs;
+ struct rate_batch_stats *rxbs = &cur.rxbs;
+ struct rate_batch_stats *txbs_old = &ctx->old.txbs;
+ struct rate_batch_stats *rxbs_old = &ctx->old.rxbs;
+ uint64_t tx_batch, rx_batch;
+ unsigned long txpkts, rxpkts;
+ unsigned long gtxk, grxk;
+ int r;
+
+ txpkts = txbs->pkt - txbs_old->pkt;
+ rxpkts = rxbs->pkt - rxbs_old->pkt;
+
+ tx_batch = ((txbs->sync - txbs_old->sync) > 0) ?
+ txpkts / (txbs->sync - txbs_old->sync): 0;
+ rx_batch = ((rxbs->sync - rxbs_old->sync) > 0) ?
+ rxpkts / (rxbs->sync - rxbs_old->sync): 0;
+
+ /* Fix-up gtxk and grxk estimates. */
+ gtxk = (cur.gtxk - ctx->old.gtxk) - (cur.btxwu - ctx->old.btxwu);
+ grxk = (cur.grxk - ctx->old.grxk) - (cur.brxwu - ctx->old.brxwu);
+
+ printk("txpkts = %lu Hz\n", txpkts/RATE_PERIOD);
+ printk("gtxk = %lu Hz\n", gtxk/RATE_PERIOD);
+ printk("htxk = %lu Hz\n", (cur.htxk - ctx->old.htxk)/RATE_PERIOD);
+ printk("btxw = %lu Hz\n", (cur.btxwu - ctx->old.btxwu)/RATE_PERIOD);
+ printk("rxpkts = %lu Hz\n", rxpkts/RATE_PERIOD);
+ printk("grxk = %lu Hz\n", grxk/RATE_PERIOD);
+ printk("hrxk = %lu Hz\n", (cur.hrxk - ctx->old.hrxk)/RATE_PERIOD);
+ printk("brxw = %lu Hz\n", (cur.brxwu - ctx->old.brxwu)/RATE_PERIOD);
+ printk("txbatch = %llu avg\n", tx_batch);
+ printk("rxbatch = %llu avg\n", rx_batch);
+ printk("\n");
+
+ ctx->old = cur;
+ r = mod_timer(&ctx->timer, jiffies +
+ msecs_to_jiffies(RATE_PERIOD * 1000));
+ if (unlikely(r))
+ D("[ptnetmap] Error: mod_timer()\n");
+}
+
+static void
+rate_batch_stats_update(struct rate_batch_stats *bf, uint32_t pre_tail,
+ uint32_t act_tail, uint32_t num_slots)
+{
+ int n = (int)act_tail - pre_tail;
+
+ if (n) {
+ if (n < 0)
+ n += num_slots;
+
+ bf->sync++;
+ bf->pkt += n;
+ } else {
+ bf->sync_dry++;
+ }
+}
+
+#else /* !RATE */
+#define IFRATE(x)
+#endif /* RATE */
+
+struct ptnetmap_state {
+ /* Kthreads. */
+ struct nm_kthread **kthreads;
+
+ /* Shared memory with the guest (TX/RX) */
+ struct ptnet_ring __user *ptrings;
+
+ bool stopped;
+
+ /* Netmap adapter wrapping the backend. */
+ struct netmap_pt_host_adapter *pth_na;
+
+ IFRATE(struct rate_context rate_ctx;)
+};
+
+static inline void
+ptnetmap_kring_dump(const char *title, const struct netmap_kring *kring)
+{
+ RD(1, "%s - name: %s hwcur: %d hwtail: %d rhead: %d rcur: %d \
+ rtail: %d head: %d cur: %d tail: %d",
+ title, kring->name, kring->nr_hwcur,
+ kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail,
+ kring->ring->head, kring->ring->cur, kring->ring->tail);
+}
+
+/*
+ * TX functions to set/get and to handle host/guest kick.
+ */
+
+
+/* Enable or disable guest --> host kicks. */
+static inline void
+ptring_kick_enable(struct ptnet_ring __user *ptring, uint32_t val)
+{
+ CSB_WRITE(ptring, host_need_kick, val);
+}
+
+/* Are guest interrupt enabled or disabled? */
+static inline uint32_t
+ptring_intr_enabled(struct ptnet_ring __user *ptring)
+{
+ uint32_t v;
+
+ CSB_READ(ptring, guest_need_kick, v);
+
+ return v;
+}
+
+/* Enable or disable guest interrupts. */
+static inline void
+ptring_intr_enable(struct ptnet_ring __user *ptring, uint32_t val)
+{
+ CSB_WRITE(ptring, guest_need_kick, val);
+}
+
+/* Handle TX events: from the guest or from the backend */
+static void
+ptnetmap_tx_handler(void *data)
+{
+ struct netmap_kring *kring = data;
+ struct netmap_pt_host_adapter *pth_na =
+ (struct netmap_pt_host_adapter *)kring->na->na_private;
+ struct ptnetmap_state *ptns = pth_na->ptns;
+ struct ptnet_ring __user *ptring;
+ struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
+ bool more_txspace = false;
+ struct nm_kthread *kth;
+ uint32_t num_slots;
+ int batch;
+ IFRATE(uint32_t pre_tail);
+
+ if (unlikely(!ptns)) {
+ D("ERROR ptnetmap state is NULL");
+ return;
+ }
+
+ if (unlikely(ptns->stopped)) {
+ RD(1, "backend netmap is being stopped");
+ return;
+ }
+
+ if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
+ D("ERROR nm_kr_tryget()");
+ return;
+ }
+
+ /* This is a guess, to be fixed in the rate callback. */
+ IFRATE(ptns->rate_ctx.new.gtxk++);
+
+ /* Get TX ptring pointer from the CSB. */
+ ptring = ptns->ptrings + kring->ring_id;
+ kth = ptns->kthreads[kring->ring_id];
+
+ num_slots = kring->nkr_num_slots;
+ shadow_ring.head = kring->rhead;
+ shadow_ring.cur = kring->rcur;
+
+ /* Disable guest --> host notifications. */
+ ptring_kick_enable(ptring, 0);
+ /* Copy the guest kring pointers from the CSB */
+ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+
+ for (;;) {
+ /* If guest moves ahead too fast, let's cut the move so
+ * that we don't exceed our batch limit. */
+ batch = shadow_ring.head - kring->nr_hwcur;
+ if (batch < 0)
+ batch += num_slots;
+
+#ifdef PTN_TX_BATCH_LIM
+ if (batch > PTN_TX_BATCH_LIM(num_slots)) {
+ uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots);
+
+ if (head_lim >= num_slots)
+ head_lim -= num_slots;
+ ND(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head,
+ head_lim);
+ shadow_ring.head = head_lim;
+ batch = PTN_TX_BATCH_LIM(num_slots);
+ }
+#endif /* PTN_TX_BATCH_LIM */
+
+ if (nm_kr_txspace(kring) <= (num_slots >> 1)) {
+ shadow_ring.flags |= NAF_FORCE_RECLAIM;
+ }
+
+ /* Netmap prologue */
+ shadow_ring.tail = kring->rtail;
+ if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) {
+ /* Reinit ring and enable notifications. */
+ netmap_ring_reinit(kring);
+ ptring_kick_enable(ptring, 1);
+ break;
+ }
+
+ if (unlikely(netmap_verbose & NM_VERB_TXSYNC)) {
+ ptnetmap_kring_dump("pre txsync", kring);
+ }
+
+ IFRATE(pre_tail = kring->rtail);
+ if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
+ /* Reenable notifications. */
+ ptring_kick_enable(ptring, 1);
+ D("ERROR txsync()");
+ break;
+ }
+
+ /*
+ * Finalize
+ * Copy host hwcur and hwtail into the CSB for the guest sync(), and
+ * do the nm_sync_finalize.
+ */
+ ptnetmap_host_write_kring_csb(ptring, kring->nr_hwcur,
+ kring->nr_hwtail);
+ if (kring->rtail != kring->nr_hwtail) {
+ /* Some more room available in the parent adapter. */
+ kring->rtail = kring->nr_hwtail;
+ more_txspace = true;
+ }
+
+ IFRATE(rate_batch_stats_update(&ptns->rate_ctx.new.txbs, pre_tail,
+ kring->rtail, num_slots));
+
+ if (unlikely(netmap_verbose & NM_VERB_TXSYNC)) {
+ ptnetmap_kring_dump("post txsync", kring);
+ }
+
+#ifndef BUSY_WAIT
+ /* Interrupt the guest if needed. */
+ if (more_txspace && ptring_intr_enabled(ptring)) {
+ /* Disable guest kick to avoid sending unnecessary kicks */
+ ptring_intr_enable(ptring, 0);
+ nm_os_kthread_send_irq(kth);
+ IFRATE(ptns->rate_ctx.new.htxk++);
+ more_txspace = false;
+ }
+#endif
+ /* Read CSB to see if there is more work to do. */
+ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+#ifndef BUSY_WAIT
+ if (shadow_ring.head == kring->rhead) {
+ /*
+ * No more packets to transmit. We enable notifications and
+ * go to sleep, waiting for a kick from the guest when new
+ * new slots are ready for transmission.
+ */
+ usleep_range(1,1);
+ /* Reenable notifications. */
+ ptring_kick_enable(ptring, 1);
+ /* Doublecheck. */
+ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+ if (shadow_ring.head != kring->rhead) {
+ /* We won the race condition, there are more packets to
+ * transmit. Disable notifications and do another cycle */
+ ptring_kick_enable(ptring, 0);
+ continue;
+ }
+ break;
+ }
+
+ if (nm_kr_txempty(kring)) {
+ /* No more available TX slots. We stop waiting for a notification
+ * from the backend (netmap_tx_irq). */
+ ND(1, "TX ring");
+ break;
+ }
+#endif
+ if (unlikely(ptns->stopped)) {
+ D("backend netmap is being stopped");
+ break;
+ }
+ }
+
+ nm_kr_put(kring);
+
+ if (more_txspace && ptring_intr_enabled(ptring)) {
+ ptring_intr_enable(ptring, 0);
+ nm_os_kthread_send_irq(kth);
+ IFRATE(ptns->rate_ctx.new.htxk++);
+ }
+}
+
+/*
+ * We need RX kicks from the guest when (tail == head-1), where we wait
+ * for the guest to refill.
+ */
+#ifndef BUSY_WAIT
+static inline int
+ptnetmap_norxslots(struct netmap_kring *kring, uint32_t g_head)
+{
+	/* True when hwtail has caught up with the slot just before the
+	 * guest head, i.e. no free RX slots are left for the backend. */
+	return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head,
+				kring->nkr_num_slots - 1));
+}
+#endif /* !BUSY_WAIT */
+
+/* Handle RX events: from the guest or from the backend.
+ *
+ * Worker body of the per-RX-ring host kthread. It loops doing
+ * rxsync on behalf of the guest, mirroring kring state to/from the
+ * shared CSB, until there are neither new receive slots from the
+ * guest nor new packets from the backend. */
+static void
+ptnetmap_rx_handler(void *data)
+{
+	struct netmap_kring *kring = data;
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)kring->na->na_private;
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	struct ptnet_ring __user *ptring;
+	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
+	struct nm_kthread *kth;
+	uint32_t num_slots;
+	int dry_cycles = 0;	/* consecutive rxsyncs that produced nothing */
+	bool some_recvd = false;
+	IFRATE(uint32_t pre_tail);
+
+	if (unlikely(!ptns || !ptns->pth_na)) {
+		D("ERROR ptnetmap state %p, ptnetmap host adapter %p", ptns,
+		  ptns ? ptns->pth_na : NULL);
+		return;
+	}
+
+	if (unlikely(ptns->stopped)) {
+		RD(1, "backend netmap is being stopped");
+		return;
+	}
+
+	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
+		D("ERROR nm_kr_tryget()");
+		return;
+	}
+
+	/* This is a guess, to be fixed in the rate callback. */
+	IFRATE(ptns->rate_ctx.new.grxk++);
+
+	/* Get RX ptring pointer from the CSB (RX rings follow the TX ones). */
+	ptring = ptns->ptrings + (pth_na->up.num_tx_rings + kring->ring_id);
+	kth = ptns->kthreads[pth_na->up.num_tx_rings + kring->ring_id];
+
+	num_slots = kring->nkr_num_slots;
+	shadow_ring.head = kring->rhead;
+	shadow_ring.cur = kring->rcur;
+
+	/* Disable notifications. */
+	ptring_kick_enable(ptring, 0);
+	/* Copy the guest kring pointers from the CSB */
+	ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+
+	for (;;) {
+		uint32_t hwtail;
+
+		/* Netmap prologue */
+		shadow_ring.tail = kring->rtail;
+		if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) {
+			/* Reinit ring and enable notifications. */
+			netmap_ring_reinit(kring);
+			ptring_kick_enable(ptring, 1);
+			break;
+		}
+
+		if (unlikely(netmap_verbose & NM_VERB_RXSYNC)) {
+			ptnetmap_kring_dump("pre rxsync", kring);
+		}
+
+		IFRATE(pre_tail = kring->rtail);
+		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
+			/* Reenable notifications. */
+			ptring_kick_enable(ptring, 1);
+			D("ERROR rxsync()");
+			break;
+		}
+		/*
+		 * Finalize
+		 * Copy host hwcur and hwtail into the CSB for the guest sync()
+		 */
+		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
+		ptnetmap_host_write_kring_csb(ptring, kring->nr_hwcur, hwtail);
+		if (kring->rtail != hwtail) {
+			kring->rtail = hwtail;
+			some_recvd = true;
+			dry_cycles = 0;
+		} else {
+			dry_cycles++;
+		}
+
+		IFRATE(rate_batch_stats_update(&ptns->rate_ctx.new.rxbs, pre_tail,
+					kring->rtail, num_slots));
+
+		if (unlikely(netmap_verbose & NM_VERB_RXSYNC)) {
+			ptnetmap_kring_dump("post rxsync", kring);
+		}
+
+#ifndef BUSY_WAIT
+		/* Interrupt the guest if needed. */
+		if (some_recvd && ptring_intr_enabled(ptring)) {
+			/* Disable guest kick to avoid sending unnecessary kicks */
+			ptring_intr_enable(ptring, 0);
+			nm_os_kthread_send_irq(kth);
+			IFRATE(ptns->rate_ctx.new.hrxk++);
+			some_recvd = false;
+		}
+#endif
+		/* Read CSB to see if there is more work to do. */
+		ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+#ifndef BUSY_WAIT
+		if (ptnetmap_norxslots(kring, shadow_ring.head)) {
+			/*
+			 * No more slots available for reception. We enable notification and
+			 * go to sleep, waiting for a kick from the guest when new receive
+			 * slots are available.
+			 */
+			usleep_range(1,1);
+			/* Reenable notifications. */
+			ptring_kick_enable(ptring, 1);
+			/* Doublecheck (closes the race with a concurrent guest kick). */
+			ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots);
+			if (!ptnetmap_norxslots(kring, shadow_ring.head)) {
+				/* We won the race condition, more slots are available. Disable
+				 * notifications and do another cycle. */
+				ptring_kick_enable(ptring, 0);
+				continue;
+			}
+			break;
+		}
+
+		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
+		if (unlikely(hwtail == kring->rhead ||
+			     dry_cycles >= PTN_RX_DRY_CYCLES_MAX)) {
+			/* No more packets to be read from the backend. We stop and
+			 * wait for a notification from the backend (netmap_rx_irq). */
+			ND(1, "nr_hwtail: %d rhead: %d dry_cycles: %d",
+			   hwtail, kring->rhead, dry_cycles);
+			break;
+		}
+#endif
+		if (unlikely(ptns->stopped)) {
+			D("backend netmap is being stopped");
+			break;
+		}
+	}
+
+	nm_kr_put(kring);
+
+	/* Interrupt the guest if needed. */
+	if (some_recvd && ptring_intr_enabled(ptring)) {
+		ptring_intr_enable(ptring, 0);
+		nm_os_kthread_send_irq(kth);
+		IFRATE(ptns->rate_ctx.new.hrxk++);
+	}
+}
+
+#ifdef NETMAP_PT_DEBUG
+/* Debug helper: dump the hypervisor-provided ptnetmap configuration
+ * (CSB location, ring count, feature flags, per-ring eventfds). */
+static void
+ptnetmap_print_configuration(struct ptnetmap_cfg *cfg)
+{
+	int k;
+
+	D("[PTN] configuration:");
+	D("  CSB ptrings @%p, num_rings=%u, features %08x", cfg->ptrings,
+	  cfg->num_rings, cfg->features);
+	for (k = 0; k < cfg->num_rings; k++) {
+		D("    ring #%d: iofd=%llu, irqfd=%llu", k,
+		  (unsigned long long)cfg->entries[k].ioeventfd,
+		  (unsigned long long)cfg->entries[k].irqfd);
+	}
+
+}
+#endif /* NETMAP_PT_DEBUG */
+
+/* Copy actual state of the host ring into the CSB for the guest init.
+ * Returns 0 on success, EFAULT if any userspace write fails. */
+static int
+ptnetmap_kring_snapshot(struct netmap_kring *kring, struct ptnet_ring __user *ptring)
+{
+	/* Guest-visible pointers: head/cur mirror the current host view. */
+	if(CSB_WRITE(ptring, head, kring->rhead))
+		goto err;
+	if(CSB_WRITE(ptring, cur, kring->rcur))
+		goto err;
+
+	/* Host-side state: hwcur/hwtail let the guest compute ring space. */
+	if(CSB_WRITE(ptring, hwcur, kring->nr_hwcur))
+		goto err;
+	if(CSB_WRITE(ptring, hwtail, NM_ACCESS_ONCE(kring->nr_hwtail)))
+		goto err;
+
+	DBG(ptnetmap_kring_dump("ptnetmap_kring_snapshot", kring);)
+
+	return 0;
+err:
+	return EFAULT;
+}
+
+/* Map a flat ring index k to the corresponding kring: indices
+ * [0, num_tx_rings) are TX krings, the rest are RX krings. */
+static struct netmap_kring *
+ptnetmap_kring(struct netmap_pt_host_adapter *pth_na, int k)
+{
+	if (k < pth_na->up.num_tx_rings) {
+		return pth_na->up.tx_rings + k;
+	}
+	return pth_na->up.rx_rings + k - pth_na->up.num_tx_rings;
+}
+
+/* Snapshot every TX and RX kring into the corresponding CSB ptring.
+ * Returns nonzero (EFAULT) if any of the snapshots failed; errors are
+ * OR-ed together so all rings are attempted regardless of failures. */
+static int
+ptnetmap_krings_snapshot(struct netmap_pt_host_adapter *pth_na)
+{
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	struct netmap_kring *kring;
+	unsigned int num_rings;
+	int err = 0, k;
+
+	num_rings = pth_na->up.num_tx_rings +
+		pth_na->up.num_rx_rings;
+
+	for (k = 0; k < num_rings; k++) {
+		kring = ptnetmap_kring(pth_na, k);
+		err |= ptnetmap_kring_snapshot(kring, ptns->ptrings + k);
+	}
+
+	return err;
+}
+
+/*
+ * Functions to create, start and stop the kthreads
+ */
+
+/* Create one host kthread per ring (TX handlers first, then RX
+ * handlers), each attached to the user process and bound to its
+ * ring's eventfd pair from the hypervisor configuration. On failure,
+ * all kthreads created so far are deleted and EFAULT is returned. */
+static int
+ptnetmap_create_kthreads(struct netmap_pt_host_adapter *pth_na,
+			 struct ptnetmap_cfg *cfg)
+{
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	struct nm_kthread_cfg nmk_cfg;
+	unsigned int num_rings;
+	int k;
+
+	num_rings = pth_na->up.num_tx_rings +
+		pth_na->up.num_rx_rings;
+
+	for (k = 0; k < num_rings; k++) {
+		nmk_cfg.attach_user = 1; /* attach kthread to user process */
+		nmk_cfg.worker_private = ptnetmap_kring(pth_na, k);
+		nmk_cfg.event = *(cfg->entries + k);
+		nmk_cfg.type = k;
+		if (k < pth_na->up.num_tx_rings) {
+			nmk_cfg.worker_fn = ptnetmap_tx_handler;
+		} else {
+			nmk_cfg.worker_fn = ptnetmap_rx_handler;
+		}
+
+		ptns->kthreads[k] = nm_os_kthread_create(&nmk_cfg);
+		if (ptns->kthreads[k] == NULL) {
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	/* Tear down the kthreads created before the failure. */
+	for (k = 0; k < num_rings; k++) {
+		if (ptns->kthreads[k]) {
+			nm_os_kthread_delete(ptns->kthreads[k]);
+			ptns->kthreads[k] = NULL;
+		}
+	}
+	return EFAULT;
+}
+
+/* Start all the per-ring kthreads and clear the stopped flag.
+ * Returns 0 on success or the error of the first kthread that
+ * failed to start (already-started kthreads are left running;
+ * the caller is expected to tear down via ptnetmap_delete()). */
+static int
+ptnetmap_start_kthreads(struct netmap_pt_host_adapter *pth_na)
+{
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	int num_rings;
+	int error;
+	int k;
+
+	if (!ptns) {
+		D("BUG ptns is NULL");
+		return EFAULT;
+	}
+
+	ptns->stopped = false;
+
+	num_rings = ptns->pth_na->up.num_tx_rings +
+		    ptns->pth_na->up.num_rx_rings;
+	for (k = 0; k < num_rings; k++) {
+		//nm_os_kthread_set_affinity(ptns->kthreads[k], xxx);
+		error = nm_os_kthread_start(ptns->kthreads[k]);
+		if (error) {
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Set the stopped flag (read by the worker loops) and stop all the
+ * per-ring kthreads. Safe to call when ptnetmap mode is not active. */
+static void
+ptnetmap_stop_kthreads(struct netmap_pt_host_adapter *pth_na)
+{
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	int num_rings;
+	int k;
+
+	if (!ptns) {
+		/* Nothing to do. */
+		return;
+	}
+
+	ptns->stopped = true;
+
+	num_rings = ptns->pth_na->up.num_tx_rings +
+		    ptns->pth_na->up.num_rx_rings;
+	for (k = 0; k < num_rings; k++) {
+		nm_os_kthread_stop(ptns->kthreads[k]);
+	}
+}
+
+/* Copy the ptnetmap configuration from userspace. The configuration
+ * is variable-sized: a fixed header followed by cfg->num_rings
+ * per-ring entries. Returns a malloc'ed copy (owned by the caller,
+ * to be freed with free(cfg, M_DEVBUF)), or NULL on failure. */
+static struct ptnetmap_cfg *
+ptnetmap_read_cfg(struct nmreq *nmr)
+{
+	uintptr_t *nmr_ptncfg = (uintptr_t *)&nmr->nr_arg1;
+	struct ptnetmap_cfg *cfg;
+	struct ptnetmap_cfg tmp;
+	size_t cfglen;
+
+	/* First read the fixed-size header only, to learn how many
+	 * per-ring entries follow it. */
+	if (copyin((const void *)*nmr_ptncfg, &tmp, sizeof(tmp))) {
+		D("Partial copyin() failed");
+		return NULL;
+	}
+
+	/* num_rings comes from userspace and is untrusted: bound it so
+	 * that the allocation size computation below cannot overflow,
+	 * which would cause a short allocation followed by a long
+	 * copyin (heap overflow). */
+	if (tmp.num_rings > ((size_t)-1 - sizeof(tmp)) /
+			    sizeof(struct ptnet_ring_cfg)) {
+		D("Too many rings (%u) in the configuration", tmp.num_rings);
+		return NULL;
+	}
+
+	cfglen = sizeof(tmp) + tmp.num_rings * sizeof(struct ptnet_ring_cfg);
+	cfg = malloc(cfglen, M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!cfg) {
+		return NULL;
+	}
+
+	/* Now read the whole configuration, ring entries included. */
+	if (copyin((const void *)*nmr_ptncfg, cfg, cfglen)) {
+		D("Full copyin() failed");
+		free(cfg, M_DEVBUF);
+		return NULL;
+	}
+
+	return cfg;
+}
+
+static int nm_unused_notify(struct netmap_kring *, int);
+static int nm_pt_host_notify(struct netmap_kring *, int);
+
+/* Create ptnetmap state and switch parent adapter to ptnetmap mode.
+ * Validates the hypervisor configuration, allocates the state (with
+ * the per-ring kthread pointer array in the same allocation), creates
+ * the kthreads, snapshots the krings into the CSB and overrides the
+ * parent's notify callbacks. Returns 0 or a positive errno. */
+static int
+ptnetmap_create(struct netmap_pt_host_adapter *pth_na,
+		struct ptnetmap_cfg *cfg)
+{
+	unsigned ft_mask = (PTNETMAP_CFG_FEAT_CSB | PTNETMAP_CFG_FEAT_EVENTFD);
+	struct ptnetmap_state *ptns;
+	unsigned int num_rings;
+	int ret, i;
+
+	/* Check if ptnetmap state is already there. */
+	if (pth_na->ptns) {
+		D("ERROR adapter %p already in ptnetmap mode", pth_na->parent);
+		return EINVAL;
+	}
+
+	/* Both the CSB and the eventfds are mandatory features. */
+	if ((cfg->features & ft_mask) != ft_mask) {
+		D("ERROR ptnetmap_cfg(%x) does not contain CSB and EVENTFD",
+		  cfg->features);
+		return EINVAL;
+	}
+
+	num_rings = pth_na->up.num_tx_rings + pth_na->up.num_rx_rings;
+
+	if (num_rings != cfg->num_rings) {
+		D("ERROR configuration mismatch, expected %u rings, found %u",
+		  num_rings, cfg->num_rings);
+		return EINVAL;
+	}
+
+	/* State and kthread pointer array in a single allocation. */
+	ptns = malloc(sizeof(*ptns) + num_rings * sizeof(*ptns->kthreads),
+		      M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ptns) {
+		return ENOMEM;
+	}
+
+	ptns->kthreads = (struct nm_kthread **)(ptns + 1);
+	ptns->stopped = true;
+
+	/* Cross-link data structures. */
+	pth_na->ptns = ptns;
+	ptns->pth_na = pth_na;
+
+	/* Store the CSB address provided by the hypervisor. */
+	ptns->ptrings = cfg->ptrings;
+
+	DBG(ptnetmap_print_configuration(cfg));
+
+	/* Create kthreads */
+	if ((ret = ptnetmap_create_kthreads(pth_na, cfg))) {
+		D("ERROR ptnetmap_create_kthreads()");
+		goto err;
+	}
+	/* Copy krings state into the CSB for the guest initialization */
+	if ((ret = ptnetmap_krings_snapshot(pth_na))) {
+		D("ERROR ptnetmap_krings_snapshot()");
+		goto err_del_kthreads;
+	}
+
+	/* Overwrite parent nm_notify krings callback. */
+	pth_na->parent->na_private = pth_na;
+	pth_na->parent_nm_notify = pth_na->parent->nm_notify;
+	pth_na->parent->nm_notify = nm_unused_notify;
+
+	for (i = 0; i < pth_na->parent->num_rx_rings; i++) {
+		pth_na->up.rx_rings[i].save_notify =
+			pth_na->up.rx_rings[i].nm_notify;
+		pth_na->up.rx_rings[i].nm_notify = nm_pt_host_notify;
+	}
+	for (i = 0; i < pth_na->parent->num_tx_rings; i++) {
+		pth_na->up.tx_rings[i].save_notify =
+			pth_na->up.tx_rings[i].nm_notify;
+		pth_na->up.tx_rings[i].nm_notify = nm_pt_host_notify;
+	}
+
+#ifdef RATE
+	memset(&ptns->rate_ctx, 0, sizeof(ptns->rate_ctx));
+	setup_timer(&ptns->rate_ctx.timer, &rate_callback,
+		    (unsigned long)&ptns->rate_ctx);
+	if (mod_timer(&ptns->rate_ctx.timer, jiffies + msecs_to_jiffies(1500)))
+		D("[ptn] Error: mod_timer()\n");
+#endif
+
+	DBG(D("[%s] ptnetmap configuration DONE", pth_na->up.name));
+
+	return 0;
+
+err_del_kthreads:
+	/* The kthreads were created but will never reach
+	 * ptnetmap_delete(): delete them here to avoid leaking them. */
+	for (i = 0; i < num_rings; i++) {
+		if (ptns->kthreads[i]) {
+			nm_os_kthread_delete(ptns->kthreads[i]);
+			ptns->kthreads[i] = NULL;
+		}
+	}
+err:
+	pth_na->ptns = NULL;
+	free(ptns, M_DEVBUF);
+	return ret;
+}
+
+/* Switch parent adapter back to normal mode and destroy
+ * ptnetmap state. Restores the parent notify callbacks, deletes the
+ * kthreads and frees the state. Safe to call when ptnetmap mode is
+ * not active (ptns == NULL). The kthreads are assumed to have been
+ * stopped already (see ptnetmap_stop_kthreads()). */
+static void
+ptnetmap_delete(struct netmap_pt_host_adapter *pth_na)
+{
+	struct ptnetmap_state *ptns = pth_na->ptns;
+	int num_rings;
+	int i;
+
+	if (!ptns) {
+		/* Nothing to do. */
+		return;
+	}
+
+	/* Restore parent adapter callbacks. */
+	pth_na->parent->nm_notify = pth_na->parent_nm_notify;
+	pth_na->parent->na_private = NULL;
+
+	for (i = 0; i < pth_na->parent->num_rx_rings; i++) {
+		pth_na->up.rx_rings[i].nm_notify =
+			pth_na->up.rx_rings[i].save_notify;
+		pth_na->up.rx_rings[i].save_notify = NULL;
+	}
+	for (i = 0; i < pth_na->parent->num_tx_rings; i++) {
+		pth_na->up.tx_rings[i].nm_notify =
+			pth_na->up.tx_rings[i].save_notify;
+		pth_na->up.tx_rings[i].save_notify = NULL;
+	}
+
+	/* Delete kthreads. */
+	num_rings = ptns->pth_na->up.num_tx_rings +
+		    ptns->pth_na->up.num_rx_rings;
+	for (i = 0; i < num_rings; i++) {
+		nm_os_kthread_delete(ptns->kthreads[i]);
+		ptns->kthreads[i] = NULL;
+	}
+
+	IFRATE(del_timer(&ptns->rate_ctx.timer));
+
+	free(ptns, M_DEVBUF);
+
+	pth_na->ptns = NULL;
+
+	DBG(D("[%s] ptnetmap deleted", pth_na->up.name));
+}
+
+/*
+ * Called by netmap_ioctl().
+ * Operation is indicated in nmr->nr_cmd.
+ *
+ * Called without NMG_LOCK.
+ *
+ * Returns 0 on success or a positive errno: ENXIO if the adapter is
+ * not a ptnetmap host adapter, EFAULT if the configuration cannot be
+ * read from userspace, EINVAL for an unknown command, or the error
+ * from ptnetmap_create()/ptnetmap_start_kthreads().
+ */
+int
+ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na)
+{
+	struct netmap_pt_host_adapter *pth_na;
+	struct ptnetmap_cfg *cfg;
+	char *name;
+	int cmd, error = 0;
+
+	name = nmr->nr_name;
+	cmd = nmr->nr_cmd;
+
+	DBG(D("name: %s", name));
+
+	if (!nm_ptnetmap_host_on(na)) {
+		D("ERROR Netmap adapter %p is not a ptnetmap host adapter", na);
+		error = ENXIO;
+		goto done;
+	}
+	pth_na = (struct netmap_pt_host_adapter *)na;
+
+	NMG_LOCK();
+	switch (cmd) {
+	case NETMAP_PT_HOST_CREATE:
+		/* Read hypervisor configuration from userspace. */
+		cfg = ptnetmap_read_cfg(nmr);
+		if (!cfg) {
+			/* Report the failure to userspace instead of
+			 * silently returning success. */
+			error = EFAULT;
+			break;
+		}
+		/* Create ptnetmap state (kthreads, ...) and switch parent
+		 * adapter to ptnetmap mode. */
+		error = ptnetmap_create(pth_na, cfg);
+		free(cfg, M_DEVBUF);
+		if (error)
+			break;
+		/* Start kthreads. */
+		error = ptnetmap_start_kthreads(pth_na);
+		if (error)
+			ptnetmap_delete(pth_na);
+		break;
+
+	case NETMAP_PT_HOST_DELETE:
+		/* Stop kthreads. */
+		ptnetmap_stop_kthreads(pth_na);
+		/* Switch parent adapter back to normal mode and destroy
+		 * ptnetmap state (kthreads, ...). */
+		ptnetmap_delete(pth_na);
+		break;
+
+	default:
+		D("ERROR invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
+		error = EINVAL;
+		break;
+	}
+	NMG_UNLOCK();
+
+done:
+	return error;
+}
+
+/* nm_notify callbacks for ptnetmap: instead of waking up userspace
+ * processes, a backend interrupt wakes up the kthread serving the
+ * corresponding ring, which will relay the event to the guest. */
+static int
+nm_pt_host_notify(struct netmap_kring *kring, int flags)
+{
+	struct netmap_adapter *na = kring->na;
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na->na_private;
+	struct ptnetmap_state *ptns;
+	int k;
+
+	/* First check that the passthrough port is not being destroyed. */
+	if (unlikely(!pth_na)) {
+		return NM_IRQ_COMPLETED;
+	}
+
+	ptns = pth_na->ptns;
+	if (unlikely(!ptns || ptns->stopped)) {
+		return NM_IRQ_COMPLETED;
+	}
+
+	k = kring->ring_id;
+
+	/* Notify kthreads (wake up if needed); RX kthreads follow the
+	 * TX ones in the flat kthread array. */
+	if (kring->tx == NR_TX) {
+		ND(1, "TX backend irq");
+		IFRATE(ptns->rate_ctx.new.btxwu++);
+	} else {
+		k += pth_na->up.num_tx_rings;
+		ND(1, "RX backend irq");
+		IFRATE(ptns->rate_ctx.new.brxwu++);
+	}
+	nm_os_kthread_wakeup_worker(ptns->kthreads[k]);
+
+	return NM_IRQ_COMPLETED;
+}
+
+/* Placeholder installed on callbacks that must never fire while in
+ * ptnetmap mode; reaching it indicates a bug. */
+static int
+nm_unused_notify(struct netmap_kring *kring, int flags)
+{
+	D("BUG this should never be called");
+	return ENXIO;
+}
+
+/* nm_config callback for bwrap: forward the configuration request to
+ * the parent adapter and mirror its ring/descriptor counts into both
+ * this adapter and the output parameters. */
+static int
+nm_pt_host_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
+		  u_int *rxr, u_int *rxd)
+{
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na;
+	struct netmap_adapter *parent = pth_na->parent;
+	int error;
+
+	//XXX: maybe calling parent->nm_config is better
+
+	/* forward the request */
+	error = netmap_update_config(parent);
+
+	/* Mirror the (possibly updated) parent values even on error;
+	 * the caller decides what to do with the error code. */
+	*rxr = na->num_rx_rings = parent->num_rx_rings;
+	*txr = na->num_tx_rings = parent->num_tx_rings;
+	*txd = na->num_tx_desc = parent->num_tx_desc;
+	*rxd = na->num_rx_desc = parent->num_rx_desc;
+
+	DBG(D("rxr: %d txr: %d txd: %d rxd: %d", *rxr, *txr, *txd, *rxd));
+
+	return error;
+}
+
+/* nm_krings_create callback for ptnetmap: create the parent's krings
+ * and share them with this adapter (no krings of our own). */
+static int
+nm_pt_host_krings_create(struct netmap_adapter *na)
+{
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na;
+	struct netmap_adapter *parent = pth_na->parent;
+	enum txrx t;
+	int error;
+
+	DBG(D("%s", pth_na->up.name));
+
+	/* create the parent krings */
+	error = parent->nm_krings_create(parent);
+	if (error) {
+		return error;
+	}
+
+	/* A ptnetmap host adapter points the very same krings
+	 * as its parent adapter. These pointer are used in the
+	 * TX/RX worker functions. */
+	na->tx_rings = parent->tx_rings;
+	na->rx_rings = parent->rx_rings;
+	na->tailroom = parent->tailroom;
+
+	for_rx_tx(t) {
+		struct netmap_kring *kring;
+
+		/* Parent's kring_create function will initialize
+		 * its own na->si. We have to init our na->si here. */
+		nm_os_selinfo_init(&na->si[t]);
+
+		/* Force the mem_rings_create() method to create the
+		 * host rings independently on what the regif asked for:
+		 * these rings are needed by the guest ptnetmap adapter
+		 * anyway. */
+		kring = &NMR(na, t)[nma_get_nrings(na, t)];
+		kring->nr_kflags |= NKR_NEEDRING;
+	}
+
+	return 0;
+}
+
+/* nm_krings_delete callback for ptnetmap: the krings belong to the
+ * parent, so delete them there and just clear our borrowed pointers. */
+static void
+nm_pt_host_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na;
+	struct netmap_adapter *parent = pth_na->parent;
+
+	DBG(D("%s", pth_na->up.name));
+
+	parent->nm_krings_delete(parent);
+
+	na->tx_rings = na->rx_rings = na->tailroom = NULL;
+}
+
+/* nm_register callback: forward the (un)register request to the
+ * parent adapter, propagating the memory allocator lookup table on
+ * registration and tearing down ptnetmap state on unregistration. */
+static int
+nm_pt_host_register(struct netmap_adapter *na, int onoff)
+{
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na;
+	struct netmap_adapter *parent = pth_na->parent;
+	int error;
+	DBG(D("%s onoff %d", pth_na->up.name, onoff));
+
+	if (onoff) {
+		/* netmap_do_regif has been called on the ptnetmap na.
+		 * We need to pass the information about the
+		 * memory allocator to the parent before
+		 * putting it in netmap mode
+		 */
+		parent->na_lut = na->na_lut;
+	}
+
+	/* forward the request to the parent */
+	error = parent->nm_register(parent, onoff);
+	if (error)
+		return error;
+
+
+	if (onoff) {
+		na->na_flags |= NAF_NETMAP_ON | NAF_PTNETMAP_HOST;
+	} else {
+		/* Make sure ptnetmap state is gone before clearing flags. */
+		ptnetmap_delete(pth_na);
+		na->na_flags &= ~(NAF_NETMAP_ON | NAF_PTNETMAP_HOST);
+	}
+
+	return 0;
+}
+
+/* nm_dtor callback: final teardown of the ptnetmap host adapter.
+ * Stops and deletes any leftover ptnetmap state, releases the parent
+ * (clearing its busy flag) and drops our reference to it. */
+static void
+nm_pt_host_dtor(struct netmap_adapter *na)
+{
+	struct netmap_pt_host_adapter *pth_na =
+		(struct netmap_pt_host_adapter *)na;
+	struct netmap_adapter *parent = pth_na->parent;
+
+	DBG(D("%s", pth_na->up.name));
+
+	/* The equivalent of NETMAP_PT_HOST_DELETE if the hypervisor
+	 * didn't do it. */
+	ptnetmap_stop_kthreads(pth_na);
+	ptnetmap_delete(pth_na);
+
+	parent->na_flags &= ~NAF_BUSY;
+
+	netmap_adapter_put(pth_na->parent);
+	pth_na->parent = NULL;
+}
+
+/* check if nmr is a request for a ptnetmap adapter that we can satisfy.
+ * On success (*na) holds a new reference to the ptnetmap host adapter
+ * wrapping the requested parent adapter, and 0 is returned. Returns 0
+ * without touching *na when the request is not for ptnetmap, or a
+ * positive errno on failure. */
+int
+netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+	struct nmreq parent_nmr;
+	struct netmap_adapter *parent; /* target adapter */
+	struct netmap_pt_host_adapter *pth_na;
+	struct ifnet *ifp = NULL;
+	int error;
+
+	/* Check if it is a request for a ptnetmap adapter */
+	if ((nmr->nr_flags & (NR_PTNETMAP_HOST)) == 0) {
+		return 0;
+	}
+
+	D("Requesting a ptnetmap host adapter");
+
+	pth_na = malloc(sizeof(*pth_na), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (pth_na == NULL) {
+		D("ERROR malloc");
+		return ENOMEM;
+	}
+
+	/* first, try to find the adapter that we want to passthrough
+	 * We use the same nmr, after we have turned off the ptnetmap flag.
+	 * In this way we can potentially passthrough everything netmap understands.
+	 */
+	memcpy(&parent_nmr, nmr, sizeof(parent_nmr));
+	parent_nmr.nr_flags &= ~(NR_PTNETMAP_HOST);
+	error = netmap_get_na(&parent_nmr, &parent, &ifp, create);
+	if (error) {
+		D("parent lookup failed: %d", error);
+		goto put_out_noputparent;
+	}
+	DBG(D("found parent: %s", parent->name));
+
+	/* make sure the interface is not already in use */
+	if (NETMAP_OWNED_BY_ANY(parent)) {
+		D("NIC %s busy, cannot ptnetmap", parent->name);
+		error = EBUSY;
+		goto put_out;
+	}
+
+	pth_na->parent = parent;
+
+	/* Follow netmap_attach()-like operations for the host
+	 * ptnetmap adapter. */
+
+	//XXX pth_na->up.na_flags = parent->na_flags;
+	pth_na->up.num_rx_rings = parent->num_rx_rings;
+	pth_na->up.num_tx_rings = parent->num_tx_rings;
+	pth_na->up.num_tx_desc = parent->num_tx_desc;
+	pth_na->up.num_rx_desc = parent->num_rx_desc;
+
+	pth_na->up.nm_dtor = nm_pt_host_dtor;
+	pth_na->up.nm_register = nm_pt_host_register;
+
+	/* Reuse parent's adapter txsync and rxsync methods. */
+	pth_na->up.nm_txsync = parent->nm_txsync;
+	pth_na->up.nm_rxsync = parent->nm_rxsync;
+
+	pth_na->up.nm_krings_create = nm_pt_host_krings_create;
+	pth_na->up.nm_krings_delete = nm_pt_host_krings_delete;
+	pth_na->up.nm_config = nm_pt_host_config;
+
+	/* Set the notify method only for convenience, it will never
+	 * be used, since - differently from default krings_create - we
+	 * ptnetmap krings_create callback inits kring->nm_notify
+	 * directly. */
+	pth_na->up.nm_notify = nm_unused_notify;
+
+	pth_na->up.nm_mem = parent->nm_mem;
+
+	pth_na->up.na_flags |= NAF_HOST_RINGS;
+
+	error = netmap_attach_common(&pth_na->up);
+	if (error) {
+		D("ERROR netmap_attach_common()");
+		goto put_out;
+	}
+
+	*na = &pth_na->up;
+	netmap_adapter_get(*na);
+
+	/* set parent busy, because attached for ptnetmap */
+	parent->na_flags |= NAF_BUSY;
+
+	/* Build our name from the parent's; a single bounded snprintf
+	 * (instead of strncpy+strcat) guarantees NUL termination and
+	 * cannot overflow up.name when the parent name is long. */
+	snprintf(pth_na->up.name, sizeof(pth_na->up.name), "%s-PTN",
+		 parent->name);
+
+	DBG(D("%s ptnetmap request DONE", pth_na->up.name));
+
+	/* drop the reference to the ifp, if any */
+	if (ifp)
+		if_rele(ifp);
+
+	return 0;
+
+put_out:
+	netmap_adapter_put(parent);
+	if (ifp)
+		if_rele(ifp);
+put_out_noputparent:
+	free(pth_na, M_DEVBUF);
+	return error;
+}
+#endif /* WITH_PTNETMAP_HOST */
+
+#ifdef WITH_PTNETMAP_GUEST
+/*
+ * GUEST ptnetmap generic txsync()/rxsync() used in e1000/virtio-net device
+ * driver notify is set when we need to send notification to the host
+ * (driver-specific)
+ */
+
+/*
+ * Reconcile host and guest views of the transmit ring.
+ *
+ * Guest user wants to transmit packets up to the one before ring->head,
+ * and guest kernel knows tx_ring->hwcur is the first packet unsent
+ * by the host kernel.
+ *
+ * We push out as many packets as possible, and possibly
+ * reclaim buffers from previously completed transmission.
+ *
+ * Notifications from the host are enabled only if the user guest would
+ * block (no space in the ring).
+ *
+ * Returns true if the caller should kick the host.
+ */
+bool
+netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring,
+		       int flags)
+{
+	bool notify = false;
+
+	/* Disable notifications */
+	ptring->guest_need_kick = 0;
+
+	/*
+	 * First part: tell the host (updating the CSB) to process the new
+	 * packets.
+	 */
+	kring->nr_hwcur = ptring->hwcur;
+	ptnetmap_guest_write_kring_csb(ptring, kring->rcur, kring->rhead);
+
+	/* Ask for a kick from a guest to the host if needed. */
+	if ((kring->rhead != kring->nr_hwcur &&
+		NM_ACCESS_ONCE(ptring->host_need_kick)) ||
+			(flags & NAF_FORCE_RECLAIM)) {
+		ptring->sync_flags = flags;
+		notify = true;
+	}
+
+	/*
+	 * Second part: reclaim buffers for completed transmissions.
+	 */
+	if (nm_kr_txempty(kring) || (flags & NAF_FORCE_RECLAIM)) {
+		ptnetmap_guest_read_kring_csb(ptring, kring);
+	}
+
+	/*
+	 * No more room in the ring for new transmissions. The user thread will
+	 * go to sleep and we need to be notified by the host when more free
+	 * space is available.
+	 */
+	if (nm_kr_txempty(kring)) {
+		/* Reenable notifications. */
+		ptring->guest_need_kick = 1;
+		/* Double check (closes the race with a concurrent host update). */
+		ptnetmap_guest_read_kring_csb(ptring, kring);
+		/* If there is new free space, disable notifications */
+		if (unlikely(!nm_kr_txempty(kring))) {
+			ptring->guest_need_kick = 0;
+		}
+	}
+
+	ND(1, "TX - CSB: head:%u cur:%u hwtail:%u - KRING: head:%u cur:%u tail: %u",
+	   ptring->head, ptring->cur, ptring->hwtail,
+	   kring->rhead, kring->rcur, kring->nr_hwtail);
+
+	return notify;
+}
+
+/*
+ * Reconcile host and guest view of the receive ring.
+ *
+ * Update hwcur/hwtail from host (reading from CSB).
+ *
+ * If guest user has released buffers up to the one before ring->head, we
+ * also give them to the host.
+ *
+ * Notifications from the host are enabled only if the user guest would
+ * block (no more completed slots in the ring).
+ *
+ * Returns true if the caller should kick the host.
+ */
+bool
+netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring,
+		       int flags)
+{
+	bool notify = false;
+
+	/* Disable notifications */
+	ptring->guest_need_kick = 0;
+
+	/*
+	 * First part: import newly received packets, by updating the kring
+	 * hwtail to the hwtail known from the host (read from the CSB).
+	 * This also updates the kring hwcur.
+	 */
+	ptnetmap_guest_read_kring_csb(ptring, kring);
+	kring->nr_kflags &= ~NKR_PENDINTR;
+
+	/*
+	 * Second part: tell the host about the slots that guest user has
+	 * released, by updating cur and head in the CSB.
+	 */
+	if (kring->rhead != kring->nr_hwcur) {
+		ptnetmap_guest_write_kring_csb(ptring, kring->rcur,
+					       kring->rhead);
+		/* Ask for a kick from the guest to the host if needed. */
+		if (NM_ACCESS_ONCE(ptring->host_need_kick)) {
+			ptring->sync_flags = flags;
+			notify = true;
+		}
+	}
+
+	/*
+	 * No more completed RX slots. The user thread will go to sleep and
+	 * we need to be notified by the host when more RX slots have been
+	 * completed.
+	 */
+	if (nm_kr_rxempty(kring)) {
+		/* Reenable notifications. */
+		ptring->guest_need_kick = 1;
+		/* Double check (closes the race with a concurrent host update). */
+		ptnetmap_guest_read_kring_csb(ptring, kring);
+		/* If there are new slots, disable notifications. */
+		if (!nm_kr_rxempty(kring)) {
+			ptring->guest_need_kick = 0;
+		}
+	}
+
+	ND(1, "RX - CSB: head:%u cur:%u hwtail:%u - KRING: head:%u cur:%u",
+	   ptring->head, ptring->cur, ptring->hwtail,
+	   kring->rhead, kring->rcur);
+
+	return notify;
+}
+
+/*
+ * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor.
+ */
+/* Create hardware krings on the public adapter and share them with the
+ * driver-private adapter. A no-op while the backend holds regifs. */
+int
+ptnet_nm_krings_create(struct netmap_adapter *na)
+{
+	struct netmap_pt_guest_adapter *ptna =
+		(struct netmap_pt_guest_adapter *)na; /* Upcast. */
+	struct netmap_adapter *na_nm = &ptna->hwup.up;
+	struct netmap_adapter *na_dr = &ptna->dr.up;
+	int ret;
+
+	if (ptna->backend_regifs) {
+		return 0;
+	}
+
+	/* Create krings on the public netmap adapter. */
+	ret = netmap_hw_krings_create(na_nm);
+	if (ret) {
+		return ret;
+	}
+
+	/* Copy krings into the netmap adapter private to the driver. */
+	na_dr->tx_rings = na_nm->tx_rings;
+	na_dr->rx_rings = na_nm->rx_rings;
+
+	return 0;
+}
+
+/* Inverse of ptnet_nm_krings_create(): drop the shared kring pointers
+ * and delete the hardware krings. A no-op while regifs are held. */
+void
+ptnet_nm_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_pt_guest_adapter *ptna =
+		(struct netmap_pt_guest_adapter *)na; /* Upcast. */
+	struct netmap_adapter *na_nm = &ptna->hwup.up;
+	struct netmap_adapter *na_dr = &ptna->dr.up;
+
+	if (ptna->backend_regifs) {
+		return;
+	}
+
+	na_dr->tx_rings = NULL;
+	na_dr->rx_rings = NULL;
+
+	netmap_hw_krings_delete(na_nm);
+}
+
+/* nm_dtor for ptnet guest drivers: release the driver-private memory
+ * allocator reference and unlink the interface from the guest
+ * passthrough allocator. */
+void
+ptnet_nm_dtor(struct netmap_adapter *na)
+{
+	struct netmap_pt_guest_adapter *ptna =
+		(struct netmap_pt_guest_adapter *)na;
+
+	netmap_mem_put(ptna->dr.up.nm_mem);
+	memset(&ptna->dr, 0, sizeof(ptna->dr));
+	netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp);
+}
+
+#endif /* WITH_PTNETMAP_GUEST */
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c
index ddd7334a8378..78c53409c0b3 100644
--- a/sys/dev/netmap/netmap_vale.c
+++ b/sys/dev/netmap/netmap_vale.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2016 Universita` di Pisa
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -101,6 +102,9 @@ __FBSDID("$FreeBSD$");
#warning OSX support is only partial
#include "osx_glue.h"
+#elif defined(_WIN32)
+#include "win_glue.h"
+
#else
#error Unsupported platform
@@ -119,7 +123,7 @@ __FBSDID("$FreeBSD$");
/*
* system parameters (most of them in netmap_kern.h)
- * NM_NAME prefix for switch port names, default "vale"
+ * NM_BDG_NAME prefix for switch port names, default "vale"
* NM_BDG_MAXPORTS number of ports
* NM_BRIDGES max number of switches in the system.
* XXX should become a sysctl or tunable
@@ -144,7 +148,6 @@ __FBSDID("$FreeBSD$");
#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL NM_BDG_BATCH_MAX
-#define NM_BRIDGES 8 /* number of bridges */
/*
@@ -152,14 +155,15 @@ __FBSDID("$FreeBSD$");
* used in the bridge. The actual value may be larger as the
* last packet in the block may overflow the size.
*/
-int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
+static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
+SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
-
+SYSEND;
static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
-static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
+static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
/*
* For each output interface, nm_bdg_q is used to construct a list.
@@ -213,7 +217,7 @@ struct nm_bridge {
* forward this packet. ring_nr is the source ring index, and the
* function may overwrite this value to forward this packet to a
* different ring index.
- * This function must be set by netmap_bdgctl().
+ * This function must be set by netmap_bdg_ctl().
*/
struct netmap_bdg_ops bdg_ops;
@@ -244,7 +248,7 @@ netmap_bdg_name(struct netmap_vp_adapter *vp)
* Right now we have a static array and deletions are protected
* by an exclusive lock.
*/
-struct nm_bridge *nm_bridges;
+static struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */
@@ -278,6 +282,45 @@ pkt_copy(void *_src, void *_dst, int l)
}
+/* Characters allowed in a VALE port identifier: ASCII letters,
+ * digits and underscore. */
+static int
+nm_is_id_char(const char c)
+{
+	return (c >= 'a' && c <= 'z') ||
+	       (c >= 'A' && c <= 'Z') ||
+	       (c >= '0' && c <= '9') ||
+	       (c == '_');
+}
+
+/* Validate the name of a VALE bridge port and return the
+ * position of the ":" character. Returns -1 when the name is NULL,
+ * shorter than the NM_BDG_NAME prefix, too long for IFNAMSIZ,
+ * contains invalid characters, more than one ':', or no ':' at all. */
+static int
+nm_vale_name_validate(const char *name)
+{
+	int colon_pos = -1;
+	int i;
+
+	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
+		return -1;
+	}
+
+	for (i = 0; name[i]; i++) {
+		if (name[i] == ':') {
+			/* Only one ':' (bridge/port separator) is allowed. */
+			if (colon_pos != -1) {
+				return -1;
+			}
+			colon_pos = i;
+		} else if (!nm_is_id_char(name[i])) {
+			return -1;
+		}
+	}
+
+	/* The whole name must fit in an interface name buffer. */
+	if (i >= IFNAMSIZ) {
+		return -1;
+	}
+
+	return colon_pos;
+}
+
/*
* locate a bridge among the existing ones.
* MUST BE CALLED WITH NMG_LOCK()
@@ -288,7 +331,7 @@ pkt_copy(void *_src, void *_dst, int l)
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
- int i, l, namelen;
+ int i, namelen;
struct nm_bridge *b = NULL, *bridges;
u_int num_bridges;
@@ -296,21 +339,11 @@ nm_find_bridge(const char *name, int create)
netmap_bns_getbridges(&bridges, &num_bridges);
- namelen = strlen(NM_NAME); /* base length */
- l = name ? strlen(name) : 0; /* actual length */
- if (l < namelen) {
+ namelen = nm_vale_name_validate(name);
+ if (namelen < 0) {
D("invalid bridge name %s", name ? name : NULL);
return NULL;
}
- for (i = namelen + 1; i < l; i++) {
- if (name[i] == ':') {
- namelen = i;
- break;
- }
- }
- if (namelen >= IFNAMSIZ)
- namelen = IFNAMSIZ;
- ND("--- prefix is '%.*s' ---", namelen, name);
/* lookup the name, remember empty slot if there is one */
for (i = 0; i < num_bridges; i++) {
@@ -479,6 +512,7 @@ netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
struct nm_bridge *b = vpna->na_bdg;
+ (void)nmr; // XXX merge ?
if (attach)
return 0; /* nothing to do */
if (b) {
@@ -518,7 +552,7 @@ nm_vi_destroy(const char *name)
return ENXIO;
NMG_LOCK();
/* make sure this is actually a VALE port */
- if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
+ if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
error = EINVAL;
goto err;
}
@@ -535,7 +569,7 @@ nm_vi_destroy(const char *name)
*/
if_rele(ifp);
netmap_detach(ifp);
- nm_vi_detach(ifp);
+ nm_os_vi_detach(ifp);
return 0;
err:
@@ -556,14 +590,14 @@ nm_vi_create(struct nmreq *nmr)
int error;
/* don't include VALE prefix */
- if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
+ if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
return EINVAL;
ifp = ifunit_ref(nmr->nr_name);
if (ifp) { /* already exist, cannot create new one */
if_rele(ifp);
return EEXIST;
}
- error = nm_vi_persist(nmr->nr_name, &ifp);
+ error = nm_os_vi_persist(nmr->nr_name, &ifp);
if (error)
return error;
@@ -572,12 +606,13 @@ nm_vi_create(struct nmreq *nmr)
error = netmap_vp_create(nmr, ifp, &vpna);
if (error) {
D("error %d", error);
- nm_vi_detach(ifp);
+ nm_os_vi_detach(ifp);
return error;
}
/* persist-specific routines */
vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
netmap_adapter_get(&vpna->up);
+ NM_ATTACH_NA(ifp, &vpna->up);
NMG_UNLOCK();
D("created %s", ifp->if_xname);
return 0;
@@ -608,7 +643,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
/* first try to see if this is a bridge port. */
NMG_LOCK_ASSERT();
- if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
+ if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
return 0; /* no error, but no VALE prefix */
}
@@ -693,7 +728,6 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
goto out;
vpna = hw->na_vp;
hostna = hw->na_hostvp;
- if_rele(ifp);
if (nmr->nr_arg1 != NETMAP_BDG_HOST)
hostna = NULL;
}
@@ -768,6 +802,11 @@ unlock_exit:
return error;
}
/* True iff na is a bwrap adapter (a hardware NIC wrapped for
 * attachment to a VALE bridge), recognized by its nm_register
 * callback being netmap_bwrap_reg. */
static inline int
nm_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_reg;
}
/* process NETMAP_BDG_DETACH */
static int
@@ -785,8 +824,13 @@ nm_bdg_ctl_detach(struct nmreq *nmr)
if (na == NULL) { /* VALE prefix missing */
error = EINVAL;
goto unlock_exit;
+ } else if (nm_is_bwrap(na) &&
+ ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
+ /* Don't detach a NIC with polling */
+ error = EBUSY;
+ netmap_adapter_put(na);
+ goto unlock_exit;
}
-
if (na->nm_bdg_ctl) {
/* remove the port from bridge. The bwrap
* also needs to put the hwna in normal mode
@@ -801,6 +845,267 @@ unlock_exit:
}
struct nm_bdg_polling_state;
/* One polling kthread: it is in charge of the hwna rx rings in the
 * half-open range [qfirst, qlast). */
struct
nm_bdg_kthread {
	struct nm_kthread *nmk;	/* the kernel thread handle */
	u_int qfirst;		/* first rx ring polled (inclusive) */
	u_int qlast;		/* last rx ring polled (exclusive) */
	struct nm_bdg_polling_state *bps; /* back pointer to shared state */
};

/* Per-adapter polling configuration and kthread array, created by
 * nm_bdg_ctl_polling_start() and released by nm_bdg_ctl_polling_stop(). */
struct nm_bdg_polling_state {
	bool configured;	/* kthreads created and cfg filled in */
	bool stopped;		/* kthreads are currently not running */
	struct netmap_bwrap_adapter *bna; /* the adapter being polled */
	u_int reg;		/* NR_REG_ALL_NIC or NR_REG_ONE_NIC */
	u_int qfirst;		/* first ring to poll */
	u_int qlast;		/* one past the last ring to poll */
	u_int cpu_from;		/* first CPU the kthreads are pinned to */
	u_int ncpus;		/* number of kthreads (and CPUs) */
	struct nm_bdg_kthread *kthreads; /* array of ncpus entries */
};
+
+static void
+netmap_bwrap_polling(void *data)
+{
+ struct nm_bdg_kthread *nbk = data;
+ struct netmap_bwrap_adapter *bna;
+ u_int qfirst, qlast, i;
+ struct netmap_kring *kring0, *kring;
+
+ if (!nbk)
+ return;
+ qfirst = nbk->qfirst;
+ qlast = nbk->qlast;
+ bna = nbk->bps->bna;
+ kring0 = NMR(bna->hwna, NR_RX);
+
+ for (i = qfirst; i < qlast; i++) {
+ kring = kring0 + i;
+ kring->nm_notify(kring, 0);
+ }
+}
+
+static int
+nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
+{
+ struct nm_kthread_cfg kcfg;
+ int i, j;
+
+ bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus,
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (bps->kthreads == NULL)
+ return ENOMEM;
+
+ bzero(&kcfg, sizeof(kcfg));
+ kcfg.worker_fn = netmap_bwrap_polling;
+ for (i = 0; i < bps->ncpus; i++) {
+ struct nm_bdg_kthread *t = bps->kthreads + i;
+ int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
+ int affinity = bps->cpu_from + i;
+
+ t->bps = bps;
+ t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
+ t->qlast = all ? bps->qlast : t->qfirst + 1;
+ D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
+ t->qlast);
+
+ kcfg.type = i;
+ kcfg.worker_private = t;
+ t->nmk = nm_os_kthread_create(&kcfg);
+ if (t->nmk == NULL) {
+ goto cleanup;
+ }
+ nm_os_kthread_set_affinity(t->nmk, affinity);
+ }
+ return 0;
+
+cleanup:
+ for (j = 0; j < i; j++) {
+ struct nm_bdg_kthread *t = bps->kthreads + i;
+ nm_os_kthread_delete(t->nmk);
+ }
+ free(bps->kthreads, M_DEVBUF);
+ return EFAULT;
+}
+
+/* a version of ptnetmap_start_kthreads() */
+static int
+nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
+{
+ int error, i, j;
+
+ if (!bps) {
+ D("polling is not configured");
+ return EFAULT;
+ }
+ bps->stopped = false;
+
+ for (i = 0; i < bps->ncpus; i++) {
+ struct nm_bdg_kthread *t = bps->kthreads + i;
+ error = nm_os_kthread_start(t->nmk);
+ if (error) {
+ D("error in nm_kthread_start()");
+ goto cleanup;
+ }
+ }
+ return 0;
+
+cleanup:
+ for (j = 0; j < i; j++) {
+ struct nm_bdg_kthread *t = bps->kthreads + i;
+ nm_os_kthread_stop(t->nmk);
+ }
+ bps->stopped = true;
+ return error;
+}
+
+static void
+nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
+{
+ int i;
+
+ if (!bps)
+ return;
+
+ for (i = 0; i < bps->ncpus; i++) {
+ struct nm_bdg_kthread *t = bps->kthreads + i;
+ nm_os_kthread_stop(t->nmk);
+ nm_os_kthread_delete(t->nmk);
+ }
+ bps->stopped = true;
+}
+
+static int
+get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
+ struct nm_bdg_polling_state *bps)
+{
+ int req_cpus, avail_cpus, core_from;
+ u_int reg, i, qfirst, qlast;
+
+ avail_cpus = nm_os_ncpus();
+ req_cpus = nmr->nr_arg1;
+
+ if (req_cpus == 0) {
+ D("req_cpus must be > 0");
+ return EINVAL;
+ } else if (req_cpus >= avail_cpus) {
+ D("for safety, we need at least one core left in the system");
+ return EINVAL;
+ }
+ reg = nmr->nr_flags & NR_REG_MASK;
+ i = nmr->nr_ringid & NETMAP_RING_MASK;
+ /*
+ * ONE_NIC: dedicate one core to one ring. If multiple cores
+ * are specified, consecutive rings are also polled.
+ * For example, if ringid=2 and 2 cores are given,
+ * ring 2 and 3 are polled by core 2 and 3, respectively.
+ * ALL_NIC: poll all the rings using a core specified by ringid.
+ * the number of cores must be 1.
+ */
+ if (reg == NR_REG_ONE_NIC) {
+ if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
+ D("only %d rings exist (ring %u-%u is given)",
+ nma_get_nrings(na, NR_RX), i, i+req_cpus);
+ return EINVAL;
+ }
+ qfirst = i;
+ qlast = qfirst + req_cpus;
+ core_from = qfirst;
+ } else if (reg == NR_REG_ALL_NIC) {
+ if (req_cpus != 1) {
+ D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
+ return EINVAL;
+ }
+ qfirst = 0;
+ qlast = nma_get_nrings(na, NR_RX);
+ core_from = i;
+ } else {
+ D("reg must be ALL_NIC or ONE_NIC");
+ return EINVAL;
+ }
+
+ bps->reg = reg;
+ bps->qfirst = qfirst;
+ bps->qlast = qlast;
+ bps->cpu_from = core_from;
+ bps->ncpus = req_cpus;
+ D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
+ reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
+ qfirst, qlast, core_from, req_cpus);
+ return 0;
+}
+
+static int
+nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
+{
+ struct nm_bdg_polling_state *bps;
+ struct netmap_bwrap_adapter *bna;
+ int error;
+
+ bna = (struct netmap_bwrap_adapter *)na;
+ if (bna->na_polling_state) {
+ D("ERROR adapter already in polling mode");
+ return EFAULT;
+ }
+
+ bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!bps)
+ return ENOMEM;
+ bps->configured = false;
+ bps->stopped = true;
+
+ if (get_polling_cfg(nmr, na, bps)) {
+ free(bps, M_DEVBUF);
+ return EINVAL;
+ }
+
+ if (nm_bdg_create_kthreads(bps)) {
+ free(bps, M_DEVBUF);
+ return EFAULT;
+ }
+
+ bps->configured = true;
+ bna->na_polling_state = bps;
+ bps->bna = bna;
+
+ /* disable interrupt if possible */
+ if (bna->hwna->nm_intr)
+ bna->hwna->nm_intr(bna->hwna, 0);
+ /* start kthread now */
+ error = nm_bdg_polling_start_kthreads(bps);
+ if (error) {
+ D("ERROR nm_bdg_polling_start_kthread()");
+ free(bps->kthreads, M_DEVBUF);
+ free(bps, M_DEVBUF);
+ bna->na_polling_state = NULL;
+ if (bna->hwna->nm_intr)
+ bna->hwna->nm_intr(bna->hwna, 1);
+ }
+ return error;
+}
+
+static int
+nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
+{
+ struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
+ struct nm_bdg_polling_state *bps;
+
+ if (!bna->na_polling_state) {
+ D("ERROR adapter is not in polling mode");
+ return EFAULT;
+ }
+ bps = bna->na_polling_state;
+ nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
+ bps->configured = false;
+ free(bps, M_DEVBUF);
+ bna->na_polling_state = NULL;
+ /* reenable interrupt */
+ if (bna->hwna->nm_intr)
+ bna->hwna->nm_intr(bna->hwna, 1);
+ return 0;
+}
/* Called by either user's context (netmap_ioctl())
* or external kernel modules (e.g., Openvswitch).
@@ -843,7 +1148,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
case NETMAP_BDG_LIST:
/* this is used to enumerate bridges and ports */
if (namelen) { /* look up indexes of bridge and port */
- if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+ if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
error = EINVAL;
break;
}
@@ -855,7 +1160,9 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
break;
}
- error = ENOENT;
+ error = 0;
+ nmr->nr_arg1 = b - bridges; /* bridge index */
+ nmr->nr_arg2 = NM_BDG_NOPORT;
for (j = 0; j < b->bdg_active_ports; j++) {
i = b->bdg_port_index[j];
vpna = b->bdg_ports[i];
@@ -867,10 +1174,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
* virtual port and a NIC, respectively
*/
if (!strcmp(vpna->up.name, name)) {
- /* bridge index */
- nmr->nr_arg1 = b - bridges;
nmr->nr_arg2 = i; /* port index */
- error = 0;
break;
}
}
@@ -937,10 +1241,34 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
error = netmap_get_bdg_na(nmr, &na, 0);
if (na && !error) {
vpna = (struct netmap_vp_adapter *)na;
- vpna->virt_hdr_len = nmr->nr_arg1;
- if (vpna->virt_hdr_len)
+ na->virt_hdr_len = nmr->nr_arg1;
+ if (na->virt_hdr_len) {
vpna->mfs = NETMAP_BUF_SIZE(na);
- D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
+ }
+ D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
+ netmap_adapter_put(na);
+ } else if (!na) {
+ error = ENXIO;
+ }
+ NMG_UNLOCK();
+ break;
+
+ case NETMAP_BDG_POLLING_ON:
+ case NETMAP_BDG_POLLING_OFF:
+ NMG_LOCK();
+ error = netmap_get_bdg_na(nmr, &na, 0);
+ if (na && !error) {
+ if (!nm_is_bwrap(na)) {
+ error = EOPNOTSUPP;
+ } else if (cmd == NETMAP_BDG_POLLING_ON) {
+ error = nm_bdg_ctl_polling_start(nmr, na);
+ if (!error)
+ netmap_adapter_get(na);
+ } else {
+ error = nm_bdg_ctl_polling_stop(nmr, na);
+ if (!error)
+ netmap_adapter_put(na);
+ }
netmap_adapter_put(na);
}
NMG_UNLOCK();
@@ -1097,10 +1425,12 @@ nm_bdg_preflush(struct netmap_kring *kring, u_int end)
ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
}
if (frags > 1) {
- D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
- // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
- ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
- ft[ft_i - frags].ft_frags = frags - 1;
+ /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
+ * have to fix frags count. */
+ frags--;
+ ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
+ ft[ft_i - frags].ft_frags = frags;
+ D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
}
if (ft_i)
ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
@@ -1157,6 +1487,8 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_vp_adapter *vpna =
(struct netmap_vp_adapter*)na;
+ enum txrx t;
+ int i;
/* persistent ports may be put in netmap mode
* before being attached to a bridge
@@ -1164,12 +1496,30 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff)
if (vpna->na_bdg)
BDG_WLOCK(vpna->na_bdg);
if (onoff) {
- na->na_flags |= NAF_NETMAP_ON;
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+
+ if (nm_kring_pending_on(kring))
+ kring->nr_mode = NKR_NETMAP_ON;
+ }
+ }
+ if (na->active_fds == 0)
+ na->na_flags |= NAF_NETMAP_ON;
/* XXX on FreeBSD, persistent VALE ports should also
* toggle IFCAP_NETMAP in na->ifp (2014-03-16)
*/
} else {
- na->na_flags &= ~NAF_NETMAP_ON;
+ if (na->active_fds == 0)
+ na->na_flags &= ~NAF_NETMAP_ON;
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
+ struct netmap_kring *kring = &NMR(na, t)[i];
+
+ if (nm_kring_pending_off(kring))
+ kring->nr_mode = NKR_NETMAP_OFF;
+ }
+ }
}
if (vpna->na_bdg)
BDG_WUNLOCK(vpna->na_bdg);
@@ -1193,13 +1543,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
uint32_t sh, dh;
u_int dst, mysrc = na->bdg_port;
uint64_t smac, dmac;
+ uint8_t indbuf[12];
/* safety check, unfortunately we have many cases */
- if (buf_len >= 14 + na->virt_hdr_len) {
+ if (buf_len >= 14 + na->up.virt_hdr_len) {
/* virthdr + mac_hdr in the same slot */
- buf += na->virt_hdr_len;
- buf_len -= na->virt_hdr_len;
- } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
+ buf += na->up.virt_hdr_len;
+ buf_len -= na->up.virt_hdr_len;
+ } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
/* only header in first fragment */
ft++;
buf = ft->ft_buf;
@@ -1208,6 +1559,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
RD(5, "invalid buf format, length %d", buf_len);
return NM_BDG_NOPORT;
}
+
+ if (ft->ft_flags & NS_INDIRECT) {
+ if (copyin(buf, indbuf, sizeof(indbuf))) {
+ return NM_BDG_NOPORT;
+ }
+ buf = indbuf;
+ }
+
dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
smac = le64toh(*(uint64_t *)(buf + 4));
smac >>= 16;
@@ -1321,7 +1680,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
struct nm_bdg_q *dst_ents, *brddst;
uint16_t num_dsts = 0, *dsts;
struct nm_bridge *b = na->na_bdg;
- u_int i, j, me = na->bdg_port;
+ u_int i, me = na->bdg_port;
/*
* The work area (pointed by ft) is followed by an array of
@@ -1341,7 +1700,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
ND("slot %d frags %d", i, ft[i].ft_frags);
/* Drop the packet if the virtio-net header is not into the first
fragment nor at the very beginning of the second. */
- if (unlikely(na->virt_hdr_len > ft[i].ft_len))
+ if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
continue;
dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
if (netmap_verbose > 255)
@@ -1382,6 +1741,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
*/
brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
if (brddst->bq_head != NM_FT_NULL) {
+ u_int j;
for (j = 0; likely(j < b->bdg_active_ports); j++) {
uint16_t d_i;
i = b->bdg_port_index[j];
@@ -1441,8 +1801,9 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
*/
needed = d->bq_len + brddst->bq_len;
- if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
- RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
+ if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
+ RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
+ dst_na->up.virt_hdr_len);
/* There is a virtio-net header/offloadings mismatch between
* source and destination. The slower mismatch datapath will
* be used to cope with all the mismatches.
@@ -1803,7 +2164,6 @@ netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter
nm_bound_var(&nmr->nr_arg3, 0, 0,
128*NM_BDG_MAXSLOTS, NULL);
na->num_rx_desc = nmr->nr_rx_slots;
- vpna->virt_hdr_len = 0;
vpna->mfs = 1514;
vpna->last_smac = ~0llu;
/*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
@@ -1880,19 +2240,17 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
{
struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
struct netmap_adapter *hwna = bna->hwna;
+ struct nm_bridge *b = bna->up.na_bdg,
+ *bh = bna->host.na_bdg;
+
+ if (b) {
+ netmap_bdg_detach_common(b, bna->up.bdg_port,
+ (bh ? bna->host.bdg_port : -1));
+ }
ND("na %p", na);
- /* drop reference to hwna->ifp.
- * If we don't do this, netmap_detach_common(na)
- * will think it has set NA(na->ifp) to NULL
- */
na->ifp = NULL;
- /* for safety, also drop the possible reference
- * in the hostna
- */
bna->host.up.ifp = NULL;
-
- hwna->nm_mem = bna->save_nmd;
hwna->na_private = NULL;
hwna->na_vp = hwna->na_hostvp = NULL;
hwna->na_flags &= ~NAF_BUSY;
@@ -1916,7 +2274,8 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
* (part as a receive ring, part as a transmit ring).
*
* callback that overwrites the hwna notify callback.
- * Packets come from the outside or from the host stack and are put on an hwna rx ring.
+ * Packets come from the outside or from the host stack and are put on an
+ * hwna rx ring.
* The bridge wrapper then sends the packets through the bridge.
*/
static int
@@ -1927,19 +2286,18 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
struct netmap_kring *bkring;
struct netmap_vp_adapter *vpna = &bna->up;
u_int ring_nr = kring->ring_id;
- int error = 0;
+ int ret = NM_IRQ_COMPLETED;
+ int error;
if (netmap_verbose)
D("%s %s 0x%x", na->name, kring->name, flags);
- if (!nm_netmap_on(na))
- return 0;
-
bkring = &vpna->up.tx_rings[ring_nr];
/* make sure the ring is not disabled */
- if (nm_kr_tryget(kring))
- return 0;
+ if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
+ return EIO;
+ }
if (netmap_verbose)
D("%s head %d cur %d tail %d", na->name,
@@ -1951,9 +2309,10 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
error = kring->nm_sync(kring, 0);
if (error)
goto put_out;
- if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
- D("how strange, interrupt with no packets on %s",
- na->name);
+ if (kring->nr_hwcur == kring->nr_hwtail) {
+ if (netmap_verbose)
+ D("how strange, interrupt with no packets on %s",
+ na->name);
goto put_out;
}
@@ -1970,28 +2329,32 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
/* another call to actually release the buffers */
error = kring->nm_sync(kring, 0);
+ /* The second rxsync may have further advanced hwtail. If this happens,
+ * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
+ if (kring->rcur != kring->nr_hwtail) {
+ ret = NM_IRQ_RESCHED;
+ }
put_out:
nm_kr_put(kring);
- return error;
+
+ return error ? error : ret;
}
/* nm_register callback for bwrap */
static int
-netmap_bwrap_register(struct netmap_adapter *na, int onoff)
+netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_bwrap_adapter *bna =
(struct netmap_bwrap_adapter *)na;
struct netmap_adapter *hwna = bna->hwna;
struct netmap_vp_adapter *hostna = &bna->host;
- int error;
+ int error, i;
enum txrx t;
ND("%s %s", na->name, onoff ? "on" : "off");
if (onoff) {
- int i;
-
/* netmap_do_regif has been called on the bwrap na.
* We need to pass the information about the
* memory allocator down to the hwna before
@@ -2010,16 +2373,32 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
/* cross-link the netmap rings
* The original number of rings comes from hwna,
* rx rings on one side equals tx rings on the other.
- * We need to do this now, after the initialization
- * of the kring->ring pointers
*/
for_rx_tx(t) {
- enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
- for (i = 0; i < nma_get_nrings(na, r) + 1; i++) {
- NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots;
- NMR(hwna, t)[i].ring = NMR(na, r)[i].ring;
+ enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
+ for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
+ NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
}
}
+
+ if (na->na_flags & NAF_HOST_RINGS) {
+ struct netmap_adapter *hna = &hostna->up;
+ /* the hostna rings are the host rings of the bwrap.
+ * The corresponding krings must point back to the
+ * hostna
+ */
+ hna->tx_rings = &na->tx_rings[na->num_tx_rings];
+ hna->tx_rings[0].na = hna;
+ hna->rx_rings = &na->rx_rings[na->num_rx_rings];
+ hna->rx_rings[0].na = hna;
+ }
+ }
+
+ /* pass down the pending ring state information */
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
+ NMR(hwna, t)[i].nr_pending_mode =
+ NMR(na, t)[i].nr_pending_mode;
}
/* forward the request to the hwna */
@@ -2027,6 +2406,13 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
if (error)
return error;
+ /* copy up the current ring state information */
+ for_rx_tx(t) {
+ for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
+ NMR(na, t)[i].nr_mode =
+ NMR(hwna, t)[i].nr_mode;
+ }
+
/* impersonate a netmap_vp_adapter */
netmap_vp_reg(na, onoff);
if (hostna->na_bdg)
@@ -2046,8 +2432,14 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
/* also intercept the host ring notify */
hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
}
+ if (na->active_fds == 0)
+ na->na_flags |= NAF_NETMAP_ON;
} else {
u_int i;
+
+ if (na->active_fds == 0)
+ na->na_flags &= ~NAF_NETMAP_ON;
+
/* reset all notify callbacks (including host ring) */
for (i = 0; i <= hwna->num_rx_rings; i++) {
hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
@@ -2089,8 +2481,8 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
struct netmap_bwrap_adapter *bna =
(struct netmap_bwrap_adapter *)na;
struct netmap_adapter *hwna = bna->hwna;
- struct netmap_adapter *hostna = &bna->host.up;
- int error;
+ int i, error = 0;
+ enum txrx t;
ND("%s", na->name);
@@ -2102,26 +2494,23 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
/* also create the hwna krings */
error = hwna->nm_krings_create(hwna);
if (error) {
- netmap_vp_krings_delete(na);
- return error;
+ goto err_del_vp_rings;
}
- /* the connection between the bwrap krings and the hwna krings
- * will be perfomed later, in the nm_register callback, since
- * now the kring->ring pointers have not been initialized yet
- */
- if (na->na_flags & NAF_HOST_RINGS) {
- /* the hostna rings are the host rings of the bwrap.
- * The corresponding krings must point back to the
- * hostna
- */
- hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
- hostna->tx_rings[0].na = hostna;
- hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
- hostna->rx_rings[0].na = hostna;
+ /* get each ring slot number from the corresponding hwna ring */
+ for_rx_tx(t) {
+ enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
+ for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
+ NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
+ }
}
return 0;
+
+err_del_vp_rings:
+ netmap_vp_krings_delete(na);
+
+ return error;
}
@@ -2149,19 +2538,18 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags)
u_int ring_n = kring->ring_id;
u_int lim = kring->nkr_num_slots - 1;
struct netmap_kring *hw_kring;
- int error = 0;
+ int error;
- ND("%s: na %s hwna %s",
+ ND("%s: na %s hwna %s",
(kring ? kring->name : "NULL!"),
(na ? na->name : "NULL!"),
(hwna ? hwna->name : "NULL!"));
hw_kring = &hwna->tx_rings[ring_n];
- if (nm_kr_tryget(hw_kring))
- return 0;
+ if (nm_kr_tryget(hw_kring, 0, NULL)) {
+ return ENXIO;
+ }
- if (!nm_netmap_on(hwna))
- return 0;
/* first step: simulate a user wakeup on the rx ring */
netmap_vp_rxsync(kring, flags);
ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
@@ -2175,7 +2563,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags)
hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
error = hw_kring->nm_sync(hw_kring, flags);
if (error)
- goto out;
+ goto put_out;
/* third step: now we are back the rx ring */
/* claim ownership on all hw owned bufs */
@@ -2188,9 +2576,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags)
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
-out:
+put_out:
nm_kr_put(hw_kring);
- return error;
+
+ return error ? error : NM_IRQ_COMPLETED;
}
@@ -2217,44 +2606,23 @@ netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
/* nothing to do */
return 0;
}
- npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+ npriv = netmap_priv_new();
if (npriv == NULL)
return ENOMEM;
- error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
+ npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
+ error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
if (error) {
- bzero(npriv, sizeof(*npriv));
- free(npriv, M_DEVBUF);
+ netmap_priv_delete(npriv);
return error;
}
bna->na_kpriv = npriv;
na->na_flags |= NAF_BUSY;
} else {
- int last_instance;
-
if (na->active_fds == 0) /* not registered */
return EINVAL;
- last_instance = netmap_dtor_locked(bna->na_kpriv);
- if (!last_instance) {
- D("--- error, trying to detach an entry with active mmaps");
- error = EINVAL;
- } else {
- struct nm_bridge *b = bna->up.na_bdg,
- *bh = bna->host.na_bdg;
- npriv = bna->na_kpriv;
- bna->na_kpriv = NULL;
- D("deleting priv");
-
- bzero(npriv, sizeof(*npriv));
- free(npriv, M_DEVBUF);
- if (b) {
- /* XXX the bwrap dtor should take care
- * of this (2014-06-16)
- */
- netmap_bdg_detach_common(b, bna->up.bdg_port,
- (bh ? bna->host.bdg_port : -1));
- }
- na->na_flags &= ~NAF_BUSY;
- }
+ netmap_priv_delete(bna->na_kpriv);
+ bna->na_kpriv = NULL;
+ na->na_flags &= ~NAF_BUSY;
}
return error;
@@ -2282,6 +2650,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
}
na = &bna->up.up;
+ /* make bwrap ifp point to the real ifp */
+ na->ifp = hwna->ifp;
na->na_private = bna;
strncpy(na->name, nr_name, sizeof(na->name));
/* fill the ring data for the bwrap adapter with rx/tx meanings
@@ -2294,7 +2664,7 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
}
na->nm_dtor = netmap_bwrap_dtor;
- na->nm_register = netmap_bwrap_register;
+ na->nm_register = netmap_bwrap_reg;
// na->nm_txsync = netmap_bwrap_txsync;
// na->nm_rxsync = netmap_bwrap_rxsync;
na->nm_config = netmap_bwrap_config;
@@ -2303,13 +2673,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
na->nm_notify = netmap_bwrap_notify;
na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
na->pdev = hwna->pdev;
- na->nm_mem = netmap_mem_private_new(na->name,
- na->num_tx_rings, na->num_tx_desc,
- na->num_rx_rings, na->num_rx_desc,
- 0, 0, &error);
- na->na_flags |= NAF_MEM_OWNER;
- if (na->nm_mem == NULL)
- goto err_put;
+ na->nm_mem = hwna->nm_mem;
+ na->virt_hdr_len = hwna->virt_hdr_len;
bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
bna->hwna = hwna;
@@ -2349,24 +2714,10 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
if (error) {
goto err_free;
}
- /* make bwrap ifp point to the real ifp
- * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
- * as a request to make the ifp point to the na. Since we
- * do not want to change the na already pointed to by hwna->ifp,
- * the following assignment has to be delayed until now
- */
- na->ifp = hwna->ifp;
hwna->na_flags |= NAF_BUSY;
- /* make hwna point to the allocator we are actually using,
- * so that monitors will be able to find it
- */
- bna->save_nmd = hwna->nm_mem;
- hwna->nm_mem = na->nm_mem;
return 0;
err_free:
- netmap_mem_delete(na->nm_mem);
-err_put:
hwna->na_vp = hwna->na_hostvp = NULL;
netmap_adapter_put(hwna);
free(bna, M_DEVBUF);