diff options
| author | Luigi Rizzo <luigi@FreeBSD.org> | 2016-10-16 14:13:32 +0000 |
|---|---|---|
| committer | Luigi Rizzo <luigi@FreeBSD.org> | 2016-10-16 14:13:32 +0000 |
| commit | 37e3a6d349581b4dd0aebf24be7b1b159a698dcf (patch) | |
| tree | 0e61deea141c9733af511b0485cf1fd0f2dd17ed | |
| parent | 63f6b1a75a8e6e33e4f9d65571c6a221444d3b05 (diff) | |
Notes
27 files changed, 7995 insertions, 1977 deletions
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 index 4d4ed32ef75c..4c95da347596 100644 --- a/share/man/man4/netmap.4 +++ b/share/man/man4/netmap.4 @@ -33,10 +33,10 @@ .Sh NAME .Nm netmap .Nd a framework for fast packet I/O -.Pp +.br .Nm VALE .Nd a fast VirtuAl Local Ethernet using the netmap API -.Pp +.br .Nm netmap pipes .Nd a shared memory packet transport channel .Sh SYNOPSIS @@ -44,28 +44,49 @@ .Sh DESCRIPTION .Nm is a framework for extremely fast and efficient packet I/O -for both userspace and kernel clients. +for userspace and kernel clients, and for Virtual Machines. It runs on .Fx -and Linux, and includes -.Nm VALE , -a very fast and modular in-kernel software switch/dataplane, -and -.Nm netmap pipes , -a shared memory packet transport channel. -All these are accessed interchangeably with the same API. +Linux and some versions of Windows, and supports a variety of +.Nm netmap ports , +including +.Bl -tag -width XXXX +.It Nm physical NIC ports +to access individual queues of network interfaces; +.It Nm host ports +to inject packets into the host stack; +.It Nm VALE ports +implementing a very fast and modular in-kernel software switch/dataplane; +.It Nm netmap pipes +a shared memory packet transport channel; +.It Nm netmap monitors +a mechanism similar to +.Xr bpf +to capture traffic +.El .Pp -.Nm , -.Nm VALE -and -.Nm netmap pipes -are at least one order of magnitude faster than +All these +.Nm netmap ports +are accessed interchangeably with the same API, +and are at least one order of magnitude faster than standard OS mechanisms -(sockets, bpf, tun/tap interfaces, native switches, pipes), -reaching 14.88 million packets per second (Mpps) -with much less than one core on a 10 Gbit NIC, -about 20 Mpps per core for VALE ports, -and over 100 Mpps for netmap pipes. +(sockets, bpf, tun/tap interfaces, native switches, pipes). 
+With suitably fast hardware (NICs, PCIe buses, CPUs), +packet I/O using +.Nm +on supported NICs +reaches 14.88 million packets per second (Mpps) +with much less than one core on 10 Gbit/s NICs; +35-40 Mpps on 40 Gbit/s NICs (limited by the hardware); +about 20 Mpps per core for VALE ports; +and over 100 Mpps for +.Nm netmap pipes. +NICs without native +.Nm +support can still use the API in emulated mode, +which uses unmodified device drivers and is 3-5 times faster than +.Xr bpf +or raw sockets. .Pp Userspace clients can dynamically switch NICs into .Nm @@ -73,8 +94,10 @@ mode and send and receive raw packets through memory mapped buffers. Similarly, .Nm VALE -switch instances and ports, and +switch instances and ports, .Nm netmap pipes +and +.Nm netmap monitors can be created dynamically, providing high speed packet I/O between processes, virtual machines, NICs and the host stack. @@ -89,17 +112,17 @@ and standard OS mechanisms such as .Xr epoll 2 , and .Xr kqueue 2 . -.Nm VALE -and -.Nm netmap pipes +All types of +.Nm netmap ports +and the +.Nm VALE switch are implemented by a single kernel module, which also emulates the .Nm -API over standard drivers for devices without native -.Nm -support. +API over standard drivers. For best performance, .Nm -requires explicit support in device drivers. +requires native support in device drivers. +A list of such devices is at the end of this document. .Pp In the rest of this (long) manual page we document various aspects of the @@ -116,7 +139,7 @@ which can be connected to a physical interface to the host stack, or to a .Nm VALE -switch). +switch. Ports use preallocated circular queues of buffers .Em ( rings ) residing in an mmapped region. @@ -166,16 +189,18 @@ has multiple modes of operation controlled by the .Vt struct nmreq argument. .Va arg.nr_name -specifies the port name, as follows: +specifies the netmap port name, as follows: .Bl -tag -width XXXX .It Dv OS network interface name (e.g. 'em0', 'eth1', ... 
) the data path of the NIC is disconnected from the host stack, and the file descriptor is bound to the NIC (one or all queues), or to the host stack; -.It Dv valeXXX:YYY (arbitrary XXX and YYY) -the file descriptor is bound to port YYY of a VALE switch called XXX, -both dynamically created if necessary. -The string cannot exceed IFNAMSIZ characters, and YYY cannot +.It Dv valeSSS:PPP +the file descriptor is bound to port PPP of VALE switch SSS. +Switch instances and ports are dynamically created if necessary. +.br +Both SSS and PPP have the form [0-9a-zA-Z_]+ , the string +cannot exceed IFNAMSIZ characters, and PPP cannot be the name of any existing OS network interface. .El .Pp @@ -312,9 +337,6 @@ one slot is always kept empty. The ring size .Va ( num_slots ) should not be assumed to be a power of two. -.br -(NOTE: older versions of netmap used head/count format to indicate -the content of a ring). .Pp .Va head is the first slot available to userspace; @@ -585,6 +607,15 @@ it from the host stack. Multiple file descriptors can be bound to the same port, with proper synchronization left to the user. .Pp +The recommended way to bind a file descriptor to a port is +to use function +.Va nm_open(..) +(see +.Xr LIBRARIES ) +which parses names to access specific port types and +enable features. +In the following we document the main features. +.Pp .Dv NIOCREGIF can also bind a file descriptor to one endpoint of a .Em netmap pipe , consisting of two netmap ports with a crossover connection. @@ -734,7 +765,7 @@ similar to binds a file descriptor to a port. .Bl -tag -width XX .It Va ifname -is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a +is a port name, in the form "netmap:PPP" for a NIC and "valeSSS:PPP" for a .Nm VALE port. 
.It Va req @@ -774,28 +805,39 @@ similar to pcap_next(), fetches the next packet natively supports the following devices: .Pp On FreeBSD: +.Xr cxgbe 4 , .Xr em 4 , .Xr igb 4 , .Xr ixgbe 4 , +.Xr ixl 4 , .Xr lem 4 , .Xr re 4 . .Pp On Linux .Xr e1000 4 , .Xr e1000e 4 , +.Xr i40e 4 , .Xr igb 4 , .Xr ixgbe 4 , -.Xr mlx4 4 , -.Xr forcedeth 4 , .Xr r8169 4 . .Pp NICs without native support can still be used in .Nm mode through emulation. Performance is inferior to native netmap -mode but still significantly higher than sockets, and approaching +mode but still significantly higher than various raw socket types +(bpf, PF_PACKET, etc.). +Note that for slow devices (such as 1 Gbit/s and slower NICs, +or several 10 Gbit/s NICs whose hardware is unable that of in-kernel solutions such as Linux's .Xr pktgen . +When emulation is in use, packet sniffer programs such as tcpdump +could see received packets before they are diverted by netmap. This behaviour +is not intentional, being just an artifact of the implementation of emulation. +Note that in case the netmap application subsequently moves packets received +from the emulated adapter onto the host RX ring, the sniffer will intercept +those packets again, since the packets are injected to the host stack as they +were received by the network interface. .Pp Emulation is also available for devices with native netmap support, which can be used for testing or performance comparison. @@ -812,8 +854,12 @@ and module parameters on Linux .Bl -tag -width indent .It Va dev.netmap.admode: 0 Controls the use of native or emulated adapter mode. -0 uses the best available option, 1 forces native and -fails if not available, 2 forces emulated hence never fails. +.br +0 uses the best available option; +.br +1 forces native mode and fails if not available; +.br +2 forces emulated hence never fails. 
.It Va dev.netmap.generic_ringsize: 1024 Ring size used for emulated netmap mode .It Va dev.netmap.generic_mit: 100000 @@ -861,9 +907,9 @@ performance. uses .Xr select 2 , .Xr poll 2 , -.Xr epoll +.Xr epoll 2 and -.Xr kqueue +.Xr kqueue 2 to wake up processes when significant events occur, and .Xr mmap 2 to map memory. @@ -1015,8 +1061,8 @@ e.g. running the following in two different terminals: .Dl pkt-gen -i vale1:b -f tx # sender The same example can be used to test netmap pipes, by simply changing port names, e.g. -.Dl pkt-gen -i vale:x{3 -f rx # receiver on the master side -.Dl pkt-gen -i vale:x}3 -f tx # sender on the slave side +.Dl pkt-gen -i vale2:x{3 -f rx # receiver on the master side +.Dl pkt-gen -i vale2:x}3 -f tx # sender on the slave side .Pp The following command attaches an interface and the host stack to a switch: diff --git a/sys/conf/files b/sys/conf/files index 8e2ce6bb671d..c9bfc2e44fe4 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2187,6 +2187,7 @@ dev/nand/nfc_if.m optional nand dev/ncr/ncr.c optional ncr pci dev/ncv/ncr53c500.c optional ncv dev/ncv/ncr53c500_pccard.c optional ncv pccard +dev/netmap/if_ptnet.c optional netmap dev/netmap/netmap.c optional netmap dev/netmap/netmap_freebsd.c optional netmap dev/netmap/netmap_generic.c optional netmap @@ -2195,6 +2196,7 @@ dev/netmap/netmap_mem2.c optional netmap dev/netmap/netmap_monitor.c optional netmap dev/netmap/netmap_offloadings.c optional netmap dev/netmap/netmap_pipe.c optional netmap +dev/netmap/netmap_pt.c optional netmap dev/netmap/netmap_vale.c optional netmap # compile-with "${NORMAL_C} -Wconversion -Wextra" dev/nfsmb/nfsmb.c optional nfsmb pci diff --git a/sys/dev/netmap/if_ixl_netmap.h b/sys/dev/netmap/if_ixl_netmap.h index 2c7f9be541b3..223dc06e36ab 100644 --- a/sys/dev/netmap/if_ixl_netmap.h +++ b/sys/dev/netmap/if_ixl_netmap.h @@ -59,7 +59,7 @@ extern int ixl_rx_miss, ixl_rx_miss_bufs, ixl_crcstrip; /* * device-specific sysctl variables: * - * ixl_crcstrip: 0: keep CRC 
in rx frames (default), 1: strip it. + * ixl_crcstrip: 0: NIC keeps CRC in rx frames, 1: NIC strips it (default). * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. @@ -73,7 +73,7 @@ SYSCTL_DECL(_dev_netmap); */ #if 0 SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_crcstrip, - CTLFLAG_RW, &ixl_crcstrip, 1, "strip CRC on rx frames"); + CTLFLAG_RW, &ixl_crcstrip, 1, "NIC strips CRC on rx frames"); #endif SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_rx_miss, CTLFLAG_RW, &ixl_rx_miss, 0, "potentially missed rx intr"); diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 0ec9b1346609..1c2afbd18f10 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -81,6 +81,22 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff) } +static void +lem_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + EM_CORE_LOCK(adapter); + if (onoff) { + lem_enable_intr(adapter); + } else { + lem_disable_intr(adapter); + } + EM_CORE_UNLOCK(adapter); +} + + /* * Reconcile kernel and user view of the transmit ring. 
*/ @@ -99,10 +115,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint64_t *csbd = (uint64_t *)(csb + 1); -#endif /* NIC_PARAVIRT */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -113,19 +125,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ -#ifdef NIC_PARAVIRT - int do_kick = 0; - uint64_t t = 0; // timestamp - int n = head - nm_i; - if (n < 0) - n += lim + 1; - if (csb) { - t = rdtsc(); /* last timestamp */ - csbd[16] += t - csbd[0]; /* total Wg */ - csbd[17] += n; /* Wg count */ - csbd[0] = t; - } -#endif /* NIC_PARAVIRT */ nic_i = netmap_idx_k2n(kring, nm_i); while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; @@ -166,38 +165,8 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) { - t = rdtsc(); - if (csb->host_need_txkick == 2) { - /* can compute an update of delta */ - int64_t delta = t - csbd[3]; - if (delta < 0) - delta = -delta; - if (csbd[8] == 0 || delta < csbd[8]) { - csbd[8] = delta; - csbd[9]++; - } - csbd[10]++; - } - csb->guest_tdt = nic_i; - csbd[18] += t - csbd[0]; // total wp - csbd[19] += n; - } - if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1)) - do_kick = 1; - if (do_kick) -#endif /* NIC_PARAVIRT */ /* (re)start the tx unit up to slot nic_i (excluded) */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); -#ifdef NIC_PARAVIRT - if (do_kick) { - uint64_t t1 = rdtsc(); - csbd[20] += t1 - t; // total Np - csbd[21]++; - } -#endif /* NIC_PARAVIRT */ } /* @@ -206,93 +175,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) if (ticks != 
kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { kring->last_reclaim = ticks; /* record completed transmissions using TDH */ -#ifdef NIC_PARAVIRT - /* host updates tdh unconditionally, and we have - * no side effects on reads, so we can read from there - * instead of exiting. - */ - if (csb) { - static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0; - u_int x = adapter->next_tx_to_clean; - csbd[19]++; // XXX count reclaims - nic_i = csb->host_tdh; - if (csb->guest_csb_on) { - if (nic_i == x) { - bad++; - csbd[24]++; // failed reclaims - /* no progress, request kick and retry */ - csb->guest_need_txkick = 1; - mb(); // XXX barrier - nic_i = csb->host_tdh; - } else { - good++; - } - if (nic_i != x) { - csb->guest_need_txkick = 2; - if (nic_i == csb->guest_tdt) - drain++; - else - nodrain++; -#if 1 - if (netmap_adaptive_io) { - /* new mechanism: last half ring (or so) - * released one slot at a time. - * This effectively makes the system spin. - * - * Take next_to_clean + 1 as a reference. - * tdh must be ahead or equal - * On entry, the logical order is - * x < tdh = nic_i - * We first push tdh up to avoid wraps. - * The limit is tdh-ll (half ring). - * if tdh-256 < x we report x; - * else we report tdh-256 - */ - u_int tdh = nic_i; - u_int ll = csbd[15]; - u_int delta = lim/8; - if (netmap_adaptive_io == 2 || ll > delta) - csbd[15] = ll = delta; - else if (netmap_adaptive_io == 1 && ll > 1) { - csbd[15]--; - } - - if (nic_i >= kring->nkr_num_slots) { - RD(5, "bad nic_i %d on input", nic_i); - } - x = nm_next(x, lim); - if (tdh < x) - tdh += lim + 1; - if (tdh <= x + ll) { - nic_i = x; - csbd[25]++; //report n + 1; - } else { - tdh = nic_i; - if (tdh < ll) - tdh += lim + 1; - nic_i = tdh - ll; - csbd[26]++; // report tdh - ll - } - } -#endif - } else { - /* we stop, count whether we are idle or not */ - int bh_active = csb->host_need_txkick & 2 ? 
4 : 0; - csbd[27+ csb->host_need_txkick]++; - if (netmap_adaptive_io == 1) { - if (bh_active && csbd[15] > 1) - csbd[15]--; - else if (!bh_active && csbd[15] < lim/2) - csbd[15]++; - } - bad--; - fail++; - } - } - RD(1, "drain %d nodrain %d good %d retry %d fail %d", - drain, nodrain, good, bad, fail); - } else -#endif /* !NIC_PARAVIRT */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); @@ -324,21 +206,10 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint32_t csb_mode = csb && csb->guest_csb_on; - uint32_t do_host_rxkick = 0; -#endif /* NIC_PARAVIRT */ if (head > lim) return netmap_ring_reinit(kring); -#ifdef NIC_PARAVIRT - if (csb_mode) { - force_update = 1; - csb->guest_need_rxkick = 0; - } -#endif /* NIC_PARAVIRT */ /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -357,23 +228,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) uint32_t staterr = le32toh(curr->status); int len; -#ifdef NIC_PARAVIRT - if (csb_mode) { - if ((staterr & E1000_RXD_STAT_DD) == 0) { - /* don't bother to retry if more than 1 pkt */ - if (n > 1) - break; - csb->guest_need_rxkick = 1; - wmb(); - staterr = le32toh(curr->status); - if ((staterr & E1000_RXD_STAT_DD) == 0) { - break; - } else { /* we are good */ - csb->guest_need_rxkick = 0; - } - } - } else -#endif /* NIC_PARAVIRT */ if ((staterr & E1000_RXD_STAT_DD) == 0) break; len = le16toh(curr->length) - 4; // CRC @@ -390,18 +244,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ -#ifdef NIC_PARAVIRT - if (csb_mode) { - if (n > 1) { - /* leave one spare buffer so we avoid rxkicks */ - nm_i = nm_prev(nm_i, lim); - nic_i = 
nm_prev(nic_i, lim); - n--; - } else { - csb->guest_need_rxkick = 1; - } - } -#endif /* NIC_PARAVIRT */ ND("%d new packets at nic %d nm %d tail %d", n, adapter->next_rx_desc_to_check, @@ -440,10 +282,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) curr->status = 0; bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); -#ifdef NIC_PARAVIRT - if (csb_mode && csb->host_rxkick_at == nic_i) - do_host_rxkick = 1; -#endif /* NIC_PARAVIRT */ nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -455,12 +293,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) - csb->guest_rdt = nic_i; - if (!csb_mode || do_host_rxkick) -#endif /* NIC_PARAVIRT */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } @@ -486,6 +318,7 @@ lem_netmap_attach(struct adapter *adapter) na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; na.num_tx_rings = na.num_rx_rings = 1; + na.nm_intr = lem_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index 0f34e7218503..7986c9965173 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -53,7 +53,7 @@ void ixgbe_netmap_attach(struct adapter *adapter); /* * device-specific sysctl variables: * - * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. + * ix_crcstrip: 0: NIC keeps CRC in rx frames (default), 1: NIC strips it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. 
@@ -65,7 +65,7 @@ SYSCTL_DECL(_dev_netmap); static int ix_rx_miss, ix_rx_miss_bufs; int ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, - CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); + CTLFLAG_RW, &ix_crcstrip, 0, "NIC strips CRC on rx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, @@ -109,6 +109,20 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } +static void +ixgbe_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + IXGBE_CORE_LOCK(adapter); + if (onoff) { + ixgbe_enable_intr(adapter); // XXX maybe ixgbe_stop ? + } else { + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? + } + IXGBE_CORE_UNLOCK(adapter); +} /* * Register/unregister. We are already under netmap lock. @@ -311,7 +325,7 @@ ixgbe_netmap_txsync(struct netmap_kring *kring, int flags) * good way. */ nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_IS_VF(adapter) ? - IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); + IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -486,6 +500,7 @@ ixgbe_netmap_attach(struct adapter *adapter) na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + na.nm_intr = ixgbe_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index aff757bdadfe..d92d342af83c 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,9 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
+ * Copyright (C) 2011-2014 Matteo Landi + * Copyright (C) 2011-2016 Luigi Rizzo + * Copyright (C) 2011-2016 Giuseppe Lettieri + * Copyright (C) 2011-2016 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -133,13 +137,12 @@ ports attached to the switch) * > select()able file descriptor on which events are reported. * * Internally, we allocate a netmap_priv_d structure, that will be - * initialized on ioctl(NIOCREGIF). + * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d + * structure for each open(). * * os-specific: - * FreeBSD: netmap_open (netmap_freebsd.c). The priv is - * per-thread. - * linux: linux_netmap_open (netmap_linux.c). The priv is - * per-open. + * FreeBSD: see netmap_open() (netmap_freebsd.c) + * linux: see linux_netmap_open() (netmap_linux.c) * * > 2. on each descriptor, the process issues an ioctl() to identify * > the interface that should report events to the file descriptor. @@ -299,18 +302,17 @@ ports attached to the switch) * netmap_transmit() * na->nm_notify == netmap_notify() * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat + * kring->nm_sync() == netmap_rxsync_from_host * netmap_rxsync_from_host(na, NULL, NULL) * - tx to host stack * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ?? 
XXX + * nm_os_send_up() + * FreeBSD: na->if_input() == ether_input() * linux: netif_rx() with NM_MAGIC_PRIORITY_RX * * - * * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- * * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach() @@ -319,10 +321,11 @@ ports attached to the switch) * concurrently: * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == generic_netmap_txsync() - * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX - * generic_ndo_start_xmit() - * orig. dev. start_xmit - * FreeBSD: na->if_transmit() == orig. dev if_transmit + * nm_os_generic_xmit_frame() + * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * gna->save_start_xmit == orig. dev. start_xmit + * FreeBSD: na->if_transmit() == orig. dev if_transmit * 2) generic_mbuf_destructor() * na->nm_notify() == netmap_notify() * - rx from netmap userspace: @@ -333,24 +336,15 @@ ports attached to the switch) * generic_rx_handler() * mbq_safe_enqueue() * na->nm_notify() == netmap_notify() - * - rx from host stack: - * concurrently: + * - rx from host stack + * FreeBSD: same as native + * Linux: same as native except: * 1) host stack - * linux: generic_ndo_start_xmit() - * netmap_transmit() - * FreeBSD: ifp->if_input() == netmap_transmit - * both: - * na->nm_notify() == netmap_notify() - * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat - * netmap_rxsync_from_host(na, NULL, NULL) - * - tx to host stack: - * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat - * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ??? 
XXX - * linux: netif_rx() with NM_MAGIC_PRIORITY_RX + * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * netmap_transmit() + * na->nm_notify() == netmap_notify() + * - tx to host stack (same as native): * * * -= VALE =- @@ -371,7 +365,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * - system device with generic support: @@ -384,7 +378,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * (all cases) --> nm_bdg_flush() @@ -407,7 +401,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync_locked() * * - system device with generic adapter: @@ -418,7 +412,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync() * */ @@ -455,29 +449,19 @@ ports attached to the switch) #include <sys/refcount.h> -/* reduce conditional code */ -// linux API, use for the knlist in FreeBSD -/* use a private mutex for the knlist */ -#define init_waitqueue_head(x) do { \ - struct mtx *m = &(x)->m; \ - mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); \ - knlist_init_mtx(&(x)->si.si_note, m); \ - } while (0) - -#define OS_selrecord(a, b) selrecord(a, &((b)->si)) -#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) - #elif defined(linux) #include "bsd_glue.h" - - #elif defined(__APPLE__) #warning OSX support 
is only partial #include "osx_glue.h" +#elif defined (_WIN32) + +#include "win_glue.h" + #else #error Unsupported platform @@ -492,47 +476,72 @@ ports attached to the switch) #include <dev/netmap/netmap_mem2.h> -MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); - /* user-controlled variables */ int netmap_verbose; static int netmap_no_timestamp; /* don't timestamp on rxsync */ - -SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); -SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, - CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); -SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, - CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); int netmap_mitigate = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, - CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); int netmap_txsync_retry = 2; -SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, - &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); - int netmap_adaptive_io = 0; -SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, - &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); - int netmap_flags = 0; /* debug flags */ -int netmap_fwd = 0; /* force transparent mode */ +static int netmap_fwd = 0; /* force transparent mode */ /* * netmap_admode selects the netmap mode to use. * Invalid values are reset to NETMAP_ADMODE_BEST */ -enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; static int netmap_admode = NETMAP_ADMODE_BEST; -int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ -int netmap_generic_ringsize = 1024; /* Generic ringsize. */ -int netmap_generic_rings = 1; /* number of queues in generic. 
*/ +/* netmap_generic_mit controls mitigation of RX notifications for + * the generic netmap adapter. The value is a time interval in + * nanoseconds. */ +int netmap_generic_mit = 100*1000; + +/* We use by default netmap-aware qdiscs with generic netmap adapters, + * even if there can be a little performance hit with hardware NICs. + * However, using the qdisc is the safer approach, for two reasons: + * 1) it prevents non-fifo qdiscs to break the TX notification + * scheme, which is based on mbuf destructors when txqdisc is + * not used. + * 2) it makes it possible to transmit over software devices that + * change skb->dev, like bridge, veth, ... + * + * Anyway users looking for the best performance should + * use native adapters. + */ +int netmap_generic_txqdisc = 1; + +/* Default number of slots and queues for generic adapters. */ +int netmap_generic_ringsize = 1024; +int netmap_generic_rings = 1; + +/* Non-zero if ptnet devices are allowed to use virtio-net headers. */ +int ptnet_vnet_hdr = 1; + +/* + * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated + * in some other operating systems + */ +SYSBEGIN(main_init); + +SYSCTL_DECL(_dev_netmap); +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, + CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); +SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, + &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); +SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, + &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); @@ -540,19 +549,24 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , ""); + +SYSEND; NMG_LOCK_T netmap_global_lock; -int netmap_use_count = 0; /* number of active netmap instances */ /* * mark the ring as stopped, and run through the locks * to make sure other users get to see it. + * stopped must be either NR_KR_STOPPED (for unbounded stop) + * of NR_KR_LOCKED (brief stop for mutual exclusion purposes) */ static void -netmap_disable_ring(struct netmap_kring *kr) +netmap_disable_ring(struct netmap_kring *kr, int stopped) { - kr->nkr_stopped = 1; - nm_kr_get(kr); + nm_kr_stop(kr, stopped); + // XXX check if nm_kr_stop is sufficient mtx_lock(&kr->q_lock); mtx_unlock(&kr->q_lock); nm_kr_put(kr); @@ -563,7 +577,7 @@ void netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped) { if (stopped) - netmap_disable_ring(NMR(na, t) + ring_id); + netmap_disable_ring(NMR(na, t) + ring_id, stopped); else NMR(na, t)[ring_id].nkr_stopped = 0; } @@ -590,13 +604,14 @@ netmap_set_all_rings(struct netmap_adapter *na, int stopped) * Convenience function used in drivers. Waits for current txsync()s/rxsync()s * to finish and prevents any new one from starting. Call this before turning * netmap mode off, or before removing the hardware rings (e.g., on module - * onload). As a rule of thumb for linux drivers, this should be placed near - * each napi_disable(). + * onload). 
*/ void netmap_disable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 1 /* stopped */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), NM_KR_STOPPED); + } } /* @@ -607,9 +622,34 @@ netmap_disable_all_rings(struct ifnet *ifp) void netmap_enable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 0 /* enabled */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), 0 /* enabled */); + } +} + +void +netmap_make_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } } +void +netmap_undo_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + if (na->na_flags & NAF_ZOMBIE) { + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags &= ~NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } + } +} /* * generic bound_checking function @@ -727,28 +767,9 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -static void netmap_txsync_to_host(struct netmap_adapter *na); -static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); - -/* kring->nm_sync callback for the host tx ring */ -static int -netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_txsync_to_host(kring->na); - return 0; -} - -/* kring->nm_sync callback for the host rx ring */ -static int -netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_rxsync_from_host(kring->na, NULL, NULL); - return 0; -} - - +/* nm_sync callbacks for the host rings */ +static int netmap_txsync_to_host(struct netmap_kring *kring, int flags); +static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags); /* create the krings array and initialize the fields common to all adapters. 
* The array layout is this: @@ -809,12 +830,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) kring->ring_id = i; kring->tx = t; kring->nkr_num_slots = ndesc; + kring->nr_mode = NKR_NETMAP_OFF; + kring->nr_pending_mode = NKR_NETMAP_OFF; if (i < nma_get_nrings(na, t)) { kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync); - } else if (i == na->num_tx_rings) { + } else { kring->nm_sync = (t == NR_TX ? - netmap_txsync_to_host_compat : - netmap_rxsync_from_host_compat); + netmap_txsync_to_host: + netmap_rxsync_from_host); } kring->nm_notify = na->nm_notify; kring->rhead = kring->rcur = kring->nr_hwcur = 0; @@ -822,14 +845,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) * IMPORTANT: Always keep one slot empty. */ kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0); - snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, + snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, nm_txrx2str(t), i); ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF); - init_waitqueue_head(&kring->si); + nm_os_selinfo_init(&kring->si); } - init_waitqueue_head(&na->si[t]); + nm_os_selinfo_init(&na->si[t]); } na->tailroom = na->rx_rings + n[NR_RX]; @@ -838,19 +861,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) } -#ifdef __FreeBSD__ -static void -netmap_knlist_destroy(NM_SELINFO_T *si) -{ - /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ - knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); - knlist_destroy(&si->si.si_note); - /* now we don't need the mutex anymore */ - mtx_destroy(&si->m); -} -#endif /* __FreeBSD__ */ - - /* undo the actions performed by netmap_krings_create */ /* call with NMG_LOCK held */ void @@ -860,12 +870,12 @@ netmap_krings_delete(struct netmap_adapter *na) enum txrx t; for_rx_tx(t) - netmap_knlist_destroy(&na->si[t]); + nm_os_selinfo_uninit(&na->si[t]); /* we rely on the krings layout described above */ for ( ; kring != na->tailroom; kring++) { mtx_destroy(&kring->q_lock); - netmap_knlist_destroy(&kring->si); + nm_os_selinfo_uninit(&kring->si); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; @@ -878,14 +888,14 @@ netmap_krings_delete(struct netmap_adapter *na) * them first. */ /* call with NMG_LOCK held */ -static void +void netmap_hw_krings_delete(struct netmap_adapter *na) { struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; ND("destroy sw mbq with len %d", mbq_len(q)); mbq_purge(q); - mbq_safe_destroy(q); + mbq_safe_fini(q); netmap_krings_delete(na); } @@ -898,29 +908,38 @@ netmap_hw_krings_delete(struct netmap_adapter *na) */ /* call with NMG_LOCK held */ static void netmap_unset_ringid(struct netmap_priv_d *); -static void netmap_rel_exclusive(struct netmap_priv_d *); -static void +static void netmap_krings_put(struct netmap_priv_d *); +void netmap_do_unregif(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; NMG_LOCK_ASSERT(); na->active_fds--; - /* release exclusive use if it was requested on regif */ - netmap_rel_exclusive(priv); - if (na->active_fds <= 0) { /* last instance */ - - if (netmap_verbose) - D("deleting last instance for %s", na->name); + /* unset nr_pending_mode and possibly release exclusive mode */ + netmap_krings_put(priv); #ifdef WITH_MONITOR + /* XXX check whether we have to do something with monitor + * when rings change nr_mode. 
*/ + if (na->active_fds <= 0) { /* walk through all the rings and tell any monitor * that the port is going to exit netmap mode */ netmap_monitor_stop(na); + } #endif + + if (na->active_fds <= 0 || nm_kring_pending(priv)) { + na->nm_register(na, 0); + } + + /* delete rings and buffers that are no longer needed */ + netmap_mem_rings_delete(na); + + if (na->active_fds <= 0) { /* last instance */ /* - * (TO CHECK) This function is only called + * (TO CHECK) We enter here * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. @@ -933,16 +952,16 @@ netmap_do_unregif(struct netmap_priv_d *priv) * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(na, 0); /* off, clear flags */ - /* Wake up any sleeping threads. netmap_poll will - * then return POLLERR - * XXX The wake up now must happen during *_down(), when - * we order all activities to stop. -gl - */ - /* delete rings and buffers */ - netmap_mem_rings_delete(na); + if (netmap_verbose) + D("deleting last instance for %s", na->name); + + if (nm_netmap_on(na)) { + D("BUG: netmap on while going to delete the krings"); + } + na->nm_krings_delete(na); } + /* possibily decrement counter of tx_si/rx_si users */ netmap_unset_ringid(priv); /* delete the nifp */ @@ -962,6 +981,20 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } +struct netmap_priv_d* +netmap_priv_new(void) +{ + struct netmap_priv_d *priv; + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return NULL; + priv->np_refs = 1; + nm_os_get_module(); + return priv; +} + /* * Destructor of the netmap_priv_d, called when the fd is closed * Action: undo all the things done by NIOCREGIF, @@ -971,22 +1004,22 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) * */ /* call with NMG_LOCK held */ -int -netmap_dtor_locked(struct 
netmap_priv_d *priv) +void +netmap_priv_delete(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; /* number of active references to this fd */ if (--priv->np_refs > 0) { - return 0; + return; } - netmap_use_count--; - if (!na) { - return 1; //XXX is it correct? + nm_os_put_module(); + if (na) { + netmap_do_unregif(priv); } - netmap_do_unregif(priv); - netmap_adapter_put(na); - return 1; + netmap_unget_na(na, priv->np_ifp); + bzero(priv, sizeof(*priv)); /* for safety */ + free(priv, M_DEVBUF); } @@ -995,15 +1028,10 @@ void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; - int last_instance; NMG_LOCK(); - last_instance = netmap_dtor_locked(priv); + netmap_priv_delete(priv); NMG_UNLOCK(); - if (last_instance) { - bzero(priv, sizeof(*priv)); /* for safety */ - free(priv, M_DEVBUF); - } } @@ -1036,14 +1064,19 @@ static void netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; + struct mbuf *head = NULL, *prev = NULL; /* send packets up, outside the lock */ while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); - NM_SEND_UP(dst, m); + prev = nm_os_send_up(dst, m, prev); + if (head == NULL) + head = prev; } - mbq_destroy(q); + if (head) + nm_os_send_up(dst, NULL, head); + mbq_fini(q); } @@ -1081,6 +1114,27 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) } } +static inline int +_nm_may_forward(struct netmap_kring *kring) +{ + return ((netmap_fwd || kring->ring->flags & NR_FORWARD) && + kring->na->na_flags & NAF_HOST_RINGS && + kring->tx == NR_RX); +} + +static inline int +nm_may_forward_up(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id != kring->na->num_rx_rings; +} + +static inline int +nm_may_forward_down(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id == kring->na->num_rx_rings; +} /* * Send to the NIC rings packets marked NS_FORWARD between @@ -1107,7 +1161,7 
@@ netmap_sw_to_nic(struct netmap_adapter *na) for (; rxcur != head && !nm_ring_empty(rdst); rxcur = nm_next(rxcur, src_lim) ) { struct netmap_slot *src, *dst, tmp; - u_int dst_cur = rdst->cur; + u_int dst_head = rdst->head; src = &rxslot[rxcur]; if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) @@ -1115,7 +1169,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) sent++; - dst = &rdst->slot[dst_cur]; + dst = &rdst->slot[dst_head]; tmp = *src; @@ -1126,7 +1180,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - rdst->cur = nm_next(dst_cur, dst_lim); + rdst->head = rdst->cur = nm_next(dst_head, dst_lim); } /* if (sent) XXX txsync ? */ } @@ -1140,10 +1194,10 @@ netmap_sw_to_nic(struct netmap_adapter *na) * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void -netmap_txsync_to_host(struct netmap_adapter *na) +static int +netmap_txsync_to_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; + struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct mbq q; @@ -1162,6 +1216,7 @@ netmap_txsync_to_host(struct netmap_adapter *na) kring->nr_hwtail -= lim + 1; netmap_send_up(na->ifp, &q); + return 0; } @@ -1171,17 +1226,15 @@ netmap_txsync_to_host(struct netmap_adapter *na) * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler - * (we know because td != NULL). + * (we know because sr != NULL). * - * NOTE: on linux, selrecord() is defined as a macro and uses pwait - * as an additional hidden argument. 
* returns the number of packets delivered to tx queues in * transparent mode, or a negative value if error */ static int -netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) +netmap_rxsync_from_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; + struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; @@ -1189,9 +1242,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai int ret = 0; struct mbq *q = &kring->rx_queue, fq; - (void)pwait; /* disable unused warnings */ - (void)td; - mbq_init(&fq); /* fq holds packets to be freed */ mbq_lock(q); @@ -1226,19 +1276,20 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* something was released */ - if (netmap_fwd || kring->ring->flags & NR_FORWARD) + if (nm_may_forward_down(kring)) { ret = netmap_sw_to_nic(na); + if (ret > 0) { + kring->nr_kflags |= NR_FORWARD; + ret = 0; + } + } kring->nr_hwcur = head; } - /* access copies of cur,tail in the kring */ - if (kring->rcur == kring->rtail && td) /* no bufs available */ - OS_selrecord(td, &kring->si); - mbq_unlock(q); mbq_purge(&fq); - mbq_destroy(&fq); + mbq_fini(&fq); return ret; } @@ -1267,17 +1318,14 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * */ - +static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) { /* generic support */ int i = netmap_admode; /* Take a snapshot. 
*/ struct netmap_adapter *prev_na; -#ifdef WITH_GENERIC - struct netmap_generic_adapter *gna; int error = 0; -#endif *na = NULL; /* default */ @@ -1285,7 +1333,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) i = netmap_admode = NETMAP_ADMODE_BEST; - if (NETMAP_CAPABLE(ifp)) { + if (NM_NA_VALID(ifp)) { prev_na = NA(ifp); /* If an adapter already exists, return it if * there are active file descriptors or if @@ -1310,10 +1358,9 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) /* If there isn't native support and netmap is not allowed * to use generic adapters, we cannot satisfy the request. */ - if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE) return EOPNOTSUPP; -#ifdef WITH_GENERIC /* Otherwise, create a generic adapter and return it, * saving the previously used netmap adapter, if any. * @@ -1328,25 +1375,12 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * the branches above. This ensures that we never override * a generic adapter with another generic adapter. */ - prev_na = NA(ifp); error = generic_netmap_attach(ifp); if (error) return error; *na = NA(ifp); - gna = (struct netmap_generic_adapter*)NA(ifp); - gna->prev = prev_na; /* save old na */ - if (prev_na != NULL) { - ifunit_ref(ifp->if_xname); - // XXX add a refcount ? - netmap_adapter_get(prev_na); - } - ND("Created generic NA %p (prev %p)", gna, gna->prev); - return 0; -#else /* !WITH_GENERIC */ - return EOPNOTSUPP; -#endif } @@ -1364,21 +1398,22 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * could not be allocated. * If successful, hold a reference to the netmap adapter. * - * No reference is kept on the real interface, which may then - * disappear at any time. + * If the interface specified by nmr is a system one, also keep + * a reference to it and return a valid *ifp. 
*/ int -netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create) { - struct ifnet *ifp = NULL; int error = 0; struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ + *ifp = NULL; NMG_LOCK_ASSERT(); - /* we cascade through all possible types of netmap adapter. + /* We cascade through all possible types of netmap adapter. * All netmap_get_*_na() functions return an error and an na, * with the following combinations: * @@ -1389,6 +1424,11 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * !0 !NULL impossible */ + /* try to see if this is a ptnetmap port */ + error = netmap_get_pt_host_na(nmr, na, create); + if (error || *na != NULL) + return error; + /* try to see if this is a monitor port */ error = netmap_get_monitor_na(nmr, na, create); if (error || *na != NULL) @@ -1413,12 +1453,12 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * This may still be a tap, a veth/epair, or even a * persistent VALE port. 
*/ - ifp = ifunit_ref(nmr->nr_name); - if (ifp == NULL) { + *ifp = ifunit_ref(nmr->nr_name); + if (*ifp == NULL) { return ENXIO; } - error = netmap_get_hw_na(ifp, &ret); + error = netmap_get_hw_na(*ifp, &ret); if (error) goto out; @@ -1426,15 +1466,42 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) netmap_adapter_get(ret); out: - if (error && ret != NULL) - netmap_adapter_put(ret); - - if (ifp) - if_rele(ifp); /* allow live unloading of drivers modules */ + if (error) { + if (ret) + netmap_adapter_put(ret); + if (*ifp) { + if_rele(*ifp); + *ifp = NULL; + } + } return error; } +/* undo netmap_get_na() */ +void +netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp) +{ + if (ifp) + if_rele(ifp); + if (na) + netmap_adapter_put(na); +} + + +#define NM_FAIL_ON(t) do { \ + if (unlikely(t)) { \ + RD(5, "%s: fail '" #t "' " \ + "h %d c %d t %d " \ + "rh %d rc %d rt %d " \ + "hc %d ht %d", \ + kring->name, \ + head, cur, ring->tail, \ + kring->rhead, kring->rcur, kring->rtail, \ + kring->nr_hwcur, kring->nr_hwtail); \ + return kring->nkr_num_slots; \ + } \ +} while (0) /* * validate parameters on entry for *_txsync() @@ -1449,11 +1516,9 @@ out: * * hwcur, rhead, rtail and hwtail are reliable */ -static u_int -nm_txsync_prologue(struct netmap_kring *kring) +u_int +nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { -#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; } - struct netmap_ring *ring = kring->ring; u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ u_int n = kring->nkr_num_slots; @@ -1463,35 +1528,34 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->nr_hwcur, kring->nr_hwtail, ring->head, ring->cur, ring->tail); #if 1 /* kernel sanity checks; but we can trust the kring. 
*/ - if (kring->nr_hwcur >= n || kring->rhead >= n || - kring->rtail >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* - * user sanity checks. We only use 'cur', - * A, B, ... are possible positions for cur: + * user sanity checks. We only use head, + * A, B, ... are possible positions for head: * - * 0 A cur B tail C n-1 - * 0 D tail E cur F n-1 + * 0 A rhead B rtail C n-1 + * 0 D rtail E rhead F n-1 * * B, F, D are valid. A, C, E are wrong */ if (kring->rtail >= kring->rhead) { /* want rhead <= head <= rtail */ - NM_ASSERT(head < kring->rhead || head > kring->rtail); + NM_FAIL_ON(head < kring->rhead || head > kring->rtail); /* and also head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* here rtail < rhead */ /* we need head outside rtail .. rhead */ - NM_ASSERT(head > kring->rtail && head < kring->rhead); + NM_FAIL_ON(head > kring->rtail && head < kring->rhead); /* two cases now: head <= rtail or head >= rhead */ if (head <= kring->rtail) { /* want head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* head >= rhead */ /* cur must be outside rtail..head */ - NM_ASSERT(cur > kring->rtail && cur < head); + NM_FAIL_ON(cur > kring->rtail && cur < head); } } if (ring->tail != kring->rtail) { @@ -1502,15 +1566,6 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->rhead = head; kring->rcur = cur; return head; - -error: - RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d", - kring->name, - head, cur, ring->tail, - kring->rhead, kring->rcur, kring->rtail, - kring->nr_hwcur, kring->nr_hwtail); - return n; -#undef NM_ASSERT } @@ -1525,10 +1580,9 @@ error: * hwcur and hwtail are reliable. 
* */ -static u_int -nm_rxsync_prologue(struct netmap_kring *kring) +u_int +nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { - struct netmap_ring *ring = kring->ring; uint32_t const n = kring->nkr_num_slots; uint32_t head, cur; @@ -1546,30 +1600,24 @@ nm_rxsync_prologue(struct netmap_kring *kring) cur = kring->rcur = ring->cur; /* read only once */ head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* user sanity checks */ if (kring->nr_hwtail >= kring->nr_hwcur) { /* want hwcur <= rhead <= hwtail */ - if (head < kring->nr_hwcur || head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail); /* and also rhead <= rcur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* we need rhead outside hwtail..hwcur */ - if (head < kring->nr_hwcur && head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail); /* two cases now: head <= hwtail or head >= hwcur */ if (head <= kring->nr_hwtail) { /* want head <= cur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* cur must be outside hwtail..head */ - if (cur < head && cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head && cur > kring->nr_hwtail); } } if (ring->tail != kring->rtail) { @@ -1579,13 +1627,6 @@ nm_rxsync_prologue(struct netmap_kring *kring) ring->tail = kring->rtail; } return head; - -error: - RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", - kring->nr_hwcur, - kring->rcur, kring->nr_hwtail, - kring->rhead, kring->rcur, ring->tail); - return n; } @@ -1659,6 +1700,7 @@ netmap_interp_ringid(struct netmap_priv_d 
*priv, uint16_t ringid, uint32_t flags struct netmap_adapter *na = priv->np_na; u_int j, i = ringid & NETMAP_RING_MASK; u_int reg = flags & NR_REG_MASK; + int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY }; enum txrx t; if (reg == NR_REG_DEFAULT) { @@ -1672,48 +1714,58 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags } D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - switch (reg) { - case NR_REG_ALL_NIC: - case NR_REG_PIPE_MASTER: - case NR_REG_PIPE_SLAVE: - for_rx_tx(t) { + + if ((flags & NR_PTNETMAP_HOST) && (reg != NR_REG_ALL_NIC || + flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) { + D("Error: only NR_REG_ALL_NIC supported with netmap passthrough"); + return EINVAL; + } + + for_rx_tx(t) { + if (flags & excluded_direction[t]) { + priv->np_qfirst[t] = priv->np_qlast[t] = 0; + continue; + } + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: priv->np_qfirst[t] = 0; priv->np_qlast[t] = nma_get_nrings(na, t); - } - ND("%s %d %d", "ALL/PIPE", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_SW: - case NR_REG_NIC_SW: - if (!(na->na_flags & NAF_HOST_RINGS)) { - D("host rings not supported"); - return EINVAL; - } - for_rx_tx(t) { + ND("ALL/PIPE: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } priv->np_qfirst[t] = (reg == NR_REG_SW ? nma_get_nrings(na, t) : 0); priv->np_qlast[t] = nma_get_nrings(na, t) + 1; - } - ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_ONE_NIC: - if (i >= na->num_tx_rings && i >= na->num_rx_rings) { - D("invalid ring id %d", i); - return EINVAL; - } - for_rx_tx(t) { + ND("%s: %s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", + nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } /* if not enough rings, use the first one */ j = i; if (j >= nma_get_nrings(na, t)) j = 0; priv->np_qfirst[t] = j; priv->np_qlast[t] = j + 1; + ND("ONE_NIC: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } - break; - default: - D("invalid regif type %d", reg); - return EINVAL; } priv->np_flags = (flags & ~NR_REG_MASK) | reg; @@ -1776,11 +1828,12 @@ netmap_unset_ringid(struct netmap_priv_d *priv) } -/* check that the rings we want to bind are not exclusively owned by a previous - * bind. If exclusive ownership has been requested, we also mark the rings. +/* Set the nr_pending_mode for the requested rings. + * If requested, also try to get exclusive access to the rings, provided + * the rings we want to bind are not exclusively owned by a previous bind. */ static int -netmap_get_exclusive(struct netmap_priv_d *priv) +netmap_krings_get(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1811,16 +1864,16 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } } - /* second round: increment usage cound and possibly - * mark as exclusive + /* second round: increment usage count (possibly marking them + * as exclusive) and set the nr_pending_mode */ - for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; kring->users++; if (excl) kring->nr_kflags |= NKR_EXCLUSIVE; + kring->nr_pending_mode = NKR_NETMAP_ON; } } @@ -1828,9 +1881,11 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } -/* undo netmap_get_ownership() */ +/* Undo netmap_krings_get(). This is done by clearing the exclusive mode + * if was asked on regif, and unset the nr_pending_mode if we are the + * last users of the involved rings. 
*/ static void -netmap_rel_exclusive(struct netmap_priv_d *priv) +netmap_krings_put(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1852,6 +1907,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) if (excl) kring->nr_kflags &= ~NKR_EXCLUSIVE; kring->users--; + if (kring->users == 0) + kring->nr_pending_mode = NKR_NETMAP_OFF; } } } @@ -1899,9 +1956,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * (put the adapter in netmap mode) * * This may be one of the following: - * (XXX these should be either all *_register or all *_reg 2014-03-15) * - * * netmap_hw_register (hw ports) + * * netmap_hw_reg (hw ports) * checks that the ifp is still there, then calls * the hardware specific callback; * @@ -1919,7 +1975,7 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * intercept the sync callbacks of the monitored * rings * - * * netmap_bwrap_register (bwraps) + * * netmap_bwrap_reg (bwraps) * cross-link the bwrap and hwna rings, * forward the request to the hwna, override * the hwna notify callback (to get the frames @@ -1948,7 +2004,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (na->active_fds == 0) { /* * If this is the first registration of the adapter, - * also create the netmap rings and their in-kernel view, + * create the in-kernel view of the netmap rings, * the netmap krings. 
*/ @@ -1960,39 +2016,48 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (error) goto err_drop_mem; - /* create all missing netmap rings */ - error = netmap_mem_rings_create(na); - if (error) - goto err_del_krings; } - /* now the kring must exist and we can check whether some - * previous bind has exclusive ownership on them + /* now the krings must exist and we can check whether some + * previous bind has exclusive ownership on them, and set + * nr_pending_mode */ - error = netmap_get_exclusive(priv); + error = netmap_krings_get(priv); if (error) - goto err_del_rings; + goto err_del_krings; + + /* create all needed missing netmap rings */ + error = netmap_mem_rings_create(na); + if (error) + goto err_rel_excl; /* in all cases, create a new netmap if */ nifp = netmap_mem_if_new(na); if (nifp == NULL) { error = ENOMEM; - goto err_rel_excl; + goto err_del_rings; } - na->active_fds++; - if (!nm_netmap_on(na)) { - /* Netmap not active, set the card in netmap mode - * and make it use the shared buffers. - */ + if (na->active_fds == 0) { /* cache the allocator info in the na */ - netmap_mem_get_lut(na->nm_mem, &na->na_lut); - ND("%p->na_lut == %p", na, na->na_lut.lut); - error = na->nm_register(na, 1); /* mode on */ - if (error) + error = netmap_mem_get_lut(na->nm_mem, &na->na_lut); + if (error) goto err_del_if; + ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal, + na->na_lut.objsize); } + if (nm_kring_pending(priv)) { + /* Some kring is switching mode, tell the adapter to + * react on this. */ + error = na->nm_register(na, 1); + if (error) + goto err_put_lut; + } + + /* Commit the reference. */ + na->active_fds++; + /* * advertise that the interface is ready by setting np_nifp. 
* The barrier is needed because readers (poll, *SYNC and mmap) @@ -2003,15 +2068,15 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, return 0; +err_put_lut: + if (na->active_fds == 0) + memset(&na->na_lut, 0, sizeof(na->na_lut)); err_del_if: - memset(&na->na_lut, 0, sizeof(na->na_lut)); - na->active_fds--; netmap_mem_if_delete(na, nifp); err_rel_excl: - netmap_rel_exclusive(priv); + netmap_krings_put(priv); err_del_rings: - if (na->active_fds == 0) - netmap_mem_rings_delete(na); + netmap_mem_rings_delete(na); err_del_krings: if (na->active_fds == 0) na->nm_krings_delete(na); @@ -2024,41 +2089,23 @@ err: /* - * update kring and ring at the end of txsync. + * update kring and ring at the end of rxsync/txsync. */ static inline void -nm_txsync_finalize(struct netmap_kring *kring) +nm_sync_finalize(struct netmap_kring *kring) { - /* update ring tail to what the kernel knows */ + /* + * Update ring tail to what the kernel knows + * After txsync: head/rhead/hwcur might be behind cur/rcur + * if no carrier. + */ kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* note, head/rhead/hwcur might be behind cur/rcur - * if no carrier - */ ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail); } - -/* - * update kring and ring at the end of rxsync - */ -static inline void -nm_rxsync_finalize(struct netmap_kring *kring) -{ - /* tell userspace that there might be new packets */ - //struct netmap_ring *ring = kring->ring; - ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, - kring->nr_hwtail); - kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* make a copy of the state for next round */ - kring->rhead = kring->ring->head; - kring->rcur = kring->ring->cur; -} - - - /* * ioctl(2) support for the "netmap" device. * @@ -2072,21 +2119,17 @@ nm_rxsync_finalize(struct netmap_kring *kring) * Return 0 on success, errno otherwise. 
*/ int -netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, - int fflag, struct thread *td) +netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td) { - struct netmap_priv_d *priv = NULL; struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; - int error; + struct ifnet *ifp = NULL; + int error = 0; u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; enum txrx t; - (void)dev; /* UNUSED */ - (void)fflag; /* UNUSED */ - if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; @@ -2101,15 +2144,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return EINVAL; } } - CURVNET_SET(TD_TO_VNET(td)); - - error = devfs_get_cdevpriv((void **)&priv); - if (error) { - CURVNET_RESTORE(); - /* XXX ENOENT should be impossible, since the priv - * is now created in the open */ - return (error == ENOENT ? ENXIO : error); - } switch (cmd) { case NIOCGINFO: /* return capabilities etc */ @@ -2125,10 +2159,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, u_int memflags; if (nmr->nr_name[0] != '\0') { + /* get a refcount */ - error = netmap_get_na(nmr, &na, 1 /* create */); - if (error) + error = netmap_get_na(nmr, &na, &ifp, 1 /* create */); + if (error) { + na = NULL; + ifp = NULL; break; + } nmd = na->nm_mem; /* get memory allocator */ } @@ -2145,8 +2183,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - netmap_adapter_put(na); } while (0); + netmap_unget_na(na, ifp); NMG_UNLOCK(); break; @@ -2156,9 +2194,25 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH || i == NETMAP_BDG_VNET_HDR || i == NETMAP_BDG_NEWIF - || i == NETMAP_BDG_DELIF) { + || i == NETMAP_BDG_DELIF + || i == NETMAP_BDG_POLLING_ON + || i == NETMAP_BDG_POLLING_OFF) { error = 
netmap_bdg_ctl(nmr, NULL); break; + } else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) { + error = ptnetmap_ctl(nmr, priv->np_na); + break; + } else if (i == NETMAP_VNET_HDR_GET) { + struct ifnet *ifp; + + NMG_LOCK(); + error = netmap_get_na(nmr, &na, &ifp, 0); + if (na && !error) { + nmr->nr_arg1 = na->virt_hdr_len; + } + netmap_unget_na(na, ifp); + NMG_UNLOCK(); + break; } else if (i != 0) { D("nr_cmd must be 0 not %d", i); error = EINVAL; @@ -2169,23 +2223,32 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, NMG_LOCK(); do { u_int memflags; + struct ifnet *ifp; if (priv->np_nifp != NULL) { /* thread already registered */ error = EBUSY; break; } /* find the interface and a reference */ - error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, &ifp, + 1 /* create */); /* keep reference */ if (error) break; if (NETMAP_OWNED_BY_KERN(na)) { - netmap_adapter_put(na); + netmap_unget_na(na, ifp); error = EBUSY; break; } + + if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) { + netmap_unget_na(na, ifp); + error = EIO; + break; + } + error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags); if (error) { /* reg. 
failed, release priv and ref */ - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } nifp = priv->np_nifp; @@ -2200,7 +2263,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, &nmr->nr_arg2); if (error) { netmap_do_unregif(priv); - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } if (memflags & NETMAP_MEM_PRIVATE) { @@ -2212,12 +2275,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } if (nmr->nr_arg3) { - D("requested %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("requested %d extra buffers", nmr->nr_arg3); nmr->nr_arg3 = netmap_extra_alloc(na, &nifp->ni_bufs_head, nmr->nr_arg3); - D("got %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("got %d extra buffers", nmr->nr_arg3); } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); + + /* store ifp reference so that priv destructor may release it */ + priv->np_ifp = ifp; } while (0); NMG_UNLOCK(); break; @@ -2240,11 +2308,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - if (!nm_netmap_on(na)) { - error = ENXIO; - break; - } - t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX); krings = NMR(na, t); qfirst = priv->np_qfirst[t]; @@ -2252,31 +2315,34 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; - if (nm_kr_tryget(kring)) { - error = EBUSY; - goto out; + struct netmap_ring *ring = kring->ring; + + if (unlikely(nm_kr_tryget(kring, 1, &error))) { + error = (error ? 
EIO : 0); + continue; } + if (cmd == NIOCTXSYNC) { if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) { - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); } else { - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) { - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); } - microtime(&na->rx_rings[i].ring->ts); + microtime(&ring->ts); } nm_kr_put(kring); } @@ -2323,9 +2389,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EOPNOTSUPP; #endif /* linux */ } -out: - CURVNET_RESTORE(); return (error); } @@ -2345,17 +2409,15 @@ out: * hidden argument. */ int -netmap_poll(struct cdev *dev, int events, struct thread *td) +netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr) { - struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; struct netmap_kring *kring; + struct netmap_ring *ring; u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; #define want_tx want[NR_TX] #define want_rx want[NR_RX] struct mbq q; /* packets from hw queues to host stack */ - void *pwait = dev; /* linux compatibility */ - int is_kevent = 0; enum txrx t; /* @@ -2365,23 +2427,13 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) */ int retry_tx = 1, retry_rx = 1; - (void)pwait; - mbq_init(&q); - - /* - * XXX kevent has curthread->tp_fop == NULL, - * so devfs_get_cdevpriv() fails. 
We circumvent this by passing - * priv as the first argument, which is also useful to avoid - * the selrecord() which are not necessary in that case. + /* transparent mode: send_down is 1 if we have found some + * packets to forward during the rx scan and we have not + * sent them down to the nic yet */ - if (devfs_get_cdevpriv((void **)&priv) != 0) { - is_kevent = 1; - if (netmap_verbose) - D("called from kevent"); - priv = (struct netmap_priv_d *)dev; - } - if (priv == NULL) - return POLLERR; + int send_down = 0; + + mbq_init(&q); if (priv->np_nifp == NULL) { D("No if registered"); @@ -2399,7 +2451,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - /* * check_all_{tx|rx} are set if the card has more than one queue AND * the file descriptor is bound to all of them. If so, we sleep on @@ -2421,6 +2472,32 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * slots available. If this fails, then lock and call the sync * routines. 
*/ +#if 1 /* new code- call rx if any of the ring needs to release or read buffers */ + if (want_tx) { + t = NR_TX; + for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { + revents |= want[t]; + want[t] = 0; /* also breaks the loop */ + } + } + } + if (want_rx) { + want_rx = 0; /* look for a reason to run the handlers */ + t = NR_RX; + for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */ + || kring->rhead != kring->ring->head /* release buffers */) { + want_rx = 1; + } + } + if (!want_rx) + revents |= events & (POLLIN | POLLRDNORM); /* we have data */ + } +#else /* old code */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; @@ -2431,6 +2508,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } } } +#endif /* old code */ /* * If we want to push packets out (priv->np_txpoll) or @@ -2447,32 +2525,26 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_RX]; i++) { + for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) { int found = 0; kring = &na->tx_rings[i]; - if (!want_tx && kring->ring->cur == kring->nr_hwcur) + ring = kring->ring; + + if (!send_down && !want_tx && ring->cur == kring->nr_hwcur) continue; - /* only one thread does txsync */ - if (nm_kr_tryget(kring)) { - /* either busy or stopped - * XXX if the ring is stopped, sleeping would - * be better. 
In current code, however, we only - * stop the rings for brief intervals (2014-03-14) - */ - if (netmap_verbose) - RD(2, "%p lost race on txring %d, ok", - priv, i); + + if (nm_kr_tryget(kring, 1, &revents)) continue; - } - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } else { if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } /* @@ -2489,8 +2561,10 @@ flush_tx: kring->nm_notify(kring, 0); } } - if (want_tx && retry_tx && !is_kevent) { - OS_selrecord(td, check_all_tx ? + /* if there were any packet to forward we must have handled them by now */ + send_down = 0; + if (want_tx && retry_tx && sr) { + nm_os_selrecord(sr, check_all_tx ? &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si); retry_tx = 0; goto flush_tx; @@ -2502,22 +2576,18 @@ flush_tx: * Do it on all rings because otherwise we starve. */ if (want_rx) { - int send_down = 0; /* transparent mode */ /* two rounds here for race avoidance */ do_retry_rx: for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) { int found = 0; kring = &na->rx_rings[i]; + ring = kring->ring; - if (nm_kr_tryget(kring)) { - if (netmap_verbose) - RD(2, "%p lost race on rxring %d, ok", - priv, i); + if (unlikely(nm_kr_tryget(kring, 1, &revents))) continue; - } - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } @@ -2526,22 +2596,22 @@ do_retry_rx: /* * transparent mode support: collect packets * from the rxring(s). 
- * XXX NR_FORWARD should only be read on - * physical or NIC ports */ - if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { + if (nm_may_forward_up(kring)) { ND(10, "forwarding some buffers up %d to %d", - kring->nr_hwcur, kring->ring->cur); + kring->nr_hwcur, ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } + kring->nr_kflags &= ~NR_FORWARD; if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); + send_down |= (kring->nr_kflags & NR_FORWARD); /* host ring only */ if (netmap_no_timestamp == 0 || - kring->ring->flags & NR_TIMESTAMP) { - microtime(&kring->ring->ts); + ring->flags & NR_TIMESTAMP) { + microtime(&ring->ts); } found = kring->rcur != kring->rtail; nm_kr_put(kring); @@ -2552,22 +2622,10 @@ do_retry_rx: } } - /* transparent mode XXX only during first pass ? */ - if (na->na_flags & NAF_HOST_RINGS) { - kring = &na->rx_rings[na->num_rx_rings]; - if (check_all_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { - /* XXX fix to use kring fields */ - if (nm_ring_empty(kring->ring)) - send_down = netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; - } - } - - if (retry_rx && !is_kevent) - OS_selrecord(td, check_all_rx ? + if (retry_rx && sr) { + nm_os_selrecord(sr, check_all_rx ? &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si); + } if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) @@ -2582,15 +2640,14 @@ do_retry_rx: * kring->nr_hwcur and ring->head * are passed to the other endpoint. * - * In this mode we also scan the sw rxring, which in - * turn passes packets up. - * - * XXX Transparent mode at the moment requires to bind all + * Transparent mode requires to bind all * rings to a single file descriptor. 
*/ - if (q.head && na->ifp != NULL) + if (q.head && !nm_kr_tryget(&na->tx_rings[na->num_tx_rings], 1, &revents)) { netmap_send_up(na->ifp, &q); + nm_kr_put(&na->tx_rings[na->num_tx_rings]); + } return (revents); #undef want_tx @@ -2600,8 +2657,6 @@ do_retry_rx: /*-------------------- driver support routines -------------------*/ -static int netmap_hw_krings_create(struct netmap_adapter *); - /* default notify callback */ static int netmap_notify(struct netmap_kring *kring, int flags) @@ -2609,51 +2664,51 @@ netmap_notify(struct netmap_kring *kring, int flags) struct netmap_adapter *na = kring->na; enum txrx t = kring->tx; - OS_selwakeup(&kring->si, PI_NET); + nm_os_selwakeup(&kring->si); /* optimization: avoid a wake up on the global * queue if nobody has registered for more * than one ring */ if (na->si_users[t] > 0) - OS_selwakeup(&na->si[t], PI_NET); + nm_os_selwakeup(&na->si[t]); - return 0; + return NM_IRQ_COMPLETED; } +#if 0 +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, +enum txrx tx, int flags) +{ + if (tx == NR_TX) { + KeSetEvent(notes->TX_EVENT, 0, FALSE); + } + else + { + KeSetEvent(notes->RX_EVENT, 0, FALSE); + } + return 0; +} +#endif /* called by all routines that create netmap_adapters. - * Attach na to the ifp (if any) and provide defaults - * for optional callbacks. Defaults assume that we - * are creating an hardware netmap_adapter. + * provide some defaults and get a reference to the + * memory allocator */ int netmap_attach_common(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { D("%s: invalid rings tx %d rx %d", na->name, na->num_tx_rings, na->num_rx_rings); return EINVAL; } - /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, - * pipes, monitors). For bwrap we actually have a non-null ifp for - * use by the external modules, but that is set after this - * function has been called. 
- * XXX this is ugly, maybe split this function in two (2014-03-14) - */ - if (ifp != NULL) { - WNA(ifp) = na; - /* the following is only needed for na that use the host port. - * XXX do we have something similar for linux ? - */ #ifdef __FreeBSD__ - na->if_input = ifp->if_input; /* for netmap_send_up */ -#endif /* __FreeBSD__ */ - - NETMAP_SET_CAPABLE(ifp); + if (na->na_flags & NAF_HOST_RINGS && na->ifp) { + na->if_input = na->ifp->if_input; /* for netmap_send_up */ } +#endif /* __FreeBSD__ */ if (na->nm_krings_create == NULL) { /* we assume that we have been called by a driver, * since other port types all provide their own @@ -2677,6 +2732,7 @@ netmap_attach_common(struct netmap_adapter *na) */ na->nm_bdg_attach = netmap_bwrap_attach; #endif + return 0; } @@ -2685,9 +2741,6 @@ netmap_attach_common(struct netmap_adapter *na) void netmap_detach_common(struct netmap_adapter *na) { - if (na->ifp != NULL) - WNA(na->ifp) = NULL; /* XXX do we need this? */ - if (na->tx_rings) { /* XXX should not happen */ D("freeing leftover tx_rings"); na->nm_krings_delete(na); @@ -2699,31 +2752,52 @@ netmap_detach_common(struct netmap_adapter *na) free(na, M_DEVBUF); } -/* Wrapper for the register callback provided hardware drivers. - * na->ifp == NULL means the driver module has been +/* Wrapper for the register callback provided netmap-enabled + * hardware drivers. + * nm_iszombie(na) means that the driver module has been * unloaded, so we cannot call into it. - * Note that module unloading, in our patched linux drivers, - * happens under NMG_LOCK and after having stopped all the - * nic rings (see netmap_detach). This provides sufficient - * protection for the other driver-provied callbacks - * (i.e., nm_config and nm_*xsync), that therefore don't need - * to wrapped. + * nm_os_ifnet_lock() must guarantee mutual exclusion with + * module unloading. 
*/ static int -netmap_hw_register(struct netmap_adapter *na, int onoff) +netmap_hw_reg(struct netmap_adapter *na, int onoff) { struct netmap_hw_adapter *hwna = (struct netmap_hw_adapter*)na; + int error = 0; + + nm_os_ifnet_lock(); + + if (nm_iszombie(na)) { + if (onoff) { + error = ENXIO; + } else if (na != NULL) { + na->na_flags &= ~NAF_NETMAP_ON; + } + goto out; + } + + error = hwna->nm_hw_register(na, onoff); - if (na->ifp == NULL) - return onoff ? ENXIO : 0; +out: + nm_os_ifnet_unlock(); - return hwna->nm_hw_register(na, onoff); + return error; +} + +static void +netmap_hw_dtor(struct netmap_adapter *na) +{ + if (nm_iszombie(na) || na->ifp == NULL) + return; + + WNA(na->ifp) = NULL; } /* - * Initialize a ``netmap_adapter`` object created by driver on attach. + * Allocate a ``netmap_adapter`` object, and initialize it from the + * 'arg' passed by the driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number * of hardware rings): @@ -2732,29 +2806,31 @@ netmap_hw_register(struct netmap_adapter *na, int onoff) * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. */ -int -netmap_attach(struct netmap_adapter *arg) +static int +_netmap_attach(struct netmap_adapter *arg, size_t size) { struct netmap_hw_adapter *hwna = NULL; - // XXX when is arg == NULL ? - struct ifnet *ifp = arg ? 
arg->ifp : NULL; + struct ifnet *ifp = NULL; - if (arg == NULL || ifp == NULL) + if (arg == NULL || arg->ifp == NULL) goto fail; - hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + ifp = arg->ifp; + hwna = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); if (hwna == NULL) goto fail; hwna->up = *arg; hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE; strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); hwna->nm_hw_register = hwna->up.nm_register; - hwna->up.nm_register = netmap_hw_register; + hwna->up.nm_register = netmap_hw_reg; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; } netmap_adapter_get(&hwna->up); + NM_ATTACH_NA(ifp, &hwna->up); + #ifdef linux if (ifp->netdev_ops) { /* prepare a clone of the netdev ops */ @@ -2762,7 +2838,7 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else hwna->nm_ndo = *ifp->netdev_ops; -#endif +#endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */ } hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; if (ifp->ethtool_ops) { @@ -2771,11 +2847,14 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS hwna->nm_eto.set_channels = linux_netmap_set_channels; -#endif +#endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */ if (arg->nm_config == NULL) { hwna->up.nm_config = netmap_linux_config; } #endif /* linux */ + if (arg->nm_dtor == NULL) { + hwna->up.nm_dtor = netmap_hw_dtor; + } if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n", hwna->up.num_tx_rings, hwna->up.num_tx_desc, @@ -2784,12 +2863,57 @@ netmap_attach(struct netmap_adapter *arg) fail: D("fail, arg %p ifp %p na %p", arg, ifp, hwna); - if (ifp) - netmap_detach(ifp); return (hwna ? 
EINVAL : ENOMEM); } +int +netmap_attach(struct netmap_adapter *arg) +{ + return _netmap_attach(arg, sizeof(struct netmap_hw_adapter)); +} + + +#ifdef WITH_PTNETMAP_GUEST +int +netmap_pt_guest_attach(struct netmap_adapter *arg, + void *csb, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_pt_guest_adapter *ptna; + struct ifnet *ifp = arg ? arg->ifp : NULL; + int error; + + /* get allocator */ + arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, ptctl); + if (arg->nm_mem == NULL) + return ENOMEM; + arg->na_flags |= NAF_MEM_OWNER; + error = _netmap_attach(arg, sizeof(struct netmap_pt_guest_adapter)); + if (error) + return error; + + /* get the netmap_pt_guest_adapter */ + ptna = (struct netmap_pt_guest_adapter *) NA(ifp); + ptna->csb = csb; + + /* Initialize a separate pass-through netmap adapter that is going to + * be used by the ptnet driver only, and so never exposed to netmap + * applications. We only need a subset of the available fields. */ + memset(&ptna->dr, 0, sizeof(ptna->dr)); + ptna->dr.up.ifp = ifp; + ptna->dr.up.nm_mem = ptna->hwup.up.nm_mem; + netmap_mem_get(ptna->dr.up.nm_mem); + ptna->dr.up.nm_config = ptna->hwup.up.nm_config; + + ptna->backend_regifs = 0; + + return 0; +} +#endif /* WITH_PTNETMAP_GUEST */ + + void NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) { @@ -2841,28 +2965,29 @@ void netmap_detach(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); - int skip; if (!na) return; - skip = 0; NMG_LOCK(); - netmap_disable_all_rings(ifp); - na->ifp = NULL; - na->na_flags &= ~NAF_NETMAP_ON; + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; /* * if the netmap adapter is not native, somebody * changed it, so we can not release it here. - * The NULL na->ifp will notify the new owner that + * The NAF_ZOMBIE flag will notify the new owner that * the driver is gone. 
*/ if (na->na_flags & NAF_NATIVE) { - skip = netmap_adapter_put(na); + netmap_adapter_put(na); } - /* give them a chance to notice */ - if (skip == 0) - netmap_enable_all_rings(ifp); + /* give active users a chance to notice that NAF_ZOMBIE has been + * turned on, so that they can stop and return an error to userspace. + * Note that this becomes a NOP if there are no active users and, + * therefore, the put() above has deleted the na, since now NA(ifp) is + * NULL. + */ + netmap_enable_all_rings(ifp); NMG_UNLOCK(); } @@ -2883,9 +3008,10 @@ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring; + struct netmap_kring *kring, *tx_kring; u_int len = MBUF_LEN(m); u_int error = ENOBUFS; + unsigned int txr; struct mbq *q; int space; @@ -2900,6 +3026,16 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + txr = MBUF_TXQ(m); + if (txr >= na->num_tx_rings) { + txr %= na->num_tx_rings; + } + tx_kring = &NMR(na, NR_TX)[txr]; + + if (tx_kring->nr_mode == NKR_NETMAP_OFF) { + return MBUF_TRANSMIT(na, ifp, m); + } + q = &kring->rx_queue; // XXX reconsider long packets if we handle fragments @@ -2909,6 +3045,11 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + if (nm_os_mbuf_has_offld(m)) { + RD(1, "%s drop mbuf requiring offloadings", na->name); + goto done; + } + /* protect against rxsync_from_host(), netmap_sw_to_nic() * and maybe other instances of netmap_transmit (the latter * not possible on Linux). @@ -2951,6 +3092,8 @@ done: * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. * If native netmap mode is not set just return NULL. + * If native netmap mode is set, in particular, we have to set nr_mode to + * NKR_NETMAP_ON. 
*/ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, @@ -2975,13 +3118,26 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; + kring = na->tx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; @@ -3018,6 +3174,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ + kring->nr_mode = NKR_NETMAP_ON; kring->nm_notify(kring, 0); return kring->ring->slot; } @@ -3037,10 +3194,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * - for a nic connected to a switch, call the proper forwarding routine * (see netmap_bwrap_intr_notify) */ -void -netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) +int +netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; enum txrx t = (work_done ? NR_RX : NR_TX); @@ -3051,15 +3207,20 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) } if (q >= nma_get_nrings(na, t)) - return; // not a physical queue + return NM_IRQ_PASS; // not a physical queue kring = NMR(na, t) + q; + if (kring->nr_mode == NKR_NETMAP_OFF) { + return NM_IRQ_PASS; + } + if (t == NR_RX) { kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
*work_done = 1; /* do not fire napi again */ } - kring->nm_notify(kring, 0); + + return kring->nm_notify(kring, 0); } @@ -3067,17 +3228,17 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. * - * If the card is not in netmap mode, simply return 0, + * If the card is not in netmap mode, simply return NM_IRQ_PASS, * so that the caller proceeds with regular processing. - * Otherwise call netmap_common_irq() and return 1. + * Otherwise call netmap_common_irq(). * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one * if needed (multiqueue card _and_ there are multiqueue listeners), - * and return 1. + * and return NR_IRQ_COMPLETED. * * Finally, if called on rx from an interface connected to a switch, - * calls the proper forwarding routine, and return 1. + * calls the proper forwarding routine. */ int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) @@ -3091,15 +3252,14 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) * nm_native_on() here. 
*/ if (!nm_netmap_on(na)) - return 0; + return NM_IRQ_PASS; if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); - return 0; + return NM_IRQ_PASS; } - netmap_common_irq(ifp, q, work_done); - return 1; + return netmap_common_irq(na, q, work_done); } @@ -3120,9 +3280,11 @@ extern struct cdevsw netmap_cdevsw; void netmap_fini(void) { - netmap_uninit_bridges(); if (netmap_dev) destroy_dev(netmap_dev); + /* we assume that there are no longer netmap users */ + nm_os_ifnet_fini(); + netmap_uninit_bridges(); netmap_mem_fini(); NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); @@ -3155,9 +3317,13 @@ netmap_init(void) goto fail; #ifdef __FreeBSD__ - nm_vi_init_index(); + nm_os_vi_init_index(); #endif + error = nm_os_ifnet_init(); + if (error) + goto fail; + printf("netmap: loaded module\n"); return (0); fail: diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8490ae85670b..20ea5c8f2972 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -33,8 +33,9 @@ #include <sys/param.h> /* defines used in kernel.h */ #include <sys/poll.h> /* POLLIN, POLLOUT */ #include <sys/kernel.h> /* types used in module initialization */ -#include <sys/conf.h> /* DEV_MODULE */ +#include <sys/conf.h> /* DEV_MODULE_ORDERED */ #include <sys/endian.h> +#include <sys/syscallsubr.h> /* kern_ioctl() */ #include <sys/rwlock.h> @@ -50,6 +51,11 @@ #include <sys/malloc.h> #include <sys/socket.h> /* sockaddrs */ #include <sys/selinfo.h> +#include <sys/kthread.h> /* kthread_add() */ +#include <sys/proc.h> /* PROC_LOCK() */ +#include <sys/unistd.h> /* RFNOWAIT */ +#include <sys/sched.h> /* sched_bind() */ +#include <sys/smp.h> /* mp_maxid */ #include <net/if.h> #include <net/if_var.h> #include <net/if_types.h> /* IFT_ETHER */ @@ -61,13 +67,94 @@ #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <net/netmap_virt.h> #include <dev/netmap/netmap_mem2.h> /* ======================== FREEBSD-SPECIFIC ROUTINES 
================== */ +void nm_os_selinfo_init(NM_SELINFO_T *si) { + struct mtx *m = &si->m; + mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); + knlist_init_mtx(&si->si.si_note, m); +} + +void +nm_os_selinfo_uninit(NM_SELINFO_T *si) +{ + /* XXX kqueue(9) needed; these will mirror knlist_init. */ + knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); + knlist_destroy(&si->si.si_note); + /* now we don't need the mutex anymore */ + mtx_destroy(&si->m); +} + +void +nm_os_ifnet_lock(void) +{ + IFNET_WLOCK(); +} + +void +nm_os_ifnet_unlock(void) +{ + IFNET_WUNLOCK(); +} + +static int netmap_use_count = 0; + +void +nm_os_get_module(void) +{ + netmap_use_count++; +} + +void +nm_os_put_module(void) +{ + netmap_use_count--; +} + +static void +netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_undo_zombie(ifp); +} + +static void +netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_make_zombie(ifp); +} + +static eventhandler_tag nm_ifnet_ah_tag; +static eventhandler_tag nm_ifnet_dh_tag; + +int +nm_os_ifnet_init(void) +{ + nm_ifnet_ah_tag = + EVENTHANDLER_REGISTER(ifnet_arrival_event, + netmap_ifnet_arrival_handler, + NULL, EVENTHANDLER_PRI_ANY); + nm_ifnet_dh_tag = + EVENTHANDLER_REGISTER(ifnet_departure_event, + netmap_ifnet_departure_handler, + NULL, EVENTHANDLER_PRI_ANY); + return 0; +} + +void +nm_os_ifnet_fini(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + nm_ifnet_ah_tag); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + nm_ifnet_dh_tag); +} + rawsum_t -nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; @@ -87,7 +174,7 @@ nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) * return value is in network byte order. 
*/ uint16_t -nm_csum_fold(rawsum_t cur_sum) +nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) @@ -96,17 +183,17 @@ nm_csum_fold(rawsum_t cur_sum) return htobe16((~cur_sum) & 0xFFFF); } -uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else - return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); + return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void -nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET @@ -118,7 +205,7 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -129,12 +216,12 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, } void -nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -144,13 +231,41 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, #endif } +/* on FreeBSD we send up one packet at a time */ +void * +nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) +{ + + NA(ifp)->if_input(ifp, m); + return NULL; +} + +int +nm_os_mbuf_has_offld(struct mbuf *m) +{ + return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | + CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | + CSUM_SCTP_IPV6 | 
CSUM_TSO); +} + +static void +freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_generic_adapter *gna = + (struct netmap_generic_adapter *)NA(ifp); + int stolen = generic_rx_handler(ifp, m); + + if (!stolen) { + gna->save_if_input(ifp, m); + } +} /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int -netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) +nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; @@ -161,7 +276,7 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; - ifp->if_input = generic_rx_handler; + ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); @@ -181,18 +296,20 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. 
*/ -void -netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +int +nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); - if (enable) { + if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } + + return 0; } @@ -213,40 +330,44 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) * */ int -generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, - void *addr, u_int len, u_int ring_nr) +nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; + u_int len = a->len; + struct ifnet *ifp = a->ifp; + struct mbuf *m = a->m; +#if __FreeBSD_version < 1100000 /* - * The mbuf should be a cluster from our special pool, - * so we do not need to do an m_copyback but just copy - * (and eventually, just reference the netmap buffer) + * Old FreeBSD versions. The mbuf has a cluster attached, + * we need to copy from the cluster to the netmap buffer. */ - - if (GET_MBUF_REFCNT(m) != 1) { - D("invalid refcnt %d for %p", - GET_MBUF_REFCNT(m), m); + if (MBUF_REFCNT(m) != 1) { + D("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } - // XXX the ext_size check is unnecessary if we link the netmap buf if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } - if (0) { /* XXX seems to have negligible benefits */ - m->m_ext.ext_buf = m->m_data = addr; - } else { - bcopy(addr, m->m_data, len); - } + bcopy(a->addr, m->m_data, len); +#else /* __FreeBSD_version >= 1100000 */ + /* New FreeBSD versions. Link the external storage to + * the netmap buffer, so that no copy is necessary. */ + m->m_ext.ext_buf = m->m_data = a->addr; + m->m_ext.ext_size = len; +#endif /* __FreeBSD_version >= 1100000 */ + m->m_len = m->m_pkthdr.len = len; - // inc refcount. 
All ours, we could skip the atomic - atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1); + + /* mbuf refcnt is not contended, no need to use atomic + * (a memory barrier is enough). */ + SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); - m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); - return ret; + return ret ? -1 : 0; } @@ -263,7 +384,7 @@ netmap_getna(if_t ifp) * way to extract the info from the ifp */ int -generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; @@ -271,16 +392,23 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) void -generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } +void +nm_os_generic_set_features(struct netmap_generic_adapter *gna) +{ + + gna->rxsg = 1; /* Supported through m_copydata. */ + gna->txqdisc = 0; /* Not supported. 
*/ +} void -netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) +nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; @@ -290,21 +418,21 @@ netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapte void -netmap_mitigation_start(struct nm_generic_mit *mit) +nm_os_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void -netmap_mitigation_restart(struct nm_generic_mit *mit) +nm_os_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int -netmap_mitigation_active(struct nm_generic_mit *mit) +nm_os_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; @@ -312,7 +440,7 @@ netmap_mitigation_active(struct nm_generic_mit *mit) void -netmap_mitigation_cleanup(struct nm_generic_mit *mit) +nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -342,7 +470,7 @@ static struct { } nm_vi_indices; void -nm_vi_init_index(void) +nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) @@ -398,7 +526,7 @@ nm_vi_free_index(uint8_t val) * increment this refcount on if_attach(). 
*/ int -nm_vi_persist(const char *name, struct ifnet **ret) +nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; @@ -438,15 +566,220 @@ nm_vi_persist(const char *name, struct ifnet **ret) *ret = ifp; return 0; } + /* unregister from the system and drop the final refcount */ void -nm_vi_detach(struct ifnet *ifp) +nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } +/* ======================== PTNETMAP SUPPORT ========================== */ + +#ifdef WITH_PTNETMAP_GUEST +#include <sys/bus.h> +#include <sys/rman.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <machine/resource.h> +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> +/* + * ptnetmap memory device (memdev) for freebsd guest, + * ssed to expose host netmap memory to the guest through a PCI BAR. + */ + +/* + * ptnetmap memdev private data structure + */ +struct ptnetmap_memdev { + device_t dev; + struct resource *pci_io; + struct resource *pci_mem; + struct netmap_mem_d *nm_mem; +}; + +static int ptn_memdev_probe(device_t); +static int ptn_memdev_attach(device_t); +static int ptn_memdev_detach(device_t); +static int ptn_memdev_shutdown(device_t); + +static device_method_t ptn_memdev_methods[] = { + DEVMETHOD(device_probe, ptn_memdev_probe), + DEVMETHOD(device_attach, ptn_memdev_attach), + DEVMETHOD(device_detach, ptn_memdev_detach), + DEVMETHOD(device_shutdown, ptn_memdev_shutdown), + DEVMETHOD_END +}; + +static driver_t ptn_memdev_driver = { + PTNETMAP_MEMDEV_NAME, + ptn_memdev_methods, + sizeof(struct ptnetmap_memdev), +}; + +/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation + * below. */ +static devclass_t ptnetmap_devclass; +DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass, + NULL, NULL, SI_ORDER_MIDDLE + 1); + +/* + * I/O port read/write wrappers. 
+ * Some are not used, so we keep them commented out until needed + */ +#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg)) +#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg)) +#if 0 +#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg)) +#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val)) +#endif /* unused */ + +/* + * Map host netmap memory through PCI-BAR in the guest OS, + * returning physical (nm_paddr) and virtual (nm_addr) addresses + * of the netmap memory mapped in the guest. + */ +int +nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr) +{ + uint32_t mem_size; + int rid; + + D("ptn_memdev_driver iomap"); + + rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); + mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE); + + /* map memory allocator */ + ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, + &rid, 0, ~0, mem_size, RF_ACTIVE); + if (ptn_dev->pci_mem == NULL) { + *nm_paddr = 0; + *nm_addr = 0; + return ENOMEM; + } + + *nm_paddr = rman_get_start(ptn_dev->pci_mem); + *nm_addr = rman_get_virtual(ptn_dev->pci_mem); + + D("=== BAR %d start %lx len %lx mem_size %x ===", + PTNETMAP_MEM_PCI_BAR, + *nm_paddr, + rman_get_size(ptn_dev->pci_mem), + mem_size); + return (0); +} + +/* Unmap host netmap memory. 
*/ +void +nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) +{ + D("ptn_memdev_driver iounmap"); + + if (ptn_dev->pci_mem) { + bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } +} + +/* Device identification routine, return BUS_PROBE_DEFAULT on success, + * positive on failure */ +static int +ptn_memdev_probe(device_t dev) +{ + char desc[256]; + + if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) + return (ENXIO); + if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) + return (ENXIO); + + snprintf(desc, sizeof(desc), "%s PCI adapter", + PTNETMAP_MEMDEV_NAME); + device_set_desc_copy(dev, desc); + + return (BUS_PROBE_DEFAULT); +} + +/* Device initialization routine. */ +static int +ptn_memdev_attach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + int rid; + uint16_t mem_id; + + D("ptn_memdev_driver attach"); + + ptn_dev = device_get_softc(dev); + ptn_dev->dev = dev; + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); + ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, + RF_ACTIVE); + if (ptn_dev->pci_io == NULL) { + device_printf(dev, "cannot map I/O space\n"); + return (ENXIO); + } + + mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID); + + /* create guest allocator */ + ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); + if (ptn_dev->nm_mem == NULL) { + ptn_memdev_detach(dev); + return (ENOMEM); + } + netmap_mem_get(ptn_dev->nm_mem); + + D("ptn_memdev_driver probe OK - host_id: %d", mem_id); + + return (0); +} + +/* Device removal routine. 
*/ +static int +ptn_memdev_detach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + + D("ptn_memdev_driver detach"); + ptn_dev = device_get_softc(dev); + + if (ptn_dev->nm_mem) { + netmap_mem_put(ptn_dev->nm_mem); + ptn_dev->nm_mem = NULL; + } + if (ptn_dev->pci_mem) { + bus_release_resource(dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } + if (ptn_dev->pci_io) { + bus_release_resource(dev, SYS_RES_IOPORT, + PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); + ptn_dev->pci_io = NULL; + } + + return (0); +} + +static int +ptn_memdev_shutdown(device_t dev) +{ + D("ptn_memdev_driver shutdown"); + return bus_generic_shutdown(dev); +} + +#endif /* WITH_PTNETMAP_GUEST */ + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -606,7 +939,7 @@ err_unlock: * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor - * when the last fd pointing to the device is closed. + * when the last fd pointing to the device is closed. 
* * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to @@ -634,26 +967,275 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) (void)devtype; (void)td; - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - priv->np_refs = 1; + NMG_LOCK(); + priv = netmap_priv_new(); + if (priv == NULL) { + error = ENOMEM; + goto out; + } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { - free(priv, M_DEVBUF); - } else { - NMG_LOCK(); - netmap_use_count++; - NMG_UNLOCK(); + netmap_priv_delete(priv); } +out: + NMG_UNLOCK(); return error; } +/******************** kthread wrapper ****************/ +#include <sys/sysproto.h> +u_int +nm_os_ncpus(void) +{ + return mp_maxid + 1; +} + +struct nm_kthread_ctx { + struct thread *user_td; /* thread user-space (kthread creator) to send ioctl */ + /* notification to guest (interrupt) */ + int irq_fd; /* ioctl fd */ + struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */ + + /* notification from guest */ + void *ioevent_file; /* tsleep() argument */ + + /* worker function and parameter */ + nm_kthread_worker_fn_t worker_fn; + void *worker_private; + + struct nm_kthread *nmk; + + /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ + long type; +}; + +struct nm_kthread { + struct thread *worker; + struct mtx worker_lock; + uint64_t scheduled; /* pending wake_up request */ + struct nm_kthread_ctx worker_ctx; + int run; /* used to stop kthread */ + int attach_user; /* kthread attached to user_process */ + int affinity; +}; + +void inline +nm_os_kthread_wakeup_worker(struct nm_kthread *nmk) +{ + /* + * There may be a race between FE and BE, + * which call both this function, and worker kthread, + * that reads nmk->scheduled. + * + * For us it is not important the counter value, + * but simply that it has changed since the last + * time the kthread saw it. 
+ */ + mtx_lock(&nmk->worker_lock); + nmk->scheduled++; + if (nmk->worker_ctx.ioevent_file) { + wakeup(nmk->worker_ctx.ioevent_file); + } + mtx_unlock(&nmk->worker_lock); +} + +void inline +nm_os_kthread_send_irq(struct nm_kthread *nmk) +{ + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + int err; + + if (ctx->user_td && ctx->irq_fd > 0) { + err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix); + if (err) { + D("kern_ioctl error: %d ioctl parameters: fd %d com %lu data %p", + err, ctx->irq_fd, ctx->irq_ioctl.com, &ctx->irq_ioctl.data); + } + } +} + +static void +nm_kthread_worker(void *data) +{ + struct nm_kthread *nmk = data; + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + uint64_t old_scheduled = nmk->scheduled; + + if (nmk->affinity >= 0) { + thread_lock(curthread); + sched_bind(curthread, nmk->affinity); + thread_unlock(curthread); + } + + while (nmk->run) { + /* + * check if the parent process dies + * (when kthread is attached to user process) + */ + if (ctx->user_td) { + PROC_LOCK(curproc); + thread_suspend_check(0); + PROC_UNLOCK(curproc); + } else { + kthread_suspend_check(); + } + + /* + * if ioevent_file is not defined, we don't have notification + * mechanism and we continually execute worker_fn() + */ + if (!ctx->ioevent_file) { + ctx->worker_fn(ctx->worker_private); /* worker body */ + } else { + /* checks if there is a pending notification */ + mtx_lock(&nmk->worker_lock); + if (likely(nmk->scheduled != old_scheduled)) { + old_scheduled = nmk->scheduled; + mtx_unlock(&nmk->worker_lock); + + ctx->worker_fn(ctx->worker_private); /* worker body */ + + continue; + } else if (nmk->run) { + /* wait on event with one second timeout */ + msleep_spin(ctx->ioevent_file, &nmk->worker_lock, + "nmk_ev", hz); + nmk->scheduled++; + } + mtx_unlock(&nmk->worker_lock); + } + } + + kthread_exit(); +} + +static int +nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg) +{ + /* send irq through ioctl 
to bhyve (vmm.ko) */ + if (cfg->event.irqfd) { + nmk->worker_ctx.irq_fd = cfg->event.irqfd; + nmk->worker_ctx.irq_ioctl = cfg->event.ioctl; + } + /* ring.ioeventfd contains the chan where do tsleep to wait events */ + if (cfg->event.ioeventfd) { + nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd; + } + + return 0; +} + +static void +nm_kthread_close_files(struct nm_kthread *nmk) +{ + nmk->worker_ctx.irq_fd = 0; + nmk->worker_ctx.ioevent_file = NULL; +} + +void +nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity) +{ + nmk->affinity = affinity; +} + +struct nm_kthread * +nm_os_kthread_create(struct nm_kthread_cfg *cfg) +{ + struct nm_kthread *nmk = NULL; + int error; + + nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!nmk) + return NULL; + + mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN); + nmk->worker_ctx.worker_fn = cfg->worker_fn; + nmk->worker_ctx.worker_private = cfg->worker_private; + nmk->worker_ctx.type = cfg->type; + nmk->affinity = -1; + + /* attach kthread to user process (ptnetmap) */ + nmk->attach_user = cfg->attach_user; + + /* open event fd */ + error = nm_kthread_open_files(nmk, cfg); + if (error) + goto err; + + return nmk; +err: + free(nmk, M_DEVBUF); + return NULL; +} + +int +nm_os_kthread_start(struct nm_kthread *nmk) +{ + struct proc *p = NULL; + int error = 0; + + if (nmk->worker) { + return EBUSY; + } + + /* check if we want to attach kthread to user process */ + if (nmk->attach_user) { + nmk->worker_ctx.user_td = curthread; + p = curthread->td_proc; + } + + /* enable kthread main loop */ + nmk->run = 1; + /* create kthread */ + if((error = kthread_add(nm_kthread_worker, nmk, p, + &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", + nmk->worker_ctx.type))) { + goto err; + } + + D("nm_kthread started td 0x%p", nmk->worker); + + return 0; +err: + D("nm_kthread start failed err %d", error); + nmk->worker = NULL; + return error; +} + +void +nm_os_kthread_stop(struct nm_kthread 
*nmk) +{ + if (!nmk->worker) { + return; + } + /* tell to kthread to exit from main loop */ + nmk->run = 0; + + /* wake up kthread if it sleeps */ + kthread_resume(nmk->worker); + nm_os_kthread_wakeup_worker(nmk); + + nmk->worker = NULL; +} + +void +nm_os_kthread_delete(struct nm_kthread *nmk) +{ + if (!nmk) + return; + if (nmk->worker) { + nm_os_kthread_stop(nmk); + } + + nm_kthread_close_files(nmk); + + free(nmk, M_DEVBUF); +} + /******************** kqueue support ****************/ /* - * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might @@ -672,17 +1254,23 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) void -freebsd_selwakeup(struct nm_selinfo *si, int pri) +nm_os_selwakeup(struct nm_selinfo *si) { if (netmap_verbose) D("on knote %p", &si->si.si_note); - selwakeuppri(&si->si, pri); + selwakeuppri(&si->si, PI_NET); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } +void +nm_os_selrecord(struct thread *td, struct nm_selinfo *si) +{ + selrecord(td, &si->si); +} + static void netmap_knrdetach(struct knote *kn) { @@ -728,7 +1316,7 @@ netmap_knrw(struct knote *kn, long hint, int events) RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { - revents = netmap_poll((void *)priv, events, curthread); + revents = netmap_poll(priv, events, NULL); return (events & revents) ? 
1 : 0; } } @@ -801,13 +1389,47 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn) return 0; } +static int +freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) +{ + struct netmap_priv_d *priv; + if (devfs_get_cdevpriv((void **)&priv)) { + return POLLERR; + } + return netmap_poll(priv, events, td); +} + +static int +freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, + int ffla __unused, struct thread *td) +{ + int error; + struct netmap_priv_d *priv; + + CURVNET_SET(TD_TO_VNET(rd)); + error = devfs_get_cdevpriv((void **)&priv); + if (error) { + /* XXX ENOENT should be impossible, since the priv + * is now created in the open */ + if (error == ENOENT) + error = ENXIO; + goto out; + } + error = netmap_ioctl(priv, cmd, data, td); +out: + CURVNET_RESTORE(); + + return error; +} + +extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, + .d_ioctl = freebsd_netmap_ioctl, + .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; @@ -852,6 +1474,24 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg) return (error); } - +#ifdef DEV_MODULE_ORDERED +/* + * The netmap module contains three drivers: (i) the netmap character device + * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI + * device driver. The attach() routines of both (ii) and (iii) need the + * lock of the global allocator, and such lock is initialized in netmap_init(), + * which is part of (i). + * Therefore, we make sure that (i) is loaded before (ii) and (iii), using + * the 'order' parameter of driver declaration macros. For (i), we specify + * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED + * macros for (ii) and (iii). 
+ */ +DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); +#else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); +#endif /* DEV_MODULE_ORDERED */ +MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +/* use a private mutex for the knlist */ diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 85a6a9f76ea2..5cef4a29110a 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -1,5 +1,7 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Vincenzo Maffione + * Copyright (C) 2013-2016 Luigi Rizzo + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -83,25 +85,25 @@ __FBSDID("$FreeBSD$"); #define rtnl_lock() ND("rtnl_lock called") #define rtnl_unlock() ND("rtnl_unlock called") -#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() /* * FreeBSD mbuf allocator/deallocator in emulation mode: - * + */ +#if __FreeBSD_version < 1100000 + +/* + * For older versions of FreeBSD: + * * We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE * so that the destructor, if invoked, will not free the packet. - * In principle we should set the destructor only on demand, + * In principle we should set the destructor only on demand, * but since there might be a race we better do it on allocation. * As a consequence, we also need to set the destructor or we * would leak buffers. 
*/ -/* - * mbuf wrappers - */ - /* mbuf destructor, also need to change the type to EXT_EXTREF, * add an M_NOFREE flag, and then clear the flag and * chain into uma_zfree(zone_pack, mf) @@ -112,35 +114,93 @@ __FBSDID("$FreeBSD$"); (m)->m_ext.ext_type = EXT_EXTREF; \ } while (0) -static void -netmap_default_mbuf_destructor(struct mbuf *m) +static int +void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { /* restore original mbuf */ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_type = EXT_PACKET; m->m_ext.ext_free = NULL; - if (GET_MBUF_REFCNT(m) == 0) + if (MBUF_REFCNT(m) == 0) SET_MBUF_REFCNT(m, 1); uma_zfree(zone_pack, m); + + return 0; } static inline struct mbuf * -netmap_get_mbuf(int len) +nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; + + (void)ifp; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) { - m->m_flags |= M_NOFREE; /* XXXNP: Almost certainly incorrect. */ + /* m_getcl() (mb_ctor_mbuf) has an assert that checks that + * M_NOFREE flag is not specified as third argument, + * so we have to set M_NOFREE after m_getcl(). */ + m->m_flags |= M_NOFREE; m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save - m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor; + m->m_ext.ext_free = (void *)void_mbuf_dtor; m->m_ext.ext_type = EXT_EXTREF; - ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m)); + ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m)); } return m; } +#else /* __FreeBSD_version >= 1100000 */ + +/* + * Newer versions of FreeBSD, using a straightforward scheme. + * + * We allocate mbufs with m_gethdr(), since the mbuf header is needed + * by the driver. We also attach a customly-provided external storage, + * which in this case is a netmap buffer. When calling m_extadd(), however + * we pass a NULL address, since the real address (and length) will be + * filled in by nm_os_generic_xmit_frame() right before calling + * if_transmit(). 
+ * + * The dtor function does nothing, however we need it since mb_free_ext() + * has a KASSERT(), checking that the mbuf dtor function is not NULL. + */ + +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ +} while (0) + +static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { } + +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + struct mbuf *m; + + (void)ifp; + (void)len; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + return m; + } + + m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, + NULL, NULL, 0, EXT_NET_DRV); + + return m; +} + +#endif /* __FreeBSD_version >= 1100000 */ +#elif defined _WIN32 + +#include "win_glue.h" + +#define rtnl_lock() ND("rtnl_lock called") +#define rtnl_unlock() ND("rtnl_unlock called") +#define MBUF_TXQ(m) 0//((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) 0//((m)->m_pkthdr.flowid) +#define smp_mb() //XXX: to be correctly defined #else /* linux */ @@ -150,7 +210,12 @@ netmap_get_mbuf(int len) #include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */ #include <linux/hrtimer.h> -//#define REG_RESET +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + return alloc_skb(ifp->needed_headroom + len + + ifp->needed_tailroom, GFP_ATOMIC); +} #endif /* linux */ @@ -161,8 +226,21 @@ netmap_get_mbuf(int len) #include <dev/netmap/netmap_mem2.h> +#define for_each_kring_n(_i, _k, _karr, _n) \ + for (_k=_karr, _i = 0; _i < _n; (_k)++, (_i)++) + +#define for_each_tx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings) +#define for_each_tx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1) + +#define for_each_rx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings) +#define for_each_rx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1) -/* ======================== usage stats 
=========================== */ + +/* ======================== PERFORMANCE STATISTICS =========================== */ #ifdef RATE_GENERIC #define IFRATE(x) x @@ -170,6 +248,8 @@ struct rate_stats { unsigned long txpkt; unsigned long txsync; unsigned long txirq; + unsigned long txrepl; + unsigned long txdrop; unsigned long rxpkt; unsigned long rxirq; unsigned long rxsync; @@ -194,6 +274,8 @@ static void rate_callback(unsigned long arg) RATE_PRINTK(txpkt); RATE_PRINTK(txsync); RATE_PRINTK(txirq); + RATE_PRINTK(txrepl); + RATE_PRINTK(txdrop); RATE_PRINTK(rxpkt); RATE_PRINTK(rxsync); RATE_PRINTK(rxirq); @@ -230,94 +312,222 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi) * the poller threads. Differently from netmap_rx_irq(), we check * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq. */ -static void -netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +void +netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); if (unlikely(!nm_netmap_on(na))) return; - netmap_common_irq(ifp, q, work_done); + netmap_common_irq(na, q, work_done); +#ifdef RATE_GENERIC + if (work_done) + rate_ctx.new.rxirq++; + else + rate_ctx.new.txirq++; +#endif /* RATE_GENERIC */ } +static int +generic_netmap_unregister(struct netmap_adapter *na) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring = NULL; + int i, r; + + if (na->active_fds == 0) { + D("Generic adapter %p goes off", na); + rtnl_lock(); + + na->na_flags &= ~NAF_NETMAP_ON; + + /* Release packet steering control. */ + nm_os_catch_tx(gna, 0); + + /* Stop intercepting packets on the RX path. 
*/ + nm_os_catch_rx(gna, 0); + + rtnl_unlock(); + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + D("RX ring %d of generic adapter %p goes off", r, na); + kring->nr_mode = NKR_NETMAP_OFF; + } + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + D("TX ring %d of generic adapter %p goes off", r, na); + } + } + + for_each_rx_kring(r, kring, na) { + /* Free the mbufs still pending in the RX queues, + * that did not end up into the corresponding netmap + * RX rings. */ + mbq_safe_purge(&kring->rx_queue); + nm_os_mitigation_cleanup(&gna->mit[r]); + } + + /* Decrement reference counter for the mbufs in the + * TX pools. These mbufs can be still pending in drivers, + * (e.g. this happens with virtio-net driver, which + * does lazy reclaiming of transmitted mbufs). */ + for_each_tx_kring(r, kring, na) { + /* We must remove the destructor on the TX event, + * because the destructor invokes netmap code, and + * the netmap module may disappear before the + * TX event is consumed. */ + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + SET_MBUF_DESTRUCTOR(kring->tx_event, NULL); + } + kring->tx_event = NULL; + mtx_unlock_spin(&kring->tx_event_lock); + } + + if (na->active_fds == 0) { + free(gna->mit, M_DEVBUF); + + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); + } + + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { + continue; + } + + for (i=0; i<na->num_tx_desc; i++) { + if (kring->tx_pool[i]) { + m_freem(kring->tx_pool[i]); + } + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; + } + +#ifdef RATE_GENERIC + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + + return 0; +} /* Enable/disable netmap mode for a generic network interface. 
*/ static int generic_netmap_register(struct netmap_adapter *na, int enable) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - struct mbuf *m; + struct netmap_kring *kring = NULL; int error; int i, r; - if (!na) + if (!na) { return EINVAL; + } -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_stop(ifp); - if (error) { - return error; + if (!enable) { + /* This is actually an unregif. */ + return generic_netmap_unregister(na); } -#endif /* REG_RESET */ - if (enable) { /* Enable netmap mode. */ - /* Init the mitigation support on all the rx queues. */ + if (na->active_fds == 0) { + D("Generic adapter %p goes on", na); + /* Do all memory allocations when (na->active_fds == 0), to + * simplify error management. */ + + /* Allocate memory for mitigation support on all the rx queues. */ gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), - M_DEVBUF, M_NOWAIT | M_ZERO); + M_DEVBUF, M_NOWAIT | M_ZERO); if (!gna->mit) { D("mitigation allocation failed"); error = ENOMEM; goto out; } - for (r=0; r<na->num_rx_rings; r++) - netmap_mitigation_init(&gna->mit[r], r, na); - /* Initialize the rx queue, as generic_rx_handler() can - * be called as soon as netmap_catch_rx() returns. - */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_init(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + /* Init mitigation support. */ + nm_os_mitigation_init(&gna->mit[r], r, na); + + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as nm_os_catch_rx() returns. + */ + mbq_safe_init(&kring->rx_queue); } /* - * Preallocate packet buffers for the tx rings. + * Prepare mbuf pools (parallel to the tx rings), for packet + * transmission. Don't preallocate the mbufs here, it's simpler + * to leave this task to txsync. 
*/ - for (r=0; r<na->num_tx_rings; r++) - na->tx_rings[r].tx_pool = NULL; - for (r=0; r<na->num_tx_rings; r++) { - na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (!na->tx_rings[r].tx_pool) { + for_each_tx_kring(r, kring, na) { + kring->tx_pool = NULL; + } + for_each_tx_kring(r, kring, na) { + kring->tx_pool = + malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!kring->tx_pool) { D("tx_pool allocation failed"); error = ENOMEM; goto free_tx_pools; } - for (i=0; i<na->num_tx_desc; i++) - na->tx_rings[r].tx_pool[i] = NULL; - for (i=0; i<na->num_tx_desc; i++) { - m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (!m) { - D("tx_pool[%d] allocation failed", i); - error = ENOMEM; - goto free_tx_pools; - } - na->tx_rings[r].tx_pool[i] = m; - } + mtx_init(&kring->tx_event_lock, "tx_event_lock", + NULL, MTX_SPIN); } + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("RX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; + } + + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("TX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; + } + } + + for_each_tx_kring(r, kring, na) { + /* Initialize tx_pool and tx_event. */ + for (i=0; i<na->num_tx_desc; i++) { + kring->tx_pool[i] = NULL; + } + + kring->tx_event = NULL; + } + + if (na->active_fds == 0) { rtnl_lock(); + /* Prepare to intercept incoming traffic. */ - error = netmap_catch_rx(gna, 1); + error = nm_os_catch_rx(gna, 1); if (error) { - D("netdev_rx_handler_register() failed (%d)", error); + D("nm_os_catch_rx(1) failed (%d)", error); goto register_handler; } - na->na_flags |= NAF_NETMAP_ON; /* Make netmap control the packet steering. 
*/ - netmap_catch_tx(gna, 1); + error = nm_os_catch_tx(gna, 1); + if (error) { + D("nm_os_catch_tx(1) failed (%d)", error); + goto catch_rx; + } rtnl_unlock(); + na->na_flags |= NAF_NETMAP_ON; + #ifdef RATE_GENERIC if (rate_ctx.refcount == 0) { D("setup_timer()"); @@ -329,73 +539,26 @@ generic_netmap_register(struct netmap_adapter *na, int enable) } rate_ctx.refcount++; #endif /* RATE */ - - } else if (na->tx_rings[0].tx_pool) { - /* Disable netmap mode. We enter here only if the previous - generic_netmap_register(na, 1) was successful. - If it was not, na->tx_rings[0].tx_pool was set to NULL by the - error handling code below. */ - rtnl_lock(); - - na->na_flags &= ~NAF_NETMAP_ON; - - /* Release packet steering control. */ - netmap_catch_tx(gna, 0); - - /* Do not intercept packets on the rx path. */ - netmap_catch_rx(gna, 0); - - rtnl_unlock(); - - /* Free the mbufs going to the netmap rings */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_purge(&na->rx_rings[r].rx_queue); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); - } - - for (r=0; r<na->num_rx_rings; r++) - netmap_mitigation_cleanup(&gna->mit[r]); - free(gna->mit, M_DEVBUF); - - for (r=0; r<na->num_tx_rings; r++) { - for (i=0; i<na->num_tx_desc; i++) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - } - -#ifdef RATE_GENERIC - if (--rate_ctx.refcount == 0) { - D("del_timer()"); - del_timer(&rate_ctx.timer); - } -#endif - } - -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_open(ifp); - if (error) { - goto free_tx_pools; } -#endif return 0; + /* Here (na->active_fds == 0) holds. 
*/ +catch_rx: + nm_os_catch_rx(gna, 0); register_handler: rtnl_unlock(); free_tx_pools: - for (r=0; r<na->num_tx_rings; r++) { - if (na->tx_rings[r].tx_pool == NULL) + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { continue; - for (i=0; i<na->num_tx_desc; i++) - if (na->tx_rings[r].tx_pool[i]) - m_freem(na->tx_rings[r].tx_pool[i]); - free(na->tx_rings[r].tx_pool, M_DEVBUF); - na->tx_rings[r].tx_pool = NULL; + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; } - for (r=0; r<na->num_rx_rings; r++) { - netmap_mitigation_cleanup(&gna->mit[r]); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); } free(gna->mit, M_DEVBUF); out: @@ -411,13 +574,58 @@ out: static void generic_mbuf_destructor(struct mbuf *m) { - netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); + struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m)); + struct netmap_kring *kring; + unsigned int r = MBUF_TXQ(m); + unsigned int r_orig = r; + + if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) { + D("Error: no netmap adapter on device %p", + GEN_TX_MBUF_IFP(m)); + return; + } + + /* + * First, clear the event mbuf. + * In principle, the event 'm' should match the one stored + * on ring 'r'. However we check it explicitely to stay + * safe against lower layers (qdisc, driver, etc.) changing + * MBUF_TXQ(m) under our feet. If the match is not found + * on 'r', we try to see if it belongs to some other ring. 
+ */ + for (;;) { + bool match = false; + + kring = &na->tx_rings[r]; + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event == m) { + kring->tx_event = NULL; + match = true; + } + mtx_unlock_spin(&kring->tx_event_lock); + + if (match) { + if (r != r_orig) { + RD(1, "event %p migrated: ring %u --> %u", + m, r_orig, r); + } + break; + } + + if (++r == na->num_tx_rings) r = 0; + + if (r == r_orig) { + RD(1, "Cannot match event %p", m); + return; + } + } + + /* Second, wake up clients. They will reclaim the event through + * txsync. */ + netmap_generic_irq(na, r, NULL); #ifdef __FreeBSD__ - if (netmap_verbose) - RD(5, "Tx irq (%p) queue %d index %d" , m, MBUF_TXQ(m), (int)(uintptr_t)m->m_ext.ext_arg1); - netmap_default_mbuf_destructor(m); -#endif /* __FreeBSD__ */ - IFRATE(rate_ctx.new.txirq++); + void_mbuf_dtor(m, NULL, NULL); +#endif } extern int netmap_adaptive_io; @@ -428,7 +636,7 @@ extern int netmap_adaptive_io; * nr_hwcur is the first unsent buffer. */ static u_int -generic_netmap_tx_clean(struct netmap_kring *kring) +generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc) { u_int const lim = kring->nkr_num_slots - 1; u_int nm_i = nm_next(kring->nr_hwtail, lim); @@ -436,20 +644,50 @@ generic_netmap_tx_clean(struct netmap_kring *kring) u_int n = 0; struct mbuf **tx_pool = kring->tx_pool; + ND("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail); + while (nm_i != hwcur) { /* buffers not completed */ struct mbuf *m = tx_pool[nm_i]; - if (unlikely(m == NULL)) { - /* this is done, try to replenish the entry */ - tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na)); + if (txqdisc) { + if (m == NULL) { + /* Nothing to do, this is going + * to be replenished. */ + RD(3, "Is this happening?"); + + } else if (MBUF_QUEUED(m)) { + break; /* Not dequeued yet. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf has been dequeued but is still busy + * (refcount is 2). + * Leave it to the driver and replenish. 
*/ + m_freem(m); + tx_pool[nm_i] = NULL; + } + + } else { if (unlikely(m == NULL)) { - D("mbuf allocation failed, XXX error"); - // XXX how do we proceed ? break ? - return -ENOMEM; + int event_consumed; + + /* This slot was used to place an event. */ + mtx_lock_spin(&kring->tx_event_lock); + event_consumed = (kring->tx_event == NULL); + mtx_unlock_spin(&kring->tx_event_lock); + if (!event_consumed) { + /* The event has not been consumed yet, + * still busy in the driver. */ + break; + } + /* The event has been consumed, we can go + * ahead. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf is still busy: its refcnt is 2. */ + break; } - } else if (GET_MBUF_REFCNT(m) != 1) { - break; /* This mbuf is still busy: its refcnt is 2. */ } + n++; nm_i = nm_next(nm_i, lim); #if 0 /* rate adaptation */ @@ -476,23 +714,17 @@ generic_netmap_tx_clean(struct netmap_kring *kring) return n; } - -/* - * We have pending packets in the driver between nr_hwtail +1 and hwcur. - * Compute a position in the middle, to be used to generate - * a notification. - */ +/* Compute a slot index in the middle between inf and sup. */ static inline u_int -generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) +ring_middle(u_int inf, u_int sup, u_int lim) { - u_int n = kring->nkr_num_slots; - u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int n = lim + 1; u_int e; - if (hwcur >= ntc) { - e = (hwcur + ntc) / 2; + if (sup >= inf) { + e = (sup + inf) / 2; } else { /* wrap around */ - e = (hwcur + n + ntc) / 2; + e = (sup + n + inf) / 2; if (e >= n) { e -= n; } @@ -506,35 +738,59 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) return e; } -/* - * We have pending packets in the driver between nr_hwtail+1 and hwcur. - * Schedule a notification approximately in the middle of the two. - * There is a race but this is only called within txsync which does - * a double check. 
- */ static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { + u_int lim = kring->nkr_num_slots - 1; struct mbuf *m; u_int e; + u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */ - if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + if (ntc == hwcur) { return; /* all buffers are free */ } - e = generic_tx_event_middle(kring, hwcur); + + /* + * We have pending packets in the driver between hwtail+1 + * and hwcur, and we have to chose one of these slot to + * generate a notification. + * There is a race but this is only called within txsync which + * does a double check. + */ +#if 0 + /* Choose a slot in the middle, so that we don't risk ending + * up in a situation where the client continuously wake up, + * fills one or a few TX slots and go to sleep again. */ + e = ring_middle(ntc, hwcur, lim); +#else + /* Choose the first pending slot, to be safe against driver + * reordering mbuf transmissions. */ + e = ntc; +#endif m = kring->tx_pool[e]; - ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? GET_MBUF_REFCNT(m) : -2 ); if (m == NULL) { - /* This can happen if there is already an event on the netmap - slot 'e': There is nothing to do. */ + /* An event is already in place. */ return; } - kring->tx_pool[e] = NULL; + + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + /* An event is already in place. */ + mtx_unlock_spin(&kring->tx_event_lock); + return; + } + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + kring->tx_event = m; + mtx_unlock_spin(&kring->tx_event_lock); + + kring->tx_pool[e] = NULL; + + ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 ); - // XXX wmb() ? - /* Decrement the refcount an free it if we have the last one. */ + /* Decrement the refcount. This will free it if we lose the race + * with the driver. 
*/ m_freem(m); smp_mb(); } @@ -551,6 +807,7 @@ static int generic_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ // j @@ -560,8 +817,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) IFRATE(rate_ctx.new.txsync++); - // TODO: handle the case of mbuf allocation failure - rmb(); /* @@ -569,72 +824,121 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ + struct nm_os_gen_arg a; + u_int event = -1; + + if (gna->txqdisc && nm_kr_txempty(kring)) { + /* In txqdisc mode, we ask for a delayed notification, + * but only when cur == hwtail, which means that the + * client is going to block. */ + event = ring_middle(nm_i, head, lim); + ND(3, "Place txqdisc event (hwcur=%u,event=%u," + "head=%u,hwtail=%u)", nm_i, event, head, + kring->nr_hwtail); + } + + a.ifp = ifp; + a.ring_nr = ring_nr; + a.head = a.tail = NULL; + while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; void *addr = NMB(na, slot); - /* device-specific */ struct mbuf *m; int tx_ret; NM_CHECK_ADDR_LEN(na, addr, len); - /* Tale a mbuf from the tx pool and copy in the user packet. */ + /* Tale a mbuf from the tx pool (replenishing the pool + * entry if necessary) and copy in the user packet. 
*/ m = kring->tx_pool[nm_i]; - if (unlikely(!m)) { - RD(5, "This should never happen"); - kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (unlikely(m == NULL)) { - D("mbuf allocation failed"); + if (unlikely(m == NULL)) { + kring->tx_pool[nm_i] = m = + nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na)); + if (m == NULL) { + RD(2, "Failed to replenish mbuf"); + /* Here we could schedule a timer which + * retries to replenish after a while, + * and notifies the client when it + * manages to replenish some slots. In + * any case we break early to avoid + * crashes. */ break; } + IFRATE(rate_ctx.new.txrepl++); } - /* XXX we should ask notifications when NS_REPORT is set, - * or roughly every half frame. We can optimize this - * by lazily requesting notifications only when a - * transmission fails. Probably the best way is to - * break on failures and set notifications when - * ring->cur == ring->tail || nm_i != cur + + a.m = m; + a.addr = addr; + a.len = len; + a.qevent = (nm_i == event); + /* When not in txqdisc mode, we should ask + * notifications when NS_REPORT is set, or roughly + * every half ring. To optimize this, we set a + * notification event when the client runs out of + * TX ring space, or when transmission fails. In + * the latter case we also break early. */ - tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + tx_ret = nm_os_generic_xmit_frame(&a); if (unlikely(tx_ret)) { - ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", - tx_ret, nm_i, head, kring->nr_hwtail); - /* - * No room for this mbuf in the device driver. - * Request a notification FOR A PREVIOUS MBUF, - * then call generic_netmap_tx_clean(kring) to do the - * double check and see if we can free more buffers. - * If there is space continue, else break; - * NOTE: the double check is necessary if the problem - * occurs in the txsync call after selrecord(). 
- * Also, we need some way to tell the caller that not - * all buffers were queued onto the device (this was - * not a problem with native netmap driver where space - * is preallocated). The bridge has a similar problem - * and we solve it there by dropping the excess packets. - */ - generic_set_tx_event(kring, nm_i); - if (generic_netmap_tx_clean(kring)) { /* space now available */ - continue; - } else { - break; + if (!gna->txqdisc) { + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring, gna->txqdisc)) { + /* space now available */ + continue; + } else { + break; + } } + + /* In txqdisc mode, the netmap-aware qdisc + * queue has the same length as the number of + * netmap slots (N). Since tail is advanced + * only when packets are dequeued, qdisc + * queue overrun cannot happen, so + * nm_os_generic_xmit_frame() did not fail + * because of that. + * However, packets can be dropped because + * carrier is off, or because our qdisc is + * being deactivated, or possibly for other + * reasons. In these cases, we just let the + * packet to be dropped. */ + IFRATE(rate_ctx.new.txdrop++); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); - IFRATE(rate_ctx.new.txpkt ++); + IFRATE(rate_ctx.new.txpkt++); } - - /* Update hwcur to the next slot to transmit. 
*/ - kring->nr_hwcur = nm_i; /* not head, we could break early */ + if (a.head != NULL) { + a.addr = NULL; + nm_os_generic_xmit_frame(&a); + } + /* Update hwcur to the next slot to transmit. Here nm_i + * is not necessarily head, we could break early. */ + kring->nr_hwcur = nm_i; } /* * Second, reclaim completed buffers */ - if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) { /* No more available slots? Set a notification event * on a netmap slot that will be cleaned in the future. * No doublecheck is performed, since txsync() will be @@ -642,58 +946,74 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ generic_set_tx_event(kring, nm_i); } - ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); - generic_netmap_tx_clean(kring); + generic_netmap_tx_clean(kring, gna->txqdisc); return 0; } /* - * This handler is registered (through netmap_catch_rx()) + * This handler is registered (through nm_os_catch_rx()) * within the attached network interface * in the RX subsystem, so that every mbuf passed up by * the driver can be stolen to the network stack. * Stolen packets are put in a queue where the * generic_netmap_rxsync() callback can extract them. + * Returns 1 if the packet was stolen, 0 otherwise. */ -void +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring; u_int work_done; - u_int rr = MBUF_RXQ(m); // receive ring number + u_int r = MBUF_RXQ(m); /* receive ring number */ - if (rr >= na->num_rx_rings) { - rr = rr % na->num_rx_rings; // XXX expensive... + if (r >= na->num_rx_rings) { + r = r % na->num_rx_rings; + } + + kring = &na->rx_rings[r]; + + if (kring->nr_mode == NKR_NETMAP_OFF) { + /* We must not intercept this mbuf. 
*/ + return 0; } /* limit the size of the queue */ - if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) { + /* This may happen when GRO/LRO features are enabled for + * the NIC driver when the generic adapter does not + * support RX scatter-gather. */ + RD(2, "Warning: driver pushed up big packet " + "(size=%d)", (int)MBUF_LEN(m)); + m_freem(m); + } else if (unlikely(mbq_len(&kring->rx_queue) > 1024)) { m_freem(m); } else { - mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + mbq_safe_enqueue(&kring->rx_queue, m); } if (netmap_generic_mit < 32768) { /* no rx mitigation, pass notification up */ - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); + netmap_generic_irq(na, r, &work_done); } else { /* same as send combining, filter notification if there is a * pending timer, otherwise pass it up and start a timer. */ - if (likely(netmap_mitigation_active(&gna->mit[rr]))) { + if (likely(nm_os_mitigation_active(&gna->mit[r]))) { /* Record that there is some pending work. */ - gna->mit[rr].mit_pending = 1; + gna->mit[r].mit_pending = 1; } else { - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(&gna->mit[rr]); + netmap_generic_irq(na, r, &work_done); + nm_os_mitigation_start(&gna->mit[r]); } } + + /* We have intercepted the mbuf. */ + return 1; } /* @@ -713,54 +1033,23 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + /* Adapter-specific variables. */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int nm_buf_len = NETMAP_BUF_SIZE(na); + struct mbq tmpq; + struct mbuf *m; + int avail; /* in bytes */ + int mlen; + int copy; + if (head > lim) return netmap_ring_reinit(kring); - /* - * First part: import newly received packets. 
- */ - if (netmap_no_pendintr || force_update) { - /* extract buffers from the rx queue, stop at most one - * slot before nr_hwcur (stop_i) - */ - uint16_t slot_flags = kring->nkr_slot_flags; - u_int stop_i = nm_prev(kring->nr_hwcur, lim); - - nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ - for (n = 0; nm_i != stop_i; n++) { - int len; - void *addr = NMB(na, &ring->slot[nm_i]); - struct mbuf *m; - - /* we only check the address here on generic rx rings */ - if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ - return netmap_ring_reinit(kring); - } - /* - * Call the locked version of the function. - * XXX Ideally we could grab a batch of mbufs at once - * and save some locking overhead. - */ - m = mbq_safe_dequeue(&kring->rx_queue); - if (!m) /* no more data */ - break; - len = MBUF_LEN(m); - m_copydata(m, 0, len, addr); - ring->slot[nm_i].len = len; - ring->slot[nm_i].flags = slot_flags; - m_freem(m); - nm_i = nm_next(nm_i, lim); - } - if (n) { - kring->nr_hwtail = nm_i; - IFRATE(rate_ctx.new.rxpkt += n); - } - kring->nr_kflags &= ~NKR_PENDINTR; - } + IFRATE(rate_ctx.new.rxsync++); - // XXX should we invert the order ? /* - * Second part: skip past packets that userspace has released. + * First part: skip past packets that userspace has released. + * This can possibly make room for the second part. */ nm_i = kring->nr_hwcur; if (nm_i != head) { @@ -773,7 +1062,106 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) } kring->nr_hwcur = head; } - IFRATE(rate_ctx.new.rxsync++); + + /* + * Second part: import newly received packets. + */ + if (!netmap_no_pendintr && !force_update) { + return 0; + } + + nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */ + + /* Compute the available space (in bytes) in this netmap ring. + * The first slot that is not considered in is the one before + * nr_hwcur. 
*/ + + avail = nm_prev(kring->nr_hwcur, lim) - nm_i; + if (avail < 0) + avail += lim + 1; + avail *= nm_buf_len; + + /* First pass: While holding the lock on the RX mbuf queue, + * extract as many mbufs as they fit the available space, + * and put them in a temporary queue. + * To avoid performing a per-mbuf division (mlen / nm_buf_len) to + * to update avail, we do the update in a while loop that we + * also use to set the RX slots, but without performing the copy. */ + mbq_init(&tmpq); + mbq_lock(&kring->rx_queue); + for (n = 0;; n++) { + m = mbq_peek(&kring->rx_queue); + if (!m) { + /* No more packets from the driver. */ + break; + } + + mlen = MBUF_LEN(m); + if (mlen > avail) { + /* No more space in the ring. */ + break; + } + + mbq_dequeue(&kring->rx_queue); + + while (mlen) { + copy = nm_buf_len; + if (mlen < copy) { + copy = mlen; + } + mlen -= copy; + avail -= nm_buf_len; + + ring->slot[nm_i].len = copy; + ring->slot[nm_i].flags = slot_flags | (mlen ? NS_MOREFRAG : 0); + nm_i = nm_next(nm_i, lim); + } + + mbq_enqueue(&tmpq, m); + } + mbq_unlock(&kring->rx_queue); + + /* Second pass: Drain the temporary queue, going over the used RX slots, + * and perform the copy out of the RX queue lock. */ + nm_i = kring->nr_hwtail; + + for (;;) { + void *nmaddr; + int ofs = 0; + int morefrag; + + m = mbq_dequeue(&tmpq); + if (!m) { + break; + } + + do { + nmaddr = NMB(na, &ring->slot[nm_i]); + /* We only check the address here on generic rx rings. 
*/ + if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ + m_freem(m); + mbq_purge(&tmpq); + mbq_fini(&tmpq); + return netmap_ring_reinit(kring); + } + + copy = ring->slot[nm_i].len; + m_copydata(m, ofs, copy, nmaddr); + ofs += copy; + morefrag = ring->slot[nm_i].flags & NS_MOREFRAG; + nm_i = nm_next(nm_i, lim); + } while (morefrag); + + m_freem(m); + } + + mbq_fini(&tmpq); + + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; return 0; } @@ -787,9 +1175,8 @@ generic_netmap_dtor(struct netmap_adapter *na) if (prev_na != NULL) { D("Released generic NA %p", gna); - if_rele(ifp); netmap_adapter_put(prev_na); - if (na->ifp == NULL) { + if (nm_iszombie(na)) { /* * The driver has been removed without releasing * the reference so we need to do it here. @@ -797,9 +1184,13 @@ generic_netmap_dtor(struct netmap_adapter *na) netmap_adapter_put(prev_na); } } - WNA(ifp) = prev_na; - D("Restored native NA %p", prev_na); + NM_ATTACH_NA(ifp, prev_na); + /* + * netmap_detach_common(), that it's called after this function, + * overrides WNA(ifp) if na->ifp is not NULL. 
+ */ na->ifp = NULL; + D("Restored native NA %p", prev_na); } /* @@ -823,7 +1214,7 @@ generic_netmap_attach(struct ifnet *ifp) num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ - generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ + nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); if (num_tx_desc == 0 || num_rx_desc == 0) { D("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc); @@ -855,12 +1246,23 @@ generic_netmap_attach(struct ifnet *ifp) ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", ifp->num_rx_queues, ifp->real_num_rx_queues); - generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); retval = netmap_attach_common(na); if (retval) { free(gna, M_DEVBUF); + return retval; } + gna->prev = NA(ifp); /* save old na */ + if (gna->prev != NULL) { + netmap_adapter_get(gna->prev); + } + NM_ATTACH_NA(ifp, na); + + nm_os_generic_set_features(gna); + + D("Created generic NA %p (prev %p)", gna, gna->prev); + return retval; } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 4aead85285fd..de21f29585e0 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,7 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -48,24 +49,34 @@ #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif -#if defined(CONFIG_NETMAP_V1000) -#define WITH_V1000 +#if defined(CONFIG_NETMAP_PTNETMAP_GUEST) +#define WITH_PTNETMAP_GUEST +#endif +#if defined(CONFIG_NETMAP_PTNETMAP_HOST) +#define WITH_PTNETMAP_HOST #endif -#else /* not linux */ +#elif defined (_WIN32) +#define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES +#define WITH_MONITOR +#define WITH_GENERIC +#else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC +#define WITH_PTNETMAP_HOST /* ptnetmap host support */ +#define WITH_PTNETMAP_GUEST /* ptnetmap guest support */ #endif #if defined(__FreeBSD__) -#include <sys/selinfo.h> #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) +#define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ @@ -77,9 +88,11 @@ #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo +#define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) -#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) +#define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ @@ -98,23 +111,20 @@ struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? 
*((m)->m_ext.ext_cnt) : -1) -#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt) +#define MBUF_REFCNT(m) ((m)->m_ext.ext_count) +#define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x #else -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) +#define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt) #endif -MALLOC_DECLARE(M_NETMAP); +#define MBUF_QUEUED(m) 1 struct nm_selinfo { struct selinfo si; struct mtx m; }; -void freebsd_selwakeup(struct nm_selinfo *si, int pri); // XXX linux struct, not used in FreeBSD struct net_device_ops { @@ -131,12 +141,16 @@ struct hrtimer { #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) -#define MBUF_IFP(m) ((m)->dev) -#define NM_SEND_UP(ifp, m) \ - do { \ - m->priority = NM_MAGIC_PRIORITY_RX; \ - netif_rx(m); \ - } while (0) +#define MBUF_TRANSMIT(na, ifp, m) \ + ({ \ + /* Avoid infinite recursion with generic. */ \ + m->priority = NM_MAGIC_PRIORITY_TX; \ + (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ + 0; \ + }) + +/* See explanation in nm_os_generic_xmit_frame. 
*/ +#define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int @@ -159,7 +173,51 @@ struct hrtimer { #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) + +#elif defined (_WIN32) +#include "../../../WINDOWS/win_glue.h" + +#define NM_SELRECORD_T IO_STACK_LOCATION +#define NM_SELINFO_T win_SELINFO // see win_glue.h +#define NM_LOCK_T win_spinlock_t // see win_glue.h +#define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ + +#define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); +#define NM_MTX_DESTROY(m) do { (void)(m); } while (0) +#define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) +#define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) +#define NM_MTX_ASSERT(m) assert(&m.Count>0) + +//These linknames are for the NDIS driver +#define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" +#define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" + +//Definition of internal driver-to-driver ioctl codes +#define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) +#define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) + +//Empty data structures are not permitted by MSVC compiler +//XXX_ale, try to solve this problem +struct net_device_ops{ + char data[1]; +}; +typedef struct ethtool_ops{ + char data[1]; +}; +typedef struct hrtimer{ + KTIMER timer; + BOOLEAN active; + KDPC deferred_proc; +}; + +/* MSVC does not have likely/unlikely support */ +#ifdef _MSC_VER +#define likely(x) (x) +#define unlikely(x) (x) +#else +#define likely(x) __builtin_expect((long)!!(x), 1L) +#define unlikely(x) __builtin_expect((long)!!(x), 0L) +#endif //_MSC_VER #else @@ -167,6 +225,13 @@ struct hrtimer { #endif /* end - platform-specific code */ +#ifndef _WIN32 /* support for emulated sysctl */ +#define SYSBEGIN(x) +#define SYSEND +#endif /* _WIN32 */ + +#define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + 
#define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) @@ -201,8 +266,36 @@ struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; +/* os-specific NM_SELINFO_T initialzation/destruction functions */ +void nm_os_selinfo_init(NM_SELINFO_T *); +void nm_os_selinfo_uninit(NM_SELINFO_T *); + const char *nm_dump_buf(char *p, int len, int lim, char *dst); +void nm_os_selwakeup(NM_SELINFO_T *si); +void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); + +int nm_os_ifnet_init(void); +void nm_os_ifnet_fini(void); +void nm_os_ifnet_lock(void); +void nm_os_ifnet_unlock(void); + +void nm_os_get_module(void); +void nm_os_put_module(void); + +void netmap_make_zombie(struct ifnet *); +void netmap_undo_zombie(struct ifnet *); + +/* passes a packet up to the host stack. + * If the packet is sent (or dropped) immediately it returns NULL, + * otherwise it links the packet to prev and returns m. + * In this case, a final call with m=NULL and prev != NULL will send up + * the entire chain to the host stack. + */ +void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); + +int nm_os_mbuf_has_offld(struct mbuf *m); + #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; @@ -299,6 +392,19 @@ struct netmap_kring { uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. #define NKR_EXCLUSIVE 0x2 /* exclusive binding */ +#define NKR_FORWARD 0x4 /* (host ring only) there are + packets to forward + */ +#define NKR_NEEDRING 0x8 /* ring needed even if users==0 + * (used internally by pipes and + * by ptnetmap host ports) + */ + + uint32_t nr_mode; + uint32_t nr_pending_mode; +#define NKR_NETMAP_OFF 0x0 +#define NKR_NETMAP_ON 0x1 + uint32_t nkr_num_slots; /* @@ -344,13 +450,14 @@ struct netmap_kring { * store incoming mbufs in a queue that is drained by * a rxsync. 
*/ - struct mbuf **tx_pool; - // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ - struct mbq rx_queue; /* intercepted rx mbufs. */ + struct mbuf **tx_pool; + struct mbuf *tx_event; /* TX event used as a notification */ + NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ + struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ - uint32_t ring_id; /* debugging */ + uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ @@ -372,9 +479,6 @@ struct netmap_kring { struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ - struct netmap_ring *save_ring; /* pointer to hidden rings - * (see netmap_pipe.c for details) - */ #endif /* WITH_PIPES */ #ifdef WITH_VALE @@ -397,8 +501,28 @@ struct netmap_kring { uint32_t mon_tail; /* last seen slot on rx */ uint32_t mon_pos; /* index of this ring in the monitored ring array */ #endif -} __attribute__((__aligned__(64))); +} +#ifdef _WIN32 +__declspec(align(64)); +#else +__attribute__((__aligned__(64))); +#endif +/* return 1 iff the kring needs to be turned on */ +static inline int +nm_kring_pending_on(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_ON && + kring->nr_mode == NKR_NETMAP_OFF; +} + +/* return 1 iff the kring needs to be turned off */ +static inline int +nm_kring_pending_off(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_OFF && + kring->nr_mode == NKR_NETMAP_ON; +} /* return the next index, with wraparound */ static inline uint32_t @@ -514,6 +638,8 @@ struct netmap_adapter { */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ +#define NAF_PTNETMAP_HOST 256 /* the adapter supports ptnetmap in the host */ +#define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used 
internally and * cannot be registered from userspace */ @@ -592,10 +718,14 @@ struct netmap_adapter { * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. + * This callback pointer is actually used only to initialize + * kring->nm_notify. + * Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); + void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); @@ -640,14 +770,14 @@ struct netmap_adapter { /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating - * buffer addresses, and the total number of buffers. + * buffer addresses, the total number of buffers and the buffer size. */ struct netmap_mem_d *nm_mem; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by - * bwrap and LINUX/v1000. + * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; @@ -656,6 +786,9 @@ struct netmap_adapter { int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + char name[64]; }; @@ -721,8 +854,6 @@ struct netmap_vp_adapter { /* VALE software port */ struct nm_bridge *na_bdg; int retry; - /* Offset of ethernet header for each packet. */ - u_int virt_hdr_len; /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ @@ -767,6 +898,13 @@ struct netmap_generic_adapter { /* emulated device */ #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif + /* Is the adapter able to use multiple RX slots to scatter + * each packet pushed up by the driver? 
*/ + int rxsg; + + /* Is the transmission path controlled by a netmap-aware + * device queue (i.e. qdisc on linux)? */ + int txqdisc; }; #endif /* WITH_GENERIC */ @@ -777,7 +915,7 @@ netmap_real_rings(struct netmap_adapter *na, enum txrx t) } #ifdef WITH_VALE - +struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * @@ -827,9 +965,6 @@ struct netmap_bwrap_adapter { struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ - /* backup of the hwna memory allocator */ - struct netmap_mem_d *save_nmd; - /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need @@ -838,10 +973,10 @@ struct netmap_bwrap_adapter { * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; + struct nm_bdg_polling_state *na_polling_state; }; int netmap_bwrap_attach(const char *name, struct netmap_adapter *); - #endif /* WITH_VALE */ #ifdef WITH_PIPES @@ -876,56 +1011,122 @@ nm_kr_rxspace(struct netmap_kring *k) return space; } +/* return slots reserved to tx clients */ +#define nm_kr_txspace(_k) nm_kr_rxspace(_k) -/* True if no space in the tx ring. only valid after txsync_prologue */ + +/* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { return kring->rcur == kring->nr_hwtail; } +/* True if no more completed slots in the rx ring, only valid after + * rxsync_prologue */ +#define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. - * We only care for 0 or !=0 as a return code. 
+ * also check that the ring has not been stopped or locked */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 +#define NM_KR_BUSY 1 /* some other thread is syncing the ring */ +#define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ +#define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ +/* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } -static __inline int nm_kr_tryget(struct netmap_kring *kr) +/* true if the ifp that backed the adapter has disappeared (e.g., the + * driver has been unloaded) + */ +static inline int nm_iszombie(struct netmap_adapter *na); + +/* try to obtain exclusive right to issue the *sync() operations on the ring. + * The right is obtained and must be later relinquished via nm_kr_put() if and + * only if nm_kr_tryget() returns 0. + * If can_sleep is 1 there are only two other possible outcomes: + * - the function returns NM_KR_BUSY + * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr + * (if non-null) + * In both cases the caller will typically skip the ring, possibly collecting + * errors along the way. + * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. + * In the latter case, the function may also return NM_KR_LOCKED and leave *perr + * untouched: ideally, the caller should try again at a later time. 
+ */ +static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { + int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; +retry: + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); + /* we should not return NM_KR_BUSY if the ring was + * actually stopped, so check another time after + * the barrier provided by the atomic operation + */ + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; + } + + if (unlikely(nm_iszombie(kr->na))) { + stopped = NM_KR_STOPPED; + goto stop; + } + + return unlikely(busy) ? NM_KR_BUSY : 0; + +stop: + if (!busy) nm_kr_put(kr); - return NM_KR_STOPPED; + if (stopped == NM_KR_STOPPED) { +/* if POLLERR is defined we want to use it to simplify netmap_poll(). + * Otherwise, any non-zero value will do. + */ +#ifdef POLLERR +#define NM_POLLERR POLLERR +#else +#define NM_POLLERR 1 +#endif /* POLLERR */ + if (perr) + *perr |= NM_POLLERR; +#undef NM_POLLERR + } else if (can_sleep) { + tsleep(kr, 0, "NM_KR_TRYGET", 4); + goto retry; } - return 0; + return stopped; } -static __inline void nm_kr_get(struct netmap_kring *kr) +/* put the ring in the 'stopped' state and wait for the current user (if any) to + * notice. 
stopped must be either NM_KR_STOPPED or NM_KR_LOCKED + */ +static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { + kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } +/* restart a ring after a stop */ +static __inline void nm_kr_start(struct netmap_kring *kr) +{ + kr->nkr_stopped = 0; + nm_kr_put(kr); +} + /* * The following functions are used by individual drivers to @@ -953,10 +1154,26 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* Return codes for netmap_*x_irq. */ +enum { + /* Driver should do normal interrupt processing, e.g. because + * the interface is not in netmap mode. */ + NM_IRQ_PASS = 0, + /* Port is in netmap mode, and the interrupt work has been + * completed. The driver does not have to notify netmap + * again before the next interrupt. */ + NM_IRQ_COMPLETED = -1, + /* Port is in netmap mode, but the interrupt work has not been + * completed. The driver has to make sure netmap will be + * notified again soon, even if no more interrupts come (e.g. + * on Linux the driver should not call napi_complete()). */ + NM_IRQ_RESCHED = -2, +}; + /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) -void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); +int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE @@ -986,35 +1203,74 @@ nm_native_on(struct netmap_adapter *na) return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } +static inline int +nm_iszombie(struct netmap_adapter *na) +{ + return na == NULL || (na->na_flags & NAF_ZOMBIE); +} + +static inline void +nm_update_hostrings_mode(struct netmap_adapter *na) +{ + /* Process nr_mode and nr_pending_mode for host rings. 
*/ + na->tx_rings[na->num_tx_rings].nr_mode = + na->tx_rings[na->num_tx_rings].nr_pending_mode; + na->rx_rings[na->num_rx_rings].nr_mode = + na->rx_rings[na->num_rx_rings].nr_pending_mode; +} + /* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; + /* We do the setup for intercepting packets only if we are the + * first user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + na->na_flags |= NAF_NETMAP_ON; #ifdef IFCAP_NETMAP /* or FreeBSD ? */ ifp->if_capenable |= IFCAP_NETMAP; #endif -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; +#elif defined (_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //na->if_transmit = ifp->if_transmit; + //ifp->if_transmit = netmap_transmit; #else na->if_transmit = (void *)ifp->netdev_ops; ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops; ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto; #endif + nm_update_hostrings_mode(na); } - static inline void nm_clear_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; -#ifdef __FreeBSD__ + /* We undo the setup for intercepting packets only if we are the + * last user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + + nm_update_hostrings_mode(na); + +#if defined(__FreeBSD__) ifp->if_transmit = na->if_transmit; +#elif defined(_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //ifp->if_transmit = na->if_transmit; #else ifp->netdev_ops = (void *)na->if_transmit; ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool; @@ -1025,6 +1281,28 @@ nm_clear_native_flags(struct netmap_adapter *na) #endif } +/* + * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap + * kthreads. 
+ * We need netmap_ring* parameter, because in ptnetmap it is decoupled + * from host kring. + * The user-space ring pointers (head/cur/tail) are shared through + * CSB between host and guest. + */ + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. + */ +uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); + + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); + /* check/fix address and len in tx rings */ #if 1 /* debug version */ @@ -1080,6 +1358,9 @@ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); */ void netmap_krings_delete(struct netmap_adapter *na); +int netmap_hw_krings_create(struct netmap_adapter *na); +void netmap_hw_krings_delete(struct netmap_adapter *na); + /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify @@ -1088,16 +1369,18 @@ void netmap_krings_delete(struct netmap_adapter *na); void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. 
*/ void netmap_set_all_rings(struct netmap_adapter *, int stopped); -/* convenience wrappers for netmap_set_all_rings, used in drivers */ +/* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, uint32_t flags); - +void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); -int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create); +void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); @@ -1124,12 +1407,11 @@ struct netmap_bdg_ops { u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *); +#define NM_BRIDGES 8 /* number of bridges */ #define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) -#define NM_NAME "vale" /* prefix for bridge port name */ - /* these are redefined in case of no VALE support */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); struct nm_bridge *netmap_init_bridges2(u_int); @@ -1181,14 +1463,13 @@ void netmap_bns_getbridges(struct nm_bridge **, u_int *); #endif /* Various prototypes */ -int netmap_poll(struct cdev *dev, int events, struct thread *td); +int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); -int netmap_dtor_locked(struct netmap_priv_d *priv); -int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); +int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t 
data, struct thread *); /* netmap_adapter creation/destruction */ @@ -1228,8 +1509,8 @@ int netmap_adapter_put(struct netmap_adapter *na); /* * module variables */ -#define NETMAP_BUF_BASE(na) ((na)->na_lut.lut[0].vaddr) -#define NETMAP_BUF_SIZE(na) ((na)->na_lut.objsize) +#define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) +#define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; extern int netmap_verbose; // XXX debugging @@ -1245,10 +1526,12 @@ enum { /* verbose flags */ }; extern int netmap_txsync_retry; +extern int netmap_adaptive_io; +extern int netmap_flags; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; -extern int netmap_use_count; +extern int netmap_generic_txqdisc; /* * NA returns a pointer to the struct netmap adapter from the ifp, @@ -1257,37 +1540,27 @@ extern int netmap_use_count; #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* - * Macros to determine if an interface is netmap capable or netmap enabled. - * See the magic field in struct netmap_adapter. - */ -#ifdef __FreeBSD__ -/* - * on FreeBSD just use if_capabilities and if_capenable. - */ -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ - (ifp)->if_capabilities & IFCAP_NETMAP ) - -#define NETMAP_SET_CAPABLE(ifp) \ - (ifp)->if_capabilities |= IFCAP_NETMAP - -#else /* linux */ - -/* - * on linux: - * we check if NA(ifp) is set and its first element has a related + * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we + * overload another pointer in the netdev. + * + * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. 
*/ #define NETMAP_MAGIC 0x52697a7a -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ +#define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) -#define NETMAP_SET_CAPABLE(ifp) \ - NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC +#define NM_ATTACH_NA(ifp, na) do { \ + WNA(ifp) = na; \ + if (NA(ifp)) \ + NA(ifp)->magic = \ + ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ +} while(0) -#endif /* linux */ +#define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ @@ -1331,6 +1604,8 @@ netmap_reload_map(struct netmap_adapter *na, } } +#elif defined(_WIN32) + #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); @@ -1341,8 +1616,8 @@ netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (0 && map) { - *map = dma_map_single(na->pdev, buf, na->na_lut.objsize, - DMA_BIDIRECTIONAL); + *map = dma_map_single(na->pdev, buf, NETMAP_BUF_SIZE(na), + DMA_BIDIRECTIONAL); } } @@ -1350,11 +1625,11 @@ static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, - DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); } } @@ -1362,7 +1637,7 @@ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, @@ -1473,7 +1748,11 @@ PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) struct lut_entry *lut = na->na_lut.lut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; +#ifndef _WIN32 *pp = (i >= na->na_lut.objtotal) ? 
lut[0].paddr : lut[i].paddr; +#else + *pp = (i >= na->na_lut.objtotal) ? (uint64_t)lut[0].paddr.QuadPart : (uint64_t)lut[i].paddr.QuadPart; +#endif return ret; } @@ -1497,8 +1776,9 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; + struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ - u_int np_qfirst[NR_TXRX], + u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ @@ -1512,6 +1792,26 @@ struct netmap_priv_d { struct thread *np_td; /* kqueue, just debugging */ }; +struct netmap_priv_d *netmap_priv_new(void); +void netmap_priv_delete(struct netmap_priv_d *); + +static inline int nm_kring_pending(struct netmap_priv_d *np) +{ + struct netmap_adapter *na = np->np_na; + enum txrx t; + int i; + + for_rx_tx(t) { + for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + if (kring->nr_mode != kring->nr_pending_mode) { + return 1; + } + } + } + return 0; +} + #ifdef WITH_MONITOR struct netmap_monitor_adapter { @@ -1530,13 +1830,36 @@ struct netmap_monitor_adapter { * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; + +int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); +int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); + +/* + * the generic transmit routine is passed a structure to optionally + * build a queue of descriptors, in an OS-specific way. + * The payload is at addr, if non-null, and the routine should send or queue + * the packet, returning 0 if successful, 1 on failure. + * + * At the end, if head is non-null, there will be an additional call + * to the function with addr = NULL; this should tell the OS-specific + * routine to send the queue and free any resources. Failure is ignored. 
+ */ +struct nm_os_gen_arg { + struct ifnet *ifp; + void *m; /* os-specific mbuf-like object */ + void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ + void *addr; /* payload of current packet */ + u_int len; /* packet length */ + u_int ring_nr; /* packet length */ + u_int qevent; /* in txqdisc mode, place an event on this mbuf */ +}; + +int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); +int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); +void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); +void nm_os_generic_set_features(struct netmap_generic_adapter *gna); -int netmap_catch_rx(struct netmap_generic_adapter *na, int intercept); -void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; -void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); -int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); -int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); -void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { @@ -1546,6 +1869,8 @@ netmap_generic_getifp(struct netmap_generic_adapter *gna) return gna->up.up.ifp; } +void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); + //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); @@ -1558,16 +1883,16 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. 
*/ -void netmap_mitigation_init(struct nm_generic_mit *mit, int idx, +void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); -void netmap_mitigation_start(struct nm_generic_mit *mit); -void netmap_mitigation_restart(struct nm_generic_mit *mit); -int netmap_mitigation_active(struct nm_generic_mit *mit); -void netmap_mitigation_cleanup(struct nm_generic_mit *mit); +void nm_os_mitigation_start(struct nm_generic_mit *mit); +void nm_os_mitigation_restart(struct nm_generic_mit *mit); +int nm_os_mitigation_active(struct nm_generic_mit *mit); +void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); +#else /* !WITH_GENERIC */ +#define generic_netmap_attach(ifp) (EOPNOTSUPP) #endif /* WITH_GENERIC */ - - /* Shared declarations for the VALE switch. */ /* @@ -1656,22 +1981,111 @@ struct nm_ipv6hdr { */ #define rawsum_t uint32_t -rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); -uint16_t nm_csum_ipv4(struct nm_iphdr *iph); -void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph); +void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check); -void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check); -uint16_t nm_csum_fold(rawsum_t cur_sum); +uint16_t nm_os_csum_fold(rawsum_t cur_sum); void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, u_int *j, u_int lim, u_int *howmany); /* persistent virtual port routines */ -int nm_vi_persist(const char *, struct ifnet **); -void nm_vi_detach(struct ifnet *); -void nm_vi_init_index(void); +int nm_os_vi_persist(const char *, struct ifnet **); +void nm_os_vi_detach(struct 
ifnet *); +void nm_os_vi_init_index(void); + +/* + * kernel thread routines + */ +struct nm_kthread; /* OS-specific kthread - opaque */ +typedef void (*nm_kthread_worker_fn_t)(void *data); + +/* kthread configuration */ +struct nm_kthread_cfg { + long type; /* kthread type/identifier */ + struct ptnet_ring_cfg event; /* event/ioctl fd */ + nm_kthread_worker_fn_t worker_fn; /* worker function */ + void *worker_private;/* worker parameter */ + int attach_user; /* attach kthread to user process */ +}; +/* kthread configuration */ +struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg); +int nm_os_kthread_start(struct nm_kthread *); +void nm_os_kthread_stop(struct nm_kthread *); +void nm_os_kthread_delete(struct nm_kthread *); +void nm_os_kthread_wakeup_worker(struct nm_kthread *nmk); +void nm_os_kthread_send_irq(struct nm_kthread *); +void nm_os_kthread_set_affinity(struct nm_kthread *, int); +u_int nm_os_ncpus(void); + +#ifdef WITH_PTNETMAP_HOST +/* + * netmap adapter for host ptnetmap ports + */ +struct netmap_pt_host_adapter { + struct netmap_adapter up; + + struct netmap_adapter *parent; + int (*parent_nm_notify)(struct netmap_kring *kring, int flags); + void *ptns; +}; +/* ptnetmap HOST routines */ +int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na); +static inline int +nm_ptnetmap_host_on(struct netmap_adapter *na) +{ + return na && na->na_flags & NAF_PTNETMAP_HOST; +} +#else /* !WITH_PTNETMAP_HOST */ +#define netmap_get_pt_host_na(nmr, _2, _3) \ + ((nmr)->nr_flags & (NR_PTNETMAP_HOST) ? 
EOPNOTSUPP : 0) +#define ptnetmap_ctl(_1, _2) EINVAL +#define nm_ptnetmap_host_on(_1) EINVAL +#endif /* !WITH_PTNETMAP_HOST */ + +#ifdef WITH_PTNETMAP_GUEST +/* ptnetmap GUEST routines */ + +typedef uint32_t (*nm_pt_guest_ptctl_t)(struct ifnet *, uint32_t); + +/* + * netmap adapter for guest ptnetmap ports + */ +struct netmap_pt_guest_adapter { + /* The netmap adapter to be used by netmap applications. + * This field must be the first, to allow upcast. */ + struct netmap_hw_adapter hwup; + + /* The netmap adapter to be used by the driver. */ + struct netmap_hw_adapter dr; + + void *csb; + + /* Reference counter to track users of backend netmap port: the + * network stack and netmap clients. + * Used to decide when we need (de)allocate krings/rings and + * start (stop) ptnetmap kthreads. */ + int backend_regifs; + +}; + +int netmap_pt_guest_attach(struct netmap_adapter *, void *, + unsigned int, nm_pt_guest_ptctl_t); +struct ptnet_ring; +bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +int ptnet_nm_krings_create(struct netmap_adapter *na); +void ptnet_nm_krings_delete(struct netmap_adapter *na); +void ptnet_nm_dtor(struct netmap_adapter *na); +#endif /* WITH_PTNETMAP_GUEST */ #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c index 503f5a13aa95..3eb971b74561 100644 --- a/sys/dev/netmap/netmap_mbq.c +++ b/sys/dev/netmap/netmap_mbq.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,6 +31,8 @@ #ifdef linux #include "bsd_glue.h" +#elif defined (_WIN32) +#include "win_glue.h" #else /* __FreeBSD__ */ #include <sys/param.h> #include <sys/lock.h> @@ -152,12 +155,12 @@ void mbq_safe_purge(struct mbq *q) } -void mbq_safe_destroy(struct mbq *q) +void mbq_safe_fini(struct mbq *q) { mtx_destroy(&q->lock); } -void mbq_destroy(struct mbq *q) +void mbq_fini(struct mbq *q) { } diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index 455ca8a2c3ac..9dafa8b1149b 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -40,6 +41,8 @@ /* XXX probably rely on a previous definition of SPINLOCK_T */ #ifdef linux #define SPINLOCK_T safe_spinlock_t +#elif defined (_WIN32) +#define SPINLOCK_T win_spinlock_t #else #define SPINLOCK_T struct mtx #endif @@ -52,16 +55,21 @@ struct mbq { SPINLOCK_T lock; }; -/* XXX "destroy" does not match "init" as a name. - * We should also clarify whether init can be used while +/* We should clarify whether init can be used while * holding a lock, and whether mbq_safe_destroy() is a NOP. */ void mbq_init(struct mbq *q); -void mbq_destroy(struct mbq *q); +void mbq_fini(struct mbq *q); void mbq_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_dequeue(struct mbq *q); void mbq_purge(struct mbq *q); +static inline struct mbuf * +mbq_peek(struct mbq *q) +{ + return q->head ? 
q->head : NULL; +} + static inline void mbq_lock(struct mbq *q) { @@ -76,7 +84,7 @@ mbq_unlock(struct mbq *q) void mbq_safe_init(struct mbq *q); -void mbq_safe_destroy(struct mbq *q); +void mbq_safe_fini(struct mbq *q); void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_safe_dequeue(struct mbq *q); void mbq_safe_purge(struct mbq *q); diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index fd0c06bb8b57..b54c9813c33f 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/malloc.h> +#include <sys/kernel.h> /* MALLOC_DEFINE */ #include <sys/proc.h> #include <vm/vm.h> /* vtophys */ #include <vm/pmap.h> /* vtophys */ @@ -48,13 +52,26 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <machine/bus.h> /* bus_dmamap_* */ +/* M_NETMAP only used in here */ +MALLOC_DECLARE(M_NETMAP); +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + #endif /* __FreeBSD__ */ +#ifdef _WIN32 +#include <win_glue.h> +#endif + #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <net/netmap_virt.h> #include "netmap_mem2.h" -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#ifdef _WIN32_USE_SMALL_GENERIC_DEVICES_MEMORY +#define NETMAP_BUF_MAX_NUM 8*4096 /* if too big takes too much time to allocate */ +#else +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#endif #define NETMAP_POOL_MAX_NAMSZ 32 @@ -111,7 +128,7 @@ struct netmap_obj_pool { struct netmap_mem_ops { - void (*nmd_get_lut)(struct netmap_mem_d *, 
struct netmap_lut*); + int (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*); int (*nmd_get_info)(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); @@ -130,6 +147,39 @@ struct netmap_mem_ops { typedef uint16_t nm_memid_t; +/* + * Shared info for netmap allocator + * + * Each allocator contains this structur as first netmap_if. + * In this way, we can share same details about allocator + * to the VM. + * Used in ptnetmap. + */ +struct netmap_mem_shared_info { +#ifndef _WIN32 + struct netmap_if up; /* ends with a 0-sized array, which VSC does not like */ +#else /* !_WIN32 */ + char up[sizeof(struct netmap_if)]; +#endif /* !_WIN32 */ + uint64_t features; +#define NMS_FEAT_BUF_POOL 0x0001 +#define NMS_FEAT_MEMSIZE 0x0002 + + uint32_t buf_pool_offset; + uint32_t buf_pool_objtotal; + uint32_t buf_pool_objsize; + uint32_t totalsize; +}; + +#define NMS_NAME "nms_info" +#define NMS_VERSION 1 +static const struct netmap_if nms_if_blueprint = { + .ni_name = NMS_NAME, + .ni_version = NMS_VERSION, + .ni_tx_rings = 0, + .ni_rx_rings = 0 +}; + struct netmap_mem_d { NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ @@ -151,6 +201,9 @@ struct netmap_mem_d { struct netmap_mem_ops *ops; }; +/* + * XXX need to fix the case of t0 == void + */ #define NMD_DEFCB(t0, name) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd) \ @@ -186,7 +239,7 @@ netmap_mem_##name(struct netmap_adapter *na, t1 a1) \ return na->nm_mem->ops->nmd_##name(na, a1); \ } -NMD_DEFCB1(void, get_lut, struct netmap_lut *); +NMD_DEFCB1(int, get_lut, struct netmap_lut *); NMD_DEFCB3(int, get_info, u_int *, u_int *, uint16_t *); NMD_DEFCB1(vm_paddr_t, ofstophys, vm_ooffset_t); static int netmap_mem_config(struct netmap_mem_d *); @@ -201,7 +254,7 @@ NMD_DEFNACB(void, rings_delete); static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *); static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *); -static int 
nm_mem_assign_group(struct netmap_mem_d *, device_t); +static int nm_mem_assign_group(struct netmap_mem_d *, struct device *); #define NMA_LOCK_INIT(n) NM_MTX_INIT((n)->nm_mtx) #define NMA_LOCK_DESTROY(n) NM_MTX_DESTROY((n)->nm_mtx) @@ -248,7 +301,9 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) if (nm_mem_assign_group(nmd, na->pdev) < 0) { return ENOMEM; } else { - nmd->ops->nmd_finalize(nmd); + NMA_LOCK(nmd); + nmd->lasterr = nmd->ops->nmd_finalize(nmd); + NMA_UNLOCK(nmd); } if (!nmd->lasterr && na->pdev) @@ -257,26 +312,83 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) return nmd->lasterr; } +static int netmap_mem_init_shared_info(struct netmap_mem_d *nmd); + void netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na) { NMA_LOCK(nmd); netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na); + if (nmd->active == 1) { + u_int i; + + /* + * Reset the allocator when it falls out of use so that any + * pool resources leaked by unclean application exits are + * reclaimed. + */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p; + u_int j; + + p = &nmd->pools[i]; + p->objfree = p->objtotal; + /* + * Reproduce the net effect of the M_ZERO malloc() + * and marking of free entries in the bitmap that + * occur in finalize_obj_allocator() + */ + memset(p->bitmap, + '\0', + sizeof(uint32_t) * ((p->objtotal + 31) / 32)); + + /* + * Set all the bits in the bitmap that have + * corresponding buffers to 1 to indicate they are + * free. + */ + for (j = 0; j < p->objtotal; j++) { + if (p->lut[j].vaddr != NULL) { + p->bitmap[ (j>>5) ] |= ( 1 << (j & 31) ); + } + } + } + + /* + * Per netmap_mem_finalize_all(), + * buffers 0 and 1 are reserved + */ + nmd->pools[NETMAP_BUF_POOL].objfree -= 2; + if (nmd->pools[NETMAP_BUF_POOL].bitmap) { + /* XXX This check is a workaround that prevents a + * NULL pointer crash which currently happens only + * with ptnetmap guests. 
Also, + * netmap_mem_init_shared_info must not be called + * by ptnetmap guest. */ + nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; + + /* expose info to the ptnetmap guest */ + netmap_mem_init_shared_info(nmd); + } + } + nmd->ops->nmd_deref(nmd); + NMA_UNLOCK(nmd); - return nmd->ops->nmd_deref(nmd); } /* accessor functions */ -static void +static int netmap_mem2_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) { lut->lut = nmd->pools[NETMAP_BUF_POOL].lut; lut->objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; lut->objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + + return 0; } -struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, .num = 100, @@ -291,10 +403,10 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; -struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, - .num = 1, + .num = 2, }, [NETMAP_RING_POOL] = { .size = 5*PAGE_SIZE, @@ -348,11 +460,12 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ }; -struct netmap_mem_d *netmap_last_mem_d = &nm_mem; +static struct netmap_mem_d *netmap_last_mem_d = &nm_mem; /* blueprint for the private memory allocators */ extern struct netmap_mem_ops netmap_mem_private_ops; /* forward */ -const struct netmap_mem_d nm_blueprint = { +/* XXX clang is not happy about using name as a print format */ +static const struct netmap_mem_d nm_blueprint = { .pools = { [NETMAP_IF_POOL] = { .name = "%s_if", @@ -388,6 +501,8 @@ const struct netmap_mem_d nm_blueprint = { #define DECLARE_SYSCTLS(id, name) \ + SYSBEGIN(mem2_ ## name); \ + SYSCTL_DECL(_dev_netmap); /* leave it here, easier for porting */ \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ @@ -401,22 +516,21 @@ const struct netmap_mem_d nm_blueprint = { "Default size of private netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ - "Default number of private netmap " STRINGIFY(name) "s") + "Default number of private netmap " STRINGIFY(name) "s"); \ + SYSEND -SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +/* call with NMA_LOCK(&nm_mem) held */ static int -nm_mem_assign_id(struct netmap_mem_d *nmd) +nm_mem_assign_id_locked(struct netmap_mem_d *nmd) { nm_memid_t id; struct netmap_mem_d *scan = netmap_last_mem_d; int error = ENOMEM; - NMA_LOCK(&nm_mem); - do { /* we rely on unsigned wrap around */ id = scan->nm_id + 1; @@ -435,10 +549,22 @@ nm_mem_assign_id(struct netmap_mem_d *nmd) } } while (scan != netmap_last_mem_d); - NMA_UNLOCK(&nm_mem); return error; } +/* call with NMA_LOCK(&nm_mem) *not* held */ +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + int ret; + + NMA_LOCK(&nm_mem); + ret = nm_mem_assign_id_locked(nmd); + NMA_UNLOCK(&nm_mem); + 
+ return ret; +} + static void nm_mem_release_id(struct netmap_mem_d *nmd) { @@ -456,7 +582,7 @@ nm_mem_release_id(struct netmap_mem_d *nmd) } static int -nm_mem_assign_group(struct netmap_mem_d *nmd, device_t dev) +nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev) { int err = 0, id; id = nm_iommu_group_id(dev); @@ -494,8 +620,13 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) if (offset >= p[i].memtotal) continue; // now lookup the cluster's address +#ifndef _WIN32 pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) + offset % p[i]._objsize; +#else + pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr); + pa.QuadPart += offset % p[i]._objsize; +#endif NMA_UNLOCK(nmd); return pa; } @@ -508,7 +639,110 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) + p[NETMAP_RING_POOL].memtotal + p[NETMAP_BUF_POOL].memtotal); NMA_UNLOCK(nmd); +#ifndef _WIN32 return 0; // XXX bad address +#else + vm_paddr_t res; + res.QuadPart = 0; + return res; +#endif +} + +#ifdef _WIN32 + +/* + * win32_build_virtual_memory_for_userspace + * + * This function get all the object making part of the pools and maps + * a contiguous virtual memory space for the userspace + * It works this way + * 1 - allocate a Memory Descriptor List wide as the sum + * of the memory needed for the pools + * 2 - cycle all the objects in every pool and for every object do + * + * 2a - cycle all the objects in every pool, get the list + * of the physical address descriptors + * 2b - calculate the offset in the array of pages desciptor in the + * main MDL + * 2c - copy the descriptors of the object in the main MDL + * + * 3 - return the resulting MDL that needs to be mapped in userland + * + * In this way we will have an MDL that describes all the memory for the + * objects in a single object +*/ + +PMDL +win32_build_user_vm_map(struct netmap_mem_d* nmd) +{ + int i, j; + u_int memsize, memflags, ofs = 0; + PMDL mainMdl, tempMdl; + + if 
(netmap_mem_get_info(nmd, &memsize, &memflags, NULL)) { + D("memory not finalised yet"); + return NULL; + } + + mainMdl = IoAllocateMdl(NULL, memsize, FALSE, FALSE, NULL); + if (mainMdl == NULL) { + D("failed to allocate mdl"); + return NULL; + } + + NMA_LOCK(nmd); + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = &nmd->pools[i]; + int clsz = p->_clustsize; + int clobjs = p->_clustentries; /* objects per cluster */ + int mdl_len = sizeof(PFN_NUMBER) * BYTES_TO_PAGES(clsz); + PPFN_NUMBER pSrc, pDst; + + /* each pool has a different cluster size so we need to reallocate */ + tempMdl = IoAllocateMdl(p->lut[0].vaddr, clsz, FALSE, FALSE, NULL); + if (tempMdl == NULL) { + NMA_UNLOCK(nmd); + D("fail to allocate tempMdl"); + IoFreeMdl(mainMdl); + return NULL; + } + pSrc = MmGetMdlPfnArray(tempMdl); + /* create one entry per cluster, the lut[] has one entry per object */ + for (j = 0; j < p->numclusters; j++, ofs += clsz) { + pDst = &MmGetMdlPfnArray(mainMdl)[BYTES_TO_PAGES(ofs)]; + MmInitializeMdl(tempMdl, p->lut[j*clobjs].vaddr, clsz); + MmBuildMdlForNonPagedPool(tempMdl); /* compute physical page addresses */ + RtlCopyMemory(pDst, pSrc, mdl_len); /* copy the page descriptors */ + mainMdl->MdlFlags = tempMdl->MdlFlags; /* XXX what is in here ? */ + } + IoFreeMdl(tempMdl); + } + NMA_UNLOCK(nmd); + return mainMdl; +} + +#endif /* _WIN32 */ + +/* + * helper function for OS-specific mmap routines (currently only windows). + * Given an nmd and a pool index, returns the cluster size and number of clusters. + * Returns 0 if memory is finalised and the pool is valid, otherwise 1. + * It should be called under NMA_LOCK(nmd) otherwise the underlying info can change. 
+ */ + +int +netmap_mem2_get_pool_info(struct netmap_mem_d* nmd, u_int pool, u_int *clustsize, u_int *numclusters) +{ + if (!nmd || !clustsize || !numclusters || pool >= NETMAP_POOLS_NR) + return 1; /* invalid arguments */ + // NMA_LOCK_ASSERT(nmd); + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + *clustsize = *numclusters = 0; + return 1; /* not ready yet */ + } + *clustsize = nmd->pools[pool]._clustsize; + *numclusters = nmd->pools[pool].numclusters; + return 0; /* success */ } static int @@ -578,12 +812,6 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) ((n)->pools[NETMAP_IF_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) -#define netmap_buf_offset(n, v) \ - ((n)->pools[NETMAP_IF_POOL].memtotal + \ - (n)->pools[NETMAP_RING_POOL].memtotal + \ - netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) - - static ssize_t netmap_mem2_if_offset(struct netmap_mem_d *nmd, const void *addr) { @@ -602,7 +830,7 @@ static void * netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ - uint32_t mask, j; /* slot counter */ + uint32_t mask, j = 0; /* slot counter */ void *vaddr = NULL; if (len > p->_objsize) { @@ -636,7 +864,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ if (index) *index = i * 32 + j; } - ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr); + ND("%s allocator: allocated object @ [%d][%d]: vaddr %p",p->name, i, j, vaddr); if (start) *start = i; @@ -733,7 +961,7 @@ netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) *head = cur; /* restore */ break; } - RD(5, "allocate buffer %d -> %d", *head, cur); + ND(5, "allocate buffer %d -> %d", *head, cur); *p = cur; /* link to previous head */ } @@ -750,7 +978,7 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; uint32_t i, cur, *buf; - D("freeing the 
extra list"); + ND("freeing the extra list"); for (i = 0; head >=2 && head < p->objtotal; i++) { cur = head; buf = lut[head].vaddr; @@ -761,7 +989,8 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) } if (head != 0) D("breaking with head %d", head); - D("freed %d buffers", i); + if (netmap_verbose) + D("freed %d buffers", i); } @@ -846,7 +1075,6 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) p->bitmap = NULL; if (p->lut) { u_int i; - size_t sz = p->_clustsize; /* * Free each cluster allocated in @@ -856,7 +1084,7 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) */ for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, sz, M_NETMAP); + contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux @@ -973,6 +1201,18 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj return 0; } +static struct lut_entry * +nm_alloc_lut(u_int nobj) +{ + size_t n = sizeof(struct lut_entry) * nobj; + struct lut_entry *lut; +#ifdef linux + lut = vmalloc(n); +#else + lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); +#endif + return lut; +} /* call with NMA_LOCK held */ static int @@ -985,14 +1225,9 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->numclusters = p->_numclusters; p->objtotal = p->_objtotal; - n = sizeof(struct lut_entry) * p->objtotal; -#ifdef linux - p->lut = vmalloc(n); -#else - p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); -#endif + p->lut = nm_alloc_lut(p->objtotal); if (p->lut == NULL) { - D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); + D("Unable to create lookup table for '%s'", p->name); goto clean; } @@ -1015,6 +1250,13 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) int lim = i + p->_clustentries; char *clust; + /* + * XXX Note, we only need contigmalloc() for buffers attached + * to native interfaces. 
In all other cases (nifp, netmap rings + * and even buffers for VALE ports or emulated interfaces) we + * can live with standard malloc, because the hardware will not + * access the pages directly. + */ clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { @@ -1108,10 +1350,15 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) if (na->pdev == NULL) return 0; -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) (void)i; (void)lim; D("unsupported on FreeBSD"); + +#elif defined(_WIN32) + (void)i; + (void)lim; + D("unsupported on Windows"); //XXX_ale, really? #else /* linux */ for (i = 2; i < lim; i++) { netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr); @@ -1124,8 +1371,10 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) static int netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) { -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) D("unsupported on FreeBSD"); +#elif defined(_WIN32) + D("unsupported on Windows"); //XXX_ale, really? 
#else /* linux */ int i, lim = p->_objtotal; @@ -1142,6 +1391,30 @@ netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) } static int +netmap_mem_init_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_shared_info *nms_info; + ssize_t base; + + /* Use the first slot in IF_POOL */ + nms_info = netmap_if_malloc(nmd, sizeof(*nms_info)); + if (nms_info == NULL) { + return ENOMEM; + } + + base = netmap_if_offset(nmd, nms_info); + + memcpy(&nms_info->up, &nms_if_blueprint, sizeof(nms_if_blueprint)); + nms_info->buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + nmd->pools[NETMAP_RING_POOL].memtotal; + nms_info->buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; + nms_info->buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + nms_info->totalsize = nmd->nm_totalsize; + nms_info->features = NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE; + + return 0; +} + +static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { int i; @@ -1160,6 +1433,11 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; + /* expose info to the ptnetmap guest */ + nmd->lasterr = netmap_mem_init_shared_info(nmd); + if (nmd->lasterr) + goto error; + if (netmap_verbose) D("interfaces %d KB, rings %d KB, buffers %d MB", nmd->pools[NETMAP_IF_POOL].memtotal >> 10, @@ -1207,10 +1485,9 @@ static int netmap_mem_private_finalize(struct netmap_mem_d *nmd) { int err; - NMA_LOCK(nmd); - nmd->active++; err = netmap_mem_finalize_all(nmd); - NMA_UNLOCK(nmd); + if (!err) + nmd->active++; return err; } @@ -1218,10 +1495,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd) static void netmap_mem_private_deref(struct netmap_mem_d *nmd) { - NMA_LOCK(nmd); if (--nmd->active <= 0) netmap_mem_reset_all(nmd); - NMA_UNLOCK(nmd); } @@ -1238,7 +1513,7 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), - M_DEVBUF, M_NOWAIT | M_ZERO); + 
M_DEVBUF, M_NOWAIT | M_ZERO); if (d == NULL) { err = ENOMEM; goto error; @@ -1357,10 +1632,10 @@ static int netmap_mem_global_finalize(struct netmap_mem_d *nmd) { int err; - + /* update configuration if changed */ if (netmap_mem_global_config(nmd)) - goto out; + return nmd->lasterr; nmd->active++; @@ -1417,13 +1692,17 @@ netmap_free_rings(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < netmap_real_rings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; struct netmap_ring *ring = kring->ring; - if (ring == NULL) + if (ring == NULL || kring->users > 0 || (kring->nr_kflags & NKR_NEEDRING)) { + ND("skipping ring %s (ring %p, users %d)", + kring->name, ring, kring->users); continue; - netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + } + if (i != nma_get_nrings(na, t) || na->na_flags & NAF_HOST_RINGS) + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); netmap_ring_free(na->nm_mem, ring); kring->ring = NULL; } @@ -1452,9 +1731,10 @@ netmap_mem2_rings_create(struct netmap_adapter *na) struct netmap_ring *ring = kring->ring; u_int len, ndesc; - if (ring) { - ND("%s already created", kring->name); - continue; /* already created by somebody else */ + if (ring || (!kring->users && !(kring->nr_kflags & NKR_NEEDRING))) { + /* uneeded, or already created by somebody else */ + ND("skipping ring %s", kring->name); + continue; } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + @@ -1569,10 +1849,22 @@ netmap_mem2_if_new(struct netmap_adapter *na) */ base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < n[NR_TX]; i++) { + if (na->tx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? 
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < n[NR_RX]; i++) { + if (na->rx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } @@ -1636,3 +1928,531 @@ struct netmap_mem_ops netmap_mem_private_ops = { .nmd_rings_create = netmap_mem2_rings_create, .nmd_rings_delete = netmap_mem2_rings_delete }; + +#ifdef WITH_PTNETMAP_GUEST +struct mem_pt_if { + struct mem_pt_if *next; + struct ifnet *ifp; + unsigned int nifp_offset; + nm_pt_guest_ptctl_t ptctl; +}; + +/* Netmap allocator for ptnetmap guests. */ +struct netmap_mem_ptg { + struct netmap_mem_d up; + + vm_paddr_t nm_paddr; /* physical address in the guest */ + void *nm_addr; /* virtual address in the guest */ + struct netmap_lut buf_lut; /* lookup table for BUF pool in the guest */ + nm_memid_t nm_host_id; /* allocator identifier in the host */ + struct ptnetmap_memdev *ptn_dev; + struct mem_pt_if *pt_ifs; /* list of interfaces in passthrough */ +}; + +/* Link a passthrough interface to a passthrough netmap allocator. 
*/ +static int +netmap_mem_pt_guest_ifp_add(struct netmap_mem_d *nmd, struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *ptif = malloc(sizeof(*ptif), M_NETMAP, + M_NOWAIT | M_ZERO); + + if (!ptif) { + return ENOMEM; + } + + NMA_LOCK(nmd); + + ptif->ifp = ifp; + ptif->nifp_offset = nifp_offset; + ptif->ptctl = ptctl; + + if (ptnmd->pt_ifs) { + ptif->next = ptnmd->pt_ifs; + } + ptnmd->pt_ifs = ptif; + + NMA_UNLOCK(nmd); + + D("added (ifp=%p,nifp_offset=%u)", ptif->ifp, ptif->nifp_offset); + + return 0; +} + +/* Called with NMA_LOCK(nmd) held. */ +static struct mem_pt_if * +netmap_mem_pt_guest_ifp_lookup(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *curr; + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + return curr; + } + } + + return NULL; +} + +/* Unlink a passthrough interface from a passthrough netmap allocator. 
*/ +int +netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *prev = NULL; + struct mem_pt_if *curr; + int ret = -1; + + NMA_LOCK(nmd); + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + if (prev) { + prev->next = curr->next; + } else { + ptnmd->pt_ifs = curr->next; + } + D("removed (ifp=%p,nifp_offset=%u)", + curr->ifp, curr->nifp_offset); + free(curr, M_NETMAP); + ret = 0; + break; + } + prev = curr; + } + + NMA_UNLOCK(nmd); + + return ret; +} + +/* Read allocator info from the first netmap_if (only on finalize) */ +static int +netmap_mem_pt_guest_read_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct netmap_mem_shared_info *nms_info; + uint32_t bufsize; + uint32_t nbuffers; + char *vaddr; + vm_paddr_t paddr; + int i; + + nms_info = (struct netmap_mem_shared_info *)ptnmd->nm_addr; + if (strncmp(nms_info->up.ni_name, NMS_NAME, sizeof(NMS_NAME)) != 0) { + D("error, the first slot does not contain shared info"); + return EINVAL; + } + /* check features mem_shared info */ + if ((nms_info->features & (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) != + (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) { + D("error, the shared info does not contain BUF_POOL and MEMSIZE"); + return EINVAL; + } + + bufsize = nms_info->buf_pool_objsize; + nbuffers = nms_info->buf_pool_objtotal; + + /* allocate the lut */ + if (ptnmd->buf_lut.lut == NULL) { + D("allocating lut"); + ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers); + if (ptnmd->buf_lut.lut == NULL) { + D("lut allocation failed"); + return ENOMEM; + } + } + + /* we have physically contiguous memory mapped through PCI BAR */ + vaddr = (char *)(ptnmd->nm_addr) + nms_info->buf_pool_offset; + paddr = ptnmd->nm_paddr + nms_info->buf_pool_offset; + + for (i = 0; i < nbuffers; i++) { + ptnmd->buf_lut.lut[i].vaddr = vaddr; + ptnmd->buf_lut.lut[i].paddr = 
paddr; + vaddr += bufsize; + paddr += bufsize; + } + + ptnmd->buf_lut.objtotal = nbuffers; + ptnmd->buf_lut.objsize = bufsize; + + nmd->nm_totalsize = nms_info->totalsize; + + return 0; +} + +static int +netmap_mem_pt_guest_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + return EINVAL; + } + + *lut = ptnmd->buf_lut; + return 0; +} + +static int +netmap_mem_pt_guest_get_info(struct netmap_mem_d *nmd, u_int *size, + u_int *memflags, uint16_t *id) +{ + int error = 0; + + NMA_LOCK(nmd); + + error = nmd->ops->nmd_config(nmd); + if (error) + goto out; + + if (size) + *size = nmd->nm_totalsize; + if (memflags) + *memflags = nmd->flags; + if (id) + *id = nmd->nm_id; + +out: + NMA_UNLOCK(nmd); + + return error; +} + +static vm_paddr_t +netmap_mem_pt_guest_ofstophys(struct netmap_mem_d *nmd, vm_ooffset_t off) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + vm_paddr_t paddr; + /* if the offset is valid, just return csb->base_addr + off */ + paddr = (vm_paddr_t)(ptnmd->nm_paddr + off); + ND("off %lx padr %lx", off, (unsigned long)paddr); + return paddr; +} + +static int +netmap_mem_pt_guest_config(struct netmap_mem_d *nmd) +{ + /* nothing to do, we are configured on creation + * and configuration never changes thereafter + */ + return 0; +} + +static int +netmap_mem_pt_guest_finalize(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + int error = 0; + + nmd->active++; + + if (nmd->flags & NETMAP_MEM_FINALIZED) + goto out; + + if (ptnmd->ptn_dev == NULL) { + D("ptnetmap memdev not attached"); + error = ENOMEM; + goto err; + } + /* map memory through ptnetmap-memdev BAR */ + error = nm_os_pt_memdev_iomap(ptnmd->ptn_dev, &ptnmd->nm_paddr, + &ptnmd->nm_addr); + if (error) + goto err; + + /* read allcator info and create lut */ + error = netmap_mem_pt_guest_read_shared_info(nmd); + if (error) + 
goto err; + + nmd->flags |= NETMAP_MEM_FINALIZED; +out: + return 0; +err: + nmd->active--; + return error; +} + +static void +netmap_mem_pt_guest_deref(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + nmd->active--; + if (nmd->active <= 0 && + (nmd->flags & NETMAP_MEM_FINALIZED)) { + nmd->flags &= ~NETMAP_MEM_FINALIZED; + /* unmap ptnetmap-memdev memory */ + if (ptnmd->ptn_dev) { + nm_os_pt_memdev_iounmap(ptnmd->ptn_dev); + } + ptnmd->nm_addr = 0; + ptnmd->nm_paddr = 0; + } +} + +static ssize_t +netmap_mem_pt_guest_if_offset(struct netmap_mem_d *nmd, const void *vaddr) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + return (const char *)(vaddr) - (char *)(ptnmd->nm_addr); +} + +static void +netmap_mem_pt_guest_delete(struct netmap_mem_d *nmd) +{ + if (nmd == NULL) + return; + if (netmap_verbose) + D("deleting %p", nmd); + if (nmd->active > 0) + D("bug: deleting mem allocator with active=%d!", nmd->active); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); + NMA_LOCK_DESTROY(nmd); + free(nmd, M_DEVBUF); +} + +static struct netmap_if * +netmap_mem_pt_guest_if_new(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp = NULL; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + nifp = (struct netmap_if *)((char *)(ptnmd->nm_addr) + + ptif->nifp_offset); + NMA_UNLOCK(na->nm_mem); +out: + return nifp; +} + +static void +netmap_mem_pt_guest_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) +{ + struct mem_pt_if *ptif; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + ptif->ptctl(na->ifp, 
PTNETMAP_PTCTL_IFDELETE); +out: + NMA_UNLOCK(na->nm_mem); +} + +static int +netmap_mem_pt_guest_rings_create(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp; + int i, error = -1; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + + /* point each kring to the corresponding backend ring */ + nifp = (struct netmap_if *)((char *)ptnmd->nm_addr + ptif->nifp_offset); + for (i = 0; i <= na->num_tx_rings; i++) { + struct netmap_kring *kring = na->tx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + nifp->ring_ofs[i]); + } + for (i = 0; i <= na->num_rx_rings; i++) { + struct netmap_kring *kring = na->rx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + + nifp->ring_ofs[i + na->num_tx_rings + 1]); + } + + //error = ptif->ptctl->nm_ptctl(ifp, PTNETMAP_PTCTL_RINGSCREATE); + error = 0; +out: + NMA_UNLOCK(na->nm_mem); + + return error; +} + +static void +netmap_mem_pt_guest_rings_delete(struct netmap_adapter *na) +{ + /* TODO: remove?? 
*/ +#if 0 + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, + na->ifp); +#endif +} + +static struct netmap_mem_ops netmap_mem_pt_guest_ops = { + .nmd_get_lut = netmap_mem_pt_guest_get_lut, + .nmd_get_info = netmap_mem_pt_guest_get_info, + .nmd_ofstophys = netmap_mem_pt_guest_ofstophys, + .nmd_config = netmap_mem_pt_guest_config, + .nmd_finalize = netmap_mem_pt_guest_finalize, + .nmd_deref = netmap_mem_pt_guest_deref, + .nmd_if_offset = netmap_mem_pt_guest_if_offset, + .nmd_delete = netmap_mem_pt_guest_delete, + .nmd_if_new = netmap_mem_pt_guest_if_new, + .nmd_if_delete = netmap_mem_pt_guest_if_delete, + .nmd_rings_create = netmap_mem_pt_guest_rings_create, + .nmd_rings_delete = netmap_mem_pt_guest_rings_delete +}; + +/* Called with NMA_LOCK(&nm_mem) held. */ +static struct netmap_mem_d * +netmap_mem_pt_guest_find_hostid(nm_memid_t host_id) +{ + struct netmap_mem_d *mem = NULL; + struct netmap_mem_d *scan = netmap_last_mem_d; + + do { + /* find ptnetmap allocator through host ID */ + if (scan->ops->nmd_deref == netmap_mem_pt_guest_deref && + ((struct netmap_mem_ptg *)(scan))->nm_host_id == host_id) { + mem = scan; + break; + } + scan = scan->next; + } while (scan != netmap_last_mem_d); + + return mem; +} + +/* Called with NMA_LOCK(&nm_mem) held. 
*/ +static struct netmap_mem_d * +netmap_mem_pt_guest_create(nm_memid_t host_id) +{ + struct netmap_mem_ptg *ptnmd; + int err = 0; + + ptnmd = malloc(sizeof(struct netmap_mem_ptg), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (ptnmd == NULL) { + err = ENOMEM; + goto error; + } + + ptnmd->up.ops = &netmap_mem_pt_guest_ops; + ptnmd->nm_host_id = host_id; + ptnmd->pt_ifs = NULL; + + /* Assign new id in the guest (We have the lock) */ + err = nm_mem_assign_id_locked(&ptnmd->up); + if (err) + goto error; + + ptnmd->up.flags &= ~NETMAP_MEM_FINALIZED; + ptnmd->up.flags |= NETMAP_MEM_IO; + + NMA_LOCK_INIT(&ptnmd->up); + + return &ptnmd->up; +error: + netmap_mem_pt_guest_delete(&ptnmd->up); + return NULL; +} + +/* + * find host id in guest allocators and create guest allocator + * if it is not there + */ +static struct netmap_mem_d * +netmap_mem_pt_guest_get(nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + + NMA_LOCK(&nm_mem); + nmd = netmap_mem_pt_guest_find_hostid(host_id); + if (nmd == NULL) { + nmd = netmap_mem_pt_guest_create(host_id); + } + NMA_UNLOCK(&nm_mem); + + return nmd; +} + +/* + * The guest allocator can be created by ptnetmap_memdev (during the device + * attach) or by ptnetmap device (e1000/virtio), during the netmap_attach. + * + * The order is not important (we have different order in LINUX and FreeBSD). + * The first one, creates the device, and the second one simply attaches it. 
+ */ + +/* Called when ptnetmap_memdev is attaching, to attach a new allocator in + * the guest */ +struct netmap_mem_d * +netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + struct netmap_mem_ptg *ptnmd; + + nmd = netmap_mem_pt_guest_get(host_id); + + /* assign this device to the guest allocator */ + if (nmd) { + ptnmd = (struct netmap_mem_ptg *)nmd; + ptnmd->ptn_dev = ptn_dev; + } + + return nmd; +} + +/* Called when ptnetmap device (virtio/e1000) is attaching */ +struct netmap_mem_d * +netmap_mem_pt_guest_new(struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_d *nmd; + nm_memid_t host_id; + + if (ifp == NULL || ptctl == NULL) { + return NULL; + } + + /* Get the host id allocator. */ + host_id = ptctl(ifp, PTNETMAP_PTCTL_HOSTMEMID); + + nmd = netmap_mem_pt_guest_get(host_id); + + if (nmd) { + netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset, + ptctl); + } + + return nmd; +} + +#endif /* WITH_PTNETMAP_GUEST */ diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index ef0ff96d8e7f..7f4c5e9e9624 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -117,8 +120,11 @@ extern struct netmap_mem_d nm_mem; -void netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); +int netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); +#ifdef _WIN32 +PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd); +#endif int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_init(void); void netmap_mem_fini(void); @@ -127,6 +133,7 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *); +int netmap_mem2_get_pool_info(struct netmap_mem_d *, u_int, u_int *, u_int *); int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, @@ -157,6 +164,15 @@ void netmap_mem_put(struct netmap_mem_d *); #endif /* !NM_DEBUG_PUTGET */ +#ifdef WITH_PTNETMAP_GUEST +struct netmap_mem_d* netmap_mem_pt_guest_new(struct ifnet *, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t); +struct ptnetmap_memdev; +struct netmap_mem_d* netmap_mem_pt_guest_attach(struct ptnetmap_memdev *, uint16_t); +int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *, struct ifnet *); +#endif /* WITH_PTNETMAP_GUEST */ + #define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */ #define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */ diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c index c303952417ff..5b4f9cdf61c0 100644 --- a/sys/dev/netmap/netmap_monitor.c +++ b/sys/dev/netmap/netmap_monitor.c @@ -1,5 +1,6 
@@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,8 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" #else #error Unsupported platform @@ -151,13 +154,17 @@ netmap_monitor_rxsync(struct netmap_kring *kring, int flags) } /* nm_krings_create callbacks for monitors. - * We could use the default netmap_hw_krings_zmon, but - * we don't need the mbq. */ static int netmap_monitor_krings_create(struct netmap_adapter *na) { - return netmap_krings_create(na, 0); + int error = netmap_krings_create(na, 0); + if (error) + return error; + /* override the host rings callbacks */ + na->tx_rings[na->num_tx_rings].nm_sync = netmap_monitor_txsync; + na->rx_rings[na->num_rx_rings].nm_sync = netmap_monitor_rxsync; + return 0; } /* nm_krings_delete callback for monitors */ @@ -186,7 +193,11 @@ nm_monitor_alloc(struct netmap_kring *kring, u_int n) return 0; len = sizeof(struct netmap_kring *) * n; +#ifndef _WIN32 nm = realloc(kring->monitors, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + nm = realloc(kring->monitors, len, sizeof(struct netmap_kring *)*kring->max_monitors); +#endif if (nm == NULL) return ENOMEM; @@ -229,10 +240,10 @@ static int netmap_monitor_parent_notify(struct netmap_kring *, int); static int netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int zcopy) { - int error = 0; + int error = NM_IRQ_COMPLETED; /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); /* make sure the monitor array exists and is big enough */ error = nm_monitor_alloc(kring, kring->n_monitors + 1); if (error) @@ -242,7 +253,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int 
kring->n_monitors++; if (kring->n_monitors == 1) { /* this is the first monitor, intercept callbacks */ - D("%s: intercept callbacks on %s", mkring->name, kring->name); + ND("%s: intercept callbacks on %s", mkring->name, kring->name); kring->mon_sync = kring->nm_sync; /* zcopy monitors do not override nm_notify(), but * we save the original one regardless, so that @@ -265,7 +276,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int } out: - nm_kr_put(kring); + nm_kr_start(kring); return error; } @@ -277,7 +288,7 @@ static void netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) { /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); kring->n_monitors--; if (mkring->mon_pos != kring->n_monitors) { kring->monitors[mkring->mon_pos] = kring->monitors[kring->n_monitors]; @@ -286,18 +297,18 @@ netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) kring->monitors[kring->n_monitors] = NULL; if (kring->n_monitors == 0) { /* this was the last monitor, restore callbacks and delete monitor array */ - D("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); + ND("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); kring->nm_sync = kring->mon_sync; kring->mon_sync = NULL; if (kring->tx == NR_RX) { - D("%s: restoring notify on %s: %p", + ND("%s: restoring notify on %s: %p", mkring->name, kring->name, kring->mon_notify); kring->nm_notify = kring->mon_notify; kring->mon_notify = NULL; } nm_monitor_dealloc(kring); } - nm_kr_put(kring); + nm_kr_start(kring); } @@ -316,7 +327,7 @@ netmap_monitor_stop(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < nma_get_nrings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; u_int j; @@ -360,23 +371,32 @@ netmap_monitor_reg_common(struct netmap_adapter *na, int onoff, int zmon) for (i = 
priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_add(mkring, kring, zmon); + if (nm_kring_pending_on(mkring)) { + netmap_monitor_add(mkring, kring, zmon); + mkring->nr_mode = NKR_NETMAP_ON; + } } } } na->na_flags |= NAF_NETMAP_ON; } else { - if (pna == NULL) { - D("%s: parent left netmap mode, nothing to restore", na->name); - return 0; - } - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; for_rx_tx(t) { if (mna->flags & nm_txrx2flag(t)) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { - kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_del(mkring, kring); + if (nm_kring_pending_off(mkring)) { + mkring->nr_mode = NKR_NETMAP_OFF; + /* we cannot access the parent krings if the parent + * has left netmap mode. This is signaled by a NULL + * pna pointer + */ + if (pna) { + kring = &NMR(pna, t)[i]; + netmap_monitor_del(mkring, kring); + } + } } } } @@ -652,17 +672,27 @@ netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags) static int netmap_monitor_parent_notify(struct netmap_kring *kring, int flags) { + int (*notify)(struct netmap_kring*, int); ND(5, "%s %x", kring->name, flags); /* ?xsync callbacks have tryget called by their callers * (NIOCREGIF and poll()), but here we have to call it * by ourself */ - if (nm_kr_tryget(kring)) - goto out; - netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + if (nm_kr_tryget(kring, 0, NULL)) { + /* in all cases, just skip the sync */ + return NM_IRQ_COMPLETED; + } + if (kring->n_monitors > 0) { + netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + notify = kring->mon_notify; + } else { + /* we are no longer monitoring this ring, so both + * mon_sync and mon_notify are NULL + */ + notify = kring->nm_notify; + } nm_kr_put(kring); -out: - return kring->mon_notify(kring, flags); + return notify(kring, flags); } @@ -691,18 +721,25 @@ netmap_get_monitor_na(struct nmreq *nmr, 
struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_monitor_adapter *mna; + struct ifnet *ifp = NULL; int i, error; enum txrx t; int zcopy = (nmr->nr_flags & NR_ZCOPY_MON); char monsuff[10] = ""; if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) { + if (nmr->nr_flags & NR_ZCOPY_MON) { + /* the flag makes no sense unless you are + * creating a monitor + */ + return EINVAL; + } ND("not a monitor"); return 0; } /* this is a request for a monitor adapter */ - D("flags %x", nmr->nr_flags); + ND("flags %x", nmr->nr_flags); mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); if (mna == NULL) { @@ -716,13 +753,14 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * except other monitors. */ memcpy(&pnmr, nmr, sizeof(pnmr)); - pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX); - error = netmap_get_na(&pnmr, &pna, create); + pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX | NR_ZCOPY_MON); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { D("parent lookup failed: %d", error); + free(mna, M_DEVBUF); return error; } - D("found parent: %s", pna->name); + ND("found parent: %s", pna->name); if (!nm_netmap_on(pna)) { /* parent not in netmap mode */ @@ -829,19 +867,17 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) *na = &mna->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = mna->up.num_tx_rings; - nmr->nr_rx_rings = mna->up.num_rx_rings; - nmr->nr_tx_slots = mna->up.num_tx_desc; - nmr->nr_rx_slots = mna->up.num_rx_desc; - /* keep the reference to the parent */ - D("monitor ok"); + ND("monitor ok"); + + /* drop the reference to the ifp, if any */ + if (ifp) + if_rele(ifp); return 0; put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); free(mna, M_DEVBUF); return error; } diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c index 
dadc1dcbc14c..f8da672ffa53 100644 --- a/sys/dev/netmap/netmap_offloadings.c +++ b/sys/dev/netmap/netmap_offloadings.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2014-2015 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,9 +32,9 @@ #include <sys/types.h> #include <sys/errno.h> #include <sys/param.h> /* defines used in kernel.h */ -#include <sys/malloc.h> /* types used in module initialization */ #include <sys/kernel.h> /* types used in module initialization */ #include <sys/sockio.h> +#include <sys/malloc.h> #include <sys/socketvar.h> /* struct socket */ #include <sys/socket.h> /* sockaddrs */ #include <net/if.h> @@ -64,21 +65,21 @@ /* This routine is called by bdg_mismatch_datapath() when it finishes * accumulating bytes for a segment, in order to fix some fields in the * segment headers (which still contain the same content as the header - * of the original GSO packet). 'buf' points to the beginning (e.g. - * the ethernet header) of the segment, and 'len' is its length. + * of the original GSO packet). 'pkt' points to the beginning of the IP + * header of the segment, while 'len' is the length of the IP packet. */ -static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, - u_int segmented_bytes, u_int last_segment, - u_int tcp, u_int iphlen) +static void +gso_fix_segment(uint8_t *pkt, size_t len, u_int ipv4, u_int iphlen, u_int tcp, + u_int idx, u_int segmented_bytes, u_int last_segment) { - struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); - struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + struct nm_iphdr *iph = (struct nm_iphdr *)(pkt); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(pkt); uint16_t *check = NULL; uint8_t *check_data = NULL; - if (iphlen == 20) { + if (ipv4) { /* Set the IPv4 "Total Length" field. 
*/ - iph->tot_len = htobe16(len-14); + iph->tot_len = htobe16(len); ND("ip total length %u", be16toh(ip->tot_len)); /* Set the IPv4 "Identification" field. */ @@ -87,15 +88,15 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert the IPv4 header checksum. */ iph->check = 0; - iph->check = nm_csum_ipv4(iph); + iph->check = nm_os_csum_ipv4(iph); ND("IP csum %x", be16toh(iph->check)); - } else {/* if (iphlen == 40) */ + } else { /* Set the IPv6 "Payload Len" field. */ - ip6h->payload_len = htobe16(len-14-iphlen); + ip6h->payload_len = htobe16(len-iphlen); } if (tcp) { - struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(pkt + iphlen); /* Set the TCP sequence number. */ tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); @@ -110,10 +111,10 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, check = &tcph->check; check_data = (uint8_t *)tcph; } else { /* UDP */ - struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + struct nm_udphdr *udph = (struct nm_udphdr *)(pkt + iphlen); /* Set the UDP 'Length' field. */ - udph->len = htobe16(len-14-iphlen); + udph->len = htobe16(len-iphlen); check = &udph->check; check_data = (uint8_t *)udph; @@ -121,48 +122,80 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert TCP/UDP checksum. 
*/ *check = 0; - if (iphlen == 20) - nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + if (ipv4) + nm_os_csum_tcpudp_ipv4(iph, check_data, len-iphlen, check); else - nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + nm_os_csum_tcpudp_ipv6(ip6h, check_data, len-iphlen, check); ND("TCP/UDP csum %x", be16toh(*check)); } +static int +vnet_hdr_is_bad(struct nm_vnet_hdr *vh) +{ + uint8_t gso_type = vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + + return ( + (gso_type != VIRTIO_NET_HDR_GSO_NONE && + gso_type != VIRTIO_NET_HDR_GSO_TCPV4 && + gso_type != VIRTIO_NET_HDR_GSO_UDP && + gso_type != VIRTIO_NET_HDR_GSO_TCPV6) + || + (vh->flags & ~(VIRTIO_NET_HDR_F_NEEDS_CSUM + | VIRTIO_NET_HDR_F_DATA_VALID)) + ); +} /* The VALE mismatch datapath implementation. */ -void bdg_mismatch_datapath(struct netmap_vp_adapter *na, - struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, - u_int *j, u_int lim, u_int *howmany) +void +bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, + u_int *j, u_int lim, u_int *howmany) { - struct netmap_slot *slot = NULL; + struct netmap_slot *dst_slot = NULL; struct nm_vnet_hdr *vh = NULL; - /* Number of source slots to process. */ - u_int frags = ft_p->ft_frags; - struct nm_bdg_fwd *ft_end = ft_p + frags; + const struct nm_bdg_fwd *ft_end = ft_p + ft_p->ft_frags; /* Source and destination pointers. */ uint8_t *dst, *src; size_t src_len, dst_len; + /* Indices and counters for the destination ring. */ u_int j_start = *j; + u_int j_cur = j_start; u_int dst_slots = 0; - /* If the source port uses the offloadings, while destination doesn't, - * we grab the source virtio-net header and do the offloadings here. 
- */ - if (na->virt_hdr_len && !dst_na->virt_hdr_len) { - vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + if (unlikely(ft_p == ft_end)) { + RD(3, "No source slots to process"); + return; } /* Init source and dest pointers. */ src = ft_p->ft_buf; src_len = ft_p->ft_len; - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); dst_len = src_len; + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->up.virt_hdr_len && !dst_na->up.virt_hdr_len) { + vh = (struct nm_vnet_hdr *)src; + /* Initial sanity check on the source virtio-net header. If + * something seems wrong, just drop the packet. */ + if (src_len < na->up.virt_hdr_len) { + RD(3, "Short src vnet header, dropping"); + return; + } + if (vnet_hdr_is_bad(vh)) { + RD(3, "Bad src vnet header, dropping"); + return; + } + } + /* We are processing the first input slot and there is a mismatch * between source and destination virt_hdr_len (SHL and DHL). * When the a client is using virtio-net headers, the header length @@ -185,14 +218,14 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, * 12 | 0 | doesn't exist * 12 | 10 | copied from the first 10 bytes of source header */ - bzero(dst, dst_na->virt_hdr_len); - if (na->virt_hdr_len && dst_na->virt_hdr_len) + bzero(dst, dst_na->up.virt_hdr_len); + if (na->up.virt_hdr_len && dst_na->up.virt_hdr_len) memcpy(dst, src, sizeof(struct nm_vnet_hdr)); /* Skip the virtio-net headers. */ - src += na->virt_hdr_len; - src_len -= na->virt_hdr_len; - dst += dst_na->virt_hdr_len; - dst_len = dst_na->virt_hdr_len + src_len; + src += na->up.virt_hdr_len; + src_len -= na->up.virt_hdr_len; + dst += dst_na->up.virt_hdr_len; + dst_len = dst_na->up.virt_hdr_len + src_len; /* Here it could be dst_len == 0 (which implies src_len == 0), * so we avoid passing a zero length fragment. 
@@ -214,16 +247,27 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, u_int gso_idx = 0; /* Payload data bytes segmented so far (e.g. TCP data bytes). */ u_int segmented_bytes = 0; + /* Is this an IPv4 or IPv6 GSO packet? */ + u_int ipv4 = 0; /* Length of the IP header (20 if IPv4, 40 if IPv6). */ u_int iphlen = 0; + /* Length of the Ethernet header (18 if 802.1q, otherwise 14). */ + u_int ethhlen = 14; /* Is this a TCP or an UDP GSO packet? */ u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; /* Segment the GSO packet contained into the input slots (frags). */ - while (ft_p != ft_end) { + for (;;) { size_t copy; + if (dst_slots >= *howmany) { + /* We still have work to do, but we've run out of + * dst slots, so we have to drop the packet. */ + RD(3, "Not enough slots, dropping GSO packet"); + return; + } + /* Grab the GSO header if we don't have it. */ if (!gso_hdr) { uint16_t ethertype; @@ -231,28 +275,75 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, gso_hdr = src; /* Look at the 'Ethertype' field to see if this packet - * is IPv4 or IPv6. - */ - ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); - if (ethertype == 0x0800) - iphlen = 20; - else /* if (ethertype == 0x86DD) */ - iphlen = 40; + * is IPv4 or IPv6, taking into account VLAN + * encapsulation. 
*/ + for (;;) { + if (src_len < ethhlen) { + RD(3, "Short GSO fragment [eth], dropping"); + return; + } + ethertype = be16toh(*((uint16_t *) + (gso_hdr + ethhlen - 2))); + if (ethertype != 0x8100) /* not 802.1q */ + break; + ethhlen += 4; + } + switch (ethertype) { + case 0x0800: /* IPv4 */ + { + struct nm_iphdr *iph = (struct nm_iphdr *) + (gso_hdr + ethhlen); + + if (src_len < ethhlen + 20) { + RD(3, "Short GSO fragment " + "[IPv4], dropping"); + return; + } + ipv4 = 1; + iphlen = 4 * (iph->version_ihl & 0x0F); + break; + } + case 0x86DD: /* IPv6 */ + ipv4 = 0; + iphlen = 40; + break; + default: + RD(3, "Unsupported ethertype, " + "dropping GSO packet"); + return; + } ND(3, "type=%04x", ethertype); + if (src_len < ethhlen + iphlen) { + RD(3, "Short GSO fragment [IP], dropping"); + return; + } + /* Compute gso_hdr_len. For TCP we need to read the * content of the 'Data Offset' field. */ if (tcp) { - struct nm_tcphdr *tcph = - (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + struct nm_tcphdr *tcph = (struct nm_tcphdr *) + (gso_hdr + ethhlen + iphlen); - gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); - } else - gso_hdr_len = 14 + iphlen + 8; /* UDP */ + if (src_len < ethhlen + iphlen + 20) { + RD(3, "Short GSO fragment " + "[TCP], dropping"); + return; + } + gso_hdr_len = ethhlen + iphlen + + 4 * (tcph->doff >> 4); + } else { + gso_hdr_len = ethhlen + iphlen + 8; /* UDP */ + } + + if (src_len < gso_hdr_len) { + RD(3, "Short GSO fragment [TCP/UDP], dropping"); + return; + } ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, - dst_na->mfs); + dst_na->mfs); /* Advance source pointers. */ src += gso_hdr_len; @@ -263,7 +354,6 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, break; src = ft_p->ft_buf; src_len = ft_p->ft_len; - continue; } } @@ -289,25 +379,24 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* After raw segmentation, we must fix some header * fields and compute checksums, in a protocol dependent * way. 
*/ - gso_fix_segment(dst, gso_bytes, gso_idx, - segmented_bytes, - src_len == 0 && ft_p + 1 == ft_end, - tcp, iphlen); + gso_fix_segment(dst + ethhlen, gso_bytes - ethhlen, + ipv4, iphlen, tcp, + gso_idx, segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end); ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); - slot->len = gso_bytes; - slot->flags = 0; - segmented_bytes += gso_bytes - gso_hdr_len; - + dst_slot->len = gso_bytes; + dst_slot->flags = 0; dst_slots++; - - /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + segmented_bytes += gso_bytes - gso_hdr_len; gso_bytes = 0; gso_idx++; + + /* Next destination slot. */ + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); } /* Next input slot. */ @@ -342,10 +431,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* Init/update the packet checksum if needed. */ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { if (!dst_slots) - csum = nm_csum_raw(src + vh->csum_start, + csum = nm_os_csum_raw(src + vh->csum_start, src_len - vh->csum_start, 0); else - csum = nm_csum_raw(src, src_len, csum); + csum = nm_os_csum_raw(src, src_len, csum); } /* Round to a multiple of 64 */ @@ -359,44 +448,43 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, } else { memcpy(dst, src, (int)src_len); } - slot->len = dst_len; - + dst_slot->len = dst_len; dst_slots++; /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); /* Next source slot. */ ft_p++; src = ft_p->ft_buf; dst_len = src_len = ft_p->ft_len; - } /* Finalize (fold) the checksum if needed. 
*/ if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { - *check = nm_csum_fold(csum); + *check = nm_os_csum_fold(csum); } ND(3, "using %u dst_slots", dst_slots); - /* A second pass on the desitations slots to set the slot flags, + /* A second pass on the destination slots to set the slot flags, * using the right number of destination slots. */ - while (j_start != *j) { - slot = &ring->slot[j_start]; - slot->flags = (dst_slots << 8)| NS_MOREFRAG; + while (j_start != j_cur) { + dst_slot = &dst_ring->slot[j_start]; + dst_slot->flags = (dst_slots << 8)| NS_MOREFRAG; j_start = nm_next(j_start, lim); } /* Clear NS_MOREFRAG flag on last entry. */ - slot->flags = (dst_slots << 8); + dst_slot->flags = (dst_slots << 8); } - /* Update howmany. */ + /* Update howmany and j. This is to commit the use of + * those slots in the destination ring. */ if (unlikely(dst_slots > *howmany)) { - dst_slots = *howmany; - D("Slot allocation error: Should never happen"); + D("Slot allocation error: This is a bug"); } + *j = j_cur; *howmany -= dst_slots; } diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c index 67e840248c88..f0f1b524300a 100644 --- a/sys/dev/netmap/netmap_pipe.c +++ b/sys/dev/netmap/netmap_pipe.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -54,6 +55,9 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -72,9 +76,11 @@ #define NM_PIPE_MAXSLOTS 4096 -int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +static int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +SYSBEGIN(vars_pipes); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); +SYSEND; /* allocate the pipe array in the parent adapter */ static int @@ -91,7 +97,11 @@ nm_pipe_alloc(struct netmap_adapter *na, u_int npipes) return EINVAL; len = sizeof(struct netmap_pipe_adapter *) * npipes; +#ifndef _WIN32 npa = realloc(na->na_pipes, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + npa = realloc(na->na_pipes, len, sizeof(struct netmap_pipe_adapter *)*na->na_max_pipes); +#endif if (npa == NULL) return ENOMEM; @@ -199,7 +209,7 @@ netmap_pipe_txsync(struct netmap_kring *txkring, int flags) } while (limit-- > 0) { - struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *rs = &rxkring->ring->slot[j]; struct netmap_slot *ts = &txkring->ring->slot[k]; struct netmap_slot tmp; @@ -295,7 +305,7 @@ netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags) * usr1 --> e1 --> e2 * * and we are e2. e1 is certainly registered and our - * krings already exist, but they may be hidden. + * krings already exist. Nothing to do. */ static int netmap_pipe_krings_create(struct netmap_adapter *na) @@ -310,65 +320,28 @@ netmap_pipe_krings_create(struct netmap_adapter *na) int i; /* case 1) above */ - ND("%p: case 1, create everything", na); + D("%p: case 1, create both ends", na); error = netmap_krings_create(na, 0); if (error) goto err; - /* we also create all the rings, since we need to - * update the save_ring pointers. 
- * netmap_mem_rings_create (called by our caller) - * will not create the rings again - */ - - error = netmap_mem_rings_create(na); - if (error) - goto del_krings1; - - /* update our hidden ring pointers */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].save_ring = NMR(na, t)[i].ring; - } - - /* now, create krings and rings of the other end */ + /* create the krings of the other end */ error = netmap_krings_create(ona, 0); if (error) - goto del_rings1; - - error = netmap_mem_rings_create(ona); - if (error) - goto del_krings2; - - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].save_ring = NMR(ona, t)[i].ring; - } + goto del_krings1; /* cross link the krings */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ for (i = 0; i < nma_get_nrings(na, t); i++) { NMR(na, t)[i].pipe = NMR(&pna->peer->up, r) + i; NMR(&pna->peer->up, r)[i].pipe = NMR(na, t) + i; } } - } else { - int i; - /* case 2) above */ - /* recover the hidden rings */ - ND("%p: case 2, hidden rings", na); - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].ring = NMR(na, t)[i].save_ring; - } + } return 0; -del_krings2: - netmap_krings_delete(ona); -del_rings1: - netmap_mem_rings_delete(na); del_krings1: netmap_krings_delete(na); err: @@ -383,7 +356,8 @@ err: * * usr1 --> e1 --> e2 * - * and we are e1. Nothing special to do. + * and we are e1. Create the needed rings of the + * other end. 
* * 1.b) state is * @@ -412,14 +386,65 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) { struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int i, error = 0; enum txrx t; ND("%p: onoff %d", na, onoff); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + /* mark the partner ring as needed */ + kring->pipe->nr_kflags |= NKR_NEEDRING; + } + } + } + + /* create all missing needed rings on the other end */ + error = netmap_mem_rings_create(ona); + if (error) + return error; + + /* In case of no error we put our rings in netmap mode */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + kring->nr_mode = NKR_NETMAP_ON; + } + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + /* mark the peer ring as no longer needed by us + * (it may still be kept if sombody else is using it) + */ + kring->pipe->nr_kflags &= ~NKR_NEEDRING; + } + } + } + /* delete all the peer rings that are no longer needed */ + netmap_mem_rings_delete(ona); + } + + if (na->active_fds) { + D("active_fds %d", na->active_fds); + return 0; } + if (pna->peer_ref) { ND("%p: case 1.a or 2.a, nothing to do", na); return 0; @@ -429,18 +454,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) pna->peer->peer_ref = 0; netmap_adapter_put(na); } else { - int i; ND("%p: case 2.b, grab peer", na); netmap_adapter_get(na); pna->peer->peer_ref = 1; - /* hide our rings from 
netmap_mem_rings_delete */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { - NMR(na, t)[i].ring = NULL; - } - } } - return 0; + return error; } /* netmap_pipe_krings_delete. @@ -470,8 +488,6 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; struct netmap_adapter *ona; /* na of the other end */ - int i; - enum txrx t; if (!pna->peer_ref) { ND("%p: case 2, kept alive by peer", na); @@ -480,18 +496,12 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) /* case 1) above */ ND("%p: case 1, deleting everyhing", na); netmap_krings_delete(na); /* also zeroes tx_rings etc. */ - /* restore the ring to be deleted on the peer */ ona = &pna->peer->up; if (ona->tx_rings == NULL) { /* already deleted, we must be on an * cleanup-after-error path */ return; } - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].ring = NMR(ona, t)[i].save_ring; - } - netmap_mem_rings_delete(ona); netmap_krings_delete(ona); } @@ -519,6 +529,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp = NULL; u_int pipe_id; int role = nmr->nr_flags & NR_REG_MASK; int error; @@ -536,7 +547,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); /* pass to parent the requested number of pipes */ pnmr.nr_arg1 = nmr->nr_arg1; - error = netmap_get_na(&pnmr, &pna, create); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { ND("parent lookup failed: %d", error); return error; @@ -652,16 +663,15 @@ found: *na = &req->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = req->up.num_tx_rings; - nmr->nr_rx_rings = req->up.num_rx_rings; - nmr->nr_tx_slots = req->up.num_tx_desc; - nmr->nr_rx_slots = 
req->up.num_rx_desc; - /* keep the reference to the parent. * It will be released by the req destructor */ + /* drop the ifp reference, if any */ + if (ifp) { + if_rele(ifp); + } + return 0; free_sna: @@ -671,7 +681,7 @@ unregister_mna: free_mna: free(mna, M_DEVBUF); put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); return error; } diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index ddd7334a8378..2d2c807681d2 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,9 @@ __FBSDID("$FreeBSD$"); #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -119,7 +123,7 @@ __FBSDID("$FreeBSD$"); /* * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_NAME prefix for switch port names, default "vale" * NM_BDG_MAXPORTS number of ports * NM_BRIDGES max number of switches in the system. * XXX should become a sysctl or tunable @@ -144,7 +148,6 @@ __FBSDID("$FreeBSD$"); #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) /* NM_FT_NULL terminates a list of slots in the ft */ #define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ /* @@ -152,14 +155,15 @@ __FBSDID("$FreeBSD$"); * used in the bridge. The actual value may be larger as the * last packet in the block may overflow the size. 
*/ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSBEGIN(vars_vale); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); - +SYSEND; static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **); static int netmap_vp_reg(struct netmap_adapter *na, int onoff); -static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +static int netmap_bwrap_reg(struct netmap_adapter *, int onoff); /* * For each output interface, nm_bdg_q is used to construct a list. @@ -213,7 +217,7 @@ struct nm_bridge { * forward this packet. ring_nr is the source ring index, and the * function may overwrite this value to forward this packet to a * different ring index. - * This function must be set by netmap_bdgctl(). + * This function must be set by netmap_bdg_ctl(). */ struct netmap_bdg_ops bdg_ops; @@ -244,7 +248,7 @@ netmap_bdg_name(struct netmap_vp_adapter *vp) * Right now we have a static array and deletions are protected * by an exclusive lock. */ -struct nm_bridge *nm_bridges; +static struct nm_bridge *nm_bridges; #endif /* !CONFIG_NET_NS */ @@ -278,6 +282,45 @@ pkt_copy(void *_src, void *_dst, int l) } +static int +nm_is_id_char(const char c) +{ + return (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + (c == '_'); +} + +/* Validate the name of a VALE bridge port and return the + * position of the ":" character. */ +static int +nm_vale_name_validate(const char *name) +{ + int colon_pos = -1; + int i; + + if (!name || strlen(name) < strlen(NM_BDG_NAME)) { + return -1; + } + + for (i = 0; name[i]; i++) { + if (name[i] == ':') { + if (colon_pos != -1) { + return -1; + } + colon_pos = i; + } else if (!nm_is_id_char(name[i])) { + return -1; + } + } + + if (i >= IFNAMSIZ) { + return -1; + } + + return colon_pos; +} + /* * locate a bridge among the existing ones. 
* MUST BE CALLED WITH NMG_LOCK() @@ -288,7 +331,7 @@ pkt_copy(void *_src, void *_dst, int l) static struct nm_bridge * nm_find_bridge(const char *name, int create) { - int i, l, namelen; + int i, namelen; struct nm_bridge *b = NULL, *bridges; u_int num_bridges; @@ -296,21 +339,11 @@ nm_find_bridge(const char *name, int create) netmap_bns_getbridges(&bridges, &num_bridges); - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { + namelen = nm_vale_name_validate(name); + if (namelen < 0) { D("invalid bridge name %s", name ? name : NULL); return NULL; } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; - } - } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); /* lookup the name, remember empty slot if there is one */ for (i = 0; i < num_bridges; i++) { @@ -479,6 +512,7 @@ netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; struct nm_bridge *b = vpna->na_bdg; + (void)nmr; // XXX merge ? 
if (attach) return 0; /* nothing to do */ if (b) { @@ -518,7 +552,7 @@ nm_vi_destroy(const char *name) return ENXIO; NMG_LOCK(); /* make sure this is actually a VALE port */ - if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { + if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { error = EINVAL; goto err; } @@ -535,7 +569,7 @@ nm_vi_destroy(const char *name) */ if_rele(ifp); netmap_detach(ifp); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return 0; err: @@ -556,14 +590,14 @@ nm_vi_create(struct nmreq *nmr) int error; /* don't include VALE prefix */ - if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME))) + if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) return EINVAL; ifp = ifunit_ref(nmr->nr_name); if (ifp) { /* already exist, cannot create new one */ if_rele(ifp); return EEXIST; } - error = nm_vi_persist(nmr->nr_name, &ifp); + error = nm_os_vi_persist(nmr->nr_name, &ifp); if (error) return error; @@ -572,12 +606,13 @@ nm_vi_create(struct nmreq *nmr) error = netmap_vp_create(nmr, ifp, &vpna); if (error) { D("error %d", error); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return error; } /* persist-specific routines */ vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; netmap_adapter_get(&vpna->up); + NM_ATTACH_NA(ifp, &vpna->up); NMG_UNLOCK(); D("created %s", ifp->if_xname); return 0; @@ -608,7 +643,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) /* first try to see if this is a bridge port. 
*/ NMG_LOCK_ASSERT(); - if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) { + if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) { return 0; /* no error, but no VALE prefix */ } @@ -693,7 +728,6 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) goto out; vpna = hw->na_vp; hostna = hw->na_hostvp; - if_rele(ifp); if (nmr->nr_arg1 != NETMAP_BDG_HOST) hostna = NULL; } @@ -768,6 +802,11 @@ unlock_exit: return error; } +static inline int +nm_is_bwrap(struct netmap_adapter *na) +{ + return na->nm_register == netmap_bwrap_reg; +} /* process NETMAP_BDG_DETACH */ static int @@ -785,8 +824,13 @@ nm_bdg_ctl_detach(struct nmreq *nmr) if (na == NULL) { /* VALE prefix missing */ error = EINVAL; goto unlock_exit; + } else if (nm_is_bwrap(na) && + ((struct netmap_bwrap_adapter *)na)->na_polling_state) { + /* Don't detach a NIC with polling */ + error = EBUSY; + netmap_adapter_put(na); + goto unlock_exit; } - if (na->nm_bdg_ctl) { /* remove the port from bridge. The bwrap * also needs to put the hwna in normal mode @@ -801,6 +845,267 @@ unlock_exit: } +struct nm_bdg_polling_state; +struct +nm_bdg_kthread { + struct nm_kthread *nmk; + u_int qfirst; + u_int qlast; + struct nm_bdg_polling_state *bps; +}; + +struct nm_bdg_polling_state { + bool configured; + bool stopped; + struct netmap_bwrap_adapter *bna; + u_int reg; + u_int qfirst; + u_int qlast; + u_int cpu_from; + u_int ncpus; + struct nm_bdg_kthread *kthreads; +}; + +static void +netmap_bwrap_polling(void *data) +{ + struct nm_bdg_kthread *nbk = data; + struct netmap_bwrap_adapter *bna; + u_int qfirst, qlast, i; + struct netmap_kring *kring0, *kring; + + if (!nbk) + return; + qfirst = nbk->qfirst; + qlast = nbk->qlast; + bna = nbk->bps->bna; + kring0 = NMR(bna->hwna, NR_RX); + + for (i = qfirst; i < qlast; i++) { + kring = kring0 + i; + kring->nm_notify(kring, 0); + } +} + +static int +nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps) +{ + struct nm_kthread_cfg kcfg; + int i, j; + + 
bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus, + M_DEVBUF, M_NOWAIT | M_ZERO); + if (bps->kthreads == NULL) + return ENOMEM; + + bzero(&kcfg, sizeof(kcfg)); + kcfg.worker_fn = netmap_bwrap_polling; + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC); + int affinity = bps->cpu_from + i; + + t->bps = bps; + t->qfirst = all ? bps->qfirst /* must be 0 */: affinity; + t->qlast = all ? bps->qlast : t->qfirst + 1; + D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst, + t->qlast); + + kcfg.type = i; + kcfg.worker_private = t; + t->nmk = nm_os_kthread_create(&kcfg); + if (t->nmk == NULL) { + goto cleanup; + } + nm_os_kthread_set_affinity(t->nmk, affinity); + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_delete(t->nmk); + } + free(bps->kthreads, M_DEVBUF); + return EFAULT; +} + +/* a version of ptnetmap_start_kthreads() */ +static int +nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps) +{ + int error, i, j; + + if (!bps) { + D("polling is not configured"); + return EFAULT; + } + bps->stopped = false; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + error = nm_os_kthread_start(t->nmk); + if (error) { + D("error in nm_kthread_start()"); + goto cleanup; + } + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + } + bps->stopped = true; + return error; +} + +static void +nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps) +{ + int i; + + if (!bps) + return; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + nm_os_kthread_delete(t->nmk); + } + bps->stopped = true; +} + +static int +get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na, + struct nm_bdg_polling_state *bps) 
+{ + int req_cpus, avail_cpus, core_from; + u_int reg, i, qfirst, qlast; + + avail_cpus = nm_os_ncpus(); + req_cpus = nmr->nr_arg1; + + if (req_cpus == 0) { + D("req_cpus must be > 0"); + return EINVAL; + } else if (req_cpus >= avail_cpus) { + D("for safety, we need at least one core left in the system"); + return EINVAL; + } + reg = nmr->nr_flags & NR_REG_MASK; + i = nmr->nr_ringid & NETMAP_RING_MASK; + /* + * ONE_NIC: dedicate one core to one ring. If multiple cores + * are specified, consecutive rings are also polled. + * For example, if ringid=2 and 2 cores are given, + * ring 2 and 3 are polled by core 2 and 3, respectively. + * ALL_NIC: poll all the rings using a core specified by ringid. + * the number of cores must be 1. + */ + if (reg == NR_REG_ONE_NIC) { + if (i + req_cpus > nma_get_nrings(na, NR_RX)) { + D("only %d rings exist (ring %u-%u is given)", + nma_get_nrings(na, NR_RX), i, i+req_cpus); + return EINVAL; + } + qfirst = i; + qlast = qfirst + req_cpus; + core_from = qfirst; + } else if (reg == NR_REG_ALL_NIC) { + if (req_cpus != 1) { + D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus); + return EINVAL; + } + qfirst = 0; + qlast = nma_get_nrings(na, NR_RX); + core_from = i; + } else { + D("reg must be ALL_NIC or ONE_NIC"); + return EINVAL; + } + + bps->reg = reg; + bps->qfirst = qfirst; + bps->qlast = qlast; + bps->cpu_from = core_from; + bps->ncpus = req_cpus; + D("%s qfirst %u qlast %u cpu_from %u ncpus %u", + reg == NR_REG_ALL_NIC ? 
"REG_ALL_NIC" : "REG_ONE_NIC", + qfirst, qlast, core_from, req_cpus); + return 0; +} + +static int +nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct nm_bdg_polling_state *bps; + struct netmap_bwrap_adapter *bna; + int error; + + bna = (struct netmap_bwrap_adapter *)na; + if (bna->na_polling_state) { + D("ERROR adapter already in polling mode"); + return EFAULT; + } + + bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!bps) + return ENOMEM; + bps->configured = false; + bps->stopped = true; + + if (get_polling_cfg(nmr, na, bps)) { + free(bps, M_DEVBUF); + return EINVAL; + } + + if (nm_bdg_create_kthreads(bps)) { + free(bps, M_DEVBUF); + return EFAULT; + } + + bps->configured = true; + bna->na_polling_state = bps; + bps->bna = bna; + + /* disable interrupt if possible */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 0); + /* start kthread now */ + error = nm_bdg_polling_start_kthreads(bps); + if (error) { + D("ERROR nm_bdg_polling_start_kthread()"); + free(bps->kthreads, M_DEVBUF); + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + } + return error; +} + +static int +nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; + struct nm_bdg_polling_state *bps; + + if (!bna->na_polling_state) { + D("ERROR adapter is not in polling mode"); + return EFAULT; + } + bps = bna->na_polling_state; + nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); + bps->configured = false; + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + /* reenable interrupt */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + return 0; +} /* Called by either user's context (netmap_ioctl()) * or external kernel modules (e.g., Openvswitch). 
@@ -843,7 +1148,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) case NETMAP_BDG_LIST: /* this is used to enumerate bridges and ports */ if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) { error = EINVAL; break; } @@ -855,7 +1160,9 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) break; } - error = ENOENT; + error = 0; + nmr->nr_arg1 = b - bridges; /* bridge index */ + nmr->nr_arg2 = NM_BDG_NOPORT; for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; @@ -867,10 +1174,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) * virtual port and a NIC, respectively */ if (!strcmp(vpna->up.name, name)) { - /* bridge index */ - nmr->nr_arg1 = b - bridges; nmr->nr_arg2 = i; /* port index */ - error = 0; break; } } @@ -937,10 +1241,34 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; - vpna->virt_hdr_len = nmr->nr_arg1; - if (vpna->virt_hdr_len) + na->virt_hdr_len = nmr->nr_arg1; + if (na->virt_hdr_len) { vpna->mfs = NETMAP_BUF_SIZE(na); - D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + } + D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na); + netmap_adapter_put(na); + } else if (!na) { + error = ENXIO; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_POLLING_ON: + case NETMAP_BDG_POLLING_OFF: + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + if (!nm_is_bwrap(na)) { + error = EOPNOTSUPP; + } else if (cmd == NETMAP_BDG_POLLING_ON) { + error = nm_bdg_ctl_polling_start(nmr, na); + if (!error) + netmap_adapter_get(na); + } else { + error = nm_bdg_ctl_polling_stop(nmr, na); + if (!error) + netmap_adapter_put(na); + } netmap_adapter_put(na); } NMG_UNLOCK(); @@ -1097,10 +1425,12 @@ nm_bdg_preflush(struct 
netmap_kring *kring, u_int end) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); } if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; + /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we + * have to fix frags count. */ + frags--; + ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags; + D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags); } if (ft_i) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); @@ -1157,6 +1487,8 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + enum txrx t; + int i; /* persistent ports may be put in netmap mode * before being attached to a bridge @@ -1164,12 +1496,30 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) if (vpna->na_bdg) BDG_WLOCK(vpna->na_bdg); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) + kring->nr_mode = NKR_NETMAP_ON; + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; /* XXX on FreeBSD, persistent VALE ports should also * toggle IFCAP_NETMAP in na->ifp (2014-03-16) */ } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) + kring->nr_mode = NKR_NETMAP_OFF; + } + } } if (vpna->na_bdg) BDG_WUNLOCK(vpna->na_bdg); @@ -1193,13 +1543,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, uint32_t sh, dh; u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; + uint8_t indbuf[12]; /* safety check, unfortunately we have many cases */ - if (buf_len >= 14 + na->virt_hdr_len) { + if (buf_len 
>= 14 + na->up.virt_hdr_len) { /* virthdr + mac_hdr in the same slot */ - buf += na->virt_hdr_len; - buf_len -= na->virt_hdr_len; - } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { + buf += na->up.virt_hdr_len; + buf_len -= na->up.virt_hdr_len; + } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { /* only header in first fragment */ ft++; buf = ft->ft_buf; @@ -1208,6 +1559,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, RD(5, "invalid buf format, length %d", buf_len); return NM_BDG_NOPORT; } + + if (ft->ft_flags & NS_INDIRECT) { + if (copyin(buf, indbuf, sizeof(indbuf))) { + return NM_BDG_NOPORT; + } + buf = indbuf; + } + dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; smac = le64toh(*(uint64_t *)(buf + 4)); smac >>= 16; @@ -1321,7 +1680,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, struct nm_bdg_q *dst_ents, *brddst; uint16_t num_dsts = 0, *dsts; struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; + u_int i, me = na->bdg_port; /* * The work area (pointed by ft) is followed by an array of @@ -1341,7 +1700,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, ND("slot %d frags %d", i, ft[i].ft_frags); /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ - if (unlikely(na->virt_hdr_len > ft[i].ft_len)) + if (unlikely(na->up.virt_hdr_len > ft[i].ft_len)) continue; dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); if (netmap_verbose > 255) @@ -1382,6 +1741,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; if (brddst->bq_head != NM_FT_NULL) { + u_int j; for (j = 0; likely(j < b->bdg_active_ports); j++) { uint16_t d_i; i = b->bdg_port_index[j]; @@ -1441,8 +1801,9 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ needed = d->bq_len + brddst->bq_len; - if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { - RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len); + if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { + RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, + dst_na->up.virt_hdr_len); /* There is a virtio-net header/offloadings mismatch between * source and destination. The slower mismatch datapath will * be used to cope with all the mismatches. @@ -1803,7 +2164,6 @@ netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter nm_bound_var(&nmr->nr_arg3, 0, 0, 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; - vpna->virt_hdr_len = 0; vpna->mfs = 1514; vpna->last_smac = ~0llu; /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? @@ -1880,19 +2240,17 @@ netmap_bwrap_dtor(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } ND("na %p", na); - /* drop reference to hwna->ifp. 
- * If we don't do this, netmap_detach_common(na) - * will think it has set NA(na->ifp) to NULL - */ na->ifp = NULL; - /* for safety, also drop the possible reference - * in the hostna - */ bna->host.up.ifp = NULL; - - hwna->nm_mem = bna->save_nmd; hwna->na_private = NULL; hwna->na_vp = hwna->na_hostvp = NULL; hwna->na_flags &= ~NAF_BUSY; @@ -1916,7 +2274,8 @@ netmap_bwrap_dtor(struct netmap_adapter *na) * (part as a receive ring, part as a transmit ring). * * callback that overwrites the hwna notify callback. - * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * Packets come from the outside or from the host stack and are put on an + * hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ static int @@ -1927,19 +2286,18 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) struct netmap_kring *bkring; struct netmap_vp_adapter *vpna = &bna->up; u_int ring_nr = kring->ring_id; - int error = 0; + int ret = NM_IRQ_COMPLETED; + int error; if (netmap_verbose) D("%s %s 0x%x", na->name, kring->name, flags); - if (!nm_netmap_on(na)) - return 0; - bkring = &vpna->up.tx_rings[ring_nr]; /* make sure the ring is not disabled */ - if (nm_kr_tryget(kring)) - return 0; + if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { + return EIO; + } if (netmap_verbose) D("%s head %d cur %d tail %d", na->name, @@ -1951,9 +2309,10 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) error = kring->nm_sync(kring, 0); if (error) goto put_out; - if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - na->name); + if (kring->nr_hwcur == kring->nr_hwtail) { + if (netmap_verbose) + D("how strange, interrupt with no packets on %s", + na->name); goto put_out; } @@ -1970,28 +2329,32 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) /* another call to actually release the buffers */ error = kring->nm_sync(kring, 0); + /* The second 
rxsync may have further advanced hwtail. If this happens, + * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ + if (kring->rcur != kring->nr_hwtail) { + ret = NM_IRQ_RESCHED; + } put_out: nm_kr_put(kring); - return error; + + return error ? error : ret; } /* nm_register callback for bwrap */ static int -netmap_bwrap_register(struct netmap_adapter *na, int onoff) +netmap_bwrap_reg(struct netmap_adapter *na, int onoff) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; struct netmap_vp_adapter *hostna = &bna->host; - int error; + int error, i; enum txrx t; ND("%s %s", na->name, onoff ? "on" : "off"); if (onoff) { - int i; - /* netmap_do_regif has been called on the bwrap na. * We need to pass the information about the * memory allocator down to the hwna before @@ -2010,16 +2373,32 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* cross-link the netmap rings * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. - * We need to do this now, after the initialization - * of the kring->ring pointers */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ - for (i = 0; i < nma_get_nrings(na, r) + 1; i++) { - NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots; - NMR(hwna, t)[i].ring = NMR(na, r)[i].ring; + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(hwna, r)[i].ring = NMR(na, t)[i].ring; } } + + if (na->na_flags & NAF_HOST_RINGS) { + struct netmap_adapter *hna = &hostna->up; + /* the hostna rings are the host rings of the bwrap. 
+ * The corresponding krings must point back to the + * hostna + */ + hna->tx_rings = &na->tx_rings[na->num_tx_rings]; + hna->tx_rings[0].na = hna; + hna->rx_rings = &na->rx_rings[na->num_rx_rings]; + hna->rx_rings[0].na = hna; + } + } + + /* pass down the pending ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(hwna, t)[i].nr_pending_mode = + NMR(na, t)[i].nr_pending_mode; } /* forward the request to the hwna */ @@ -2027,6 +2406,13 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) if (error) return error; + /* copy up the current ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(na, t)[i].nr_mode = + NMR(hwna, t)[i].nr_mode; + } + /* impersonate a netmap_vp_adapter */ netmap_vp_reg(na, onoff); if (hostna->na_bdg) @@ -2046,8 +2432,14 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* also intercept the host ring notify */ hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { u_int i; + + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + /* reset all notify callbacks (including host ring) */ for (i = 0; i <= hwna->num_rx_rings; i++) { hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; @@ -2089,8 +2481,8 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; - struct netmap_adapter *hostna = &bna->host.up; - int error; + int i, error = 0; + enum txrx t; ND("%s", na->name); @@ -2102,26 +2494,23 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { - netmap_vp_krings_delete(na); - return error; + goto err_del_vp_rings; } - /* the connection between the bwrap krings and the hwna krings - * will be perfomed later, in the nm_register callback, since - * now 
the kring->ring pointers have not been initialized yet - */ - if (na->na_flags & NAF_HOST_RINGS) { - /* the hostna rings are the host rings of the bwrap. - * The corresponding krings must point back to the - * hostna - */ - hostna->tx_rings = &na->tx_rings[na->num_tx_rings]; - hostna->tx_rings[0].na = hostna; - hostna->rx_rings = &na->rx_rings[na->num_rx_rings]; - hostna->rx_rings[0].na = hostna; + /* get each ring slot number from the corresponding hwna ring */ + for_rx_tx(t) { + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; + } } return 0; + +err_del_vp_rings: + netmap_vp_krings_delete(na); + + return error; } @@ -2149,7 +2538,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) u_int ring_n = kring->ring_id; u_int lim = kring->nkr_num_slots - 1; struct netmap_kring *hw_kring; - int error = 0; + int error; ND("%s: na %s hwna %s", (kring ? kring->name : "NULL!"), @@ -2157,11 +2546,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) (hwna ? 
hwna->name : "NULL!")); hw_kring = &hwna->tx_rings[ring_n]; - if (nm_kr_tryget(hw_kring)) - return 0; + if (nm_kr_tryget(hw_kring, 0, NULL)) { + return ENXIO; + } - if (!nm_netmap_on(hwna)) - return 0; /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(kring, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", @@ -2175,7 +2563,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; error = hw_kring->nm_sync(hw_kring, flags); if (error) - goto out; + goto put_out; /* third step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ @@ -2188,9 +2576,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); -out: +put_out: nm_kr_put(hw_kring); - return error; + + return error ? error : NM_IRQ_COMPLETED; } @@ -2217,44 +2606,23 @@ netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) /* nothing to do */ return 0; } - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + npriv = netmap_priv_new(); if (npriv == NULL) return ENOMEM; - error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags); + npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ + error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW); if (error) { - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); + netmap_priv_delete(npriv); return error; } bna->na_kpriv = npriv; na->na_flags |= NAF_BUSY; } else { - int last_instance; - if (na->active_fds == 0) /* not registered */ return EINVAL; - last_instance = netmap_dtor_locked(bna->na_kpriv); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct nm_bridge *b = bna->up.na_bdg, - *bh = bna->host.na_bdg; - npriv = bna->na_kpriv; - bna->na_kpriv = 
NULL; - D("deleting priv"); - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - if (b) { - /* XXX the bwrap dtor should take care - * of this (2014-06-16) - */ - netmap_bdg_detach_common(b, bna->up.bdg_port, - (bh ? bna->host.bdg_port : -1)); - } - na->na_flags &= ~NAF_BUSY; - } + netmap_priv_delete(bna->na_kpriv); + bna->na_kpriv = NULL; + na->na_flags &= ~NAF_BUSY; } return error; @@ -2282,6 +2650,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) } na = &bna->up.up; + /* make bwrap ifp point to the real ifp */ + na->ifp = hwna->ifp; na->na_private = bna; strncpy(na->name, nr_name, sizeof(na->name)); /* fill the ring data for the bwrap adapter with rx/tx meanings @@ -2294,7 +2664,7 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); } na->nm_dtor = netmap_bwrap_dtor; - na->nm_register = netmap_bwrap_register; + na->nm_register = netmap_bwrap_reg; // na->nm_txsync = netmap_bwrap_txsync; // na->nm_rxsync = netmap_bwrap_rxsync; na->nm_config = netmap_bwrap_config; @@ -2303,13 +2673,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) na->nm_notify = netmap_bwrap_notify; na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; na->pdev = hwna->pdev; - na->nm_mem = netmap_mem_private_new(na->name, - na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc, - 0, 0, &error); - na->na_flags |= NAF_MEM_OWNER; - if (na->nm_mem == NULL) - goto err_put; + na->nm_mem = hwna->nm_mem; + na->virt_hdr_len = hwna->virt_hdr_len; bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ bna->hwna = hwna; @@ -2349,24 +2714,10 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) if (error) { goto err_free; } - /* make bwrap ifp point to the real ifp - * NOTE: netmap_attach_common() interprets a non-NULL na->ifp - * as a request to make the ifp point to the na. 
Since we - * do not want to change the na already pointed to by hwna->ifp, - * the following assignment has to be delayed until now - */ - na->ifp = hwna->ifp; hwna->na_flags |= NAF_BUSY; - /* make hwna point to the allocator we are actually using, - * so that monitors will be able to find it - */ - bna->save_nmd = hwna->nm_mem; - hwna->nm_mem = na->nm_mem; return 0; err_free: - netmap_mem_delete(na->nm_mem); -err_put: hwna->na_vp = hwna->na_hostvp = NULL; netmap_adapter_put(hwna); free(bna, M_DEVBUF); diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index 8e5364bbe7a2..978a4858edb9 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -3,11 +3,14 @@ # Compile netmap as a module, useful if you want a netmap bridge # or loadable drivers. +.include <bsd.own.mk> # FreeBSD 10 and earlier +# .include "${SYSDIR}/conf/kern.opts.mk" + .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net -CFLAGS += -I${.CURDIR}/../../ +CFLAGS += -I${.CURDIR}/../../ -D INET KMOD = netmap -SRCS = device_if.h bus_if.h opt_netmap.h +SRCS = device_if.h bus_if.h pci_if.h opt_netmap.h SRCS += netmap.c netmap.h netmap_kern.h SRCS += netmap_mem2.c netmap_mem2.h SRCS += netmap_generic.c @@ -17,5 +20,8 @@ SRCS += netmap_freebsd.c SRCS += netmap_offloadings.c SRCS += netmap_pipe.c SRCS += netmap_monitor.c +SRCS += netmap_pt.c +SRCS += if_ptnet.c +SRCS += opt_inet.h opt_inet6.h .include <bsd.kmod.mk> diff --git a/sys/net/netmap.h b/sys/net/netmap.h index 88b2957502ab..c3b8b9205d3d 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -137,6 +137,26 @@ * netmap:foo-k the k-th NIC ring pair * netmap:foo{k PIPE ring pair k, master side * netmap:foo}k PIPE ring pair k, slave side + * + * Some notes about host rings: + * + * + The RX host ring is used to store those packets that the host network + * stack is trying to transmit through a NIC queue, but only if that queue + * is currently in netmap mode. 
Netmap will not intercept host stack mbufs + * designated to NIC queues that are not in netmap mode. As a consequence, + * registering a netmap port with netmap:foo^ is not enough to intercept + * mbufs in the RX host ring; the netmap port should be registered with + * netmap:foo*, or another registration should be done to open at least a + * NIC TX queue in netmap mode. + * + * + Netmap is not currently able to deal with intercepted transmit mbufs which + * require offloadings like TSO, UFO, checksumming offloadings, etc. It is + * responsibility of the user to disable those offloadings (e.g. using + * ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being + * used in netmap mode. If the offloadings are not disabled, GSO and/or + * unchecksummed packets may be dropped immediately or end up in the host RX + * ring, and will be dropped as soon as the packet reaches another netmap + * adapter. */ /* @@ -277,7 +297,11 @@ struct netmap_ring { struct timeval ts; /* (k) time of last *sync() */ /* opaque room for a mutex or similar object */ - uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN))); +#if !defined(_WIN32) || defined(__CYGWIN__) + uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; +#else + uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; +#endif /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. 
*/ @@ -496,6 +520,11 @@ struct nmreq { #define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ #define NETMAP_BDG_NEWIF 6 /* create a virtual port */ #define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ +#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ +#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ +#define NETMAP_BDG_POLLING_ON 10 /* start polling kthread */ +#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ +#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ @@ -521,7 +550,61 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ #define NR_ZCOPY_MON 0x400 /* request exclusive access to the selected rings */ #define NR_EXCLUSIVE 0x800 +/* request ptnetmap host support */ +#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ +#define NR_PTNETMAP_HOST 0x1000 +#define NR_RX_RINGS_ONLY 0x2000 +#define NR_TX_RINGS_ONLY 0x4000 +/* Applications set this flag if they are able to deal with virtio-net headers, + * that is send/receive frames that start with a virtio-net header. + * If not set, NIOCREGIF will fail with netmap ports that require applications + * to use those headers. If the flag is set, the application can use the + * NETMAP_VNET_HDR_GET command to figure out the header length. */ +#define NR_ACCEPT_VNET_HDR 0x8000 +#define NM_BDG_NAME "vale" /* prefix for bridge port name */ + +/* + * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined + * in ws2def.h but not sure if they are in the form we need. 
+ * XXX so we redefine them + * in a convenient way to use for DeviceIoControl signatures + */ +#ifdef _WIN32 +#undef _IO // ws2def.h +#define _WIN_NM_IOCTL_TYPE 40000 +#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_BUFFERED, FILE_ANY_ACCESS ) +#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_OUT_DIRECT, FILE_ANY_ACCESS ) + +#define _IOWR(_c, _n, _s) _IO(_c, _n) + +/* We have some internal sysctl in addition to the externally visible ones */ +#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT +#define NETMAP_POLL _IO('i', 162) + +/* and also two setsockopt for sysctl emulation */ +#define NETMAP_SETSOCKOPT _IO('i', 140) +#define NETMAP_GETSOCKOPT _IO('i', 141) + + +//These linknames are for the Netmap Core Driver +#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" +#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" + +//Definition of a structure used to pass a virtual address within an IOCTL +typedef struct _MEMORY_ENTRY { + PVOID pUsermodeVirtualAddress; +} MEMORY_ENTRY, *PMEMORY_ENTRY; + +typedef struct _POLL_REQUEST_DATA { + int events; + int timeout; + int revents; +} POLL_REQUEST_DATA; + +#endif /* _WIN32 */ /* * FreeBSD uses the size value embedded in the _IOWR to determine @@ -561,4 +644,28 @@ struct nm_ifreq { char data[NM_IFRDATA_LEN]; }; +/* + * netmap kernel thread configuration + */ +/* bhyve/vmm.ko MSIX parameters for IOCTL */ +struct ptn_vmm_ioctl_msix { + uint64_t msg; + uint64_t addr; +}; + +/* IOCTL parameters */ +struct nm_kth_ioctl { + u_long com; + /* TODO: use union */ + union { + struct ptn_vmm_ioctl_msix msix; + } data; +}; + +/* Configuration of a ptnetmap ring */ +struct ptnet_ring_cfg { + uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ + uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ + struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ +}; #endif /* _NET_NETMAP_H_ */ diff --git 
a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 130117db7a2e..4ec3d941c504 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2016 Universita` di Pisa + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -65,9 +66,31 @@ #ifndef _NET_NETMAP_USER_H_ #define _NET_NETMAP_USER_H_ +#define NETMAP_DEVICE_NAME "/dev/netmap" + +#ifdef __CYGWIN__ +/* + * we can compile userspace apps with either cygwin or msvc, + * and we use _WIN32 to identify windows specific code + */ +#ifndef _WIN32 +#define _WIN32 +#endif /* _WIN32 */ + +#endif /* __CYGWIN__ */ + +#ifdef _WIN32 +#undef NETMAP_DEVICE_NAME +#define NETMAP_DEVICE_NAME "/proc/sys/DosDevices/Global/netmap" +#include <windows.h> +#include <WinDef.h> +#include <sys/cygwin.h> +#endif /* _WIN32 */ + #include <stdint.h> #include <sys/socket.h> /* apple needs sockaddr */ #include <net/if.h> /* IFNAMSIZ */ +#include <ctype.h> #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) @@ -172,17 +195,23 @@ nm_ring_space(struct netmap_ring *ring) } while (0) #endif -struct nm_pkthdr { /* same as pcap_pkthdr */ +struct nm_pkthdr { /* first part is the same as pcap_pkthdr */ struct timeval ts; uint32_t caplen; uint32_t len; + + uint64_t flags; /* NM_MORE_PKTS etc */ +#define NM_MORE_PKTS 1 + struct nm_desc *d; + struct netmap_slot *slot; + uint8_t *buf; }; struct nm_stat { /* same as pcap_stat */ u_int ps_recv; u_int ps_drop; u_int ps_ifdrop; -#ifdef WIN32 +#ifdef WIN32 /* XXX or _WIN32 ? 
*/ u_int bs_capt; #endif /* WIN32 */ }; @@ -284,12 +313,14 @@ typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); * -NN bind individual NIC ring pair * {NN bind master side of pipe NN * }NN bind slave side of pipe NN - * a suffix starting with + and the following flags, + * a suffix starting with / and the following flags, * in any order: * x exclusive access * z zero copy monitor * t monitor tx side * r monitor rx side + * R bind only RX ring(s) + * T bind only TX ring(s) * * req provides the initial values of nmreq before parsing ifname. * Remember that the ifname parsing will override the ring @@ -329,6 +360,13 @@ enum { static int nm_close(struct nm_desc *); /* + * nm_mmap() do mmap or inherit from parent if the nr_arg2 + * (memory block) matches. + */ + +static int nm_mmap(struct nm_desc *, const struct nm_desc *); + +/* * nm_inject() is the same as pcap_inject() * nm_dispatch() is the same as pcap_dispatch() * nm_nextpkt() is the same as pcap_next() @@ -338,13 +376,247 @@ static int nm_inject(struct nm_desc *, const void *, size_t); static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); +#ifdef _WIN32 + +intptr_t _get_osfhandle(int); /* defined in io.h in windows */ + +/* + * In windows we do not have yet native poll support, so we keep track + * of file descriptors associated to netmap ports to emulate poll on + * them and fall back on regular poll on other file descriptors. 
+ */ +struct win_netmap_fd_list { + struct win_netmap_fd_list *next; + int win_netmap_fd; + HANDLE win_netmap_handle; +}; + +/* + * list head containing all the netmap opened fd and their + * windows HANDLE counterparts + */ +static struct win_netmap_fd_list *win_netmap_fd_list_head; + +static void +win_insert_fd_record(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return; + } + } + curr = calloc(1, sizeof(*curr)); + curr->next = win_netmap_fd_list_head; + curr->win_netmap_fd = fd; + curr->win_netmap_handle = IntToPtr(_get_osfhandle(fd)); + win_netmap_fd_list_head = curr; +} + +void +win_remove_fd_record(int fd) +{ + struct win_netmap_fd_list *curr = win_netmap_fd_list_head; + struct win_netmap_fd_list *prev = NULL; + for (; curr ; prev = curr, curr = curr->next) { + if (fd != curr->win_netmap_fd) + continue; + /* found the entry */ + if (prev == NULL) { /* we are freeing the first entry */ + win_netmap_fd_list_head = curr->next; + } else { + prev->next = curr->next; + } + free(curr); + break; + } +} + + +HANDLE +win_get_netmap_handle(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return curr->win_netmap_handle; + } + } + return NULL; +} + +/* + * we need to wrap ioctl and mmap, at least for the netmap file descriptors + */ + +/* + * use this function only from netmap_user.h internal functions + * same as ioctl, returns 0 on success and -1 on error + */ +static int +win_nm_ioctl_internal(HANDLE h, int32_t ctlCode, void *arg) +{ + DWORD bReturn = 0, szIn, szOut; + BOOL ioctlReturnStatus; + void *inParam = arg, *outParam = arg; + + switch (ctlCode) { + case NETMAP_POLL: + szIn = sizeof(POLL_REQUEST_DATA); + szOut = sizeof(POLL_REQUEST_DATA); + break; + case NETMAP_MMAP: + szIn = 0; + szOut = sizeof(void*); + inParam = NULL; /* nothing on input */ + break; + case 
NIOCTXSYNC: + case NIOCRXSYNC: + szIn = 0; + szOut = 0; + break; + case NIOCREGIF: + szIn = sizeof(struct nmreq); + szOut = sizeof(struct nmreq); + break; + case NIOCCONFIG: + D("unsupported NIOCCONFIG!"); + return -1; + + default: /* a regular ioctl */ + D("invalid ioctl %x on netmap fd", ctlCode); + return -1; + } + + ioctlReturnStatus = DeviceIoControl(h, + ctlCode, inParam, szIn, + outParam, szOut, + &bReturn, NULL); + // XXX note windows returns 0 on error or async call, 1 on success + // we could call GetLastError() to figure out what happened + return ioctlReturnStatus ? 0 : -1; +} + +/* + * this function is what must be called from user-space programs + * same as ioctl, returns 0 on success and -1 on error + */ +static int +win_nm_ioctl(int fd, int32_t ctlCode, void *arg) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return ioctl(fd, ctlCode, arg); + } else { + return win_nm_ioctl_internal(h, ctlCode, arg); + } +} + +#define ioctl win_nm_ioctl /* from now on, within this file ... */ + +/* + * We cannot use the native mmap on windows + * The only parameter used is "fd", the other ones are just declared to + * make this signature comparable to the FreeBSD/Linux one + */ +static void * +win32_mmap_emulated(void *addr, size_t length, int prot, int flags, int fd, int32_t offset) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return mmap(addr, length, prot, flags, fd, offset); + } else { + MEMORY_ENTRY ret; + + return win_nm_ioctl_internal(h, NETMAP_MMAP, &ret) ? 
+ NULL : ret.pUsermodeVirtualAddress; + } +} + +#define mmap win32_mmap_emulated + +#include <sys/poll.h> /* XXX needed to use the structure pollfd */ + +static int +win_nm_poll(struct pollfd *fds, int nfds, int timeout) +{ + HANDLE h; + + if (nfds != 1 || fds == NULL || (h = win_get_netmap_handle(fds->fd)) == NULL) {; + return poll(fds, nfds, timeout); + } else { + POLL_REQUEST_DATA prd; + + prd.timeout = timeout; + prd.events = fds->events; + + win_nm_ioctl_internal(h, NETMAP_POLL, &prd); + if ((prd.revents == POLLERR) || (prd.revents == STATUS_TIMEOUT)) { + return -1; + } + return 1; + } +} + +#define poll win_nm_poll + +static int +win_nm_open(char* pathname, int flags) +{ + + if (strcmp(pathname, NETMAP_DEVICE_NAME) == 0) { + int fd = open(NETMAP_DEVICE_NAME, O_RDWR); + if (fd < 0) { + return -1; + } + + win_insert_fd_record(fd); + return fd; + } else { + return open(pathname, flags); + } +} + +#define open win_nm_open + +static int +win_nm_close(int fd) +{ + if (fd != -1) { + close(fd); + if (win_get_netmap_handle(fd) != NULL) { + win_remove_fd_record(fd); + } + } + return 0; +} + +#define close win_nm_close + +#endif /* _WIN32 */ + +static int +nm_is_identifier(const char *s, const char *e) +{ + for (; s != e; s++) { + if (!isalnum(*s) && *s != '_') { + return 0; + } + } + + return 1; +} /* * Try to open, return descriptor if successful, NULL otherwise. * An invalid netmap name will return errno = 0; * You can pass a pointer to a pre-filled nm_desc to add special * parameters. Flags is used as follows - * NM_OPEN_NO_MMAP use the memory from arg, only + * NM_OPEN_NO_MMAP use the memory from arg, only XXX avoid mmap * if the nr_arg2 (memory block) matches. 
* NM_OPEN_ARG1 use req.nr_arg1 from arg * NM_OPEN_ARG2 use req.nr_arg2 from arg @@ -359,20 +631,48 @@ nm_open(const char *ifname, const struct nmreq *req, u_int namelen; uint32_t nr_ringid = 0, nr_flags, nr_reg; const char *port = NULL; + const char *vpname = NULL; #define MAXERRMSG 80 char errmsg[MAXERRMSG] = ""; enum { P_START, P_RNGSFXOK, P_GETNUM, P_FLAGS, P_FLAGSOK } p_state; + int is_vale; long num; - if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { + if (strncmp(ifname, "netmap:", 7) && + strncmp(ifname, NM_BDG_NAME, strlen(NM_BDG_NAME))) { errno = 0; /* name not recognised, not an error */ return NULL; } - if (ifname[0] == 'n') + + is_vale = (ifname[0] == 'v'); + if (is_vale) { + port = index(ifname, ':'); + if (port == NULL) { + snprintf(errmsg, MAXERRMSG, + "missing ':' in vale name"); + goto fail; + } + + if (!nm_is_identifier(ifname + 4, port)) { + snprintf(errmsg, MAXERRMSG, "invalid bridge name"); + goto fail; + } + + vpname = ++port; + } else { ifname += 7; + port = ifname; + } + /* scan for a separator */ - for (port = ifname; *port && !index("-*^{}/", *port); port++) + for (; *port && !index("-*^{}/", *port); port++) ; + + if (is_vale && !nm_is_identifier(vpname, port)) { + snprintf(errmsg, MAXERRMSG, "invalid bridge port name"); + goto fail; + } + namelen = port - ifname; if (namelen >= sizeof(d->req.nr_name)) { snprintf(errmsg, MAXERRMSG, "name too long"); @@ -449,6 +749,12 @@ nm_open(const char *ifname, const struct nmreq *req, case 'r': nr_flags |= NR_MONITOR_RX; break; + case 'R': + nr_flags |= NR_RX_RINGS_ONLY; + break; + case 'T': + nr_flags |= NR_TX_RINGS_ONLY; + break; default: snprintf(errmsg, MAXERRMSG, "unrecognized flag: '%c'", *port); goto fail; @@ -462,6 +768,11 @@ nm_open(const char *ifname, const struct nmreq *req, snprintf(errmsg, MAXERRMSG, "unexpected end of port name"); goto fail; } + if ((nr_flags & NR_ZCOPY_MON) && + !(nr_flags & (NR_MONITOR_TX|NR_MONITOR_RX))) { + snprintf(errmsg, MAXERRMSG, "'z' used but 
neither 'r', nor 't' found"); + goto fail; + } ND("flags: %s %s %s %s", (nr_flags & NR_EXCLUSIVE) ? "EXCLUSIVE" : "", (nr_flags & NR_ZCOPY_MON) ? "ZCOPY_MON" : "", @@ -474,7 +785,7 @@ nm_open(const char *ifname, const struct nmreq *req, return NULL; } d->self = d; /* set this early so nm_close() works */ - d->fd = open("/dev/netmap", O_RDWR); + d->fd = open(NETMAP_DEVICE_NAME, O_RDWR); if (d->fd < 0) { snprintf(errmsg, MAXERRMSG, "cannot open /dev/netmap: %s", strerror(errno)); goto fail; @@ -487,7 +798,7 @@ nm_open(const char *ifname, const struct nmreq *req, /* these fields are overridden by ifname and flags processing */ d->req.nr_ringid |= nr_ringid; - d->req.nr_flags = nr_flags; + d->req.nr_flags |= nr_flags; memcpy(d->req.nr_name, ifname, namelen); d->req.nr_name[namelen] = '\0'; /* optionally import info from parent */ @@ -529,31 +840,10 @@ nm_open(const char *ifname, const struct nmreq *req, goto fail; } - if (IS_NETMAP_DESC(parent) && parent->mem && - parent->req.nr_arg2 == d->req.nr_arg2) { - /* do not mmap, inherit from parent */ - d->memsize = parent->memsize; - d->mem = parent->mem; - } else { - /* XXX TODO: check if memsize is too large (or there is overflow) */ - d->memsize = d->req.nr_memsize; - d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, - d->fd, 0); - if (d->mem == MAP_FAILED) { - snprintf(errmsg, MAXERRMSG, "mmap failed: %s", strerror(errno)); - goto fail; - } - d->done_mmap = 1; - } - { - struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); - struct netmap_ring *r = NETMAP_RXRING(nifp, ); - - *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; - *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; - *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); - *(void **)(uintptr_t)&d->buf_end = - (char *)d->mem + d->memsize; + /* if parent is defined, do nm_mmap() even if NM_OPEN_NO_MMAP is set */ + if ((!(new_flags & NM_OPEN_NO_MMAP) || parent) && nm_mmap(d, parent)) { + snprintf(errmsg, MAXERRMSG, "mmap failed: %s", 
strerror(errno)); + goto fail; } nr_reg = d->req.nr_flags & NR_REG_MASK; @@ -626,14 +916,54 @@ nm_close(struct nm_desc *d) return EINVAL; if (d->done_mmap && d->mem) munmap(d->mem, d->memsize); - if (d->fd != -1) + if (d->fd != -1) { close(d->fd); + } + bzero(d, sizeof(*d)); free(d); return 0; } +static int +nm_mmap(struct nm_desc *d, const struct nm_desc *parent) +{ + //XXX TODO: check if mmap is already done + + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + D("do not mmap, inherit from parent"); + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + /* XXX TODO: check if memsize is too large (or there is overflow) */ + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == MAP_FAILED) { + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + return 0; + +fail: + return EINVAL; +} + /* * Same prototype as pcap_inject(), only need to cast. 
*/ @@ -674,6 +1004,9 @@ nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) { int n = d->last_rx_ring - d->first_rx_ring + 1; int c, got = 0, ri = d->cur_rx_ring; + d->hdr.buf = NULL; + d->hdr.flags = NM_MORE_PKTS; + d->hdr.d = d; if (cnt == 0) cnt = -1; @@ -690,17 +1023,24 @@ nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) ri = d->first_rx_ring; ring = NETMAP_RXRING(d->nifp, ri); for ( ; !nm_ring_empty(ring) && cnt != got; got++) { - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - u_char *buf = (u_char *)NETMAP_BUF(ring, idx); - + u_int idx, i; + if (d->hdr.buf) { /* from previous round */ + cb(arg, &d->hdr, d->hdr.buf); + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + d->hdr.slot = &ring->slot[i]; + d->hdr.buf = (u_char *)NETMAP_BUF(ring, idx); // __builtin_prefetch(buf); d->hdr.len = d->hdr.caplen = ring->slot[i].len; d->hdr.ts = ring->ts; - cb(arg, &d->hdr, buf); ring->head = ring->cur = nm_ring_next(ring, i); } } + if (d->hdr.buf) { /* from previous round */ + d->hdr.flags = 0; + cb(arg, &d->hdr, d->hdr.buf); + } d->cur_rx_ring = ri; return got; } diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile index 7d7c44b1cce1..8daf59ff8ba8 100644 --- a/tools/tools/netmap/Makefile +++ b/tools/tools/netmap/Makefile @@ -3,11 +3,12 @@ # # For multiple programs using a single source file each, # we can just define 'progs' and create custom targets. 
-PROGS = pkt-gen bridge vale-ctl +PROGS = pkt-gen nmreplay bridge vale-ctl CLEANFILES = $(PROGS) *.o MAN= -CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Werror -Wall +CFLAGS += -nostdinc -I ../../../sys -I/usr/include CFLAGS += -Wextra LDFLAGS += -lpthread @@ -16,6 +17,7 @@ CFLAGS += -DNO_PCAP .else LDFLAGS += -lpcap .endif +LDFLAGS += -lm # used by nmreplay .include <bsd.prog.mk> .include <bsd.lib.mk> @@ -28,5 +30,8 @@ pkt-gen: pkt-gen.o bridge: bridge.o $(CC) $(CFLAGS) -o bridge bridge.o +nmreplay: nmreplay.o + $(CC) $(CFLAGS) -o nmreplay nmreplay.o $(LDFLAGS) + vale-ctl: vale-ctl.o $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c index 0895d4ede676..e99a507a829a 100644 --- a/tools/tools/netmap/bridge.c +++ b/tools/tools/netmap/bridge.c @@ -143,7 +143,7 @@ static void usage(void) { fprintf(stderr, - "usage: bridge [-v] [-i ifa] [-i ifb] [-b burst] [-w wait_time] [iface]\n"); + "usage: bridge [-v] [-i ifa] [-i ifb] [-b burst] [-w wait_time] [ifa [ifb [burst]]]\n"); exit(1); } @@ -201,12 +201,12 @@ main(int argc, char **argv) argc -= optind; argv += optind; + if (argc > 0) + ifa = argv[0]; if (argc > 1) - ifa = argv[1]; + ifb = argv[1]; if (argc > 2) - ifb = argv[2]; - if (argc > 3) - burst = atoi(argv[3]); + burst = atoi(argv[2]); if (!ifb) ifb = ifa; if (!ifa) { @@ -233,7 +233,7 @@ main(int argc, char **argv) D("cannot open %s", ifa); return (1); } - // XXX use a single mmap ? 
+ /* try to reuse the mmap() of the first interface, if possible */ pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); if (pb == NULL) { D("cannot open %s", ifb); @@ -262,6 +262,23 @@ main(int argc, char **argv) pollfd[0].revents = pollfd[1].revents = 0; n0 = pkt_queued(pa, 0); n1 = pkt_queued(pb, 0); +#if defined(_WIN32) || defined(BUSYWAIT) + if (n0){ + ioctl(pollfd[1].fd, NIOCTXSYNC, NULL); + pollfd[1].revents = POLLOUT; + } + else { + ioctl(pollfd[0].fd, NIOCRXSYNC, NULL); + } + if (n1){ + ioctl(pollfd[0].fd, NIOCTXSYNC, NULL); + pollfd[0].revents = POLLOUT; + } + else { + ioctl(pollfd[1].fd, NIOCRXSYNC, NULL); + } + ret = 1; +#else if (n0) pollfd[1].events |= POLLOUT; else @@ -271,6 +288,7 @@ main(int argc, char **argv) else pollfd[1].events |= POLLIN; ret = poll(pollfd, 2, 2500); +#endif //defined(_WIN32) || defined(BUSYWAIT) if (ret <= 0 || verbose) D("poll %s [0] ev %x %x rx %d@%d tx %d," " [1] ev %x %x rx %d@%d tx %d", diff --git a/tools/tools/netmap/ctrs.h b/tools/tools/netmap/ctrs.h new file mode 100644 index 000000000000..cee316477144 --- /dev/null +++ b/tools/tools/netmap/ctrs.h @@ -0,0 +1,108 @@ +#ifndef CTRS_H_ +#define CTRS_H_ + +/* $FreeBSD$ */ + +#include <sys/time.h> + +/* counters to accumulate statistics */ +struct my_ctrs { + uint64_t pkts, bytes, events, drop; + uint64_t min_space; + struct timeval t; +}; + +/* very crude code to print a number in normalized form. + * Caller has to make sure that the buffer is large enough. 
+ */ +static const char * +norm2(char *buf, double val, char *fmt) +{ + char *units[] = { "", "K", "M", "G", "T" }; + u_int i; + + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) + val /= 1000; + sprintf(buf, fmt, val, units[i]); + return buf; +} + +static __inline const char * +norm(char *buf, double val) +{ + return norm2(buf, val, "%.3f %s"); +} + +static __inline int +timespec_ge(const struct timespec *a, const struct timespec *b) +{ + + if (a->tv_sec > b->tv_sec) + return (1); + if (a->tv_sec < b->tv_sec) + return (0); + if (a->tv_nsec >= b->tv_nsec) + return (1); + return (0); +} + +static __inline struct timespec +timeval2spec(const struct timeval *a) +{ + struct timespec ts = { + .tv_sec = a->tv_sec, + .tv_nsec = a->tv_usec * 1000 + }; + return ts; +} + +static __inline struct timeval +timespec2val(const struct timespec *a) +{ + struct timeval tv = { + .tv_sec = a->tv_sec, + .tv_usec = a->tv_nsec / 1000 + }; + return tv; +} + + +static __inline struct timespec +timespec_add(struct timespec a, struct timespec b) +{ + struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; + if (ret.tv_nsec >= 1000000000) { + ret.tv_sec++; + ret.tv_nsec -= 1000000000; + } + return ret; +} + +static __inline struct timespec +timespec_sub(struct timespec a, struct timespec b) +{ + struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; + if (ret.tv_nsec < 0) { + ret.tv_sec--; + ret.tv_nsec += 1000000000; + } + return ret; +} + +static uint64_t +wait_for_next_report(struct timeval *prev, struct timeval *cur, + int report_interval) +{ + struct timeval delta; + + delta.tv_sec = report_interval/1000; + delta.tv_usec = (report_interval%1000)*1000; + if (select(0, NULL, NULL, NULL, &delta) < 0 && errno != EINTR) { + perror("select"); + abort(); + } + gettimeofday(cur, NULL); + timersub(cur, prev, &delta); + return delta.tv_sec* 1000000 + delta.tv_usec; +} +#endif /* CTRS_H_ */ diff --git a/tools/tools/netmap/nmreplay.8 
b/tools/tools/netmap/nmreplay.8 new file mode 100644 index 000000000000..8e5ddb9698dd --- /dev/null +++ b/tools/tools/netmap/nmreplay.8 @@ -0,0 +1,129 @@ +.\" Copyright (c) 2016 Luigi Rizzo, Universita` di Pisa +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd February 16, 2016 +.Dt NMREPLAY 1 +.Os +.Sh NAME +.Nm nmreplay +.Nd playback a pcap file through a netmap interface +.Sh SYNOPSIS +.Bk -words +.Bl -tag -width "nmreplay" +.It Nm +.Op Fl f Ar pcap-file +.Op Fl i Ar netmap-interface +.Op Fl B Ar bandwidth +.Op Fl D Ar delay +.Op Fl L Ar loss +.Op Fl b Ar batch size +.Op Fl w Ar wait-link +.Op Fl v +.Op Fl C Ar cpu-placement +.Sh DESCRIPTION +.Nm +works like +.Nm tcpreplay +to replay a pcap file through a netmap interface, +with programmable rates and possibly delays, losses +and packet alterations. +.Nm +is designed to run at high speed, so the transmit schedule +is computed ahead of time, and the thread in charge of transmission +only has to pump data through the interface. +.Nm +can connect to any type of netmap port. +.Pp +Command line options are as follows +.Bl -tag -width Ds +.It Fl f Ar pcap-file +Name of the pcap file to replay. +.It Fl i Ar interface +Name of the netmap interface to use as output. +.It Fl v +Enable verbose mode +.It Fl b Ar batch-size +Maximum batch size to use during transmissions. +.Nm +normally transmits packets one at a time, but it may use +larger batches, up to the value specified with this option, +when running at high rates. +.It Fl B Ar bps | Cm constant, Ns Ar bps | Cm ether, Ns Ar bps | Cm real Ns Op , Ns Ar speedup +Bandwidth to be used for transmission. +.Ar bps +is a floating point number optionally followed by a character +(k, K, m, M, g, G) that multiplies the value by 10^3, 10^6 and 10^9 +respectively. +.Cm constant +(can be omitted) means that the bandwidth will be computed +with reference to the actual packet size (excluding CRC and framing). +.Cm ether +indicates that the ethernet framing (160 bits) and CRC (32 bits) +will be included in the computation of the packet size. +.Cm real +means transmission will occur according to the timestamps +recorded in the trace. 
The optional +.Ar speedup +multiplier (defaults to 1) indicates how much faster +or slower than real time the trace should be replayed. +.It Fl D Ar dt | Cm constant, Ns Ar dt | Cm uniform, Ns Ar dmin,dmax | Cm exp, Ar dmin,davg +Adds additional delay to the packet transmission, whose distribution +can be constant, uniform or exponential. +.Ar dt, dmin, dmax, davg +are times expressed as floating point numbers optionally followed +by a character (s, m, u, n) to indicate seconds, milliseconds, +microseconds, nanoseconds. +The delay is added to the transmit time and adjusted so that there is +never packet reordering. +.It Fl L Ar x | Cm plr, Ns Ar x | Cm ber, Ns Ar x +Simulates packet or bit errors, causing offending packets to be dropped. +.Ar x +is a floating point number indicating the packet or bit error rate. +.It Fl w Ar wait-link +indicates the number of seconds to wait before transmitting. +It defaults to 2, and may be useful when talking to physical +ports to let link negotiation complete before starting transmission. +.El +.Sh OPERATION +.Nm +creates an in-memory schedule with all packets to be transmitted, +and then launches a separate thread to take care of transmissions +while the main thread reports statistics every second. +.Sh SEE ALSO +.Pa http://info.iet.unipi.it/~luigi/netmap/ +.Pp +Luigi Rizzo, Revisiting network I/O APIs: the netmap framework, +Communications of the ACM, 55 (3), pp.45-51, March 2012 +.Pp +Luigi Rizzo, Giuseppe Lettieri, +VALE, a switched ethernet for virtual machines, +ACM CoNEXT'12, December 2012, Nice +.Sh AUTHORS +.An -nosplit +.Nm +has been written by +.An Luigi Rizzo, Andrea Beconcini, Francesco Mola and Lorenzo Biagini +at the Universita` di Pisa, Italy. diff --git a/tools/tools/netmap/nmreplay.c b/tools/tools/netmap/nmreplay.c new file mode 100644 index 000000000000..7a46bd57e198 --- /dev/null +++ b/tools/tools/netmap/nmreplay.c @@ -0,0 +1,1820 @@ +/* + * Copyright (C) 2016 Universita` di Pisa. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#if 0 /* COMMENT */ + +This program implements NMREPLAY, a program to replay a pcap file +enforcing the output rate and possibly random losses and delay +distributions. +It is meant to be run from the command line and implemented with a main +control thread for monitoring, plus a thread to push packets out. + +The control thread parses command line arguments, prepares a +schedule for transmission in a memory buffer and then sits +in a loop where it periodically reads traffic statistics from +the other threads and prints them out on the console. + +The transmit buffer contains headers and packets. 
Each header +includes a timestamp that determines when the packet should be sent out. +A "consumer" thread cons() reads from the queue and transmits packets +on the output netmap port when their time has come. + +The program does CPU pinning and sets the scheduler and priority +for the "cons" threads. Externally one should do the +assignment of other threads (e.g. interrupt handlers) and +make sure that network interfaces are configured properly. + +--- Main functions of the program --- +within each function, q is used as a pointer to the queue holding +packets and parameters. + +pcap_prod() + + reads from the pcap file and prepares packets to transmit. + After reading a packet from the pcap file, the following information + are extracted which can be used to determine the schedule: + + q->cur_pkt points to the buffer containing the packet + q->cur_len packet length, excluding CRC + q->cur_caplen available packet length (may be shorter than cur_len) + q->cur_tt transmission time for the packet, computed from the trace. + + The following functions are then called in sequence: + + q->c_loss (set with the -L command line option) decides + whether the packet should be dropped before even queuing. + This is generally useful to emulate random loss. + The function is supposed to set q->c_drop = 1 if the + packet should be dropped, or leave it to 0 otherwise. + + q->c_bw (set with the -B command line option) is used to + enforce the transmit bandwidth. The function must store + in q->cur_tt the transmission time (in nanoseconds) of + the packet, which is typically proportional to the length + of the packet, i.e. q->cur_tt = q->cur_len / <bandwidth> + Variants are possible, eg. to account for constant framing + bits as on the ethernet, or variable channel acquisition times, + etc. + This mechanism can also be used to simulate variable queueing + delay e.g. due to the presence of cross traffic. + + q->c_delay (set with the -D option) implements delay emulation. 
+ The function should set q->cur_delay to the additional + delay the packet is subject to. The framework will take care of + computing the actual exit time of a packet so that there is no + reordering. + + +#endif /* COMMENT */ + +// debugging macros +#define NED(_fmt, ...) do {} while (0) +#define ED(_fmt, ...) \ + do { \ + struct timeval _t0; \ + gettimeofday(&_t0, NULL); \ + fprintf(stderr, "%03d.%03d %-10.10s [%5d] \t" _fmt "\n", \ + (int)(_t0.tv_sec % 1000), (int)_t0.tv_usec/1000, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* WWW is for warnings, EEE is for errors */ +#define WWW(_fmt, ...) ED("--WWW-- " _fmt, ##__VA_ARGS__) +#define EEE(_fmt, ...) ED("--EEE-- " _fmt, ##__VA_ARGS__) +#define DDD(_fmt, ...) ED("--DDD-- " _fmt, ##__VA_ARGS__) + +#define _GNU_SOURCE // for CPU_SET() etc +#include <stdio.h> +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> +#include <sys/poll.h> + + +/* + * +A packet in the queue is q_pkt plus the payload. + +For the packet descriptor we need the following: + + - position of next packet in the queue (can go backwards). + We can reduce to 32 bits if we consider alignments, + or we just store the length to be added to the current + value and assume 0 as a special index. + - actual packet length (16 bits may be ok) + - queue output time, in nanoseconds (64 bits) + - delay line output time, in nanoseconds + One of the two can be packed to a 32bit value + +A convenient coding uses 32 bytes per packet. + + */ + +struct q_pkt { + uint64_t next; /* buffer index for next packet */ + uint64_t pktlen; /* actual packet len */ + uint64_t pt_qout; /* time of output from queue */ + uint64_t pt_tx; /* transmit time */ +}; + + +/* + * The header for a pcap file + */ +struct pcap_file_header { + uint32_t magic; + /*used to detect the file format itself and the byte + ordering. The writing application writes 0xa1b2c3d4 with it's native byte + ordering format into this field. 
The reading application will read either + 0xa1b2c3d4 (identical) or 0xd4c3b2a1 (swapped). If the reading application + reads the swapped 0xd4c3b2a1 value, it knows that all the following fields + will have to be swapped too. For nanosecond-resolution files, the writing + application writes 0xa1b23c4d, with the two nibbles of the two lower-order + bytes swapped, and the reading application will read either 0xa1b23c4d + (identical) or 0x4d3cb2a1 (swapped)*/ + uint16_t version_major; + uint16_t version_minor; /*the version number of this file format */ + int32_t thiszone; + /*the correction time in seconds between GMT (UTC) and the + local timezone of the following packet header timestamps. Examples: If the + timestamps are in GMT (UTC), thiszone is simply 0. If the timestamps are in + Central European time (Amsterdam, Berlin, ...) which is GMT + 1:00, thiszone + must be -3600*/ + uint32_t stampacc; /*the accuracy of time stamps in the capture*/ + uint32_t snaplen; + /*the "snapshot length" for the capture (typically 65535 + or even more, but might be limited by the user)*/ + uint32_t network; + /*link-layer header type, specifying the type of headers + at the beginning of the packet (e.g. 
1 for Ethernet); this can be various + types such as 802.11, 802.11 with various radio information, PPP, Token + Ring, FDDI, etc.*/ +}; + +#if 0 /* from pcap.h */ +struct pcap_file_header { + bpf_u_int32 magic; + u_short version_major; + u_short version_minor; + bpf_int32 thiszone; /* gmt to local correction */ + bpf_u_int32 sigfigs; /* accuracy of timestamps */ + bpf_u_int32 snaplen; /* max length saved portion of each pkt */ + bpf_u_int32 linktype; /* data link type (LINKTYPE_*) */ +}; + +struct pcap_pkthdr { + struct timeval ts; /* time stamp */ + bpf_u_int32 caplen; /* length of portion present */ + bpf_u_int32 len; /* length this packet (off wire) */ +}; +#endif /* from pcap.h */ + +struct pcap_pkthdr { + uint32_t ts_sec; /* seconds from epoch */ + uint32_t ts_frac; /* microseconds or nanoseconds depending on sigfigs */ + uint32_t caplen; + /*the number of bytes of packet data actually captured + and saved in the file. This value should never become larger than orig_len + or the snaplen value of the global header*/ + uint32_t len; /* wire length */ +}; + + +#define PKT_PAD (32) /* padding on packets */ + +static inline int pad(int x) +{ + return ((x) + PKT_PAD - 1) & ~(PKT_PAD - 1) ; +} + + + +/* + * wrapper around the pcap file. + * We mmap the file so it is easy to do multiple passes through it. + */ +struct nm_pcap_file { + int fd; + uint64_t filesize; + const char *data; /* mmapped file */ + + uint64_t tot_pkt; + uint64_t tot_bytes; + uint64_t tot_bytes_rounded; /* need hdr + pad(len) */ + uint32_t resolution; /* 1000 for us, 1 for ns */ + int swap; /* need to swap fields ? 
*/ + + uint64_t first_ts; + uint64_t total_tx_time; + /* + * total_tx_time is computed as last_ts - first_ts, plus the + * transmission time for the first packet which in turn is + * computed according to the average bandwidth + */ + + uint64_t file_len; + const char *cur; /* running pointer */ + const char *lim; /* data + file_len */ + int err; +}; + +static struct nm_pcap_file *readpcap(const char *fn); +static void destroy_pcap(struct nm_pcap_file *file); + + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> /* memcpy */ + +#include <sys/mman.h> + +#define NS_SCALE 1000000000UL /* nanoseconds in 1s */ + +static void destroy_pcap(struct nm_pcap_file *pf) +{ + if (!pf) + return; + + munmap((void *)(uintptr_t)pf->data, pf->filesize); + close(pf->fd); + bzero(pf, sizeof(*pf)); + free(pf); + return; +} + +// convert a field of given size if swap is needed. +static uint32_t +cvt(const void *src, int size, char swap) +{ + uint32_t ret = 0; + if (size != 2 && size != 4) { + EEE("Invalid size %d\n", size); + exit(1); + } + memcpy(&ret, src, size); + if (swap) { + unsigned char tmp, *data = (unsigned char *)&ret; + int i; + for (i = 0; i < size / 2; i++) { + tmp = data[i]; + data[i] = data[size - (1 + i)]; + data[size - (1 + i)] = tmp; + } + } + return ret; +} + +static uint32_t +read_next_info(struct nm_pcap_file *pf, int size) +{ + const char *end = pf->cur + size; + uint32_t ret; + if (end > pf->lim) { + pf->err = 1; + ret = 0; + } else { + ret = cvt(pf->cur, size, pf->swap); + pf->cur = end; + } + return ret; +} + +/* + * mmap the file, make sure timestamps are sorted, and count + * packets and sizes + * Timestamps represent the receive time of the packets. + * We need to compute also the 'first_ts' which refers to a hypotetical + * packet right before the first one, see the code for details. 
+ */ +static struct nm_pcap_file * +readpcap(const char *fn) +{ + struct nm_pcap_file _f, *pf = &_f; + uint64_t prev_ts, first_pkt_time; + uint32_t magic, first_len = 0; + + bzero(pf, sizeof(*pf)); + pf->fd = open(fn, O_RDONLY); + if (pf->fd < 0) { + EEE("cannot open file %s", fn); + return NULL; + } + /* compute length */ + pf->filesize = lseek(pf->fd, 0, SEEK_END); + lseek(pf->fd, 0, SEEK_SET); + ED("filesize is %lu", (u_long)(pf->filesize)); + if (pf->filesize < sizeof(struct pcap_file_header)) { + EEE("file too short %s", fn); + close(pf->fd); + return NULL; + } + pf->data = mmap(NULL, pf->filesize, PROT_READ, MAP_SHARED, pf->fd, 0); + if (pf->data == MAP_FAILED) { + EEE("cannot mmap file %s", fn); + close(pf->fd); + return NULL; + } + pf->cur = pf->data; + pf->lim = pf->data + pf->filesize; + pf->err = 0; + pf->swap = 0; /* default, same endianness when read magic */ + + magic = read_next_info(pf, 4); + ED("magic is 0x%x", magic); + switch (magic) { + case 0xa1b2c3d4: /* native, us resolution */ + pf->swap = 0; + pf->resolution = 1000; + break; + case 0xd4c3b2a1: /* swapped, us resolution */ + pf->swap = 1; + pf->resolution = 1000; + break; + case 0xa1b23c4d: /* native, ns resolution */ + pf->swap = 0; + pf->resolution = 1; /* nanoseconds */ + break; + case 0x4d3cb2a1: /* swapped, ns resolution */ + pf->swap = 1; + pf->resolution = 1; /* nanoseconds */ + break; + default: + EEE("unknown magic 0x%x", magic); + return NULL; + } + + ED("swap %d res %d\n", pf->swap, pf->resolution); + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->lim = pf->data + pf->filesize; + pf->err = 0; + prev_ts = 0; + while (pf->cur < pf->lim && pf->err == 0) { + uint32_t base = pf->cur - pf->data; + uint64_t cur_ts = read_next_info(pf, 4) * NS_SCALE + + read_next_info(pf, 4) * pf->resolution; + uint32_t caplen = read_next_info(pf, 4); + uint32_t len = read_next_info(pf, 4); + + if (pf->err) { + WWW("end of pcap file after %d packets\n", + (int)pf->tot_pkt); + break; + } + if 
(cur_ts < prev_ts) { + WWW("reordered packet %d\n", + (int)pf->tot_pkt); + } + prev_ts = cur_ts; + (void)base; + if (pf->tot_pkt == 0) { + pf->first_ts = cur_ts; + first_len = len; + } + pf->tot_pkt++; + pf->tot_bytes += len; + pf->tot_bytes_rounded += pad(len) + sizeof(struct q_pkt); + pf->cur += caplen; + } + pf->total_tx_time = prev_ts - pf->first_ts; /* excluding first packet */ + ED("tot_pkt %lu tot_bytes %lu tx_time %.6f s first_len %lu", + (u_long)pf->tot_pkt, (u_long)pf->tot_bytes, + 1e-9*pf->total_tx_time, (u_long)first_len); + /* + * We determine that based on the + * average bandwidth of the trace, as follows + * first_pkt_ts = p[0].len / avg_bw + * In turn avg_bw = (total_len - p[0].len)/(p[n-1].ts - p[0].ts) + * so + * first_ts = p[0].ts - p[0].len * (p[n-1].ts - p[0].ts) / (total_len - p[0].len) + */ + if (pf->tot_bytes == first_len) { + /* cannot estimate bandwidth, so force 1 Gbit */ + first_pkt_time = first_len * 8; /* * 10^9 / bw */ + } else { + first_pkt_time = pf->total_tx_time * first_len / (pf->tot_bytes - first_len); + } + ED("first_pkt_time %.6f s", 1e-9*first_pkt_time); + pf->total_tx_time += first_pkt_time; + pf->first_ts -= first_pkt_time; + + /* all correct, allocate a record and copy */ + pf = calloc(1, sizeof(*pf)); + *pf = _f; + /* reset pointer to start */ + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->err = 0; + return pf; +} + +enum my_pcap_mode { PM_NONE, PM_FAST, PM_FIXED, PM_REAL }; + +int verbose = 0; + +static int do_abort = 0; + +#include <stdlib.h> +#include <stdio.h> +#include <pthread.h> +#include <sys/time.h> + +#include <sys/resource.h> // setpriority + +#ifdef __FreeBSD__ +#include <pthread_np.h> /* pthread w/ affinity */ +#include <sys/cpuset.h> /* cpu_set */ +#endif /* __FreeBSD__ */ + +#ifdef linux +#define cpuset_t cpu_set_t +#endif + +#ifdef __APPLE__ +#define cpuset_t uint64_t // XXX +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + 
*p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) ((void)a, 0) +#define sched_setscheduler(a, b, c) (1) /* error */ +#define clock_gettime(a,b) \ + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) + +#define _P64 unsigned long +#endif + +#ifndef _P64 + +/* we use uint64_t widely, but printf gives trouble on different + * platforms so we use _P64 as a cast + */ +#define _P64 uint64_t +#endif /* print stuff */ + + +struct _qs; /* forward */ +/* + * descriptor of a configuration entry. + * Each handler has a parse function which takes ac/av[] and returns + * true if successful. Any allocated space is stored into struct _cfg * + * that is passed as argument. + * arg and arg_len are included for convenience. + */ +struct _cfg { + int (*parse)(struct _qs *, struct _cfg *, int ac, char *av[]); /* 0 ok, 1 on error */ + int (*run)(struct _qs *, struct _cfg *arg); /* 0 Ok, 1 on error */ + // int close(struct _qs *, void *arg); /* 0 Ok, 1 on error */ + + const char *optarg; /* command line argument. Initial value is the error message */ + /* placeholders for common values */ + void *arg; /* allocated memory if any */ + int arg_len; /* size of *arg in case a realloc is needed */ + uint64_t d[16]; /* static storage for simple cases */ + double f[4]; /* static storage for simple cases */ +}; + + +/* + * communication occurs through this data structure, with fields + * cache-aligned according to who are the readers/writers. + * + +The queue is an array of memory (buf) of size buflen (does not change). + +The producer uses 'tail' as an index in the queue to indicate +the first empty location (ie. after the last byte of data), +the consumer uses head to indicate the next byte to consume. + +For best performance we should align buffers and packets +to multiples of cacheline, but this would explode memory too much. +Worst case memory explosion is with 65 byte packets. 
+Memory usage as shown below: + + qpkt-pad + size 32-16 32-32 32-64 64-64 + + 64 96 96 96 128 + 65 112 128 160 192 + + +An empty queue has head == tail, a full queue will have free space +below a threshold. In our case the queue is large enough and we +are non blocking so we can simply drop traffic when the queue +approaches a full state. + +To simulate bandwidth limitations efficiently, the producer has a second +pointer, prod_tail_1, used to check for expired packets. This is done lazily. + + */ +/* + * When sizing the buffer, we must assume some value for the bandwidth. + * INFINITE_BW is supposed to be faster than what we support + */ +#define INFINITE_BW (200ULL*1000000*1000) +#define MY_CACHELINE (128ULL) +#define MAX_PKT (9200) /* max packet size */ + +#define ALIGN_CACHE __attribute__ ((aligned (MY_CACHELINE))) + +struct _qs { /* shared queue */ + uint64_t t0; /* start of times */ + + uint64_t buflen; /* queue length */ + char *buf; + + /* handlers for various options */ + struct _cfg c_delay; + struct _cfg c_bw; + struct _cfg c_loss; + + /* producer's fields */ + uint64_t tx ALIGN_CACHE; /* tx counter */ + uint64_t prod_tail_1; /* head of queue */ + uint64_t prod_head; /* cached copy */ + uint64_t prod_tail; /* cached copy */ + uint64_t prod_drop; /* drop packet count */ + uint64_t prod_max_gap; /* rx round duration */ + + struct nm_pcap_file *pcap; /* the pcap struct */ + + /* parameters for reading from the netmap port */ + struct nm_desc *src_port; /* netmap descriptor */ + const char * prod_ifname; /* interface name or pcap file */ + struct netmap_ring *rxring; /* current ring being handled */ + uint32_t si; /* ring index */ + int burst; + uint32_t rx_qmax; /* stats on max queued */ + + uint64_t qt_qout; /* queue exit time for last packet */ + /* + * when doing shaping, the software computes and stores here + * the time when the most recently queued packet will exit from + * the queue. 
+ */ + + uint64_t qt_tx; /* delay line exit time for last packet */ + /* + * The software computes the time at which the most recently + * queued packet exits from the queue. + * To avoid reordering, the next packet should exit at least + * at qt_tx + cur_tt + */ + + /* producer's fields controlling the queueing */ + const char * cur_pkt; /* current packet being analysed */ + uint32_t cur_len; /* length of current packet */ + uint32_t cur_caplen; /* captured length of current packet */ + + int cur_drop; /* 1 if current packet should be dropped. */ + /* + * cur_drop can be set as a result of the loss emulation, + * and may need to use the packet size, current time, etc. + */ + + uint64_t cur_tt; /* transmission time (ns) for current packet */ + /* + * The transmission time is how much link time the packet will consume. + * should be set by the function that does the bandwidth emulation, + * but could also be the result of a function that emulates the + * presence of competing traffic, MAC protocols etc. + * cur_tt is 0 for links with infinite bandwidth. + */ + + uint64_t cur_delay; /* delay (ns) for current packet from c_delay.run() */ + /* + * this should be set by the function that computes the extra delay + * applied to the packet. + * The code makes sure that there is no reordering and possibly + * bumps the output time as needed. 
+ */ + + + /* consumer's fields */ + const char * cons_ifname; + uint64_t rx ALIGN_CACHE; /* rx counter */ + uint64_t cons_head; /* cached copy */ + uint64_t cons_tail; /* cached copy */ + uint64_t cons_now; /* most recent producer timestamp */ + uint64_t rx_wait; /* stats */ + + /* shared fields */ + volatile uint64_t _tail ALIGN_CACHE ; /* producer writes here */ + volatile uint64_t _head ALIGN_CACHE ; /* consumer reads from here */ +}; + +struct pipe_args { + int wait_link; + + pthread_t cons_tid; /* main thread */ + pthread_t prod_tid; /* producer thread */ + + /* Affinity: */ + int cons_core; /* core for cons() */ + int prod_core; /* core for prod() */ + + struct nm_desc *pa; /* netmap descriptor */ + struct nm_desc *pb; + + struct _qs q; +}; + +#define NS_IN_S (1000000000ULL) // nanoseconds +#define TIME_UNITS NS_IN_S +/* set the thread affinity. */ +static int +setaffinity(int i) +{ + cpuset_t cpumask; + struct sched_param p; + + if (i == -1) + return 0; + + /* Set thread affinity affinity.*/ + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &cpumask) != 0) { + WWW("Unable to set affinity: %s", strerror(errno)); + } + if (setpriority(PRIO_PROCESS, 0, -10)) {; // XXX not meaningful + WWW("Unable to set priority: %s", strerror(errno)); + } + bzero(&p, sizeof(p)); + p.sched_priority = 10; // 99 on linux ? + // use SCHED_RR or SCHED_FIFO + if (sched_setscheduler(0, SCHED_RR, &p)) { + WWW("Unable to set scheduler: %s", strerror(errno)); + } + return 0; +} + + +/* + * set the timestamp from the clock, subtract t0 + */ +static inline void +set_tns_now(uint64_t *now, uint64_t t0) +{ + struct timespec t; + + clock_gettime(CLOCK_REALTIME, &t); // XXX precise on FreeBSD ? 
+ *now = (uint64_t)(t.tv_nsec + NS_IN_S * t.tv_sec); + *now -= t0; +} + + + +/* compare two timestamps */ +static inline int64_t +ts_cmp(uint64_t a, uint64_t b) +{ + return (int64_t)(a - b); +} + +/* create a packet descriptor */ +static inline struct q_pkt * +pkt_at(struct _qs *q, uint64_t ofs) +{ + return (struct q_pkt *)(q->buf + ofs); +} + + +/* + * we have already checked for room and prepared p->next + */ +static inline int +enq(struct _qs *q) +{ + struct q_pkt *p = pkt_at(q, q->prod_tail); + + /* hopefully prefetch has been done ahead */ + nm_pkt_copy(q->cur_pkt, (char *)(p+1), q->cur_caplen); + p->pktlen = q->cur_len; + p->pt_qout = q->qt_qout; + p->pt_tx = q->qt_tx; + p->next = q->prod_tail + pad(q->cur_len) + sizeof(struct q_pkt); + ND("enqueue len %d at %d new tail %ld qout %.6f tx %.6f", + q->cur_len, (int)q->prod_tail, p->next, + 1e-9*p->pt_qout, 1e-9*p->pt_tx); + q->prod_tail = p->next; + q->tx++; + return 0; +} + +/* + * simple handler for parameters not supplied + */ +static int +null_run_fn(struct _qs *q, struct _cfg *cfg) +{ + (void)q; + (void)cfg; + return 0; +} + + + +/* + * put packet data into the buffer. + * We read from the mmapped pcap file, construct header, copy + * the captured length of the packet and pad with zeroes. 
+ */ +static void * +pcap_prod(void *_pa) +{ + struct pipe_args *pa = _pa; + struct _qs *q = &pa->q; + struct nm_pcap_file *pf = q->pcap; /* already opened by readpcap */ + uint32_t loops, i, tot_pkts; + + /* data plus the loop record */ + uint64_t need; + uint64_t t_tx, tt, last_ts; /* last timestamp from trace */ + + /* + * For speed we make sure the trace is at least some 1000 packets, + * so we may need to loop the trace more than once (for short traces) + */ + loops = (1 + 10000 / pf->tot_pkt); + tot_pkts = loops * pf->tot_pkt; + need = loops * pf->tot_bytes_rounded + sizeof(struct q_pkt); + q->buf = calloc(1, need); + if (q->buf == NULL) { + D("alloc %ld bytes for queue failed, exiting",(_P64)need); + goto fail; + } + q->prod_head = q->prod_tail = 0; + q->buflen = need; + + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->err = 0; + + ED("--- start create %lu packets at tail %d", + (u_long)tot_pkts, (int)q->prod_tail); + last_ts = pf->first_ts; /* beginning of the trace */ + + q->qt_qout = 0; /* first packet out of the queue */ + + for (loops = 0, i = 0; i < tot_pkts && !do_abort; i++) { + const char *next_pkt; /* in the pcap buffer */ + uint64_t cur_ts; + + /* read values from the pcap buffer */ + cur_ts = read_next_info(pf, 4) * NS_SCALE + + read_next_info(pf, 4) * pf->resolution; + q->cur_caplen = read_next_info(pf, 4); + q->cur_len = read_next_info(pf, 4); + next_pkt = pf->cur + q->cur_caplen; + + /* prepare fields in q for the generator */ + q->cur_pkt = pf->cur; + /* initial estimate of tx time */ + q->cur_tt = cur_ts - last_ts; + // -pf->first_ts + loops * pf->total_tx_time - last_ts; + + if ((i % pf->tot_pkt) == 0) + ED("insert %5d len %lu cur_tt %.6f", + i, (u_long)q->cur_len, 1e-9*q->cur_tt); + + /* prepare for next iteration */ + pf->cur = next_pkt; + last_ts = cur_ts; + if (next_pkt == pf->lim) { //last pkt + pf->cur = pf->data + sizeof(struct pcap_file_header); + last_ts = pf->first_ts; /* beginning of the trace */ + loops++; + } + + 
q->c_loss.run(q, &q->c_loss); + if (q->cur_drop) + continue; + q->c_bw.run(q, &q->c_bw); + tt = q->cur_tt; + q->qt_qout += tt; +#if 0 + if (drop_after(q)) + continue; +#endif + q->c_delay.run(q, &q->c_delay); /* compute delay */ + t_tx = q->qt_qout + q->cur_delay; + ND(5, "tt %ld qout %ld tx %ld qt_tx %ld", tt, q->qt_qout, t_tx, q->qt_tx); + /* insure no reordering and spacing by transmission time */ + q->qt_tx = (t_tx >= q->qt_tx + tt) ? t_tx : q->qt_tx + tt; + enq(q); + + q->tx++; + ND("ins %d q->prod_tail = %lu", (int)insert, (unsigned long)q->prod_tail); + } + /* loop marker ? */ + ED("done q->prod_tail:%d",(int)q->prod_tail); + q->_tail = q->prod_tail; /* publish */ + + return NULL; +fail: + if (q->buf != NULL) { + free(q->buf); + } + nm_close(pa->pb); + return (NULL); +} + + +/* + * the consumer reads from the queue using head, + * advances it every now and then. + */ +static void * +cons(void *_pa) +{ + struct pipe_args *pa = _pa; + struct _qs *q = &pa->q; + int pending = 0; + uint64_t last_ts = 0; + + /* read the start of times in q->t0 */ + set_tns_now(&q->t0, 0); + /* set the time (cons_now) to clock - q->t0 */ + set_tns_now(&q->cons_now, q->t0); + q->cons_head = q->_head; + q->cons_tail = q->_tail; + while (!do_abort) { /* consumer, infinite */ + struct q_pkt *p = pkt_at(q, q->cons_head); + + __builtin_prefetch (q->buf + p->next); + + if (q->cons_head == q->cons_tail) { //reset record + ND("Transmission restarted"); + /* + * add to q->t0 the time for the last packet + */ + q->t0 += last_ts; + q->cons_head = 0; //restart from beginning of the queue + continue; + } + last_ts = p->pt_tx; + if (ts_cmp(p->pt_tx, q->cons_now) > 0) { + // packet not ready + q->rx_wait++; + /* the ioctl should be conditional */ + ioctl(pa->pb->fd, NIOCTXSYNC, 0); // XXX just in case + pending = 0; + usleep(20); + set_tns_now(&q->cons_now, q->t0); + continue; + } + /* XXX copy is inefficient but simple */ + pending++; + if (nm_inject(pa->pb, (char *)(p + 1), p->pktlen) == 0 || + 
pending > q->burst) { + RD(1, "inject failed len %d now %ld tx %ld h %ld t %ld next %ld", + (int)p->pktlen, (u_long)q->cons_now, (u_long)p->pt_tx, + (u_long)q->_head, (u_long)q->_tail, (u_long)p->next); + ioctl(pa->pb->fd, NIOCTXSYNC, 0); + pending = 0; + continue; + } + q->cons_head = p->next; + /* drain packets from the queue */ + q->rx++; + } + D("exiting on abort"); + return NULL; +} + +/* + * In case of pcap file as input, the program acts in 2 different + * phases. It first fill the queue and then starts the cons() + */ +static void * +nmreplay_main(void *_a) +{ + struct pipe_args *a = _a; + struct _qs *q = &a->q; + const char *cap_fname = q->prod_ifname; + + setaffinity(a->cons_core); + set_tns_now(&q->t0, 0); /* starting reference */ + if (cap_fname == NULL) { + goto fail; + } + q->pcap = readpcap(cap_fname); + if (q->pcap == NULL) { + EEE("unable to read file %s", cap_fname); + goto fail; + } + pcap_prod((void*)a); + destroy_pcap(q->pcap); + q->pcap = NULL; + a->pb = nm_open(q->cons_ifname, NULL, 0, NULL); + if (a->pb == NULL) { + EEE("cannot open netmap on %s", q->cons_ifname); + do_abort = 1; // XXX any better way ? + return NULL; + } + /* continue as cons() */ + WWW("prepare to send packets"); + usleep(1000); + cons((void*)a); + EEE("exiting on abort"); +fail: + if (q->pcap != NULL) { + destroy_pcap(q->pcap); + } + do_abort = 1; + return NULL; +} + + +static void +sigint_h(int sig) +{ + (void)sig; /* UNUSED */ + do_abort = 1; + signal(SIGINT, SIG_DFL); +} + + + +static void +usage(void) +{ + fprintf(stderr, + "usage: nmreplay [-v] [-D delay] [-B {[constant,]bps|ether,bps|real,speedup}] [-L loss]\n" + "\t[-b burst] -i ifa-or-pcap-file -i ifb\n"); + exit(1); +} + + +/*---- configuration handling ---- */ +/* + * support routine: split argument, returns ac and *av. + * av contains two extra entries, a NULL and a pointer + * to the entire string. 
+ */ +static char ** +split_arg(const char *src, int *_ac) +{ + char *my = NULL, **av = NULL, *seps = " \t\r\n,"; + int l, i, ac; /* number of entries */ + + if (!src) + return NULL; + l = strlen(src); + /* in the first pass we count fields, in the second pass + * we allocate the av[] array and a copy of the string + * and fill av[]. av[ac] = NULL, av[ac+1] + */ + for (;;) { + i = ac = 0; + ND("start pass %d: <%s>", av ? 1 : 0, my); + while (i < l) { + /* trim leading separator */ + while (i <l && strchr(seps, src[i])) + i++; + if (i >= l) + break; + ND(" pass %d arg %d: <%s>", av ? 1 : 0, ac, src+i); + if (av) /* in the second pass, set the result */ + av[ac] = my+i; + ac++; + /* skip string */ + while (i <l && !strchr(seps, src[i])) i++; + if (av) + my[i] = '\0'; /* write marker */ + } + if (!av) { /* end of first pass */ + ND("ac is %d", ac); + av = calloc(1, (l+1) + (ac + 2)*sizeof(char *)); + my = (char *)&(av[ac+2]); + strcpy(my, src); + } else { + break; + } + } + for (i = 0; i < ac; i++) { + NED("%d: <%s>", i, av[i]); + } + av[i++] = NULL; + av[i++] = my; + *_ac = ac; + return av; +} + + +/* + * apply a command against a set of functions, + * install a handler in *dst + */ +static int +cmd_apply(const struct _cfg *a, const char *arg, struct _qs *q, struct _cfg *dst) +{ + int ac = 0; + char **av; + int i; + + if (arg == NULL || *arg == '\0') + return 1; /* no argument may be ok */ + if (a == NULL || dst == NULL) { + ED("program error - invalid arguments"); + exit(1); + } + av = split_arg(arg, &ac); + if (av == NULL) + return 1; /* error */ + for (i = 0; a[i].parse; i++) { + struct _cfg x = a[i]; + const char *errmsg = x.optarg; + int ret; + + x.arg = NULL; + x.arg_len = 0; + bzero(&x.d, sizeof(x.d)); + ND("apply %s to %s", av[0], errmsg); + ret = x.parse(q, &x, ac, av); + if (ret == 2) /* not recognised */ + continue; + if (ret == 1) { + ED("invalid arguments: need '%s' have '%s'", + errmsg, arg); + break; + } + x.optarg = arg; + *dst = x; + return 0; + } + 
ED("arguments %s not recognised", arg); + free(av); + return 1; +} + +static struct _cfg delay_cfg[]; +static struct _cfg bw_cfg[]; +static struct _cfg loss_cfg[]; + +static uint64_t parse_bw(const char *arg); + +/* + * prodcons [options] + * accept separate sets of arguments for the two directions + * + */ + +static void +add_to(const char ** v, int l, const char *arg, const char *msg) +{ + for (; l > 0 && *v != NULL ; l--, v++); + if (l == 0) { + ED("%s %s", msg, arg); + exit(1); + } + *v = arg; +} + +int +main(int argc, char **argv) +{ + int ch, i, err=0; + +#define N_OPTS 1 + struct pipe_args bp[N_OPTS]; + const char *d[N_OPTS], *b[N_OPTS], *l[N_OPTS], *q[N_OPTS], *ifname[N_OPTS], *m[N_OPTS]; + const char *pcap_file[N_OPTS]; + int cores[4] = { 2, 8, 4, 10 }; /* default values */ + + bzero(&bp, sizeof(bp)); /* all data initially go here */ + bzero(d, sizeof(d)); + bzero(b, sizeof(b)); + bzero(l, sizeof(l)); + bzero(q, sizeof(q)); + bzero(m, sizeof(m)); + bzero(ifname, sizeof(ifname)); + bzero(pcap_file, sizeof(pcap_file)); + + + /* set default values */ + for (i = 0; i < N_OPTS; i++) { + struct _qs *q = &bp[i].q; + + q->burst = 128; + q->c_delay.optarg = "0"; + q->c_delay.run = null_run_fn; + q->c_loss.optarg = "0"; + q->c_loss.run = null_run_fn; + q->c_bw.optarg = "0"; + q->c_bw.run = null_run_fn; + } + + // Options: + // B bandwidth in bps + // D delay in seconds + // L loss probability + // f pcap file + // i interface name + // w wait link + // b batch size + // v verbose + // C cpu placement + + while ( (ch = getopt(argc, argv, "B:C:D:L:b:f:i:vw:")) != -1) { + switch (ch) { + default: + D("bad option %c %s", ch, optarg); + usage(); + break; + + case 'C': /* CPU placement, up to 4 arguments */ + { + int ac = 0; + char **av = split_arg(optarg, &ac); + if (ac == 1) { /* sequential after the first */ + cores[0] = atoi(av[0]); + cores[1] = cores[0] + 1; + cores[2] = cores[1] + 1; + cores[3] = cores[2] + 1; + } else if (ac == 2) { /* two sequential pairs */ + 
cores[0] = atoi(av[0]); + cores[1] = cores[0] + 1; + cores[2] = atoi(av[1]); + cores[3] = cores[2] + 1; + } else if (ac == 4) { /* four values */ + cores[0] = atoi(av[0]); + cores[1] = atoi(av[1]); + cores[2] = atoi(av[2]); + cores[3] = atoi(av[3]); + } else { + ED(" -C accepts 1, 2 or 4 comma separated arguments"); + usage(); + } + if (av) + free(av); + } + break; + + case 'B': /* bandwidth in bps */ + add_to(b, N_OPTS, optarg, "-B too many times"); + break; + + case 'D': /* delay in seconds (float) */ + add_to(d, N_OPTS, optarg, "-D too many times"); + break; + + case 'L': /* loss probability */ + add_to(l, N_OPTS, optarg, "-L too many times"); + break; + + case 'b': /* burst */ + bp[0].q.burst = atoi(optarg); + break; + + case 'f': /* pcap_file */ + add_to(pcap_file, N_OPTS, optarg, "-f too many times"); + break; + case 'i': /* interface */ + add_to(ifname, N_OPTS, optarg, "-i too many times"); + break; + case 'v': + verbose++; + break; + case 'w': + bp[0].wait_link = atoi(optarg); + break; + } + + } + + argc -= optind; + argv += optind; + + /* + * consistency checks for common arguments + * if pcap file has been provided we need just one interface, two otherwise + */ + if (!pcap_file[0]) { + ED("missing pcap file"); + usage(); + } + if (!ifname[0]) { + ED("missing interface"); + usage(); + } + if (bp[0].q.burst < 1 || bp[0].q.burst > 8192) { + WWW("invalid burst %d, set to 1024", bp[0].q.burst); + bp[0].q.burst = 1024; // XXX 128 is probably better + } + if (bp[0].wait_link > 100) { + ED("invalid wait_link %d, set to 4", bp[0].wait_link); + bp[0].wait_link = 4; + } + + bp[0].q.prod_ifname = pcap_file[0]; + bp[0].q.cons_ifname = ifname[0]; + + /* assign cores. 
prod and cons work better if on the same HT */ + bp[0].cons_core = cores[0]; + bp[0].prod_core = cores[1]; + ED("running on cores %d %d %d %d", cores[0], cores[1], cores[2], cores[3]); + + /* apply commands */ + for (i = 0; i < N_OPTS; i++) { /* once per queue */ + struct _qs *q = &bp[i].q; + err += cmd_apply(delay_cfg, d[i], q, &q->c_delay); + err += cmd_apply(bw_cfg, b[i], q, &q->c_bw); + err += cmd_apply(loss_cfg, l[i], q, &q->c_loss); + } + + pthread_create(&bp[0].cons_tid, NULL, nmreplay_main, (void*)&bp[0]); + signal(SIGINT, sigint_h); + sleep(1); + while (!do_abort) { + struct _qs olda = bp[0].q; + struct _qs *q0 = &bp[0].q; + + sleep(1); + ED("%ld -> %ld maxq %d round %ld", + (_P64)(q0->rx - olda.rx), (_P64)(q0->tx - olda.tx), + q0->rx_qmax, (_P64)q0->prod_max_gap + ); + ED("plr nominal %le actual %le", + (double)(q0->c_loss.d[0])/(1<<24), + q0->c_loss.d[1] == 0 ? 0 : + (double)(q0->c_loss.d[2])/q0->c_loss.d[1]); + bp[0].q.rx_qmax = (bp[0].q.rx_qmax * 7)/8; // ewma + bp[0].q.prod_max_gap = (bp[0].q.prod_max_gap * 7)/8; // ewma + } + D("exiting on abort"); + sleep(1); + + return (0); +} + +/* conversion factor for numbers. + * Each entry has a set of characters and conversion factor, + * the first entry should have an empty string and default factor, + * the final entry has s = NULL. 
+ */ +struct _sm { /* string and multiplier */ + char *s; + double m; +}; + +/* + * parse a generic value + */ +static double +parse_gen(const char *arg, const struct _sm *conv, int *err) +{ + double d; + char *ep; + int dummy; + + if (err == NULL) + err = &dummy; + *err = 0; + if (arg == NULL) + goto error; + d = strtod(arg, &ep); + if (ep == arg) { /* no value */ + ED("bad argument %s", arg); + goto error; + } + /* special case, no conversion */ + if (conv == NULL && *ep == '\0') + goto done; + ND("checking %s [%s]", arg, ep); + for (;conv->s; conv++) { + if (strchr(conv->s, *ep)) + goto done; + } +error: + *err = 1; /* unrecognised */ + return 0; + +done: + if (conv) { + ND("scale is %s %lf", conv->s, conv->m); + d *= conv->m; /* apply default conversion */ + } + ND("returning %lf", d); + return d; +} + +#define U_PARSE_ERR ~(0ULL) + +/* returns a value in nanoseconds */ +static uint64_t +parse_time(const char *arg) +{ + struct _sm a[] = { + {"", 1000000000 /* seconds */}, + {"n", 1 /* nanoseconds */}, {"u", 1000 /* microseconds */}, + {"m", 1000000 /* milliseconds */}, {"s", 1000000000 /* seconds */}, + {NULL, 0 /* seconds */} + }; + int err; + uint64_t ret = (uint64_t)parse_gen(arg, a, &err); + return err ? U_PARSE_ERR : ret; +} + + +/* + * parse a bandwidth, returns value in bps or U_PARSE_ERR if error. + */ +static uint64_t +parse_bw(const char *arg) +{ + struct _sm a[] = { + {"", 1}, {"kK", 1000}, {"mM", 1000000}, {"gG", 1000000000}, {NULL, 0} + }; + int err; + uint64_t ret = (uint64_t)parse_gen(arg, a, &err); + return err ? U_PARSE_ERR : ret; +} + + +/* + * For some function we need random bits. + * This is a wrapper to whatever function you want that returns + * 24 useful random bits. + */ + +#include <math.h> /* log, exp etc. 
*/ +static inline uint64_t +my_random24(void) /* 24 useful bits */ +{ + return random() & ((1<<24) - 1); +} + + +/*-------------- user-configuration -----------------*/ + +#if 0 /* start of comment block */ + +Here we place the functions to implement the various features +of the system. For each feature one should define a struct _cfg +(see at the beginning for definition) that refers a *_parse() function +to extract values from the command line, and a *_run() function +that is invoked on each packet to implement the desired function. + +Examples of the two functions are below. In general: + +- the *_parse() function takes argc/argv[], matches the function + name in argv[0], extracts the operating parameters, allocates memory + if needed, and stores them in the struct _cfg. + Return value is 2 if argv[0] is not recosnised, 1 if there is an + error in the arguments, 0 if all ok. + + On the command line, argv[] is a single, comma separated argument + that follow the specific option eg -D constant,20ms + + struct _cfg has some preallocated space (e.g an array of uint64_t) so simple + function can use that without having to allocate memory. + +- the *_run() function takes struct _q *q and struct _cfg *cfg as arguments. + *q contains all the informatio that may be possibly needed, including + those on the packet currently under processing. + The basic values are the following: + + char * cur_pkt points to the current packet (linear buffer) + uint32_t cur_len; length of the current packet + the functions are not supposed to modify these values + + int cur_drop; true if current packet must be dropped. + Must be set to non-zero by the loss emulation function + + uint64_t cur_delay; delay in nanoseconds for the current packet + Must be set by the delay emulation function + + More sophisticated functions may need to access other fields in *q, + see the structure description for that. + +When implementing a new function for a feature (e.g. for delay, +bandwidth, loss...) 
the struct _cfg should be added to the array +that contains all possible options. + + --- Specific notes --- + +DELAY emulation -D option_arguments + + If the option is not supplied, the system applies 0 extra delay + + The resolution for times is 1ns, the precision is load dependent and + generally in the order of 20-50us. + Times are in nanoseconds, can be followed by a character specifying + a different unit e.g. + + n nanoseconds + u microseconds + m milliseconds + s seconds + + Currently implemented options: + + constant,t constant delay equal to t + + uniform,tmin,tmax uniform delay between tmin and tmax + + exp,tavg,tmin exponential distribution with average tavg + and minimum tmin (corresponds to an exponential + distribution with argument 1/(tavg-tmin) ) + + +LOSS emulation -L option_arguments + + Loss is expressed as packet or bit error rate, which is an absolute + number between 0 and 1 (typically small). + + Currently implemented options + + plr,p uniform packet loss rate p, independent + of packet size + + burst,p,lmin,lmax burst loss with burst probability p and + burst length uniformly distributed between + lmin and lmax + + ber,p uniformly distributed bit error rate p, + so actual loss prob. depends on size. + +BANDWIDTH emulation -B option_arguments + + Bandwidths are expressed in bits per second, can be followed by a + character specifying a different unit e.g. 
+ + b/B bits per second + k/K kbits/s (10^3 bits/s) + m/M mbits/s (10^6 bits/s) + g/G gbits/s (10^9 bits/s) + + Currently implemented options + + const,b constant bw, excluding mac framing + ether,b constant bw, including ethernet framing + (20 bytes framing + 4 bytes crc) + real,[scale] use real time, optionally with a scaling factor + +#endif /* end of comment block */ + +/* + * Configuration options for delay + */ + +/* constant delay, also accepts just a number */ +static int +const_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t delay; + + (void)q; + if (strncmp(av[0], "const", 5) != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + delay = parse_time(av[ac - 1]); + if (delay == U_PARSE_ERR) + return 1; /* error */ + dst->d[0] = delay; + return 0; /* success */ +} + +/* runtime function, store the delay into q->cur_delay */ +static int +const_delay_run(struct _qs *q, struct _cfg *arg) +{ + q->cur_delay = arg->d[0]; /* the delay */ + return 0; +} + +static int +uniform_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t dmin, dmax; + + (void)q; + if (strcmp(av[0], "uniform") != 0) + return 2; /* not recognised */ + if (ac != 3) + return 1; /* error */ + dmin = parse_time(av[1]); + dmax = parse_time(av[2]); + if (dmin == U_PARSE_ERR || dmax == U_PARSE_ERR || dmin > dmax) + return 1; + D("dmin %ld dmax %ld", (_P64)dmin, (_P64)dmax); + dst->d[0] = dmin; + dst->d[1] = dmax; + dst->d[2] = dmax - dmin; + return 0; +} + +static int +uniform_delay_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t x = my_random24(); + q->cur_delay = arg->d[0] + ((arg->d[2] * x) >> 24); +#if 0 /* COMPUTE_STATS */ +#endif /* COMPUTE_STATS */ + return 0; +} + +/* + * exponential delay: Prob(delay = x) = exp(-x/d_av) + * gives a delay between 0 and infinity with average d_av + * The cumulative function is 1 - d_av exp(-x/d_av) + * + * The inverse function generates a uniform random number p in 0..1 
+ * and generates delay = (d_av-d_min) * -ln(1-p) + d_min + * + * To speed up behaviour at runtime we tabulate the values + */ + +static int +exp_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ +#define PTS_D_EXP 512 + uint64_t i, d_av, d_min, *t; /*table of values */ + + (void)q; + if (strcmp(av[0], "exp") != 0) + return 2; /* not recognised */ + if (ac != 3) + return 1; /* error */ + d_av = parse_time(av[1]); + d_min = parse_time(av[2]); + if (d_av == U_PARSE_ERR || d_min == U_PARSE_ERR || d_av < d_min) + return 1; /* error */ + d_av -= d_min; + dst->arg_len = PTS_D_EXP * sizeof(uint64_t); + dst->arg = calloc(1, dst->arg_len); + if (dst->arg == NULL) + return 1; /* no memory */ + t = (uint64_t *)dst->arg; + /* tabulate -ln(1-n)*delay for n in 0..1 */ + for (i = 0; i < PTS_D_EXP; i++) { + double d = -log2 ((double)(PTS_D_EXP - i) / PTS_D_EXP) * d_av + d_min; + t[i] = (uint64_t)d; + ND(5, "%ld: %le", i, d); + } + return 0; +} + +static int +exp_delay_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t *t = (uint64_t *)arg->arg; + q->cur_delay = t[my_random24() & (PTS_D_EXP - 1)]; + RD(5, "delay %lu", (_P64)q->cur_delay); + return 0; +} + + +/* unused arguments in configuration */ +#define _CFG_END NULL, 0, {0}, {0} + +static struct _cfg delay_cfg[] = { + { const_delay_parse, const_delay_run, + "constant,delay", _CFG_END }, + { uniform_delay_parse, uniform_delay_run, + "uniform,dmin,dmax # dmin <= dmax", _CFG_END }, + { exp_delay_parse, exp_delay_run, + "exp,dmin,davg # dmin <= davg", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; + +/* standard bandwidth, also accepts just a number */ +static int +const_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t bw; + + (void)q; + if (strncmp(av[0], "const", 5) != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + bw = parse_bw(av[ac - 1]); + if (bw == U_PARSE_ERR) { + return (ac == 2) ? 
1 /* error */ : 2 /* unrecognised */; + } + dst->d[0] = bw; + return 0; /* success */ +} + + +/* runtime function, store the delay into q->cur_delay */ +static int +const_bw_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t bps = arg->d[0]; + q->cur_tt = bps ? 8ULL* TIME_UNITS * q->cur_len / bps : 0 ; + return 0; +} + +/* ethernet bandwidth, add 672 bits per packet */ +static int +ether_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t bw; + + (void)q; + if (strcmp(av[0], "ether") != 0) + return 2; /* unrecognised */ + if (ac != 2) + return 1; /* error */ + bw = parse_bw(av[ac - 1]); + if (bw == U_PARSE_ERR) + return 1; /* error */ + dst->d[0] = bw; + return 0; /* success */ +} + + +/* runtime function, add 20 bytes (framing) + 4 bytes (crc) */ +static int +ether_bw_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t bps = arg->d[0]; + q->cur_tt = bps ? 8ULL * TIME_UNITS * (q->cur_len + 24) / bps : 0 ; + return 0; +} + +/* real bandwidth, plus scaling factor */ +static int +real_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double scale; + + (void)q; + if (strcmp(av[0], "real") != 0) + return 2; /* unrecognised */ + if (ac > 2) { /* second argument is optional */ + return 1; /* error */ + } else if (ac == 1) { + scale = 1; + } else { + int err = 0; + scale = parse_gen(av[ac-1], NULL, &err); + if (err || scale <= 0 || scale > 1000) + return 1; + } + ED("real -> scale is %.6f", scale); + dst->f[0] = scale; + return 0; /* success */ +} + +static int +real_bw_run(struct _qs *q, struct _cfg *arg) +{ + q->cur_tt /= arg->f[0]; + return 0; +} + +static struct _cfg bw_cfg[] = { + { const_bw_parse, const_bw_run, + "constant,bps", _CFG_END }, + { ether_bw_parse, ether_bw_run, + "ether,bps", _CFG_END }, + { real_bw_parse, real_bw_run, + "real,scale", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; + +/* + * loss patterns + */ +static int +const_plr_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double plr; + int 
err; + + (void)q; + if (strcmp(av[0], "plr") != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + // XXX to be completed + plr = parse_gen(av[ac-1], NULL, &err); + if (err || plr < 0 || plr > 1) + return 1; + dst->d[0] = plr * (1<<24); /* scale is 16m */ + if (plr != 0 && dst->d[0] == 0) + ED("WWW warning, rounding %le down to 0", plr); + return 0; /* success */ +} + +static int +const_plr_run(struct _qs *q, struct _cfg *arg) +{ + (void)arg; + uint64_t r = my_random24(); + q->cur_drop = r < arg->d[0]; +#if 1 /* keep stats */ + arg->d[1]++; + arg->d[2] += q->cur_drop; +#endif + return 0; +} + + +/* + * For BER the loss is 1- (1-ber)**bit_len + * The linear approximation is only good for small values, so we + * tabulate (1-ber)**len for various sizes in bytes + */ +static int +const_ber_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double ber, ber8, cur; + int i, err; + uint32_t *plr; + const uint32_t mask = (1<<24) - 1; + + (void)q; + if (strcmp(av[0], "ber") != 0) + return 2; /* unrecognised */ + if (ac != 2) + return 1; /* error */ + ber = parse_gen(av[ac-1], NULL, &err); + if (err || ber < 0 || ber > 1) + return 1; + dst->arg_len = MAX_PKT * sizeof(uint32_t); + plr = calloc(1, dst->arg_len); + if (plr == NULL) + return 1; /* no memory */ + dst->arg = plr; + ber8 = 1 - ber; + ber8 *= ber8; /* **2 */ + ber8 *= ber8; /* **4 */ + ber8 *= ber8; /* **8 */ + cur = 1; + for (i=0; i < MAX_PKT; i++, cur *= ber8) { + plr[i] = (mask + 1)*(1 - cur); + if (plr[i] > mask) + plr[i] = mask; +#if 0 + if (i>= 60) // && plr[i] < mask/2) + RD(50,"%4d: %le %ld", i, 1.0 - cur, (_P64)plr[i]); +#endif + } + dst->d[0] = ber * (mask + 1); + return 0; /* success */ +} + +static int +const_ber_run(struct _qs *q, struct _cfg *arg) +{ + int l = q->cur_len; + uint64_t r = my_random24(); + uint32_t *plr = arg->arg; + + if (l >= MAX_PKT) { + RD(5, "pkt len %d too large, trim to %d", l, MAX_PKT-1); + l = MAX_PKT-1; + } + q->cur_drop = r < plr[l]; 
+#if 1 /* keep stats */ + arg->d[1] += l * 8; + arg->d[2] += q->cur_drop; +#endif + return 0; +} + +static struct _cfg loss_cfg[] = { + { const_plr_parse, const_plr_run, + "plr,prob # 0 <= prob <= 1", _CFG_END }, + { const_ber_parse, const_ber_run, + "ber,prob # 0 <= prob <= 1", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 6d9bee6de634..168e022cfba9 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,8 +37,6 @@ * */ -// #define TRASH_VHOST_HDR - #define _GNU_SOURCE /* for CPU_SET() */ #include <stdio.h> #define NETMAP_WITH_LIBS @@ -49,12 +47,16 @@ #include <unistd.h> // sysconf() #include <sys/poll.h> #include <arpa/inet.h> /* ntohs */ +#ifndef _WIN32 #include <sys/sysctl.h> /* sysctl */ +#endif #include <ifaddrs.h> /* getifaddrs */ #include <net/ethernet.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <assert.h> +#include <math.h> #include <pthread.h> @@ -62,6 +64,69 @@ #include <pcap/pcap.h> #endif +#include "ctrs.h" + +#ifdef _WIN32 +#define cpuset_t DWORD_PTR //uint64_t +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + *p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) +#define TAP_CLONEDEV "/dev/tap" +#define AF_LINK 18 //defined in winsocks.h +#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME +#include <net/if_dl.h> + +/* + * Convert an ASCII representation of an ethernet address to + * binary form. 
+ */ +struct ether_addr * +ether_aton(const char *a) +{ + int i; + static struct ether_addr o; + unsigned int o0, o1, o2, o3, o4, o5; + + i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5); + + if (i != 6) + return (NULL); + + o.octet[0]=o0; + o.octet[1]=o1; + o.octet[2]=o2; + o.octet[3]=o3; + o.octet[4]=o4; + o.octet[5]=o5; + + return ((struct ether_addr *)&o); +} + +/* + * Convert a binary representation of an ethernet address to + * an ASCII string. + */ +char * +ether_ntoa(const struct ether_addr *n) +{ + int i; + static char a[18]; + + i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x", + n->octet[0], n->octet[1], n->octet[2], + n->octet[3], n->octet[4], n->octet[5]); + return (i < 17 ? NULL : (char *)&a); +} +#endif /* _WIN32 */ + #ifdef linux #define cpuset_t cpu_set_t @@ -169,10 +234,12 @@ struct glob_arg { int pkt_size; int burst; int forever; - int npackets; /* total packets to send */ + uint64_t npackets; /* total packets to send */ int frags; /* fragments per packet */ int nthreads; - int cpus; + int cpus; /* cpus used for running */ + int system_cpus; /* cpus on the system */ + int options; /* testing */ #define OPT_PREFETCH 1 #define OPT_ACCESS 2 @@ -181,10 +248,10 @@ struct glob_arg { #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ -#define OPT_MONITOR_TX 128 -#define OPT_MONITOR_RX 256 +#define OPT_RUBBISH 256 /* send wathever the buffers contain */ #define OPT_RANDOM_SRC 512 #define OPT_RANDOM_DST 1024 +#define OPT_PPS_STATS 2048 int dev_type; #ifndef NO_PCAP pcap_t *p; @@ -198,13 +265,18 @@ struct glob_arg { struct nm_desc *nmd; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); + int td_type; void *mmap_addr; char ifname[MAX_IFNAMELEN]; char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ int extra_bufs; /* goes in nr_arg3 */ + int extra_pipes; /* goes in nr_arg1 */ char 
*packet_file; /* -P option */ +#define STATS_WIN 15 + int win_idx; + int64_t win[STATS_WIN]; }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -220,7 +292,11 @@ struct targ { int cancel; int fd; struct nm_desc *nmd; - volatile uint64_t count; + /* these ought to be volatile, but they are + * only sampled and errors should not accumulate + */ + struct my_ctrs ctr; + struct timespec tic, toc; int me; pthread_t thread; @@ -327,11 +403,10 @@ sigint_h(int sig) int i; (void)sig; /* UNUSED */ - D("received control-C on thread %p", pthread_self()); + D("received control-C on thread %p", (void *)pthread_self()); for (i = 0; i < global_nthreads; i++) { targs[i].cancel = 1; } - signal(SIGINT, SIG_DFL); } /* sysctl wrapper to return the number of active CPUs */ @@ -345,6 +420,12 @@ system_ncpus(void) sysctl(mib, 2, &ncpus, &len, NULL, 0); #elif defined(linux) ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_WIN32) + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + ncpus = sysinfo.dwNumberOfProcessors; + } #else /* others */ ncpus = 1; #endif /* others */ @@ -518,10 +599,11 @@ wrapsum(u_int32_t sum) * Look for consecutive ascii representations of the size of the packet. */ static void -dump_payload(char *p, int len, struct netmap_ring *ring, int cur) +dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur) { char buf[128]; int i, j, i0; + const unsigned char *p = (const unsigned char *)_p; /* get the length in ASCII of the length of the packet. 
*/ @@ -629,6 +711,7 @@ initialize_packet(struct targ *targ) indirect_payload : default_payload; int i, l0 = strlen(payload); +#ifndef NO_PCAP char errbuf[PCAP_ERRBUF_SIZE]; pcap_t *file; struct pcap_pkthdr *header; @@ -650,6 +733,7 @@ initialize_packet(struct targ *targ) pcap_close(file); return; } +#endif /* create a nice NUL-terminated string */ for (i = 0; i < paylen; i += l0) { @@ -695,35 +779,49 @@ initialize_packet(struct targ *targ) eh->ether_type = htons(ETHERTYPE_IP); bzero(&pkt->vh, sizeof(pkt->vh)); -#ifdef TRASH_VHOST_HDR - /* set bogus content */ - pkt->vh.fields[0] = 0xff; - pkt->vh.fields[1] = 0xff; - pkt->vh.fields[2] = 0xff; - pkt->vh.fields[3] = 0xff; - pkt->vh.fields[4] = 0xff; - pkt->vh.fields[5] = 0xff; -#endif /* TRASH_VHOST_HDR */ // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } static void -set_vnet_hdr_len(struct targ *t) +get_vnet_hdr_len(struct glob_arg *g) { - int err, l = t->g->virt_header; + struct nmreq req; + int err; + + memset(&req, 0, sizeof(req)); + bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); + req.nr_version = NETMAP_API; + req.nr_cmd = NETMAP_VNET_HDR_GET; + err = ioctl(g->main_fd, NIOCREGIF, &req); + if (err) { + D("Unable to get virtio-net header length"); + return; + } + + g->virt_header = req.nr_arg1; + if (g->virt_header) { + D("Port requires virtio-net header, length = %d", + g->virt_header); + } +} + +static void +set_vnet_hdr_len(struct glob_arg *g) +{ + int err, l = g->virt_header; struct nmreq req; if (l == 0) return; memset(&req, 0, sizeof(req)); - bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); + bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); req.nr_version = NETMAP_API; req.nr_cmd = NETMAP_BDG_VNET_HDR; req.nr_arg1 = l; - err = ioctl(t->fd, NIOCREGIF, &req); + err = ioctl(g->main_fd, NIOCREGIF, &req); if (err) { - D("Unable to set vnet header length %d", l); + D("Unable to set virtio-net header length %d", l); } } @@ -763,12 +861,15 @@ send_packets(struct 
netmap_ring *ring, struct pkt *pkt, void *frame, for (fcnt = nfrags, sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); + int buf_changed = slot->flags & NS_BUF_CHANGED; slot->flags = 0; - if (options & OPT_INDIRECT) { + if (options & OPT_RUBBISH) { + /* do nothing */ + } else if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - slot->ptr = (uint64_t)frame; - } else if (options & OPT_COPY) { + slot->ptr = (uint64_t)((uintptr_t)frame); + } else if ((options & OPT_COPY) || buf_changed) { nm_pkt_copy(frame, p, size); if (fcnt == nfrags) update_addresses(pkt, g); @@ -798,6 +899,21 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, } /* + * Index of the highest bit set + */ +uint32_t +msb64(uint64_t x) +{ + uint64_t m = 1ULL << 63; + int i; + + for (i = 63; i >= 0; i--, m >>=1) + if (m & x) + return i; + return 0; +} + +/* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) 
@@ -810,25 +926,28 @@ pinger_body(void *data) struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; - int i, rx = 0, n = targ->g->npackets; + int i, rx = 0; void *frame; int size; - uint32_t sent = 0; struct timespec ts, now, last_print; - uint32_t count = 0, min = 1000000000, av = 0; + uint64_t sent = 0, n = targ->g->npackets; + uint64_t count = 0, t_cur, t_min = ~0, av = 0; + uint64_t buckets[64]; /* bins for delays, ns */ frame = &targ->pkt; frame += sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; + if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } + bzero(&buckets, sizeof(buckets)); clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); now = last_print; - while (n == 0 || (int)sent < n) { + while (!targ->cancel && (n == 0 || sent < n)) { struct netmap_ring *ring = NETMAP_TXRING(nifp, 0); struct netmap_slot *slot; char *p; @@ -864,6 +983,8 @@ pinger_body(void *data) while (!nm_ring_empty(ring)) { uint32_t seq; struct tstamp *tp; + int pos; + slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); @@ -878,12 +999,16 @@ pinger_body(void *data) ts.tv_nsec += 1000000000; ts.tv_sec--; } - if (1) D("seq %d/%d delta %d.%09d", seq, sent, + if (0) D("seq %d/%lu delta %d.%09d", seq, sent, (int)ts.tv_sec, (int)ts.tv_nsec); - if (ts.tv_nsec < (int)min) - min = ts.tv_nsec; + t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec; + if (t_cur < t_min) + t_min = t_cur; count ++; - av += ts.tv_nsec; + av += t_cur; + pos = msb64(t_cur); + buckets[pos]++; + /* now store it in a bucket */ ring->head = ring->cur = nm_ring_next(ring, ring->cur); rx++; } @@ -897,14 +1022,32 @@ pinger_body(void *data) ts.tv_sec--; } if (ts.tv_sec >= 1) { - D("count %d min %d av %d", - count, min, av/count); + D("count %d RTT: min %d av %d ns", + (int)count, (int)t_min, (int)(av/count)); + int k, j, kmin; + char buf[512]; + + for (kmin = 0; 
kmin < 64; kmin ++) + if (buckets[kmin]) + break; + for (k = 63; k >= kmin; k--) + if (buckets[k]) + break; + buf[0] = '\0'; + for (j = kmin; j <= k; j++) + sprintf(buf, "%s %5d", buf, (int)buckets[j]); + D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf); + bzero(&buckets, sizeof(buckets)); count = 0; av = 0; - min = 100000000; + t_min = ~0; last_print = now; } } + + /* reset the ``used`` flag. */ + targ->used = 0; + return NULL; } @@ -919,14 +1062,15 @@ ponger_body(void *data) struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; - int i, rx = 0, sent = 0, n = targ->g->npackets; + int i, rx = 0; + uint64_t sent = 0, n = targ->g->npackets; if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); return NULL; } - D("understood ponger %d but don't know how to do it", n); - while (n == 0 || sent < n) { + D("understood ponger %lu but don't know how to do it", n); + while (!targ->cancel && (n == 0 || sent < n)) { uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT @@ -975,69 +1119,17 @@ ponger_body(void *data) } } txring->head = txring->cur = txcur; - targ->count = sent; + targ->ctr.pkts = sent; #ifdef BUSYWAIT ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } - return NULL; -} - -static __inline int -timespec_ge(const struct timespec *a, const struct timespec *b) -{ - - if (a->tv_sec > b->tv_sec) - return (1); - if (a->tv_sec < b->tv_sec) - return (0); - if (a->tv_nsec >= b->tv_nsec) - return (1); - return (0); -} - -static __inline struct timespec -timeval2spec(const struct timeval *a) -{ - struct timespec ts = { - .tv_sec = a->tv_sec, - .tv_nsec = a->tv_usec * 1000 - }; - return ts; -} - -static __inline struct timeval -timespec2val(const struct timespec *a) -{ - struct timeval tv = { - .tv_sec = a->tv_sec, - .tv_usec = a->tv_nsec / 1000 - }; - return tv; -} + /* reset the ``used`` flag. 
*/ + targ->used = 0; -static __inline struct timespec -timespec_add(struct timespec a, struct timespec b) -{ - struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; - if (ret.tv_nsec >= 1000000000) { - ret.tv_sec++; - ret.tv_nsec -= 1000000000; - } - return ret; -} - -static __inline struct timespec -timespec_sub(struct timespec a, struct timespec b) -{ - struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; - if (ret.tv_nsec < 0) { - ret.tv_sec--; - ret.tv_nsec += 1000000000; - } - return ret; + return NULL; } @@ -1065,9 +1157,11 @@ sender_body(void *data) struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_if *nifp; - struct netmap_ring *txring; - int i, n = targ->g->npackets / targ->g->nthreads; - int64_t sent = 0; + struct netmap_ring *txring = NULL; + int i; + uint64_t n = targ->g->npackets / targ->g->nthreads; + uint64_t sent = 0; + uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; @@ -1104,7 +1198,9 @@ sender_body(void *data) sent++; update_addresses(pkt, targ->g); if (i > 10000) { - targ->count = sent; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = sent; i = 0; } } @@ -1117,7 +1213,9 @@ sender_body(void *data) sent++; update_addresses(pkt, targ->g); if (i > 10000) { - targ->count = sent; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = sent; i = 0; } } @@ -1126,7 +1224,7 @@ sender_body(void *data) int tosend = 0; int frags = targ->g->frags; - nifp = targ->nmd->nifp; + nifp = targ->nmd->nifp; while (!targ->cancel && (n == 0 || sent < n)) { if (rate_limit && tosend <= 0) { @@ -1138,6 +1236,13 @@ sender_body(void *data) /* * wait for available room in the send queue(s) */ +#ifdef BUSYWAIT + if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto 
quit; + } +#else /* !BUSYWAIT */ if (poll(&pfd, 1, 2000) <= 0) { if (targ->cancel) break; @@ -1146,9 +1251,11 @@ sender_body(void *data) // goto quit; } if (pfd.revents & POLLERR) { - D("poll error"); + D("poll error on %d ring %d-%d", pfd.fd, + targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } +#endif /* !BUSYWAIT */ /* * scan our queues and send on those with room */ @@ -1157,7 +1264,8 @@ sender_body(void *data) options &= ~OPT_COPY; } for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { - int m, limit = rate_limit ? tosend : targ->g->burst; + int m; + uint64_t limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); @@ -1171,7 +1279,11 @@ sender_body(void *data) ND("limit %d tail %d frags %d m %d", limit, txring->tail, frags, m); sent += m; - targ->count = sent; + if (m > 0) //XXX-ste: can m be 0? + event++; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = event; if (rate_limit) { tosend -= m; if (tosend <= 0) @@ -1182,13 +1294,13 @@ sender_body(void *data) /* flush any remaining packets */ D("flush tail %d head %d on thread %p", txring->tail, txring->head, - pthread_self()); + (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); - while (nm_tx_pending(txring)) { + while (!targ->cancel && nm_tx_pending(txring)) { RD(5, "pending tx tail %d head %d on ring %d", txring->tail, txring->head, i); ioctl(pfd.fd, NIOCTXSYNC, NULL); @@ -1199,8 +1311,9 @@ sender_body(void *data) clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; - targ->count = sent; - + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = event; quit: /* reset the ``used`` flag. 
*/ targ->used = 0; @@ -1214,17 +1327,22 @@ static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) { - int *count = (int *)user; - (void)h; /* UNUSED */ + struct my_ctrs *ctr = (struct my_ctrs *)user; (void)bytes; /* UNUSED */ - (*count)++; + ctr->bytes += h->len; + ctr->pkts++; } #endif /* !NO_PCAP */ + static int -receive_packets(struct netmap_ring *ring, u_int limit, int dump) +receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes) { u_int cur, rx, n; + uint64_t b = 0; + + if (bytes == NULL) + bytes = &b; cur = ring->cur; n = nm_ring_space(ring); @@ -1234,6 +1352,7 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump) struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); + *bytes += slot->len; if (dump) dump_payload(p, slot->len, ring, cur); @@ -1252,7 +1371,10 @@ receiver_body(void *data) struct netmap_if *nifp; struct netmap_ring *rxring; int i; - uint64_t received = 0; + struct my_ctrs cur; + + cur.pkts = cur.bytes = cur.events = cur.min_space = 0; + cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler if (setaffinity(targ->thread, targ->affinity)) goto quit; @@ -1273,24 +1395,36 @@ receiver_body(void *data) while (!targ->cancel) { char buf[MAX_BODYSIZE]; /* XXX should we poll ? */ - if (read(targ->g->main_fd, buf, sizeof(buf)) > 0) - targ->count++; + i = read(targ->g->main_fd, buf, sizeof(buf)); + if (i > 0) { + targ->ctr.pkts++; + targ->ctr.bytes += i; + targ->ctr.events++; + } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { while (!targ->cancel) { /* XXX should we poll ? 
*/ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, - (u_char *)&targ->count); + (u_char *)&targ->ctr); + targ->ctr.events++; } #endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; - nifp = targ->nmd->nifp; + nifp = targ->nmd->nifp; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ +#ifdef BUSYWAIT + if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto quit; + } +#else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ @@ -1301,26 +1435,39 @@ receiver_body(void *data) D("poll err"); goto quit; } - +#endif /* !BUSYWAIT */ + uint64_t cur_space = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); + /* compute free space in the ring */ + m = rxring->head + rxring->num_slots - rxring->tail; + if (m >= (int) rxring->num_slots) + m -= rxring->num_slots; + cur_space += m; if (nm_ring_empty(rxring)) continue; - m = receive_packets(rxring, targ->g->burst, dump); - received += m; + m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes); + cur.pkts += m; + if (m > 0) //XXX-ste: can m be 0? + cur.events++; } - targ->count = received; + cur.min_space = targ->ctr.min_space; + if (cur_space < cur.min_space) + cur.min_space = cur_space; + targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); +#if !defined(BUSYWAIT) out: +#endif targ->completed = 1; - targ->count = received; + targ->ctr = cur; quit: /* reset the ``used`` flag. */ @@ -1329,56 +1476,390 @@ quit: return (NULL); } -/* very crude code to print a number in normalized form. - * Caller has to make sure that the buffer is large enough. 
- */ -static const char * -norm(char *buf, double val) +static void * +txseq_body(void *data) +{ + struct targ *targ = (struct targ *) data; + struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; + struct netmap_ring *ring; + int64_t sent = 0; + uint64_t event = 0; + int options = targ->g->options | OPT_COPY; + struct timespec nexttime = {0, 0}; + int rate_limit = targ->g->tx_rate; + struct pkt *pkt = &targ->pkt; + int frags = targ->g->frags; + uint32_t sequence = 0; + int budget = 0; + void *frame; + int size; + + if (targ->g->nthreads > 1) { + D("can only txseq ping with 1 thread"); + return NULL; + } + + if (targ->g->npackets > 0) { + D("Ignoring -n argument"); + } + + frame = pkt; + frame += sizeof(pkt->vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; + + D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); + if (setaffinity(targ->thread, targ->affinity)) + goto quit; + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); + if (rate_limit) { + targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); + targ->tic.tv_nsec = 0; + wait_time(targ->tic); + nexttime = targ->tic; + } + + /* Only use the first queue. */ + ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring); + + while (!targ->cancel) { + int64_t limit; + unsigned int space; + unsigned int head; + int fcnt; + + if (!rate_limit) { + budget = targ->g->burst; + + } else if (budget <= 0) { + budget = targ->g->burst; + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); + } + + /* wait for available room in the send queue */ + if (poll(&pfd, 1, 2000) <= 0) { + if (targ->cancel) + break; + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); + } + if (pfd.revents & POLLERR) { + D("poll error on %d ring %d-%d", pfd.fd, + targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); + goto quit; + } + + /* If no room poll() again. 
*/ + space = nm_ring_space(ring); + if (!space) { + continue; + } + + limit = budget; + + if (space < limit) { + limit = space; + } + + /* Cut off ``limit`` to make sure is multiple of ``frags``. */ + if (frags > 1) { + limit = (limit / frags) * frags; + } + + limit = sent + limit; /* Convert to absolute. */ + + for (fcnt = frags, head = ring->head; + sent < limit; sent++, sequence++) { + struct netmap_slot *slot = &ring->slot[head]; + char *p = NETMAP_BUF(ring, slot->buf_idx); + + slot->flags = 0; + pkt->body[0] = sequence >> 24; + pkt->body[1] = (sequence >> 16) & 0xff; + pkt->body[2] = (sequence >> 8) & 0xff; + pkt->body[3] = sequence & 0xff; + nm_pkt_copy(frame, p, size); + if (fcnt == frags) { + update_addresses(pkt, targ->g); + } + + if (options & OPT_DUMP) { + dump_payload(p, size, ring, head); + } + + slot->len = size; + + if (--fcnt > 0) { + slot->flags |= NS_MOREFRAG; + } else { + fcnt = frags; + } + + if (sent == limit - 1) { + /* Make sure we don't push an incomplete + * packet. */ + assert(!(slot->flags & NS_MOREFRAG)); + slot->flags |= NS_REPORT; + } + + head = nm_ring_next(ring, head); + if (rate_limit) { + budget--; + } + } + + ring->cur = ring->head = head; + + event ++; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent * size; + targ->ctr.events = event; + } + + /* flush any remaining packets */ + D("flush tail %d head %d on thread %p", + ring->tail, ring->head, + (void *)pthread_self()); + ioctl(pfd.fd, NIOCTXSYNC, NULL); + + /* final part: wait the TX queues to become empty. */ + while (!targ->cancel && nm_tx_pending(ring)) { + RD(5, "pending tx tail %d head %d on ring %d", + ring->tail, ring->head, targ->nmd->first_tx_ring); + ioctl(pfd.fd, NIOCTXSYNC, NULL); + usleep(1); /* wait 1 tick */ + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + targ->completed = 1; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent * size; + targ->ctr.events = event; +quit: + /* reset the ``used`` flag. 
*/ + targ->used = 0; + + return (NULL); +} + + +static char * +multi_slot_to_string(struct netmap_ring *ring, unsigned int head, + unsigned int nfrags, char *strbuf, size_t strbuflen) { - char *units[] = { "", "K", "M", "G", "T" }; - u_int i; + unsigned int f; + char *ret = strbuf; - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) - val /= 1000; - sprintf(buf, "%.2f %s", val, units[i]); - return buf; + for (f = 0; f < nfrags; f++) { + struct netmap_slot *slot = &ring->slot[head]; + int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len, + slot->flags); + if (m >= (int)strbuflen) { + break; + } + strbuf += m; + strbuflen -= m; + + head = nm_ring_next(ring, head); + } + + return ret; } -static void -tx_output(uint64_t sent, int size, double delta) +static void * +rxseq_body(void *data) { - double bw, raw_bw, pps; - char b1[40], b2[80], b3[80]; + struct targ *targ = (struct targ *) data; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + int dump = targ->g->options & OPT_DUMP; + struct netmap_ring *ring; + unsigned int frags_exp = 1; + uint32_t seq_exp = 0; + struct my_ctrs cur; + unsigned int frags = 0; + int first_packet = 1; + int first_slot = 1; + int i; - printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n", - (unsigned long long)sent, size, delta); - if (delta == 0) - delta = 1e-6; - if (size < 60) /* correct for min packet size */ - size = 60; - pps = sent / delta; - bw = (8.0 * size * sent) / delta; - /* raw packets have4 bytes crc + 20 bytes framing */ - raw_bw = (8.0 * (size + 24) * sent) / delta; + cur.pkts = cur.bytes = cur.events = cur.min_space = 0; + cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler + + if (setaffinity(targ->thread, targ->affinity)) + goto quit; + + D("reading from %s fd %d main_fd %d", + targ->g->ifname, targ->fd, targ->g->main_fd); + /* unbounded wait for the first packet. 
*/ + for (;!targ->cancel;) { + i = poll(&pfd, 1, 1000); + if (i > 0 && !(pfd.revents & POLLERR)) + break; + RD(1, "waiting for initial packets, poll returns %d %d", + i, pfd.revents); + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); + + ring = NETMAP_RXRING(targ->nmd->nifp, targ->nmd->first_rx_ring); + + while (!targ->cancel) { + unsigned int head; + uint32_t seq; + int limit; + + /* Once we started to receive packets, wait at most 1 seconds + before quitting. */ + if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + targ->toc.tv_sec -= 1; /* Subtract timeout time. */ + goto out; + } + + if (pfd.revents & POLLERR) { + D("poll err"); + goto quit; + } + + if (nm_ring_empty(ring)) + continue; + + limit = nm_ring_space(ring); + if (limit > targ->g->burst) + limit = targ->g->burst; + +#if 0 + /* Enable this if + * 1) we remove the early-return optimization from + * the netmap poll implementation, or + * 2) pipes get NS_MOREFRAG support. + * With the current netmap implementation, an experiment like + * pkt-gen -i vale:1{1 -f txseq -F 9 + * pkt-gen -i vale:1}1 -f rxseq + * would get stuck as soon as we find nm_ring_space(ring) < 9, + * since here limit is rounded to 0 and + * pipe rxsync is not called anymore by the poll() of this loop. + */ + if (frags_exp > 1) { + int o = limit; + /* Cut off to the closest smaller multiple. 
*/ + limit = (limit / frags_exp) * frags_exp; + RD(2, "LIMIT %d --> %d", o, limit); + } +#endif + + for (head = ring->head, i = 0; i < limit; i++) { + struct netmap_slot *slot = &ring->slot[head]; + char *p = NETMAP_BUF(ring, slot->buf_idx); + int len = slot->len; + struct pkt *pkt; + + if (dump) { + dump_payload(p, slot->len, ring, head); + } - printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n", - norm(b1, pps), norm(b2, bw), norm(b3, raw_bw) ); + frags++; + if (!(slot->flags & NS_MOREFRAG)) { + if (first_packet) { + first_packet = 0; + } else if (frags != frags_exp) { + char prbuf[512]; + RD(1, "Received packets with %u frags, " + "expected %u, '%s'", frags, frags_exp, + multi_slot_to_string(ring, head-frags+1, frags, + prbuf, sizeof(prbuf))); + } + first_packet = 0; + frags_exp = frags; + frags = 0; + } + + p -= sizeof(pkt->vh) - targ->g->virt_header; + len += sizeof(pkt->vh) - targ->g->virt_header; + pkt = (struct pkt *)p; + + if ((char *)pkt + len < ((char *)pkt->body) + sizeof(seq)) { + RD(1, "%s: packet too small (len=%u)", __func__, + slot->len); + } else { + seq = (pkt->body[0] << 24) | (pkt->body[1] << 16) + | (pkt->body[2] << 8) | pkt->body[3]; + if (first_slot) { + /* Grab the first one, whatever it + is. */ + seq_exp = seq; + first_slot = 0; + } else if (seq != seq_exp) { + uint32_t delta = seq - seq_exp; + + if (delta < (0xFFFFFFFF >> 1)) { + RD(2, "Sequence GAP: exp %u found %u", + seq_exp, seq); + } else { + RD(2, "Sequence OUT OF ORDER: " + "exp %u found %u", seq_exp, seq); + } + seq_exp = seq; + } + seq_exp++; + } + + cur.bytes += slot->len; + head = nm_ring_next(ring, head); + cur.pkts++; + } + + ring->cur = ring->head = head; + + cur.events++; + targ->ctr = cur; + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + +out: + targ->completed = 1; + targ->ctr = cur; + +quit: + /* reset the ``used`` flag. 
*/ + targ->used = 0; + + return (NULL); } static void -rx_output(uint64_t received, double delta) +tx_output(struct my_ctrs *cur, double delta, const char *msg) { - double pps; - char b1[40]; + double bw, raw_bw, pps, abs; + char b1[40], b2[80], b3[80]; + int size; + + if (cur->pkts == 0) { + printf("%s nothing.\n", msg); + return; + } - printf("Received %llu packets, in %.2f seconds.\n", - (unsigned long long) received, delta); + size = (int)(cur->bytes / cur->pkts); + printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n", + msg, + (unsigned long long)cur->pkts, + (unsigned long long)cur->bytes, + (unsigned long long)cur->events, size, delta); if (delta == 0) delta = 1e-6; - pps = received / delta; - printf("Speed: %spps\n", norm(b1, pps)); + if (size < 60) /* correct for min packet size */ + size = 60; + pps = cur->pkts / delta; + bw = (8.0 * cur->bytes) / delta; + /* raw packets have4 bytes crc + 20 bytes framing */ + raw_bw = (8.0 * (cur->pkts * 24 + cur->bytes)) / delta; + abs = cur->pkts / (double)(cur->events); + + printf("Speed: %spps Bandwidth: %sbps (raw %sbps). 
Average batch: %.2f pkts\n", + norm(b1, pps), norm(b2, bw), norm(b3, raw_bw), abs); } static void @@ -1389,9 +1870,9 @@ usage(void) "Usage:\n" "%s arguments\n" "\t-i interface interface name\n" - "\t-f function tx rx ping pong\n" + "\t-f function tx rx ping pong txseq rxseq\n" "\t-n count number of iterations (can be 0)\n" - "\t-t pkts_to_send also forces tx mode\n" + "\t-t pkts_to_send also forces tx mode\n" "\t-r pkts_to_receive also forces rx mode\n" "\t-l pkt_size in bytes excluding CRC\n" "\t-d dst_ip[:port[-dst_ip:port]] single or range\n" @@ -1403,20 +1884,29 @@ usage(void) "\t-c cores cores to use\n" "\t-p threads processes/threads to use\n" "\t-T report_ms milliseconds between reports\n" - "\t-P use libpcap instead of netmap\n" "\t-w wait_for_link_time in seconds\n" "\t-R rate in packets per second\n" "\t-X dump payload\n" "\t-H len add empty virtio-net-header with size 'len'\n" + "\t-E pipes allocate extra space for a number of pipes\n" + "\t-r do not touch the buffers (send rubbish)\n" "\t-P file load packet from pcap file\n" "\t-z use random IPv4 src address/port\n" "\t-Z use random IPv4 dst address/port\n" + "\t-F num_frags send multi-slot packets\n" + "\t-A activate pps stats on receiver\n" "", cmd); exit(0); } +enum { + TD_TYPE_SENDER = 1, + TD_TYPE_RECEIVER, + TD_TYPE_OTHER, +}; + static void start_threads(struct glob_arg *g) { @@ -1439,33 +1929,32 @@ start_threads(struct glob_arg *g) uint64_t nmd_flags = 0; nmd.self = &nmd; - if (g->nthreads > 1) { - if (nmd.req.nr_flags != NR_REG_ALL_NIC) { - D("invalid nthreads mode %d", nmd.req.nr_flags); - continue; + if (i > 0) { + /* the first thread uses the fd opened by the main + * thread, the other threads re-open /dev/netmap + */ + if (g->nthreads > 1) { + nmd.req.nr_flags = + g->nmd->req.nr_flags & ~NR_REG_MASK; + nmd.req.nr_flags |= NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; } - nmd.req.nr_flags = NR_REG_ONE_NIC; - nmd.req.nr_ringid = i; - } - /* Only touch one of the rings (rx is already ok) */ - if 
(g->td_body == receiver_body) - nmd_flags |= NETMAP_NO_TX_POLL; - - /* register interface. Override ifname and ringid etc. */ - if (g->options & OPT_MONITOR_TX) - nmd.req.nr_flags |= NR_MONITOR_TX; - if (g->options & OPT_MONITOR_RX) - nmd.req.nr_flags |= NR_MONITOR_RX; + /* Only touch one of the rings (rx is already ok) */ + if (g->td_type == TD_TYPE_RECEIVER) + nmd_flags |= NETMAP_NO_TX_POLL; - t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | - NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); - if (t->nmd == NULL) { - D("Unable to open %s: %s", - t->g->ifname, strerror(errno)); - continue; + /* register interface. Override ifname and ringid etc. */ + t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); + continue; + } + } else { + t->nmd = g->nmd; } t->fd = t->nmd->fd; - set_vnet_hdr_len(t); } else { targs[i].fd = g->main_fd; @@ -1473,10 +1962,7 @@ start_threads(struct glob_arg *g) t->used = 1; t->me = i; if (g->affinity >= 0) { - if (g->affinity < g->cpus) - t->affinity = g->affinity; - else - t->affinity = i % g->cpus; + t->affinity = (g->affinity + i) % g->system_cpus; } else { t->affinity = -1; } @@ -1495,45 +1981,89 @@ main_thread(struct glob_arg *g) { int i; - uint64_t prev = 0; - uint64_t count = 0; + struct my_ctrs prev, cur; double delta_t; struct timeval tic, toc; - gettimeofday(&toc, NULL); + prev.pkts = prev.bytes = prev.events = 0; + gettimeofday(&prev.t, NULL); for (;;) { - struct timeval now, delta; - uint64_t pps, usec, my_count, npkts; + char b1[40], b2[40], b3[40], b4[70]; + uint64_t pps, usec; + struct my_ctrs x; + double abs; int done = 0; - delta.tv_sec = g->report_interval/1000; - delta.tv_usec = (g->report_interval%1000)*1000; - select(0, NULL, NULL, NULL, &delta); - gettimeofday(&now, NULL); - timersub(&now, &toc, &toc); - my_count = 0; + usec = wait_for_next_report(&prev.t, &cur.t, + g->report_interval); + + cur.pkts = 
cur.bytes = cur.events = 0; + cur.min_space = 0; + if (usec < 10000) /* too short to be meaningful */ + continue; + /* accumulate counts for all threads */ for (i = 0; i < g->nthreads; i++) { - my_count += targs[i].count; + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + cur.min_space += targs[i].ctr.min_space; + targs[i].ctr.min_space = 99999; if (targs[i].used == 0) done++; } - usec = toc.tv_sec* 1000000 + toc.tv_usec; - if (usec < 10000) - continue; - npkts = my_count - prev; - pps = (npkts*1000000 + usec/2) / usec; - D("%llu pps (%llu pkts in %llu usec)", - (unsigned long long)pps, - (unsigned long long)npkts, - (unsigned long long)usec); - prev = my_count; - toc = now; + x.pkts = cur.pkts - prev.pkts; + x.bytes = cur.bytes - prev.bytes; + x.events = cur.events - prev.events; + pps = (x.pkts*1000000 + usec/2) / usec; + abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; + + if (!(g->options & OPT_PPS_STATS)) { + strcpy(b4, ""); + } else { + /* Compute some pps stats using a sliding window. 
*/ + double ppsavg = 0.0, ppsdev = 0.0; + int nsamples = 0; + + g->win[g->win_idx] = pps; + g->win_idx = (g->win_idx + 1) % STATS_WIN; + + for (i = 0; i < STATS_WIN; i++) { + ppsavg += g->win[i]; + if (g->win[i]) { + nsamples ++; + } + } + ppsavg /= nsamples; + + for (i = 0; i < STATS_WIN; i++) { + if (g->win[i] == 0) { + continue; + } + ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); + } + ppsdev /= nsamples; + ppsdev = sqrt(ppsdev); + + snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", + norm(b1, ppsavg), norm(b2, ppsdev)); + } + + D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", + norm(b1, pps), b4, + norm(b2, (double)x.pkts), + norm(b3, (double)x.bytes*8), + (unsigned long long)usec, + abs, (int)cur.min_space); + prev = cur; + if (done == g->nthreads) break; } timerclear(&tic); timerclear(&toc); + cur.pkts = cur.bytes = cur.events = 0; + /* final round */ for (i = 0; i < g->nthreads; i++) { struct timespec t_tic, t_toc; /* @@ -1541,8 +2071,13 @@ main_thread(struct glob_arg *g) * file descriptors. */ if (targs[i].used) - pthread_join(targs[i].thread, NULL); - close(targs[i].fd); + pthread_join(targs[i].thread, NULL); /* blocking */ + if (g->dev_type == DEV_NETMAP) { + nm_close(targs[i].nmd); + targs[i].nmd = NULL; + } else { + close(targs[i].fd); + } if (targs[i].completed == 0) D("ouch, thread %d exited with error", i); @@ -1551,7 +2086,13 @@ main_thread(struct glob_arg *g) * Collect threads output and extract information about * how long it took to send all the packets. */ - count += targs[i].count; + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + /* collect the largest start (tic) and end (toc) times, + * XXX maybe we should do the earliest tic, or do a weighted + * average ? + */ t_tic = timeval2spec(&tic); t_toc = timeval2spec(&toc); if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) @@ -1563,29 +2104,26 @@ main_thread(struct glob_arg *g) /* print output. 
*/ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; - if (g->td_body == sender_body) - tx_output(count, g->pkt_size, delta_t); + if (g->td_type == TD_TYPE_SENDER) + tx_output(&cur, delta_t, "Sent"); else - rx_output(count, delta_t); - - if (g->dev_type == DEV_NETMAP) { - munmap(g->nmd->mem, g->nmd->req.nr_memsize); - close(g->main_fd); - } + tx_output(&cur, delta_t, "Received"); } - -struct sf { +struct td_desc { + int ty; char *key; void *f; }; -static struct sf func[] = { - { "tx", sender_body }, - { "rx", receiver_body }, - { "ping", pinger_body }, - { "pong", ponger_body }, - { NULL, NULL } +static struct td_desc func[] = { + { TD_TYPE_SENDER, "tx", sender_body }, + { TD_TYPE_RECEIVER, "rx", receiver_body }, + { TD_TYPE_OTHER, "ping", pinger_body }, + { TD_TYPE_OTHER, "pong", ponger_body }, + { TD_TYPE_SENDER, "txseq", txseq_body }, + { TD_TYPE_RECEIVER, "rxseq", rxseq_body }, + { 0, NULL, NULL } }; static int @@ -1654,6 +2192,8 @@ int main(int arc, char **argv) { int i; + struct sigaction sa; + sigset_t ss; struct glob_arg g; @@ -1665,6 +2205,7 @@ main(int arc, char **argv) g.main_fd = -1; g.td_body = receiver_body; + g.td_type = TD_TYPE_RECEIVER; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ @@ -1675,7 +2216,7 @@ main(int arc, char **argv) g.pkt_size = 60; g.burst = 512; // default g.nthreads = 1; - g.cpus = 1; + g.cpus = 1; // default g.forever = 1; g.tx_rate = 0; g.frags = 1; @@ -1683,8 +2224,8 @@ main(int arc, char **argv) g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:P:zZ")) != -1) { - struct sf *fn; + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:E:m:rP:zZA")) != -1) { + struct td_desc *fn; switch(ch) { default: @@ -1693,7 +2234,7 @@ main(int arc, char **argv) break; case 'n': - g.npackets = atoi(optarg); + g.npackets = strtoull(optarg, NULL, 10); break; case 'F': @@ -1710,10 +2251,12 @@ main(int arc, char 
**argv) if (!strcmp(fn->key, optarg)) break; } - if (fn->key) + if (fn->key) { g.td_body = fn->f; - else + g.td_type = fn->ty; + } else { D("unrecognised function %s", optarg); + } break; case 'o': /* data generation options */ @@ -1817,24 +2360,27 @@ main(int arc, char **argv) case 'e': /* extra bufs */ g.extra_bufs = atoi(optarg); break; - case 'm': - if (strcmp(optarg, "tx") == 0) { - g.options |= OPT_MONITOR_TX; - } else if (strcmp(optarg, "rx") == 0) { - g.options |= OPT_MONITOR_RX; - } else { - D("unrecognized monitor mode %s", optarg); - } + case 'E': + g.extra_pipes = atoi(optarg); break; case 'P': g.packet_file = strdup(optarg); break; + case 'm': + /* ignored */ + break; + case 'r': + g.options |= OPT_RUBBISH; + break; case 'z': g.options |= OPT_RANDOM_SRC; break; case 'Z': g.options |= OPT_RANDOM_DST; break; + case 'A': + g.options |= OPT_PPS_STATS; + break; } } @@ -1843,11 +2389,12 @@ main(int arc, char **argv) usage(); } - i = system_ncpus(); + g.system_cpus = i = system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); usage(); } +D("running on %d cpus (have %d)", g.cpus, i); if (g.cpus == 0) g.cpus = i; @@ -1914,6 +2461,11 @@ main(int arc, char **argv) if (g.extra_bufs) { base_nmd.nr_arg3 = g.extra_bufs; } + if (g.extra_pipes) { + base_nmd.nr_arg1 = g.extra_pipes; + } + + base_nmd.nr_flags |= NR_ACCEPT_VNET_HDR; /* * Open the netmap device using nm_open(). 
@@ -1927,13 +2479,38 @@ main(int arc, char **argv) D("Unable to open %s: %s", g.ifname, strerror(errno)); goto out; } + + if (g.nthreads > 1) { + struct nm_desc saved_desc = *g.nmd; + saved_desc.self = &saved_desc; + saved_desc.mem = NULL; + nm_close(g.nmd); + saved_desc.req.nr_flags &= ~NR_REG_MASK; + saved_desc.req.nr_flags |= NR_REG_ONE_NIC; + saved_desc.req.nr_ringid = 0; + g.nmd = nm_open(g.ifname, &base_nmd, NM_OPEN_IFNAME, &saved_desc); + if (g.nmd == NULL) { + D("Unable to open %s: %s", g.ifname, strerror(errno)); + goto out; + } + } g.main_fd = g.nmd->fd; D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); - /* get num of queues in tx or rx */ - if (g.td_body == sender_body) + if (g.virt_header) { + /* Set the virtio-net header length, since the user asked + * for it explicitely. */ + set_vnet_hdr_len(&g); + } else { + /* Check whether the netmap port we opened requires us to send + * and receive frames with virtio-net header. */ + get_vnet_hdr_len(&g); + } + + /* get num of queues in tx or rx */ + if (g.td_type == TD_TYPE_SENDER) devqueues = g.nmd->req.nr_tx_rings; - else + else devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. */ @@ -1951,25 +2528,27 @@ main(int arc, char **argv) req->nr_arg2); for (i = 0; i <= req->nr_tx_rings; i++) { struct netmap_ring *ring = NETMAP_TXRING(nifp, i); - D(" TX%d at 0x%lx slots %d", i, - (char *)ring - (char *)nifp, ring->num_slots); + D(" TX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); } for (i = 0; i <= req->nr_rx_rings; i++) { struct netmap_ring *ring = NETMAP_RXRING(nifp, i); - D(" RX%d at 0x%lx slots %d", i, - (char *)ring - (char *)nifp, ring->num_slots); + D(" RX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); } } /* Print some debug information. */ fprintf(stdout, "%s %s: %d queues, %d threads and %d cpus.\n", - (g.td_body == sender_body) ? "Sending on" : "Receiving from", + (g.td_type == TD_TYPE_SENDER) ? 
"Sending on" : + ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" : + "Working on"), g.ifname, devqueues, g.nthreads, g.cpus); - if (g.td_body == sender_body) { + if (g.td_type == TD_TYPE_SENDER) { fprintf(stdout, "%s -> %s (%s -> %s)\n", g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); @@ -1985,12 +2564,13 @@ out: if (g.options) { - D("--- SPECIAL OPTIONS:%s%s%s%s%s\n", + D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n", g.options & OPT_PREFETCH ? " prefetch" : "", g.options & OPT_ACCESS ? " access" : "", g.options & OPT_MEMCPY ? " memcpy" : "", g.options & OPT_INDIRECT ? " indirect" : "", - g.options & OPT_COPY ? " copy" : ""); + g.options & OPT_COPY ? " copy" : "", + g.options & OPT_RUBBISH ? " rubbish " : ""); } g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; @@ -2010,7 +2590,7 @@ out: g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } - if (g.td_body == sender_body) + if (g.td_type == TD_TYPE_SENDER) D("Sending %d packets every %ld.%09ld s", g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); /* Wait for PHY reset. */ @@ -2020,10 +2600,24 @@ out: /* Install ^C handler. 
*/ global_nthreads = g.nthreads; - signal(SIGINT, sigint_h); - + sigemptyset(&ss); + sigaddset(&ss, SIGINT); + /* block SIGINT now, so that all created threads will inherit the mask */ + if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { + D("failed to block SIGINT: %s", strerror(errno)); + } start_threads(&g); + /* Install the handler and re-enable SIGINT for the main thread */ + sa.sa_handler = sigint_h; + if (sigaction(SIGINT, &sa, NULL) < 0) { + D("failed to install ^C handler: %s", strerror(errno)); + } + + if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { + D("failed to re-enable SIGINT: %s", strerror(errno)); + } main_thread(&g); + free(targs); return 0; } diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index c9e5f31b9206..bf6e51fbde97 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -25,6 +25,10 @@ /* $FreeBSD$ */ +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> +#include <net/netmap.h> + #include <errno.h> #include <stdio.h> #include <inttypes.h> /* PRI* macros */ @@ -35,17 +39,9 @@ #include <sys/param.h> #include <sys/socket.h> /* apple needs sockaddr */ #include <net/if.h> /* ifreq */ -#include <net/netmap.h> -#include <net/netmap_user.h> #include <libgen.h> /* basename */ #include <stdlib.h> /* atoi, free */ -/* debug support */ -#define ND(format, ...) do {} while(0) -#define D(format, ...) 
\ - fprintf(stderr, "%s [%d] " format "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__) - /* XXX cut and paste from pkt-gen.c because I'm not sure whether this * program may include nm_util.h */ @@ -117,8 +113,11 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg, char *nmr_config) break; case NETMAP_BDG_ATTACH: case NETMAP_BDG_DETACH: - if (nr_arg && nr_arg != NETMAP_BDG_HOST) + nmr.nr_flags = NR_REG_ALL_NIC; + if (nr_arg && nr_arg != NETMAP_BDG_HOST) { + nmr.nr_flags = NR_REG_NIC_SW; nr_arg = 0; + } nmr.nr_arg1 = nr_arg; error = ioctl(fd, NIOCREGIF, &nmr); if (error == -1) { @@ -152,6 +151,36 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg, char *nmr_config) break; + case NETMAP_BDG_POLLING_ON: + case NETMAP_BDG_POLLING_OFF: + /* We reuse nmreq fields as follows: + * nr_tx_slots: 0 and non-zero indicate REG_ALL_NIC + * REG_ONE_NIC, respectively. + * nr_rx_slots: CPU core index. This also indicates the + * first queue in the case of REG_ONE_NIC + * nr_tx_rings: (REG_ONE_NIC only) indicates the + * number of CPU cores or the last queue + */ + nmr.nr_flags |= nmr.nr_tx_slots ? + NR_REG_ONE_NIC : NR_REG_ALL_NIC; + nmr.nr_ringid = nmr.nr_rx_slots; + /* number of cores/rings */ + if (nmr.nr_flags == NR_REG_ALL_NIC) + nmr.nr_arg1 = 1; + else + nmr.nr_arg1 = nmr.nr_tx_rings; + + error = ioctl(fd, NIOCREGIF, &nmr); + if (!error) + D("polling on %s %s", nmr.nr_name, + nr_cmd == NETMAP_BDG_POLLING_ON ? + "started" : "stopped"); + else + D("polling on %s %s (err %d)", nmr.nr_name, + nr_cmd == NETMAP_BDG_POLLING_ON ? 
+ "couldn't start" : "couldn't stop", error); + break; + default: /* GINFO */ nmr.nr_cmd = nmr.nr_arg1 = nmr.nr_arg2 = 0; error = ioctl(fd, NIOCGINFO, &nmr); @@ -173,7 +202,7 @@ main(int argc, char *argv[]) const char *command = basename(argv[0]); char *name = NULL, *nmr_config = NULL; - if (argc > 3) { + if (argc > 5) { usage: fprintf(stderr, "Usage:\n" @@ -186,12 +215,18 @@ usage: "\t-r interface interface name to be deleted\n" "\t-l list all or specified bridge's interfaces (default)\n" "\t-C string ring/slot setting of an interface creating by -n\n" + "\t-p interface start polling. Additional -C x,y,z configures\n" + "\t\t x: 0 (REG_ALL_NIC) or 1 (REG_ONE_NIC),\n" + "\t\t y: CPU core id for ALL_NIC and core/ring for ONE_NIC\n" + "\t\t z: (ONE_NIC only) num of total cores/rings\n" + "\t-P interface stop polling\n" "", command); return 0; } - while ((ch = getopt(argc, argv, "d:a:h:g:l:n:r:C:")) != -1) { - name = optarg; /* default */ + while ((ch = getopt(argc, argv, "d:a:h:g:l:n:r:C:p:P:")) != -1) { + if (ch != 'C') + name = optarg; /* default */ switch (ch) { default: fprintf(stderr, "bad option %c %s", ch, optarg); @@ -223,11 +258,17 @@ usage: case 'C': nmr_config = strdup(optarg); break; + case 'p': + nr_cmd = NETMAP_BDG_POLLING_ON; + break; + case 'P': + nr_cmd = NETMAP_BDG_POLLING_OFF; + break; } - if (optind != argc) { - // fprintf(stderr, "optind %d argc %d\n", optind, argc); - goto usage; - } + } + if (optind != argc) { + // fprintf(stderr, "optind %d argc %d\n", optind, argc); + goto usage; } if (argc == 1) nr_cmd = NETMAP_BDG_LIST; |
