diff options
| author | Luigi Rizzo <luigi@FreeBSD.org> | 2016-10-16 14:13:32 +0000 |
|---|---|---|
| committer | Luigi Rizzo <luigi@FreeBSD.org> | 2016-10-16 14:13:32 +0000 |
| commit | 37e3a6d349581b4dd0aebf24be7b1b159a698dcf (patch) | |
| tree | 0e61deea141c9733af511b0485cf1fd0f2dd17ed | |
| parent | 63f6b1a75a8e6e33e4f9d65571c6a221444d3b05 (diff) | |
Notes
27 files changed, 7995 insertions, 1977 deletions
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 index 4d4ed32ef75c..4c95da347596 100644 --- a/share/man/man4/netmap.4 +++ b/share/man/man4/netmap.4 @@ -33,10 +33,10 @@ .Sh NAME .Nm netmap .Nd a framework for fast packet I/O -.Pp +.br .Nm VALE .Nd a fast VirtuAl Local Ethernet using the netmap API -.Pp +.br .Nm netmap pipes .Nd a shared memory packet transport channel .Sh SYNOPSIS @@ -44,28 +44,49 @@ .Sh DESCRIPTION .Nm is a framework for extremely fast and efficient packet I/O -for both userspace and kernel clients. +for userspace and kernel clients, and for Virtual Machines. It runs on .Fx -and Linux, and includes -.Nm VALE , -a very fast and modular in-kernel software switch/dataplane, -and -.Nm netmap pipes , -a shared memory packet transport channel. -All these are accessed interchangeably with the same API. +Linux and some versions of Windows, and supports a variety of +.Nm netmap ports , +including +.Bl -tag -width XXXX +.It Nm physical NIC ports +to access individual queues of network interfaces; +.It Nm host ports +to inject packets into the host stack; +.It Nm VALE ports +implementing a very fast and modular in-kernel software switch/dataplane; +.It Nm netmap pipes +a shared memory packet transport channel; +.It Nm netmap monitors +a mechanism similar to +.Xr bpf +to capture traffic +.El .Pp -.Nm , -.Nm VALE -and -.Nm netmap pipes -are at least one order of magnitude faster than +All these +.Nm netmap ports +are accessed interchangeably with the same API, +and are at least one order of magnitude faster than standard OS mechanisms -(sockets, bpf, tun/tap interfaces, native switches, pipes), -reaching 14.88 million packets per second (Mpps) -with much less than one core on a 10 Gbit NIC, -about 20 Mpps per core for VALE ports, -and over 100 Mpps for netmap pipes. +(sockets, bpf, tun/tap interfaces, native switches, pipes). 
+With suitably fast hardware (NICs, PCIe buses, CPUs), +packet I/O using +.Nm +on supported NICs +reaches 14.88 million packets per second (Mpps) +with much less than one core on 10 Gbit/s NICs; +35-40 Mpps on 40 Gbit/s NICs (limited by the hardware); +about 20 Mpps per core for VALE ports; +and over 100 Mpps for +.Nm netmap pipes. +NICs without native +.Nm +support can still use the API in emulated mode, +which uses unmodified device drivers and is 3-5 times faster than +.Xr bpf +or raw sockets. .Pp Userspace clients can dynamically switch NICs into .Nm @@ -73,8 +94,10 @@ mode and send and receive raw packets through memory mapped buffers. Similarly, .Nm VALE -switch instances and ports, and +switch instances and ports, .Nm netmap pipes +and +.Nm netmap monitors can be created dynamically, providing high speed packet I/O between processes, virtual machines, NICs and the host stack. @@ -89,17 +112,17 @@ and standard OS mechanisms such as .Xr epoll 2 , and .Xr kqueue 2 . -.Nm VALE -and -.Nm netmap pipes +All types of +.Nm netmap ports +and the +.Nm VALE switch are implemented by a single kernel module, which also emulates the .Nm -API over standard drivers for devices without native -.Nm -support. +API over standard drivers. For best performance, .Nm -requires explicit support in device drivers. +requires native support in device drivers. +A list of such devices is at the end of this document. .Pp In the rest of this (long) manual page we document various aspects of the @@ -116,7 +139,7 @@ which can be connected to a physical interface to the host stack, or to a .Nm VALE -switch). +switch. Ports use preallocated circular queues of buffers .Em ( rings ) residing in an mmapped region. @@ -166,16 +189,18 @@ has multiple modes of operation controlled by the .Vt struct nmreq argument. .Va arg.nr_name -specifies the port name, as follows: +specifies the netmap port name, as follows: .Bl -tag -width XXXX .It Dv OS network interface name (e.g. 'em0', 'eth1', ... 
) the data path of the NIC is disconnected from the host stack, and the file descriptor is bound to the NIC (one or all queues), or to the host stack; -.It Dv valeXXX:YYY (arbitrary XXX and YYY) -the file descriptor is bound to port YYY of a VALE switch called XXX, -both dynamically created if necessary. -The string cannot exceed IFNAMSIZ characters, and YYY cannot +.It Dv valeSSS:PPP +the file descriptor is bound to port PPP of VALE switch SSS. +Switch instances and ports are dynamically created if necessary. +.br +Both SSS and PPP have the form [0-9a-zA-Z_]+ , the string +cannot exceed IFNAMSIZ characters, and PPP cannot be the name of any existing OS network interface. .El .Pp @@ -312,9 +337,6 @@ one slot is always kept empty. The ring size .Va ( num_slots ) should not be assumed to be a power of two. -.br -(NOTE: older versions of netmap used head/count format to indicate -the content of a ring). .Pp .Va head is the first slot available to userspace; @@ -585,6 +607,15 @@ it from the host stack. Multiple file descriptors can be bound to the same port, with proper synchronization left to the user. .Pp +The recommended way to bind a file descriptor to a port is +to use function +.Va nm_open(..) +(see +.Xr LIBRARIES ) +which parses names to access specific port types and +enable features. +In the following we document the main features. +.Pp .Dv NIOCREGIF can also bind a file descriptor to one endpoint of a .Em netmap pipe , consisting of two netmap ports with a crossover connection. @@ -734,7 +765,7 @@ similar to binds a file descriptor to a port. .Bl -tag -width XX .It Va ifname -is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a +is a port name, in the form "netmap:PPP" for a NIC and "valeSSS:PPP" for a .Nm VALE port. 
.It Va req @@ -774,28 +805,39 @@ similar to pcap_next(), fetches the next packet natively supports the following devices: .Pp On FreeBSD: +.Xr cxgbe 4 , .Xr em 4 , .Xr igb 4 , .Xr ixgbe 4 , +.Xr ixl 4 , .Xr lem 4 , .Xr re 4 . .Pp On Linux .Xr e1000 4 , .Xr e1000e 4 , +.Xr i40e 4 , .Xr igb 4 , .Xr ixgbe 4 , -.Xr mlx4 4 , -.Xr forcedeth 4 , .Xr r8169 4 . .Pp NICs without native support can still be used in .Nm mode through emulation. Performance is inferior to native netmap -mode but still significantly higher than sockets, and approaching +mode but still significantly higher than various raw socket types +(bpf, PF_PACKET, etc.). +Note that for slow devices (such as 1 Gbit/s and slower NICs, +or several 10 Gbit/s NICs whose hardware is unable that of in-kernel solutions such as Linux's .Xr pktgen . +When emulation is in use, packet sniffer programs such as tcpdump +could see received packets before they are diverted by netmap. This behaviour +is not intentional, being just an artifact of the implementation of emulation. +Note that in case the netmap application subsequently moves packets received +from the emulated adapter onto the host RX ring, the sniffer will intercept +those packets again, since the packets are injected to the host stack as they +were received by the network interface. .Pp Emulation is also available for devices with native netmap support, which can be used for testing or performance comparison. @@ -812,8 +854,12 @@ and module parameters on Linux .Bl -tag -width indent .It Va dev.netmap.admode: 0 Controls the use of native or emulated adapter mode. -0 uses the best available option, 1 forces native and -fails if not available, 2 forces emulated hence never fails. +.br +0 uses the best available option; +.br +1 forces native mode and fails if not available; +.br +2 forces emulated hence never fails. 
.It Va dev.netmap.generic_ringsize: 1024 Ring size used for emulated netmap mode .It Va dev.netmap.generic_mit: 100000 @@ -861,9 +907,9 @@ performance. uses .Xr select 2 , .Xr poll 2 , -.Xr epoll +.Xr epoll 2 and -.Xr kqueue +.Xr kqueue 2 to wake up processes when significant events occur, and .Xr mmap 2 to map memory. @@ -1015,8 +1061,8 @@ e.g. running the following in two different terminals: .Dl pkt-gen -i vale1:b -f tx # sender The same example can be used to test netmap pipes, by simply changing port names, e.g. -.Dl pkt-gen -i vale:x{3 -f rx # receiver on the master side -.Dl pkt-gen -i vale:x}3 -f tx # sender on the slave side +.Dl pkt-gen -i vale2:x{3 -f rx # receiver on the master side +.Dl pkt-gen -i vale2:x}3 -f tx # sender on the slave side .Pp The following command attaches an interface and the host stack to a switch: diff --git a/sys/conf/files b/sys/conf/files index 8e2ce6bb671d..c9bfc2e44fe4 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2187,6 +2187,7 @@ dev/nand/nfc_if.m optional nand dev/ncr/ncr.c optional ncr pci dev/ncv/ncr53c500.c optional ncv dev/ncv/ncr53c500_pccard.c optional ncv pccard +dev/netmap/if_ptnet.c optional netmap dev/netmap/netmap.c optional netmap dev/netmap/netmap_freebsd.c optional netmap dev/netmap/netmap_generic.c optional netmap @@ -2195,6 +2196,7 @@ dev/netmap/netmap_mem2.c optional netmap dev/netmap/netmap_monitor.c optional netmap dev/netmap/netmap_offloadings.c optional netmap dev/netmap/netmap_pipe.c optional netmap +dev/netmap/netmap_pt.c optional netmap dev/netmap/netmap_vale.c optional netmap # compile-with "${NORMAL_C} -Wconversion -Wextra" dev/nfsmb/nfsmb.c optional nfsmb pci diff --git a/sys/dev/netmap/if_ixl_netmap.h b/sys/dev/netmap/if_ixl_netmap.h index 2c7f9be541b3..223dc06e36ab 100644 --- a/sys/dev/netmap/if_ixl_netmap.h +++ b/sys/dev/netmap/if_ixl_netmap.h @@ -59,7 +59,7 @@ extern int ixl_rx_miss, ixl_rx_miss_bufs, ixl_crcstrip; /* * device-specific sysctl variables: * - * ixl_crcstrip: 0: keep CRC 
in rx frames (default), 1: strip it. + * ixl_crcstrip: 0: NIC keeps CRC in rx frames, 1: NIC strips it (default). * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. @@ -73,7 +73,7 @@ SYSCTL_DECL(_dev_netmap); */ #if 0 SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_crcstrip, - CTLFLAG_RW, &ixl_crcstrip, 1, "strip CRC on rx frames"); + CTLFLAG_RW, &ixl_crcstrip, 1, "NIC strips CRC on rx frames"); #endif SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_rx_miss, CTLFLAG_RW, &ixl_rx_miss, 0, "potentially missed rx intr"); diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 0ec9b1346609..1c2afbd18f10 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -81,6 +81,22 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff) } +static void +lem_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + EM_CORE_LOCK(adapter); + if (onoff) { + lem_enable_intr(adapter); + } else { + lem_disable_intr(adapter); + } + EM_CORE_UNLOCK(adapter); +} + + /* * Reconcile kernel and user view of the transmit ring. 
*/ @@ -99,10 +115,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint64_t *csbd = (uint64_t *)(csb + 1); -#endif /* NIC_PARAVIRT */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -113,19 +125,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ -#ifdef NIC_PARAVIRT - int do_kick = 0; - uint64_t t = 0; // timestamp - int n = head - nm_i; - if (n < 0) - n += lim + 1; - if (csb) { - t = rdtsc(); /* last timestamp */ - csbd[16] += t - csbd[0]; /* total Wg */ - csbd[17] += n; /* Wg count */ - csbd[0] = t; - } -#endif /* NIC_PARAVIRT */ nic_i = netmap_idx_k2n(kring, nm_i); while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; @@ -166,38 +165,8 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) { - t = rdtsc(); - if (csb->host_need_txkick == 2) { - /* can compute an update of delta */ - int64_t delta = t - csbd[3]; - if (delta < 0) - delta = -delta; - if (csbd[8] == 0 || delta < csbd[8]) { - csbd[8] = delta; - csbd[9]++; - } - csbd[10]++; - } - csb->guest_tdt = nic_i; - csbd[18] += t - csbd[0]; // total wp - csbd[19] += n; - } - if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1)) - do_kick = 1; - if (do_kick) -#endif /* NIC_PARAVIRT */ /* (re)start the tx unit up to slot nic_i (excluded) */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); -#ifdef NIC_PARAVIRT - if (do_kick) { - uint64_t t1 = rdtsc(); - csbd[20] += t1 - t; // total Np - csbd[21]++; - } -#endif /* NIC_PARAVIRT */ } /* @@ -206,93 +175,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) if (ticks != 
kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { kring->last_reclaim = ticks; /* record completed transmissions using TDH */ -#ifdef NIC_PARAVIRT - /* host updates tdh unconditionally, and we have - * no side effects on reads, so we can read from there - * instead of exiting. - */ - if (csb) { - static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0; - u_int x = adapter->next_tx_to_clean; - csbd[19]++; // XXX count reclaims - nic_i = csb->host_tdh; - if (csb->guest_csb_on) { - if (nic_i == x) { - bad++; - csbd[24]++; // failed reclaims - /* no progress, request kick and retry */ - csb->guest_need_txkick = 1; - mb(); // XXX barrier - nic_i = csb->host_tdh; - } else { - good++; - } - if (nic_i != x) { - csb->guest_need_txkick = 2; - if (nic_i == csb->guest_tdt) - drain++; - else - nodrain++; -#if 1 - if (netmap_adaptive_io) { - /* new mechanism: last half ring (or so) - * released one slot at a time. - * This effectively makes the system spin. - * - * Take next_to_clean + 1 as a reference. - * tdh must be ahead or equal - * On entry, the logical order is - * x < tdh = nic_i - * We first push tdh up to avoid wraps. - * The limit is tdh-ll (half ring). - * if tdh-256 < x we report x; - * else we report tdh-256 - */ - u_int tdh = nic_i; - u_int ll = csbd[15]; - u_int delta = lim/8; - if (netmap_adaptive_io == 2 || ll > delta) - csbd[15] = ll = delta; - else if (netmap_adaptive_io == 1 && ll > 1) { - csbd[15]--; - } - - if (nic_i >= kring->nkr_num_slots) { - RD(5, "bad nic_i %d on input", nic_i); - } - x = nm_next(x, lim); - if (tdh < x) - tdh += lim + 1; - if (tdh <= x + ll) { - nic_i = x; - csbd[25]++; //report n + 1; - } else { - tdh = nic_i; - if (tdh < ll) - tdh += lim + 1; - nic_i = tdh - ll; - csbd[26]++; // report tdh - ll - } - } -#endif - } else { - /* we stop, count whether we are idle or not */ - int bh_active = csb->host_need_txkick & 2 ? 
4 : 0; - csbd[27+ csb->host_need_txkick]++; - if (netmap_adaptive_io == 1) { - if (bh_active && csbd[15] > 1) - csbd[15]--; - else if (!bh_active && csbd[15] < lim/2) - csbd[15]++; - } - bad--; - fail++; - } - } - RD(1, "drain %d nodrain %d good %d retry %d fail %d", - drain, nodrain, good, bad, fail); - } else -#endif /* !NIC_PARAVIRT */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); @@ -324,21 +206,10 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint32_t csb_mode = csb && csb->guest_csb_on; - uint32_t do_host_rxkick = 0; -#endif /* NIC_PARAVIRT */ if (head > lim) return netmap_ring_reinit(kring); -#ifdef NIC_PARAVIRT - if (csb_mode) { - force_update = 1; - csb->guest_need_rxkick = 0; - } -#endif /* NIC_PARAVIRT */ /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -357,23 +228,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) uint32_t staterr = le32toh(curr->status); int len; -#ifdef NIC_PARAVIRT - if (csb_mode) { - if ((staterr & E1000_RXD_STAT_DD) == 0) { - /* don't bother to retry if more than 1 pkt */ - if (n > 1) - break; - csb->guest_need_rxkick = 1; - wmb(); - staterr = le32toh(curr->status); - if ((staterr & E1000_RXD_STAT_DD) == 0) { - break; - } else { /* we are good */ - csb->guest_need_rxkick = 0; - } - } - } else -#endif /* NIC_PARAVIRT */ if ((staterr & E1000_RXD_STAT_DD) == 0) break; len = le16toh(curr->length) - 4; // CRC @@ -390,18 +244,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ -#ifdef NIC_PARAVIRT - if (csb_mode) { - if (n > 1) { - /* leave one spare buffer so we avoid rxkicks */ - nm_i = nm_prev(nm_i, lim); - nic_i = 
nm_prev(nic_i, lim); - n--; - } else { - csb->guest_need_rxkick = 1; - } - } -#endif /* NIC_PARAVIRT */ ND("%d new packets at nic %d nm %d tail %d", n, adapter->next_rx_desc_to_check, @@ -440,10 +282,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) curr->status = 0; bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); -#ifdef NIC_PARAVIRT - if (csb_mode && csb->host_rxkick_at == nic_i) - do_host_rxkick = 1; -#endif /* NIC_PARAVIRT */ nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -455,12 +293,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) - csb->guest_rdt = nic_i; - if (!csb_mode || do_host_rxkick) -#endif /* NIC_PARAVIRT */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } @@ -486,6 +318,7 @@ lem_netmap_attach(struct adapter *adapter) na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; na.num_tx_rings = na.num_rx_rings = 1; + na.nm_intr = lem_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index 0f34e7218503..7986c9965173 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -53,7 +53,7 @@ void ixgbe_netmap_attach(struct adapter *adapter); /* * device-specific sysctl variables: * - * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. + * ix_crcstrip: 0: NIC keeps CRC in rx frames (default), 1: NIC strips it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. 
@@ -65,7 +65,7 @@ SYSCTL_DECL(_dev_netmap); static int ix_rx_miss, ix_rx_miss_bufs; int ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, - CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); + CTLFLAG_RW, &ix_crcstrip, 0, "NIC strips CRC on rx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, @@ -109,6 +109,20 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } +static void +ixgbe_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + IXGBE_CORE_LOCK(adapter); + if (onoff) { + ixgbe_enable_intr(adapter); // XXX maybe ixgbe_stop ? + } else { + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? + } + IXGBE_CORE_UNLOCK(adapter); +} /* * Register/unregister. We are already under netmap lock. @@ -311,7 +325,7 @@ ixgbe_netmap_txsync(struct netmap_kring *kring, int flags) * good way. */ nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_IS_VF(adapter) ? - IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); + IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -486,6 +500,7 @@ ixgbe_netmap_attach(struct adapter *adapter) na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + na.nm_intr = ixgbe_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index aff757bdadfe..d92d342af83c 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,9 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
+ * Copyright (C) 2011-2014 Matteo Landi + * Copyright (C) 2011-2016 Luigi Rizzo + * Copyright (C) 2011-2016 Giuseppe Lettieri + * Copyright (C) 2011-2016 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -133,13 +137,12 @@ ports attached to the switch) * > select()able file descriptor on which events are reported. * * Internally, we allocate a netmap_priv_d structure, that will be - * initialized on ioctl(NIOCREGIF). + * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d + * structure for each open(). * * os-specific: - * FreeBSD: netmap_open (netmap_freebsd.c). The priv is - * per-thread. - * linux: linux_netmap_open (netmap_linux.c). The priv is - * per-open. + * FreeBSD: see netmap_open() (netmap_freebsd.c) + * linux: see linux_netmap_open() (netmap_linux.c) * * > 2. on each descriptor, the process issues an ioctl() to identify * > the interface that should report events to the file descriptor. @@ -299,18 +302,17 @@ ports attached to the switch) * netmap_transmit() * na->nm_notify == netmap_notify() * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat + * kring->nm_sync() == netmap_rxsync_from_host * netmap_rxsync_from_host(na, NULL, NULL) * - tx to host stack * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ?? 
XXX + * nm_os_send_up() + * FreeBSD: na->if_input() == ether_input() * linux: netif_rx() with NM_MAGIC_PRIORITY_RX * * - * * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- * * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach() @@ -319,10 +321,11 @@ ports attached to the switch) * concurrently: * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == generic_netmap_txsync() - * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX - * generic_ndo_start_xmit() - * orig. dev. start_xmit - * FreeBSD: na->if_transmit() == orig. dev if_transmit + * nm_os_generic_xmit_frame() + * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * gna->save_start_xmit == orig. dev. start_xmit + * FreeBSD: na->if_transmit() == orig. dev if_transmit * 2) generic_mbuf_destructor() * na->nm_notify() == netmap_notify() * - rx from netmap userspace: @@ -333,24 +336,15 @@ ports attached to the switch) * generic_rx_handler() * mbq_safe_enqueue() * na->nm_notify() == netmap_notify() - * - rx from host stack: - * concurrently: + * - rx from host stack + * FreeBSD: same as native + * Linux: same as native except: * 1) host stack - * linux: generic_ndo_start_xmit() - * netmap_transmit() - * FreeBSD: ifp->if_input() == netmap_transmit - * both: - * na->nm_notify() == netmap_notify() - * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat - * netmap_rxsync_from_host(na, NULL, NULL) - * - tx to host stack: - * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat - * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ??? 
XXX - * linux: netif_rx() with NM_MAGIC_PRIORITY_RX + * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * netmap_transmit() + * na->nm_notify() == netmap_notify() + * - tx to host stack (same as native): * * * -= VALE =- @@ -371,7 +365,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * - system device with generic support: @@ -384,7 +378,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * (all cases) --> nm_bdg_flush() @@ -407,7 +401,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync_locked() * * - system device with generic adapter: @@ -418,7 +412,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync() * */ @@ -455,29 +449,19 @@ ports attached to the switch) #include <sys/refcount.h> -/* reduce conditional code */ -// linux API, use for the knlist in FreeBSD -/* use a private mutex for the knlist */ -#define init_waitqueue_head(x) do { \ - struct mtx *m = &(x)->m; \ - mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); \ - knlist_init_mtx(&(x)->si.si_note, m); \ - } while (0) - -#define OS_selrecord(a, b) selrecord(a, &((b)->si)) -#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) - #elif defined(linux) #include "bsd_glue.h" - - #elif defined(__APPLE__) #warning OSX support 
is only partial #include "osx_glue.h" +#elif defined (_WIN32) + +#include "win_glue.h" + #else #error Unsupported platform @@ -492,47 +476,72 @@ ports attached to the switch) #include <dev/netmap/netmap_mem2.h> -MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); - /* user-controlled variables */ int netmap_verbose; static int netmap_no_timestamp; /* don't timestamp on rxsync */ - -SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); -SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, - CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); -SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, - CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); int netmap_mitigate = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, - CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); int netmap_txsync_retry = 2; -SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, - &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); - int netmap_adaptive_io = 0; -SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, - &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); - int netmap_flags = 0; /* debug flags */ -int netmap_fwd = 0; /* force transparent mode */ +static int netmap_fwd = 0; /* force transparent mode */ /* * netmap_admode selects the netmap mode to use. * Invalid values are reset to NETMAP_ADMODE_BEST */ -enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; static int netmap_admode = NETMAP_ADMODE_BEST; -int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ -int netmap_generic_ringsize = 1024; /* Generic ringsize. */ -int netmap_generic_rings = 1; /* number of queues in generic. 
*/ +/* netmap_generic_mit controls mitigation of RX notifications for + * the generic netmap adapter. The value is a time interval in + * nanoseconds. */ +int netmap_generic_mit = 100*1000; + +/* We use by default netmap-aware qdiscs with generic netmap adapters, + * even if there can be a little performance hit with hardware NICs. + * However, using the qdisc is the safer approach, for two reasons: + * 1) it prevents non-fifo qdiscs to break the TX notification + * scheme, which is based on mbuf destructors when txqdisc is + * not used. + * 2) it makes it possible to transmit over software devices that + * change skb->dev, like bridge, veth, ... + * + * Anyway users looking for the best performance should + * use native adapters. + */ +int netmap_generic_txqdisc = 1; + +/* Default number of slots and queues for generic adapters. */ +int netmap_generic_ringsize = 1024; +int netmap_generic_rings = 1; + +/* Non-zero if ptnet devices are allowed to use virtio-net headers. */ +int ptnet_vnet_hdr = 1; + +/* + * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated + * in some other operating systems + */ +SYSBEGIN(main_init); + +SYSCTL_DECL(_dev_netmap); +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, + CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); +SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, + &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); +SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, + &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); @@ -540,19 +549,24 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , ""); + +SYSEND; NMG_LOCK_T netmap_global_lock; -int netmap_use_count = 0; /* number of active netmap instances */ /* * mark the ring as stopped, and run through the locks * to make sure other users get to see it. + * stopped must be either NR_KR_STOPPED (for unbounded stop) + * of NR_KR_LOCKED (brief stop for mutual exclusion purposes) */ static void -netmap_disable_ring(struct netmap_kring *kr) +netmap_disable_ring(struct netmap_kring *kr, int stopped) { - kr->nkr_stopped = 1; - nm_kr_get(kr); + nm_kr_stop(kr, stopped); + // XXX check if nm_kr_stop is sufficient mtx_lock(&kr->q_lock); mtx_unlock(&kr->q_lock); nm_kr_put(kr); @@ -563,7 +577,7 @@ void netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped) { if (stopped) - netmap_disable_ring(NMR(na, t) + ring_id); + netmap_disable_ring(NMR(na, t) + ring_id, stopped); else NMR(na, t)[ring_id].nkr_stopped = 0; } @@ -590,13 +604,14 @@ netmap_set_all_rings(struct netmap_adapter *na, int stopped) * Convenience function used in drivers. Waits for current txsync()s/rxsync()s * to finish and prevents any new one from starting. Call this before turning * netmap mode off, or before removing the hardware rings (e.g., on module - * onload). As a rule of thumb for linux drivers, this should be placed near - * each napi_disable(). + * onload). 
*/ void netmap_disable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 1 /* stopped */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), NM_KR_STOPPED); + } } /* @@ -607,9 +622,34 @@ netmap_disable_all_rings(struct ifnet *ifp) void netmap_enable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 0 /* enabled */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), 0 /* enabled */); + } +} + +void +netmap_make_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } } +void +netmap_undo_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + if (na->na_flags & NAF_ZOMBIE) { + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags &= ~NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } + } +} /* * generic bound_checking function @@ -727,28 +767,9 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -static void netmap_txsync_to_host(struct netmap_adapter *na); -static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); - -/* kring->nm_sync callback for the host tx ring */ -static int -netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_txsync_to_host(kring->na); - return 0; -} - -/* kring->nm_sync callback for the host rx ring */ -static int -netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_rxsync_from_host(kring->na, NULL, NULL); - return 0; -} - - +/* nm_sync callbacks for the host rings */ +static int netmap_txsync_to_host(struct netmap_kring *kring, int flags); +static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags); /* create the krings array and initialize the fields common to all adapters. 
* The array layout is this: @@ -809,12 +830,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) kring->ring_id = i; kring->tx = t; kring->nkr_num_slots = ndesc; + kring->nr_mode = NKR_NETMAP_OFF; + kring->nr_pending_mode = NKR_NETMAP_OFF; if (i < nma_get_nrings(na, t)) { kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync); - } else if (i == na->num_tx_rings) { + } else { kring->nm_sync = (t == NR_TX ? - netmap_txsync_to_host_compat : - netmap_rxsync_from_host_compat); + netmap_txsync_to_host: + netmap_rxsync_from_host); } kring->nm_notify = na->nm_notify; kring->rhead = kring->rcur = kring->nr_hwcur = 0; @@ -822,14 +845,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) * IMPORTANT: Always keep one slot empty. */ kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0); - snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, + snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, nm_txrx2str(t), i); ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF); - init_waitqueue_head(&kring->si); + nm_os_selinfo_init(&kring->si); } - init_waitqueue_head(&na->si[t]); + nm_os_selinfo_init(&na->si[t]); } na->tailroom = na->rx_rings + n[NR_RX]; @@ -838,19 +861,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) } -#ifdef __FreeBSD__ -static void -netmap_knlist_destroy(NM_SELINFO_T *si) -{ - /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ - knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); - knlist_destroy(&si->si.si_note); - /* now we don't need the mutex anymore */ - mtx_destroy(&si->m); -} -#endif /* __FreeBSD__ */ - - /* undo the actions performed by netmap_krings_create */ /* call with NMG_LOCK held */ void @@ -860,12 +870,12 @@ netmap_krings_delete(struct netmap_adapter *na) enum txrx t; for_rx_tx(t) - netmap_knlist_destroy(&na->si[t]); + nm_os_selinfo_uninit(&na->si[t]); /* we rely on the krings layout described above */ for ( ; kring != na->tailroom; kring++) { mtx_destroy(&kring->q_lock); - netmap_knlist_destroy(&kring->si); + nm_os_selinfo_uninit(&kring->si); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; @@ -878,14 +888,14 @@ netmap_krings_delete(struct netmap_adapter *na) * them first. */ /* call with NMG_LOCK held */ -static void +void netmap_hw_krings_delete(struct netmap_adapter *na) { struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; ND("destroy sw mbq with len %d", mbq_len(q)); mbq_purge(q); - mbq_safe_destroy(q); + mbq_safe_fini(q); netmap_krings_delete(na); } @@ -898,29 +908,38 @@ netmap_hw_krings_delete(struct netmap_adapter *na) */ /* call with NMG_LOCK held */ static void netmap_unset_ringid(struct netmap_priv_d *); -static void netmap_rel_exclusive(struct netmap_priv_d *); -static void +static void netmap_krings_put(struct netmap_priv_d *); +void netmap_do_unregif(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; NMG_LOCK_ASSERT(); na->active_fds--; - /* release exclusive use if it was requested on regif */ - netmap_rel_exclusive(priv); - if (na->active_fds <= 0) { /* last instance */ - - if (netmap_verbose) - D("deleting last instance for %s", na->name); + /* unset nr_pending_mode and possibly release exclusive mode */ + netmap_krings_put(priv); #ifdef WITH_MONITOR + /* XXX check whether we have to do something with monitor + * when rings change nr_mode. 
*/ + if (na->active_fds <= 0) { /* walk through all the rings and tell any monitor * that the port is going to exit netmap mode */ netmap_monitor_stop(na); + } #endif + + if (na->active_fds <= 0 || nm_kring_pending(priv)) { + na->nm_register(na, 0); + } + + /* delete rings and buffers that are no longer needed */ + netmap_mem_rings_delete(na); + + if (na->active_fds <= 0) { /* last instance */ /* - * (TO CHECK) This function is only called + * (TO CHECK) We enter here * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. @@ -933,16 +952,16 @@ netmap_do_unregif(struct netmap_priv_d *priv) * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(na, 0); /* off, clear flags */ - /* Wake up any sleeping threads. netmap_poll will - * then return POLLERR - * XXX The wake up now must happen during *_down(), when - * we order all activities to stop. -gl - */ - /* delete rings and buffers */ - netmap_mem_rings_delete(na); + if (netmap_verbose) + D("deleting last instance for %s", na->name); + + if (nm_netmap_on(na)) { + D("BUG: netmap on while going to delete the krings"); + } + na->nm_krings_delete(na); } + /* possibily decrement counter of tx_si/rx_si users */ netmap_unset_ringid(priv); /* delete the nifp */ @@ -962,6 +981,20 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } +struct netmap_priv_d* +netmap_priv_new(void) +{ + struct netmap_priv_d *priv; + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return NULL; + priv->np_refs = 1; + nm_os_get_module(); + return priv; +} + /* * Destructor of the netmap_priv_d, called when the fd is closed * Action: undo all the things done by NIOCREGIF, @@ -971,22 +1004,22 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) * */ /* call with NMG_LOCK held */ -int -netmap_dtor_locked(struct 
netmap_priv_d *priv) +void +netmap_priv_delete(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; /* number of active references to this fd */ if (--priv->np_refs > 0) { - return 0; + return; } - netmap_use_count--; - if (!na) { - return 1; //XXX is it correct? + nm_os_put_module(); + if (na) { + netmap_do_unregif(priv); } - netmap_do_unregif(priv); - netmap_adapter_put(na); - return 1; + netmap_unget_na(na, priv->np_ifp); + bzero(priv, sizeof(*priv)); /* for safety */ + free(priv, M_DEVBUF); } @@ -995,15 +1028,10 @@ void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; - int last_instance; NMG_LOCK(); - last_instance = netmap_dtor_locked(priv); + netmap_priv_delete(priv); NMG_UNLOCK(); - if (last_instance) { - bzero(priv, sizeof(*priv)); /* for safety */ - free(priv, M_DEVBUF); - } } @@ -1036,14 +1064,19 @@ static void netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; + struct mbuf *head = NULL, *prev = NULL; /* send packets up, outside the lock */ while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); - NM_SEND_UP(dst, m); + prev = nm_os_send_up(dst, m, prev); + if (head == NULL) + head = prev; } - mbq_destroy(q); + if (head) + nm_os_send_up(dst, NULL, head); + mbq_fini(q); } @@ -1081,6 +1114,27 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) } } +static inline int +_nm_may_forward(struct netmap_kring *kring) +{ + return ((netmap_fwd || kring->ring->flags & NR_FORWARD) && + kring->na->na_flags & NAF_HOST_RINGS && + kring->tx == NR_RX); +} + +static inline int +nm_may_forward_up(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id != kring->na->num_rx_rings; +} + +static inline int +nm_may_forward_down(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id == kring->na->num_rx_rings; +} /* * Send to the NIC rings packets marked NS_FORWARD between @@ -1107,7 +1161,7 
@@ netmap_sw_to_nic(struct netmap_adapter *na) for (; rxcur != head && !nm_ring_empty(rdst); rxcur = nm_next(rxcur, src_lim) ) { struct netmap_slot *src, *dst, tmp; - u_int dst_cur = rdst->cur; + u_int dst_head = rdst->head; src = &rxslot[rxcur]; if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) @@ -1115,7 +1169,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) sent++; - dst = &rdst->slot[dst_cur]; + dst = &rdst->slot[dst_head]; tmp = *src; @@ -1126,7 +1180,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - rdst->cur = nm_next(dst_cur, dst_lim); + rdst->head = rdst->cur = nm_next(dst_head, dst_lim); } /* if (sent) XXX txsync ? */ } @@ -1140,10 +1194,10 @@ netmap_sw_to_nic(struct netmap_adapter *na) * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void -netmap_txsync_to_host(struct netmap_adapter *na) +static int +netmap_txsync_to_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; + struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct mbq q; @@ -1162,6 +1216,7 @@ netmap_txsync_to_host(struct netmap_adapter *na) kring->nr_hwtail -= lim + 1; netmap_send_up(na->ifp, &q); + return 0; } @@ -1171,17 +1226,15 @@ netmap_txsync_to_host(struct netmap_adapter *na) * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler - * (we know because td != NULL). + * (we know because sr != NULL). * - * NOTE: on linux, selrecord() is defined as a macro and uses pwait - * as an additional hidden argument. 
* returns the number of packets delivered to tx queues in * transparent mode, or a negative value if error */ static int -netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) +netmap_rxsync_from_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; + struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; @@ -1189,9 +1242,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai int ret = 0; struct mbq *q = &kring->rx_queue, fq; - (void)pwait; /* disable unused warnings */ - (void)td; - mbq_init(&fq); /* fq holds packets to be freed */ mbq_lock(q); @@ -1226,19 +1276,20 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* something was released */ - if (netmap_fwd || kring->ring->flags & NR_FORWARD) + if (nm_may_forward_down(kring)) { ret = netmap_sw_to_nic(na); + if (ret > 0) { + kring->nr_kflags |= NR_FORWARD; + ret = 0; + } + } kring->nr_hwcur = head; } - /* access copies of cur,tail in the kring */ - if (kring->rcur == kring->rtail && td) /* no bufs available */ - OS_selrecord(td, &kring->si); - mbq_unlock(q); mbq_purge(&fq); - mbq_destroy(&fq); + mbq_fini(&fq); return ret; } @@ -1267,17 +1318,14 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * */ - +static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) { /* generic support */ int i = netmap_admode; /* Take a snapshot. 
*/ struct netmap_adapter *prev_na; -#ifdef WITH_GENERIC - struct netmap_generic_adapter *gna; int error = 0; -#endif *na = NULL; /* default */ @@ -1285,7 +1333,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) i = netmap_admode = NETMAP_ADMODE_BEST; - if (NETMAP_CAPABLE(ifp)) { + if (NM_NA_VALID(ifp)) { prev_na = NA(ifp); /* If an adapter already exists, return it if * there are active file descriptors or if @@ -1310,10 +1358,9 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) /* If there isn't native support and netmap is not allowed * to use generic adapters, we cannot satisfy the request. */ - if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE) return EOPNOTSUPP; -#ifdef WITH_GENERIC /* Otherwise, create a generic adapter and return it, * saving the previously used netmap adapter, if any. * @@ -1328,25 +1375,12 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * the branches above. This ensures that we never override * a generic adapter with another generic adapter. */ - prev_na = NA(ifp); error = generic_netmap_attach(ifp); if (error) return error; *na = NA(ifp); - gna = (struct netmap_generic_adapter*)NA(ifp); - gna->prev = prev_na; /* save old na */ - if (prev_na != NULL) { - ifunit_ref(ifp->if_xname); - // XXX add a refcount ? - netmap_adapter_get(prev_na); - } - ND("Created generic NA %p (prev %p)", gna, gna->prev); - return 0; -#else /* !WITH_GENERIC */ - return EOPNOTSUPP; -#endif } @@ -1364,21 +1398,22 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * could not be allocated. * If successful, hold a reference to the netmap adapter. * - * No reference is kept on the real interface, which may then - * disappear at any time. + * If the interface specified by nmr is a system one, also keep + * a reference to it and return a valid *ifp. 
*/ int -netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create) { - struct ifnet *ifp = NULL; int error = 0; struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ + *ifp = NULL; NMG_LOCK_ASSERT(); - /* we cascade through all possible types of netmap adapter. + /* We cascade through all possible types of netmap adapter. * All netmap_get_*_na() functions return an error and an na, * with the following combinations: * @@ -1389,6 +1424,11 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * !0 !NULL impossible */ + /* try to see if this is a ptnetmap port */ + error = netmap_get_pt_host_na(nmr, na, create); + if (error || *na != NULL) + return error; + /* try to see if this is a monitor port */ error = netmap_get_monitor_na(nmr, na, create); if (error || *na != NULL) @@ -1413,12 +1453,12 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * This may still be a tap, a veth/epair, or even a * persistent VALE port. 
*/ - ifp = ifunit_ref(nmr->nr_name); - if (ifp == NULL) { + *ifp = ifunit_ref(nmr->nr_name); + if (*ifp == NULL) { return ENXIO; } - error = netmap_get_hw_na(ifp, &ret); + error = netmap_get_hw_na(*ifp, &ret); if (error) goto out; @@ -1426,15 +1466,42 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) netmap_adapter_get(ret); out: - if (error && ret != NULL) - netmap_adapter_put(ret); - - if (ifp) - if_rele(ifp); /* allow live unloading of drivers modules */ + if (error) { + if (ret) + netmap_adapter_put(ret); + if (*ifp) { + if_rele(*ifp); + *ifp = NULL; + } + } return error; } +/* undo netmap_get_na() */ +void +netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp) +{ + if (ifp) + if_rele(ifp); + if (na) + netmap_adapter_put(na); +} + + +#define NM_FAIL_ON(t) do { \ + if (unlikely(t)) { \ + RD(5, "%s: fail '" #t "' " \ + "h %d c %d t %d " \ + "rh %d rc %d rt %d " \ + "hc %d ht %d", \ + kring->name, \ + head, cur, ring->tail, \ + kring->rhead, kring->rcur, kring->rtail, \ + kring->nr_hwcur, kring->nr_hwtail); \ + return kring->nkr_num_slots; \ + } \ +} while (0) /* * validate parameters on entry for *_txsync() @@ -1449,11 +1516,9 @@ out: * * hwcur, rhead, rtail and hwtail are reliable */ -static u_int -nm_txsync_prologue(struct netmap_kring *kring) +u_int +nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { -#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; } - struct netmap_ring *ring = kring->ring; u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ u_int n = kring->nkr_num_slots; @@ -1463,35 +1528,34 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->nr_hwcur, kring->nr_hwtail, ring->head, ring->cur, ring->tail); #if 1 /* kernel sanity checks; but we can trust the kring. 
*/ - if (kring->nr_hwcur >= n || kring->rhead >= n || - kring->rtail >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* - * user sanity checks. We only use 'cur', - * A, B, ... are possible positions for cur: + * user sanity checks. We only use head, + * A, B, ... are possible positions for head: * - * 0 A cur B tail C n-1 - * 0 D tail E cur F n-1 + * 0 A rhead B rtail C n-1 + * 0 D rtail E rhead F n-1 * * B, F, D are valid. A, C, E are wrong */ if (kring->rtail >= kring->rhead) { /* want rhead <= head <= rtail */ - NM_ASSERT(head < kring->rhead || head > kring->rtail); + NM_FAIL_ON(head < kring->rhead || head > kring->rtail); /* and also head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* here rtail < rhead */ /* we need head outside rtail .. rhead */ - NM_ASSERT(head > kring->rtail && head < kring->rhead); + NM_FAIL_ON(head > kring->rtail && head < kring->rhead); /* two cases now: head <= rtail or head >= rhead */ if (head <= kring->rtail) { /* want head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* head >= rhead */ /* cur must be outside rtail..head */ - NM_ASSERT(cur > kring->rtail && cur < head); + NM_FAIL_ON(cur > kring->rtail && cur < head); } } if (ring->tail != kring->rtail) { @@ -1502,15 +1566,6 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->rhead = head; kring->rcur = cur; return head; - -error: - RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d", - kring->name, - head, cur, ring->tail, - kring->rhead, kring->rcur, kring->rtail, - kring->nr_hwcur, kring->nr_hwtail); - return n; -#undef NM_ASSERT } @@ -1525,10 +1580,9 @@ error: * hwcur and hwtail are reliable. 
* */ -static u_int -nm_rxsync_prologue(struct netmap_kring *kring) +u_int +nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { - struct netmap_ring *ring = kring->ring; uint32_t const n = kring->nkr_num_slots; uint32_t head, cur; @@ -1546,30 +1600,24 @@ nm_rxsync_prologue(struct netmap_kring *kring) cur = kring->rcur = ring->cur; /* read only once */ head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* user sanity checks */ if (kring->nr_hwtail >= kring->nr_hwcur) { /* want hwcur <= rhead <= hwtail */ - if (head < kring->nr_hwcur || head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail); /* and also rhead <= rcur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* we need rhead outside hwtail..hwcur */ - if (head < kring->nr_hwcur && head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail); /* two cases now: head <= hwtail or head >= hwcur */ if (head <= kring->nr_hwtail) { /* want head <= cur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* cur must be outside hwtail..head */ - if (cur < head && cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head && cur > kring->nr_hwtail); } } if (ring->tail != kring->rtail) { @@ -1579,13 +1627,6 @@ nm_rxsync_prologue(struct netmap_kring *kring) ring->tail = kring->rtail; } return head; - -error: - RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", - kring->nr_hwcur, - kring->rcur, kring->nr_hwtail, - kring->rhead, kring->rcur, ring->tail); - return n; } @@ -1659,6 +1700,7 @@ netmap_interp_ringid(struct netmap_priv_d 
*priv, uint16_t ringid, uint32_t flags struct netmap_adapter *na = priv->np_na; u_int j, i = ringid & NETMAP_RING_MASK; u_int reg = flags & NR_REG_MASK; + int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY }; enum txrx t; if (reg == NR_REG_DEFAULT) { @@ -1672,48 +1714,58 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags } D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - switch (reg) { - case NR_REG_ALL_NIC: - case NR_REG_PIPE_MASTER: - case NR_REG_PIPE_SLAVE: - for_rx_tx(t) { + + if ((flags & NR_PTNETMAP_HOST) && (reg != NR_REG_ALL_NIC || + flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) { + D("Error: only NR_REG_ALL_NIC supported with netmap passthrough"); + return EINVAL; + } + + for_rx_tx(t) { + if (flags & excluded_direction[t]) { + priv->np_qfirst[t] = priv->np_qlast[t] = 0; + continue; + } + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: priv->np_qfirst[t] = 0; priv->np_qlast[t] = nma_get_nrings(na, t); - } - ND("%s %d %d", "ALL/PIPE", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_SW: - case NR_REG_NIC_SW: - if (!(na->na_flags & NAF_HOST_RINGS)) { - D("host rings not supported"); - return EINVAL; - } - for_rx_tx(t) { + ND("ALL/PIPE: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } priv->np_qfirst[t] = (reg == NR_REG_SW ? nma_get_nrings(na, t) : 0); priv->np_qlast[t] = nma_get_nrings(na, t) + 1; - } - ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_ONE_NIC: - if (i >= na->num_tx_rings && i >= na->num_rx_rings) { - D("invalid ring id %d", i); - return EINVAL; - } - for_rx_tx(t) { + ND("%s: %s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", + nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } /* if not enough rings, use the first one */ j = i; if (j >= nma_get_nrings(na, t)) j = 0; priv->np_qfirst[t] = j; priv->np_qlast[t] = j + 1; + ND("ONE_NIC: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } - break; - default: - D("invalid regif type %d", reg); - return EINVAL; } priv->np_flags = (flags & ~NR_REG_MASK) | reg; @@ -1776,11 +1828,12 @@ netmap_unset_ringid(struct netmap_priv_d *priv) } -/* check that the rings we want to bind are not exclusively owned by a previous - * bind. If exclusive ownership has been requested, we also mark the rings. +/* Set the nr_pending_mode for the requested rings. + * If requested, also try to get exclusive access to the rings, provided + * the rings we want to bind are not exclusively owned by a previous bind. */ static int -netmap_get_exclusive(struct netmap_priv_d *priv) +netmap_krings_get(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1811,16 +1864,16 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } } - /* second round: increment usage cound and possibly - * mark as exclusive + /* second round: increment usage count (possibly marking them + * as exclusive) and set the nr_pending_mode */ - for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; kring->users++; if (excl) kring->nr_kflags |= NKR_EXCLUSIVE; + kring->nr_pending_mode = NKR_NETMAP_ON; } } @@ -1828,9 +1881,11 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } -/* undo netmap_get_ownership() */ +/* Undo netmap_krings_get(). This is done by clearing the exclusive mode + * if was asked on regif, and unset the nr_pending_mode if we are the + * last users of the involved rings. 
*/ static void -netmap_rel_exclusive(struct netmap_priv_d *priv) +netmap_krings_put(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1852,6 +1907,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) if (excl) kring->nr_kflags &= ~NKR_EXCLUSIVE; kring->users--; + if (kring->users == 0) + kring->nr_pending_mode = NKR_NETMAP_OFF; } } } @@ -1899,9 +1956,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * (put the adapter in netmap mode) * * This may be one of the following: - * (XXX these should be either all *_register or all *_reg 2014-03-15) * - * * netmap_hw_register (hw ports) + * * netmap_hw_reg (hw ports) * checks that the ifp is still there, then calls * the hardware specific callback; * @@ -1919,7 +1975,7 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * intercept the sync callbacks of the monitored * rings * - * * netmap_bwrap_register (bwraps) + * * netmap_bwrap_reg (bwraps) * cross-link the bwrap and hwna rings, * forward the request to the hwna, override * the hwna notify callback (to get the frames @@ -1948,7 +2004,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (na->active_fds == 0) { /* * If this is the first registration of the adapter, - * also create the netmap rings and their in-kernel view, + * create the in-kernel view of the netmap rings, * the netmap krings. 
*/ @@ -1960,39 +2016,48 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (error) goto err_drop_mem; - /* create all missing netmap rings */ - error = netmap_mem_rings_create(na); - if (error) - goto err_del_krings; } - /* now the kring must exist and we can check whether some - * previous bind has exclusive ownership on them + /* now the krings must exist and we can check whether some + * previous bind has exclusive ownership on them, and set + * nr_pending_mode */ - error = netmap_get_exclusive(priv); + error = netmap_krings_get(priv); if (error) - goto err_del_rings; + goto err_del_krings; + + /* create all needed missing netmap rings */ + error = netmap_mem_rings_create(na); + if (error) + goto err_rel_excl; /* in all cases, create a new netmap if */ nifp = netmap_mem_if_new(na); if (nifp == NULL) { error = ENOMEM; - goto err_rel_excl; + goto err_del_rings; } - na->active_fds++; - if (!nm_netmap_on(na)) { - /* Netmap not active, set the card in netmap mode - * and make it use the shared buffers. - */ + if (na->active_fds == 0) { /* cache the allocator info in the na */ - netmap_mem_get_lut(na->nm_mem, &na->na_lut); - ND("%p->na_lut == %p", na, na->na_lut.lut); - error = na->nm_register(na, 1); /* mode on */ - if (error) + error = netmap_mem_get_lut(na->nm_mem, &na->na_lut); + if (error) goto err_del_if; + ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal, + na->na_lut.objsize); } + if (nm_kring_pending(priv)) { + /* Some kring is switching mode, tell the adapter to + * react on this. */ + error = na->nm_register(na, 1); + if (error) + goto err_put_lut; + } + + /* Commit the reference. */ + na->active_fds++; + /* * advertise that the interface is ready by setting np_nifp. 
* The barrier is needed because readers (poll, *SYNC and mmap) @@ -2003,15 +2068,15 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, return 0; +err_put_lut: + if (na->active_fds == 0) + memset(&na->na_lut, 0, sizeof(na->na_lut)); err_del_if: - memset(&na->na_lut, 0, sizeof(na->na_lut)); - na->active_fds--; netmap_mem_if_delete(na, nifp); err_rel_excl: - netmap_rel_exclusive(priv); + netmap_krings_put(priv); err_del_rings: - if (na->active_fds == 0) - netmap_mem_rings_delete(na); + netmap_mem_rings_delete(na); err_del_krings: if (na->active_fds == 0) na->nm_krings_delete(na); @@ -2024,41 +2089,23 @@ err: /* - * update kring and ring at the end of txsync. + * update kring and ring at the end of rxsync/txsync. */ static inline void -nm_txsync_finalize(struct netmap_kring *kring) +nm_sync_finalize(struct netmap_kring *kring) { - /* update ring tail to what the kernel knows */ + /* + * Update ring tail to what the kernel knows + * After txsync: head/rhead/hwcur might be behind cur/rcur + * if no carrier. + */ kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* note, head/rhead/hwcur might be behind cur/rcur - * if no carrier - */ ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail); } - -/* - * update kring and ring at the end of rxsync - */ -static inline void -nm_rxsync_finalize(struct netmap_kring *kring) -{ - /* tell userspace that there might be new packets */ - //struct netmap_ring *ring = kring->ring; - ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, - kring->nr_hwtail); - kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* make a copy of the state for next round */ - kring->rhead = kring->ring->head; - kring->rcur = kring->ring->cur; -} - - - /* * ioctl(2) support for the "netmap" device. * @@ -2072,21 +2119,17 @@ nm_rxsync_finalize(struct netmap_kring *kring) * Return 0 on success, errno otherwise. 
*/ int -netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, - int fflag, struct thread *td) +netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td) { - struct netmap_priv_d *priv = NULL; struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; - int error; + struct ifnet *ifp = NULL; + int error = 0; u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; enum txrx t; - (void)dev; /* UNUSED */ - (void)fflag; /* UNUSED */ - if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; @@ -2101,15 +2144,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return EINVAL; } } - CURVNET_SET(TD_TO_VNET(td)); - - error = devfs_get_cdevpriv((void **)&priv); - if (error) { - CURVNET_RESTORE(); - /* XXX ENOENT should be impossible, since the priv - * is now created in the open */ - return (error == ENOENT ? ENXIO : error); - } switch (cmd) { case NIOCGINFO: /* return capabilities etc */ @@ -2125,10 +2159,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, u_int memflags; if (nmr->nr_name[0] != '\0') { + /* get a refcount */ - error = netmap_get_na(nmr, &na, 1 /* create */); - if (error) + error = netmap_get_na(nmr, &na, &ifp, 1 /* create */); + if (error) { + na = NULL; + ifp = NULL; break; + } nmd = na->nm_mem; /* get memory allocator */ } @@ -2145,8 +2183,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - netmap_adapter_put(na); } while (0); + netmap_unget_na(na, ifp); NMG_UNLOCK(); break; @@ -2156,9 +2194,25 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH || i == NETMAP_BDG_VNET_HDR || i == NETMAP_BDG_NEWIF - || i == NETMAP_BDG_DELIF) { + || i == NETMAP_BDG_DELIF + || i == NETMAP_BDG_POLLING_ON + || i == NETMAP_BDG_POLLING_OFF) { error = 
netmap_bdg_ctl(nmr, NULL); break; + } else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) { + error = ptnetmap_ctl(nmr, priv->np_na); + break; + } else if (i == NETMAP_VNET_HDR_GET) { + struct ifnet *ifp; + + NMG_LOCK(); + error = netmap_get_na(nmr, &na, &ifp, 0); + if (na && !error) { + nmr->nr_arg1 = na->virt_hdr_len; + } + netmap_unget_na(na, ifp); + NMG_UNLOCK(); + break; } else if (i != 0) { D("nr_cmd must be 0 not %d", i); error = EINVAL; @@ -2169,23 +2223,32 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, NMG_LOCK(); do { u_int memflags; + struct ifnet *ifp; if (priv->np_nifp != NULL) { /* thread already registered */ error = EBUSY; break; } /* find the interface and a reference */ - error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, &ifp, + 1 /* create */); /* keep reference */ if (error) break; if (NETMAP_OWNED_BY_KERN(na)) { - netmap_adapter_put(na); + netmap_unget_na(na, ifp); error = EBUSY; break; } + + if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) { + netmap_unget_na(na, ifp); + error = EIO; + break; + } + error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags); if (error) { /* reg. 
failed, release priv and ref */ - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } nifp = priv->np_nifp; @@ -2200,7 +2263,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, &nmr->nr_arg2); if (error) { netmap_do_unregif(priv); - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } if (memflags & NETMAP_MEM_PRIVATE) { @@ -2212,12 +2275,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } if (nmr->nr_arg3) { - D("requested %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("requested %d extra buffers", nmr->nr_arg3); nmr->nr_arg3 = netmap_extra_alloc(na, &nifp->ni_bufs_head, nmr->nr_arg3); - D("got %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("got %d extra buffers", nmr->nr_arg3); } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); + + /* store ifp reference so that priv destructor may release it */ + priv->np_ifp = ifp; } while (0); NMG_UNLOCK(); break; @@ -2240,11 +2308,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - if (!nm_netmap_on(na)) { - error = ENXIO; - break; - } - t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX); krings = NMR(na, t); qfirst = priv->np_qfirst[t]; @@ -2252,31 +2315,34 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; - if (nm_kr_tryget(kring)) { - error = EBUSY; - goto out; + struct netmap_ring *ring = kring->ring; + + if (unlikely(nm_kr_tryget(kring, 1, &error))) { + error = (error ? 
EIO : 0); + continue; } + if (cmd == NIOCTXSYNC) { if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) { - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); } else { - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) { - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); } - microtime(&na->rx_rings[i].ring->ts); + microtime(&ring->ts); } nm_kr_put(kring); } @@ -2323,9 +2389,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EOPNOTSUPP; #endif /* linux */ } -out: - CURVNET_RESTORE(); return (error); } @@ -2345,17 +2409,15 @@ out: * hidden argument. */ int -netmap_poll(struct cdev *dev, int events, struct thread *td) +netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr) { - struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; struct netmap_kring *kring; + struct netmap_ring *ring; u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; #define want_tx want[NR_TX] #define want_rx want[NR_RX] struct mbq q; /* packets from hw queues to host stack */ - void *pwait = dev; /* linux compatibility */ - int is_kevent = 0; enum txrx t; /* @@ -2365,23 +2427,13 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) */ int retry_tx = 1, retry_rx = 1; - (void)pwait; - mbq_init(&q); - - /* - * XXX kevent has curthread->tp_fop == NULL, - * so devfs_get_cdevpriv() fails. 
We circumvent this by passing - * priv as the first argument, which is also useful to avoid - * the selrecord() which are not necessary in that case. + /* transparent mode: send_down is 1 if we have found some + * packets to forward during the rx scan and we have not + * sent them down to the nic yet */ - if (devfs_get_cdevpriv((void **)&priv) != 0) { - is_kevent = 1; - if (netmap_verbose) - D("called from kevent"); - priv = (struct netmap_priv_d *)dev; - } - if (priv == NULL) - return POLLERR; + int send_down = 0; + + mbq_init(&q); if (priv->np_nifp == NULL) { D("No if registered"); @@ -2399,7 +2451,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - /* * check_all_{tx|rx} are set if the card has more than one queue AND * the file descriptor is bound to all of them. If so, we sleep on @@ -2421,6 +2472,32 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * slots available. If this fails, then lock and call the sync * routines. 
*/ +#if 1 /* new code- call rx if any of the ring needs to release or read buffers */ + if (want_tx) { + t = NR_TX; + for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { + revents |= want[t]; + want[t] = 0; /* also breaks the loop */ + } + } + } + if (want_rx) { + want_rx = 0; /* look for a reason to run the handlers */ + t = NR_RX; + for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */ + || kring->rhead != kring->ring->head /* release buffers */) { + want_rx = 1; + } + } + if (!want_rx) + revents |= events & (POLLIN | POLLRDNORM); /* we have data */ + } +#else /* old code */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; @@ -2431,6 +2508,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } } } +#endif /* old code */ /* * If we want to push packets out (priv->np_txpoll) or @@ -2447,32 +2525,26 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_RX]; i++) { + for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) { int found = 0; kring = &na->tx_rings[i]; - if (!want_tx && kring->ring->cur == kring->nr_hwcur) + ring = kring->ring; + + if (!send_down && !want_tx && ring->cur == kring->nr_hwcur) continue; - /* only one thread does txsync */ - if (nm_kr_tryget(kring)) { - /* either busy or stopped - * XXX if the ring is stopped, sleeping would - * be better. 
In current code, however, we only - * stop the rings for brief intervals (2014-03-14) - */ - if (netmap_verbose) - RD(2, "%p lost race on txring %d, ok", - priv, i); + + if (nm_kr_tryget(kring, 1, &revents)) continue; - } - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } else { if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } /* @@ -2489,8 +2561,10 @@ flush_tx: kring->nm_notify(kring, 0); } } - if (want_tx && retry_tx && !is_kevent) { - OS_selrecord(td, check_all_tx ? + /* if there were any packet to forward we must have handled them by now */ + send_down = 0; + if (want_tx && retry_tx && sr) { + nm_os_selrecord(sr, check_all_tx ? &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si); retry_tx = 0; goto flush_tx; @@ -2502,22 +2576,18 @@ flush_tx: * Do it on all rings because otherwise we starve. */ if (want_rx) { - int send_down = 0; /* transparent mode */ /* two rounds here for race avoidance */ do_retry_rx: for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) { int found = 0; kring = &na->rx_rings[i]; + ring = kring->ring; - if (nm_kr_tryget(kring)) { - if (netmap_verbose) - RD(2, "%p lost race on rxring %d, ok", - priv, i); + if (unlikely(nm_kr_tryget(kring, 1, &revents))) continue; - } - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } @@ -2526,22 +2596,22 @@ do_retry_rx: /* * transparent mode support: collect packets * from the rxring(s). 
- * XXX NR_FORWARD should only be read on - * physical or NIC ports */ - if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { + if (nm_may_forward_up(kring)) { ND(10, "forwarding some buffers up %d to %d", - kring->nr_hwcur, kring->ring->cur); + kring->nr_hwcur, ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } + kring->nr_kflags &= ~NR_FORWARD; if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); + send_down |= (kring->nr_kflags & NR_FORWARD); /* host ring only */ if (netmap_no_timestamp == 0 || - kring->ring->flags & NR_TIMESTAMP) { - microtime(&kring->ring->ts); + ring->flags & NR_TIMESTAMP) { + microtime(&ring->ts); } found = kring->rcur != kring->rtail; nm_kr_put(kring); @@ -2552,22 +2622,10 @@ do_retry_rx: } } - /* transparent mode XXX only during first pass ? */ - if (na->na_flags & NAF_HOST_RINGS) { - kring = &na->rx_rings[na->num_rx_rings]; - if (check_all_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { - /* XXX fix to use kring fields */ - if (nm_ring_empty(kring->ring)) - send_down = netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; - } - } - - if (retry_rx && !is_kevent) - OS_selrecord(td, check_all_rx ? + if (retry_rx && sr) { + nm_os_selrecord(sr, check_all_rx ? &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si); + } if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) @@ -2582,15 +2640,14 @@ do_retry_rx: * kring->nr_hwcur and ring->head * are passed to the other endpoint. * - * In this mode we also scan the sw rxring, which in - * turn passes packets up. - * - * XXX Transparent mode at the moment requires to bind all + * Transparent mode requires to bind all * rings to a single file descriptor. 
*/ - if (q.head && na->ifp != NULL) + if (q.head && !nm_kr_tryget(&na->tx_rings[na->num_tx_rings], 1, &revents)) { netmap_send_up(na->ifp, &q); + nm_kr_put(&na->tx_rings[na->num_tx_rings]); + } return (revents); #undef want_tx @@ -2600,8 +2657,6 @@ do_retry_rx: /*-------------------- driver support routines -------------------*/ -static int netmap_hw_krings_create(struct netmap_adapter *); - /* default notify callback */ static int netmap_notify(struct netmap_kring *kring, int flags) @@ -2609,51 +2664,51 @@ netmap_notify(struct netmap_kring *kring, int flags) struct netmap_adapter *na = kring->na; enum txrx t = kring->tx; - OS_selwakeup(&kring->si, PI_NET); + nm_os_selwakeup(&kring->si); /* optimization: avoid a wake up on the global * queue if nobody has registered for more * than one ring */ if (na->si_users[t] > 0) - OS_selwakeup(&na->si[t], PI_NET); + nm_os_selwakeup(&na->si[t]); - return 0; + return NM_IRQ_COMPLETED; } +#if 0 +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, +enum txrx tx, int flags) +{ + if (tx == NR_TX) { + KeSetEvent(notes->TX_EVENT, 0, FALSE); + } + else + { + KeSetEvent(notes->RX_EVENT, 0, FALSE); + } + return 0; +} +#endif /* called by all routines that create netmap_adapters. - * Attach na to the ifp (if any) and provide defaults - * for optional callbacks. Defaults assume that we - * are creating an hardware netmap_adapter. + * provide some defaults and get a reference to the + * memory allocator */ int netmap_attach_common(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { D("%s: invalid rings tx %d rx %d", na->name, na->num_tx_rings, na->num_rx_rings); return EINVAL; } - /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, - * pipes, monitors). For bwrap we actually have a non-null ifp for - * use by the external modules, but that is set after this - * function has been called. 
- * XXX this is ugly, maybe split this function in two (2014-03-14) - */ - if (ifp != NULL) { - WNA(ifp) = na; - /* the following is only needed for na that use the host port. - * XXX do we have something similar for linux ? - */ #ifdef __FreeBSD__ - na->if_input = ifp->if_input; /* for netmap_send_up */ -#endif /* __FreeBSD__ */ - - NETMAP_SET_CAPABLE(ifp); + if (na->na_flags & NAF_HOST_RINGS && na->ifp) { + na->if_input = na->ifp->if_input; /* for netmap_send_up */ } +#endif /* __FreeBSD__ */ if (na->nm_krings_create == NULL) { /* we assume that we have been called by a driver, * since other port types all provide their own @@ -2677,6 +2732,7 @@ netmap_attach_common(struct netmap_adapter *na) */ na->nm_bdg_attach = netmap_bwrap_attach; #endif + return 0; } @@ -2685,9 +2741,6 @@ netmap_attach_common(struct netmap_adapter *na) void netmap_detach_common(struct netmap_adapter *na) { - if (na->ifp != NULL) - WNA(na->ifp) = NULL; /* XXX do we need this? */ - if (na->tx_rings) { /* XXX should not happen */ D("freeing leftover tx_rings"); na->nm_krings_delete(na); @@ -2699,31 +2752,52 @@ netmap_detach_common(struct netmap_adapter *na) free(na, M_DEVBUF); } -/* Wrapper for the register callback provided hardware drivers. - * na->ifp == NULL means the driver module has been +/* Wrapper for the register callback provided netmap-enabled + * hardware drivers. + * nm_iszombie(na) means that the driver module has been * unloaded, so we cannot call into it. - * Note that module unloading, in our patched linux drivers, - * happens under NMG_LOCK and after having stopped all the - * nic rings (see netmap_detach). This provides sufficient - * protection for the other driver-provied callbacks - * (i.e., nm_config and nm_*xsync), that therefore don't need - * to wrapped. + * nm_os_ifnet_lock() must guarantee mutual exclusion with + * module unloading. 
*/ static int -netmap_hw_register(struct netmap_adapter *na, int onoff) +netmap_hw_reg(struct netmap_adapter *na, int onoff) { struct netmap_hw_adapter *hwna = (struct netmap_hw_adapter*)na; + int error = 0; + + nm_os_ifnet_lock(); + + if (nm_iszombie(na)) { + if (onoff) { + error = ENXIO; + } else if (na != NULL) { + na->na_flags &= ~NAF_NETMAP_ON; + } + goto out; + } + + error = hwna->nm_hw_register(na, onoff); - if (na->ifp == NULL) - return onoff ? ENXIO : 0; +out: + nm_os_ifnet_unlock(); - return hwna->nm_hw_register(na, onoff); + return error; +} + +static void +netmap_hw_dtor(struct netmap_adapter *na) +{ + if (nm_iszombie(na) || na->ifp == NULL) + return; + + WNA(na->ifp) = NULL; } /* - * Initialize a ``netmap_adapter`` object created by driver on attach. + * Allocate a ``netmap_adapter`` object, and initialize it from the + * 'arg' passed by the driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number * of hardware rings): @@ -2732,29 +2806,31 @@ netmap_hw_register(struct netmap_adapter *na, int onoff) * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. */ -int -netmap_attach(struct netmap_adapter *arg) +static int +_netmap_attach(struct netmap_adapter *arg, size_t size) { struct netmap_hw_adapter *hwna = NULL; - // XXX when is arg == NULL ? - struct ifnet *ifp = arg ? 
arg->ifp : NULL; + struct ifnet *ifp = NULL; - if (arg == NULL || ifp == NULL) + if (arg == NULL || arg->ifp == NULL) goto fail; - hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + ifp = arg->ifp; + hwna = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); if (hwna == NULL) goto fail; hwna->up = *arg; hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE; strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); hwna->nm_hw_register = hwna->up.nm_register; - hwna->up.nm_register = netmap_hw_register; + hwna->up.nm_register = netmap_hw_reg; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; } netmap_adapter_get(&hwna->up); + NM_ATTACH_NA(ifp, &hwna->up); + #ifdef linux if (ifp->netdev_ops) { /* prepare a clone of the netdev ops */ @@ -2762,7 +2838,7 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else hwna->nm_ndo = *ifp->netdev_ops; -#endif +#endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */ } hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; if (ifp->ethtool_ops) { @@ -2771,11 +2847,14 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS hwna->nm_eto.set_channels = linux_netmap_set_channels; -#endif +#endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */ if (arg->nm_config == NULL) { hwna->up.nm_config = netmap_linux_config; } #endif /* linux */ + if (arg->nm_dtor == NULL) { + hwna->up.nm_dtor = netmap_hw_dtor; + } if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n", hwna->up.num_tx_rings, hwna->up.num_tx_desc, @@ -2784,12 +2863,57 @@ netmap_attach(struct netmap_adapter *arg) fail: D("fail, arg %p ifp %p na %p", arg, ifp, hwna); - if (ifp) - netmap_detach(ifp); return (hwna ? 
EINVAL : ENOMEM); } +int +netmap_attach(struct netmap_adapter *arg) +{ + return _netmap_attach(arg, sizeof(struct netmap_hw_adapter)); +} + + +#ifdef WITH_PTNETMAP_GUEST +int +netmap_pt_guest_attach(struct netmap_adapter *arg, + void *csb, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_pt_guest_adapter *ptna; + struct ifnet *ifp = arg ? arg->ifp : NULL; + int error; + + /* get allocator */ + arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, ptctl); + if (arg->nm_mem == NULL) + return ENOMEM; + arg->na_flags |= NAF_MEM_OWNER; + error = _netmap_attach(arg, sizeof(struct netmap_pt_guest_adapter)); + if (error) + return error; + + /* get the netmap_pt_guest_adapter */ + ptna = (struct netmap_pt_guest_adapter *) NA(ifp); + ptna->csb = csb; + + /* Initialize a separate pass-through netmap adapter that is going to + * be used by the ptnet driver only, and so never exposed to netmap + * applications. We only need a subset of the available fields. */ + memset(&ptna->dr, 0, sizeof(ptna->dr)); + ptna->dr.up.ifp = ifp; + ptna->dr.up.nm_mem = ptna->hwup.up.nm_mem; + netmap_mem_get(ptna->dr.up.nm_mem); + ptna->dr.up.nm_config = ptna->hwup.up.nm_config; + + ptna->backend_regifs = 0; + + return 0; +} +#endif /* WITH_PTNETMAP_GUEST */ + + void NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) { @@ -2841,28 +2965,29 @@ void netmap_detach(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); - int skip; if (!na) return; - skip = 0; NMG_LOCK(); - netmap_disable_all_rings(ifp); - na->ifp = NULL; - na->na_flags &= ~NAF_NETMAP_ON; + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; /* * if the netmap adapter is not native, somebody * changed it, so we can not release it here. - * The NULL na->ifp will notify the new owner that + * The NAF_ZOMBIE flag will notify the new owner that * the driver is gone. 
*/ if (na->na_flags & NAF_NATIVE) { - skip = netmap_adapter_put(na); + netmap_adapter_put(na); } - /* give them a chance to notice */ - if (skip == 0) - netmap_enable_all_rings(ifp); + /* give active users a chance to notice that NAF_ZOMBIE has been + * turned on, so that they can stop and return an error to userspace. + * Note that this becomes a NOP if there are no active users and, + * therefore, the put() above has deleted the na, since now NA(ifp) is + * NULL. + */ + netmap_enable_all_rings(ifp); NMG_UNLOCK(); } @@ -2883,9 +3008,10 @@ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring; + struct netmap_kring *kring, *tx_kring; u_int len = MBUF_LEN(m); u_int error = ENOBUFS; + unsigned int txr; struct mbq *q; int space; @@ -2900,6 +3026,16 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + txr = MBUF_TXQ(m); + if (txr >= na->num_tx_rings) { + txr %= na->num_tx_rings; + } + tx_kring = &NMR(na, NR_TX)[txr]; + + if (tx_kring->nr_mode == NKR_NETMAP_OFF) { + return MBUF_TRANSMIT(na, ifp, m); + } + q = &kring->rx_queue; // XXX reconsider long packets if we handle fragments @@ -2909,6 +3045,11 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + if (nm_os_mbuf_has_offld(m)) { + RD(1, "%s drop mbuf requiring offloadings", na->name); + goto done; + } + /* protect against rxsync_from_host(), netmap_sw_to_nic() * and maybe other instances of netmap_transmit (the latter * not possible on Linux). @@ -2951,6 +3092,8 @@ done: * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. * If native netmap mode is not set just return NULL. + * If native netmap mode is set, in particular, we have to set nr_mode to + * NKR_NETMAP_ON. 
*/ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, @@ -2975,13 +3118,26 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; + kring = na->tx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; @@ -3018,6 +3174,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ + kring->nr_mode = NKR_NETMAP_ON; kring->nm_notify(kring, 0); return kring->ring->slot; } @@ -3037,10 +3194,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * - for a nic connected to a switch, call the proper forwarding routine * (see netmap_bwrap_intr_notify) */ -void -netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) +int +netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; enum txrx t = (work_done ? NR_RX : NR_TX); @@ -3051,15 +3207,20 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) } if (q >= nma_get_nrings(na, t)) - return; // not a physical queue + return NM_IRQ_PASS; // not a physical queue kring = NMR(na, t) + q; + if (kring->nr_mode == NKR_NETMAP_OFF) { + return NM_IRQ_PASS; + } + if (t == NR_RX) { kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
*work_done = 1; /* do not fire napi again */ } - kring->nm_notify(kring, 0); + + return kring->nm_notify(kring, 0); } @@ -3067,17 +3228,17 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. * - * If the card is not in netmap mode, simply return 0, + * If the card is not in netmap mode, simply return NM_IRQ_PASS, * so that the caller proceeds with regular processing. - * Otherwise call netmap_common_irq() and return 1. + * Otherwise call netmap_common_irq(). * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one * if needed (multiqueue card _and_ there are multiqueue listeners), - * and return 1. + * and return NR_IRQ_COMPLETED. * * Finally, if called on rx from an interface connected to a switch, - * calls the proper forwarding routine, and return 1. + * calls the proper forwarding routine. */ int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) @@ -3091,15 +3252,14 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) * nm_native_on() here. 
*/ if (!nm_netmap_on(na)) - return 0; + return NM_IRQ_PASS; if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); - return 0; + return NM_IRQ_PASS; } - netmap_common_irq(ifp, q, work_done); - return 1; + return netmap_common_irq(na, q, work_done); } @@ -3120,9 +3280,11 @@ extern struct cdevsw netmap_cdevsw; void netmap_fini(void) { - netmap_uninit_bridges(); if (netmap_dev) destroy_dev(netmap_dev); + /* we assume that there are no longer netmap users */ + nm_os_ifnet_fini(); + netmap_uninit_bridges(); netmap_mem_fini(); NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); @@ -3155,9 +3317,13 @@ netmap_init(void) goto fail; #ifdef __FreeBSD__ - nm_vi_init_index(); + nm_os_vi_init_index(); #endif + error = nm_os_ifnet_init(); + if (error) + goto fail; + printf("netmap: loaded module\n"); return (0); fail: diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8490ae85670b..20ea5c8f2972 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -33,8 +33,9 @@ #include <sys/param.h> /* defines used in kernel.h */ #include <sys/poll.h> /* POLLIN, POLLOUT */ #include <sys/kernel.h> /* types used in module initialization */ -#include <sys/conf.h> /* DEV_MODULE */ +#include <sys/conf.h> /* DEV_MODULE_ORDERED */ #include <sys/endian.h> +#include <sys/syscallsubr.h> /* kern_ioctl() */ #include <sys/rwlock.h> @@ -50,6 +51,11 @@ #include <sys/malloc.h> #include <sys/socket.h> /* sockaddrs */ #include <sys/selinfo.h> +#include <sys/kthread.h> /* kthread_add() */ +#include <sys/proc.h> /* PROC_LOCK() */ +#include <sys/unistd.h> /* RFNOWAIT */ +#include <sys/sched.h> /* sched_bind() */ +#include <sys/smp.h> /* mp_maxid */ #include <net/if.h> #include <net/if_var.h> #include <net/if_types.h> /* IFT_ETHER */ @@ -61,13 +67,94 @@ #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <net/netmap_virt.h> #include <dev/netmap/netmap_mem2.h> /* ======================== FREEBSD-SPECIFIC ROUTINES 
================== */ +void nm_os_selinfo_init(NM_SELINFO_T *si) { + struct mtx *m = &si->m; + mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); + knlist_init_mtx(&si->si.si_note, m); +} + +void +nm_os_selinfo_uninit(NM_SELINFO_T *si) +{ + /* XXX kqueue(9) needed; these will mirror knlist_init. */ + knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); + knlist_destroy(&si->si.si_note); + /* now we don't need the mutex anymore */ + mtx_destroy(&si->m); +} + +void +nm_os_ifnet_lock(void) +{ + IFNET_WLOCK(); +} + +void +nm_os_ifnet_unlock(void) +{ + IFNET_WUNLOCK(); +} + +static int netmap_use_count = 0; + +void +nm_os_get_module(void) +{ + netmap_use_count++; +} + +void +nm_os_put_module(void) +{ + netmap_use_count--; +} + +static void +netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_undo_zombie(ifp); +} + +static void +netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_make_zombie(ifp); +} + +static eventhandler_tag nm_ifnet_ah_tag; +static eventhandler_tag nm_ifnet_dh_tag; + +int +nm_os_ifnet_init(void) +{ + nm_ifnet_ah_tag = + EVENTHANDLER_REGISTER(ifnet_arrival_event, + netmap_ifnet_arrival_handler, + NULL, EVENTHANDLER_PRI_ANY); + nm_ifnet_dh_tag = + EVENTHANDLER_REGISTER(ifnet_departure_event, + netmap_ifnet_departure_handler, + NULL, EVENTHANDLER_PRI_ANY); + return 0; +} + +void +nm_os_ifnet_fini(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + nm_ifnet_ah_tag); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + nm_ifnet_dh_tag); +} + rawsum_t -nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; @@ -87,7 +174,7 @@ nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) * return value is in network byte order. 
*/ uint16_t -nm_csum_fold(rawsum_t cur_sum) +nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) @@ -96,17 +183,17 @@ nm_csum_fold(rawsum_t cur_sum) return htobe16((~cur_sum) & 0xFFFF); } -uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else - return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); + return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void -nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET @@ -118,7 +205,7 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -129,12 +216,12 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, } void -nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -144,13 +231,41 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, #endif } +/* on FreeBSD we send up one packet at a time */ +void * +nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) +{ + + NA(ifp)->if_input(ifp, m); + return NULL; +} + +int +nm_os_mbuf_has_offld(struct mbuf *m) +{ + return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | + CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | + CSUM_SCTP_IPV6 | 
CSUM_TSO); +} + +static void +freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_generic_adapter *gna = + (struct netmap_generic_adapter *)NA(ifp); + int stolen = generic_rx_handler(ifp, m); + + if (!stolen) { + gna->save_if_input(ifp, m); + } +} /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int -netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) +nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; @@ -161,7 +276,7 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; - ifp->if_input = generic_rx_handler; + ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); @@ -181,18 +296,20 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. 
*/ -void -netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +int +nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); - if (enable) { + if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } + + return 0; } @@ -213,40 +330,44 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) * */ int -generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, - void *addr, u_int len, u_int ring_nr) +nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; + u_int len = a->len; + struct ifnet *ifp = a->ifp; + struct mbuf *m = a->m; +#if __FreeBSD_version < 1100000 /* - * The mbuf should be a cluster from our special pool, - * so we do not need to do an m_copyback but just copy - * (and eventually, just reference the netmap buffer) + * Old FreeBSD versions. The mbuf has a cluster attached, + * we need to copy from the cluster to the netmap buffer. */ - - if (GET_MBUF_REFCNT(m) != 1) { - D("invalid refcnt %d for %p", - GET_MBUF_REFCNT(m), m); + if (MBUF_REFCNT(m) != 1) { + D("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } - // XXX the ext_size check is unnecessary if we link the netmap buf if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } - if (0) { /* XXX seems to have negligible benefits */ - m->m_ext.ext_buf = m->m_data = addr; - } else { - bcopy(addr, m->m_data, len); - } + bcopy(a->addr, m->m_data, len); +#else /* __FreeBSD_version >= 1100000 */ + /* New FreeBSD versions. Link the external storage to + * the netmap buffer, so that no copy is necessary. */ + m->m_ext.ext_buf = m->m_data = a->addr; + m->m_ext.ext_size = len; +#endif /* __FreeBSD_version >= 1100000 */ + m->m_len = m->m_pkthdr.len = len; - // inc refcount. 
All ours, we could skip the atomic - atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1); + + /* mbuf refcnt is not contended, no need to use atomic + * (a memory barrier is enough). */ + SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); - m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); - return ret; + return ret ? -1 : 0; } @@ -263,7 +384,7 @@ netmap_getna(if_t ifp) * way to extract the info from the ifp */ int -generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; @@ -271,16 +392,23 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) void -generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } +void +nm_os_generic_set_features(struct netmap_generic_adapter *gna) +{ + + gna->rxsg = 1; /* Supported through m_copydata. */ + gna->txqdisc = 0; /* Not supported. 
*/ +} void -netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) +nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; @@ -290,21 +418,21 @@ netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapte void -netmap_mitigation_start(struct nm_generic_mit *mit) +nm_os_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void -netmap_mitigation_restart(struct nm_generic_mit *mit) +nm_os_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int -netmap_mitigation_active(struct nm_generic_mit *mit) +nm_os_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; @@ -312,7 +440,7 @@ netmap_mitigation_active(struct nm_generic_mit *mit) void -netmap_mitigation_cleanup(struct nm_generic_mit *mit) +nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -342,7 +470,7 @@ static struct { } nm_vi_indices; void -nm_vi_init_index(void) +nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) @@ -398,7 +526,7 @@ nm_vi_free_index(uint8_t val) * increment this refcount on if_attach(). 
*/ int -nm_vi_persist(const char *name, struct ifnet **ret) +nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; @@ -438,15 +566,220 @@ nm_vi_persist(const char *name, struct ifnet **ret) *ret = ifp; return 0; } + /* unregister from the system and drop the final refcount */ void -nm_vi_detach(struct ifnet *ifp) +nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } +/* ======================== PTNETMAP SUPPORT ========================== */ + +#ifdef WITH_PTNETMAP_GUEST +#include <sys/bus.h> +#include <sys/rman.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <machine/resource.h> +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> +/* + * ptnetmap memory device (memdev) for freebsd guest, + * ssed to expose host netmap memory to the guest through a PCI BAR. + */ + +/* + * ptnetmap memdev private data structure + */ +struct ptnetmap_memdev { + device_t dev; + struct resource *pci_io; + struct resource *pci_mem; + struct netmap_mem_d *nm_mem; +}; + +static int ptn_memdev_probe(device_t); +static int ptn_memdev_attach(device_t); +static int ptn_memdev_detach(device_t); +static int ptn_memdev_shutdown(device_t); + +static device_method_t ptn_memdev_methods[] = { + DEVMETHOD(device_probe, ptn_memdev_probe), + DEVMETHOD(device_attach, ptn_memdev_attach), + DEVMETHOD(device_detach, ptn_memdev_detach), + DEVMETHOD(device_shutdown, ptn_memdev_shutdown), + DEVMETHOD_END +}; + +static driver_t ptn_memdev_driver = { + PTNETMAP_MEMDEV_NAME, + ptn_memdev_methods, + sizeof(struct ptnetmap_memdev), +}; + +/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation + * below. */ +static devclass_t ptnetmap_devclass; +DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass, + NULL, NULL, SI_ORDER_MIDDLE + 1); + +/* + * I/O port read/write wrappers. 
+ * Some are not used, so we keep them commented out until needed + */ +#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg)) +#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg)) +#if 0 +#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg)) +#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val)) +#endif /* unused */ + +/* + * Map host netmap memory through PCI-BAR in the guest OS, + * returning physical (nm_paddr) and virtual (nm_addr) addresses + * of the netmap memory mapped in the guest. + */ +int +nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr) +{ + uint32_t mem_size; + int rid; + + D("ptn_memdev_driver iomap"); + + rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); + mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE); + + /* map memory allocator */ + ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, + &rid, 0, ~0, mem_size, RF_ACTIVE); + if (ptn_dev->pci_mem == NULL) { + *nm_paddr = 0; + *nm_addr = 0; + return ENOMEM; + } + + *nm_paddr = rman_get_start(ptn_dev->pci_mem); + *nm_addr = rman_get_virtual(ptn_dev->pci_mem); + + D("=== BAR %d start %lx len %lx mem_size %x ===", + PTNETMAP_MEM_PCI_BAR, + *nm_paddr, + rman_get_size(ptn_dev->pci_mem), + mem_size); + return (0); +} + +/* Unmap host netmap memory. 
*/ +void +nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) +{ + D("ptn_memdev_driver iounmap"); + + if (ptn_dev->pci_mem) { + bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } +} + +/* Device identification routine, return BUS_PROBE_DEFAULT on success, + * positive on failure */ +static int +ptn_memdev_probe(device_t dev) +{ + char desc[256]; + + if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) + return (ENXIO); + if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) + return (ENXIO); + + snprintf(desc, sizeof(desc), "%s PCI adapter", + PTNETMAP_MEMDEV_NAME); + device_set_desc_copy(dev, desc); + + return (BUS_PROBE_DEFAULT); +} + +/* Device initialization routine. */ +static int +ptn_memdev_attach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + int rid; + uint16_t mem_id; + + D("ptn_memdev_driver attach"); + + ptn_dev = device_get_softc(dev); + ptn_dev->dev = dev; + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); + ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, + RF_ACTIVE); + if (ptn_dev->pci_io == NULL) { + device_printf(dev, "cannot map I/O space\n"); + return (ENXIO); + } + + mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID); + + /* create guest allocator */ + ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); + if (ptn_dev->nm_mem == NULL) { + ptn_memdev_detach(dev); + return (ENOMEM); + } + netmap_mem_get(ptn_dev->nm_mem); + + D("ptn_memdev_driver probe OK - host_id: %d", mem_id); + + return (0); +} + +/* Device removal routine. 
*/ +static int +ptn_memdev_detach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + + D("ptn_memdev_driver detach"); + ptn_dev = device_get_softc(dev); + + if (ptn_dev->nm_mem) { + netmap_mem_put(ptn_dev->nm_mem); + ptn_dev->nm_mem = NULL; + } + if (ptn_dev->pci_mem) { + bus_release_resource(dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } + if (ptn_dev->pci_io) { + bus_release_resource(dev, SYS_RES_IOPORT, + PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); + ptn_dev->pci_io = NULL; + } + + return (0); +} + +static int +ptn_memdev_shutdown(device_t dev) +{ + D("ptn_memdev_driver shutdown"); + return bus_generic_shutdown(dev); +} + +#endif /* WITH_PTNETMAP_GUEST */ + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -606,7 +939,7 @@ err_unlock: * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor - * when the last fd pointing to the device is closed. + * when the last fd pointing to the device is closed. 
* * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to @@ -634,26 +967,275 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) (void)devtype; (void)td; - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - priv->np_refs = 1; + NMG_LOCK(); + priv = netmap_priv_new(); + if (priv == NULL) { + error = ENOMEM; + goto out; + } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { - free(priv, M_DEVBUF); - } else { - NMG_LOCK(); - netmap_use_count++; - NMG_UNLOCK(); + netmap_priv_delete(priv); } +out: + NMG_UNLOCK(); return error; } +/******************** kthread wrapper ****************/ +#include <sys/sysproto.h> +u_int +nm_os_ncpus(void) +{ + return mp_maxid + 1; +} + +struct nm_kthread_ctx { + struct thread *user_td; /* thread user-space (kthread creator) to send ioctl */ + /* notification to guest (interrupt) */ + int irq_fd; /* ioctl fd */ + struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */ + + /* notification from guest */ + void *ioevent_file; /* tsleep() argument */ + + /* worker function and parameter */ + nm_kthread_worker_fn_t worker_fn; + void *worker_private; + + struct nm_kthread *nmk; + + /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ + long type; +}; + +struct nm_kthread { + struct thread *worker; + struct mtx worker_lock; + uint64_t scheduled; /* pending wake_up request */ + struct nm_kthread_ctx worker_ctx; + int run; /* used to stop kthread */ + int attach_user; /* kthread attached to user_process */ + int affinity; +}; + +void inline +nm_os_kthread_wakeup_worker(struct nm_kthread *nmk) +{ + /* + * There may be a race between FE and BE, + * which call both this function, and worker kthread, + * that reads nmk->scheduled. + * + * For us it is not important the counter value, + * but simply that it has changed since the last + * time the kthread saw it. 
+ */ + mtx_lock(&nmk->worker_lock); + nmk->scheduled++; + if (nmk->worker_ctx.ioevent_file) { + wakeup(nmk->worker_ctx.ioevent_file); + } + mtx_unlock(&nmk->worker_lock); +} + +void inline +nm_os_kthread_send_irq(struct nm_kthread *nmk) +{ + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + int err; + + if (ctx->user_td && ctx->irq_fd > 0) { + err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix); + if (err) { + D("kern_ioctl error: %d ioctl parameters: fd %d com %lu data %p", + err, ctx->irq_fd, ctx->irq_ioctl.com, &ctx->irq_ioctl.data); + } + } +} + +static void +nm_kthread_worker(void *data) +{ + struct nm_kthread *nmk = data; + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + uint64_t old_scheduled = nmk->scheduled; + + if (nmk->affinity >= 0) { + thread_lock(curthread); + sched_bind(curthread, nmk->affinity); + thread_unlock(curthread); + } + + while (nmk->run) { + /* + * check if the parent process dies + * (when kthread is attached to user process) + */ + if (ctx->user_td) { + PROC_LOCK(curproc); + thread_suspend_check(0); + PROC_UNLOCK(curproc); + } else { + kthread_suspend_check(); + } + + /* + * if ioevent_file is not defined, we don't have notification + * mechanism and we continually execute worker_fn() + */ + if (!ctx->ioevent_file) { + ctx->worker_fn(ctx->worker_private); /* worker body */ + } else { + /* checks if there is a pending notification */ + mtx_lock(&nmk->worker_lock); + if (likely(nmk->scheduled != old_scheduled)) { + old_scheduled = nmk->scheduled; + mtx_unlock(&nmk->worker_lock); + + ctx->worker_fn(ctx->worker_private); /* worker body */ + + continue; + } else if (nmk->run) { + /* wait on event with one second timeout */ + msleep_spin(ctx->ioevent_file, &nmk->worker_lock, + "nmk_ev", hz); + nmk->scheduled++; + } + mtx_unlock(&nmk->worker_lock); + } + } + + kthread_exit(); +} + +static int +nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg) +{ + /* send irq through ioctl 
to bhyve (vmm.ko) */ + if (cfg->event.irqfd) { + nmk->worker_ctx.irq_fd = cfg->event.irqfd; + nmk->worker_ctx.irq_ioctl = cfg->event.ioctl; + } + /* ring.ioeventfd contains the chan where do tsleep to wait events */ + if (cfg->event.ioeventfd) { + nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd; + } + + return 0; +} + +static void +nm_kthread_close_files(struct nm_kthread *nmk) +{ + nmk->worker_ctx.irq_fd = 0; + nmk->worker_ctx.ioevent_file = NULL; +} + +void +nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity) +{ + nmk->affinity = affinity; +} + +struct nm_kthread * +nm_os_kthread_create(struct nm_kthread_cfg *cfg) +{ + struct nm_kthread *nmk = NULL; + int error; + + nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!nmk) + return NULL; + + mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN); + nmk->worker_ctx.worker_fn = cfg->worker_fn; + nmk->worker_ctx.worker_private = cfg->worker_private; + nmk->worker_ctx.type = cfg->type; + nmk->affinity = -1; + + /* attach kthread to user process (ptnetmap) */ + nmk->attach_user = cfg->attach_user; + + /* open event fd */ + error = nm_kthread_open_files(nmk, cfg); + if (error) + goto err; + + return nmk; +err: + free(nmk, M_DEVBUF); + return NULL; +} + +int +nm_os_kthread_start(struct nm_kthread *nmk) +{ + struct proc *p = NULL; + int error = 0; + + if (nmk->worker) { + return EBUSY; + } + + /* check if we want to attach kthread to user process */ + if (nmk->attach_user) { + nmk->worker_ctx.user_td = curthread; + p = curthread->td_proc; + } + + /* enable kthread main loop */ + nmk->run = 1; + /* create kthread */ + if((error = kthread_add(nm_kthread_worker, nmk, p, + &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", + nmk->worker_ctx.type))) { + goto err; + } + + D("nm_kthread started td 0x%p", nmk->worker); + + return 0; +err: + D("nm_kthread start failed err %d", error); + nmk->worker = NULL; + return error; +} + +void +nm_os_kthread_stop(struct nm_kthread 
*nmk) +{ + if (!nmk->worker) { + return; + } + /* tell to kthread to exit from main loop */ + nmk->run = 0; + + /* wake up kthread if it sleeps */ + kthread_resume(nmk->worker); + nm_os_kthread_wakeup_worker(nmk); + + nmk->worker = NULL; +} + +void +nm_os_kthread_delete(struct nm_kthread *nmk) +{ + if (!nmk) + return; + if (nmk->worker) { + nm_os_kthread_stop(nmk); + } + + nm_kthread_close_files(nmk); + + free(nmk, M_DEVBUF); +} + /******************** kqueue support ****************/ /* - * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might @@ -672,17 +1254,23 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) void -freebsd_selwakeup(struct nm_selinfo *si, int pri) +nm_os_selwakeup(struct nm_selinfo *si) { if (netmap_verbose) D("on knote %p", &si->si.si_note); - selwakeuppri(&si->si, pri); + selwakeuppri(&si->si, PI_NET); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } +void +nm_os_selrecord(struct thread *td, struct nm_selinfo *si) +{ + selrecord(td, &si->si); +} + static void netmap_knrdetach(struct knote *kn) { @@ -728,7 +1316,7 @@ netmap_knrw(struct knote *kn, long hint, int events) RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { - revents = netmap_poll((void *)priv, events, curthread); + revents = netmap_poll(priv, events, NULL); return (events & revents) ? 
1 : 0; } } @@ -801,13 +1389,47 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn) return 0; } +static int +freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) +{ + struct netmap_priv_d *priv; + if (devfs_get_cdevpriv((void **)&priv)) { + return POLLERR; + } + return netmap_poll(priv, events, td); +} + +static int +freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, + int ffla __unused, struct thread *td) +{ + int error; + struct netmap_priv_d *priv; + + CURVNET_SET(TD_TO_VNET(rd)); + error = devfs_get_cdevpriv((void **)&priv); + if (error) { + /* XXX ENOENT should be impossible, since the priv + * is now created in the open */ + if (error == ENOENT) + error = ENXIO; + goto out; + } + error = netmap_ioctl(priv, cmd, data, td); +out: + CURVNET_RESTORE(); + + return error; +} + +extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, + .d_ioctl = freebsd_netmap_ioctl, + .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; @@ -852,6 +1474,24 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg) return (error); } - +#ifdef DEV_MODULE_ORDERED +/* + * The netmap module contains three drivers: (i) the netmap character device + * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI + * device driver. The attach() routines of both (ii) and (iii) need the + * lock of the global allocator, and such lock is initialized in netmap_init(), + * which is part of (i). + * Therefore, we make sure that (i) is loaded before (ii) and (iii), using + * the 'order' parameter of driver declaration macros. For (i), we specify + * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED + * macros for (ii) and (iii). 
+ */ +DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); +#else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); +#endif /* DEV_MODULE_ORDERED */ +MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +/* use a private mutex for the knlist */ diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 85a6a9f76ea2..5cef4a29110a 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -1,5 +1,7 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Vincenzo Maffione + * Copyright (C) 2013-2016 Luigi Rizzo + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -83,25 +85,25 @@ __FBSDID("$FreeBSD$"); #define rtnl_lock() ND("rtnl_lock called") #define rtnl_unlock() ND("rtnl_unlock called") -#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() /* * FreeBSD mbuf allocator/deallocator in emulation mode: - * + */ +#if __FreeBSD_version < 1100000 + +/* + * For older versions of FreeBSD: + * * We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE * so that the destructor, if invoked, will not free the packet. - * In principle we should set the destructor only on demand, + * In principle we should set the destructor only on demand, * but since there might be a race we better do it on allocation. * As a consequence, we also need to set the destructor or we * would leak buffers. 
*/ -/* - * mbuf wrappers - */ - /* mbuf destructor, also need to change the type to EXT_EXTREF, * add an M_NOFREE flag, and then clear the flag and * chain into uma_zfree(zone_pack, mf) @@ -112,35 +114,93 @@ __FBSDID("$FreeBSD$"); (m)->m_ext.ext_type = EXT_EXTREF; \ } while (0) -static void -netmap_default_mbuf_destructor(struct mbuf *m) +static int +void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { /* restore original mbuf */ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_type = EXT_PACKET; m->m_ext.ext_free = NULL; - if (GET_MBUF_REFCNT(m) == 0) + if (MBUF_REFCNT(m) == 0) SET_MBUF_REFCNT(m, 1); uma_zfree(zone_pack, m); + + return 0; } static inline struct mbuf * -netmap_get_mbuf(int len) +nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; + + (void)ifp; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) { - m->m_flags |= M_NOFREE; /* XXXNP: Almost certainly incorrect. */ + /* m_getcl() (mb_ctor_mbuf) has an assert that checks that + * M_NOFREE flag is not specified as third argument, + * so we have to set M_NOFREE after m_getcl(). */ + m->m_flags |= M_NOFREE; m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save - m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor; + m->m_ext.ext_free = (void *)void_mbuf_dtor; m->m_ext.ext_type = EXT_EXTREF; - ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m)); + ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m)); } return m; } +#else /* __FreeBSD_version >= 1100000 */ + +/* + * Newer versions of FreeBSD, using a straightforward scheme. + * + * We allocate mbufs with m_gethdr(), since the mbuf header is needed + * by the driver. We also attach a customly-provided external storage, + * which in this case is a netmap buffer. When calling m_extadd(), however + * we pass a NULL address, since the real address (and length) will be + * filled in by nm_os_generic_xmit_frame() right before calling + * if_transmit(). 
+ * + * The dtor function does nothing, however we need it since mb_free_ext() + * has a KASSERT(), checking that the mbuf dtor function is not NULL. + */ + +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ +} while (0) + +static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { } + +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + struct mbuf *m; + + (void)ifp; + (void)len; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + return m; + } + + m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, + NULL, NULL, 0, EXT_NET_DRV); + + return m; +} + +#endif /* __FreeBSD_version >= 1100000 */ +#elif defined _WIN32 + +#include "win_glue.h" + +#define rtnl_lock() ND("rtnl_lock called") +#define rtnl_unlock() ND("rtnl_unlock called") +#define MBUF_TXQ(m) 0//((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) 0//((m)->m_pkthdr.flowid) +#define smp_mb() //XXX: to be correctly defined #else /* linux */ @@ -150,7 +210,12 @@ netmap_get_mbuf(int len) #include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */ #include <linux/hrtimer.h> -//#define REG_RESET +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + return alloc_skb(ifp->needed_headroom + len + + ifp->needed_tailroom, GFP_ATOMIC); +} #endif /* linux */ @@ -161,8 +226,21 @@ netmap_get_mbuf(int len) #include <dev/netmap/netmap_mem2.h> +#define for_each_kring_n(_i, _k, _karr, _n) \ + for (_k=_karr, _i = 0; _i < _n; (_k)++, (_i)++) + +#define for_each_tx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings) +#define for_each_tx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1) + +#define for_each_rx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings) +#define for_each_rx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1) -/* ======================== usage stats 
=========================== */ + +/* ======================== PERFORMANCE STATISTICS =========================== */ #ifdef RATE_GENERIC #define IFRATE(x) x @@ -170,6 +248,8 @@ struct rate_stats { unsigned long txpkt; unsigned long txsync; unsigned long txirq; + unsigned long txrepl; + unsigned long txdrop; unsigned long rxpkt; unsigned long rxirq; unsigned long rxsync; @@ -194,6 +274,8 @@ static void rate_callback(unsigned long arg) RATE_PRINTK(txpkt); RATE_PRINTK(txsync); RATE_PRINTK(txirq); + RATE_PRINTK(txrepl); + RATE_PRINTK(txdrop); RATE_PRINTK(rxpkt); RATE_PRINTK(rxsync); RATE_PRINTK(rxirq); @@ -230,94 +312,222 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi) * the poller threads. Differently from netmap_rx_irq(), we check * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq. */ -static void -netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +void +netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); if (unlikely(!nm_netmap_on(na))) return; - netmap_common_irq(ifp, q, work_done); + netmap_common_irq(na, q, work_done); +#ifdef RATE_GENERIC + if (work_done) + rate_ctx.new.rxirq++; + else + rate_ctx.new.txirq++; +#endif /* RATE_GENERIC */ } +static int +generic_netmap_unregister(struct netmap_adapter *na) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring = NULL; + int i, r; + + if (na->active_fds == 0) { + D("Generic adapter %p goes off", na); + rtnl_lock(); + + na->na_flags &= ~NAF_NETMAP_ON; + + /* Release packet steering control. */ + nm_os_catch_tx(gna, 0); + + /* Stop intercepting packets on the RX path. 
*/ + nm_os_catch_rx(gna, 0); + + rtnl_unlock(); + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + D("RX ring %d of generic adapter %p goes off", r, na); + kring->nr_mode = NKR_NETMAP_OFF; + } + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + D("TX ring %d of generic adapter %p goes off", r, na); + } + } + + for_each_rx_kring(r, kring, na) { + /* Free the mbufs still pending in the RX queues, + * that did not end up into the corresponding netmap + * RX rings. */ + mbq_safe_purge(&kring->rx_queue); + nm_os_mitigation_cleanup(&gna->mit[r]); + } + + /* Decrement reference counter for the mbufs in the + * TX pools. These mbufs can be still pending in drivers, + * (e.g. this happens with virtio-net driver, which + * does lazy reclaiming of transmitted mbufs). */ + for_each_tx_kring(r, kring, na) { + /* We must remove the destructor on the TX event, + * because the destructor invokes netmap code, and + * the netmap module may disappear before the + * TX event is consumed. */ + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + SET_MBUF_DESTRUCTOR(kring->tx_event, NULL); + } + kring->tx_event = NULL; + mtx_unlock_spin(&kring->tx_event_lock); + } + + if (na->active_fds == 0) { + free(gna->mit, M_DEVBUF); + + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); + } + + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { + continue; + } + + for (i=0; i<na->num_tx_desc; i++) { + if (kring->tx_pool[i]) { + m_freem(kring->tx_pool[i]); + } + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; + } + +#ifdef RATE_GENERIC + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + + return 0; +} /* Enable/disable netmap mode for a generic network interface. 
*/ static int generic_netmap_register(struct netmap_adapter *na, int enable) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - struct mbuf *m; + struct netmap_kring *kring = NULL; int error; int i, r; - if (!na) + if (!na) { return EINVAL; + } -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_stop(ifp); - if (error) { - return error; + if (!enable) { + /* This is actually an unregif. */ + return generic_netmap_unregister(na); } -#endif /* REG_RESET */ - if (enable) { /* Enable netmap mode. */ - /* Init the mitigation support on all the rx queues. */ + if (na->active_fds == 0) { + D("Generic adapter %p goes on", na); + /* Do all memory allocations when (na->active_fds == 0), to + * simplify error management. */ + + /* Allocate memory for mitigation support on all the rx queues. */ gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), - M_DEVBUF, M_NOWAIT | M_ZERO); + M_DEVBUF, M_NOWAIT | M_ZERO); if (!gna->mit) { D("mitigation allocation failed"); error = ENOMEM; goto out; } - for (r=0; r<na->num_rx_rings; r++) - netmap_mitigation_init(&gna->mit[r], r, na); - /* Initialize the rx queue, as generic_rx_handler() can - * be called as soon as netmap_catch_rx() returns. - */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_init(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + /* Init mitigation support. */ + nm_os_mitigation_init(&gna->mit[r], r, na); + + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as nm_os_catch_rx() returns. + */ + mbq_safe_init(&kring->rx_queue); } /* - * Preallocate packet buffers for the tx rings. + * Prepare mbuf pools (parallel to the tx rings), for packet + * transmission. Don't preallocate the mbufs here, it's simpler + * to leave this task to txsync. 
*/ - for (r=0; r<na->num_tx_rings; r++) - na->tx_rings[r].tx_pool = NULL; - for (r=0; r<na->num_tx_rings; r++) { - na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (!na->tx_rings[r].tx_pool) { + for_each_tx_kring(r, kring, na) { + kring->tx_pool = NULL; + } + for_each_tx_kring(r, kring, na) { + kring->tx_pool = + malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!kring->tx_pool) { D("tx_pool allocation failed"); error = ENOMEM; goto free_tx_pools; } - for (i=0; i<na->num_tx_desc; i++) - na->tx_rings[r].tx_pool[i] = NULL; - for (i=0; i<na->num_tx_desc; i++) { - m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (!m) { - D("tx_pool[%d] allocation failed", i); - error = ENOMEM; - goto free_tx_pools; - } - na->tx_rings[r].tx_pool[i] = m; - } + mtx_init(&kring->tx_event_lock, "tx_event_lock", + NULL, MTX_SPIN); } + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("RX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; + } + + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("TX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; + } + } + + for_each_tx_kring(r, kring, na) { + /* Initialize tx_pool and tx_event. */ + for (i=0; i<na->num_tx_desc; i++) { + kring->tx_pool[i] = NULL; + } + + kring->tx_event = NULL; + } + + if (na->active_fds == 0) { rtnl_lock(); + /* Prepare to intercept incoming traffic. */ - error = netmap_catch_rx(gna, 1); + error = nm_os_catch_rx(gna, 1); if (error) { - D("netdev_rx_handler_register() failed (%d)", error); + D("nm_os_catch_rx(1) failed (%d)", error); goto register_handler; } - na->na_flags |= NAF_NETMAP_ON; /* Make netmap control the packet steering. 
*/ - netmap_catch_tx(gna, 1); + error = nm_os_catch_tx(gna, 1); + if (error) { + D("nm_os_catch_tx(1) failed (%d)", error); + goto catch_rx; + } rtnl_unlock(); + na->na_flags |= NAF_NETMAP_ON; + #ifdef RATE_GENERIC if (rate_ctx.refcount == 0) { D("setup_timer()"); @@ -329,73 +539,26 @@ generic_netmap_register(struct netmap_adapter *na, int enable) } rate_ctx.refcount++; #endif /* RATE */ - - } else if (na->tx_rings[0].tx_pool) { - /* Disable netmap mode. We enter here only if the previous - generic_netmap_register(na, 1) was successful. - If it was not, na->tx_rings[0].tx_pool was set to NULL by the - error handling code below. */ - rtnl_lock(); - - na->na_flags &= ~NAF_NETMAP_ON; - - /* Release packet steering control. */ - netmap_catch_tx(gna, 0); - - /* Do not intercept packets on the rx path. */ - netmap_catch_rx(gna, 0); - - rtnl_unlock(); - - /* Free the mbufs going to the netmap rings */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_purge(&na->rx_rings[r].rx_queue); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); - } - - for (r=0; r<na->num_rx_rings; r++) - netmap_mitigation_cleanup(&gna->mit[r]); - free(gna->mit, M_DEVBUF); - - for (r=0; r<na->num_tx_rings; r++) { - for (i=0; i<na->num_tx_desc; i++) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - } - -#ifdef RATE_GENERIC - if (--rate_ctx.refcount == 0) { - D("del_timer()"); - del_timer(&rate_ctx.timer); - } -#endif - } - -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_open(ifp); - if (error) { - goto free_tx_pools; } -#endif return 0; + /* Here (na->active_fds == 0) holds. 
*/ +catch_rx: + nm_os_catch_rx(gna, 0); register_handler: rtnl_unlock(); free_tx_pools: - for (r=0; r<na->num_tx_rings; r++) { - if (na->tx_rings[r].tx_pool == NULL) + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { continue; - for (i=0; i<na->num_tx_desc; i++) - if (na->tx_rings[r].tx_pool[i]) - m_freem(na->tx_rings[r].tx_pool[i]); - free(na->tx_rings[r].tx_pool, M_DEVBUF); - na->tx_rings[r].tx_pool = NULL; + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; } - for (r=0; r<na->num_rx_rings; r++) { - netmap_mitigation_cleanup(&gna->mit[r]); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); } free(gna->mit, M_DEVBUF); out: @@ -411,13 +574,58 @@ out: static void generic_mbuf_destructor(struct mbuf *m) { - netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); + struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m)); + struct netmap_kring *kring; + unsigned int r = MBUF_TXQ(m); + unsigned int r_orig = r; + + if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) { + D("Error: no netmap adapter on device %p", + GEN_TX_MBUF_IFP(m)); + return; + } + + /* + * First, clear the event mbuf. + * In principle, the event 'm' should match the one stored + * on ring 'r'. However we check it explicitely to stay + * safe against lower layers (qdisc, driver, etc.) changing + * MBUF_TXQ(m) under our feet. If the match is not found + * on 'r', we try to see if it belongs to some other ring. 
+ */ + for (;;) { + bool match = false; + + kring = &na->tx_rings[r]; + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event == m) { + kring->tx_event = NULL; + match = true; + } + mtx_unlock_spin(&kring->tx_event_lock); + + if (match) { + if (r != r_orig) { + RD(1, "event %p migrated: ring %u --> %u", + m, r_orig, r); + } + break; + } + + if (++r == na->num_tx_rings) r = 0; + + if (r == r_orig) { + RD(1, "Cannot match event %p", m); + return; + } + } + + /* Second, wake up clients. They will reclaim the event through + * txsync. */ + netmap_generic_irq(na, r, NULL); #ifdef __FreeBSD__ - if (netmap_verbose) - RD(5, "Tx irq (%p) queue %d index %d" , m, MBUF_TXQ(m), (int)(uintptr_t)m->m_ext.ext_arg1); - netmap_default_mbuf_destructor(m); -#endif /* __FreeBSD__ */ - IFRATE(rate_ctx.new.txirq++); + void_mbuf_dtor(m, NULL, NULL); +#endif } extern int netmap_adaptive_io; @@ -428,7 +636,7 @@ extern int netmap_adaptive_io; * nr_hwcur is the first unsent buffer. */ static u_int -generic_netmap_tx_clean(struct netmap_kring *kring) +generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc) { u_int const lim = kring->nkr_num_slots - 1; u_int nm_i = nm_next(kring->nr_hwtail, lim); @@ -436,20 +644,50 @@ generic_netmap_tx_clean(struct netmap_kring *kring) u_int n = 0; struct mbuf **tx_pool = kring->tx_pool; + ND("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail); + while (nm_i != hwcur) { /* buffers not completed */ struct mbuf *m = tx_pool[nm_i]; - if (unlikely(m == NULL)) { - /* this is done, try to replenish the entry */ - tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na)); + if (txqdisc) { + if (m == NULL) { + /* Nothing to do, this is going + * to be replenished. */ + RD(3, "Is this happening?"); + + } else if (MBUF_QUEUED(m)) { + break; /* Not dequeued yet. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf has been dequeued but is still busy + * (refcount is 2). + * Leave it to the driver and replenish. 
*/ + m_freem(m); + tx_pool[nm_i] = NULL; + } + + } else { if (unlikely(m == NULL)) { - D("mbuf allocation failed, XXX error"); - // XXX how do we proceed ? break ? - return -ENOMEM; + int event_consumed; + + /* This slot was used to place an event. */ + mtx_lock_spin(&kring->tx_event_lock); + event_consumed = (kring->tx_event == NULL); + mtx_unlock_spin(&kring->tx_event_lock); + if (!event_consumed) { + /* The event has not been consumed yet, + * still busy in the driver. */ + break; + } + /* The event has been consumed, we can go + * ahead. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf is still busy: its refcnt is 2. */ + break; } - } else if (GET_MBUF_REFCNT(m) != 1) { - break; /* This mbuf is still busy: its refcnt is 2. */ } + n++; nm_i = nm_next(nm_i, lim); #if 0 /* rate adaptation */ @@ -476,23 +714,17 @@ generic_netmap_tx_clean(struct netmap_kring *kring) return n; } - -/* - * We have pending packets in the driver between nr_hwtail +1 and hwcur. - * Compute a position in the middle, to be used to generate - * a notification. - */ +/* Compute a slot index in the middle between inf and sup. */ static inline u_int -generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) +ring_middle(u_int inf, u_int sup, u_int lim) { - u_int n = kring->nkr_num_slots; - u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int n = lim + 1; u_int e; - if (hwcur >= ntc) { - e = (hwcur + ntc) / 2; + if (sup >= inf) { + e = (sup + inf) / 2; } else { /* wrap around */ - e = (hwcur + n + ntc) / 2; + e = (sup + n + inf) / 2; if (e >= n) { e -= n; } @@ -506,35 +738,59 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) return e; } -/* - * We have pending packets in the driver between nr_hwtail+1 and hwcur. - * Schedule a notification approximately in the middle of the two. - * There is a race but this is only called within txsync which does - * a double check. 
- */ static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { + u_int lim = kring->nkr_num_slots - 1; struct mbuf *m; u_int e; + u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */ - if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + if (ntc == hwcur) { return; /* all buffers are free */ } - e = generic_tx_event_middle(kring, hwcur); + + /* + * We have pending packets in the driver between hwtail+1 + * and hwcur, and we have to chose one of these slot to + * generate a notification. + * There is a race but this is only called within txsync which + * does a double check. + */ +#if 0 + /* Choose a slot in the middle, so that we don't risk ending + * up in a situation where the client continuously wake up, + * fills one or a few TX slots and go to sleep again. */ + e = ring_middle(ntc, hwcur, lim); +#else + /* Choose the first pending slot, to be safe against driver + * reordering mbuf transmissions. */ + e = ntc; +#endif m = kring->tx_pool[e]; - ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? GET_MBUF_REFCNT(m) : -2 ); if (m == NULL) { - /* This can happen if there is already an event on the netmap - slot 'e': There is nothing to do. */ + /* An event is already in place. */ return; } - kring->tx_pool[e] = NULL; + + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + /* An event is already in place. */ + mtx_unlock_spin(&kring->tx_event_lock); + return; + } + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + kring->tx_event = m; + mtx_unlock_spin(&kring->tx_event_lock); + + kring->tx_pool[e] = NULL; + + ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 ); - // XXX wmb() ? - /* Decrement the refcount an free it if we have the last one. */ + /* Decrement the refcount. This will free it if we lose the race + * with the driver. 
*/ m_freem(m); smp_mb(); } @@ -551,6 +807,7 @@ static int generic_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ // j @@ -560,8 +817,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) IFRATE(rate_ctx.new.txsync++); - // TODO: handle the case of mbuf allocation failure - rmb(); /* @@ -569,72 +824,121 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ + struct nm_os_gen_arg a; + u_int event = -1; + + if (gna->txqdisc && nm_kr_txempty(kring)) { + /* In txqdisc mode, we ask for a delayed notification, + * but only when cur == hwtail, which means that the + * client is going to block. */ + event = ring_middle(nm_i, head, lim); + ND(3, "Place txqdisc event (hwcur=%u,event=%u," + "head=%u,hwtail=%u)", nm_i, event, head, + kring->nr_hwtail); + } + + a.ifp = ifp; + a.ring_nr = ring_nr; + a.head = a.tail = NULL; + while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; void *addr = NMB(na, slot); - /* device-specific */ struct mbuf *m; int tx_ret; NM_CHECK_ADDR_LEN(na, addr, len); - /* Tale a mbuf from the tx pool and copy in the user packet. */ + /* Tale a mbuf from the tx pool (replenishing the pool + * entry if necessary) and copy in the user packet. 
*/ m = kring->tx_pool[nm_i]; - if (unlikely(!m)) { - RD(5, "This should never happen"); - kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (unlikely(m == NULL)) { - D("mbuf allocation failed"); + if (unlikely(m == NULL)) { + kring->tx_pool[nm_i] = m = + nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na)); + if (m == NULL) { + RD(2, "Failed to replenish mbuf"); + /* Here we could schedule a timer which + * retries to replenish after a while, + * and notifies the client when it + * manages to replenish some slots. In + * any case we break early to avoid + * crashes. */ break; } + IFRATE(rate_ctx.new.txrepl++); } - /* XXX we should ask notifications when NS_REPORT is set, - * or roughly every half frame. We can optimize this - * by lazily requesting notifications only when a - * transmission fails. Probably the best way is to - * break on failures and set notifications when - * ring->cur == ring->tail || nm_i != cur + + a.m = m; + a.addr = addr; + a.len = len; + a.qevent = (nm_i == event); + /* When not in txqdisc mode, we should ask + * notifications when NS_REPORT is set, or roughly + * every half ring. To optimize this, we set a + * notification event when the client runs out of + * TX ring space, or when transmission fails. In + * the latter case we also break early. */ - tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + tx_ret = nm_os_generic_xmit_frame(&a); if (unlikely(tx_ret)) { - ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", - tx_ret, nm_i, head, kring->nr_hwtail); - /* - * No room for this mbuf in the device driver. - * Request a notification FOR A PREVIOUS MBUF, - * then call generic_netmap_tx_clean(kring) to do the - * double check and see if we can free more buffers. - * If there is space continue, else break; - * NOTE: the double check is necessary if the problem - * occurs in the txsync call after selrecord(). 
- * Also, we need some way to tell the caller that not - * all buffers were queued onto the device (this was - * not a problem with native netmap driver where space - * is preallocated). The bridge has a similar problem - * and we solve it there by dropping the excess packets. - */ - generic_set_tx_event(kring, nm_i); - if (generic_netmap_tx_clean(kring)) { /* space now available */ - continue; - } else { - break; + if (!gna->txqdisc) { + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring, gna->txqdisc)) { + /* space now available */ + continue; + } else { + break; + } } + + /* In txqdisc mode, the netmap-aware qdisc + * queue has the same length as the number of + * netmap slots (N). Since tail is advanced + * only when packets are dequeued, qdisc + * queue overrun cannot happen, so + * nm_os_generic_xmit_frame() did not fail + * because of that. + * However, packets can be dropped because + * carrier is off, or because our qdisc is + * being deactivated, or possibly for other + * reasons. In these cases, we just let the + * packet to be dropped. */ + IFRATE(rate_ctx.new.txdrop++); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); - IFRATE(rate_ctx.new.txpkt ++); + IFRATE(rate_ctx.new.txpkt++); } - - /* Update hwcur to the next slot to transmit. 
*/ - kring->nr_hwcur = nm_i; /* not head, we could break early */ + if (a.head != NULL) { + a.addr = NULL; + nm_os_generic_xmit_frame(&a); + } + /* Update hwcur to the next slot to transmit. Here nm_i + * is not necessarily head, we could break early. */ + kring->nr_hwcur = nm_i; } /* * Second, reclaim completed buffers */ - if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) { /* No more available slots? Set a notification event * on a netmap slot that will be cleaned in the future. * No doublecheck is performed, since txsync() will be @@ -642,58 +946,74 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ generic_set_tx_event(kring, nm_i); } - ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); - generic_netmap_tx_clean(kring); + generic_netmap_tx_clean(kring, gna->txqdisc); return 0; } /* - * This handler is registered (through netmap_catch_rx()) + * This handler is registered (through nm_os_catch_rx()) * within the attached network interface * in the RX subsystem, so that every mbuf passed up by * the driver can be stolen to the network stack. * Stolen packets are put in a queue where the * generic_netmap_rxsync() callback can extract them. + * Returns 1 if the packet was stolen, 0 otherwise. */ -void +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring; u_int work_done; - u_int rr = MBUF_RXQ(m); // receive ring number + u_int r = MBUF_RXQ(m); /* receive ring number */ - if (rr >= na->num_rx_rings) { - rr = rr % na->num_rx_rings; // XXX expensive... + if (r >= na->num_rx_rings) { + r = r % na->num_rx_rings; + } + + kring = &na->rx_rings[r]; + + if (kring->nr_mode == NKR_NETMAP_OFF) { + /* We must not intercept this mbuf. 
*/ + return 0; } /* limit the size of the queue */ - if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) { + /* This may happen when GRO/LRO features are enabled for + * the NIC driver when the generic adapter does not + * support RX scatter-gather. */ + RD(2, "Warning: driver pushed up big packet " + "(size=%d)", (int)MBUF_LEN(m)); + m_freem(m); + } else if (unlikely(mbq_len(&kring->rx_queue) > 1024)) { m_freem(m); } else { - mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + mbq_safe_enqueue(&kring->rx_queue, m); } if (netmap_generic_mit < 32768) { /* no rx mitigation, pass notification up */ - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); + netmap_generic_irq(na, r, &work_done); } else { /* same as send combining, filter notification if there is a * pending timer, otherwise pass it up and start a timer. */ - if (likely(netmap_mitigation_active(&gna->mit[rr]))) { + if (likely(nm_os_mitigation_active(&gna->mit[r]))) { /* Record that there is some pending work. */ - gna->mit[rr].mit_pending = 1; + gna->mit[r].mit_pending = 1; } else { - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(&gna->mit[rr]); + netmap_generic_irq(na, r, &work_done); + nm_os_mitigation_start(&gna->mit[r]); } } + + /* We have intercepted the mbuf. */ + return 1; } /* @@ -713,54 +1033,23 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + /* Adapter-specific variables. */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int nm_buf_len = NETMAP_BUF_SIZE(na); + struct mbq tmpq; + struct mbuf *m; + int avail; /* in bytes */ + int mlen; + int copy; + if (head > lim) return netmap_ring_reinit(kring); - /* - * First part: import newly received packets. 
- */ - if (netmap_no_pendintr || force_update) { - /* extract buffers from the rx queue, stop at most one - * slot before nr_hwcur (stop_i) - */ - uint16_t slot_flags = kring->nkr_slot_flags; - u_int stop_i = nm_prev(kring->nr_hwcur, lim); - - nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ - for (n = 0; nm_i != stop_i; n++) { - int len; - void *addr = NMB(na, &ring->slot[nm_i]); - struct mbuf *m; - - /* we only check the address here on generic rx rings */ - if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ - return netmap_ring_reinit(kring); - } - /* - * Call the locked version of the function. - * XXX Ideally we could grab a batch of mbufs at once - * and save some locking overhead. - */ - m = mbq_safe_dequeue(&kring->rx_queue); - if (!m) /* no more data */ - break; - len = MBUF_LEN(m); - m_copydata(m, 0, len, addr); - ring->slot[nm_i].len = len; - ring->slot[nm_i].flags = slot_flags; - m_freem(m); - nm_i = nm_next(nm_i, lim); - } - if (n) { - kring->nr_hwtail = nm_i; - IFRATE(rate_ctx.new.rxpkt += n); - } - kring->nr_kflags &= ~NKR_PENDINTR; - } + IFRATE(rate_ctx.new.rxsync++); - // XXX should we invert the order ? /* - * Second part: skip past packets that userspace has released. + * First part: skip past packets that userspace has released. + * This can possibly make room for the second part. */ nm_i = kring->nr_hwcur; if (nm_i != head) { @@ -773,7 +1062,106 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) } kring->nr_hwcur = head; } - IFRATE(rate_ctx.new.rxsync++); + + /* + * Second part: import newly received packets. + */ + if (!netmap_no_pendintr && !force_update) { + return 0; + } + + nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */ + + /* Compute the available space (in bytes) in this netmap ring. + * The first slot that is not considered in is the one before + * nr_hwcur. 
*/ + + avail = nm_prev(kring->nr_hwcur, lim) - nm_i; + if (avail < 0) + avail += lim + 1; + avail *= nm_buf_len; + + /* First pass: While holding the lock on the RX mbuf queue, + * extract as many mbufs as they fit the available space, + * and put them in a temporary queue. + * To avoid performing a per-mbuf division (mlen / nm_buf_len) to + * to update avail, we do the update in a while loop that we + * also use to set the RX slots, but without performing the copy. */ + mbq_init(&tmpq); + mbq_lock(&kring->rx_queue); + for (n = 0;; n++) { + m = mbq_peek(&kring->rx_queue); + if (!m) { + /* No more packets from the driver. */ + break; + } + + mlen = MBUF_LEN(m); + if (mlen > avail) { + /* No more space in the ring. */ + break; + } + + mbq_dequeue(&kring->rx_queue); + + while (mlen) { + copy = nm_buf_len; + if (mlen < copy) { + copy = mlen; + } + mlen -= copy; + avail -= nm_buf_len; + + ring->slot[nm_i].len = copy; + ring->slot[nm_i].flags = slot_flags | (mlen ? NS_MOREFRAG : 0); + nm_i = nm_next(nm_i, lim); + } + + mbq_enqueue(&tmpq, m); + } + mbq_unlock(&kring->rx_queue); + + /* Second pass: Drain the temporary queue, going over the used RX slots, + * and perform the copy out of the RX queue lock. */ + nm_i = kring->nr_hwtail; + + for (;;) { + void *nmaddr; + int ofs = 0; + int morefrag; + + m = mbq_dequeue(&tmpq); + if (!m) { + break; + } + + do { + nmaddr = NMB(na, &ring->slot[nm_i]); + /* We only check the address here on generic rx rings. 
*/ + if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ + m_freem(m); + mbq_purge(&tmpq); + mbq_fini(&tmpq); + return netmap_ring_reinit(kring); + } + + copy = ring->slot[nm_i].len; + m_copydata(m, ofs, copy, nmaddr); + ofs += copy; + morefrag = ring->slot[nm_i].flags & NS_MOREFRAG; + nm_i = nm_next(nm_i, lim); + } while (morefrag); + + m_freem(m); + } + + mbq_fini(&tmpq); + + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; return 0; } @@ -787,9 +1175,8 @@ generic_netmap_dtor(struct netmap_adapter *na) if (prev_na != NULL) { D("Released generic NA %p", gna); - if_rele(ifp); netmap_adapter_put(prev_na); - if (na->ifp == NULL) { + if (nm_iszombie(na)) { /* * The driver has been removed without releasing * the reference so we need to do it here. @@ -797,9 +1184,13 @@ generic_netmap_dtor(struct netmap_adapter *na) netmap_adapter_put(prev_na); } } - WNA(ifp) = prev_na; - D("Restored native NA %p", prev_na); + NM_ATTACH_NA(ifp, prev_na); + /* + * netmap_detach_common(), that it's called after this function, + * overrides WNA(ifp) if na->ifp is not NULL. 
+ */ na->ifp = NULL; + D("Restored native NA %p", prev_na); } /* @@ -823,7 +1214,7 @@ generic_netmap_attach(struct ifnet *ifp) num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ - generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ + nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); if (num_tx_desc == 0 || num_rx_desc == 0) { D("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc); @@ -855,12 +1246,23 @@ generic_netmap_attach(struct ifnet *ifp) ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", ifp->num_rx_queues, ifp->real_num_rx_queues); - generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); retval = netmap_attach_common(na); if (retval) { free(gna, M_DEVBUF); + return retval; } + gna->prev = NA(ifp); /* save old na */ + if (gna->prev != NULL) { + netmap_adapter_get(gna->prev); + } + NM_ATTACH_NA(ifp, na); + + nm_os_generic_set_features(gna); + + D("Created generic NA %p (prev %p)", gna, gna->prev); + return retval; } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 4aead85285fd..de21f29585e0 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,7 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -48,24 +49,34 @@ #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif -#if defined(CONFIG_NETMAP_V1000) -#define WITH_V1000 +#if defined(CONFIG_NETMAP_PTNETMAP_GUEST) +#define WITH_PTNETMAP_GUEST +#endif +#if defined(CONFIG_NETMAP_PTNETMAP_HOST) +#define WITH_PTNETMAP_HOST #endif -#else /* not linux */ +#elif defined (_WIN32) +#define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES +#define WITH_MONITOR +#define WITH_GENERIC +#else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC +#define WITH_PTNETMAP_HOST /* ptnetmap host support */ +#define WITH_PTNETMAP_GUEST /* ptnetmap guest support */ #endif #if defined(__FreeBSD__) -#include <sys/selinfo.h> #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) +#define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ @@ -77,9 +88,11 @@ #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo +#define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) -#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) +#define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ @@ -98,23 +111,20 @@ struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? 
*((m)->m_ext.ext_cnt) : -1) -#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt) +#define MBUF_REFCNT(m) ((m)->m_ext.ext_count) +#define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x #else -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) +#define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt) #endif -MALLOC_DECLARE(M_NETMAP); +#define MBUF_QUEUED(m) 1 struct nm_selinfo { struct selinfo si; struct mtx m; }; -void freebsd_selwakeup(struct nm_selinfo *si, int pri); // XXX linux struct, not used in FreeBSD struct net_device_ops { @@ -131,12 +141,16 @@ struct hrtimer { #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) -#define MBUF_IFP(m) ((m)->dev) -#define NM_SEND_UP(ifp, m) \ - do { \ - m->priority = NM_MAGIC_PRIORITY_RX; \ - netif_rx(m); \ - } while (0) +#define MBUF_TRANSMIT(na, ifp, m) \ + ({ \ + /* Avoid infinite recursion with generic. */ \ + m->priority = NM_MAGIC_PRIORITY_TX; \ + (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ + 0; \ + }) + +/* See explanation in nm_os_generic_xmit_frame. 
*/ +#define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int @@ -159,7 +173,51 @@ struct hrtimer { #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) + +#elif defined (_WIN32) +#include "../../../WINDOWS/win_glue.h" + +#define NM_SELRECORD_T IO_STACK_LOCATION +#define NM_SELINFO_T win_SELINFO // see win_glue.h +#define NM_LOCK_T win_spinlock_t // see win_glue.h +#define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ + +#define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); +#define NM_MTX_DESTROY(m) do { (void)(m); } while (0) +#define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) +#define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) +#define NM_MTX_ASSERT(m) assert(&m.Count>0) + +//These linknames are for the NDIS driver +#define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" +#define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" + +//Definition of internal driver-to-driver ioctl codes +#define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) +#define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) + +//Empty data structures are not permitted by MSVC compiler +//XXX_ale, try to solve this problem +struct net_device_ops{ + char data[1]; +}; +typedef struct ethtool_ops{ + char data[1]; +}; +typedef struct hrtimer{ + KTIMER timer; + BOOLEAN active; + KDPC deferred_proc; +}; + +/* MSVC does not have likely/unlikely support */ +#ifdef _MSC_VER +#define likely(x) (x) +#define unlikely(x) (x) +#else +#define likely(x) __builtin_expect((long)!!(x), 1L) +#define unlikely(x) __builtin_expect((long)!!(x), 0L) +#endif //_MSC_VER #else @@ -167,6 +225,13 @@ struct hrtimer { #endif /* end - platform-specific code */ +#ifndef _WIN32 /* support for emulated sysctl */ +#define SYSBEGIN(x) +#define SYSEND +#endif /* _WIN32 */ + +#define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + 
#define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) @@ -201,8 +266,36 @@ struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; +/* os-specific NM_SELINFO_T initialzation/destruction functions */ +void nm_os_selinfo_init(NM_SELINFO_T *); +void nm_os_selinfo_uninit(NM_SELINFO_T *); + const char *nm_dump_buf(char *p, int len, int lim, char *dst); +void nm_os_selwakeup(NM_SELINFO_T *si); +void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); + +int nm_os_ifnet_init(void); +void nm_os_ifnet_fini(void); +void nm_os_ifnet_lock(void); +void nm_os_ifnet_unlock(void); + +void nm_os_get_module(void); +void nm_os_put_module(void); + +void netmap_make_zombie(struct ifnet *); +void netmap_undo_zombie(struct ifnet *); + +/* passes a packet up to the host stack. + * If the packet is sent (or dropped) immediately it returns NULL, + * otherwise it links the packet to prev and returns m. + * In this case, a final call with m=NULL and prev != NULL will send up + * the entire chain to the host stack. + */ +void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); + +int nm_os_mbuf_has_offld(struct mbuf *m); + #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; @@ -299,6 +392,19 @@ struct netmap_kring { uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. #define NKR_EXCLUSIVE 0x2 /* exclusive binding */ +#define NKR_FORWARD 0x4 /* (host ring only) there are + packets to forward + */ +#define NKR_NEEDRING 0x8 /* ring needed even if users==0 + * (used internally by pipes and + * by ptnetmap host ports) + */ + + uint32_t nr_mode; + uint32_t nr_pending_mode; +#define NKR_NETMAP_OFF 0x0 +#define NKR_NETMAP_ON 0x1 + uint32_t nkr_num_slots; /* @@ -344,13 +450,14 @@ struct netmap_kring { * store incoming mbufs in a queue that is drained by * a rxsync. 
*/ - struct mbuf **tx_pool; - // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ - struct mbq rx_queue; /* intercepted rx mbufs. */ + struct mbuf **tx_pool; + struct mbuf *tx_event; /* TX event used as a notification */ + NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ + struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ - uint32_t ring_id; /* debugging */ + uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ @@ -372,9 +479,6 @@ struct netmap_kring { struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ - struct netmap_ring *save_ring; /* pointer to hidden rings - * (see netmap_pipe.c for details) - */ #endif /* WITH_PIPES */ #ifdef WITH_VALE @@ -397,8 +501,28 @@ struct netmap_kring { uint32_t mon_tail; /* last seen slot on rx */ uint32_t mon_pos; /* index of this ring in the monitored ring array */ #endif -} __attribute__((__aligned__(64))); +} +#ifdef _WIN32 +__declspec(align(64)); +#else +__attribute__((__aligned__(64))); +#endif +/* return 1 iff the kring needs to be turned on */ +static inline int +nm_kring_pending_on(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_ON && + kring->nr_mode == NKR_NETMAP_OFF; +} + +/* return 1 iff the kring needs to be turned off */ +static inline int +nm_kring_pending_off(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_OFF && + kring->nr_mode == NKR_NETMAP_ON; +} /* return the next index, with wraparound */ static inline uint32_t @@ -514,6 +638,8 @@ struct netmap_adapter { */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ +#define NAF_PTNETMAP_HOST 256 /* the adapter supports ptnetmap in the host */ +#define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used 
internally and * cannot be registered from userspace */ @@ -592,10 +718,14 @@ struct netmap_adapter { * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. + * This callback pointer is actually used only to initialize + * kring->nm_notify. + * Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); + void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); @@ -640,14 +770,14 @@ struct netmap_adapter { /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating - * buffer addresses, and the total number of buffers. + * buffer addresses, the total number of buffers and the buffer size. */ struct netmap_mem_d *nm_mem; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by - * bwrap and LINUX/v1000. + * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; @@ -656,6 +786,9 @@ struct netmap_adapter { int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + char name[64]; }; @@ -721,8 +854,6 @@ struct netmap_vp_adapter { /* VALE software port */ struct nm_bridge *na_bdg; int retry; - /* Offset of ethernet header for each packet. */ - u_int virt_hdr_len; /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ @@ -767,6 +898,13 @@ struct netmap_generic_adapter { /* emulated device */ #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif + /* Is the adapter able to use multiple RX slots to scatter + * each packet pushed up by the driver? 
*/ + int rxsg; + + /* Is the transmission path controlled by a netmap-aware + * device queue (i.e. qdisc on linux)? */ + int txqdisc; }; #endif /* WITH_GENERIC */ @@ -777,7 +915,7 @@ netmap_real_rings(struct netmap_adapter *na, enum txrx t) } #ifdef WITH_VALE - +struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * @@ -827,9 +965,6 @@ struct netmap_bwrap_adapter { struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ - /* backup of the hwna memory allocator */ - struct netmap_mem_d *save_nmd; - /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need @@ -838,10 +973,10 @@ struct netmap_bwrap_adapter { * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; + struct nm_bdg_polling_state *na_polling_state; }; int netmap_bwrap_attach(const char *name, struct netmap_adapter *); - #endif /* WITH_VALE */ #ifdef WITH_PIPES @@ -876,56 +1011,122 @@ nm_kr_rxspace(struct netmap_kring *k) return space; } +/* return slots reserved to tx clients */ +#define nm_kr_txspace(_k) nm_kr_rxspace(_k) -/* True if no space in the tx ring. only valid after txsync_prologue */ + +/* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { return kring->rcur == kring->nr_hwtail; } +/* True if no more completed slots in the rx ring, only valid after + * rxsync_prologue */ +#define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. - * We only care for 0 or !=0 as a return code. 
+ * also check that the ring has not been stopped or locked */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 +#define NM_KR_BUSY 1 /* some other thread is syncing the ring */ +#define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ +#define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ +/* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } -static __inline int nm_kr_tryget(struct netmap_kring *kr) +/* true if the ifp that backed the adapter has disappeared (e.g., the + * driver has been unloaded) + */ +static inline int nm_iszombie(struct netmap_adapter *na); + +/* try to obtain exclusive right to issue the *sync() operations on the ring. + * The right is obtained and must be later relinquished via nm_kr_put() if and + * only if nm_kr_tryget() returns 0. + * If can_sleep is 1 there are only two other possible outcomes: + * - the function returns NM_KR_BUSY + * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr + * (if non-null) + * In both cases the caller will typically skip the ring, possibly collecting + * errors along the way. + * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. + * In the latter case, the function may also return NM_KR_LOCKED and leave *perr + * untouched: ideally, the caller should try again at a later time. 
+ */ +static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { + int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; +retry: + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); + /* we should not return NM_KR_BUSY if the ring was + * actually stopped, so check another time after + * the barrier provided by the atomic operation + */ + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; + } + + if (unlikely(nm_iszombie(kr->na))) { + stopped = NM_KR_STOPPED; + goto stop; + } + + return unlikely(busy) ? NM_KR_BUSY : 0; + +stop: + if (!busy) nm_kr_put(kr); - return NM_KR_STOPPED; + if (stopped == NM_KR_STOPPED) { +/* if POLLERR is defined we want to use it to simplify netmap_poll(). + * Otherwise, any non-zero value will do. + */ +#ifdef POLLERR +#define NM_POLLERR POLLERR +#else +#define NM_POLLERR 1 +#endif /* POLLERR */ + if (perr) + *perr |= NM_POLLERR; +#undef NM_POLLERR + } else if (can_sleep) { + tsleep(kr, 0, "NM_KR_TRYGET", 4); + goto retry; } - return 0; + return stopped; } -static __inline void nm_kr_get(struct netmap_kring *kr) +/* put the ring in the 'stopped' state and wait for the current user (if any) to + * notice. 
stopped must be either NM_KR_STOPPED or NM_KR_LOCKED + */ +static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { + kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } +/* restart a ring after a stop */ +static __inline void nm_kr_start(struct netmap_kring *kr) +{ + kr->nkr_stopped = 0; + nm_kr_put(kr); +} + /* * The following functions are used by individual drivers to @@ -953,10 +1154,26 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* Return codes for netmap_*x_irq. */ +enum { + /* Driver should do normal interrupt processing, e.g. because + * the interface is not in netmap mode. */ + NM_IRQ_PASS = 0, + /* Port is in netmap mode, and the interrupt work has been + * completed. The driver does not have to notify netmap + * again before the next interrupt. */ + NM_IRQ_COMPLETED = -1, + /* Port is in netmap mode, but the interrupt work has not been + * completed. The driver has to make sure netmap will be + * notified again soon, even if no more interrupts come (e.g. + * on Linux the driver should not call napi_complete()). */ + NM_IRQ_RESCHED = -2, +}; + /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) -void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); +int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE @@ -986,35 +1203,74 @@ nm_native_on(struct netmap_adapter *na) return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } +static inline int +nm_iszombie(struct netmap_adapter *na) +{ + return na == NULL || (na->na_flags & NAF_ZOMBIE); +} + +static inline void +nm_update_hostrings_mode(struct netmap_adapter *na) +{ + /* Process nr_mode and nr_pending_mode for host rings. 
*/ + na->tx_rings[na->num_tx_rings].nr_mode = + na->tx_rings[na->num_tx_rings].nr_pending_mode; + na->rx_rings[na->num_rx_rings].nr_mode = + na->rx_rings[na->num_rx_rings].nr_pending_mode; +} + /* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; + /* We do the setup for intercepting packets only if we are the + * first user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + na->na_flags |= NAF_NETMAP_ON; #ifdef IFCAP_NETMAP /* or FreeBSD ? */ ifp->if_capenable |= IFCAP_NETMAP; #endif -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; +#elif defined (_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //na->if_transmit = ifp->if_transmit; + //ifp->if_transmit = netmap_transmit; #else na->if_transmit = (void *)ifp->netdev_ops; ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops; ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto; #endif + nm_update_hostrings_mode(na); } - static inline void nm_clear_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; -#ifdef __FreeBSD__ + /* We undo the setup for intercepting packets only if we are the + * last user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + + nm_update_hostrings_mode(na); + +#if defined(__FreeBSD__) ifp->if_transmit = na->if_transmit; +#elif defined(_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //ifp->if_transmit = na->if_transmit; #else ifp->netdev_ops = (void *)na->if_transmit; ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool; @@ -1025,6 +1281,28 @@ nm_clear_native_flags(struct netmap_adapter *na) #endif } +/* + * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap + * kthreads. 
+ * We need netmap_ring* parameter, because in ptnetmap it is decoupled + * from host kring. + * The user-space ring pointers (head/cur/tail) are shared through + * CSB between host and guest. + */ + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. + */ +uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); + + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); + /* check/fix address and len in tx rings */ #if 1 /* debug version */ @@ -1080,6 +1358,9 @@ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); */ void netmap_krings_delete(struct netmap_adapter *na); +int netmap_hw_krings_create(struct netmap_adapter *na); +void netmap_hw_krings_delete(struct netmap_adapter *na); + /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify @@ -1088,16 +1369,18 @@ void netmap_krings_delete(struct netmap_adapter *na); void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. 
*/ void netmap_set_all_rings(struct netmap_adapter *, int stopped); -/* convenience wrappers for netmap_set_all_rings, used in drivers */ +/* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, uint32_t flags); - +void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); -int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create); +void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); @@ -1124,12 +1407,11 @@ struct netmap_bdg_ops { u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *); +#define NM_BRIDGES 8 /* number of bridges */ #define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) -#define NM_NAME "vale" /* prefix for bridge port name */ - /* these are redefined in case of no VALE support */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); struct nm_bridge *netmap_init_bridges2(u_int); @@ -1181,14 +1463,13 @@ void netmap_bns_getbridges(struct nm_bridge **, u_int *); #endif /* Various prototypes */ -int netmap_poll(struct cdev *dev, int events, struct thread *td); +int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); -int netmap_dtor_locked(struct netmap_priv_d *priv); -int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); +int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t 
data, struct thread *); /* netmap_adapter creation/destruction */ @@ -1228,8 +1509,8 @@ int netmap_adapter_put(struct netmap_adapter *na); /* * module variables */ -#define NETMAP_BUF_BASE(na) ((na)->na_lut.lut[0].vaddr) -#define NETMAP_BUF_SIZE(na) ((na)->na_lut.objsize) +#define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) +#define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; extern int netmap_verbose; // XXX debugging @@ -1245,10 +1526,12 @@ enum { /* verbose flags */ }; extern int netmap_txsync_retry; +extern int netmap_adaptive_io; +extern int netmap_flags; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; -extern int netmap_use_count; +extern int netmap_generic_txqdisc; /* * NA returns a pointer to the struct netmap adapter from the ifp, @@ -1257,37 +1540,27 @@ extern int netmap_use_count; #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* - * Macros to determine if an interface is netmap capable or netmap enabled. - * See the magic field in struct netmap_adapter. - */ -#ifdef __FreeBSD__ -/* - * on FreeBSD just use if_capabilities and if_capenable. - */ -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ - (ifp)->if_capabilities & IFCAP_NETMAP ) - -#define NETMAP_SET_CAPABLE(ifp) \ - (ifp)->if_capabilities |= IFCAP_NETMAP - -#else /* linux */ - -/* - * on linux: - * we check if NA(ifp) is set and its first element has a related + * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we + * overload another pointer in the netdev. + * + * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. 
*/ #define NETMAP_MAGIC 0x52697a7a -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ +#define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) -#define NETMAP_SET_CAPABLE(ifp) \ - NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC +#define NM_ATTACH_NA(ifp, na) do { \ + WNA(ifp) = na; \ + if (NA(ifp)) \ + NA(ifp)->magic = \ + ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ +} while(0) -#endif /* linux */ +#define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ @@ -1331,6 +1604,8 @@ netmap_reload_map(struct netmap_adapter *na, } } +#elif defined(_WIN32) + #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); @@ -1341,8 +1616,8 @@ netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (0 && map) { - *map = dma_map_single(na->pdev, buf, na->na_lut.objsize, - DMA_BIDIRECTIONAL); + *map = dma_map_single(na->pdev, buf, NETMAP_BUF_SIZE(na), + DMA_BIDIRECTIONAL); } } @@ -1350,11 +1625,11 @@ static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, - DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); } } @@ -1362,7 +1637,7 @@ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, @@ -1473,7 +1748,11 @@ PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) struct lut_entry *lut = na->na_lut.lut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; +#ifndef _WIN32 *pp = (i >= na->na_lut.objtotal) ? 
lut[0].paddr : lut[i].paddr; +#else + *pp = (i >= na->na_lut.objtotal) ? (uint64_t)lut[0].paddr.QuadPart : (uint64_t)lut[i].paddr.QuadPart; +#endif return ret; } @@ -1497,8 +1776,9 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; + struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ - u_int np_qfirst[NR_TXRX], + u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ @@ -1512,6 +1792,26 @@ struct netmap_priv_d { struct thread *np_td; /* kqueue, just debugging */ }; +struct netmap_priv_d *netmap_priv_new(void); +void netmap_priv_delete(struct netmap_priv_d *); + +static inline int nm_kring_pending(struct netmap_priv_d *np) +{ + struct netmap_adapter *na = np->np_na; + enum txrx t; + int i; + + for_rx_tx(t) { + for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + if (kring->nr_mode != kring->nr_pending_mode) { + return 1; + } + } + } + return 0; +} + #ifdef WITH_MONITOR struct netmap_monitor_adapter { @@ -1530,13 +1830,36 @@ struct netmap_monitor_adapter { * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; + +int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); +int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); + +/* + * the generic transmit routine is passed a structure to optionally + * build a queue of descriptors, in an OS-specific way. + * The payload is at addr, if non-null, and the routine should send or queue + * the packet, returning 0 if successful, 1 on failure. + * + * At the end, if head is non-null, there will be an additional call + * to the function with addr = NULL; this should tell the OS-specific + * routine to send the queue and free any resources. Failure is ignored. 
+ */ +struct nm_os_gen_arg { + struct ifnet *ifp; + void *m; /* os-specific mbuf-like object */ + void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ + void *addr; /* payload of current packet */ + u_int len; /* packet length */ + u_int ring_nr; /* packet length */ + u_int qevent; /* in txqdisc mode, place an event on this mbuf */ +}; + +int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); +int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); +void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); +void nm_os_generic_set_features(struct netmap_generic_adapter *gna); -int netmap_catch_rx(struct netmap_generic_adapter *na, int intercept); -void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; -void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); -int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); -int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); -void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { @@ -1546,6 +1869,8 @@ netmap_generic_getifp(struct netmap_generic_adapter *gna) return gna->up.up.ifp; } +void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); + //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); @@ -1558,16 +1883,16 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. 
*/ -void netmap_mitigation_init(struct nm_generic_mit *mit, int idx, +void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); -void netmap_mitigation_start(struct nm_generic_mit *mit); -void netmap_mitigation_restart(struct nm_generic_mit *mit); -int netmap_mitigation_active(struct nm_generic_mit *mit); -void netmap_mitigation_cleanup(struct nm_generic_mit *mit); +void nm_os_mitigation_start(struct nm_generic_mit *mit); +void nm_os_mitigation_restart(struct nm_generic_mit *mit); +int nm_os_mitigation_active(struct nm_generic_mit *mit); +void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); +#else /* !WITH_GENERIC */ +#define generic_netmap_attach(ifp) (EOPNOTSUPP) #endif /* WITH_GENERIC */ - - /* Shared declarations for the VALE switch. */ /* @@ -1656,22 +1981,111 @@ struct nm_ipv6hdr { */ #define rawsum_t uint32_t -rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); -uint16_t nm_csum_ipv4(struct nm_iphdr *iph); -void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph); +void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check); -void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check); -uint16_t nm_csum_fold(rawsum_t cur_sum); +uint16_t nm_os_csum_fold(rawsum_t cur_sum); void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, u_int *j, u_int lim, u_int *howmany); /* persistent virtual port routines */ -int nm_vi_persist(const char *, struct ifnet **); -void nm_vi_detach(struct ifnet *); -void nm_vi_init_index(void); +int nm_os_vi_persist(const char *, struct ifnet **); +void nm_os_vi_detach(struct 
ifnet *); +void nm_os_vi_init_index(void); + +/* + * kernel thread routines + */ +struct nm_kthread; /* OS-specific kthread - opaque */ +typedef void (*nm_kthread_worker_fn_t)(void *data); + +/* kthread configuration */ +struct nm_kthread_cfg { + long type; /* kthread type/identifier */ + struct ptnet_ring_cfg event; /* event/ioctl fd */ + nm_kthread_worker_fn_t worker_fn; /* worker function */ + void *worker_private;/* worker parameter */ + int attach_user; /* attach kthread to user process */ +}; +/* kthread configuration */ +struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg); +int nm_os_kthread_start(struct nm_kthread *); +void nm_os_kthread_stop(struct nm_kthread *); +void nm_os_kthread_delete(struct nm_kthread *); +void nm_os_kthread_wakeup_worker(struct nm_kthread *nmk); +void nm_os_kthread_send_irq(struct nm_kthread *); +void nm_os_kthread_set_affinity(struct nm_kthread *, int); +u_int nm_os_ncpus(void); + +#ifdef WITH_PTNETMAP_HOST +/* + * netmap adapter for host ptnetmap ports + */ +struct netmap_pt_host_adapter { + struct netmap_adapter up; + + struct netmap_adapter *parent; + int (*parent_nm_notify)(struct netmap_kring *kring, int flags); + void *ptns; +}; +/* ptnetmap HOST routines */ +int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na); +static inline int +nm_ptnetmap_host_on(struct netmap_adapter *na) +{ + return na && na->na_flags & NAF_PTNETMAP_HOST; +} +#else /* !WITH_PTNETMAP_HOST */ +#define netmap_get_pt_host_na(nmr, _2, _3) \ + ((nmr)->nr_flags & (NR_PTNETMAP_HOST) ? 
EOPNOTSUPP : 0) +#define ptnetmap_ctl(_1, _2) EINVAL +#define nm_ptnetmap_host_on(_1) EINVAL +#endif /* !WITH_PTNETMAP_HOST */ + +#ifdef WITH_PTNETMAP_GUEST +/* ptnetmap GUEST routines */ + +typedef uint32_t (*nm_pt_guest_ptctl_t)(struct ifnet *, uint32_t); + +/* + * netmap adapter for guest ptnetmap ports + */ +struct netmap_pt_guest_adapter { + /* The netmap adapter to be used by netmap applications. + * This field must be the first, to allow upcast. */ + struct netmap_hw_adapter hwup; + + /* The netmap adapter to be used by the driver. */ + struct netmap_hw_adapter dr; + + void *csb; + + /* Reference counter to track users of backend netmap port: the + * network stack and netmap clients. + * Used to decide when we need (de)allocate krings/rings and + * start (stop) ptnetmap kthreads. */ + int backend_regifs; + +}; + +int netmap_pt_guest_attach(struct netmap_adapter *, void *, + unsigned int, nm_pt_guest_ptctl_t); +struct ptnet_ring; +bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +int ptnet_nm_krings_create(struct netmap_adapter *na); +void ptnet_nm_krings_delete(struct netmap_adapter *na); +void ptnet_nm_dtor(struct netmap_adapter *na); +#endif /* WITH_PTNETMAP_GUEST */ #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c index 503f5a13aa95..3eb971b74561 100644 --- a/sys/dev/netmap/netmap_mbq.c +++ b/sys/dev/netmap/netmap_mbq.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,6 +31,8 @@ #ifdef linux #include "bsd_glue.h" +#elif defined (_WIN32) +#include "win_glue.h" #else /* __FreeBSD__ */ #include <sys/param.h> #include <sys/lock.h> @@ -152,12 +155,12 @@ void mbq_safe_purge(struct mbq *q) } -void mbq_safe_destroy(struct mbq *q) +void mbq_safe_fini(struct mbq *q) { mtx_destroy(&q->lock); } -void mbq_destroy(struct mbq *q) +void mbq_fini(struct mbq *q) { } diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index 455ca8a2c3ac..9dafa8b1149b 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -40,6 +41,8 @@ /* XXX probably rely on a previous definition of SPINLOCK_T */ #ifdef linux #define SPINLOCK_T safe_spinlock_t +#elif defined (_WIN32) +#define SPINLOCK_T win_spinlock_t #else #define SPINLOCK_T struct mtx #endif @@ -52,16 +55,21 @@ struct mbq { SPINLOCK_T lock; }; -/* XXX "destroy" does not match "init" as a name. - * We should also clarify whether init can be used while +/* We should clarify whether init can be used while * holding a lock, and whether mbq_safe_destroy() is a NOP. */ void mbq_init(struct mbq *q); -void mbq_destroy(struct mbq *q); +void mbq_fini(struct mbq *q); void mbq_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_dequeue(struct mbq *q); void mbq_purge(struct mbq *q); +static inline struct mbuf * +mbq_peek(struct mbq *q) +{ + return q->head ? 
q->head : NULL; +} + static inline void mbq_lock(struct mbq *q) { @@ -76,7 +84,7 @@ mbq_unlock(struct mbq *q) void mbq_safe_init(struct mbq *q); -void mbq_safe_destroy(struct mbq *q); +void mbq_safe_fini(struct mbq *q); void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_safe_dequeue(struct mbq *q); void mbq_safe_purge(struct mbq *q); diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index fd0c06bb8b57..b54c9813c33f 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/malloc.h> +#include <sys/kernel.h> /* MALLOC_DEFINE */ #include <sys/proc.h> #include <vm/vm.h> /* vtophys */ #include <vm/pmap.h> /* vtophys */ @@ -48,13 +52,26 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <machine/bus.h> /* bus_dmamap_* */ +/* M_NETMAP only used in here */ +MALLOC_DECLARE(M_NETMAP); +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + #endif /* __FreeBSD__ */ +#ifdef _WIN32 +#include <win_glue.h> +#endif + #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <net/netmap_virt.h> #include "netmap_mem2.h" -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#ifdef _WIN32_USE_SMALL_GENERIC_DEVICES_MEMORY +#define NETMAP_BUF_MAX_NUM 8*4096 /* if too big takes too much time to allocate */ +#else +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#endif #define NETMAP_POOL_MAX_NAMSZ 32 @@ -111,7 +128,7 @@ struct netmap_obj_pool { struct netmap_mem_ops { - void (*nmd_get_lut)(struct netmap_mem_d *, 
struct netmap_lut*); + int (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*); int (*nmd_get_info)(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); @@ -130,6 +147,39 @@ struct netmap_mem_ops { typedef uint16_t nm_memid_t; +/* + * Shared info for netmap allocator + * + * Each allocator contains this structur as first netmap_if. + * In this way, we can share same details about allocator + * to the VM. + * Used in ptnetmap. + */ +struct netmap_mem_shared_info { +#ifndef _WIN32 + struct netmap_if up; /* ends with a 0-sized array, which VSC does not like */ +#else /* !_WIN32 */ + char up[sizeof(struct netmap_if)]; +#endif /* !_WIN32 */ + uint64_t features; +#define NMS_FEAT_BUF_POOL 0x0001 +#define NMS_FEAT_MEMSIZE 0x0002 + + uint32_t buf_pool_offset; + uint32_t buf_pool_objtotal; + uint32_t buf_pool_objsize; + uint32_t totalsize; +}; + +#define NMS_NAME "nms_info" +#define NMS_VERSION 1 +static const struct netmap_if nms_if_blueprint = { + .ni_name = NMS_NAME, + .ni_version = NMS_VERSION, + .ni_tx_rings = 0, + .ni_rx_rings = 0 +}; + struct netmap_mem_d { NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ @@ -151,6 +201,9 @@ struct netmap_mem_d { struct netmap_mem_ops *ops; }; +/* + * XXX need to fix the case of t0 == void + */ #define NMD_DEFCB(t0, name) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd) \ @@ -186,7 +239,7 @@ netmap_mem_##name(struct netmap_adapter *na, t1 a1) \ return na->nm_mem->ops->nmd_##name(na, a1); \ } -NMD_DEFCB1(void, get_lut, struct netmap_lut *); +NMD_DEFCB1(int, get_lut, struct netmap_lut *); NMD_DEFCB3(int, get_info, u_int *, u_int *, uint16_t *); NMD_DEFCB1(vm_paddr_t, ofstophys, vm_ooffset_t); static int netmap_mem_config(struct netmap_mem_d *); @@ -201,7 +254,7 @@ NMD_DEFNACB(void, rings_delete); static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *); static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *); -static int 
nm_mem_assign_group(struct netmap_mem_d *, device_t); +static int nm_mem_assign_group(struct netmap_mem_d *, struct device *); #define NMA_LOCK_INIT(n) NM_MTX_INIT((n)->nm_mtx) #define NMA_LOCK_DESTROY(n) NM_MTX_DESTROY((n)->nm_mtx) @@ -248,7 +301,9 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) if (nm_mem_assign_group(nmd, na->pdev) < 0) { return ENOMEM; } else { - nmd->ops->nmd_finalize(nmd); + NMA_LOCK(nmd); + nmd->lasterr = nmd->ops->nmd_finalize(nmd); + NMA_UNLOCK(nmd); } if (!nmd->lasterr && na->pdev) @@ -257,26 +312,83 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) return nmd->lasterr; } +static int netmap_mem_init_shared_info(struct netmap_mem_d *nmd); + void netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na) { NMA_LOCK(nmd); netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na); + if (nmd->active == 1) { + u_int i; + + /* + * Reset the allocator when it falls out of use so that any + * pool resources leaked by unclean application exits are + * reclaimed. + */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p; + u_int j; + + p = &nmd->pools[i]; + p->objfree = p->objtotal; + /* + * Reproduce the net effect of the M_ZERO malloc() + * and marking of free entries in the bitmap that + * occur in finalize_obj_allocator() + */ + memset(p->bitmap, + '\0', + sizeof(uint32_t) * ((p->objtotal + 31) / 32)); + + /* + * Set all the bits in the bitmap that have + * corresponding buffers to 1 to indicate they are + * free. + */ + for (j = 0; j < p->objtotal; j++) { + if (p->lut[j].vaddr != NULL) { + p->bitmap[ (j>>5) ] |= ( 1 << (j & 31) ); + } + } + } + + /* + * Per netmap_mem_finalize_all(), + * buffers 0 and 1 are reserved + */ + nmd->pools[NETMAP_BUF_POOL].objfree -= 2; + if (nmd->pools[NETMAP_BUF_POOL].bitmap) { + /* XXX This check is a workaround that prevents a + * NULL pointer crash which currently happens only + * with ptnetmap guests. 
Also, + * netmap_mem_init_shared_info must not be called + * by ptnetmap guest. */ + nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; + + /* expose info to the ptnetmap guest */ + netmap_mem_init_shared_info(nmd); + } + } + nmd->ops->nmd_deref(nmd); + NMA_UNLOCK(nmd); - return nmd->ops->nmd_deref(nmd); } /* accessor functions */ -static void +static int netmap_mem2_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) { lut->lut = nmd->pools[NETMAP_BUF_POOL].lut; lut->objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; lut->objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + + return 0; } -struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, .num = 100, @@ -291,10 +403,10 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; -struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, - .num = 1, + .num = 2, }, [NETMAP_RING_POOL] = { .size = 5*PAGE_SIZE, @@ -348,11 +460,12 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ }; -struct netmap_mem_d *netmap_last_mem_d = &nm_mem; +static struct netmap_mem_d *netmap_last_mem_d = &nm_mem; /* blueprint for the private memory allocators */ extern struct netmap_mem_ops netmap_mem_private_ops; /* forward */ -const struct netmap_mem_d nm_blueprint = { +/* XXX clang is not happy about using name as a print format */ +static const struct netmap_mem_d nm_blueprint = { .pools = { [NETMAP_IF_POOL] = { .name = "%s_if", @@ -388,6 +501,8 @@ const struct netmap_mem_d nm_blueprint = { #define DECLARE_SYSCTLS(id, name) \ + SYSBEGIN(mem2_ ## name); \ + SYSCTL_DECL(_dev_netmap); /* leave it here, easier for porting */ \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ @@ -401,22 +516,21 @@ const struct netmap_mem_d nm_blueprint = { "Default size of private netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ - "Default number of private netmap " STRINGIFY(name) "s") + "Default number of private netmap " STRINGIFY(name) "s"); \ + SYSEND -SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +/* call with NMA_LOCK(&nm_mem) held */ static int -nm_mem_assign_id(struct netmap_mem_d *nmd) +nm_mem_assign_id_locked(struct netmap_mem_d *nmd) { nm_memid_t id; struct netmap_mem_d *scan = netmap_last_mem_d; int error = ENOMEM; - NMA_LOCK(&nm_mem); - do { /* we rely on unsigned wrap around */ id = scan->nm_id + 1; @@ -435,10 +549,22 @@ nm_mem_assign_id(struct netmap_mem_d *nmd) } } while (scan != netmap_last_mem_d); - NMA_UNLOCK(&nm_mem); return error; } +/* call with NMA_LOCK(&nm_mem) *not* held */ +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + int ret; + + NMA_LOCK(&nm_mem); + ret = nm_mem_assign_id_locked(nmd); + NMA_UNLOCK(&nm_mem); + 
+ return ret; +} + static void nm_mem_release_id(struct netmap_mem_d *nmd) { @@ -456,7 +582,7 @@ nm_mem_release_id(struct netmap_mem_d *nmd) } static int -nm_mem_assign_group(struct netmap_mem_d *nmd, device_t dev) +nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev) { int err = 0, id; id = nm_iommu_group_id(dev); @@ -494,8 +620,13 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) if (offset >= p[i].memtotal) continue; // now lookup the cluster's address +#ifndef _WIN32 pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) + offset % p[i]._objsize; +#else + pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr); + pa.QuadPart += offset % p[i]._objsize; +#endif NMA_UNLOCK(nmd); return pa; } @@ -508,7 +639,110 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) + p[NETMAP_RING_POOL].memtotal + p[NETMAP_BUF_POOL].memtotal); NMA_UNLOCK(nmd); +#ifndef _WIN32 return 0; // XXX bad address +#else + vm_paddr_t res; + res.QuadPart = 0; + return res; +#endif +} + +#ifdef _WIN32 + +/* + * win32_build_virtual_memory_for_userspace + * + * This function get all the object making part of the pools and maps + * a contiguous virtual memory space for the userspace + * It works this way + * 1 - allocate a Memory Descriptor List wide as the sum + * of the memory needed for the pools + * 2 - cycle all the objects in every pool and for every object do + * + * 2a - cycle all the objects in every pool, get the list + * of the physical address descriptors + * 2b - calculate the offset in the array of pages desciptor in the + * main MDL + * 2c - copy the descriptors of the object in the main MDL + * + * 3 - return the resulting MDL that needs to be mapped in userland + * + * In this way we will have an MDL that describes all the memory for the + * objects in a single object +*/ + +PMDL +win32_build_user_vm_map(struct netmap_mem_d* nmd) +{ + int i, j; + u_int memsize, memflags, ofs = 0; + PMDL mainMdl, tempMdl; + + if 
(netmap_mem_get_info(nmd, &memsize, &memflags, NULL)) { + D("memory not finalised yet"); + return NULL; + } + + mainMdl = IoAllocateMdl(NULL, memsize, FALSE, FALSE, NULL); + if (mainMdl == NULL) { + D("failed to allocate mdl"); + return NULL; + } + + NMA_LOCK(nmd); + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = &nmd->pools[i]; + int clsz = p->_clustsize; + int clobjs = p->_clustentries; /* objects per cluster */ + int mdl_len = sizeof(PFN_NUMBER) * BYTES_TO_PAGES(clsz); + PPFN_NUMBER pSrc, pDst; + + /* each pool has a different cluster size so we need to reallocate */ + tempMdl = IoAllocateMdl(p->lut[0].vaddr, clsz, FALSE, FALSE, NULL); + if (tempMdl == NULL) { + NMA_UNLOCK(nmd); + D("fail to allocate tempMdl"); + IoFreeMdl(mainMdl); + return NULL; + } + pSrc = MmGetMdlPfnArray(tempMdl); + /* create one entry per cluster, the lut[] has one entry per object */ + for (j = 0; j < p->numclusters; j++, ofs += clsz) { + pDst = &MmGetMdlPfnArray(mainMdl)[BYTES_TO_PAGES(ofs)]; + MmInitializeMdl(tempMdl, p->lut[j*clobjs].vaddr, clsz); + MmBuildMdlForNonPagedPool(tempMdl); /* compute physical page addresses */ + RtlCopyMemory(pDst, pSrc, mdl_len); /* copy the page descriptors */ + mainMdl->MdlFlags = tempMdl->MdlFlags; /* XXX what is in here ? */ + } + IoFreeMdl(tempMdl); + } + NMA_UNLOCK(nmd); + return mainMdl; +} + +#endif /* _WIN32 */ + +/* + * helper function for OS-specific mmap routines (currently only windows). + * Given an nmd and a pool index, returns the cluster size and number of clusters. + * Returns 0 if memory is finalised and the pool is valid, otherwise 1. + * It should be called under NMA_LOCK(nmd) otherwise the underlying info can change. 
+ */ + +int +netmap_mem2_get_pool_info(struct netmap_mem_d* nmd, u_int pool, u_int *clustsize, u_int *numclusters) +{ + if (!nmd || !clustsize || !numclusters || pool >= NETMAP_POOLS_NR) + return 1; /* invalid arguments */ + // NMA_LOCK_ASSERT(nmd); + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + *clustsize = *numclusters = 0; + return 1; /* not ready yet */ + } + *clustsize = nmd->pools[pool]._clustsize; + *numclusters = nmd->pools[pool].numclusters; + return 0; /* success */ } static int @@ -578,12 +812,6 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) ((n)->pools[NETMAP_IF_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) -#define netmap_buf_offset(n, v) \ - ((n)->pools[NETMAP_IF_POOL].memtotal + \ - (n)->pools[NETMAP_RING_POOL].memtotal + \ - netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) - - static ssize_t netmap_mem2_if_offset(struct netmap_mem_d *nmd, const void *addr) { @@ -602,7 +830,7 @@ static void * netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ - uint32_t mask, j; /* slot counter */ + uint32_t mask, j = 0; /* slot counter */ void *vaddr = NULL; if (len > p->_objsize) { @@ -636,7 +864,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ if (index) *index = i * 32 + j; } - ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr); + ND("%s allocator: allocated object @ [%d][%d]: vaddr %p",p->name, i, j, vaddr); if (start) *start = i; @@ -733,7 +961,7 @@ netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) *head = cur; /* restore */ break; } - RD(5, "allocate buffer %d -> %d", *head, cur); + ND(5, "allocate buffer %d -> %d", *head, cur); *p = cur; /* link to previous head */ } @@ -750,7 +978,7 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; uint32_t i, cur, *buf; - D("freeing the 
extra list"); + ND("freeing the extra list"); for (i = 0; head >=2 && head < p->objtotal; i++) { cur = head; buf = lut[head].vaddr; @@ -761,7 +989,8 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) } if (head != 0) D("breaking with head %d", head); - D("freed %d buffers", i); + if (netmap_verbose) + D("freed %d buffers", i); } @@ -846,7 +1075,6 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) p->bitmap = NULL; if (p->lut) { u_int i; - size_t sz = p->_clustsize; /* * Free each cluster allocated in @@ -856,7 +1084,7 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) */ for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, sz, M_NETMAP); + contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux @@ -973,6 +1201,18 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj return 0; } +static struct lut_entry * +nm_alloc_lut(u_int nobj) +{ + size_t n = sizeof(struct lut_entry) * nobj; + struct lut_entry *lut; +#ifdef linux + lut = vmalloc(n); +#else + lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); +#endif + return lut; +} /* call with NMA_LOCK held */ static int @@ -985,14 +1225,9 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->numclusters = p->_numclusters; p->objtotal = p->_objtotal; - n = sizeof(struct lut_entry) * p->objtotal; -#ifdef linux - p->lut = vmalloc(n); -#else - p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); -#endif + p->lut = nm_alloc_lut(p->objtotal); if (p->lut == NULL) { - D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); + D("Unable to create lookup table for '%s'", p->name); goto clean; } @@ -1015,6 +1250,13 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) int lim = i + p->_clustentries; char *clust; + /* + * XXX Note, we only need contigmalloc() for buffers attached + * to native interfaces. 
In all other cases (nifp, netmap rings + * and even buffers for VALE ports or emulated interfaces) we + * can live with standard malloc, because the hardware will not + * access the pages directly. + */ clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { @@ -1108,10 +1350,15 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) if (na->pdev == NULL) return 0; -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) (void)i; (void)lim; D("unsupported on FreeBSD"); + +#elif defined(_WIN32) + (void)i; + (void)lim; + D("unsupported on Windows"); //XXX_ale, really? #else /* linux */ for (i = 2; i < lim; i++) { netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr); @@ -1124,8 +1371,10 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) static int netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) { -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) D("unsupported on FreeBSD"); +#elif defined(_WIN32) + D("unsupported on Windows"); //XXX_ale, really? 
#else /* linux */ int i, lim = p->_objtotal; @@ -1142,6 +1391,30 @@ netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) } static int +netmap_mem_init_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_shared_info *nms_info; + ssize_t base; + + /* Use the first slot in IF_POOL */ + nms_info = netmap_if_malloc(nmd, sizeof(*nms_info)); + if (nms_info == NULL) { + return ENOMEM; + } + + base = netmap_if_offset(nmd, nms_info); + + memcpy(&nms_info->up, &nms_if_blueprint, sizeof(nms_if_blueprint)); + nms_info->buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + nmd->pools[NETMAP_RING_POOL].memtotal; + nms_info->buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; + nms_info->buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + nms_info->totalsize = nmd->nm_totalsize; + nms_info->features = NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE; + + return 0; +} + +static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { int i; @@ -1160,6 +1433,11 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; + /* expose info to the ptnetmap guest */ + nmd->lasterr = netmap_mem_init_shared_info(nmd); + if (nmd->lasterr) + goto error; + if (netmap_verbose) D("interfaces %d KB, rings %d KB, buffers %d MB", nmd->pools[NETMAP_IF_POOL].memtotal >> 10, @@ -1207,10 +1485,9 @@ static int netmap_mem_private_finalize(struct netmap_mem_d *nmd) { int err; - NMA_LOCK(nmd); - nmd->active++; err = netmap_mem_finalize_all(nmd); - NMA_UNLOCK(nmd); + if (!err) + nmd->active++; return err; } @@ -1218,10 +1495,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd) static void netmap_mem_private_deref(struct netmap_mem_d *nmd) { - NMA_LOCK(nmd); if (--nmd->active <= 0) netmap_mem_reset_all(nmd); - NMA_UNLOCK(nmd); } @@ -1238,7 +1513,7 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), - M_DEVBUF, M_NOWAIT | M_ZERO); + 
M_DEVBUF, M_NOWAIT | M_ZERO); if (d == NULL) { err = ENOMEM; goto error; @@ -1357,10 +1632,10 @@ static int netmap_mem_global_finalize(struct netmap_mem_d *nmd) { int err; - + /* update configuration if changed */ if (netmap_mem_global_config(nmd)) - goto out; + return nmd->lasterr; nmd->active++; @@ -1417,13 +1692,17 @@ netmap_free_rings(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < netmap_real_rings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; struct netmap_ring *ring = kring->ring; - if (ring == NULL) + if (ring == NULL || kring->users > 0 || (kring->nr_kflags & NKR_NEEDRING)) { + ND("skipping ring %s (ring %p, users %d)", + kring->name, ring, kring->users); continue; - netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + } + if (i != nma_get_nrings(na, t) || na->na_flags & NAF_HOST_RINGS) + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); netmap_ring_free(na->nm_mem, ring); kring->ring = NULL; } @@ -1452,9 +1731,10 @@ netmap_mem2_rings_create(struct netmap_adapter *na) struct netmap_ring *ring = kring->ring; u_int len, ndesc; - if (ring) { - ND("%s already created", kring->name); - continue; /* already created by somebody else */ + if (ring || (!kring->users && !(kring->nr_kflags & NKR_NEEDRING))) { + /* uneeded, or already created by somebody else */ + ND("skipping ring %s", kring->name); + continue; } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + @@ -1569,10 +1849,22 @@ netmap_mem2_if_new(struct netmap_adapter *na) */ base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < n[NR_TX]; i++) { + if (na->tx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? 
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < n[NR_RX]; i++) { + if (na->rx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } @@ -1636,3 +1928,531 @@ struct netmap_mem_ops netmap_mem_private_ops = { .nmd_rings_create = netmap_mem2_rings_create, .nmd_rings_delete = netmap_mem2_rings_delete }; + +#ifdef WITH_PTNETMAP_GUEST +struct mem_pt_if { + struct mem_pt_if *next; + struct ifnet *ifp; + unsigned int nifp_offset; + nm_pt_guest_ptctl_t ptctl; +}; + +/* Netmap allocator for ptnetmap guests. */ +struct netmap_mem_ptg { + struct netmap_mem_d up; + + vm_paddr_t nm_paddr; /* physical address in the guest */ + void *nm_addr; /* virtual address in the guest */ + struct netmap_lut buf_lut; /* lookup table for BUF pool in the guest */ + nm_memid_t nm_host_id; /* allocator identifier in the host */ + struct ptnetmap_memdev *ptn_dev; + struct mem_pt_if *pt_ifs; /* list of interfaces in passthrough */ +}; + +/* Link a passthrough interface to a passthrough netmap allocator. 
*/ +static int +netmap_mem_pt_guest_ifp_add(struct netmap_mem_d *nmd, struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *ptif = malloc(sizeof(*ptif), M_NETMAP, + M_NOWAIT | M_ZERO); + + if (!ptif) { + return ENOMEM; + } + + NMA_LOCK(nmd); + + ptif->ifp = ifp; + ptif->nifp_offset = nifp_offset; + ptif->ptctl = ptctl; + + if (ptnmd->pt_ifs) { + ptif->next = ptnmd->pt_ifs; + } + ptnmd->pt_ifs = ptif; + + NMA_UNLOCK(nmd); + + D("added (ifp=%p,nifp_offset=%u)", ptif->ifp, ptif->nifp_offset); + + return 0; +} + +/* Called with NMA_LOCK(nmd) held. */ +static struct mem_pt_if * +netmap_mem_pt_guest_ifp_lookup(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *curr; + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + return curr; + } + } + + return NULL; +} + +/* Unlink a passthrough interface from a passthrough netmap allocator. 
*/ +int +netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *prev = NULL; + struct mem_pt_if *curr; + int ret = -1; + + NMA_LOCK(nmd); + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + if (prev) { + prev->next = curr->next; + } else { + ptnmd->pt_ifs = curr->next; + } + D("removed (ifp=%p,nifp_offset=%u)", + curr->ifp, curr->nifp_offset); + free(curr, M_NETMAP); + ret = 0; + break; + } + prev = curr; + } + + NMA_UNLOCK(nmd); + + return ret; +} + +/* Read allocator info from the first netmap_if (only on finalize) */ +static int +netmap_mem_pt_guest_read_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct netmap_mem_shared_info *nms_info; + uint32_t bufsize; + uint32_t nbuffers; + char *vaddr; + vm_paddr_t paddr; + int i; + + nms_info = (struct netmap_mem_shared_info *)ptnmd->nm_addr; + if (strncmp(nms_info->up.ni_name, NMS_NAME, sizeof(NMS_NAME)) != 0) { + D("error, the first slot does not contain shared info"); + return EINVAL; + } + /* check features mem_shared info */ + if ((nms_info->features & (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) != + (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) { + D("error, the shared info does not contain BUF_POOL and MEMSIZE"); + return EINVAL; + } + + bufsize = nms_info->buf_pool_objsize; + nbuffers = nms_info->buf_pool_objtotal; + + /* allocate the lut */ + if (ptnmd->buf_lut.lut == NULL) { + D("allocating lut"); + ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers); + if (ptnmd->buf_lut.lut == NULL) { + D("lut allocation failed"); + return ENOMEM; + } + } + + /* we have physically contiguous memory mapped through PCI BAR */ + vaddr = (char *)(ptnmd->nm_addr) + nms_info->buf_pool_offset; + paddr = ptnmd->nm_paddr + nms_info->buf_pool_offset; + + for (i = 0; i < nbuffers; i++) { + ptnmd->buf_lut.lut[i].vaddr = vaddr; + ptnmd->buf_lut.lut[i].paddr = 
paddr; + vaddr += bufsize; + paddr += bufsize; + } + + ptnmd->buf_lut.objtotal = nbuffers; + ptnmd->buf_lut.objsize = bufsize; + + nmd->nm_totalsize = nms_info->totalsize; + + return 0; +} + +static int +netmap_mem_pt_guest_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + return EINVAL; + } + + *lut = ptnmd->buf_lut; + return 0; +} + +static int +netmap_mem_pt_guest_get_info(struct netmap_mem_d *nmd, u_int *size, + u_int *memflags, uint16_t *id) +{ + int error = 0; + + NMA_LOCK(nmd); + + error = nmd->ops->nmd_config(nmd); + if (error) + goto out; + + if (size) + *size = nmd->nm_totalsize; + if (memflags) + *memflags = nmd->flags; + if (id) + *id = nmd->nm_id; + +out: + NMA_UNLOCK(nmd); + + return error; +} + +static vm_paddr_t +netmap_mem_pt_guest_ofstophys(struct netmap_mem_d *nmd, vm_ooffset_t off) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + vm_paddr_t paddr; + /* if the offset is valid, just return csb->base_addr + off */ + paddr = (vm_paddr_t)(ptnmd->nm_paddr + off); + ND("off %lx padr %lx", off, (unsigned long)paddr); + return paddr; +} + +static int +netmap_mem_pt_guest_config(struct netmap_mem_d *nmd) +{ + /* nothing to do, we are configured on creation + * and configuration never changes thereafter + */ + return 0; +} + +static int +netmap_mem_pt_guest_finalize(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + int error = 0; + + nmd->active++; + + if (nmd->flags & NETMAP_MEM_FINALIZED) + goto out; + + if (ptnmd->ptn_dev == NULL) { + D("ptnetmap memdev not attached"); + error = ENOMEM; + goto err; + } + /* map memory through ptnetmap-memdev BAR */ + error = nm_os_pt_memdev_iomap(ptnmd->ptn_dev, &ptnmd->nm_paddr, + &ptnmd->nm_addr); + if (error) + goto err; + + /* read allcator info and create lut */ + error = netmap_mem_pt_guest_read_shared_info(nmd); + if (error) + 
goto err; + + nmd->flags |= NETMAP_MEM_FINALIZED; +out: + return 0; +err: + nmd->active--; + return error; +} + +static void +netmap_mem_pt_guest_deref(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + nmd->active--; + if (nmd->active <= 0 && + (nmd->flags & NETMAP_MEM_FINALIZED)) { + nmd->flags &= ~NETMAP_MEM_FINALIZED; + /* unmap ptnetmap-memdev memory */ + if (ptnmd->ptn_dev) { + nm_os_pt_memdev_iounmap(ptnmd->ptn_dev); + } + ptnmd->nm_addr = 0; + ptnmd->nm_paddr = 0; + } +} + +static ssize_t +netmap_mem_pt_guest_if_offset(struct netmap_mem_d *nmd, const void *vaddr) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + return (const char *)(vaddr) - (char *)(ptnmd->nm_addr); +} + +static void +netmap_mem_pt_guest_delete(struct netmap_mem_d *nmd) +{ + if (nmd == NULL) + return; + if (netmap_verbose) + D("deleting %p", nmd); + if (nmd->active > 0) + D("bug: deleting mem allocator with active=%d!", nmd->active); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); + NMA_LOCK_DESTROY(nmd); + free(nmd, M_DEVBUF); +} + +static struct netmap_if * +netmap_mem_pt_guest_if_new(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp = NULL; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + nifp = (struct netmap_if *)((char *)(ptnmd->nm_addr) + + ptif->nifp_offset); + NMA_UNLOCK(na->nm_mem); +out: + return nifp; +} + +static void +netmap_mem_pt_guest_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) +{ + struct mem_pt_if *ptif; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + ptif->ptctl(na->ifp, 
PTNETMAP_PTCTL_IFDELETE); +out: + NMA_UNLOCK(na->nm_mem); +} + +static int +netmap_mem_pt_guest_rings_create(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp; + int i, error = -1; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + + /* point each kring to the corresponding backend ring */ + nifp = (struct netmap_if *)((char *)ptnmd->nm_addr + ptif->nifp_offset); + for (i = 0; i <= na->num_tx_rings; i++) { + struct netmap_kring *kring = na->tx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + nifp->ring_ofs[i]); + } + for (i = 0; i <= na->num_rx_rings; i++) { + struct netmap_kring *kring = na->rx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + + nifp->ring_ofs[i + na->num_tx_rings + 1]); + } + + //error = ptif->ptctl->nm_ptctl(ifp, PTNETMAP_PTCTL_RINGSCREATE); + error = 0; +out: + NMA_UNLOCK(na->nm_mem); + + return error; +} + +static void +netmap_mem_pt_guest_rings_delete(struct netmap_adapter *na) +{ + /* TODO: remove?? 
*/ +#if 0 + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, + na->ifp); +#endif +} + +static struct netmap_mem_ops netmap_mem_pt_guest_ops = { + .nmd_get_lut = netmap_mem_pt_guest_get_lut, + .nmd_get_info = netmap_mem_pt_guest_get_info, + .nmd_ofstophys = netmap_mem_pt_guest_ofstophys, + .nmd_config = netmap_mem_pt_guest_config, + .nmd_finalize = netmap_mem_pt_guest_finalize, + .nmd_deref = netmap_mem_pt_guest_deref, + .nmd_if_offset = netmap_mem_pt_guest_if_offset, + .nmd_delete = netmap_mem_pt_guest_delete, + .nmd_if_new = netmap_mem_pt_guest_if_new, + .nmd_if_delete = netmap_mem_pt_guest_if_delete, + .nmd_rings_create = netmap_mem_pt_guest_rings_create, + .nmd_rings_delete = netmap_mem_pt_guest_rings_delete +}; + +/* Called with NMA_LOCK(&nm_mem) held. */ +static struct netmap_mem_d * +netmap_mem_pt_guest_find_hostid(nm_memid_t host_id) +{ + struct netmap_mem_d *mem = NULL; + struct netmap_mem_d *scan = netmap_last_mem_d; + + do { + /* find ptnetmap allocator through host ID */ + if (scan->ops->nmd_deref == netmap_mem_pt_guest_deref && + ((struct netmap_mem_ptg *)(scan))->nm_host_id == host_id) { + mem = scan; + break; + } + scan = scan->next; + } while (scan != netmap_last_mem_d); + + return mem; +} + +/* Called with NMA_LOCK(&nm_mem) held. 
*/ +static struct netmap_mem_d * +netmap_mem_pt_guest_create(nm_memid_t host_id) +{ + struct netmap_mem_ptg *ptnmd; + int err = 0; + + ptnmd = malloc(sizeof(struct netmap_mem_ptg), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (ptnmd == NULL) { + err = ENOMEM; + goto error; + } + + ptnmd->up.ops = &netmap_mem_pt_guest_ops; + ptnmd->nm_host_id = host_id; + ptnmd->pt_ifs = NULL; + + /* Assign new id in the guest (We have the lock) */ + err = nm_mem_assign_id_locked(&ptnmd->up); + if (err) + goto error; + + ptnmd->up.flags &= ~NETMAP_MEM_FINALIZED; + ptnmd->up.flags |= NETMAP_MEM_IO; + + NMA_LOCK_INIT(&ptnmd->up); + + return &ptnmd->up; +error: + netmap_mem_pt_guest_delete(&ptnmd->up); + return NULL; +} + +/* + * find host id in guest allocators and create guest allocator + * if it is not there + */ +static struct netmap_mem_d * +netmap_mem_pt_guest_get(nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + + NMA_LOCK(&nm_mem); + nmd = netmap_mem_pt_guest_find_hostid(host_id); + if (nmd == NULL) { + nmd = netmap_mem_pt_guest_create(host_id); + } + NMA_UNLOCK(&nm_mem); + + return nmd; +} + +/* + * The guest allocator can be created by ptnetmap_memdev (during the device + * attach) or by ptnetmap device (e1000/virtio), during the netmap_attach. + * + * The order is not important (we have different order in LINUX and FreeBSD). + * The first one, creates the device, and the second one simply attaches it. 
+ */ + +/* Called when ptnetmap_memdev is attaching, to attach a new allocator in + * the guest */ +struct netmap_mem_d * +netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + struct netmap_mem_ptg *ptnmd; + + nmd = netmap_mem_pt_guest_get(host_id); + + /* assign this device to the guest allocator */ + if (nmd) { + ptnmd = (struct netmap_mem_ptg *)nmd; + ptnmd->ptn_dev = ptn_dev; + } + + return nmd; +} + +/* Called when ptnetmap device (virtio/e1000) is attaching */ +struct netmap_mem_d * +netmap_mem_pt_guest_new(struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_d *nmd; + nm_memid_t host_id; + + if (ifp == NULL || ptctl == NULL) { + return NULL; + } + + /* Get the host id allocator. */ + host_id = ptctl(ifp, PTNETMAP_PTCTL_HOSTMEMID); + + nmd = netmap_mem_pt_guest_get(host_id); + + if (nmd) { + netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset, + ptctl); + } + + return nmd; +} + +#endif /* WITH_PTNETMAP_GUEST */ diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index ef0ff96d8e7f..7f4c5e9e9624 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -117,8 +120,11 @@ extern struct netmap_mem_d nm_mem; -void netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); +int netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); +#ifdef _WIN32 +PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd); +#endif int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_init(void); void netmap_mem_fini(void); @@ -127,6 +133,7 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *); +int netmap_mem2_get_pool_info(struct netmap_mem_d *, u_int, u_int *, u_int *); int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, @@ -157,6 +164,15 @@ void netmap_mem_put(struct netmap_mem_d *); #endif /* !NM_DEBUG_PUTGET */ +#ifdef WITH_PTNETMAP_GUEST +struct netmap_mem_d* netmap_mem_pt_guest_new(struct ifnet *, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t); +struct ptnetmap_memdev; +struct netmap_mem_d* netmap_mem_pt_guest_attach(struct ptnetmap_memdev *, uint16_t); +int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *, struct ifnet *); +#endif /* WITH_PTNETMAP_GUEST */ + #define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */ #define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */ diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c index c303952417ff..5b4f9cdf61c0 100644 --- a/sys/dev/netmap/netmap_monitor.c +++ b/sys/dev/netmap/netmap_monitor.c @@ -1,5 +1,6 
@@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,8 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" #else #error Unsupported platform @@ -151,13 +154,17 @@ netmap_monitor_rxsync(struct netmap_kring *kring, int flags) } /* nm_krings_create callbacks for monitors. - * We could use the default netmap_hw_krings_zmon, but - * we don't need the mbq. */ static int netmap_monitor_krings_create(struct netmap_adapter *na) { - return netmap_krings_create(na, 0); + int error = netmap_krings_create(na, 0); + if (error) + return error; + /* override the host rings callbacks */ + na->tx_rings[na->num_tx_rings].nm_sync = netmap_monitor_txsync; + na->rx_rings[na->num_rx_rings].nm_sync = netmap_monitor_rxsync; + return 0; } /* nm_krings_delete callback for monitors */ @@ -186,7 +193,11 @@ nm_monitor_alloc(struct netmap_kring *kring, u_int n) return 0; len = sizeof(struct netmap_kring *) * n; +#ifndef _WIN32 nm = realloc(kring->monitors, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + nm = realloc(kring->monitors, len, sizeof(struct netmap_kring *)*kring->max_monitors); +#endif if (nm == NULL) return ENOMEM; @@ -229,10 +240,10 @@ static int netmap_monitor_parent_notify(struct netmap_kring *, int); static int netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int zcopy) { - int error = 0; + int error = NM_IRQ_COMPLETED; /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); /* make sure the monitor array exists and is big enough */ error = nm_monitor_alloc(kring, kring->n_monitors + 1); if (error) @@ -242,7 +253,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int 
kring->n_monitors++; if (kring->n_monitors == 1) { /* this is the first monitor, intercept callbacks */ - D("%s: intercept callbacks on %s", mkring->name, kring->name); + ND("%s: intercept callbacks on %s", mkring->name, kring->name); kring->mon_sync = kring->nm_sync; /* zcopy monitors do not override nm_notify(), but * we save the original one regardless, so that @@ -265,7 +276,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int } out: - nm_kr_put(kring); + nm_kr_start(kring); return error; } @@ -277,7 +288,7 @@ static void netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) { /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); kring->n_monitors--; if (mkring->mon_pos != kring->n_monitors) { kring->monitors[mkring->mon_pos] = kring->monitors[kring->n_monitors]; @@ -286,18 +297,18 @@ netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) kring->monitors[kring->n_monitors] = NULL; if (kring->n_monitors == 0) { /* this was the last monitor, restore callbacks and delete monitor array */ - D("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); + ND("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); kring->nm_sync = kring->mon_sync; kring->mon_sync = NULL; if (kring->tx == NR_RX) { - D("%s: restoring notify on %s: %p", + ND("%s: restoring notify on %s: %p", mkring->name, kring->name, kring->mon_notify); kring->nm_notify = kring->mon_notify; kring->mon_notify = NULL; } nm_monitor_dealloc(kring); } - nm_kr_put(kring); + nm_kr_start(kring); } @@ -316,7 +327,7 @@ netmap_monitor_stop(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < nma_get_nrings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; u_int j; @@ -360,23 +371,32 @@ netmap_monitor_reg_common(struct netmap_adapter *na, int onoff, int zmon) for (i = 
priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_add(mkring, kring, zmon); + if (nm_kring_pending_on(mkring)) { + netmap_monitor_add(mkring, kring, zmon); + mkring->nr_mode = NKR_NETMAP_ON; + } } } } na->na_flags |= NAF_NETMAP_ON; } else { - if (pna == NULL) { - D("%s: parent left netmap mode, nothing to restore", na->name); - return 0; - } - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; for_rx_tx(t) { if (mna->flags & nm_txrx2flag(t)) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { - kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_del(mkring, kring); + if (nm_kring_pending_off(mkring)) { + mkring->nr_mode = NKR_NETMAP_OFF; + /* we cannot access the parent krings if the parent + * has left netmap mode. This is signaled by a NULL + * pna pointer + */ + if (pna) { + kring = &NMR(pna, t)[i]; + netmap_monitor_del(mkring, kring); + } + } } } } @@ -652,17 +672,27 @@ netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags) static int netmap_monitor_parent_notify(struct netmap_kring *kring, int flags) { + int (*notify)(struct netmap_kring*, int); ND(5, "%s %x", kring->name, flags); /* ?xsync callbacks have tryget called by their callers * (NIOCREGIF and poll()), but here we have to call it * by ourself */ - if (nm_kr_tryget(kring)) - goto out; - netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + if (nm_kr_tryget(kring, 0, NULL)) { + /* in all cases, just skip the sync */ + return NM_IRQ_COMPLETED; + } + if (kring->n_monitors > 0) { + netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + notify = kring->mon_notify; + } else { + /* we are no longer monitoring this ring, so both + * mon_sync and mon_notify are NULL + */ + notify = kring->nm_notify; + } nm_kr_put(kring); -out: - return kring->mon_notify(kring, flags); + return notify(kring, flags); } @@ -691,18 +721,25 @@ netmap_get_monitor_na(struct nmreq *nmr, 
struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_monitor_adapter *mna; + struct ifnet *ifp = NULL; int i, error; enum txrx t; int zcopy = (nmr->nr_flags & NR_ZCOPY_MON); char monsuff[10] = ""; if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) { + if (nmr->nr_flags & NR_ZCOPY_MON) { + /* the flag makes no sense unless you are + * creating a monitor + */ + return EINVAL; + } ND("not a monitor"); return 0; } /* this is a request for a monitor adapter */ - D("flags %x", nmr->nr_flags); + ND("flags %x", nmr->nr_flags); mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); if (mna == NULL) { @@ -716,13 +753,14 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * except other monitors. */ memcpy(&pnmr, nmr, sizeof(pnmr)); - pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX); - error = netmap_get_na(&pnmr, &pna, create); + pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX | NR_ZCOPY_MON); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { D("parent lookup failed: %d", error); + free(mna, M_DEVBUF); return error; } - D("found parent: %s", pna->name); + ND("found parent: %s", pna->name); if (!nm_netmap_on(pna)) { /* parent not in netmap mode */ @@ -829,19 +867,17 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) *na = &mna->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = mna->up.num_tx_rings; - nmr->nr_rx_rings = mna->up.num_rx_rings; - nmr->nr_tx_slots = mna->up.num_tx_desc; - nmr->nr_rx_slots = mna->up.num_rx_desc; - /* keep the reference to the parent */ - D("monitor ok"); + ND("monitor ok"); + + /* drop the reference to the ifp, if any */ + if (ifp) + if_rele(ifp); return 0; put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); free(mna, M_DEVBUF); return error; } diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c index 
dadc1dcbc14c..f8da672ffa53 100644 --- a/sys/dev/netmap/netmap_offloadings.c +++ b/sys/dev/netmap/netmap_offloadings.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2014-2015 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,9 +32,9 @@ #include <sys/types.h> #include <sys/errno.h> #include <sys/param.h> /* defines used in kernel.h */ -#include <sys/malloc.h> /* types used in module initialization */ #include <sys/kernel.h> /* types used in module initialization */ #include <sys/sockio.h> +#include <sys/malloc.h> #include <sys/socketvar.h> /* struct socket */ #include <sys/socket.h> /* sockaddrs */ #include <net/if.h> @@ -64,21 +65,21 @@ /* This routine is called by bdg_mismatch_datapath() when it finishes * accumulating bytes for a segment, in order to fix some fields in the * segment headers (which still contain the same content as the header - * of the original GSO packet). 'buf' points to the beginning (e.g. - * the ethernet header) of the segment, and 'len' is its length. + * of the original GSO packet). 'pkt' points to the beginning of the IP + * header of the segment, while 'len' is the length of the IP packet. */ -static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, - u_int segmented_bytes, u_int last_segment, - u_int tcp, u_int iphlen) +static void +gso_fix_segment(uint8_t *pkt, size_t len, u_int ipv4, u_int iphlen, u_int tcp, + u_int idx, u_int segmented_bytes, u_int last_segment) { - struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); - struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + struct nm_iphdr *iph = (struct nm_iphdr *)(pkt); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(pkt); uint16_t *check = NULL; uint8_t *check_data = NULL; - if (iphlen == 20) { + if (ipv4) { /* Set the IPv4 "Total Length" field. 
*/ - iph->tot_len = htobe16(len-14); + iph->tot_len = htobe16(len); ND("ip total length %u", be16toh(ip->tot_len)); /* Set the IPv4 "Identification" field. */ @@ -87,15 +88,15 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert the IPv4 header checksum. */ iph->check = 0; - iph->check = nm_csum_ipv4(iph); + iph->check = nm_os_csum_ipv4(iph); ND("IP csum %x", be16toh(iph->check)); - } else {/* if (iphlen == 40) */ + } else { /* Set the IPv6 "Payload Len" field. */ - ip6h->payload_len = htobe16(len-14-iphlen); + ip6h->payload_len = htobe16(len-iphlen); } if (tcp) { - struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(pkt + iphlen); /* Set the TCP sequence number. */ tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); @@ -110,10 +111,10 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, check = &tcph->check; check_data = (uint8_t *)tcph; } else { /* UDP */ - struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + struct nm_udphdr *udph = (struct nm_udphdr *)(pkt + iphlen); /* Set the UDP 'Length' field. */ - udph->len = htobe16(len-14-iphlen); + udph->len = htobe16(len-iphlen); check = &udph->check; check_data = (uint8_t *)udph; @@ -121,48 +122,80 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert TCP/UDP checksum. 
*/ *check = 0; - if (iphlen == 20) - nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + if (ipv4) + nm_os_csum_tcpudp_ipv4(iph, check_data, len-iphlen, check); else - nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + nm_os_csum_tcpudp_ipv6(ip6h, check_data, len-iphlen, check); ND("TCP/UDP csum %x", be16toh(*check)); } +static int +vnet_hdr_is_bad(struct nm_vnet_hdr *vh) +{ + uint8_t gso_type = vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + + return ( + (gso_type != VIRTIO_NET_HDR_GSO_NONE && + gso_type != VIRTIO_NET_HDR_GSO_TCPV4 && + gso_type != VIRTIO_NET_HDR_GSO_UDP && + gso_type != VIRTIO_NET_HDR_GSO_TCPV6) + || + (vh->flags & ~(VIRTIO_NET_HDR_F_NEEDS_CSUM + | VIRTIO_NET_HDR_F_DATA_VALID)) + ); +} /* The VALE mismatch datapath implementation. */ -void bdg_mismatch_datapath(struct netmap_vp_adapter *na, - struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, - u_int *j, u_int lim, u_int *howmany) +void +bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, + u_int *j, u_int lim, u_int *howmany) { - struct netmap_slot *slot = NULL; + struct netmap_slot *dst_slot = NULL; struct nm_vnet_hdr *vh = NULL; - /* Number of source slots to process. */ - u_int frags = ft_p->ft_frags; - struct nm_bdg_fwd *ft_end = ft_p + frags; + const struct nm_bdg_fwd *ft_end = ft_p + ft_p->ft_frags; /* Source and destination pointers. */ uint8_t *dst, *src; size_t src_len, dst_len; + /* Indices and counters for the destination ring. */ u_int j_start = *j; + u_int j_cur = j_start; u_int dst_slots = 0; - /* If the source port uses the offloadings, while destination doesn't, - * we grab the source virtio-net header and do the offloadings here. 
- */ - if (na->virt_hdr_len && !dst_na->virt_hdr_len) { - vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + if (unlikely(ft_p == ft_end)) { + RD(3, "No source slots to process"); + return; } /* Init source and dest pointers. */ src = ft_p->ft_buf; src_len = ft_p->ft_len; - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); dst_len = src_len; + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->up.virt_hdr_len && !dst_na->up.virt_hdr_len) { + vh = (struct nm_vnet_hdr *)src; + /* Initial sanity check on the source virtio-net header. If + * something seems wrong, just drop the packet. */ + if (src_len < na->up.virt_hdr_len) { + RD(3, "Short src vnet header, dropping"); + return; + } + if (vnet_hdr_is_bad(vh)) { + RD(3, "Bad src vnet header, dropping"); + return; + } + } + /* We are processing the first input slot and there is a mismatch * between source and destination virt_hdr_len (SHL and DHL). * When the a client is using virtio-net headers, the header length @@ -185,14 +218,14 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, * 12 | 0 | doesn't exist * 12 | 10 | copied from the first 10 bytes of source header */ - bzero(dst, dst_na->virt_hdr_len); - if (na->virt_hdr_len && dst_na->virt_hdr_len) + bzero(dst, dst_na->up.virt_hdr_len); + if (na->up.virt_hdr_len && dst_na->up.virt_hdr_len) memcpy(dst, src, sizeof(struct nm_vnet_hdr)); /* Skip the virtio-net headers. */ - src += na->virt_hdr_len; - src_len -= na->virt_hdr_len; - dst += dst_na->virt_hdr_len; - dst_len = dst_na->virt_hdr_len + src_len; + src += na->up.virt_hdr_len; + src_len -= na->up.virt_hdr_len; + dst += dst_na->up.virt_hdr_len; + dst_len = dst_na->up.virt_hdr_len + src_len; /* Here it could be dst_len == 0 (which implies src_len == 0), * so we avoid passing a zero length fragment. 
@@ -214,16 +247,27 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, u_int gso_idx = 0; /* Payload data bytes segmented so far (e.g. TCP data bytes). */ u_int segmented_bytes = 0; + /* Is this an IPv4 or IPv6 GSO packet? */ + u_int ipv4 = 0; /* Length of the IP header (20 if IPv4, 40 if IPv6). */ u_int iphlen = 0; + /* Length of the Ethernet header (18 if 802.1q, otherwise 14). */ + u_int ethhlen = 14; /* Is this a TCP or an UDP GSO packet? */ u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; /* Segment the GSO packet contained into the input slots (frags). */ - while (ft_p != ft_end) { + for (;;) { size_t copy; + if (dst_slots >= *howmany) { + /* We still have work to do, but we've run out of + * dst slots, so we have to drop the packet. */ + RD(3, "Not enough slots, dropping GSO packet"); + return; + } + /* Grab the GSO header if we don't have it. */ if (!gso_hdr) { uint16_t ethertype; @@ -231,28 +275,75 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, gso_hdr = src; /* Look at the 'Ethertype' field to see if this packet - * is IPv4 or IPv6. - */ - ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); - if (ethertype == 0x0800) - iphlen = 20; - else /* if (ethertype == 0x86DD) */ - iphlen = 40; + * is IPv4 or IPv6, taking into account VLAN + * encapsulation. 
*/ + for (;;) { + if (src_len < ethhlen) { + RD(3, "Short GSO fragment [eth], dropping"); + return; + } + ethertype = be16toh(*((uint16_t *) + (gso_hdr + ethhlen - 2))); + if (ethertype != 0x8100) /* not 802.1q */ + break; + ethhlen += 4; + } + switch (ethertype) { + case 0x0800: /* IPv4 */ + { + struct nm_iphdr *iph = (struct nm_iphdr *) + (gso_hdr + ethhlen); + + if (src_len < ethhlen + 20) { + RD(3, "Short GSO fragment " + "[IPv4], dropping"); + return; + } + ipv4 = 1; + iphlen = 4 * (iph->version_ihl & 0x0F); + break; + } + case 0x86DD: /* IPv6 */ + ipv4 = 0; + iphlen = 40; + break; + default: + RD(3, "Unsupported ethertype, " + "dropping GSO packet"); + return; + } ND(3, "type=%04x", ethertype); + if (src_len < ethhlen + iphlen) { + RD(3, "Short GSO fragment [IP], dropping"); + return; + } + /* Compute gso_hdr_len. For TCP we need to read the * content of the 'Data Offset' field. */ if (tcp) { - struct nm_tcphdr *tcph = - (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + struct nm_tcphdr *tcph = (struct nm_tcphdr *) + (gso_hdr + ethhlen + iphlen); - gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); - } else - gso_hdr_len = 14 + iphlen + 8; /* UDP */ + if (src_len < ethhlen + iphlen + 20) { + RD(3, "Short GSO fragment " + "[TCP], dropping"); + return; + } + gso_hdr_len = ethhlen + iphlen + + 4 * (tcph->doff >> 4); + } else { + gso_hdr_len = ethhlen + iphlen + 8; /* UDP */ + } + + if (src_len < gso_hdr_len) { + RD(3, "Short GSO fragment [TCP/UDP], dropping"); + return; + } ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, - dst_na->mfs); + dst_na->mfs); /* Advance source pointers. */ src += gso_hdr_len; @@ -263,7 +354,6 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, break; src = ft_p->ft_buf; src_len = ft_p->ft_len; - continue; } } @@ -289,25 +379,24 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* After raw segmentation, we must fix some header * fields and compute checksums, in a protocol dependent * way. 
*/ - gso_fix_segment(dst, gso_bytes, gso_idx, - segmented_bytes, - src_len == 0 && ft_p + 1 == ft_end, - tcp, iphlen); + gso_fix_segment(dst + ethhlen, gso_bytes - ethhlen, + ipv4, iphlen, tcp, + gso_idx, segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end); ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); - slot->len = gso_bytes; - slot->flags = 0; - segmented_bytes += gso_bytes - gso_hdr_len; - + dst_slot->len = gso_bytes; + dst_slot->flags = 0; dst_slots++; - - /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + segmented_bytes += gso_bytes - gso_hdr_len; gso_bytes = 0; gso_idx++; + + /* Next destination slot. */ + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); } /* Next input slot. */ @@ -342,10 +431,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* Init/update the packet checksum if needed. */ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { if (!dst_slots) - csum = nm_csum_raw(src + vh->csum_start, + csum = nm_os_csum_raw(src + vh->csum_start, src_len - vh->csum_start, 0); else - csum = nm_csum_raw(src, src_len, csum); + csum = nm_os_csum_raw(src, src_len, csum); } /* Round to a multiple of 64 */ @@ -359,44 +448,43 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, } else { memcpy(dst, src, (int)src_len); } - slot->len = dst_len; - + dst_slot->len = dst_len; dst_slots++; /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); /* Next source slot. */ ft_p++; src = ft_p->ft_buf; dst_len = src_len = ft_p->ft_len; - } /* Finalize (fold) the checksum if needed. 
*/ if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { - *check = nm_csum_fold(csum); + *check = nm_os_csum_fold(csum); } ND(3, "using %u dst_slots", dst_slots); - /* A second pass on the desitations slots to set the slot flags, + /* A second pass on the destination slots to set the slot flags, * using the right number of destination slots. */ - while (j_start != *j) { - slot = &ring->slot[j_start]; - slot->flags = (dst_slots << 8)| NS_MOREFRAG; + while (j_start != j_cur) { + dst_slot = &dst_ring->slot[j_start]; + dst_slot->flags = (dst_slots << 8)| NS_MOREFRAG; j_start = nm_next(j_start, lim); } /* Clear NS_MOREFRAG flag on last entry. */ - slot->flags = (dst_slots << 8); + dst_slot->flags = (dst_slots << 8); } - /* Update howmany. */ + /* Update howmany and j. This is to commit the use of + * those slots in the destination ring. */ if (unlikely(dst_slots > *howmany)) { - dst_slots = *howmany; - D("Slot allocation error: Should never happen"); + D("Slot allocation error: This is a bug"); } + *j = j_cur; *howmany -= dst_slots; } diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c index 67e840248c88..f0f1b524300a 100644 --- a/sys/dev/netmap/netmap_pipe.c +++ b/sys/dev/netmap/netmap_pipe.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -54,6 +55,9 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -72,9 +76,11 @@ #define NM_PIPE_MAXSLOTS 4096 -int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +static int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +SYSBEGIN(vars_pipes); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); +SYSEND; /* allocate the pipe array in the parent adapter */ static int @@ -91,7 +97,11 @@ nm_pipe_alloc(struct netmap_adapter *na, u_int npipes) return EINVAL; len = sizeof(struct netmap_pipe_adapter *) * npipes; +#ifndef _WIN32 npa = realloc(na->na_pipes, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + npa = realloc(na->na_pipes, len, sizeof(struct netmap_pipe_adapter *)*na->na_max_pipes); +#endif if (npa == NULL) return ENOMEM; @@ -199,7 +209,7 @@ netmap_pipe_txsync(struct netmap_kring *txkring, int flags) } while (limit-- > 0) { - struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *rs = &rxkring->ring->slot[j]; struct netmap_slot *ts = &txkring->ring->slot[k]; struct netmap_slot tmp; @@ -295,7 +305,7 @@ netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags) * usr1 --> e1 --> e2 * * and we are e2. e1 is certainly registered and our - * krings already exist, but they may be hidden. + * krings already exist. Nothing to do. */ static int netmap_pipe_krings_create(struct netmap_adapter *na) @@ -310,65 +320,28 @@ netmap_pipe_krings_create(struct netmap_adapter *na) int i; /* case 1) above */ - ND("%p: case 1, create everything", na); + D("%p: case 1, create both ends", na); error = netmap_krings_create(na, 0); if (error) goto err; - /* we also create all the rings, since we need to - * update the save_ring pointers. 
- * netmap_mem_rings_create (called by our caller) - * will not create the rings again - */ - - error = netmap_mem_rings_create(na); - if (error) - goto del_krings1; - - /* update our hidden ring pointers */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].save_ring = NMR(na, t)[i].ring; - } - - /* now, create krings and rings of the other end */ + /* create the krings of the other end */ error = netmap_krings_create(ona, 0); if (error) - goto del_rings1; - - error = netmap_mem_rings_create(ona); - if (error) - goto del_krings2; - - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].save_ring = NMR(ona, t)[i].ring; - } + goto del_krings1; /* cross link the krings */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ for (i = 0; i < nma_get_nrings(na, t); i++) { NMR(na, t)[i].pipe = NMR(&pna->peer->up, r) + i; NMR(&pna->peer->up, r)[i].pipe = NMR(na, t) + i; } } - } else { - int i; - /* case 2) above */ - /* recover the hidden rings */ - ND("%p: case 2, hidden rings", na); - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].ring = NMR(na, t)[i].save_ring; - } + } return 0; -del_krings2: - netmap_krings_delete(ona); -del_rings1: - netmap_mem_rings_delete(na); del_krings1: netmap_krings_delete(na); err: @@ -383,7 +356,8 @@ err: * * usr1 --> e1 --> e2 * - * and we are e1. Nothing special to do. + * and we are e1. Create the needed rings of the + * other end. 
* * 1.b) state is * @@ -412,14 +386,65 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) { struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int i, error = 0; enum txrx t; ND("%p: onoff %d", na, onoff); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + /* mark the partner ring as needed */ + kring->pipe->nr_kflags |= NKR_NEEDRING; + } + } + } + + /* create all missing needed rings on the other end */ + error = netmap_mem_rings_create(ona); + if (error) + return error; + + /* In case of no error we put our rings in netmap mode */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + kring->nr_mode = NKR_NETMAP_ON; + } + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + /* mark the peer ring as no longer needed by us + * (it may still be kept if sombody else is using it) + */ + kring->pipe->nr_kflags &= ~NKR_NEEDRING; + } + } + } + /* delete all the peer rings that are no longer needed */ + netmap_mem_rings_delete(ona); + } + + if (na->active_fds) { + D("active_fds %d", na->active_fds); + return 0; } + if (pna->peer_ref) { ND("%p: case 1.a or 2.a, nothing to do", na); return 0; @@ -429,18 +454,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) pna->peer->peer_ref = 0; netmap_adapter_put(na); } else { - int i; ND("%p: case 2.b, grab peer", na); netmap_adapter_get(na); pna->peer->peer_ref = 1; - /* hide our rings from 
netmap_mem_rings_delete */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { - NMR(na, t)[i].ring = NULL; - } - } } - return 0; + return error; } /* netmap_pipe_krings_delete. @@ -470,8 +488,6 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; struct netmap_adapter *ona; /* na of the other end */ - int i; - enum txrx t; if (!pna->peer_ref) { ND("%p: case 2, kept alive by peer", na); @@ -480,18 +496,12 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) /* case 1) above */ ND("%p: case 1, deleting everyhing", na); netmap_krings_delete(na); /* also zeroes tx_rings etc. */ - /* restore the ring to be deleted on the peer */ ona = &pna->peer->up; if (ona->tx_rings == NULL) { /* already deleted, we must be on an * cleanup-after-error path */ return; } - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].ring = NMR(ona, t)[i].save_ring; - } - netmap_mem_rings_delete(ona); netmap_krings_delete(ona); } @@ -519,6 +529,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp = NULL; u_int pipe_id; int role = nmr->nr_flags & NR_REG_MASK; int error; @@ -536,7 +547,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); /* pass to parent the requested number of pipes */ pnmr.nr_arg1 = nmr->nr_arg1; - error = netmap_get_na(&pnmr, &pna, create); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { ND("parent lookup failed: %d", error); return error; @@ -652,16 +663,15 @@ found: *na = &req->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = req->up.num_tx_rings; - nmr->nr_rx_rings = req->up.num_rx_rings; - nmr->nr_tx_slots = req->up.num_tx_desc; - nmr->nr_rx_slots = 
req->up.num_rx_desc; - /* keep the reference to the parent. * It will be released by the req destructor */ + /* drop the ifp reference, if any */ + if (ifp) { + if_rele(ifp); + } + return 0; free_sna: @@ -671,7 +681,7 @@ unregister_mna: free_mna: free(mna, M_DEVBUF); put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); return error; } diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index ddd7334a8378..2d2c807681d2 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,9 @@ __FBSDID("$FreeBSD$"); #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -119,7 +123,7 @@ __FBSDID("$FreeBSD$"); /* * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_NAME prefix for switch port names, default "vale" * NM_BDG_MAXPORTS number of ports * NM_BRIDGES max number of switches in the system. * XXX should become a sysctl or tunable @@ -144,7 +148,6 @@ __FBSDID("$FreeBSD$"); #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) /* NM_FT_NULL terminates a list of slots in the ft */ #define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ /* @@ -152,14 +155,15 @@ __FBSDID("$FreeBSD$"); * used in the bridge. The actual value may be larger as the * last packet in the block may overflow the size. 
*/ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSBEGIN(vars_vale); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); - +SYSEND; static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **); static int netmap_vp_reg(struct netmap_adapter *na, int onoff); -static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +static int netmap_bwrap_reg(struct netmap_adapter *, int onoff); /* * For each output interface, nm_bdg_q is used to construct a list. @@ -213,7 +217,7 @@ struct nm_bridge { * forward this packet. ring_nr is the source ring index, and the * function may overwrite this value to forward this packet to a * different ring index. - * This function must be set by netmap_bdgctl(). + * This function must be set by netmap_bdg_ctl(). */ struct netmap_bdg_ops bdg_ops; @@ -244,7 +248,7 @@ netmap_bdg_name(struct netmap_vp_adapter *vp) * Right now we have a static array and deletions are protected * by an exclusive lock. */ -struct nm_bridge *nm_bridges; +static struct nm_bridge *nm_bridges; #endif /* !CONFIG_NET_NS */ @@ -278,6 +282,45 @@ pkt_copy(void *_src, void *_dst, int l) } +static int +nm_is_id_char(const char c) +{ + return (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + (c == '_'); +} + +/* Validate the name of a VALE bridge port and return the + * position of the ":" character. */ +static int +nm_vale_name_validate(const char *name) +{ + int colon_pos = -1; + int i; + + if (!name || strlen(name) < strlen(NM_BDG_NAME)) { + return -1; + } + + for (i = 0; name[i]; i++) { + if (name[i] == ':') { + if (colon_pos != -1) { + return -1; + } + colon_pos = i; + } else if (!nm_is_id_char(name[i])) { + return -1; + } + } + + if (i >= IFNAMSIZ) { + return -1; + } + + return colon_pos; +} + /* * locate a bridge among the existing ones. 
* MUST BE CALLED WITH NMG_LOCK() @@ -288,7 +331,7 @@ pkt_copy(void *_src, void *_dst, int l) static struct nm_bridge * nm_find_bridge(const char *name, int create) { - int i, l, namelen; + int i, namelen; struct nm_bridge *b = NULL, *bridges; u_int num_bridges; @@ -296,21 +339,11 @@ nm_find_bridge(const char *name, int create) netmap_bns_getbridges(&bridges, &num_bridges); - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { + namelen = nm_vale_name_validate(name); + if (namelen < 0) { D("invalid bridge name %s", name ? name : NULL); return NULL; } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; - } - } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); /* lookup the name, remember empty slot if there is one */ for (i = 0; i < num_bridges; i++) { @@ -479,6 +512,7 @@ netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; struct nm_bridge *b = vpna->na_bdg; + (void)nmr; // XXX merge ? 
if (attach) return 0; /* nothing to do */ if (b) { @@ -518,7 +552,7 @@ nm_vi_destroy(const char *name) return ENXIO; NMG_LOCK(); /* make sure this is actually a VALE port */ - if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { + if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { error = EINVAL; goto err; } @@ -535,7 +569,7 @@ nm_vi_destroy(const char *name) */ if_rele(ifp); netmap_detach(ifp); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return 0; err: @@ -556,14 +590,14 @@ nm_vi_create(struct nmreq *nmr) int error; /* don't include VALE prefix */ - if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME))) + if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) return EINVAL; ifp = ifunit_ref(nmr->nr_name); if (ifp) { /* already exist, cannot create new one */ if_rele(ifp); return EEXIST; } - error = nm_vi_persist(nmr->nr_name, &ifp); + error = nm_os_vi_persist(nmr->nr_name, &ifp); if (error) return error; @@ -572,12 +606,13 @@ nm_vi_create(struct nmreq *nmr) error = netmap_vp_create(nmr, ifp, &vpna); if (error) { D("error %d", error); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return error; } /* persist-specific routines */ vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; netmap_adapter_get(&vpna->up); + NM_ATTACH_NA(ifp, &vpna->up); NMG_UNLOCK(); D("created %s", ifp->if_xname); return 0; @@ -608,7 +643,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) /* first try to see if this is a bridge port. 
*/ NMG_LOCK_ASSERT(); - if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) { + if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) { return 0; /* no error, but no VALE prefix */ } @@ -693,7 +728,6 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) goto out; vpna = hw->na_vp; hostna = hw->na_hostvp; - if_rele(ifp); if (nmr->nr_arg1 != NETMAP_BDG_HOST) hostna = NULL; } @@ -768,6 +802,11 @@ unlock_exit: return error; } +static inline int +nm_is_bwrap(struct netmap_adapter *na) +{ + return na->nm_register == netmap_bwrap_reg; +} /* process NETMAP_BDG_DETACH */ static int @@ -785,8 +824,13 @@ nm_bdg_ctl_detach(struct nmreq *nmr) if (na == NULL) { /* VALE prefix missing */ error = EINVAL; goto unlock_exit; + } else if (nm_is_bwrap(na) && + ((struct netmap_bwrap_adapter *)na)->na_polling_state) { + /* Don't detach a NIC with polling */ + error = EBUSY; + netmap_adapter_put(na); + goto unlock_exit; } - if (na->nm_bdg_ctl) { /* remove the port from bridge. The bwrap * also needs to put the hwna in normal mode @@ -801,6 +845,267 @@ unlock_exit: } +struct nm_bdg_polling_state; +struct +nm_bdg_kthread { + struct nm_kthread *nmk; + u_int qfirst; + u_int qlast; + struct nm_bdg_polling_state *bps; +}; + +struct nm_bdg_polling_state { + bool configured; + bool stopped; + struct netmap_bwrap_adapter *bna; + u_int reg; + u_int qfirst; + u_int qlast; + u_int cpu_from; + u_int ncpus; + struct nm_bdg_kthread *kthreads; +}; + +static void +netmap_bwrap_polling(void *data) +{ + struct nm_bdg_kthread *nbk = data; + struct netmap_bwrap_adapter *bna; + u_int qfirst, qlast, i; + struct netmap_kring *kring0, *kring; + + if (!nbk) + return; + qfirst = nbk->qfirst; + qlast = nbk->qlast; + bna = nbk->bps->bna; + kring0 = NMR(bna->hwna, NR_RX); + + for (i = qfirst; i < qlast; i++) { + kring = kring0 + i; + kring->nm_notify(kring, 0); + } +} + +static int +nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps) +{ + struct nm_kthread_cfg kcfg; + int i, j; + + 
bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus, + M_DEVBUF, M_NOWAIT | M_ZERO); + if (bps->kthreads == NULL) + return ENOMEM; + + bzero(&kcfg, sizeof(kcfg)); + kcfg.worker_fn = netmap_bwrap_polling; + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC); + int affinity = bps->cpu_from + i; + + t->bps = bps; + t->qfirst = all ? bps->qfirst /* must be 0 */: affinity; + t->qlast = all ? bps->qlast : t->qfirst + 1; + D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst, + t->qlast); + + kcfg.type = i; + kcfg.worker_private = t; + t->nmk = nm_os_kthread_create(&kcfg); + if (t->nmk == NULL) { + goto cleanup; + } + nm_os_kthread_set_affinity(t->nmk, affinity); + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_delete(t->nmk); + } + free(bps->kthreads, M_DEVBUF); + return EFAULT; +} + +/* a version of ptnetmap_start_kthreads() */ +static int +nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps) +{ + int error, i, j; + + if (!bps) { + D("polling is not configured"); + return EFAULT; + } + bps->stopped = false; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + error = nm_os_kthread_start(t->nmk); + if (error) { + D("error in nm_kthread_start()"); + goto cleanup; + } + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + } + bps->stopped = true; + return error; +} + +static void +nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps) +{ + int i; + + if (!bps) + return; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + nm_os_kthread_delete(t->nmk); + } + bps->stopped = true; +} + +static int +get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na, + struct nm_bdg_polling_state *bps) 
+{ + int req_cpus, avail_cpus, core_from; + u_int reg, i, qfirst, qlast; + + avail_cpus = nm_os_ncpus(); + req_cpus = nmr->nr_arg1; + + if (req_cpus == 0) { + D("req_cpus must be > 0"); + return EINVAL; + } else if (req_cpus >= avail_cpus) { + D("for safety, we need at least one core left in the system"); + return EINVAL; + } + reg = nmr->nr_flags & NR_REG_MASK; + i = nmr->nr_ringid & NETMAP_RING_MASK; + /* + * ONE_NIC: dedicate one core to one ring. If multiple cores + * are specified, consecutive rings are also polled. + * For example, if ringid=2 and 2 cores are given, + * ring 2 and 3 are polled by core 2 and 3, respectively. + * ALL_NIC: poll all the rings using a core specified by ringid. + * the number of cores must be 1. + */ + if (reg == NR_REG_ONE_NIC) { + if (i + req_cpus > nma_get_nrings(na, NR_RX)) { + D("only %d rings exist (ring %u-%u is given)", + nma_get_nrings(na, NR_RX), i, i+req_cpus); + return EINVAL; + } + qfirst = i; + qlast = qfirst + req_cpus; + core_from = qfirst; + } else if (reg == NR_REG_ALL_NIC) { + if (req_cpus != 1) { + D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus); + return EINVAL; + } + qfirst = 0; + qlast = nma_get_nrings(na, NR_RX); + core_from = i; + } else { + D("reg must be ALL_NIC or ONE_NIC"); + return EINVAL; + } + + bps->reg = reg; + bps->qfirst = qfirst; + bps->qlast = qlast; + bps->cpu_from = core_from; + bps->ncpus = req_cpus; + D("%s qfirst %u qlast %u cpu_from %u ncpus %u", + reg == NR_REG_ALL_NIC ? 
"REG_ALL_NIC" : "REG_ONE_NIC", + qfirst, qlast, core_from, req_cpus); + return 0; +} + +static int +nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct nm_bdg_polling_state *bps; + struct netmap_bwrap_adapter *bna; + int error; + + bna = (struct netmap_bwrap_adapter *)na; + if (bna->na_polling_state) { + D("ERROR adapter already in polling mode"); + return EFAULT; + } + + bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!bps) + return ENOMEM; + bps->configured = false; + bps->stopped = true; + + if (get_polling_cfg(nmr, na, bps)) { + free(bps, M_DEVBUF); + return EINVAL; + } + + if (nm_bdg_create_kthreads(bps)) { + free(bps, M_DEVBUF); + return EFAULT; + } + + bps->configured = true; + bna->na_polling_state = bps; + bps->bna = bna; + + /* disable interrupt if possible */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 0); + /* start kthread now */ + error = nm_bdg_polling_start_kthreads(bps); + if (error) { + D("ERROR nm_bdg_polling_start_kthread()"); + free(bps->kthreads, M_DEVBUF); + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + } + return error; +} + +static int +nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; + struct nm_bdg_polling_state *bps; + + if (!bna->na_polling_state) { + D("ERROR adapter is not in polling mode"); + return EFAULT; + } + bps = bna->na_polling_state; + nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); + bps->configured = false; + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + /* reenable interrupt */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + return 0; +} /* Called by either user's context (netmap_ioctl()) * or external kernel modules (e.g., Openvswitch). 
@@ -843,7 +1148,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) case NETMAP_BDG_LIST: /* this is used to enumerate bridges and ports */ if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) { error = EINVAL; break; } @@ -855,7 +1160,9 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) break; } - error = ENOENT; + error = 0; + nmr->nr_arg1 = b - bridges; /* bridge index */ + nmr->nr_arg2 = NM_BDG_NOPORT; for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; @@ -867,10 +1174,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) * virtual port and a NIC, respectively */ if (!strcmp(vpna->up.name, name)) { - /* bridge index */ - nmr->nr_arg1 = b - bridges; nmr->nr_arg2 = i; /* port index */ - error = 0; break; } } @@ -937,10 +1241,34 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; - vpna->virt_hdr_len = nmr->nr_arg1; - if (vpna->virt_hdr_len) + na->virt_hdr_len = nmr->nr_arg1; + if (na->virt_hdr_len) { vpna->mfs = NETMAP_BUF_SIZE(na); - D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + } + D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na); + netmap_adapter_put(na); + } else if (!na) { + error = ENXIO; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_POLLING_ON: + case NETMAP_BDG_POLLING_OFF: + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + if (!nm_is_bwrap(na)) { + error = EOPNOTSUPP; + } else if (cmd == NETMAP_BDG_POLLING_ON) { + error = nm_bdg_ctl_polling_start(nmr, na); + if (!error) + netmap_adapter_get(na); + } else { + error = nm_bdg_ctl_polling_stop(nmr, na); + if (!error) + netmap_adapter_put(na); + } netmap_adapter_put(na); } NMG_UNLOCK(); @@ -1097,10 +1425,12 @@ nm_bdg_preflush(struct 
netmap_kring *kring, u_int end) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); } if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; + /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we + * have to fix frags count. */ + frags--; + ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags; + D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags); } if (ft_i) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); @@ -1157,6 +1487,8 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + enum txrx t; + int i; /* persistent ports may be put in netmap mode * before being attached to a bridge @@ -1164,12 +1496,30 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) if (vpna->na_bdg) BDG_WLOCK(vpna->na_bdg); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) + kring->nr_mode = NKR_NETMAP_ON; + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; /* XXX on FreeBSD, persistent VALE ports should also * toggle IFCAP_NETMAP in na->ifp (2014-03-16) */ } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) + kring->nr_mode = NKR_NETMAP_OFF; + } + } } if (vpna->na_bdg) BDG_WUNLOCK(vpna->na_bdg); @@ -1193,13 +1543,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, uint32_t sh, dh; u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; + uint8_t indbuf[12]; /* safety check, unfortunately we have many cases */ - if (buf_len >= 14 + na->virt_hdr_len) { + if (buf_len 
>= 14 + na->up.virt_hdr_len) { /* virthdr + mac_hdr in the same slot */ - buf += na->virt_hdr_len; - buf_len -= na->virt_hdr_len; - } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { + buf += na->up.virt_hdr_len; + buf_len -= na->up.virt_hdr_len; + } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { /* only header in first fragment */ ft++; buf = ft->ft_buf; @@ -1208,6 +1559,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, RD(5, "invalid buf format, length %d", buf_len); return NM_BDG_NOPORT; } + + if (ft->ft_flags & NS_INDIRECT) { + if (copyin(buf, indbuf, sizeof(indbuf))) { + return NM_BDG_NOPORT; + } + buf = indbuf; + } + dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; smac = le64toh(*(uint64_t *)(buf + 4)); smac >>= 16; @@ -1321,7 +1680,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, struct nm_bdg_q *dst_ents, *brddst; uint16_t num_dsts = 0, *dsts; struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; + u_int i, me = na->bdg_port; /* * The work area (pointed by ft) is followed by an array of @@ -1341,7 +1700,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, ND("slot %d frags %d", i, ft[i].ft_frags); /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ - if (unlikely(na->virt_hdr_len > ft[i].ft_len)) + if (unlikely(na->up.virt_hdr_len > ft[i].ft_len)) continue; dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); if (netmap_verbose > 255) @@ -1382,6 +1741,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; if (brddst->bq_head != NM_FT_NULL) { + u_int j; for (j = 0; likely(j < b->bdg_active_ports); j++) { uint16_t d_i; i = b->bdg_port_index[j]; @@ -1441,8 +1801,9 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ needed = d->bq_len + brddst->bq_len; - if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { - RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len); + if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { + RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, + dst_na->up.virt_hdr_len); /* There is a virtio-net header/offloadings mismatch between * source and destination. The slower mismatch datapath will * be used to cope with all the mismatches. @@ -1803,7 +2164,6 @@ netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter nm_bound_var(&nmr->nr_arg3, 0, 0, 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; - vpna->virt_hdr_len = 0; vpna->mfs = 1514; vpna->last_smac = ~0llu; /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? @@ -1880,19 +2240,17 @@ netmap_bwrap_dtor(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } ND("na %p", na); - /* drop reference to hwna->ifp. 
- * If we don't do this, netmap_detach_common(na) - * will think it has set NA(na->ifp) to NULL - */ na->ifp = NULL; - /* for safety, also drop the possible reference - * in the hostna - */ bna->host.up.ifp = NULL; - - hwna->nm_mem = bna->save_nmd; hwna->na_private = NULL; hwna->na_vp = hwna->na_hostvp = NULL; hwna->na_flags &= ~NAF_BUSY; @@ -1916,7 +2274,8 @@ netmap_bwrap_dtor(struct netmap_adapter *na) * (part as a receive ring, part as a transmit ring). * * callback that overwrites the hwna notify callback. - * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * Packets come from the outside or from the host stack and are put on an + * hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ static int @@ -1927,19 +2286,18 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) struct netmap_kring *bkring; struct netmap_vp_adapter *vpna = &bna->up; u_int ring_nr = kring->ring_id; - int error = 0; + int ret = NM_IRQ_COMPLETED; + int error; if (netmap_verbose) D("%s %s 0x%x", na->name, kring->name, flags); - if (!nm_netmap_on(na)) - return 0; - bkring = &vpna->up.tx_rings[ring_nr]; /* make sure the ring is not disabled */ - if (nm_kr_tryget(kring)) - return 0; + if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { + return EIO; + } if (netmap_verbose) D("%s head %d cur %d tail %d", na->name, @@ -1951,9 +2309,10 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) error = kring->nm_sync(kring, 0); if (error) goto put_out; - if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - na->name); + if (kring->nr_hwcur == kring->nr_hwtail) { + if (netmap_verbose) + D("how strange, interrupt with no packets on %s", + na->name); goto put_out; } @@ -1970,28 +2329,32 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) /* another call to actually release the buffers */ error = kring->nm_sync(kring, 0); + /* The second 
rxsync may have further advanced hwtail. If this happens, + * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ + if (kring->rcur != kring->nr_hwtail) { + ret = NM_IRQ_RESCHED; + } put_out: nm_kr_put(kring); - return error; + + return error ? error : ret; } /* nm_register callback for bwrap */ static int -netmap_bwrap_register(struct netmap_adapter *na, int onoff) +netmap_bwrap_reg(struct netmap_adapter *na, int onoff) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; struct netmap_vp_adapter *hostna = &bna->host; - int error; + int error, i; enum txrx t; ND("%s %s", na->name, onoff ? "on" : "off"); if (onoff) { - int i; - /* netmap_do_regif has been called on the bwrap na. * We need to pass the information about the * memory allocator down to the hwna before @@ -2010,16 +2373,32 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* cross-link the netmap rings * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. - * We need to do this now, after the initialization - * of the kring->ring pointers */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ - for (i = 0; i < nma_get_nrings(na, r) + 1; i++) { - NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots; - NMR(hwna, t)[i].ring = NMR(na, r)[i].ring; + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(hwna, r)[i].ring = NMR(na, t)[i].ring; } } + + if (na->na_flags & NAF_HOST_RINGS) { + struct netmap_adapter *hna = &hostna->up; + /* the hostna rings are the host rings of the bwrap. 
+ * The corresponding krings must point back to the + * hostna + */ + hna->tx_rings = &na->tx_rings[na->num_tx_rings]; + hna->tx_rings[0].na = hna; + hna->rx_rings = &na->rx_rings[na->num_rx_rings]; + hna->rx_rings[0].na = hna; + } + } + + /* pass down the pending ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(hwna, t)[i].nr_pending_mode = + NMR(na, t)[i].nr_pending_mode; } /* forward the request to the hwna */ @@ -2027,6 +2406,13 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) if (error) return error; + /* copy up the current ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(na, t)[i].nr_mode = + NMR(hwna, t)[i].nr_mode; + } + /* impersonate a netmap_vp_adapter */ netmap_vp_reg(na, onoff); if (hostna->na_bdg) @@ -2046,8 +2432,14 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* also intercept the host ring notify */ hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { u_int i; + + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + /* reset all notify callbacks (including host ring) */ for (i = 0; i <= hwna->num_rx_rings; i++) { hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; @@ -2089,8 +2481,8 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; - struct netmap_adapter *hostna = &bna->host.up; - int error; + int i, error = 0; + enum txrx t; ND("%s", na->name); @@ -2102,26 +2494,23 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { - netmap_vp_krings_delete(na); - return error; + goto err_del_vp_rings; } - /* the connection between the bwrap krings and the hwna krings - * will be perfomed later, in the nm_register callback, since - * now 
the kring->ring pointers have not been initialized yet - */ - if (na->na_flags & NAF_HOST_RINGS) { - /* the hostna rings are the host rings of the bwrap. - * The corresponding krings must point back to the - * hostna - */ - hostna->tx_rings = &na->tx_rings[na->num_tx_rings]; - hostna->tx_rings[0].na = hostna; - hostna->rx_rings = &na->rx_rings[na->num_rx_rings]; - hostna->rx_rings[0].na = hostna; + /* get each ring slot number from the corresponding hwna ring */ + for_rx_tx(t) { + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; + } } return 0; + +err_del_vp_rings: + netmap_vp_krings_delete(na); + + return error; } @@ -2149,7 +2538,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) u_int ring_n = kring->ring_id; u_int lim = kring->nkr_num_slots - 1; struct netmap_kring *hw_kring; - int error = 0; + int error; ND("%s: na %s hwna %s", (kring ? kring->name : "NULL!"), @@ -2157,11 +2546,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) (hwna ? 
hwna->name : "NULL!")); hw_kring = &hwna->tx_rings[ring_n]; - if (nm_kr_tryget(hw_kring)) - return 0; + if (nm_kr_tryget(hw_kring, 0, NULL)) { + return ENXIO; + } - if (!nm_netmap_on(hwna)) - return 0; /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(kring, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", @@ -2175,7 +2563,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; error = hw_kring->nm_sync(hw_kring, flags); if (error) - goto out; + goto put_out; /* third step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ @@ -2188,9 +2576,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); -out: +put_out: nm_kr_put(hw_kring); - return error; + + return error ? error : NM_IRQ_COMPLETED; } @@ -2217,44 +2606,23 @@ netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) /* nothing to do */ return 0; } - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + npriv = netmap_priv_new(); if (npriv == NULL) return ENOMEM; - error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags); + npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ + error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW); if (error) { - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); + netmap_priv_delete(npriv); return error; } bna->na_kpriv = npriv; na->na_flags |= NAF_BUSY; } else { - int last_instance; - if (na->active_fds == 0) /* not registered */ return EINVAL; - last_instance = netmap_dtor_locked(bna->na_kpriv); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct nm_bridge *b = bna->up.na_bdg, - *bh = bna->host.na_bdg; - npriv = bna->na_kpriv; - bna->na_kpriv = 
NULL; - D("deleting priv"); - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - if (b) { - /* XXX the bwrap dtor should take care - * of this (2014-06-16) - */ - netmap_bdg_detach_common(b, bna->up.bdg_port, - (bh ? bna->host.bdg_port : -1)); - } - na->na_flags &= ~NAF_BUSY; - } + netmap_priv_delete(bna->na_kpriv); + bna->na_kpriv = NULL; + na->na_flags &= ~NAF_BUSY; } return error; @@ -2282,6 +2650,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) } na = &bna->up.up; + /* make bwrap ifp point to the real ifp */ + na->ifp = hwna->ifp; na->na_private = bna; strncpy(na->name, nr_name, sizeof(na->name)); /* fill the ring data for the bwrap adapter with rx/tx meanings @@ -2294,7 +2664,7 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); } na->nm_dtor = netmap_bwrap_dtor; - na->nm_register = netmap_bwrap_register; + na->nm_register = netmap_bwrap_reg; // na->nm_txsync = netmap_bwrap_txsync; // na->nm_rxsync = netmap_bwrap_rxsync; na->nm_config = netmap_bwrap_config; @@ -2303,13 +2673,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) na->nm_notify = netmap_bwrap_notify; na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; na->pdev = hwna->pdev; - na->nm_mem = netmap_mem_private_new(na->name, - na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc, - 0, 0, &error); - na->na_flags |= NAF_MEM_OWNER; - if (na->nm_mem == NULL) - goto err_put; + na->nm_mem = hwna->nm_mem; + na->virt_hdr_len = hwna->virt_hdr_len; bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ bna->hwna = hwna; @@ -2349,24 +2714,10 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) if (error) { goto err_free; } - /* make bwrap ifp point to the real ifp - * NOTE: netmap_attach_common() interprets a non-NULL na->ifp - * as a request to make the ifp point to the na. 
Since we - * do not want to change the na already pointed to by hwna->ifp, - * the following assignment has to be delayed until now - */ - na->ifp = hwna->ifp; hwna->na_flags |= NAF_BUSY; - /* make hwna point to the allocator we are actually using, - * so that monitors will be able to find it - */ - bna->save_nmd = hwna->nm_mem; - hwna->nm_mem = na->nm_mem; return 0; err_free: - netmap_mem_delete(na->nm_mem); -err_put: hwna->na_vp = hwna->na_hostvp = NULL; netmap_adapter_put(hwna); free(bna, M_DEVBUF); diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index 8e5364bbe7a2..978a4858edb9 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -3,11 +3,14 @@ # Compile netmap as a module, useful if you want a netmap bridge # or loadable drivers. +.include <bsd.own.mk> # FreeBSD 10 and earlier +# .include "${SYSDIR}/conf/kern.opts.mk" + .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net -CFLAGS += -I${.CURDIR}/../../ +CFLAGS += -I${.CURDIR}/../../ -D INET KMOD = netmap -SRCS = device_if.h bus_if.h opt_netmap.h +SRCS = device_if.h bus_if.h pci_if.h opt_netmap.h SRCS += netmap.c netmap.h netmap_kern.h SRCS += netmap_mem2.c netmap_mem2.h SRCS += netmap_generic.c @@ -17,5 +20,8 @@ SRCS += netmap_freebsd.c SRCS += netmap_offloadings.c SRCS += netmap_pipe.c SRCS += netmap_monitor.c +SRCS += netmap_pt.c +SRCS += if_ptnet.c +SRCS += opt_inet.h opt_inet6.h .include <bsd.kmod.mk> diff --git a/sys/net/netmap.h b/sys/net/netmap.h index 88b2957502ab..c3b8b9205d3d 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -137,6 +137,26 @@ * netmap:foo-k the k-th NIC ring pair * netmap:foo{k PIPE ring pair k, master side * netmap:foo}k PIPE ring pair k, slave side + * + * Some notes about host rings: + * + * + The RX host ring is used to store those packets that the host network + * stack is trying to transmit through a NIC queue, but only if that queue + * is currently in netmap mode. 
Netmap will not intercept host stack mbufs + * designated to NIC queues that are not in netmap mode. As a consequence, + * registering a netmap port with netmap:foo^ is not enough to intercept + * mbufs in the RX host ring; the netmap port should be registered with + * netmap:foo*, or another registration should be done to open at least a + * NIC TX queue in netmap mode. + * + * + Netmap is not currently able to deal with intercepted transmit mbufs which + * require offloadings like TSO, UFO, checksumming offloadings, etc. It is + * responsibility of the user to disable those offloadings (e.g. using + * ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being + * used in netmap mode. If the offloadings are not disabled, GSO and/or + * unchecksummed packets may be dropped immediately or end up in the host RX + * ring, and will be dropped as soon as the packet reaches another netmap + * adapter. */ /* @@ -277,7 +297,11 @@ struct netmap_ring { struct timeval ts; /* (k) time of last *sync() */ /* opaque room for a mutex or similar object */ - uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN))); +#if !defined(_WIN32) || defined(__CYGWIN__) + uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; +#else + uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; +#endif /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. 
*/ @@ -496,6 +520,11 @@ struct nmreq { #define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ #define NETMAP_BDG_NEWIF 6 /* create a virtual port */ #define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ +#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ +#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ +#define NETMAP_BDG_POLLING_ON 10 /* start polling kthread */ +#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ +#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ @@ -521,7 +550,61 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ #define NR_ZCOPY_MON 0x400 /* request exclusive access to the selected rings */ #define NR_EXCLUSIVE 0x800 +/* request ptnetmap host support */ +#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ +#define NR_PTNETMAP_HOST 0x1000 +#define NR_RX_RINGS_ONLY 0x2000 +#define NR_TX_RINGS_ONLY 0x4000 +/* Applications set this flag if they are able to deal with virtio-net headers, + * that is send/receive frames that start with a virtio-net header. + * If not set, NIOCREGIF will fail with netmap ports that require applications + * to use those headers. If the flag is set, the application can use the + * NETMAP_VNET_HDR_GET command to figure out the header length. */ +#define NR_ACCEPT_VNET_HDR 0x8000 +#define NM_BDG_NAME "vale" /* prefix for bridge port name */ + +/* + * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined + * in ws2def.h but not sure if they are in the form we need. 
+ * XXX so we redefine them + * in a convenient way to use for DeviceIoControl signatures + */ +#ifdef _WIN32 +#undef _IO // ws2def.h +#define _WIN_NM_IOCTL_TYPE 40000 +#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_BUFFERED, FILE_ANY_ACCESS ) +#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_OUT_DIRECT, FILE_ANY_ACCESS ) + +#define _IOWR(_c, _n, _s) _IO(_c, _n) + +/* We have some internal sysctl in addition to the externally visible ones */ +#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT +#define NETMAP_POLL _IO('i', 162) + +/* and also two setsockopt for sysctl emulation */ +#define NETMAP_SETSOCKOPT _IO('i', 140) +#define NETMAP_GETSOCKOPT _IO('i', 141) + + +//These linknames are for the Netmap Core Driver +#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" +#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" + +//Definition of a structure used to pass a virtual address within an IOCTL +typedef struct _MEMORY_ENTRY { + PVOID pUsermodeVirtualAddress; +} MEMORY_ENTRY, *PMEMORY_ENTRY; + +typedef struct _POLL_REQUEST_DATA { + int events; + int timeout; + int revents; +} POLL_REQUEST_DATA; + +#endif /* _WIN32 */ /* * FreeBSD uses the size value embedded in the _IOWR to determine @@ -561,4 +644,28 @@ struct nm_ifreq { char data[NM_IFRDATA_LEN]; }; +/* + * netmap kernel thread configuration + */ +/* bhyve/vmm.ko MSIX parameters for IOCTL */ +struct ptn_vmm_ioctl_msix { + uint64_t msg; + uint64_t addr; +}; + +/* IOCTL parameters */ +struct nm_kth_ioctl { + u_long com; + /* TODO: use union */ + union { + struct ptn_vmm_ioctl_msix msix; + } data; +}; + +/* Configuration of a ptnetmap ring */ +struct ptnet_ring_cfg { + uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ + uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ + struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ +}; #endif /* _NET_NETMAP_H_ */ diff --git 
a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 130117db7a2e..4ec3d941c504 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2016 Universita` di Pisa + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -65,9 +66,31 @@ #ifndef _NET_NETMAP_USER_H_ #define _NET_NETMAP_USER_H_ +#define NETMAP_DEVICE_NAME "/dev/netmap" + +#ifdef __CYGWIN__ +/* + * we can compile userspace apps with either cygwin or msvc, + * and we use _WIN32 to identify windows specific code + */ +#ifndef _WIN32 +#define _WIN32 +#endif /* _WIN32 */ + +#endif /* __CYGWIN__ */ + +#ifdef _WIN32 +#undef NETMAP_DEVICE_NAME +#define NETMAP_DEVICE_NAME "/proc/sys/DosDevices/Global/netmap" +#include <windows.h> +#include <WinDef.h> +#include <sys/cygwin.h> +#endif /* _WIN32 */ + #include <stdint.h> #include <sys/socket.h> /* apple needs sockaddr */ #include <net/if.h> /* IFNAMSIZ */ +#include <ctype.h> #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) @@ -172,17 +195,23 @@ nm_ring_space(struct netmap_ring *ring) } while (0) #endif -struct nm_pkthdr { /* same as pcap_pkthdr */ +struct nm_pkthdr { /* first part is the same as pcap_pkthdr */ struct timeval ts; uint32_t caplen; uint32_t len; + + uint64_t flags; /* NM_MORE_PKTS etc */ +#define NM_MORE_PKTS 1 + struct nm_desc *d; + struct netmap_slot *slot; + uint8_t *buf; }; struct nm_stat { /* same as pcap_stat */ u_int ps_recv; u_int ps_drop; u_int ps_ifdrop; -#ifdef WIN32 +#ifdef WIN32 /* XXX or _WIN32 ? 
*/ u_int bs_capt; #endif /* WIN32 */ }; @@ -284,12 +313,14 @@ typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); * -NN bind individual NIC ring pair * {NN bind master side of pipe NN * }NN bind slave side of pipe NN - * a suffix starting with + and the following flags, + * a suffix starting with / and the following flags, * in any order: * x exclusive access * z zero copy monitor * t monitor tx side * r monitor rx side + * R bind only RX ring(s) + * T bind only TX ring(s) * * req provides the initial values of nmreq before parsing ifname. * Remember that the ifname parsing will override the ring @@ -329,6 +360,13 @@ enum { static int nm_close(struct nm_desc *); /* + * nm_mmap() do mmap or inherit from parent if the nr_arg2 + * (memory block) matches. + */ + +static int nm_mmap(struct nm_desc *, const struct nm_desc *); + +/* * nm_inject() is the same as pcap_inject() * nm_dispatch() is the same as pcap_dispatch() * nm_nextpkt() is the same as pcap_next() @@ -338,13 +376,247 @@ static int nm_inject(struct nm_desc *, const void *, size_t); static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); +#ifdef _WIN32 + +intptr_t _get_osfhandle(int); /* defined in io.h in windows */ + +/* + * In windows we do not have yet native poll support, so we keep track + * of file descriptors associated to netmap ports to emulate poll on + * them and fall back on regular poll on other file descriptors. 
+ */ +struct win_netmap_fd_list { + struct win_netmap_fd_list *next; + int win_netmap_fd; + HANDLE win_netmap_handle; +}; + +/* + * list head containing all the netmap opened fd and their + * windows HANDLE counterparts + */ +static struct win_netmap_fd_list *win_netmap_fd_list_head; + +static void +win_insert_fd_record(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return; + } + } + curr = calloc(1, sizeof(*curr)); + curr->next = win_netmap_fd_list_head; + curr->win_netmap_fd = fd; + curr->win_netmap_handle = IntToPtr(_get_osfhandle(fd)); + win_netmap_fd_list_head = curr; +} + +void +win_remove_fd_record(int fd) +{ + struct win_netmap_fd_list *curr = win_netmap_fd_list_head; + struct win_netmap_fd_list *prev = NULL; + for (; curr ; prev = curr, curr = curr->next) { + if (fd != curr->win_netmap_fd) + continue; + /* found the entry */ + if (prev == NULL) { /* we are freeing the first entry */ + win_netmap_fd_list_head = curr->next; + } else { + prev->next = curr->next; + } + free(curr); + break; + } +} + + +HANDLE +win_get_netmap_handle(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return curr->win_netmap_handle; + } + } + return NULL; +} + +/* + * we need to wrap ioctl and mmap, at least for the netmap file descriptors + */ + +/* + * use this function only from netmap_user.h internal functions + * same as ioctl, returns 0 on success and -1 on error + */ +static int +win_nm_ioctl_internal(HANDLE h, int32_t ctlCode, void *arg) +{ + DWORD bReturn = 0, szIn, szOut; + BOOL ioctlReturnStatus; + void *inParam = arg, *outParam = arg; + + switch (ctlCode) { + case NETMAP_POLL: + szIn = sizeof(POLL_REQUEST_DATA); + szOut = sizeof(POLL_REQUEST_DATA); + break; + case NETMAP_MMAP: + szIn = 0; + szOut = sizeof(void*); + inParam = NULL; /* nothing on input */ + break; + case 
NIOCTXSYNC: + case NIOCRXSYNC: + szIn = 0; + szOut = 0; + break; + case NIOCREGIF: + szIn = sizeof(struct nmreq); + szOut = sizeof(struct nmreq); + break; + case NIOCCONFIG: + D("unsupported NIOCCONFIG!"); + return -1; + + default: /* a regular ioctl */ + D("invalid ioctl %x on netmap fd", ctlCode); + return -1; + } + + ioctlReturnStatus = DeviceIoControl(h, + ctlCode, inParam, szIn, + outParam, szOut, + &bReturn, NULL); + // XXX note windows returns 0 on error or async call, 1 on success + // we could call GetLastError() to figure out what happened + return ioctlReturnStatus ? 0 : -1; +} + +/* + * this function is what must be called from user-space programs + * same as ioctl, returns 0 on success and -1 on error + */ +static int +win_nm_ioctl(int fd, int32_t ctlCode, void *arg) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return ioctl(fd, ctlCode, arg); + } else { + return win_nm_ioctl_internal(h, ctlCode, arg); + } +} + +#define ioctl win_nm_ioctl /* from now on, within this file ... */ + +/* + * We cannot use the native mmap on windows + * The only parameter used is "fd", the other ones are just declared to + * make this signature comparable to the FreeBSD/Linux one + */ +static void * +win32_mmap_emulated(void *addr, size_t length, int prot, int flags, int fd, int32_t offset) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return mmap(addr, length, prot, flags, fd, offset); + } else { + MEMORY_ENTRY ret; + + return win_nm_ioctl_internal(h, NETMAP_MMAP, &ret) ? 
+ NULL : ret.pUsermodeVirtualAddress; + } +} + +#define mmap win32_mmap_emulated + +#include <sys/poll.h> /* XXX needed to use the structure pollfd */ + +static int +win_nm_poll(struct pollfd *fds, int nfds, int timeout) +{ + HANDLE h; + + if (nfds != 1 || fds == NULL || (h = win_get_netmap_handle(fds->fd)) == NULL) {; + return poll(fds, nfds, timeout); + } else { + POLL_REQUEST_DATA prd; + + prd.timeout = timeout; + prd.events = fds->events; + + win_nm_ioctl_internal(h, NETMAP_POLL, &prd); + if ((prd.revents == POLLERR) || (prd.revents == STATUS_TIMEOUT)) { + return -1; + } + return 1; + } +} + +#define poll win_nm_poll + +static int +win_nm_open(char* pathname, int flags) +{ + + if (strcmp(pathname, NETMAP_DEVICE_NAME) == 0) { + int fd = open(NETMAP_DEVICE_NAME, O_RDWR); + if (fd < 0) { + return -1; + } + + win_insert_fd_record(fd); + return fd; + } else { + return open(pathname, flags); + } +} + +#define open win_nm_open + +static int +win_nm_close(int fd) +{ + if (fd != -1) { + close(fd); + if (win_get_netmap_handle(fd) != NULL) { + win_remove_fd_record(fd); + } + } + return 0; +} + +#define close win_nm_close + +#endif /* _WIN32 */ + +static int +nm_is_identifier(const char *s, const char *e) +{ + for (; s != e; s++) { + if (!isalnum(*s) && *s != '_') { + return 0; + } + } + + return 1; +} /* * Try to open, return descriptor if successful, NULL otherwise. * An invalid netmap name will return errno = 0; * You can pass a pointer to a pre-filled nm_desc to add special * parameters. Flags is used as follows - * NM_OPEN_NO_MMAP use the memory from arg, only + * NM_OPEN_NO_MMAP use the memory from arg, only XXX avoid mmap * if the nr_arg2 (memory block) matches. 
* NM_OPEN_ARG1 use req.nr_arg1 from arg * NM_OPEN_ARG2 use req.nr_arg2 from arg @@ -359,20 +631,48 @@ nm_open(const char *ifname, const struct nmreq *req, u_int namelen; uint32_t nr_ringid = 0, nr_flags, nr_reg; const char *port = NULL; + const char *vpname = NULL; #define MAXERRMSG 80 char errmsg[MAXERRMSG] = ""; enum { P_START, P_RNGSFXOK, P_GETNUM, P_FLAGS, P_FLAGSOK } p_state; + int is_vale; long num; - if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { + if (strncmp(ifname, "netmap:", 7) && + strncmp(ifname, NM_BDG_NAME, strlen(NM_BDG_NAME))) { errno = 0; /* name not recognised, not an error */ return NULL; } - if (ifname[0] == 'n') + + is_vale = (ifname[0] == 'v'); + if (is_vale) { + port = index(ifname, ':'); + if (port == NULL) { + snprintf(errmsg, MAXERRMSG, + "missing ':' in vale name"); + goto fail; + } + + if (!nm_is_identifier(ifname + 4, port)) { + snprintf(errmsg, MAXERRMSG, "invalid bridge name"); + goto fail; + } + + vpname = ++port; + } else { ifname += 7; + port = ifname; + } + /* scan for a separator */ - for (port = ifname; *port && !index("-*^{}/", *port); port++) + for (; *port && !index("-*^{}/", *port); port++) ; + + if (is_vale && !nm_is_identifier(vpname, port)) { + snprintf(errmsg, MAXERRMSG, "invalid bridge port name"); + goto fail; + } + namelen = port - ifname; if (namelen >= sizeof(d->req.nr_name)) { snprintf(errmsg, MAXERRMSG, "name too long"); @@ -449,6 +749,12 @@ nm_open(const char *ifname, const struct nmreq *req, case 'r': nr_flags |= NR_MONITOR_RX; break; + case 'R': + nr_flags |= NR_RX_RINGS_ONLY; + break; + case 'T': + nr_flags |= NR_TX_RINGS_ONLY; + break; default: snprintf(errmsg, MAXERRMSG, "unrecognized flag: '%c'", *port); goto fail; @@ -462,6 +768,11 @@ nm_open(const char *ifname, const struct nmreq *req, snprintf(errmsg, MAXERRMSG, "unexpected end of port name"); goto fail; } + if ((nr_flags & NR_ZCOPY_MON) && + !(nr_flags & (NR_MONITOR_TX|NR_MONITOR_RX))) { + snprintf(errmsg, MAXERRMSG, "'z' used but 
neither 'r', nor 't' found"); + goto fail; + } ND("flags: %s %s %s %s", (nr_flags & NR_EXCLUSIVE) ? "EXCLUSIVE" : "", (nr_flags & NR_ZCOPY_MON) ? "ZCOPY_MON" : "", @@ -474,7 +785,7 @@ nm_open(const char *ifname, const struct nmreq *req, return NULL; } d->self = d; /* set this early so nm_close() works */ - d->fd = open("/dev/netmap", O_RDWR); + d->fd = open(NETMAP_DEVICE_NAME, O_RDWR); if (d->fd < 0) { snprintf(errmsg, MAXERRMSG, "cannot open /dev/netmap: %s", strerror(errno)); goto fail; @@ -487,7 +798,7 @@ nm_open(const char *ifname, const struct nmreq *req, /* these fields are overridden by ifname and flags processing */ d->req.nr_ringid |= nr_ringid; - d->req.nr_flags = nr_flags; + d->req.nr_flags |= nr_flags; memcpy(d->req.nr_name, ifname, namelen); d->req.nr_name[namelen] = '\0'; /* optionally import info from parent */ @@ -529,31 +840,10 @@ nm_open(const char *ifname, const struct nmreq *req, goto fail; } - if (IS_NETMAP_DESC(parent) && parent->mem && - parent->req.nr_arg2 == d->req.nr_arg2) { - /* do not mmap, inherit from parent */ - d->memsize = parent->memsize; - d->mem = parent->mem; - } else { - /* XXX TODO: check if memsize is too large (or there is overflow) */ - d->memsize = d->req.nr_memsize; - d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, - d->fd, 0); - if (d->mem == MAP_FAILED) { - snprintf(errmsg, MAXERRMSG, "mmap failed: %s", strerror(errno)); - goto fail; - } - d->done_mmap = 1; - } - { - struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); - struct netmap_ring *r = NETMAP_RXRING(nifp, ); - - *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; - *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; - *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); - *(void **)(uintptr_t)&d->buf_end = - (char *)d->mem + d->memsize; + /* if parent is defined, do nm_mmap() even if NM_OPEN_NO_MMAP is set */ + if ((!(new_flags & NM_OPEN_NO_MMAP) || parent) && nm_mmap(d, parent)) { + snprintf(errmsg, MAXERRMSG, "mmap failed: %s", 
strerror(errno)); + goto fail; } nr_reg = d->req.nr_flags & NR_REG_MASK; @@ -626,14 +916,54 @@ nm_close(struct nm_desc *d) return EINVAL; if (d->done_mmap && d->mem) munmap(d->mem, d->memsize); - if (d->fd != -1) + if (d->fd != -1) { close(d->fd); + } + bzero(d, sizeof(*d)); free(d); return 0; } +static int +nm_mmap(struct nm_desc *d, const struct nm_desc *parent) +{ + //XXX TODO: check if mmap is already done + + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + D("do not mmap, inherit from parent"); + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + /* XXX TODO: check if memsize is too large (or there is overflow) */ + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == MAP_FAILED) { + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + return 0; + +fail: + return EINVAL; +} + /* * Same prototype as pcap_inject(), only need to cast. 
*/ @@ -674,6 +1004,9 @@ nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) { int n = d->last_rx_ring - d->first_rx_ring + 1; int c, got = 0, ri = d->cur_rx_ring; + d->hdr.buf = NULL; + d->hdr.flags = NM_MORE_PKTS; + d->hdr.d = d; if (cnt == 0) cnt = -1; @@ -690,17 +1023,24 @@ nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) ri = d->first_rx_ring; ring = NETMAP_RXRING(d->nifp, ri); for ( ; !nm_ring_empty(ring) && cnt != got; got++) { - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - u_char *buf = (u_char *)NETMAP_BUF(ring, idx); - + u_int idx, i; + if (d->hdr.buf) { /* from previous round */ + cb(arg, &d->hdr, d->hdr.buf); + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + d->hdr.slot = &ring->slot[i]; + d->hdr.buf = (u_char *)NETMAP_BUF(ring, idx); // __builtin_prefetch(buf); d->hdr.len = d->hdr.caplen = ring->slot[i].len; d->hdr.ts = ring->ts; - cb(arg, &d->hdr, buf); ring->head = ring->cur = nm_ring_next(ring, i); } } + if (d->hdr.buf) { /* from previous round */ + d->hdr.flags = 0; + cb(arg, &d->hdr, d->hdr.buf); + } d->cur_rx_ring = ri; return got; } diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile index 7d7c44b1cce1..8daf59ff8ba8 100644 --- a/tools/tools/netmap/Makefile +++ b/tools/tools/netmap/Makefile @@ -3,11 +3,12 @@ # # For multiple programs using a single source file each, # we can just define 'progs' and create custom targets. 
-PROGS = pkt-gen bridge vale-ctl +PROGS = pkt-gen nmreplay bridge vale-ctl CLEANFILES = $(PROGS) *.o MAN= -CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Werror -Wall +CFLAGS += -nostdinc -I ../../../sys -I/usr/include CFLAGS += -Wextra LDFLAGS += -lpthread @@ -16,6 +17,7 @@ CFLAGS += -DNO_PCAP .else LDFLAGS += -lpcap .endif +LDFLAGS += -lm # used by nmreplay .include <bsd.prog.mk> .include <bsd.lib.mk> @@ -28,5 +30,8 @@ pkt-gen: pkt-gen.o bridge: bridge.o $(CC) $(CFLAGS) -o bridge bridge.o +nmreplay: nmreplay.o + $(CC) $(CFLAGS) -o nmreplay nmreplay.o $(LDFLAGS) + vale-ctl: vale-ctl.o $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c index 0895d4ede676..e99a507a829a 100644 --- a/tools/tools/netmap/bridge.c +++ b/tools/tools/netmap/bridge.c @@ -143,7 +143,7 @@ static void usage(void) { fprintf(stderr, - "usage: bridge [-v] [-i ifa] [-i ifb] [-b burst] [-w wait_time] [iface]\n"); + "usage: bridge [-v] [-i ifa] [-i ifb] [-b burst] [-w wait_time] [ifa [ifb [burst]]]\n"); exit(1); } @@ -201,12 +201,12 @@ main(int argc, char **argv) argc -= optind; argv += optind; + if (argc > 0) + ifa = argv[0]; if (argc > 1) - ifa = argv[1]; + ifb = argv[1]; if (argc > 2) - ifb = argv[2]; - if (argc > 3) - burst = atoi(argv[3]); + burst = atoi(argv[2]); if (!ifb) ifb = ifa; if (!ifa) { @@ -233,7 +233,7 @@ main(int argc, char **argv) D("cannot open %s", ifa); return (1); } - // XXX use a single mmap ? 
+ /* try to reuse the mmap() of the first interface, if possible */ pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); if (pb == NULL) { D("cannot open %s", ifb); @@ -262,6 +262,23 @@ main(int argc, char **argv) pollfd[0].revents = pollfd[1].revents = 0; n0 = pkt_queued(pa, 0); n1 = pkt_queued(pb, 0); +#if defined(_WIN32) || defined(BUSYWAIT) + if (n0){ + ioctl(pollfd[1].fd, NIOCTXSYNC, NULL); + pollfd[1].revents = POLLOUT; + } + else { + ioctl(pollfd[0].fd, NIOCRXSYNC, NULL); + } + if (n1){ + ioctl(pollfd[0].fd, NIOCTXSYNC, NULL); + pollfd[0].revents = POLLOUT; + } + else { + ioctl(pollfd[1].fd, NIOCRXSYNC, NULL); + } + ret = 1; +#else if (n0) pollfd[1].events |= POLLOUT; else @@ -271,6 +288,7 @@ main(int argc, char **argv) else pollfd[1].events |= POLLIN; ret = poll(pollfd, 2, 2500); +#endif //defined(_WIN32) || defined(BUSYWAIT) if (ret <= 0 || verbose) D("poll %s [0] ev %x %x rx %d@%d tx %d," " [1] ev %x %x rx %d@%d tx %d", diff --git a/tools/tools/netmap/ctrs.h b/tools/tools/netmap/ctrs.h new file mode 100644 index 000000000000..cee316477144 --- /dev/null +++ b/tools/tools/netmap/ctrs.h @@ -0,0 +1,108 @@ +#ifndef CTRS_H_ +#define CTRS_H_ + +/* $FreeBSD$ */ + +#include <sys/time.h> + +/* counters to accumulate statistics */ +struct my_ctrs { + uint64_t pkts, bytes, events, drop; + uint64_t min_space; + struct timeval t; +}; + +/* very crude code to print a number in normalized form. + * Caller has to make sure that the buffer is large enough. 
+ */ +static const char * +norm2(char *buf, double val, char *fmt) +{ + char *units[] = { "", "K", "M", "G", "T" }; + u_int i; + + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) + val /= 1000; + sprintf(buf, fmt, val, units[i]); + return buf; +} + +static __inline const char * +norm(char *buf, double val) +{ + return norm2(buf, val, "%.3f %s"); +} + +static __inline int +timespec_ge(const struct timespec *a, const struct timespec *b) +{ + + if (a->tv_sec > b->tv_sec) + return (1); + if (a->tv_sec < b->tv_sec) + return (0); + if (a->tv_nsec >= b->tv_nsec) + return (1); + return (0); +} + +static __inline struct timespec +timeval2spec(const struct timeval *a) +{ + struct timespec ts = { + .tv_sec = a->tv_sec, + .tv_nsec = a->tv_usec * 1000 + }; + return ts; +} + +static __inline struct timeval +timespec2val(const struct timespec *a) +{ + struct timeval tv = { + .tv_sec = a->tv_sec, + .tv_usec = a->tv_nsec / 1000 + }; + return tv; +} + + +static __inline struct timespec +timespec_add(struct timespec a, struct timespec b) +{ + struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; + if (ret.tv_nsec >= 1000000000) { + ret.tv_sec++; + ret.tv_nsec -= 1000000000; + } + return ret; +} + +static __inline struct timespec +timespec_sub(struct timespec a, struct timespec b) +{ + struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; + if (ret.tv_nsec < 0) { + ret.tv_sec--; + ret.tv_nsec += 1000000000; + } + return ret; +} + +static uint64_t +wait_for_next_report(struct timeval *prev, struct timeval *cur, + int report_interval) +{ + struct timeval delta; + + delta.tv_sec = report_interval/1000; + delta.tv_usec = (report_interval%1000)*1000; + if (select(0, NULL, NULL, NULL, &delta) < 0 && errno != EINTR) { + perror("select"); + abort(); + } + gettimeofday(cur, NULL); + timersub(cur, prev, &delta); + return delta.tv_sec* 1000000 + delta.tv_usec; +} +#endif /* CTRS_H_ */ diff --git a/tools/tools/netmap/nmreplay.8 
b/tools/tools/netmap/nmreplay.8 new file mode 100644 index 000000000000..8e5ddb9698dd --- /dev/null +++ b/tools/tools/netmap/nmreplay.8 @@ -0,0 +1,129 @@ +.\" Copyright (c) 2016 Luigi Rizzo, Universita` di Pisa +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd February 16, 2016 +.Dt NMREPLAY 1 +.Os +.Sh NAME +.Nm nmreplay +.Nd playback a pcap file through a netmap interface +.Sh SYNOPSIS +.Bk -words +.Bl -tag -width "nmreplay" +.It Nm +.Op Fl f Ar pcap-file +.Op Fl i Ar netmap-interface +.Op Fl B Ar bandwidth +.Op Fl D Ar delay +.Op Fl L Ar loss +.Op Fl b Ar batch size +.Op Fl w Ar wait-link +.Op Fl v +.Op Fl C Ar cpu-placement +.Sh DESCRIPTION +.Nm +works like +.Nm tcpreplay +to replay a pcap file through a netmap interface, +with programmable rates and possibly delays, losses +and packet alterations. +.Nm +is designed to run at high speed, so the transmit schedule +is computed ahead of time, and the thread in charge of transmission +only has to pump data through the interface. +.Nm +can connect to any type of netmap port. +.Pp +Command line options are as follows +.Bl -tag -width Ds +.It Fl f Ar pcap-file +Name of the pcap file to replay. +.It Fl i Ar interface +Name of the netmap interface to use as output. +.It Fl v +Enable verbose mode +.It Fl b Ar batch-size +Maximum batch size to use during transmissions. +.Nm +normally transmits packets one at a time, but it may use +larger batches, up to the value specified with this option, +when running at high rates. +.It Fl B Ar bps | Cm constant, Ns Ar bps | Cm ether, Ns Ar bps | Cm real Ns Op , Ns Ar speedup +Bandwidth to be used for transmission. +.Ar bps +is a floating point number optionally followed by a character +(k, K, m, M, g, G) that multiplies the value by 10^3, 10^6 and 10^9 +respectively. +.Cm constant +(can be omitted) means that the bandwidth will be computed +with reference to the actual packet size (excluding CRC and framing). +.Cm ether +indicates that the ethernet framing (160 bits) and CRC (32 bits) +will be included in the computation of the packet size. +.Cm real +means transmission will occur according to the timestamps +recorded in the trace. 
The optional +.Ar speedup +multiplier (defaults to 1) indicates how much faster +or slower than real time the trace should be replayed. +.It Fl D Ar dt | Cm constant, Ns Ar dt | Cm uniform, Ns Ar dmin,dmax | Cm exp, Ar dmin,davg +Adds additional delay to the packet transmission, whose distribution +can be constant, uniform or exponential. +.Ar dt, dmin, dmax, davg +are times expressed as floating point numbers optionally followed +by a character (s, m, u, n) to indicate seconds, milliseconds, +microseconds, nanoseconds. +The delay is added to the transmit time and adjusted so that there is +never packet reordering. +.It Fl L Ar x | Cm plr, Ns Ar x | Cm ber, Ns Ar x +Simulates packet or bit errors, causing offending packets to be dropped. +.Ar x +is a floating point number indicating the packet or bit error rate. +.It Fl w Ar wait-link +indicates the number of seconds to wait before transmitting. +It defaults to 2, and may be useful when talking to physical +ports to let link negotiation complete before starting transmission. +.El +.Sh OPERATION +.Nm +creates an in-memory schedule with all packets to be transmitted, +and then launches a separate thread to take care of transmissions +while the main thread reports statistics every second. +.Sh SEE ALSO +.Pa http://info.iet.unipi.it/~luigi/netmap/ +.Pp +Luigi Rizzo, Revisiting network I/O APIs: the netmap framework, +Communications of the ACM, 55 (3), pp.45-51, March 2012 +.Pp +Luigi Rizzo, Giuseppe Lettieri, +VALE, a switched ethernet for virtual machines, +ACM CoNEXT'12, December 2012, Nice +.Sh AUTHORS +.An -nosplit +.Nm +has been written by +.An Luigi Rizzo, Andrea Beconcini, Francesco Mola and Lorenzo Biagini +at the Universita` di Pisa, Italy. diff --git a/tools/tools/netmap/nmreplay.c b/tools/tools/netmap/nmreplay.c new file mode 100644 index 000000000000..7a46bd57e198 --- /dev/null +++ b/tools/tools/netmap/nmreplay.c @@ -0,0 +1,1820 @@ +/* + * Copyright (C) 2016 Universita` di Pisa. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#if 0 /* COMMENT */ + +This program implements NMREPLAY, a program to replay a pcap file +enforcing the output rate and possibly random losses and delay +distributions. +It is meant to be run from the command line and implemented with a main +control thread for monitoring, plus a thread to push packets out. + +The control thread parses command line arguments, prepares a +schedule for transmission in a memory buffer and then sits +in a loop where it periodically reads traffic statistics from +the other threads and prints them out on the console. + +The transmit buffer contains headers and packets. 
Each header +includes a timestamp that determines when the packet should be sent out. +A "consumer" thread cons() reads from the queue and transmits packets +on the output netmap port when their time has come. + +The program does CPU pinning and sets the scheduler and priority +for the "cons" threads. Externally one should do the +assignment of other threads (e.g. interrupt handlers) and +make sure that network interfaces are configured properly. + +--- Main functions of the program --- +within each function, q is used as a pointer to the queue holding +packets and parameters. + +pcap_prod() + + reads from the pcap file and prepares packets to transmit. + After reading a packet from the pcap file, the following information + are extracted which can be used to determine the schedule: + + q->cur_pkt points to the buffer containing the packet + q->cur_len packet length, excluding CRC + q->cur_caplen available packet length (may be shorter than cur_len) + q->cur_tt transmission time for the packet, computed from the trace. + + The following functions are then called in sequence: + + q->c_loss (set with the -L command line option) decides + whether the packet should be dropped before even queuing. + This is generally useful to emulate random loss. + The function is supposed to set q->c_drop = 1 if the + packet should be dropped, or leave it to 0 otherwise. + + q->c_bw (set with the -B command line option) is used to + enforce the transmit bandwidth. The function must store + in q->cur_tt the transmission time (in nanoseconds) of + the packet, which is typically proportional to the length + of the packet, i.e. q->cur_tt = q->cur_len / <bandwidth> + Variants are possible, eg. to account for constant framing + bits as on the ethernet, or variable channel acquisition times, + etc. + This mechanism can also be used to simulate variable queueing + delay e.g. due to the presence of cross traffic. + + q->c_delay (set with the -D option) implements delay emulation. 
+ The function should set q->cur_delay to the additional + delay the packet is subject to. The framework will take care of + computing the actual exit time of a packet so that there is no + reordering. + + +#endif /* COMMENT */ + +// debugging macros +#define NED(_fmt, ...) do {} while (0) +#define ED(_fmt, ...) \ + do { \ + struct timeval _t0; \ + gettimeofday(&_t0, NULL); \ + fprintf(stderr, "%03d.%03d %-10.10s [%5d] \t" _fmt "\n", \ + (int)(_t0.tv_sec % 1000), (int)_t0.tv_usec/1000, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* WWW is for warnings, EEE is for errors */ +#define WWW(_fmt, ...) ED("--WWW-- " _fmt, ##__VA_ARGS__) +#define EEE(_fmt, ...) ED("--EEE-- " _fmt, ##__VA_ARGS__) +#define DDD(_fmt, ...) ED("--DDD-- " _fmt, ##__VA_ARGS__) + +#define _GNU_SOURCE // for CPU_SET() etc +#include <stdio.h> +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> +#include <sys/poll.h> + + +/* + * +A packet in the queue is q_pkt plus the payload. + +For the packet descriptor we need the following: + + - position of next packet in the queue (can go backwards). + We can reduce to 32 bits if we consider alignments, + or we just store the length to be added to the current + value and assume 0 as a special index. + - actual packet length (16 bits may be ok) + - queue output time, in nanoseconds (64 bits) + - delay line output time, in nanoseconds + One of the two can be packed to a 32bit value + +A convenient coding uses 32 bytes per packet. + + */ + +struct q_pkt { + uint64_t next; /* buffer index for next packet */ + uint64_t pktlen; /* actual packet len */ + uint64_t pt_qout; /* time of output from queue */ + uint64_t pt_tx; /* transmit time */ +}; + + +/* + * The header for a pcap file + */ +struct pcap_file_header { + uint32_t magic; + /*used to detect the file format itself and the byte + ordering. The writing application writes 0xa1b2c3d4 with it's native byte + ordering format into this field. 
The reading application will read either + 0xa1b2c3d4 (identical) or 0xd4c3b2a1 (swapped). If the reading application + reads the swapped 0xd4c3b2a1 value, it knows that all the following fields + will have to be swapped too. For nanosecond-resolution files, the writing + application writes 0xa1b23c4d, with the two nibbles of the two lower-order + bytes swapped, and the reading application will read either 0xa1b23c4d + (identical) or 0x4d3cb2a1 (swapped)*/ + uint16_t version_major; + uint16_t version_minor; /*the version number of this file format */ + int32_t thiszone; + /*the correction time in seconds between GMT (UTC) and the + local timezone of the following packet header timestamps. Examples: If the + timestamps are in GMT (UTC), thiszone is simply 0. If the timestamps are in + Central European time (Amsterdam, Berlin, ...) which is GMT + 1:00, thiszone + must be -3600*/ + uint32_t stampacc; /*the accuracy of time stamps in the capture*/ + uint32_t snaplen; + /*the "snapshot length" for the capture (typically 65535 + or even more, but might be limited by the user)*/ + uint32_t network; + /*link-layer header type, specifying the type of headers + at the beginning of the packet (e.g. 
1 for Ethernet); this can be various + types such as 802.11, 802.11 with various radio information, PPP, Token + Ring, FDDI, etc.*/ +}; + +#if 0 /* from pcap.h */ +struct pcap_file_header { + bpf_u_int32 magic; + u_short version_major; + u_short version_minor; + bpf_int32 thiszone; /* gmt to local correction */ + bpf_u_int32 sigfigs; /* accuracy of timestamps */ + bpf_u_int32 snaplen; /* max length saved portion of each pkt */ + bpf_u_int32 linktype; /* data link type (LINKTYPE_*) */ +}; + +struct pcap_pkthdr { + struct timeval ts; /* time stamp */ + bpf_u_int32 caplen; /* length of portion present */ + bpf_u_int32 len; /* length this packet (off wire) */ +}; +#endif /* from pcap.h */ + +struct pcap_pkthdr { + uint32_t ts_sec; /* seconds from epoch */ + uint32_t ts_frac; /* microseconds or nanoseconds depending on sigfigs */ + uint32_t caplen; + /*the number of bytes of packet data actually captured + and saved in the file. This value should never become larger than orig_len + or the snaplen value of the global header*/ + uint32_t len; /* wire length */ +}; + + +#define PKT_PAD (32) /* padding on packets */ + +static inline int pad(int x) +{ + return ((x) + PKT_PAD - 1) & ~(PKT_PAD - 1) ; +} + + + +/* + * wrapper around the pcap file. + * We mmap the file so it is easy to do multiple passes through it. + */ +struct nm_pcap_file { + int fd; + uint64_t filesize; + const char *data; /* mmapped file */ + + uint64_t tot_pkt; + uint64_t tot_bytes; + uint64_t tot_bytes_rounded; /* need hdr + pad(len) */ + uint32_t resolution; /* 1000 for us, 1 for ns */ + int swap; /* need to swap fields ? 
*/ + + uint64_t first_ts; + uint64_t total_tx_time; + /* + * total_tx_time is computed as last_ts - first_ts, plus the + * transmission time for the first packet which in turn is + * computed according to the average bandwidth + */ + + uint64_t file_len; + const char *cur; /* running pointer */ + const char *lim; /* data + file_len */ + int err; +}; + +static struct nm_pcap_file *readpcap(const char *fn); +static void destroy_pcap(struct nm_pcap_file *file); + + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> /* memcpy */ + +#include <sys/mman.h> + +#define NS_SCALE 1000000000UL /* nanoseconds in 1s */ + +static void destroy_pcap(struct nm_pcap_file *pf) +{ + if (!pf) + return; + + munmap((void *)(uintptr_t)pf->data, pf->filesize); + close(pf->fd); + bzero(pf, sizeof(*pf)); + free(pf); + return; +} + +// convert a field of given size if swap is needed. +static uint32_t +cvt(const void *src, int size, char swap) +{ + uint32_t ret = 0; + if (size != 2 && size != 4) { + EEE("Invalid size %d\n", size); + exit(1); + } + memcpy(&ret, src, size); + if (swap) { + unsigned char tmp, *data = (unsigned char *)&ret; + int i; + for (i = 0; i < size / 2; i++) { + tmp = data[i]; + data[i] = data[size - (1 + i)]; + data[size - (1 + i)] = tmp; + } + } + return ret; +} + +static uint32_t +read_next_info(struct nm_pcap_file *pf, int size) +{ + const char *end = pf->cur + size; + uint32_t ret; + if (end > pf->lim) { + pf->err = 1; + ret = 0; + } else { + ret = cvt(pf->cur, size, pf->swap); + pf->cur = end; + } + return ret; +} + +/* + * mmap the file, make sure timestamps are sorted, and count + * packets and sizes + * Timestamps represent the receive time of the packets. + * We need to compute also the 'first_ts' which refers to a hypotetical + * packet right before the first one, see the code for details. 
+ */ +static struct nm_pcap_file * +readpcap(const char *fn) +{ + struct nm_pcap_file _f, *pf = &_f; + uint64_t prev_ts, first_pkt_time; + uint32_t magic, first_len = 0; + + bzero(pf, sizeof(*pf)); + pf->fd = open(fn, O_RDONLY); + if (pf->fd < 0) { + EEE("cannot open file %s", fn); + return NULL; + } + /* compute length */ + pf->filesize = lseek(pf->fd, 0, SEEK_END); + lseek(pf->fd, 0, SEEK_SET); + ED("filesize is %lu", (u_long)(pf->filesize)); + if (pf->filesize < sizeof(struct pcap_file_header)) { + EEE("file too short %s", fn); + close(pf->fd); + return NULL; + } + pf->data = mmap(NULL, pf->filesize, PROT_READ, MAP_SHARED, pf->fd, 0); + if (pf->data == MAP_FAILED) { + EEE("cannot mmap file %s", fn); + close(pf->fd); + return NULL; + } + pf->cur = pf->data; + pf->lim = pf->data + pf->filesize; + pf->err = 0; + pf->swap = 0; /* default, same endianness when read magic */ + + magic = read_next_info(pf, 4); + ED("magic is 0x%x", magic); + switch (magic) { + case 0xa1b2c3d4: /* native, us resolution */ + pf->swap = 0; + pf->resolution = 1000; + break; + case 0xd4c3b2a1: /* swapped, us resolution */ + pf->swap = 1; + pf->resolution = 1000; + break; + case 0xa1b23c4d: /* native, ns resolution */ + pf->swap = 0; + pf->resolution = 1; /* nanoseconds */ + break; + case 0x4d3cb2a1: /* swapped, ns resolution */ + pf->swap = 1; + pf->resolution = 1; /* nanoseconds */ + break; + default: + EEE("unknown magic 0x%x", magic); + return NULL; + } + + ED("swap %d res %d\n", pf->swap, pf->resolution); + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->lim = pf->data + pf->filesize; + pf->err = 0; + prev_ts = 0; + while (pf->cur < pf->lim && pf->err == 0) { + uint32_t base = pf->cur - pf->data; + uint64_t cur_ts = read_next_info(pf, 4) * NS_SCALE + + read_next_info(pf, 4) * pf->resolution; + uint32_t caplen = read_next_info(pf, 4); + uint32_t len = read_next_info(pf, 4); + + if (pf->err) { + WWW("end of pcap file after %d packets\n", + (int)pf->tot_pkt); + break; + } + if 
(cur_ts < prev_ts) { + WWW("reordered packet %d\n", + (int)pf->tot_pkt); + } + prev_ts = cur_ts; + (void)base; + if (pf->tot_pkt == 0) { + pf->first_ts = cur_ts; + first_len = len; + } + pf->tot_pkt++; + pf->tot_bytes += len; + pf->tot_bytes_rounded += pad(len) + sizeof(struct q_pkt); + pf->cur += caplen; + } + pf->total_tx_time = prev_ts - pf->first_ts; /* excluding first packet */ + ED("tot_pkt %lu tot_bytes %lu tx_time %.6f s first_len %lu", + (u_long)pf->tot_pkt, (u_long)pf->tot_bytes, + 1e-9*pf->total_tx_time, (u_long)first_len); + /* + * We determine that based on the + * average bandwidth of the trace, as follows + * first_pkt_ts = p[0].len / avg_bw + * In turn avg_bw = (total_len - p[0].len)/(p[n-1].ts - p[0].ts) + * so + * first_ts = p[0].ts - p[0].len * (p[n-1].ts - p[0].ts) / (total_len - p[0].len) + */ + if (pf->tot_bytes == first_len) { + /* cannot estimate bandwidth, so force 1 Gbit */ + first_pkt_time = first_len * 8; /* * 10^9 / bw */ + } else { + first_pkt_time = pf->total_tx_time * first_len / (pf->tot_bytes - first_len); + } + ED("first_pkt_time %.6f s", 1e-9*first_pkt_time); + pf->total_tx_time += first_pkt_time; + pf->first_ts -= first_pkt_time; + + /* all correct, allocate a record and copy */ + pf = calloc(1, sizeof(*pf)); + *pf = _f; + /* reset pointer to start */ + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->err = 0; + return pf; +} + +enum my_pcap_mode { PM_NONE, PM_FAST, PM_FIXED, PM_REAL }; + +int verbose = 0; + +static int do_abort = 0; + +#include <stdlib.h> +#include <stdio.h> +#include <pthread.h> +#include <sys/time.h> + +#include <sys/resource.h> // setpriority + +#ifdef __FreeBSD__ +#include <pthread_np.h> /* pthread w/ affinity */ +#include <sys/cpuset.h> /* cpu_set */ +#endif /* __FreeBSD__ */ + +#ifdef linux +#define cpuset_t cpu_set_t +#endif + +#ifdef __APPLE__ +#define cpuset_t uint64_t // XXX +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + 
*p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) ((void)a, 0) +#define sched_setscheduler(a, b, c) (1) /* error */ +#define clock_gettime(a,b) \ + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) + +#define _P64 unsigned long +#endif + +#ifndef _P64 + +/* we use uint64_t widely, but printf gives trouble on different + * platforms so we use _P64 as a cast + */ +#define _P64 uint64_t +#endif /* print stuff */ + + +struct _qs; /* forward */ +/* + * descriptor of a configuration entry. + * Each handler has a parse function which takes ac/av[] and returns + * true if successful. Any allocated space is stored into struct _cfg * + * that is passed as argument. + * arg and arg_len are included for convenience. + */ +struct _cfg { + int (*parse)(struct _qs *, struct _cfg *, int ac, char *av[]); /* 0 ok, 1 on error */ + int (*run)(struct _qs *, struct _cfg *arg); /* 0 Ok, 1 on error */ + // int close(struct _qs *, void *arg); /* 0 Ok, 1 on error */ + + const char *optarg; /* command line argument. Initial value is the error message */ + /* placeholders for common values */ + void *arg; /* allocated memory if any */ + int arg_len; /* size of *arg in case a realloc is needed */ + uint64_t d[16]; /* static storage for simple cases */ + double f[4]; /* static storage for simple cases */ +}; + + +/* + * communication occurs through this data structure, with fields + * cache-aligned according to who are the readers/writers. + * + +The queue is an array of memory (buf) of size buflen (does not change). + +The producer uses 'tail' as an index in the queue to indicate +the first empty location (ie. after the last byte of data), +the consumer uses head to indicate the next byte to consume. + +For best performance we should align buffers and packets +to multiples of cacheline, but this would explode memory too much. +Worst case memory explosion is with 65 byte packets. 
+Memory usage as shown below: + + qpkt-pad + size 32-16 32-32 32-64 64-64 + + 64 96 96 96 128 + 65 112 128 160 192 + + +An empty queue has head == tail, a full queue will have free space +below a threshold. In our case the queue is large enough and we +are non blocking so we can simply drop traffic when the queue +approaches a full state. + +To simulate bandwidth limitations efficiently, the producer has a second +pointer, prod_tail_1, used to check for expired packets. This is done lazily. + + */ +/* + * When sizing the buffer, we must assume some value for the bandwidth. + * INFINITE_BW is supposed to be faster than what we support + */ +#define INFINITE_BW (200ULL*1000000*1000) +#define MY_CACHELINE (128ULL) +#define MAX_PKT (9200) /* max packet size */ + +#define ALIGN_CACHE __attribute__ ((aligned (MY_CACHELINE))) + +struct _qs { /* shared queue */ + uint64_t t0; /* start of times */ + + uint64_t buflen; /* queue length */ + char *buf; + + /* handlers for various options */ + struct _cfg c_delay; + struct _cfg c_bw; + struct _cfg c_loss; + + /* producer's fields */ + uint64_t tx ALIGN_CACHE; /* tx counter */ + uint64_t prod_tail_1; /* head of queue */ + uint64_t prod_head; /* cached copy */ + uint64_t prod_tail; /* cached copy */ + uint64_t prod_drop; /* drop packet count */ + uint64_t prod_max_gap; /* rx round duration */ + + struct nm_pcap_file *pcap; /* the pcap struct */ + + /* parameters for reading from the netmap port */ + struct nm_desc *src_port; /* netmap descriptor */ + const char * prod_ifname; /* interface name or pcap file */ + struct netmap_ring *rxring; /* current ring being handled */ + uint32_t si; /* ring index */ + int burst; + uint32_t rx_qmax; /* stats on max queued */ + + uint64_t qt_qout; /* queue exit time for last packet */ + /* + * when doing shaping, the software computes and stores here + * the time when the most recently queued packet will exit from + * the queue. 
+ */ + + uint64_t qt_tx; /* delay line exit time for last packet */ + /* + * The software computes the time at which the most recently + * queued packet exits from the queue. + * To avoid reordering, the next packet should exit at least + * at qt_tx + cur_tt + */ + + /* producer's fields controlling the queueing */ + const char * cur_pkt; /* current packet being analysed */ + uint32_t cur_len; /* length of current packet */ + uint32_t cur_caplen; /* captured length of current packet */ + + int cur_drop; /* 1 if current packet should be dropped. */ + /* + * cur_drop can be set as a result of the loss emulation, + * and may need to use the packet size, current time, etc. + */ + + uint64_t cur_tt; /* transmission time (ns) for current packet */ + /* + * The transmission time is how much link time the packet will consume. + * should be set by the function that does the bandwidth emulation, + * but could also be the result of a function that emulates the + * presence of competing traffic, MAC protocols etc. + * cur_tt is 0 for links with infinite bandwidth. + */ + + uint64_t cur_delay; /* delay (ns) for current packet from c_delay.run() */ + /* + * this should be set by the function that computes the extra delay + * applied to the packet. + * The code makes sure that there is no reordering and possibly + * bumps the output time as needed. 
+ */ + + + /* consumer's fields */ + const char * cons_ifname; + uint64_t rx ALIGN_CACHE; /* rx counter */ + uint64_t cons_head; /* cached copy */ + uint64_t cons_tail; /* cached copy */ + uint64_t cons_now; /* most recent producer timestamp */ + uint64_t rx_wait; /* stats */ + + /* shared fields */ + volatile uint64_t _tail ALIGN_CACHE ; /* producer writes here */ + volatile uint64_t _head ALIGN_CACHE ; /* consumer reads from here */ +}; + +struct pipe_args { + int wait_link; + + pthread_t cons_tid; /* main thread */ + pthread_t prod_tid; /* producer thread */ + + /* Affinity: */ + int cons_core; /* core for cons() */ + int prod_core; /* core for prod() */ + + struct nm_desc *pa; /* netmap descriptor */ + struct nm_desc *pb; + + struct _qs q; +}; + +#define NS_IN_S (1000000000ULL) // nanoseconds +#define TIME_UNITS NS_IN_S +/* set the thread affinity. */ +static int +setaffinity(int i) +{ + cpuset_t cpumask; + struct sched_param p; + + if (i == -1) + return 0; + + /* Set thread affinity affinity.*/ + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &cpumask) != 0) { + WWW("Unable to set affinity: %s", strerror(errno)); + } + if (setpriority(PRIO_PROCESS, 0, -10)) {; // XXX not meaningful + WWW("Unable to set priority: %s", strerror(errno)); + } + bzero(&p, sizeof(p)); + p.sched_priority = 10; // 99 on linux ? + // use SCHED_RR or SCHED_FIFO + if (sched_setscheduler(0, SCHED_RR, &p)) { + WWW("Unable to set scheduler: %s", strerror(errno)); + } + return 0; +} + + +/* + * set the timestamp from the clock, subtract t0 + */ +static inline void +set_tns_now(uint64_t *now, uint64_t t0) +{ + struct timespec t; + + clock_gettime(CLOCK_REALTIME, &t); // XXX precise on FreeBSD ? 
+ *now = (uint64_t)(t.tv_nsec + NS_IN_S * t.tv_sec); + *now -= t0; +} + + + +/* compare two timestamps */ +static inline int64_t +ts_cmp(uint64_t a, uint64_t b) +{ + return (int64_t)(a - b); +} + +/* create a packet descriptor */ +static inline struct q_pkt * +pkt_at(struct _qs *q, uint64_t ofs) +{ + return (struct q_pkt *)(q->buf + ofs); +} + + +/* + * we have already checked for room and prepared p->next + */ +static inline int +enq(struct _qs *q) +{ + struct q_pkt *p = pkt_at(q, q->prod_tail); + + /* hopefully prefetch has been done ahead */ + nm_pkt_copy(q->cur_pkt, (char *)(p+1), q->cur_caplen); + p->pktlen = q->cur_len; + p->pt_qout = q->qt_qout; + p->pt_tx = q->qt_tx; + p->next = q->prod_tail + pad(q->cur_len) + sizeof(struct q_pkt); + ND("enqueue len %d at %d new tail %ld qout %.6f tx %.6f", + q->cur_len, (int)q->prod_tail, p->next, + 1e-9*p->pt_qout, 1e-9*p->pt_tx); + q->prod_tail = p->next; + q->tx++; + return 0; +} + +/* + * simple handler for parameters not supplied + */ +static int +null_run_fn(struct _qs *q, struct _cfg *cfg) +{ + (void)q; + (void)cfg; + return 0; +} + + + +/* + * put packet data into the buffer. + * We read from the mmapped pcap file, construct header, copy + * the captured length of the packet and pad with zeroes. 
+ */ +static void * +pcap_prod(void *_pa) +{ + struct pipe_args *pa = _pa; + struct _qs *q = &pa->q; + struct nm_pcap_file *pf = q->pcap; /* already opened by readpcap */ + uint32_t loops, i, tot_pkts; + + /* data plus the loop record */ + uint64_t need; + uint64_t t_tx, tt, last_ts; /* last timestamp from trace */ + + /* + * For speed we make sure the trace is at least some 1000 packets, + * so we may need to loop the trace more than once (for short traces) + */ + loops = (1 + 10000 / pf->tot_pkt); + tot_pkts = loops * pf->tot_pkt; + need = loops * pf->tot_bytes_rounded + sizeof(struct q_pkt); + q->buf = calloc(1, need); + if (q->buf == NULL) { + D("alloc %ld bytes for queue failed, exiting",(_P64)need); + goto fail; + } + q->prod_head = q->prod_tail = 0; + q->buflen = need; + + pf->cur = pf->data + sizeof(struct pcap_file_header); + pf->err = 0; + + ED("--- start create %lu packets at tail %d", + (u_long)tot_pkts, (int)q->prod_tail); + last_ts = pf->first_ts; /* beginning of the trace */ + + q->qt_qout = 0; /* first packet out of the queue */ + + for (loops = 0, i = 0; i < tot_pkts && !do_abort; i++) { + const char *next_pkt; /* in the pcap buffer */ + uint64_t cur_ts; + + /* read values from the pcap buffer */ + cur_ts = read_next_info(pf, 4) * NS_SCALE + + read_next_info(pf, 4) * pf->resolution; + q->cur_caplen = read_next_info(pf, 4); + q->cur_len = read_next_info(pf, 4); + next_pkt = pf->cur + q->cur_caplen; + + /* prepare fields in q for the generator */ + q->cur_pkt = pf->cur; + /* initial estimate of tx time */ + q->cur_tt = cur_ts - last_ts; + // -pf->first_ts + loops * pf->total_tx_time - last_ts; + + if ((i % pf->tot_pkt) == 0) + ED("insert %5d len %lu cur_tt %.6f", + i, (u_long)q->cur_len, 1e-9*q->cur_tt); + + /* prepare for next iteration */ + pf->cur = next_pkt; + last_ts = cur_ts; + if (next_pkt == pf->lim) { //last pkt + pf->cur = pf->data + sizeof(struct pcap_file_header); + last_ts = pf->first_ts; /* beginning of the trace */ + loops++; + } + + 
q->c_loss.run(q, &q->c_loss); + if (q->cur_drop) + continue; + q->c_bw.run(q, &q->c_bw); + tt = q->cur_tt; + q->qt_qout += tt; +#if 0 + if (drop_after(q)) + continue; +#endif + q->c_delay.run(q, &q->c_delay); /* compute delay */ + t_tx = q->qt_qout + q->cur_delay; + ND(5, "tt %ld qout %ld tx %ld qt_tx %ld", tt, q->qt_qout, t_tx, q->qt_tx); + /* insure no reordering and spacing by transmission time */ + q->qt_tx = (t_tx >= q->qt_tx + tt) ? t_tx : q->qt_tx + tt; + enq(q); + + q->tx++; + ND("ins %d q->prod_tail = %lu", (int)insert, (unsigned long)q->prod_tail); + } + /* loop marker ? */ + ED("done q->prod_tail:%d",(int)q->prod_tail); + q->_tail = q->prod_tail; /* publish */ + + return NULL; +fail: + if (q->buf != NULL) { + free(q->buf); + } + nm_close(pa->pb); + return (NULL); +} + + +/* + * the consumer reads from the queue using head, + * advances it every now and then. + */ +static void * +cons(void *_pa) +{ + struct pipe_args *pa = _pa; + struct _qs *q = &pa->q; + int pending = 0; + uint64_t last_ts = 0; + + /* read the start of times in q->t0 */ + set_tns_now(&q->t0, 0); + /* set the time (cons_now) to clock - q->t0 */ + set_tns_now(&q->cons_now, q->t0); + q->cons_head = q->_head; + q->cons_tail = q->_tail; + while (!do_abort) { /* consumer, infinite */ + struct q_pkt *p = pkt_at(q, q->cons_head); + + __builtin_prefetch (q->buf + p->next); + + if (q->cons_head == q->cons_tail) { //reset record + ND("Transmission restarted"); + /* + * add to q->t0 the time for the last packet + */ + q->t0 += last_ts; + q->cons_head = 0; //restart from beginning of the queue + continue; + } + last_ts = p->pt_tx; + if (ts_cmp(p->pt_tx, q->cons_now) > 0) { + // packet not ready + q->rx_wait++; + /* the ioctl should be conditional */ + ioctl(pa->pb->fd, NIOCTXSYNC, 0); // XXX just in case + pending = 0; + usleep(20); + set_tns_now(&q->cons_now, q->t0); + continue; + } + /* XXX copy is inefficient but simple */ + pending++; + if (nm_inject(pa->pb, (char *)(p + 1), p->pktlen) == 0 || + 
pending > q->burst) { + RD(1, "inject failed len %d now %ld tx %ld h %ld t %ld next %ld", + (int)p->pktlen, (u_long)q->cons_now, (u_long)p->pt_tx, + (u_long)q->_head, (u_long)q->_tail, (u_long)p->next); + ioctl(pa->pb->fd, NIOCTXSYNC, 0); + pending = 0; + continue; + } + q->cons_head = p->next; + /* drain packets from the queue */ + q->rx++; + } + D("exiting on abort"); + return NULL; +} + +/* + * In case of pcap file as input, the program acts in 2 different + * phases. It first fill the queue and then starts the cons() + */ +static void * +nmreplay_main(void *_a) +{ + struct pipe_args *a = _a; + struct _qs *q = &a->q; + const char *cap_fname = q->prod_ifname; + + setaffinity(a->cons_core); + set_tns_now(&q->t0, 0); /* starting reference */ + if (cap_fname == NULL) { + goto fail; + } + q->pcap = readpcap(cap_fname); + if (q->pcap == NULL) { + EEE("unable to read file %s", cap_fname); + goto fail; + } + pcap_prod((void*)a); + destroy_pcap(q->pcap); + q->pcap = NULL; + a->pb = nm_open(q->cons_ifname, NULL, 0, NULL); + if (a->pb == NULL) { + EEE("cannot open netmap on %s", q->cons_ifname); + do_abort = 1; // XXX any better way ? + return NULL; + } + /* continue as cons() */ + WWW("prepare to send packets"); + usleep(1000); + cons((void*)a); + EEE("exiting on abort"); +fail: + if (q->pcap != NULL) { + destroy_pcap(q->pcap); + } + do_abort = 1; + return NULL; +} + + +static void +sigint_h(int sig) +{ + (void)sig; /* UNUSED */ + do_abort = 1; + signal(SIGINT, SIG_DFL); +} + + + +static void +usage(void) +{ + fprintf(stderr, + "usage: nmreplay [-v] [-D delay] [-B {[constant,]bps|ether,bps|real,speedup}] [-L loss]\n" + "\t[-b burst] -i ifa-or-pcap-file -i ifb\n"); + exit(1); +} + + +/*---- configuration handling ---- */ +/* + * support routine: split argument, returns ac and *av. + * av contains two extra entries, a NULL and a pointer + * to the entire string. 
+ */ +static char ** +split_arg(const char *src, int *_ac) +{ + char *my = NULL, **av = NULL, *seps = " \t\r\n,"; + int l, i, ac; /* number of entries */ + + if (!src) + return NULL; + l = strlen(src); + /* in the first pass we count fields, in the second pass + * we allocate the av[] array and a copy of the string + * and fill av[]. av[ac] = NULL, av[ac+1] + */ + for (;;) { + i = ac = 0; + ND("start pass %d: <%s>", av ? 1 : 0, my); + while (i < l) { + /* trim leading separator */ + while (i <l && strchr(seps, src[i])) + i++; + if (i >= l) + break; + ND(" pass %d arg %d: <%s>", av ? 1 : 0, ac, src+i); + if (av) /* in the second pass, set the result */ + av[ac] = my+i; + ac++; + /* skip string */ + while (i <l && !strchr(seps, src[i])) i++; + if (av) + my[i] = '\0'; /* write marker */ + } + if (!av) { /* end of first pass */ + ND("ac is %d", ac); + av = calloc(1, (l+1) + (ac + 2)*sizeof(char *)); + my = (char *)&(av[ac+2]); + strcpy(my, src); + } else { + break; + } + } + for (i = 0; i < ac; i++) { + NED("%d: <%s>", i, av[i]); + } + av[i++] = NULL; + av[i++] = my; + *_ac = ac; + return av; +} + + +/* + * apply a command against a set of functions, + * install a handler in *dst + */ +static int +cmd_apply(const struct _cfg *a, const char *arg, struct _qs *q, struct _cfg *dst) +{ + int ac = 0; + char **av; + int i; + + if (arg == NULL || *arg == '\0') + return 1; /* no argument may be ok */ + if (a == NULL || dst == NULL) { + ED("program error - invalid arguments"); + exit(1); + } + av = split_arg(arg, &ac); + if (av == NULL) + return 1; /* error */ + for (i = 0; a[i].parse; i++) { + struct _cfg x = a[i]; + const char *errmsg = x.optarg; + int ret; + + x.arg = NULL; + x.arg_len = 0; + bzero(&x.d, sizeof(x.d)); + ND("apply %s to %s", av[0], errmsg); + ret = x.parse(q, &x, ac, av); + if (ret == 2) /* not recognised */ + continue; + if (ret == 1) { + ED("invalid arguments: need '%s' have '%s'", + errmsg, arg); + break; + } + x.optarg = arg; + *dst = x; + return 0; + } + 
ED("arguments %s not recognised", arg); + free(av); + return 1; +} + +static struct _cfg delay_cfg[]; +static struct _cfg bw_cfg[]; +static struct _cfg loss_cfg[]; + +static uint64_t parse_bw(const char *arg); + +/* + * prodcons [options] + * accept separate sets of arguments for the two directions + * + */ + +static void +add_to(const char ** v, int l, const char *arg, const char *msg) +{ + for (; l > 0 && *v != NULL ; l--, v++); + if (l == 0) { + ED("%s %s", msg, arg); + exit(1); + } + *v = arg; +} + +int +main(int argc, char **argv) +{ + int ch, i, err=0; + +#define N_OPTS 1 + struct pipe_args bp[N_OPTS]; + const char *d[N_OPTS], *b[N_OPTS], *l[N_OPTS], *q[N_OPTS], *ifname[N_OPTS], *m[N_OPTS]; + const char *pcap_file[N_OPTS]; + int cores[4] = { 2, 8, 4, 10 }; /* default values */ + + bzero(&bp, sizeof(bp)); /* all data initially go here */ + bzero(d, sizeof(d)); + bzero(b, sizeof(b)); + bzero(l, sizeof(l)); + bzero(q, sizeof(q)); + bzero(m, sizeof(m)); + bzero(ifname, sizeof(ifname)); + bzero(pcap_file, sizeof(pcap_file)); + + + /* set default values */ + for (i = 0; i < N_OPTS; i++) { + struct _qs *q = &bp[i].q; + + q->burst = 128; + q->c_delay.optarg = "0"; + q->c_delay.run = null_run_fn; + q->c_loss.optarg = "0"; + q->c_loss.run = null_run_fn; + q->c_bw.optarg = "0"; + q->c_bw.run = null_run_fn; + } + + // Options: + // B bandwidth in bps + // D delay in seconds + // L loss probability + // f pcap file + // i interface name + // w wait link + // b batch size + // v verbose + // C cpu placement + + while ( (ch = getopt(argc, argv, "B:C:D:L:b:f:i:vw:")) != -1) { + switch (ch) { + default: + D("bad option %c %s", ch, optarg); + usage(); + break; + + case 'C': /* CPU placement, up to 4 arguments */ + { + int ac = 0; + char **av = split_arg(optarg, &ac); + if (ac == 1) { /* sequential after the first */ + cores[0] = atoi(av[0]); + cores[1] = cores[0] + 1; + cores[2] = cores[1] + 1; + cores[3] = cores[2] + 1; + } else if (ac == 2) { /* two sequential pairs */ + 
cores[0] = atoi(av[0]); + cores[1] = cores[0] + 1; + cores[2] = atoi(av[1]); + cores[3] = cores[2] + 1; + } else if (ac == 4) { /* four values */ + cores[0] = atoi(av[0]); + cores[1] = atoi(av[1]); + cores[2] = atoi(av[2]); + cores[3] = atoi(av[3]); + } else { + ED(" -C accepts 1, 2 or 4 comma separated arguments"); + usage(); + } + if (av) + free(av); + } + break; + + case 'B': /* bandwidth in bps */ + add_to(b, N_OPTS, optarg, "-B too many times"); + break; + + case 'D': /* delay in seconds (float) */ + add_to(d, N_OPTS, optarg, "-D too many times"); + break; + + case 'L': /* loss probability */ + add_to(l, N_OPTS, optarg, "-L too many times"); + break; + + case 'b': /* burst */ + bp[0].q.burst = atoi(optarg); + break; + + case 'f': /* pcap_file */ + add_to(pcap_file, N_OPTS, optarg, "-f too many times"); + break; + case 'i': /* interface */ + add_to(ifname, N_OPTS, optarg, "-i too many times"); + break; + case 'v': + verbose++; + break; + case 'w': + bp[0].wait_link = atoi(optarg); + break; + } + + } + + argc -= optind; + argv += optind; + + /* + * consistency checks for common arguments + * if pcap file has been provided we need just one interface, two otherwise + */ + if (!pcap_file[0]) { + ED("missing pcap file"); + usage(); + } + if (!ifname[0]) { + ED("missing interface"); + usage(); + } + if (bp[0].q.burst < 1 || bp[0].q.burst > 8192) { + WWW("invalid burst %d, set to 1024", bp[0].q.burst); + bp[0].q.burst = 1024; // XXX 128 is probably better + } + if (bp[0].wait_link > 100) { + ED("invalid wait_link %d, set to 4", bp[0].wait_link); + bp[0].wait_link = 4; + } + + bp[0].q.prod_ifname = pcap_file[0]; + bp[0].q.cons_ifname = ifname[0]; + + /* assign cores. 
prod and cons work better if on the same HT */ + bp[0].cons_core = cores[0]; + bp[0].prod_core = cores[1]; + ED("running on cores %d %d %d %d", cores[0], cores[1], cores[2], cores[3]); + + /* apply commands */ + for (i = 0; i < N_OPTS; i++) { /* once per queue */ + struct _qs *q = &bp[i].q; + err += cmd_apply(delay_cfg, d[i], q, &q->c_delay); + err += cmd_apply(bw_cfg, b[i], q, &q->c_bw); + err += cmd_apply(loss_cfg, l[i], q, &q->c_loss); + } + + pthread_create(&bp[0].cons_tid, NULL, nmreplay_main, (void*)&bp[0]); + signal(SIGINT, sigint_h); + sleep(1); + while (!do_abort) { + struct _qs olda = bp[0].q; + struct _qs *q0 = &bp[0].q; + + sleep(1); + ED("%ld -> %ld maxq %d round %ld", + (_P64)(q0->rx - olda.rx), (_P64)(q0->tx - olda.tx), + q0->rx_qmax, (_P64)q0->prod_max_gap + ); + ED("plr nominal %le actual %le", + (double)(q0->c_loss.d[0])/(1<<24), + q0->c_loss.d[1] == 0 ? 0 : + (double)(q0->c_loss.d[2])/q0->c_loss.d[1]); + bp[0].q.rx_qmax = (bp[0].q.rx_qmax * 7)/8; // ewma + bp[0].q.prod_max_gap = (bp[0].q.prod_max_gap * 7)/8; // ewma + } + D("exiting on abort"); + sleep(1); + + return (0); +} + +/* conversion factor for numbers. + * Each entry has a set of characters and conversion factor, + * the first entry should have an empty string and default factor, + * the final entry has s = NULL. 
+ */ +struct _sm { /* string and multiplier */ + char *s; + double m; +}; + +/* + * parse a generic value + */ +static double +parse_gen(const char *arg, const struct _sm *conv, int *err) +{ + double d; + char *ep; + int dummy; + + if (err == NULL) + err = &dummy; + *err = 0; + if (arg == NULL) + goto error; + d = strtod(arg, &ep); + if (ep == arg) { /* no value */ + ED("bad argument %s", arg); + goto error; + } + /* special case, no conversion */ + if (conv == NULL && *ep == '\0') + goto done; + ND("checking %s [%s]", arg, ep); + for (;conv->s; conv++) { + if (strchr(conv->s, *ep)) + goto done; + } +error: + *err = 1; /* unrecognised */ + return 0; + +done: + if (conv) { + ND("scale is %s %lf", conv->s, conv->m); + d *= conv->m; /* apply default conversion */ + } + ND("returning %lf", d); + return d; +} + +#define U_PARSE_ERR ~(0ULL) + +/* returns a value in nanoseconds */ +static uint64_t +parse_time(const char *arg) +{ + struct _sm a[] = { + {"", 1000000000 /* seconds */}, + {"n", 1 /* nanoseconds */}, {"u", 1000 /* microseconds */}, + {"m", 1000000 /* milliseconds */}, {"s", 1000000000 /* seconds */}, + {NULL, 0 /* seconds */} + }; + int err; + uint64_t ret = (uint64_t)parse_gen(arg, a, &err); + return err ? U_PARSE_ERR : ret; +} + + +/* + * parse a bandwidth, returns value in bps or U_PARSE_ERR if error. + */ +static uint64_t +parse_bw(const char *arg) +{ + struct _sm a[] = { + {"", 1}, {"kK", 1000}, {"mM", 1000000}, {"gG", 1000000000}, {NULL, 0} + }; + int err; + uint64_t ret = (uint64_t)parse_gen(arg, a, &err); + return err ? U_PARSE_ERR : ret; +} + + +/* + * For some function we need random bits. + * This is a wrapper to whatever function you want that returns + * 24 useful random bits. + */ + +#include <math.h> /* log, exp etc. 
*/ +static inline uint64_t +my_random24(void) /* 24 useful bits */ +{ + return random() & ((1<<24) - 1); +} + + +/*-------------- user-configuration -----------------*/ + +#if 0 /* start of comment block */ + +Here we place the functions to implement the various features +of the system. For each feature one should define a struct _cfg +(see at the beginning for definition) that refers a *_parse() function +to extract values from the command line, and a *_run() function +that is invoked on each packet to implement the desired function. + +Examples of the two functions are below. In general: + +- the *_parse() function takes argc/argv[], matches the function + name in argv[0], extracts the operating parameters, allocates memory + if needed, and stores them in the struct _cfg. + Return value is 2 if argv[0] is not recosnised, 1 if there is an + error in the arguments, 0 if all ok. + + On the command line, argv[] is a single, comma separated argument + that follow the specific option eg -D constant,20ms + + struct _cfg has some preallocated space (e.g an array of uint64_t) so simple + function can use that without having to allocate memory. + +- the *_run() function takes struct _q *q and struct _cfg *cfg as arguments. + *q contains all the informatio that may be possibly needed, including + those on the packet currently under processing. + The basic values are the following: + + char * cur_pkt points to the current packet (linear buffer) + uint32_t cur_len; length of the current packet + the functions are not supposed to modify these values + + int cur_drop; true if current packet must be dropped. + Must be set to non-zero by the loss emulation function + + uint64_t cur_delay; delay in nanoseconds for the current packet + Must be set by the delay emulation function + + More sophisticated functions may need to access other fields in *q, + see the structure description for that. + +When implementing a new function for a feature (e.g. for delay, +bandwidth, loss...) 
the struct _cfg should be added to the array +that contains all possible options. + + --- Specific notes --- + +DELAY emulation -D option_arguments + + If the option is not supplied, the system applies 0 extra delay + + The resolution for times is 1ns, the precision is load dependent and + generally in the order of 20-50us. + Times are in nanoseconds, can be followed by a character specifying + a different unit e.g. + + n nanoseconds + u microseconds + m milliseconds + s seconds + + Currently implemented options: + + constant,t constant delay equal to t + + uniform,tmin,tmax uniform delay between tmin and tmax + + exp,tavg,tmin exponential distribution with average tavg + and minimum tmin (corresponds to an exponential + distribution with argument 1/(tavg-tmin) ) + + +LOSS emulation -L option_arguments + + Loss is expressed as packet or bit error rate, which is an absolute + number between 0 and 1 (typically small). + + Currently implemented options + + plr,p uniform packet loss rate p, independent + of packet size + + burst,p,lmin,lmax burst loss with burst probability p and + burst length uniformly distributed between + lmin and lmax + + ber,p uniformly distributed bit error rate p, + so actual loss prob. depends on size. + +BANDWIDTH emulation -B option_arguments + + Bandwidths are expressed in bits per second, can be followed by a + character specifying a different unit e.g. 
+ + b/B bits per second + k/K kbits/s (10^3 bits/s) + m/M mbits/s (10^6 bits/s) + g/G gbits/s (10^9 bits/s) + + Currently implemented options + + const,b constant bw, excluding mac framing + ether,b constant bw, including ethernet framing + (20 bytes framing + 4 bytes crc) + real,[scale] use real time, optionally with a scaling factor + +#endif /* end of comment block */ + +/* + * Configuration options for delay + */ + +/* constant delay, also accepts just a number */ +static int +const_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t delay; + + (void)q; + if (strncmp(av[0], "const", 5) != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + delay = parse_time(av[ac - 1]); + if (delay == U_PARSE_ERR) + return 1; /* error */ + dst->d[0] = delay; + return 0; /* success */ +} + +/* runtime function, store the delay into q->cur_delay */ +static int +const_delay_run(struct _qs *q, struct _cfg *arg) +{ + q->cur_delay = arg->d[0]; /* the delay */ + return 0; +} + +static int +uniform_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t dmin, dmax; + + (void)q; + if (strcmp(av[0], "uniform") != 0) + return 2; /* not recognised */ + if (ac != 3) + return 1; /* error */ + dmin = parse_time(av[1]); + dmax = parse_time(av[2]); + if (dmin == U_PARSE_ERR || dmax == U_PARSE_ERR || dmin > dmax) + return 1; + D("dmin %ld dmax %ld", (_P64)dmin, (_P64)dmax); + dst->d[0] = dmin; + dst->d[1] = dmax; + dst->d[2] = dmax - dmin; + return 0; +} + +static int +uniform_delay_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t x = my_random24(); + q->cur_delay = arg->d[0] + ((arg->d[2] * x) >> 24); +#if 0 /* COMPUTE_STATS */ +#endif /* COMPUTE_STATS */ + return 0; +} + +/* + * exponential delay: Prob(delay = x) = exp(-x/d_av) + * gives a delay between 0 and infinity with average d_av + * The cumulative function is 1 - d_av exp(-x/d_av) + * + * The inverse function generates a uniform random number p in 0..1 
+ * and generates delay = (d_av-d_min) * -ln(1-p) + d_min + * + * To speed up behaviour at runtime we tabulate the values + */ + +static int +exp_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ +#define PTS_D_EXP 512 + uint64_t i, d_av, d_min, *t; /*table of values */ + + (void)q; + if (strcmp(av[0], "exp") != 0) + return 2; /* not recognised */ + if (ac != 3) + return 1; /* error */ + d_av = parse_time(av[1]); + d_min = parse_time(av[2]); + if (d_av == U_PARSE_ERR || d_min == U_PARSE_ERR || d_av < d_min) + return 1; /* error */ + d_av -= d_min; + dst->arg_len = PTS_D_EXP * sizeof(uint64_t); + dst->arg = calloc(1, dst->arg_len); + if (dst->arg == NULL) + return 1; /* no memory */ + t = (uint64_t *)dst->arg; + /* tabulate -ln(1-n)*delay for n in 0..1 */ + for (i = 0; i < PTS_D_EXP; i++) { + double d = -log2 ((double)(PTS_D_EXP - i) / PTS_D_EXP) * d_av + d_min; + t[i] = (uint64_t)d; + ND(5, "%ld: %le", i, d); + } + return 0; +} + +static int +exp_delay_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t *t = (uint64_t *)arg->arg; + q->cur_delay = t[my_random24() & (PTS_D_EXP - 1)]; + RD(5, "delay %lu", (_P64)q->cur_delay); + return 0; +} + + +/* unused arguments in configuration */ +#define _CFG_END NULL, 0, {0}, {0} + +static struct _cfg delay_cfg[] = { + { const_delay_parse, const_delay_run, + "constant,delay", _CFG_END }, + { uniform_delay_parse, uniform_delay_run, + "uniform,dmin,dmax # dmin <= dmax", _CFG_END }, + { exp_delay_parse, exp_delay_run, + "exp,dmin,davg # dmin <= davg", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; + +/* standard bandwidth, also accepts just a number */ +static int +const_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t bw; + + (void)q; + if (strncmp(av[0], "const", 5) != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + bw = parse_bw(av[ac - 1]); + if (bw == U_PARSE_ERR) { + return (ac == 2) ? 
1 /* error */ : 2 /* unrecognised */; + } + dst->d[0] = bw; + return 0; /* success */ +} + + +/* runtime function, store the delay into q->cur_delay */ +static int +const_bw_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t bps = arg->d[0]; + q->cur_tt = bps ? 8ULL* TIME_UNITS * q->cur_len / bps : 0 ; + return 0; +} + +/* ethernet bandwidth, add 672 bits per packet */ +static int +ether_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + uint64_t bw; + + (void)q; + if (strcmp(av[0], "ether") != 0) + return 2; /* unrecognised */ + if (ac != 2) + return 1; /* error */ + bw = parse_bw(av[ac - 1]); + if (bw == U_PARSE_ERR) + return 1; /* error */ + dst->d[0] = bw; + return 0; /* success */ +} + + +/* runtime function, add 20 bytes (framing) + 4 bytes (crc) */ +static int +ether_bw_run(struct _qs *q, struct _cfg *arg) +{ + uint64_t bps = arg->d[0]; + q->cur_tt = bps ? 8ULL * TIME_UNITS * (q->cur_len + 24) / bps : 0 ; + return 0; +} + +/* real bandwidth, plus scaling factor */ +static int +real_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double scale; + + (void)q; + if (strcmp(av[0], "real") != 0) + return 2; /* unrecognised */ + if (ac > 2) { /* second argument is optional */ + return 1; /* error */ + } else if (ac == 1) { + scale = 1; + } else { + int err = 0; + scale = parse_gen(av[ac-1], NULL, &err); + if (err || scale <= 0 || scale > 1000) + return 1; + } + ED("real -> scale is %.6f", scale); + dst->f[0] = scale; + return 0; /* success */ +} + +static int +real_bw_run(struct _qs *q, struct _cfg *arg) +{ + q->cur_tt /= arg->f[0]; + return 0; +} + +static struct _cfg bw_cfg[] = { + { const_bw_parse, const_bw_run, + "constant,bps", _CFG_END }, + { ether_bw_parse, ether_bw_run, + "ether,bps", _CFG_END }, + { real_bw_parse, real_bw_run, + "real,scale", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; + +/* + * loss patterns + */ +static int +const_plr_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double plr; + int 
err; + + (void)q; + if (strcmp(av[0], "plr") != 0 && ac > 1) + return 2; /* unrecognised */ + if (ac > 2) + return 1; /* error */ + // XXX to be completed + plr = parse_gen(av[ac-1], NULL, &err); + if (err || plr < 0 || plr > 1) + return 1; + dst->d[0] = plr * (1<<24); /* scale is 16m */ + if (plr != 0 && dst->d[0] == 0) + ED("WWW warning, rounding %le down to 0", plr); + return 0; /* success */ +} + +static int +const_plr_run(struct _qs *q, struct _cfg *arg) +{ + (void)arg; + uint64_t r = my_random24(); + q->cur_drop = r < arg->d[0]; +#if 1 /* keep stats */ + arg->d[1]++; + arg->d[2] += q->cur_drop; +#endif + return 0; +} + + +/* + * For BER the loss is 1- (1-ber)**bit_len + * The linear approximation is only good for small values, so we + * tabulate (1-ber)**len for various sizes in bytes + */ +static int +const_ber_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) +{ + double ber, ber8, cur; + int i, err; + uint32_t *plr; + const uint32_t mask = (1<<24) - 1; + + (void)q; + if (strcmp(av[0], "ber") != 0) + return 2; /* unrecognised */ + if (ac != 2) + return 1; /* error */ + ber = parse_gen(av[ac-1], NULL, &err); + if (err || ber < 0 || ber > 1) + return 1; + dst->arg_len = MAX_PKT * sizeof(uint32_t); + plr = calloc(1, dst->arg_len); + if (plr == NULL) + return 1; /* no memory */ + dst->arg = plr; + ber8 = 1 - ber; + ber8 *= ber8; /* **2 */ + ber8 *= ber8; /* **4 */ + ber8 *= ber8; /* **8 */ + cur = 1; + for (i=0; i < MAX_PKT; i++, cur *= ber8) { + plr[i] = (mask + 1)*(1 - cur); + if (plr[i] > mask) + plr[i] = mask; +#if 0 + if (i>= 60) // && plr[i] < mask/2) + RD(50,"%4d: %le %ld", i, 1.0 - cur, (_P64)plr[i]); +#endif + } + dst->d[0] = ber * (mask + 1); + return 0; /* success */ +} + +static int +const_ber_run(struct _qs *q, struct _cfg *arg) +{ + int l = q->cur_len; + uint64_t r = my_random24(); + uint32_t *plr = arg->arg; + + if (l >= MAX_PKT) { + RD(5, "pkt len %d too large, trim to %d", l, MAX_PKT-1); + l = MAX_PKT-1; + } + q->cur_drop = r < plr[l]; 
+#if 1 /* keep stats */ + arg->d[1] += l * 8; + arg->d[2] += q->cur_drop; +#endif + return 0; +} + +static struct _cfg loss_cfg[] = { + { const_plr_parse, const_plr_run, + "plr,prob # 0 <= prob <= 1", _CFG_END }, + { const_ber_parse, const_ber_run, + "ber,prob # 0 <= prob <= 1", _CFG_END }, + { NULL, NULL, NULL, _CFG_END } +}; diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 6d9bee6de634..168e022cfba9 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,8 +37,6 @@ * */ -// #define TRASH_VHOST_HDR - #define _GNU_SOURCE /* for CPU_SET() */ #include <stdio.h> #define NETMAP_WITH_LIBS @@ -49,12 +47,16 @@ #include <unistd.h> // sysconf() #include <sys/poll.h> #include <arpa/inet.h> /* ntohs */ +#ifndef _WIN32 #include <sys/sysctl.h> /* sysctl */ +#endif #include <ifaddrs.h> /* getifaddrs */ #include <net/ethernet.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <assert.h> +#include <math.h> #include <pthread.h> @@ -62,6 +64,69 @@ #include <pcap/pcap.h> #endif +#include "ctrs.h" + +#ifdef _WIN32 +#define cpuset_t DWORD_PTR //uint64_t +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + *p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) +#define TAP_CLONEDEV "/dev/tap" +#define AF_LINK 18 //defined in winsocks.h +#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME +#include <net/if_dl.h> + +/* + * Convert an ASCII representation of an ethernet address to + * binary form. 
+ */ +struct ether_addr * +ether_aton(const char *a) +{ + int i; + static struct ether_addr o; + unsigned int o0, o1, o2, o3, o4, o5; + + i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5); + + if (i != 6) + return (NULL); + + o.octet[0]=o0; + o.octet[1]=o1; + o.octet[2]=o2; + o.octet[3]=o3; + o.octet[4]=o4; + o.octet[5]=o5; + + return ((struct ether_addr *)&o); +} + +/* + * Convert a binary representation of an ethernet address to + * an ASCII string. + */ +char * +ether_ntoa(const struct ether_addr *n) +{ + int i; + static char a[18]; + + i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x", + n->octet[0], n->octet[1], n->octet[2], + n->octet[3], n->octet[4], n->octet[5]); + return (i < 17 ? NULL : (char *)&a); +} +#endif /* _WIN32 */ + #ifdef linux #define cpuset_t cpu_set_t @@ -169,10 +234,12 @@ struct glob_arg { int pkt_size; int burst; int forever; - int npackets; /* total packets to send */ + uint64_t npackets; /* total packets to send */ int frags; /* fragments per packet */ int nthreads; - int cpus; + int cpus; /* cpus used for running */ + int system_cpus; /* cpus on the system */ + int options; /* testing */ #define OPT_PREFETCH 1 #define OPT_ACCESS 2 @@ -181,10 +248,10 @@ struct glob_arg { #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ -#define OPT_MONITOR_TX 128 -#define OPT_MONITOR_RX 256 +#define OPT_RUBBISH 256 /* send wathever the buffers contain */ #define OPT_RANDOM_SRC 512 #define OPT_RANDOM_DST 1024 +#define OPT_PPS_STATS 2048 int dev_type; #ifndef NO_PCAP pcap_t *p; @@ -198,13 +265,18 @@ struct glob_arg { struct nm_desc *nmd; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); + int td_type; void *mmap_addr; char ifname[MAX_IFNAMELEN]; char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ int extra_bufs; /* goes in nr_arg3 */ + int extra_pipes; /* goes in nr_arg1 */ char 
*packet_file; /* -P option */ +#define STATS_WIN 15 + int win_idx; + int64_t win[STATS_WIN]; }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -220,7 +292,11 @@ struct targ { int cancel; int fd; struct nm_desc *nmd; - volatile uint64_t count; + /* these ought to be volatile, but they are + * only sampled and errors should not accumulate + */ + struct my_ctrs ctr; + struct timespec tic, toc; int me; pthread_t thread; @@ -327,11 +403,10 @@ sigint_h(int sig) int i; (void)sig; /* UNUSED */ - D("received control-C on thread %p", pthread_self()); + D("received control-C on thread %p", (void *)pthread_self()); for (i = 0; i < global_nthreads; i++) { targs[i].cancel = 1; } - signal(SIGINT, SIG_DFL); } /* sysctl wrapper to return the number of active CPUs */ @@ -345,6 +420,12 @@ system_ncpus(void) sysctl(mib, 2, &ncpus, &len, NULL, 0); #elif defined(linux) ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_WIN32) + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + ncpus = sysinfo.dwNumberOfProcessors; + } #else /* others */ ncpus = 1; #endif /* others */ @@ -518,10 +599,11 @@ wrapsum(u_int32_t sum) * Look for consecutive ascii representations of the size of the packet. */ static void -dump_payload(char *p, int len, struct netmap_ring *ring, int cur) +dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur) { char buf[128]; int i, j, i0; + const unsigned char *p = (const unsigned char *)_p; /* get the length in ASCII of the length of the packet. 
*/ @@ -629,6 +711,7 @@ initialize_packet(struct targ *targ) indirect_payload : default_payload; int i, l0 = strlen(payload); +#ifndef NO_PCAP char errbuf[PCAP_ERRBUF_SIZE]; pcap_t *file; struct pcap_pkthdr *header; @@ -650,6 +733,7 @@ initialize_packet(struct targ *targ) pcap_close(file); return; } +#endif /* create a nice NUL-terminated string */ for (i = 0; i < paylen; i += l0) { @@ -695,35 +779,49 @@ initialize_packet(struct targ *targ) eh->ether_type = htons(ETHERTYPE_IP); bzero(&pkt->vh, sizeof(pkt->vh)); -#ifdef TRASH_VHOST_HDR - /* set bogus content */ - pkt->vh.fields[0] = 0xff; - pkt->vh.fields[1] = 0xff; - pkt->vh.fields[2] = 0xff; - pkt->vh.fields[3] = 0xff; - pkt->vh.fields[4] = 0xff; - pkt->vh.fields[5] = 0xff; -#endif /* TRASH_VHOST_HDR */ // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } static void -set_vnet_hdr_len(struct targ *t) +get_vnet_hdr_len(struct glob_arg *g) { - int err, l = t->g->virt_header; + struct nmreq req; + int err; + + memset(&req, 0, sizeof(req)); + bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); + req.nr_version = NETMAP_API; + req.nr_cmd = NETMAP_VNET_HDR_GET; + err = ioctl(g->main_fd, NIOCREGIF, &req); + if (err) { + D("Unable to get virtio-net header length"); + return; + } + + g->virt_header = req.nr_arg1; + if (g->virt_header) { + D("Port requires virtio-net header, length = %d", + g->virt_header); + } +} + +static void +set_vnet_hdr_len(struct glob_arg *g) +{ + int err, l = g->virt_header; struct nmreq req; if (l == 0) return; memset(&req, 0, sizeof(req)); - bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); + bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); req.nr_version = NETMAP_API; req.nr_cmd = NETMAP_BDG_VNET_HDR; req.nr_arg1 = l; - err = ioctl(t->fd, NIOCREGIF, &req); + err = ioctl(g->main_fd, NIOCREGIF, &req); if (err) { - D("Unable to set vnet header length %d", l); + D("Unable to set virtio-net header length %d", l); } } @@ -763,12 +861,15 @@ send_packets(struct 
netmap_ring *ring, struct pkt *pkt, void *frame, for (fcnt = nfrags, sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); + int buf_changed = slot->flags & NS_BUF_CHANGED; slot->flags = 0; - if (options & OPT_INDIRECT) { + if (options & OPT_RUBBISH) { + /* do nothing */ + } else if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - slot->ptr = (uint64_t)frame; - } else if (options & OPT_COPY) { + slot->ptr = (uint64_t)((uintptr_t)frame); + } else if ((options & OPT_COPY) || buf_changed) { nm_pkt_copy(frame, p, size); if (fcnt == nfrags) update_addresses(pkt, g); @@ -798,6 +899,21 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, } /* + * Index of the highest bit set + */ +uint32_t +msb64(uint64_t x) +{ + uint64_t m = 1ULL << 63; + int i; + + for (i = 63; i >= 0; i--, m >>=1) + if (m & x) + return i; + return 0; +} + +/* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) 
@@ -810,25 +926,28 @@ pinger_body(void *data) struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; - int i, rx = 0, n = targ->g->npackets; + int i, rx = 0; void *frame; int size; - uint32_t sent = 0; struct timespec ts, now, last_print; - uint32_t count = 0, min = 1000000000, av = 0; + uint64_t sent = 0, n = targ->g->npackets; + uint64_t count = 0, t_cur, t_min = ~0, av = 0; + uint64_t buckets[64]; /* bins for delays, ns */ frame = &targ->pkt; frame += sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; + if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } + bzero(&buckets, sizeof(buckets)); clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); now = last_print; - while (n == 0 || (int)sent < n) { + while (!targ->cancel && (n == 0 || sent < n)) { struct netmap_ring *ring = NETMAP_TXRING(nifp, 0); struct netmap_slot *slot; char *p; @@ -864,6 +983,8 @@ pinger_body(void *data) while (!nm_ring_empty(ring)) { uint32_t seq; struct tstamp *tp; + int pos; + slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); @@ -878,12 +999,16 @@ pinger_body(void *data) ts.tv_nsec += 1000000000; ts.tv_sec--; } - if (1) D("seq %d/%d delta %d.%09d", seq, sent, + if (0) D("seq %d/%lu delta %d.%09d", seq, sent, (int)ts.tv_sec, (int)ts.tv_nsec); - if (ts.tv_nsec < (int)min) - min = ts.tv_nsec; + t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec; + if (t_cur < t_min) + t_min = t_cur; count ++; - av += ts.tv_nsec; + av += t_cur; + pos = msb64(t_cur); + buckets[pos]++; + /* now store it in a bucket */ ring->head = ring->cur = nm_ring_next(ring, ring->cur); rx++; } @@ -897,14 +1022,32 @@ pinger_body(void *data) ts.tv_sec--; } if (ts.tv_sec >= 1) { - D("count %d min %d av %d", - count, min, av/count); + D("count %d RTT: min %d av %d ns", + (int)count, (int)t_min, (int)(av/count)); + int k, j, kmin; + char buf[512]; + + for (kmin = 0; 
kmin < 64; kmin ++) + if (buckets[kmin]) + break; + for (k = 63; k >= kmin; k--) + if (buckets[k]) + break; + buf[0] = '\0'; + for (j = kmin; j <= k; j++) + sprintf(buf, "%s %5d", buf, (int)buckets[j]); + D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf); + bzero(&buckets, sizeof(buckets)); count = 0; av = 0; - min = 100000000; + t_min = ~0; last_print = now; } } + + /* reset the ``used`` flag. */ + targ->used = 0; + return NULL; } @@ -919,14 +1062,15 @@ ponger_body(void *data) struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; - int i, rx = 0, sent = 0, n = targ->g->npackets; + int i, rx = 0; + uint64_t sent = 0, n = targ->g->npackets; if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); return NULL; } - D("understood ponger %d but don't know how to do it", n); - while (n == 0 || sent < n) { + D("understood ponger %lu but don't know how to do it", n); + while (!targ->cancel && (n == 0 || sent < n)) { uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT @@ -975,69 +1119,17 @@ ponger_body(void *data) } } txring->head = txring->cur = txcur; - targ->count = sent; + targ->ctr.pkts = sent; #ifdef BUSYWAIT ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } - return NULL; -} - -static __inline int -timespec_ge(const struct timespec *a, const struct timespec *b) -{ - - if (a->tv_sec > b->tv_sec) - return (1); - if (a->tv_sec < b->tv_sec) - return (0); - if (a->tv_nsec >= b->tv_nsec) - return (1); - return (0); -} - -static __inline struct timespec -timeval2spec(const struct timeval *a) -{ - struct timespec ts = { - .tv_sec = a->tv_sec, - .tv_nsec = a->tv_usec * 1000 - }; - return ts; -} - -static __inline struct timeval -timespec2val(const struct timespec *a) -{ - struct timeval tv = { - .tv_sec = a->tv_sec, - .tv_usec = a->tv_nsec / 1000 - }; - return tv; -} + /* reset the ``used`` flag. 
*/ + targ->used = 0; -static __inline struct timespec -timespec_add(struct timespec a, struct timespec b) -{ - struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; - if (ret.tv_nsec >= 1000000000) { - ret.tv_sec++; - ret.tv_nsec -= 1000000000; - } - return ret; -} - -static __inline struct timespec -timespec_sub(struct timespec a, struct timespec b) -{ - struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; - if (ret.tv_nsec < 0) { - ret.tv_sec--; - ret.tv_nsec += 1000000000; - } - return ret; + return NULL; } @@ -1065,9 +1157,11 @@ sender_body(void *data) struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_if *nifp; - struct netmap_ring *txring; - int i, n = targ->g->npackets / targ->g->nthreads; - int64_t sent = 0; + struct netmap_ring *txring = NULL; + int i; + uint64_t n = targ->g->npackets / targ->g->nthreads; + uint64_t sent = 0; + uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; @@ -1104,7 +1198,9 @@ sender_body(void *data) sent++; update_addresses(pkt, targ->g); if (i > 10000) { - targ->count = sent; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = sent; i = 0; } } @@ -1117,7 +1213,9 @@ sender_body(void *data) sent++; update_addresses(pkt, targ->g); if (i > 10000) { - targ->count = sent; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = sent; i = 0; } } @@ -1126,7 +1224,7 @@ sender_body(void *data) int tosend = 0; int frags = targ->g->frags; - nifp = targ->nmd->nifp; + nifp = targ->nmd->nifp; while (!targ->cancel && (n == 0 || sent < n)) { if (rate_limit && tosend <= 0) { @@ -1138,6 +1236,13 @@ sender_body(void *data) /* * wait for available room in the send queue(s) */ +#ifdef BUSYWAIT + if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto 
quit; + } +#else /* !BUSYWAIT */ if (poll(&pfd, 1, 2000) <= 0) { if (targ->cancel) break; @@ -1146,9 +1251,11 @@ sender_body(void *data) // goto quit; } if (pfd.revents & POLLERR) { - D("poll error"); + D("poll error on %d ring %d-%d", pfd.fd, + targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } +#endif /* !BUSYWAIT */ /* * scan our queues and send on those with room */ @@ -1157,7 +1264,8 @@ sender_body(void *data) options &= ~OPT_COPY; } for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { - int m, limit = rate_limit ? tosend : targ->g->burst; + int m; + uint64_t limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); @@ -1171,7 +1279,11 @@ sender_body(void *data) ND("limit %d tail %d frags %d m %d", limit, txring->tail, frags, m); sent += m; - targ->count = sent; + if (m > 0) //XXX-ste: can m be 0? + event++; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = event; if (rate_limit) { tosend -= m; if (tosend <= 0) @@ -1182,13 +1294,13 @@ sender_body(void *data) /* flush any remaining packets */ D("flush tail %d head %d on thread %p", txring->tail, txring->head, - pthread_self()); + (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); - while (nm_tx_pending(txring)) { + while (!targ->cancel && nm_tx_pending(txring)) { RD(5, "pending tx tail %d head %d on ring %d", txring->tail, txring->head, i); ioctl(pfd.fd, NIOCTXSYNC, NULL); @@ -1199,8 +1311,9 @@ sender_body(void *data) clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; - targ->count = sent; - + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = event; quit: /* reset the ``used`` flag. 
*/ targ->used = 0; @@ -1214,17 +1327,22 @@ static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) { - int *count = (int *)user; - (void)h; /* UNUSED */ + struct my_ctrs *ctr = (struct my_ctrs *)user; (void)bytes; /* UNUSED */ - (*count)++; + ctr->bytes += h->len; + ctr->pkts++; } #endif /* !NO_PCAP */ + static int -receive_packets(struct netmap_ring *ring, u_int limit, int dump) +receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes) { u_int cur, rx, n; + uint64_t b = 0; + + if (bytes == NULL) + bytes = &b; cur = ring->cur; n = nm_ring_space(ring); @@ -1234,6 +1352,7 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump) struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); + *bytes += slot->len; if (dump) dump_payload(p, slot->len, ring, cur); @@ -1252,7 +1371,10 @@ receiver_body(void *data) struct netmap_if *nifp; struct netmap_ring *rxring; int i; - uint64_t received = 0; + struct my_ctrs cur; + + cur.pkts = cur.bytes = cur.events = cur.min_space = 0; + cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler if (setaffinity(targ->thread, targ->affinity)) goto quit; @@ -1273,24 +1395,36 @@ receiver_body(void *data) while (!targ->cancel) { char buf[MAX_BODYSIZE]; /* XXX should we poll ? */ - if (read(targ->g->main_fd, buf, sizeof(buf)) > 0) - targ->count++; + i = read(targ->g->main_fd, buf, sizeof(buf)); + if (i > 0) { + targ->ctr.pkts++; + targ->ctr.bytes += i; + targ->ctr.events++; + } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { while (!targ->cancel) { /* XXX should we poll ? 
*/ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, - (u_char *)&targ->count); + (u_char *)&targ->ctr); + targ->ctr.events++; } #endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; - nifp = targ->nmd->nifp; + nifp = targ->nmd->nifp; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ +#ifdef BUSYWAIT + if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto quit; + } +#else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ @@ -1301,26 +1435,39 @@ receiver_body(void *data) D("poll err"); goto quit; } - +#endif /* !BUSYWAIT */ + uint64_t cur_space = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); + /* compute free space in the ring */ + m = rxring->head + rxring->num_slots - rxring->tail; + if (m >= (int) rxring->num_slots) + m -= rxring->num_slots; + cur_space += m; if (nm_ring_empty(rxring)) continue; - m = receive_packets(rxring, targ->g->burst, dump); - received += m; + m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes); + cur.pkts += m; + if (m > 0) //XXX-ste: can m be 0? + cur.events++; } - targ->count = received; + cur.min_space = targ->ctr.min_space; + if (cur_space < cur.min_space) + cur.min_space = cur_space; + targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); +#if !defined(BUSYWAIT) out: +#endif targ->completed = 1; - targ->count = received; + targ->ctr = cur; quit: /* reset the ``used`` flag. */ @@ -1329,56 +1476,390 @@ quit: return (NULL); } -/* very crude code to print a number in normalized form. - * Caller has to make sure that the buffer is large enough. 
- */ -static const char * -norm(char *buf, double val) +static void * +txseq_body(void *data) +{ + struct targ *targ = (struct targ *) data; + struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; + struct netmap_ring *ring; + int64_t sent = 0; + uint64_t event = 0; + int options = targ->g->options | OPT_COPY; + struct timespec nexttime = {0, 0}; + int rate_limit = targ->g->tx_rate; + struct pkt *pkt = &targ->pkt; + int frags = targ->g->frags; + uint32_t sequence = 0; + int budget = 0; + void *frame; + int size; + + if (targ->g->nthreads > 1) { + D("can only txseq ping with 1 thread"); + return NULL; + } + + if (targ->g->npackets > 0) { + D("Ignoring -n argument"); + } + + frame = pkt; + frame += sizeof(pkt->vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; + + D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); + if (setaffinity(targ->thread, targ->affinity)) + goto quit; + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); + if (rate_limit) { + targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); + targ->tic.tv_nsec = 0; + wait_time(targ->tic); + nexttime = targ->tic; + } + + /* Only use the first queue. */ + ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring); + + while (!targ->cancel) { + int64_t limit; + unsigned int space; + unsigned int head; + int fcnt; + + if (!rate_limit) { + budget = targ->g->burst; + + } else if (budget <= 0) { + budget = targ->g->burst; + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); + } + + /* wait for available room in the send queue */ + if (poll(&pfd, 1, 2000) <= 0) { + if (targ->cancel) + break; + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); + } + if (pfd.revents & POLLERR) { + D("poll error on %d ring %d-%d", pfd.fd, + targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); + goto quit; + } + + /* If no room poll() again. 
*/ + space = nm_ring_space(ring); + if (!space) { + continue; + } + + limit = budget; + + if (space < limit) { + limit = space; + } + + /* Cut off ``limit`` to make sure is multiple of ``frags``. */ + if (frags > 1) { + limit = (limit / frags) * frags; + } + + limit = sent + limit; /* Convert to absolute. */ + + for (fcnt = frags, head = ring->head; + sent < limit; sent++, sequence++) { + struct netmap_slot *slot = &ring->slot[head]; + char *p = NETMAP_BUF(ring, slot->buf_idx); + + slot->flags = 0; + pkt->body[0] = sequence >> 24; + pkt->body[1] = (sequence >> 16) & 0xff; + pkt->body[2] = (sequence >> 8) & 0xff; + pkt->body[3] = sequence & 0xff; + nm_pkt_copy(frame, p, size); + if (fcnt == frags) { + update_addresses(pkt, targ->g); + } + + if (options & OPT_DUMP) { + dump_payload(p, size, ring, head); + } + + slot->len = size; + + if (--fcnt > 0) { + slot->flags |= NS_MOREFRAG; + } else { + fcnt = frags; + } + + if (sent == limit - 1) { + /* Make sure we don't push an incomplete + * packet. */ + assert(!(slot->flags & NS_MOREFRAG)); + slot->flags |= NS_REPORT; + } + + head = nm_ring_next(ring, head); + if (rate_limit) { + budget--; + } + } + + ring->cur = ring->head = head; + + event ++; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent * size; + targ->ctr.events = event; + } + + /* flush any remaining packets */ + D("flush tail %d head %d on thread %p", + ring->tail, ring->head, + (void *)pthread_self()); + ioctl(pfd.fd, NIOCTXSYNC, NULL); + + /* final part: wait the TX queues to become empty. */ + while (!targ->cancel && nm_tx_pending(ring)) { + RD(5, "pending tx tail %d head %d on ring %d", + ring->tail, ring->head, targ->nmd->first_tx_ring); + ioctl(pfd.fd, NIOCTXSYNC, NULL); + usleep(1); /* wait 1 tick */ + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + targ->completed = 1; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent * size; + targ->ctr.events = event; +quit: + /* reset the ``used`` flag. 
*/ + targ->used = 0; + + return (NULL); +} + + +static char * +multi_slot_to_string(struct netmap_ring *ring, unsigned int head, + unsigned int nfrags, char *strbuf, size_t strbuflen) { - char *units[] = { "", "K", "M", "G", "T" }; - u_int i; + unsigned int f; + char *ret = strbuf; - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) - val /= 1000; - sprintf(buf, "%.2f %s", val, units[i]); - return buf; + for (f = 0; f < nfrags; f++) { + struct netmap_slot *slot = &ring->slot[head]; + int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len, + slot->flags); + if (m >= (int)strbuflen) { + break; + } + strbuf += m; + strbuflen -= m; + + head = nm_ring_next(ring, head); + } + + return ret; } -static void -tx_output(uint64_t sent, int size, double delta) +static void * +rxseq_body(void *data) { - double bw, raw_bw, pps; - char b1[40], b2[80], b3[80]; + struct targ *targ = (struct targ *) data; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + int dump = targ->g->options & OPT_DUMP; + struct netmap_ring *ring; + unsigned int frags_exp = 1; + uint32_t seq_exp = 0; + struct my_ctrs cur; + unsigned int frags = 0; + int first_packet = 1; + int first_slot = 1; + int i; - printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n", - (unsigned long long)sent, size, delta); - if (delta == 0) - delta = 1e-6; - if (size < 60) /* correct for min packet size */ - size = 60; - pps = sent / delta; - bw = (8.0 * size * sent) / delta; - /* raw packets have4 bytes crc + 20 bytes framing */ - raw_bw = (8.0 * (size + 24) * sent) / delta; + cur.pkts = cur.bytes = cur.events = cur.min_space = 0; + cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler + + if (setaffinity(targ->thread, targ->affinity)) + goto quit; + + D("reading from %s fd %d main_fd %d", + targ->g->ifname, targ->fd, targ->g->main_fd); + /* unbounded wait for the first packet. 
*/ + for (;!targ->cancel;) { + i = poll(&pfd, 1, 1000); + if (i > 0 && !(pfd.revents & POLLERR)) + break; + RD(1, "waiting for initial packets, poll returns %d %d", + i, pfd.revents); + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); + + ring = NETMAP_RXRING(targ->nmd->nifp, targ->nmd->first_rx_ring); + + while (!targ->cancel) { + unsigned int head; + uint32_t seq; + int limit; + + /* Once we started to receive packets, wait at most 1 seconds + before quitting. */ + if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + targ->toc.tv_sec -= 1; /* Subtract timeout time. */ + goto out; + } + + if (pfd.revents & POLLERR) { + D("poll err"); + goto quit; + } + + if (nm_ring_empty(ring)) + continue; + + limit = nm_ring_space(ring); + if (limit > targ->g->burst) + limit = targ->g->burst; + +#if 0 + /* Enable this if + * 1) we remove the early-return optimization from + * the netmap poll implementation, or + * 2) pipes get NS_MOREFRAG support. + * With the current netmap implementation, an experiment like + * pkt-gen -i vale:1{1 -f txseq -F 9 + * pkt-gen -i vale:1}1 -f rxseq + * would get stuck as soon as we find nm_ring_space(ring) < 9, + * since here limit is rounded to 0 and + * pipe rxsync is not called anymore by the poll() of this loop. + */ + if (frags_exp > 1) { + int o = limit; + /* Cut off to the closest smaller multiple. 
*/ + limit = (limit / frags_exp) * frags_exp; + RD(2, "LIMIT %d --> %d", o, limit); + } +#endif + + for (head = ring->head, i = 0; i < limit; i++) { + struct netmap_slot *slot = &ring->slot[head]; + char *p = NETMAP_BUF(ring, slot->buf_idx); + int len = slot->len; + struct pkt *pkt; + + if (dump) { + dump_payload(p, slot->len, ring, head); + } - printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n", - norm(b1, pps), norm(b2, bw), norm(b3, raw_bw) ); + frags++; + if (!(slot->flags & NS_MOREFRAG)) { + if (first_packet) { + first_packet = 0; + } else if (frags != frags_exp) { + char prbuf[512]; + RD(1, "Received packets with %u frags, " + "expected %u, '%s'", frags, frags_exp, + multi_slot_to_string(ring, head-frags+1, frags, + prbuf, sizeof(prbuf))); + } + first_packet = 0; + frags_exp = frags; + frags = 0; + } + + p -= sizeof(pkt->vh) - targ->g->virt_header; + len += sizeof(pkt->vh) - targ->g->virt_header; + pkt = (struct pkt *)p; + + if ((char *)pkt + len < ((char *)pkt->body) + sizeof(seq)) { + RD(1, "%s: packet too small (len=%u)", __func__, + slot->len); + } else { + seq = (pkt->body[0] << 24) | (pkt->body[1] << 16) + | (pkt->body[2] << 8) | pkt->body[3]; + if (first_slot) { + /* Grab the first one, whatever it + is. */ + seq_exp = seq; + first_slot = 0; + } else if (seq != seq_exp) { + uint32_t delta = seq - seq_exp; + + if (delta < (0xFFFFFFFF >> 1)) { + RD(2, "Sequence GAP: exp %u found %u", + seq_exp, seq); + } else { + RD(2, "Sequence OUT OF ORDER: " + "exp %u found %u", seq_exp, seq); + } + seq_exp = seq; + } + seq_exp++; + } + + cur.bytes += slot->len; + head = nm_ring_next(ring, head); + cur.pkts++; + } + + ring->cur = ring->head = head; + + cur.events++; + targ->ctr = cur; + } + + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + +out: + targ->completed = 1; + targ->ctr = cur; + +quit: + /* reset the ``used`` flag. 
*/ + targ->used = 0; + + return (NULL); } static void -rx_output(uint64_t received, double delta) +tx_output(struct my_ctrs *cur, double delta, const char *msg) { - double pps; - char b1[40]; + double bw, raw_bw, pps, abs; + char b1[40], b2[80], b3[80]; + int size; + + if (cur->pkts == 0) { + printf("%s nothing.\n", msg); + return; + } - printf("Received %llu packets, in %.2f seconds.\n", - (unsigned long long) received, delta); + size = (int)(cur->bytes / cur->pkts); + printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n", + msg, + (unsigned long long)cur->pkts, + (unsigned long long)cur->bytes, + (unsigned long long)cur->events, size, delta); if (delta == 0) delta = 1e-6; - pps = received / delta; - printf("Speed: %spps\n", norm(b1, pps)); + if (size < 60) /* correct for min packet size */ + size = 60; + pps = cur->pkts / delta; + bw = (8.0 * cur->bytes) / delta; + /* raw packets have4 bytes crc + 20 bytes framing */ + raw_bw = (8.0 * (cur->pkts * 24 + cur->bytes)) / delta; + abs = cur->pkts / (double)(cur->events); + + printf("Speed: %spps Bandwidth: %sbps (raw %sbps). 
Average batch: %.2f pkts\n", + norm(b1, pps), norm(b2, bw), norm(b3, raw_bw), abs); } static void @@ -1389,9 +1870,9 @@ usage(void) "Usage:\n" "%s arguments\n" "\t-i interface interface name\n" - "\t-f function tx rx ping pong\n" + "\t-f function tx rx ping pong txseq rxseq\n" "\t-n count number of iterations (can be 0)\n" - "\t-t pkts_to_send also forces tx mode\n" + "\t-t pkts_to_send also forces tx mode\n" "\t-r pkts_to_receive also forces rx mode\n" "\t-l pkt_size in bytes excluding CRC\n" "\t-d dst_ip[:port[-dst_ip:port]] single or range\n" @@ -1403,20 +1884,29 @@ usage(void) "\t-c cores cores to use\n" "\t-p threads processes/threads to use\n" "\t-T report_ms milliseconds between reports\n" - "\t-P use libpcap instead of netmap\n" "\t-w wait_for_link_time in seconds\n" "\t-R rate in packets per second\n" "\t-X dump payload\n" "\t-H len add empty virtio-net-header with size 'len'\n" + "\t-E pipes allocate extra space for a number of pipes\n" + "\t-r do not touch the buffers (send rubbish)\n" "\t-P file load packet from pcap file\n" "\t-z use random IPv4 src address/port\n" "\t-Z use random IPv4 dst address/port\n" + "\t-F num_frags send multi-slot packets\n" + "\t-A activate pps stats on receiver\n" "", cmd); exit(0); } +enum { + TD_TYPE_SENDER = 1, + TD_TYPE_RECEIVER, + TD_TYPE_OTHER, +}; + static void start_threads(struct glob_arg *g) { @@ -1439,33 +1929,32 @@ start_threads(struct glob_arg *g) uint64_t nmd_flags = 0; nmd.self = &nmd; - if (g->nthreads > 1) { - if (nmd.req.nr_flags != NR_REG_ALL_NIC) { - D("invalid nthreads mode %d", nmd.req.nr_flags); - continue; + if (i > 0) { + /* the first thread uses the fd opened by the main + * thread, the other threads re-open /dev/netmap + */ + if (g->nthreads > 1) { + nmd.req.nr_flags = + g->nmd->req.nr_flags & ~NR_REG_MASK; + nmd.req.nr_flags |= NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; } - nmd.req.nr_flags = NR_REG_ONE_NIC; - nmd.req.nr_ringid = i; - } - /* Only touch one of the rings (rx is already ok) */ - if 
(g->td_body == receiver_body) - nmd_flags |= NETMAP_NO_TX_POLL; - - /* register interface. Override ifname and ringid etc. */ - if (g->options & OPT_MONITOR_TX) - nmd.req.nr_flags |= NR_MONITOR_TX; - if (g->options & OPT_MONITOR_RX) - nmd.req.nr_flags |= NR_MONITOR_RX; + /* Only touch one of the rings (rx is already ok) */ + if (g->td_type == TD_TYPE_RECEIVER) + nmd_flags |= NETMAP_NO_TX_POLL; - t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | - NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); - if (t->nmd == NULL) { - D("Unable to open %s: %s", - t->g->ifname, strerror(errno)); - continue; + /* register interface. Override ifname and ringid etc. */ + t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); + continue; + } + } else { + t->nmd = g->nmd; } t->fd = t->nmd->fd; - set_vnet_hdr_len(t); } else { targs[i].fd = g->main_fd; @@ -1473,10 +1962,7 @@ start_threads(struct glob_arg *g) t->used = 1; t->me = i; if (g->affinity >= 0) { - if (g->affinity < g->cpus) - t->affinity = g->affinity; - else - t->affinity = i % g->cpus; + t->affinity = (g->affinity + i) % g->system_cpus; } else { t->affinity = -1; } @@ -1495,45 +1981,89 @@ main_thread(struct glob_arg *g) { int i; - uint64_t prev = 0; - uint64_t count = 0; + struct my_ctrs prev, cur; double delta_t; struct timeval tic, toc; - gettimeofday(&toc, NULL); + prev.pkts = prev.bytes = prev.events = 0; + gettimeofday(&prev.t, NULL); for (;;) { - struct timeval now, delta; - uint64_t pps, usec, my_count, npkts; + char b1[40], b2[40], b3[40], b4[70]; + uint64_t pps, usec; + struct my_ctrs x; + double abs; int done = 0; - delta.tv_sec = g->report_interval/1000; - delta.tv_usec = (g->report_interval%1000)*1000; - select(0, NULL, NULL, NULL, &delta); - gettimeofday(&now, NULL); - timersub(&now, &toc, &toc); - my_count = 0; + usec = wait_for_next_report(&prev.t, &cur.t, + g->report_interval); + + cur.pkts = 
cur.bytes = cur.events = 0; + cur.min_space = 0; + if (usec < 10000) /* too short to be meaningful */ + continue; + /* accumulate counts for all threads */ for (i = 0; i < g->nthreads; i++) { - my_count += targs[i].count; + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + cur.min_space += targs[i].ctr.min_space; + targs[i].ctr.min_space = 99999; if (targs[i].used == 0) done++; } - usec = toc.tv_sec* 1000000 + toc.tv_usec; - if (usec < 10000) - continue; - npkts = my_count - prev; - pps = (npkts*1000000 + usec/2) / usec; - D("%llu pps (%llu pkts in %llu usec)", - (unsigned long long)pps, - (unsigned long long)npkts, - (unsigned long long)usec); - prev = my_count; - toc = now; + x.pkts = cur.pkts - prev.pkts; + x.bytes = cur.bytes - prev.bytes; + x.events = cur.events - prev.events; + pps = (x.pkts*1000000 + usec/2) / usec; + abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; + + if (!(g->options & OPT_PPS_STATS)) { + strcpy(b4, ""); + } else { + /* Compute some pps stats using a sliding window. 
*/ + double ppsavg = 0.0, ppsdev = 0.0; + int nsamples = 0; + + g->win[g->win_idx] = pps; + g->win_idx = (g->win_idx + 1) % STATS_WIN; + + for (i = 0; i < STATS_WIN; i++) { + ppsavg += g->win[i]; + if (g->win[i]) { + nsamples ++; + } + } + ppsavg /= nsamples; + + for (i = 0; i < STATS_WIN; i++) { + if (g->win[i] == 0) { + continue; + } + ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); + } + ppsdev /= nsamples; + ppsdev = sqrt(ppsdev); + + snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", + norm(b1, ppsavg), norm(b2, ppsdev)); + } + + D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", + norm(b1, pps), b4, + norm(b2, (double)x.pkts), + norm(b3, (double)x.bytes*8), + (unsigned long long)usec, + abs, (int)cur.min_space); + prev = cur; + if (done == g->nthreads) break; } timerclear(&tic); timerclear(&toc); + cur.pkts = cur.bytes = cur.events = 0; + /* final round */ for (i = 0; i < g->nthreads; i++) { struct timespec t_tic, t_toc; /* @@ -1541,8 +2071,13 @@ main_thread(struct glob_arg *g) * file descriptors. */ if (targs[i].used) - pthread_join(targs[i].thread, NULL); - close(targs[i].fd); + pthread_join(targs[i].thread, NULL); /* blocking */ + if (g->dev_type == DEV_NETMAP) { + nm_close(targs[i].nmd); + targs[i].nmd = NULL; + } else { + close(targs[i].fd); + } if (targs[i].completed == 0) D("ouch, thread %d exited with error", i); @@ -1551,7 +2086,13 @@ main_thread(struct glob_arg *g) * Collect threads output and extract information about * how long it took to send all the packets. */ - count += targs[i].count; + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + /* collect the largest start (tic) and end (toc) times, + * XXX maybe we should do the earliest tic, or do a weighted + * average ? + */ t_tic = timeval2spec(&tic); t_toc = timeval2spec(&toc); if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) @@ -1563,29 +2104,26 @@ main_thread(struct glob_arg *g) /* print output. 
*/ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; - if (g->td_body == sender_body) - tx_output(count, g->pkt_size, delta_t); + if (g->td_type == TD_TYPE_SENDER) + tx_output(&cur, delta_t, "Sent"); else - rx_output(count, delta_t); - - if (g->dev_type == DEV_NETMAP) { - munmap(g->nmd->mem, g->nmd->req.nr_memsize); - close(g->main_fd); - } + tx_output(&cur, delta_t, "Received"); } - -struct sf { +struct td_desc { + int ty; char *key; void *f; }; -static struct sf func[] = { - { "tx", sender_body }, - { "rx", receiver_body }, - { "ping", pinger_body }, - { "pong", ponger_body }, - { NULL, NULL } +static struct td_desc func[] = { + { TD_TYPE_SENDER, "tx", sender_body }, + { TD_TYPE_RECEIVER, "rx", receiver_body }, + { TD_TYPE_OTHER, "ping", pinger_body }, + { TD_TYPE_OTHER, "pong", ponger_body }, + { TD_TYPE_SENDER, "txseq", txseq_body }, + { TD_TYPE_RECEIVER, "rxseq", rxseq_body }, + { 0, NULL, NULL } }; static int @@ -1654,6 +2192,8 @@ int main(int arc, char **argv) { int i; + struct sigaction sa; + sigset_t ss; struct glob_arg g; @@ -1665,6 +2205,7 @@ main(int arc, char **argv) g.main_fd = -1; g.td_body = receiver_body; + g.td_type = TD_TYPE_RECEIVER; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ @@ -1675,7 +2216,7 @@ main(int arc, char **argv) g.pkt_size = 60; g.burst = 512; // default g.nthreads = 1; - g.cpus = 1; + g.cpus = 1; // default g.forever = 1; g.tx_rate = 0; g.frags = 1; @@ -1683,8 +2224,8 @@ main(int arc, char **argv) g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:P:zZ")) != -1) { - struct sf *fn; + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:E:m:rP:zZA")) != -1) { + struct td_desc *fn; switch(ch) { default: @@ -1693,7 +2234,7 @@ main(int arc, char **argv) break; case 'n': - g.npackets = atoi(optarg); + g.npackets = strtoull(optarg, NULL, 10); break; case 'F': @@ -1710,10 +2251,12 @@ main(int arc, char 
**argv) if (!strcmp(fn->key, optarg)) break; } - if (fn->key) + if (fn->key) { g.td_body = fn->f; - else + g.td_type = fn->ty; + } else { D("unrecognised function %s", optarg); + } break; case 'o': /* data generation options */ @@ -1817,24 +2360,27 @@ main(int arc, char **argv) case 'e': /* extra bufs */ g.extra_bufs = atoi(optarg); break; - case 'm': - if (strcmp(optarg, "tx") == 0) { - g.options |= OPT_MONITOR_TX; - } else if (strcmp(optarg, "rx") == 0) { - g.options |= OPT_MONITOR_RX; - } else { - D("unrecognized monitor mode %s", optarg); - } + case 'E': + g.extra_pipes = atoi(optarg); break; case 'P': g.packet_file = strdup(optarg); break; + case 'm': + /* ignored */ + break; + case 'r': + g.options |= OPT_RUBBISH; + break; case 'z': g.options |= OPT_RANDOM_SRC; break; case 'Z': g.options |= OPT_RANDOM_DST; break; + case 'A': + g.options |= OPT_PPS_STATS; + break; } } @@ -1843,11 +2389,12 @@ main(int arc, char **argv) usage(); } - i = system_ncpus(); + g.system_cpus = i = system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); usage(); } +D("running on %d cpus (have %d)", g.cpus, i); if (g.cpus == 0) g.cpus = i; @@ -1914,6 +2461,11 @@ main(int arc, char **argv) if (g.extra_bufs) { base_nmd.nr_arg3 = g.extra_bufs; } + if (g.extra_pipes) { + base_nmd.nr_arg1 = g.extra_pipes; + } + + base_nmd.nr_flags |= NR_ACCEPT_VNET_HDR; /* * Open the netmap device using nm_open(). 
@@ -1927,13 +2479,38 @@ main(int arc, char **argv) D("Unable to open %s: %s", g.ifname, strerror(errno)); goto out; } + + if (g.nthreads > 1) { + struct nm_desc saved_desc = *g.nmd; + saved_desc.self = &saved_desc; + saved_desc.mem = NULL; + nm_close(g.nmd); + saved_desc.req.nr_flags &= ~NR_REG_MASK; + saved_desc.req.nr_flags |= NR_REG_ONE_NIC; + saved_desc.req.nr_ringid = 0; + g.nmd = nm_open(g.ifname, &base_nmd, NM_OPEN_IFNAME, &saved_desc); + if (g.nmd == NULL) { + D("Unable to open %s: %s", g.ifname, strerror(errno)); + goto out; + } + } g.main_fd = g.nmd->fd; D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); - /* get num of queues in tx or rx */ - if (g.td_body == sender_body) + if (g.virt_header) { + /* Set the virtio-net header length, since the user asked + * for it explicitely. */ + set_vnet_hdr_len(&g); + } else { + /* Check whether the netmap port we opened requires us to send + * and receive frames with virtio-net header. */ + get_vnet_hdr_len(&g); + } + + /* get num of queues in tx or rx */ + if (g.td_type == TD_TYPE_SENDER) devqueues = g.nmd->req.nr_tx_rings; - else + else devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. */ @@ -1951,25 +2528,27 @@ main(int arc, char **argv) req->nr_arg2); for (i = 0; i <= req->nr_tx_rings; i++) { struct netmap_ring *ring = NETMAP_TXRING(nifp, i); - D(" TX%d at 0x%lx slots %d", i, - (char *)ring - (char *)nifp, ring->num_slots); + D(" TX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); } for (i = 0; i <= req->nr_rx_rings; i++) { struct netmap_ring *ring = NETMAP_RXRING(nifp, i); - D(" RX%d at 0x%lx slots %d", i, - (char *)ring - (char *)nifp, ring->num_slots); + D(" RX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); } } /* Print some debug information. */ fprintf(stdout, "%s %s: %d queues, %d threads and %d cpus.\n", - (g.td_body == sender_body) ? "Sending on" : "Receiving from", + (g.td_type == TD_TYPE_SENDER) ? 
"Sending on" : + ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" : + "Working on"), g.ifname, devqueues, g.nthreads, g.cpus); - if (g.td_body == sender_body) { + if (g.td_type == TD_TYPE_SENDER) { fprintf(stdout, "%s -> %s (%s -> %s)\n", g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); @@ -1985,12 +2564,13 @@ out: if (g.options) { - D("--- SPECIAL OPTIONS:%s%s%s%s%s\n", + D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n", g.options & OPT_PREFETCH ? " prefetch" : "", g.options & OPT_ACCESS ? " access" : "", g.options & OPT_MEMCPY ? " memcpy" : "", g.options & OPT_INDIRECT ? " indirect" : "", - g.options & OPT_COPY ? " copy" : ""); + g.options & OPT_COPY ? " copy" : "", + g.options & OPT_RUBBISH ? " rubbish " : ""); } g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; @@ -2010,7 +2590,7 @@ out: g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } - if (g.td_body == sender_body) + if (g.td_type == TD_TYPE_SENDER) D("Sending %d packets every %ld.%09ld s", g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); /* Wait for PHY reset. */ @@ -2020,10 +2600,24 @@ out: /* Install ^C handler. 
*/ global_nthreads = g.nthreads; - signal(SIGINT, sigint_h); - + sigemptyset(&ss); + sigaddset(&ss, SIGINT); + /* block SIGINT now, so that all created threads will inherit the mask */ + if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { + D("failed to block SIGINT: %s", strerror(errno)); + } start_threads(&g); + /* Install the handler and re-enable SIGINT for the main thread */ + sa.sa_handler = sigint_h; + if (sigaction(SIGINT, &sa, NULL) < 0) { + D("failed to install ^C handler: %s", strerror(errno)); + } + + if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { + D("failed to re-enable SIGINT: %s", strerror(errno)); + } main_thread(&g); + free(targs); return 0; } diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index c9e5f31b9206..bf6e51fbde97 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -25,6 +25,10 @@ /* $FreeBSD$ */ +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> +#include <net/netmap.h> + #include <errno.h> #include <stdio.h> #include <inttypes.h> /* PRI* macros */ @@ -35,17 +39,9 @@ #include <sys/param.h> #include <sys/socket.h> /* apple needs sockaddr */ #include <net/if.h> /* ifreq */ -#include <net/netmap.h> -#include <net/netmap_user.h> #include <libgen.h> /* basename */ #include <stdlib.h> /* atoi, free */ -/* debug support */ -#define ND(format, ...) do {} while(0) -#define D(format, ...) 
\ - fprintf(stderr, "%s [%d] " format "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__) - /* XXX cut and paste from pkt-gen.c because I'm not sure whether this * program may include nm_util.h */ @@ -117,8 +113,11 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg, char *nmr_config) break; case NETMAP_BDG_ATTACH: case NETMAP_BDG_DETACH: - if (nr_arg && nr_arg != NETMAP_BDG_HOST) + nmr.nr_flags = NR_REG_ALL_NIC; + if (nr_arg && nr_arg != NETMAP_BDG_HOST) { + nmr.nr_flags = NR_REG_NIC_SW; nr_arg = 0; + } nmr.nr_arg1 = nr_arg; error = ioctl(fd, NIOCREGIF, &nmr); if (error == -1) { @@ -152,6 +151,36 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg, char *nmr_config) break; + case NETMAP_BDG_POLLING_ON: + case NETMAP_BDG_POLLING_OFF: + /* We reuse nmreq fields as follows: + * nr_tx_slots: 0 and non-zero indicate REG_ALL_NIC + * REG_ONE_NIC, respectively. + * nr_rx_slots: CPU core index. This also indicates the + * first queue in the case of REG_ONE_NIC + * nr_tx_rings: (REG_ONE_NIC only) indicates the + * number of CPU cores or the last queue + */ + nmr.nr_flags |= nmr.nr_tx_slots ? + NR_REG_ONE_NIC : NR_REG_ALL_NIC; + nmr.nr_ringid = nmr.nr_rx_slots; + /* number of cores/rings */ + if (nmr.nr_flags == NR_REG_ALL_NIC) + nmr.nr_arg1 = 1; + else + nmr.nr_arg1 = nmr.nr_tx_rings; + + error = ioctl(fd, NIOCREGIF, &nmr); + if (!error) + D("polling on %s %s", nmr.nr_name, + nr_cmd == NETMAP_BDG_POLLING_ON ? + "started" : "stopped"); + else + D("polling on %s %s (err %d)", nmr.nr_name, + nr_cmd == NETMAP_BDG_POLLING_ON ? 
+ "couldn't start" : "couldn't stop", error); + break; + default: /* GINFO */ nmr.nr_cmd = nmr.nr_arg1 = nmr.nr_arg2 = 0; error = ioctl(fd, NIOCGINFO, &nmr); @@ -173,7 +202,7 @@ main(int argc, char *argv[]) const char *command = basename(argv[0]); char *name = NULL, *nmr_config = NULL; - if (argc > 3) { + if (argc > 5) { usage: fprintf(stderr, "Usage:\n" @@ -186,12 +215,18 @@ usage: "\t-r interface interface name to be deleted\n" "\t-l list all or specified bridge's interfaces (default)\n" "\t-C string ring/slot setting of an interface creating by -n\n" + "\t-p interface start polling. Additional -C x,y,z configures\n" + "\t\t x: 0 (REG_ALL_NIC) or 1 (REG_ONE_NIC),\n" + "\t\t y: CPU core id for ALL_NIC and core/ring for ONE_NIC\n" + "\t\t z: (ONE_NIC only) num of total cores/rings\n" + "\t-P interface stop polling\n" "", command); return 0; } - while ((ch = getopt(argc, argv, "d:a:h:g:l:n:r:C:")) != -1) { - name = optarg; /* default */ + while ((ch = getopt(argc, argv, "d:a:h:g:l:n:r:C:p:P:")) != -1) { + if (ch != 'C') + name = optarg; /* default */ switch (ch) { default: fprintf(stderr, "bad option %c %s", ch, optarg); @@ -223,11 +258,17 @@ usage: case 'C': nmr_config = strdup(optarg); break; + case 'p': + nr_cmd = NETMAP_BDG_POLLING_ON; + break; + case 'P': + nr_cmd = NETMAP_BDG_POLLING_OFF; + break; } - if (optind != argc) { - // fprintf(stderr, "optind %d argc %d\n", optind, argc); - goto usage; - } + } + if (optind != argc) { + // fprintf(stderr, "optind %d argc %d\n", optind, argc); + goto usage; } if (argc == 1) nr_cmd = NETMAP_BDG_LIST; |
