author | Luigi Rizzo <luigi@FreeBSD.org> | 2013-11-01 21:21:14 +0000
committer | Luigi Rizzo <luigi@FreeBSD.org> | 2013-11-01 21:21:14 +0000
commit | ce3ee1e7c4cac5b86bbc15daac68f2129aa42187 (patch)
tree | 62d07ffe9208f3098d5f67c47dd66e29212478b5
parent | a09968c47940d3b0e9e82ce7c06faec3f42cea94 (diff)
-rw-r--r-- | share/man/man4/netmap.4 | 402
-rw-r--r-- | sys/conf/files | 2
-rw-r--r-- | sys/dev/e1000/if_em.c | 9
-rw-r--r-- | sys/dev/e1000/if_igb.c | 7
-rw-r--r-- | sys/dev/e1000/if_lem.c | 6
-rw-r--r-- | sys/dev/e1000/if_lem.h | 27
-rw-r--r-- | sys/dev/ixgbe/ixgbe.c | 11
-rw-r--r-- | sys/dev/netmap/if_em_netmap.h | 53
-rw-r--r-- | sys/dev/netmap/if_igb_netmap.h | 63
-rw-r--r-- | sys/dev/netmap/if_lem_netmap.h | 78
-rw-r--r-- | sys/dev/netmap/if_re_netmap.h | 51
-rw-r--r-- | sys/dev/netmap/ixgbe_netmap.h | 64
-rw-r--r-- | sys/dev/netmap/netmap.c | 3021
-rw-r--r-- | sys/dev/netmap/netmap_kern.h | 271
-rw-r--r-- | sys/dev/netmap/netmap_mem2.c | 850
-rw-r--r-- | sys/dev/re/if_re.c | 5
-rw-r--r-- | sys/net/netmap.h | 265
-rw-r--r-- | tools/tools/netmap/nm_util.c | 12
-rw-r--r-- | tools/tools/netmap/pkt-gen.c | 394
19 files changed, 3556 insertions, 2035 deletions
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 index 3b72417e2f33..b43f2d6c03c8 100644 --- a/share/man/man4/netmap.4 +++ b/share/man/man4/netmap.4 @@ -1,4 +1,4 @@ -.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa +.\" Copyright (c) 2011-2013 Matteo Landi, Luigi Rizzo, Universita` di Pisa .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -21,14 +21,13 @@ .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. -.\" +.\" .\" This document is derived in part from the enet man page (enet.4) .\" distributed with 4.3BSD Unix. .\" .\" $FreeBSD$ -.\" $Id: netmap.4 11563 2012-08-02 08:59:12Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $ .\" -.Dd September 23, 2013 +.Dd October 18, 2013 .Dt NETMAP 4 .Os .Sh NAME @@ -38,101 +37,230 @@ .Cd device netmap .Sh DESCRIPTION .Nm -is a framework for fast and safe access to network devices -(reaching 14.88 Mpps at less than 1 GHz). +is a framework for extremely fast and efficient packet I/O +(reaching 14.88 Mpps with a single core at less than 1 GHz) +for both userspace and kernel clients. +Userspace clients can use the netmap API +to send and receive raw packets through physical interfaces +or ports of the +.Xr VALE 4 +switch. +.Pp +.Nm VALE +is a very fast (reaching 20 Mpps per port) +and modular software switch, +implemented within the kernel, which can interconnect +virtual ports, physical devices, and the native host stack. +.Pp .Nm -uses memory mapped buffers and metadata -(buffer indexes and lengths) to communicate with the kernel, -which is in charge of validating information through -.Pa ioctl() -and -.Pa select()/poll(). +uses a memory mapped region to share packet buffers, +descriptors and queues with the kernel. +Simple +.Pa ioctl()s +are used to bind interfaces/ports to file descriptors and +implement non-blocking I/O, whereas blocking I/O uses +.Pa select()/poll() . .Nm can exploit the parallelism in multiqueue devices and multicore systems. .Pp +For the best performance, +.Nm +requires explicit support in device drivers; +a generic emulation layer is available to implement the .Nm -requires explicit support in device drivers. -For a list of supported devices, see the end of this manual page. -.Sh OPERATION +API on top of unmodified device drivers, +at the price of reduced performance +(but still better than what can be achieved with +sockets or BPF/pcap). +.Pp +For a list of devices with native .Nm -clients must first open the +support, see the end of this manual page. +.Pp +.Sh OPERATION - THE NETMAP API +.Nm +clients must first .Pa open("/dev/netmap") , and then issue an -.Pa ioctl(...,NIOCREGIF,...) -to bind the file descriptor to a network device. -.Pp -When a device is put in +.Pa ioctl(fd, NIOCREGIF, (struct nmreq *)arg) +to bind the file descriptor to a specific interface or port. .Nm -mode, its data path is disconnected from the host stack. -The processes owning the file descriptor -can exchange packets with the device, or with the host stack, -through an mmapped memory region that contains pre-allocated -buffers and metadata. +has multiple modes of operation controlled by the +content of the +.Pa struct nmreq +passed to the +.Pa ioctl() . +In particular, the +.Em nr_name +field specifies whether the client operates on a physical network +interface or on a port of a +.Nm VALE +switch, as indicated below. 
Additional fields in the +.Pa struct nmreq +control the details of operation. +.Pp +.Bl -tag -width XXXX +.It Dv Interface name (e.g. 'em0', 'eth1', ... ) +The data path of the interface is disconnected from the host stack. +Depending on additional arguments, +the file descriptor is bound to the NIC (one or all queues), +or to the host stack. +.It Dv valeXXX:YYY (arbitrary XXX and YYY) +The file descriptor is bound to port YYY of a VALE switch called XXX, +where XXX and YYY are arbitrary alphanumeric strings. +The string cannot exceed IFNAMSIZ characters, and YYY cannot +matching the name of any existing interface. +.Pp +The switch and the port are created if not existing. +.It Dv valeXXX:ifname (ifname is an existing interface) +Flags in the argument control whether the physical interface +(and optionally the corrisponding host stack endpoint) +are connected or disconnected from the VALE switch named XXX. .Pp +In this case the +.Pa ioctl() +is used only for configuring the VALE switch, typically through the +.Nm vale-ctl +command. +The file descriptor cannot be used for I/O, and should be +.Pa close()d +after issuing the +.Pa ioctl(). +.El +.Pp +The binding can be removed (and the interface returns to +regular operation, or the virtual port destroyed) with a +.Pa close() +on the file descriptor. +.Pp +The processes owning the file descriptor can then +.Pa mmap() +the memory region that contains pre-allocated +buffers, descriptors and queues, and use them to +read/write raw packets. Non blocking I/O is done with special .Pa ioctl()'s , whereas the file descriptor can be passed to .Pa select()/poll() to be notified about incoming packet or available transmit buffers. -.Ss Data structures -All data structures for all devices in +.Ss DATA STRUCTURES +The data structures in the mmapped memory are described below +(see +.Xr sys/net/netmap.h +for reference). +All physical devices operating in .Nm -mode are in a memory -region shared by the kernel and all processes -who open +mode use the same memory region, +shared by the kernel and all processes who own .Pa /dev/netmap +descriptors bound to those devices (NOTE: visibility may be restricted in future implementations). +Virtual ports instead use separate memory regions, +shared only with the kernel. +.Pp All references between the shared data structure are relative (offsets or indexes). Some macros help converting them into actual pointers. -.Pp -The data structures in shared memory are the following: .Bl -tag -width XXX .It Dv struct netmap_if (one per interface) indicates the number of rings supported by an interface, their sizes, and the offsets of the .Pa netmap_rings associated to the interface. -The offset of a +.Pp .Pa struct netmap_if -in the shared memory region is indicated by the +is at offset .Pa nr_offset +in the shared memory region is indicated by the field in the structure returned by the .Pa NIOCREGIF (see below). .Bd -literal struct netmap_if { - char ni_name[IFNAMSIZ]; /* name of the interface. */ - const u_int ni_num_queues; /* number of hw ring pairs */ - const ssize_t ring_ofs[]; /* offset of tx and rx rings */ + char ni_name[IFNAMSIZ]; /* name of the interface. 
*/ + const u_int ni_version; /* API version */ + const u_int ni_rx_rings; /* number of rx ring pairs */ + const u_int ni_tx_rings; /* if 0, same as ni_rx_rings */ + const ssize_t ring_ofs[]; /* offset of tx and rx rings */ }; .Ed .It Dv struct netmap_ring (one per ring) -contains the index of the current read or write slot (cur), -the number of slots available for reception or transmission (avail), +Contains the positions in the transmit and receive rings to +synchronize the kernel and the application, and an array of .Pa slots describing the buffers. -There is one ring pair for each of the N hardware ring pairs -supported by the card (numbered 0..N-1), plus -one ring pair (numbered N) for packets from/to the host stack. +'reserved' is used in receive rings to tell the kernel the +number of slots after 'cur' that are still in usr +indicates how many slots starting from 'cur' +the +.Pp +Each physical interface has one +.Pa netmap_ring +for each hardware transmit and receive ring, +plus one extra transmit and one receive structure +that connect to the host stack. .Bd -literal struct netmap_ring { - const ssize_t buf_ofs; - const uint32_t num_slots; /* number of slots in the ring. */ - uint32_t avail; /* number of usable slots */ - uint32_t cur; /* 'current' index for the user side */ - uint32_t reserved; /* not refilled before current */ + const ssize_t buf_ofs; /* see details */ + const uint32_t num_slots; /* number of slots in the ring */ + uint32_t avail; /* number of usable slots */ + uint32_t cur; /* 'current' read/write index */ + uint32_t reserved; /* not refilled before current */ const uint16_t nr_buf_size; - uint16_t flags; - struct netmap_slot slot[0]; /* array of slots. */ + uint16_t flags; +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ +#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */ + struct timeval ts; + struct netmap_slot slot[0]; /* array of slots */ } .Ed +.Pp +In transmit rings, after a system call 'cur' indicates +the first slot that can be used for transmissions, +and 'avail' reports how many of them are available. +Before the next netmap-related system call on the file +descriptor, the application should fill buffers and +slots with data, and update 'cur' and 'avail' +accordingly, as shown in the figure below: +.Bd -literal + + cur + |----- avail ---| (after syscall) + v + TX [*****aaaaaaaaaaaaaaaaa**] + TX [*****TTTTTaaaaaaaaaaaa**] + ^ + |-- avail --| (before syscall) + cur +.Ed + +In receive rings, after a system call 'cur' indicates +the first slot that contains a valid packet, +and 'avail' reports how many of them are available. +Before the next netmap-related system call on the file +descriptor, the application can process buffers and +release them to the kernel updating +'cur' and 'avail' accordingly, as shown in the figure below. +Receive rings have an additional field called 'reserved' +to indicate how many buffers before 'cur' are still +under processing and cannot be released. +.Bd -literal + cur + |-res-|-- avail --| (after syscall) + v + RX [**rrrrrrRRRRRRRRRRRR******] + RX [**...........rrrrRRR******] + |res|--|<avail (before syscall) + ^ + cur + +.Ed .It Dv struct netmap_slot (one per packet) -contains the metadata for a packet: a buffer index (buf_idx), -a buffer length (len), and some flags. 
+contains the metadata for a packet: .Bd -literal struct netmap_slot { uint32_t buf_idx; /* buffer index */ @@ -142,23 +270,94 @@ struct netmap_slot { #define NS_REPORT 0x0002 /* tell hw to report results * e.g. by generating an interrupt */ +#define NS_FORWARD 0x0004 /* pass packet to the other endpoint + * (host stack or device) + */ +#define NS_NO_LEARN 0x0008 +#define NS_INDIRECT 0x0010 +#define NS_MOREFRAG 0x0020 +#define NS_PORT_SHIFT 8 +#define NS_PORT_MASK (0xff << NS_PORT_SHIFT) +#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) + uint64_t ptr; /* buffer address (indirect buffers) */ }; .Ed +The flags control how the the buffer associated to the slot +should be managed. .It Dv packet buffers -are fixed size (approximately 2k) buffers allocated by the kernel +are normally fixed size (2 Kbyte) buffers allocated by the kernel that contain packet data. Buffers addresses are computed through macros. .El .Pp +.Bl -tag -width XXX Some macros support the access to objects in the shared memory -region. In particular: -.Bd -literal -struct netmap_if *nifp; -struct netmap_ring *txring = NETMAP_TXRING(nifp, i); -struct netmap_ring *rxring = NETMAP_RXRING(nifp, i); -int i = txring->slot[txring->cur].buf_idx; -char *buf = NETMAP_BUF(txring, i); -.Ed +region. In particular, +.It NETMAP_TXRING(nifp, i) +.It NETMAP_RXRING(nifp, i) +return the address of the i-th transmit and receive ring, +respectively, whereas +.It NETMAP_BUF(ring, buf_idx) +returns the address of the buffer with index buf_idx +(which can be part of any ring for the given interface). +.El +.Pp +Normally, buffers are associated to slots when interfaces are bound, +and one packet is fully contained in a single buffer. +Clients can however modify the mapping using the +following flags: +.Ss FLAGS +.Bl -tag -width XXX +.It NS_BUF_CHANGED +indicates that the buf_idx in the slot has changed. +This can be useful if the client wants to implement +some form of zero-copy forwarding (e.g. by passing buffers +from an input interface to an output interface), or +needs to process packets out of order. +.Pp +The flag MUST be used whenever the buffer index is changed. +.It NS_REPORT +indicates that we want to be woken up when this buffer +has been transmitted. This reduces performance but insures +a prompt notification when a buffer has been sent. +Normally, +.Nm +notifies transmit completions in batches, hence signals +can be delayed indefinitely. However, we need such notifications +before closing a descriptor. +.It NS_FORWARD +When the device is open in 'transparent' mode, +the client can mark slots in receive rings with this flag. +For all marked slots, marked packets are forwarded to +the other endpoint at the next system call, thus restoring +(in a selective way) the connection between the NIC and the +host stack. +.It NS_NO_LEARN +tells the forwarding code that the SRC MAC address for this +packet should not be used in the learning bridge +.It NS_INDIRECT +indicates that the packet's payload is not in the netmap +supplied buffer, but in a user-supplied buffer whose +user virtual address is in the 'ptr' field of the slot. +The size can reach 65535 bytes. +.Em This is only supported on the transmit ring of virtual ports +.It NS_MOREFRAG +indicates that the packet continues with subsequent buffers; +the last buffer in a packet must have the flag clear. +The maximum length of a chain is 64 buffers. 
+.Em This is only supported on virtual ports +.It ns_ctr +on receive rings, contains the number of remaining buffers +in a packet, including this one. +Slots with a value greater than 1 also have NS_MOREFRAG set. +The length refers to the individual buffer, there is no +field for the total length +XXX maybe put it in the ptr field ? +.Pp +On transmit rings, if NS_DST is set, it is passed to the lookup +function, which can use it e.g. as the index of the destination +port instead of doing an address lookup. +.El .Sh IOCTLS .Nm supports some ioctl() to synchronize the state of the rings @@ -166,13 +365,13 @@ between the kernel and the user processes, plus some to query and configure the interface. The former do not require any argument, whereas the latter use a -.Pa struct netmap_req +.Pa struct nmreq defined as follows: .Bd -literal struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 3 /* current version */ +#define NETMAP_API 4 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ @@ -184,8 +383,14 @@ struct nmreq { #define NETMAP_SW_RING 0x2000 /* we process the sw ring */ #define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */ #define NETMAP_RING_MASK 0xfff /* the actual ring number */ - uint16_t spare1; - uint32_t spare2[4]; + uint16_t nr_cmd; +#define NETMAP_BDG_ATTACH 1 /* attach the NIC */ +#define NETMAP_BDG_DETACH 2 /* detach the NIC */ +#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ +#define NETMAP_BDG_LIST 4 /* get bridge's info */ + uint16_t nr_arg1; + uint16_t nr_arg2; + uint32_t spare2[3]; }; .Ed @@ -200,15 +405,27 @@ command codes below are defined in and are: .Bl -tag -width XXXX .It Dv NIOCGINFO -returns information about the interface named in nr_name. -On return, nr_memsize indicates the size of the shared netmap -memory region (this is device-independent), -nr_tx_slots and nr_rx_slots indicates how many buffers are in a -transmit and receive ring, -nr_tx_rings and nr_rx_rings indicates the number of transmit -and receive rings supported by the hardware. +returns EINVAL if the named device does not support netmap. +Otherwise, it returns 0 and (advisory) information +about the interface. +Note that all the information below can change before the +interface is actually put in netmap mode. .Pp -If the device does not support netmap, the ioctl returns EINVAL. +.Pa nr_memsize +indicates the size of the netmap +memory region. Physical devices all share the same memory region, +whereas VALE ports may have independent regions for each port. +These sizes can be set through system-wise sysctl variables. +.Pa nr_tx_slots, nr_rx_slots +indicate the size of transmit and receive rings. +.Pa nr_tx_rings, nr_rx_rings +indicate the number of transmit +and receive rings. +Both ring number and sizes may be configured at runtime +using interface-specific functions (e.g. +.Pa sysctl +or +.Pa ethtool . .It Dv NIOCREGIF puts the interface named in nr_name into netmap mode, disconnecting it from the host stack, and/or defines which rings are controlled @@ -243,8 +460,11 @@ or the send queue is full. .Pa NIOCREGIF can be used multiple times to change the association of a file descriptor to a ring pair, always within the same device. -.It Dv NIOCUNREGIF -brings an interface back to normal mode. 
+.Pp +When registering a virtual interface that is dynamically created to a +.Xr vale 4 +switch, we can specify the desired number of rings (1 by default, +and currently up to 16) on it using nr_tx_rings and nr_rx_rings fields. .It Dv NIOCTXSYNC tells the hardware of new packets to transmit, and updates the number of slots available for transmission. @@ -255,10 +475,20 @@ packets. .Sh SYSTEM CALLS .Nm uses -.Nm select +.Xr select 2 and -.Nm poll -to wake up processes when significant events occur. +.Xr poll 2 +to wake up processes when significant events occur, and +.Xr mmap 2 +to map memory. +.Pp +Applications may need to create threads and bind them to +specific cores to improve performance, using standard +OS primitives, see +.Xr pthread 3 . +In particular, +.Xr pthread_setaffinity_np 3 +may be of use. .Sh EXAMPLES The following code implements a traffic generator .Pp @@ -272,10 +502,10 @@ struct nmreq nmr; fd = open("/dev/netmap", O_RDWR); bzero(&nmr, sizeof(nmr)); strcpy(nmr.nr_name, "ix0"); -nmr.nr_version = NETMAP_API; -ioctl(fd, NIOCREG, &nmr); +nmr.nm_version = NETMAP_API; +ioctl(fd, NIOCREGIF, &nmr); p = mmap(0, nmr.nr_memsize, fd); -nifp = NETMAP_IF(p, nmr.offset); +nifp = NETMAP_IF(p, nmr.nr_offset); ring = NETMAP_TXRING(nifp, 0); fds.fd = fd; fds.events = POLLOUT; @@ -312,13 +542,17 @@ Usenix ATC'12, June 2012, Boston .An -nosplit The .Nm -framework has been designed and implemented at the +framework has been originally designed and implemented at the Universita` di Pisa in 2011 by .An Luigi Rizzo , -with help from +and further extended with help from .An Matteo Landi , .An Gaetano Catalli , -.An Giuseppe Lettieri . +.An Giuseppe Lettieri , +.An Vincenzo Maffione . .Pp .Nm -has been funded by the European Commission within FP7 Project CHANGE (257422). +and +.Nm VALE +have been funded by the European Commission within FP7 Projects +CHANGE (257422) and OPENLAB (287581). 
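[Editor's sketch, not part of the commit] The revised man page above shows only the transmit side in its EXAMPLES section; the receive path is symmetric. The following is a minimal receive-loop sketch written against the API as described above (NETMAP_API 4, cur/avail semantics). It assumes the NETMAP_IF/NETMAP_RXRING/NETMAP_BUF and NETMAP_RING_NEXT helpers from <net/netmap_user.h> of this API generation; the interface name "ix0" is only an example and all error handling is omitted.

#include <fcntl.h>
#include <string.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

int
main(void)
{
	struct nmreq nmr;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct pollfd fds;
	char *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	memset(&nmr, 0, sizeof(nmr));
	strcpy(nmr.nr_name, "ix0");		/* example interface name */
	nmr.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &nmr);		/* bind fd to all hw rings */
	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, nmr.nr_offset);
	ring = NETMAP_RXRING(nifp, 0);		/* first hardware rx ring */

	fds.fd = fd;
	fds.events = POLLIN;
	for (;;) {
		poll(&fds, 1, -1);		/* blocks, implies an rxsync */
		while (ring->avail > 0) {
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);

			/* process slot->len bytes starting at buf */
			(void)buf;
			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
			ring->avail--;		/* release the slot to the kernel */
		}
	}
	return 0;
}

The transmit-side example in the EXAMPLES section of the man page follows the same pattern, using POLLOUT and NETMAP_TXRING() instead.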
diff --git a/sys/conf/files b/sys/conf/files index 29e6b80376bf..3c20141d4f2f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1881,6 +1881,8 @@ dev/nand/nfc_if.m optional nand dev/ncv/ncr53c500.c optional ncv dev/ncv/ncr53c500_pccard.c optional ncv pccard dev/netmap/netmap.c optional netmap +dev/netmap/netmap_mem2.c optional netmap +# compile-with "${NORMAL_C} -Wconversion -Wextra" dev/nge/if_nge.c optional nge dev/nxge/if_nxge.c optional nxge \ compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}" diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index c8fbefb527bb..74d391dc28cd 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3836,8 +3836,7 @@ em_txeof(struct tx_ring *txr) EM_TX_LOCK_ASSERT(txr); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER | NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, txr->me)) return; #endif /* DEV_NETMAP */ @@ -4101,7 +4100,7 @@ em_setup_receive_ring(struct rx_ring *rxr) sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); #ifdef DEV_NETMAP - slot = netmap_reset(na, NR_RX, 0, 0); + slot = netmap_reset(na, NR_RX, rxr->me, 0); #endif /* @@ -4433,8 +4432,10 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) EM_RX_LOCK(rxr); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + EM_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check, processed = 0; count != 0;) { diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index a03aa51b10cf..3d580cf1a6fe 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -3962,8 +3962,7 @@ igb_txeof(struct tx_ring *txr) mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, txr->me)) return (FALSE); #endif /* DEV_NETMAP */ @@ -4829,8 +4828,10 @@ igb_rxeof(struct igb_queue *que, int count, int *done) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + IGB_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ /* Main clean loop */ diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index e8d53f0e8348..676bf8fe1e85 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -2986,7 +2986,7 @@ lem_txeof(struct adapter *adapter) EM_TX_LOCK_ASSERT(adapter); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ if (adapter->num_tx_desc_avail == adapter->num_tx_desc) @@ -3455,8 +3455,10 @@ lem_rxeof(struct adapter *adapter, int count, int *done) BUS_DMASYNC_POSTREAD); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, 0 | NETMAP_LOCKED_ENTER, &rx_sent)) + if (netmap_rx_irq(ifp, 0, &rx_sent)) { + EM_RX_UNLOCK(adapter); return (FALSE); + } #endif /* DEV_NETMAP */ if (!((current_desc->status) & E1000_RXD_STAT_DD)) { diff --git a/sys/dev/e1000/if_lem.h b/sys/dev/e1000/if_lem.h index 235277d740c8..db6b5f0acab1 100644 --- a/sys/dev/e1000/if_lem.h +++ b/sys/dev/e1000/if_lem.h @@ -265,6 +265,13 @@ #define PICOSECS_PER_TICK 20833 #define TSYNC_PORT 319 /* UDP port for the protocol */ +#ifdef NIC_PARAVIRT +#define E1000_PARA_SUBDEV 0x1101 /* special id */ +#define E1000_CSBAL 0x02830 /* csb phys. addr. low */ +#define E1000_CSBAH 0x02834 /* csb phys. addr. 
hi */ +#include <net/paravirt.h> +#endif /* NIC_PARAVIRT */ + /* * Bus dma allocation structure used by * e1000_dma_malloc and e1000_dma_free. @@ -437,6 +444,26 @@ struct adapter { boolean_t pcix_82544; boolean_t in_detach; +#ifdef NIC_SEND_COMBINING + /* 0 = idle; 1xxxx int-pending; 3xxxx int + d pending + tdt */ +#define MIT_PENDING_INT 0x10000 /* pending interrupt */ +#define MIT_PENDING_TDT 0x30000 /* both intr and tdt write are pending */ + uint32_t shadow_tdt; + uint32_t sc_enable; +#endif /* NIC_SEND_COMBINING */ +#ifdef BATCH_DISPATCH + uint32_t batch_enable; +#endif /* BATCH_DISPATCH */ + +#ifdef NIC_PARAVIRT + struct em_dma_alloc csb_mem; /* phys address */ + struct paravirt_csb *csb; /* virtual addr */ + uint32_t rx_retries; /* optimize rx loop */ + uint32_t tdt_csb_count;// XXX stat + uint32_t tdt_reg_count;// XXX stat + uint32_t tdt_int_count;// XXX stat + uint32_t guest_need_kick_count;// XXX stat +#endif /* NIC_PARAVIRT */ struct e1000_hw_stats stats; }; diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index e913bd5eec52..3e1471d2d565 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -3621,16 +3621,11 @@ ixgbe_txeof(struct tx_ring *txr) * means the user thread should not be woken up); * - the driver ignores tx interrupts unless netmap_mitigate=0 * or the slot has the DD bit set. - * - * When the driver has separate locks, we need to - * release and re-acquire txlock to avoid deadlocks. - * XXX see if we can find a better way. */ if (!netmap_mitigate || (kring->nr_kflags < kring->nkr_num_slots && txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) { - netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT)); + netmap_tx_irq(ifp, txr->me); } return; } @@ -4422,8 +4417,10 @@ ixgbe_rxeof(struct ix_queue *que) #ifdef DEV_NETMAP /* Same as the txeof routine: wakeup clients on intr. */ - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + IXGBE_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check; count != 0;) { diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index 5bfbd3d76adc..1ea11238aaaf 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -43,35 +43,6 @@ static void em_netmap_block_tasks(struct adapter *); static void em_netmap_unblock_tasks(struct adapter *); -static void -em_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct adapter *adapter = ifp->if_softc; - - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - EM_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - EM_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - EM_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - EM_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - EM_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - EM_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} - - // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) @@ -137,7 +108,7 @@ em_netmap_reg(struct ifnet *ifp, int onoff) ifp->if_capenable |= IFCAP_NETMAP; na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; + ifp->if_transmit = netmap_transmit; em_init_locked(adapter); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { @@ -160,7 +131,7 @@ fail: * Reconcile kernel and user view of the transmit ring. 
*/ static int -em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[ring_nr]; @@ -176,8 +147,6 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_TX_LOCK(txr); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -202,8 +171,6 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) u_int len = slot->len; if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - EM_TX_UNLOCK(txr); return netmap_ring_reinit(kring); } @@ -252,8 +219,6 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* update avail to what the kernel knows */ ring->avail = kring->nr_hwavail; - if (do_lock) - EM_TX_UNLOCK(txr); return 0; } @@ -262,7 +227,7 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. */ static int -em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; @@ -270,16 +235,13 @@ em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; u_int k = ring->cur, resvd = ring->reserved; k = ring->cur; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_RX_LOCK(rxr); - /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -334,8 +296,6 @@ em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) void *addr = PNMB(slot, &paddr); if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - EM_RX_UNLOCK(rxr); return netmap_ring_reinit(kring); } @@ -364,8 +324,6 @@ em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) } /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - EM_RX_UNLOCK(rxr); return 0; } @@ -378,12 +336,11 @@ em_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; - na.nm_lock = em_netmap_lock_wrapper; na.nm_register = em_netmap_reg; netmap_attach(&na, adapter->num_queues); } diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index d4e5dfe0d1cf..10d94b5faa38 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -39,38 +39,6 @@ /* - * wrapper to export locks to the generic code - */ -static void -igb_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct adapter *adapter = ifp->if_softc; - - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - IGB_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - IGB_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - IGB_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - 
IGB_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - IGB_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - IGB_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} - - -/* * register-unregister routine */ static int @@ -92,7 +60,7 @@ igb_netmap_reg(struct ifnet *ifp, int onoff) ifp->if_capenable |= IFCAP_NETMAP; na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; + ifp->if_transmit = netmap_transmit; igb_init_locked(adapter); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { @@ -114,7 +82,7 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[ring_nr]; @@ -130,8 +98,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - IGB_TX_LOCK(txr); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -153,6 +119,13 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* curr is the current slot in the nic ring */ union e1000_adv_tx_desc *curr = (union e1000_adv_tx_desc *)&txr->tx_base[l]; +#ifndef IGB_MEDIA_RESET +/* at the same time as IGB_MEDIA_RESET was defined, the + * tx buffer descriptor was renamed, so use this to revert + * back to the old name. + */ +#define igb_tx_buf igb_tx_buffer +#endif struct igb_tx_buf *txbuf = &txr->tx_buffers[l]; int flags = ((slot->flags & NS_REPORT) || j == 0 || j == report_frequency) ? @@ -162,8 +135,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) u_int len = slot->len; if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - IGB_TX_UNLOCK(txr); return netmap_ring_reinit(kring); } @@ -223,8 +194,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* update avail to what the kernel knows */ ring->avail = kring->nr_hwavail; - if (do_lock) - IGB_TX_UNLOCK(txr); return 0; } @@ -233,7 +202,7 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. 
*/ static int -igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; @@ -241,16 +210,13 @@ igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; u_int k = ring->cur, resvd = ring->reserved; k = ring->cur; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - IGB_RX_LOCK(rxr); - /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -303,8 +269,6 @@ igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) void *addr = PNMB(slot, &paddr); if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - IGB_RX_UNLOCK(rxr); return netmap_ring_reinit(kring); } @@ -332,8 +296,6 @@ igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) } /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - IGB_RX_UNLOCK(rxr); return 0; } @@ -346,12 +308,11 @@ igb_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = igb_netmap_txsync; na.nm_rxsync = igb_netmap_rxsync; - na.nm_lock = igb_netmap_lock_wrapper; na.nm_register = igb_netmap_reg; netmap_attach(&na, adapter->num_queues); } diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index acef45f0d884..25e5c7c27e3e 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -39,35 +39,6 @@ #include <dev/netmap/netmap_kern.h> -static void -lem_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int ringid) -{ - struct adapter *adapter = ifp->if_softc; - - /* only one ring here so ignore the ringid */ - switch (what) { - case NETMAP_CORE_LOCK: - EM_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - EM_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - EM_TX_LOCK(adapter); - break; - case NETMAP_TX_UNLOCK: - EM_TX_UNLOCK(adapter); - break; - case NETMAP_RX_LOCK: - EM_RX_LOCK(adapter); - break; - case NETMAP_RX_UNLOCK: - EM_RX_UNLOCK(adapter); - break; - } -} - - /* * Register/unregister */ @@ -81,6 +52,8 @@ lem_netmap_reg(struct ifnet *ifp, int onoff) if (na == NULL) return EINVAL; + EM_CORE_LOCK(adapter); + lem_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ @@ -95,7 +68,7 @@ lem_netmap_reg(struct ifnet *ifp, int onoff) ifp->if_capenable |= IFCAP_NETMAP; na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; + ifp->if_transmit = netmap_transmit; lem_init_locked(adapter); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { @@ -114,6 +87,8 @@ fail: taskqueue_unblock(adapter->tq); // XXX do we need this ? #endif /* !EM_LEGCY_IRQ */ + EM_CORE_UNLOCK(adapter); + return (error); } @@ -122,7 +97,7 @@ fail: * Reconcile kernel and user view of the transmit ring. 
*/ static int -lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct netmap_adapter *na = NA(ifp); @@ -133,13 +108,16 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* generate an interrupt approximately every half ring */ int report_frequency = kring->nkr_num_slots >> 1; + ND("%s: hwofs %d, hwcur %d hwavail %d lease %d cur %d avail %d", + ifp->if_xname, + kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, + kring->nkr_hwlease, + ring->cur, ring->avail); /* take a copy of ring->cur now, and never read it again */ k = ring->cur; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_TX_LOCK(adapter); bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* @@ -147,6 +125,8 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * netmap ring, l is the corresponding index in the NIC ring. */ j = kring->nr_hwcur; + if (netmap_verbose > 255) + RD(5, "device %s send %d->%d", ifp->if_xname, j, k); if (j != k) { /* we have new packets to send */ l = netmap_idx_k2n(kring, j); for (n = 0; j != k; n++) { @@ -163,13 +143,12 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) u_int len = slot->len; if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - EM_TX_UNLOCK(adapter); return netmap_ring_reinit(kring); } + ND("slot %d NIC %d %s", j, l, nm_dump_buf(addr, len, 128, NULL)); slot->flags &= ~NS_REPORT; - if (slot->flags & NS_BUF_CHANGED) { + if (1 || slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(adapter->txtag, txbuf->map, addr); curr->buffer_addr = htole64(paddr); @@ -180,11 +159,13 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) htole32( adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); + ND("len %d kring %d nic %d", len, j, l); bus_dmamap_sync(adapter->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); j = (j == lim) ? 0 : j + 1; l = (l == lim) ? 0 : l + 1; } + ND("sent %d packets from %d, TDT now %d", n, kring->nr_hwcur, l); kring->nr_hwcur = k; /* the saved ring->cur */ kring->nr_hwavail -= n; @@ -199,6 +180,7 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* record completed transmissions using TDH */ l = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + ND("tdh is now %d", l); if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("bad TDH %d", l); l -= kring->nkr_num_slots; @@ -208,6 +190,9 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* some tx completed, increment hwavail. */ if (delta < 0) delta += kring->nkr_num_slots; + if (netmap_verbose > 255) + RD(5, "%s tx recover %d bufs", + ifp->if_xname, delta); adapter->next_tx_to_clean = l; kring->nr_hwavail += delta; } @@ -215,8 +200,6 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* update avail to what the kernel knows */ ring->avail = kring->nr_hwavail; - if (do_lock) - EM_TX_UNLOCK(adapter); return 0; } @@ -225,21 +208,19 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. 
*/ static int -lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; u_int k = ring->cur, resvd = ring->reserved; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_RX_LOCK(adapter); /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, @@ -251,6 +232,10 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) */ l = adapter->next_rx_desc_to_check; j = netmap_idx_n2k(kring, l); + ND("%s: next NIC %d kring %d (ofs %d), hwcur %d hwavail %d cur %d avail %d", + ifp->if_xname, + l, j, kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, + ring->cur, ring->avail); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; @@ -266,6 +251,8 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) D("bogus pkt size at %d", j); len = 0; } + ND("\n%s", nm_dump_buf(NMB(&ring->slot[j]), + len, 128, NULL)); ring->slot[j].len = len; ring->slot[j].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, @@ -300,8 +287,6 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) void *addr = PNMB(slot, &paddr); if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - EM_RX_UNLOCK(adapter); return netmap_ring_reinit(kring); } @@ -332,8 +317,6 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) } /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - EM_RX_UNLOCK(adapter); return 0; } @@ -346,12 +329,11 @@ lem_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = lem_netmap_txsync; na.nm_rxsync = lem_netmap_rxsync; - na.nm_lock = lem_netmap_lock_wrapper; na.nm_register = lem_netmap_reg; netmap_attach(&na, 1); } diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index f0f1f1969f4e..ac781ccb572e 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -39,33 +39,6 @@ /* - * wrapper to export locks to the generic code - * We should not use the tx/rx locks - */ -static void -re_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct rl_softc *adapter = ifp->if_softc; - - switch (what) { - case NETMAP_CORE_LOCK: - RL_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - RL_UNLOCK(adapter); - break; - - case NETMAP_TX_LOCK: - case NETMAP_RX_LOCK: - case NETMAP_TX_UNLOCK: - case NETMAP_RX_UNLOCK: - D("invalid lock call %d, no tx/rx locks here", what); - break; - } -} - - -/* * support for netmap register/unregisted. We are already under core lock. * only called on the first register or the last unregister. */ @@ -88,7 +61,7 @@ re_netmap_reg(struct ifnet *ifp, int onoff) /* save if_transmit to restore it later */ na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; + ifp->if_transmit = netmap_transmit; re_init_locked(adapter); @@ -111,7 +84,7 @@ fail: * Reconcile kernel and user view of the transmit ring. 
*/ static int -re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct rl_softc *sc = ifp->if_softc; struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; @@ -124,9 +97,6 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - RL_LOCK(sc); - /* Sync the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, @@ -164,8 +134,6 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) int len = slot->len; if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - RL_UNLOCK(sc); // XXX what about prodidx ? return netmap_ring_reinit(kring); } @@ -200,8 +168,6 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* start ? */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); } - if (do_lock) - RL_UNLOCK(sc); return 0; } @@ -210,7 +176,7 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. */ static int -re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct rl_softc *sc = ifp->if_softc; struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; @@ -218,15 +184,13 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; u_int k = ring->cur, resvd = ring->reserved; k = ring->cur; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - RL_LOCK(sc); /* XXX check sync modes */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, @@ -291,8 +255,6 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) void *addr = PNMB(slot, &paddr); if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - RL_UNLOCK(sc); return netmap_ring_reinit(kring); } @@ -323,8 +285,6 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) } /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - RL_UNLOCK(sc); return 0; } @@ -411,12 +371,11 @@ re_netmap_attach(struct rl_softc *sc) bzero(&na, sizeof(na)); na.ifp = sc->rl_ifp; - na.separate_locks = 0; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt; na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt; na.nm_txsync = re_netmap_txsync; na.nm_rxsync = re_netmap_rxsync; - na.nm_lock = re_netmap_lock_wrapper; na.nm_register = re_netmap_reg; netmap_attach(&na, 1); } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index be790502c276..fca1cf1e0a90 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -72,37 +72,6 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs"); -/* - * wrapper to export locks to the generic netmap code. 
- */ -static void -ixgbe_netmap_lock_wrapper(struct ifnet *_a, int what, u_int queueid) -{ - struct adapter *adapter = _a->if_softc; - - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - IXGBE_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - IXGBE_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - IXGBE_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - IXGBE_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} - static void set_crcstrip(struct ixgbe_hw *hw, int onoff) @@ -155,6 +124,7 @@ ixgbe_netmap_reg(struct ifnet *ifp, int onoff) if (na == NULL) return EINVAL; /* no netmap support here */ + IXGBE_CORE_LOCK(adapter); ixgbe_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ @@ -166,7 +136,7 @@ ixgbe_netmap_reg(struct ifnet *ifp, int onoff) /* save if_transmit and replace with our routine */ na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; + ifp->if_transmit = netmap_transmit; /* * reinitialize the adapter, now with netmap flag set, @@ -186,6 +156,7 @@ fail: ixgbe_init_locked(adapter); /* also enables intr */ } set_crcstrip(&adapter->hw, onoff); + IXGBE_CORE_UNLOCK(adapter); return (error); } @@ -213,12 +184,11 @@ fail: * * ring->avail is never used, only checked for bogus values. * - * do_lock is set iff the function is called from the ioctl handler. - * In this case, grab a lock around the body, and also reclaim transmitted + * I flags & FORCE_RECLAIM, reclaim transmitted * buffers irrespective of interrupt mitigation. */ static int -ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[ring_nr]; @@ -237,8 +207,6 @@ ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - IXGBE_TX_LOCK(txr); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -303,8 +271,6 @@ ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) */ if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { ring_reset: - if (do_lock) - IXGBE_TX_UNLOCK(txr); return netmap_ring_reinit(kring); } @@ -347,7 +313,7 @@ ring_reset: * In all cases kring->nr_kflags indicates which slot will be * checked upon a tx interrupt (nkr_num_slots means none). */ - if (do_lock) { + if (flags & NAF_FORCE_RECLAIM) { j = 1; /* forced reclaim, ignore interrupts */ kring->nr_kflags = kring->nkr_num_slots; } else if (kring->nr_hwavail > 0) { @@ -422,8 +388,6 @@ ring_reset: /* update avail to what the kernel knows */ ring->avail = kring->nr_hwavail; - if (do_lock) - IXGBE_TX_UNLOCK(txr); return 0; } @@ -442,10 +406,11 @@ ring_reset: * from nr_hwavail, make the descriptors available for the next reads, * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. * - * do_lock has a special meaning: please refer to txsync. + * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective + * of whether or not we received an interrupt. 
*/ static int -ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; @@ -453,14 +418,12 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; u_int k = ring->cur, resvd = ring->reserved; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - IXGBE_RX_LOCK(rxr); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -571,13 +534,9 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - IXGBE_RX_UNLOCK(rxr); return 0; ring_reset: - if (do_lock) - IXGBE_RX_UNLOCK(rxr); return netmap_ring_reinit(kring); } @@ -597,12 +556,11 @@ ixgbe_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; /* this card has separate rx/tx locks */ + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = ixgbe_netmap_txsync; na.nm_rxsync = ixgbe_netmap_rxsync; - na.nm_lock = ixgbe_netmap_lock_wrapper; na.nm_register = ixgbe_netmap_reg; netmap_attach(&na, adapter->num_queues); } diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index f868cfafeba2..160aedf8be9e 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -23,7 +23,10 @@ * SUCH DAMAGE. */ -#define NM_BRIDGE + +#ifdef __FreeBSD__ +#define TEST_STUFF // test code, does not compile yet on linux +#endif /* __FreeBSD__ */ /* * This module supports memory mapped access to network devices, @@ -52,18 +55,84 @@ * packets on the output interface. * 6. select() or poll() can be used to wait for events on individual * transmit or receive queues (or all queues for a given interface). - */ + * -#ifdef linux -#include "bsd_glue.h" -static netdev_tx_t linux_netmap_start(struct sk_buff *skb, struct net_device *dev); -#endif /* linux */ + SYNCHRONIZATION (USER) + +The netmap rings and data structures may be shared among multiple +user threads or even independent processes. +Any synchronization among those threads/processes is delegated +to the threads themselves. Only one thread at a time can be in +a system call on the same netmap ring. The OS does not enforce +this and only guarantees against system crashes in case of +invalid usage. + + LOCKING (INTERNAL) + +Within the kernel, access to the netmap rings is protected as follows: + +- a spinlock on each ring, to handle producer/consumer races on + RX rings attached to the host stack (against multiple host + threads writing from the host stack to the same ring), + and on 'destination' rings attached to a VALE switch + (i.e. RX rings in VALE ports, and TX rings in NIC/host ports) + protecting multiple active senders for the same destination) + +- an atomic variable to guarantee that there is at most one + instance of *_*xsync() on the ring at any time. + For rings connected to user file + descriptors, an atomic_test_and_set() protects this, and the + lock on the ring is not actually used. 
+ For NIC RX rings connected to a VALE switch, an atomic_test_and_set() + is also used to prevent multiple executions (the driver might indeed + already guarantee this). + For NIC TX rings connected to a VALE switch, the lock arbitrates + access to the queue (both when allocating buffers and when pushing + them out). + +- *xsync() should be protected against initializations of the card. + On FreeBSD most devices have the reset routine protected by + a RING lock (ixgbe, igb, em) or core lock (re). lem is missing + the RING protection on rx_reset(), this should be added. + + On linux there is an external lock on the tx path, which probably + also arbitrates access to the reset routine. XXX to be revised + +- a per-interface core_lock protecting access from the host stack + while interfaces may be detached from netmap mode. + XXX there should be no need for this lock if we detach the interfaces + only while they are down. + + +--- VALE SWITCH --- + +NMG_LOCK() serializes all modifications to switches and ports. +A switch cannot be deleted until all ports are gone. + +For each switch, an SX lock (RWlock on linux) protects +deletion of ports. When configuring or deleting a new port, the +lock is acquired in exclusive mode (after holding NMG_LOCK). +When forwarding, the lock is acquired in shared mode (without NMG_LOCK). +The lock is held throughout the entire forwarding cycle, +during which the thread may incur in a page fault. +Hence it is important that sleepable shared locks are used. + +On the rx ring, the per-port lock is grabbed initially to reserve +a number of slot in the ring, then the lock is released, +packets are copied from source to destination, and then +the lock is acquired again and the receive ring is updated. +(A similar thing is done on the tx ring for NIC and host stack +ports attached to the switch) -#ifdef __APPLE__ -#include "osx_glue.h" -#endif /* __APPLE__ */ + */ -#ifdef __FreeBSD__ +/* + * OS-specific code that is used only within this file. + * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ + +#if defined(__FreeBSD__) #include <sys/cdefs.h> /* prerequisite */ __FBSDID("$FreeBSD$"); @@ -84,8 +153,12 @@ __FBSDID("$FreeBSD$"); #include <sys/rwlock.h> #include <vm/vm.h> /* vtophys */ #include <vm/pmap.h> /* vtophys */ +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> #include <sys/socket.h> /* sockaddrs */ -#include <machine/bus.h> #include <sys/selinfo.h> #include <sys/sysctl.h> #include <net/if.h> @@ -93,17 +166,130 @@ __FBSDID("$FreeBSD$"); #include <net/bpf.h> /* BIOCIMMEDIATE */ #include <net/vnet.h> #include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> +#include <sys/refcount.h> -MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); -#endif /* __FreeBSD__ */ +#define prefetch(x) __builtin_prefetch(x) + +#define BDG_RWLOCK_T struct rwlock // struct rwlock + +#define BDG_RWINIT(b) \ + rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) +#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) +#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) +#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) +#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) +#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) +#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) + + +/* netmap global lock. 
+ * normally called within the user thread (upon a system call) + * or when a file descriptor or process is terminated + * (last close or last munmap) + */ + +#define NMG_LOCK_T struct mtx +#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) +#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) +#define NMG_LOCK() mtx_lock(&netmap_global_lock) +#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) +#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) + + +/* atomic operations */ +#include <machine/atomic.h> +#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) +#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) + + +#elif defined(linux) + +#include "bsd_glue.h" + +static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); + +static struct device_driver* +linux_netmap_find_driver(struct device *dev) +{ + struct device_driver *dd; + + while ( (dd = dev->driver) == NULL ) { + if ( (dev = dev->parent) == NULL ) + return NULL; + } + return dd; +} +static struct net_device* +ifunit_ref(const char *name) +{ + struct net_device *ifp = dev_get_by_name(&init_net, name); + struct device_driver *dd; + + if (ifp == NULL) + return NULL; + + if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) + goto error; + + if (!try_module_get(dd->owner)) + goto error; + + return ifp; +error: + dev_put(ifp); + return NULL; +} + +static void +if_rele(struct net_device *ifp) +{ + struct device_driver *dd; + dd = linux_netmap_find_driver(&ifp->dev); + dev_put(ifp); + if (dd) + module_put(dd->owner); +} + +// XXX a mtx would suffice here too 20130404 gl +#define NMG_LOCK_T struct semaphore +#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) +#define NMG_LOCK_DESTROY() +#define NMG_LOCK() down(&netmap_global_lock) +#define NMG_UNLOCK() up(&netmap_global_lock) +#define NMG_LOCK_ASSERT() // XXX to be completed + + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + -/* XXX the following variables must be deprecated and included in nm_mem */ +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + +/* + * The following variables are used by the drivers and replicate + * fields in the global memory pool. They only refer to buffers + * used by physical interfaces. + */ u_int netmap_total_buffers; u_int netmap_buf_size; -char *netmap_buffer_base; /* address of an invalid buffer */ +char *netmap_buffer_base; /* also address of an invalid buffer */ /* user-controlled variables */ int netmap_verbose; @@ -127,17 +313,178 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, int netmap_drop = 0; /* debugging */ int netmap_flags = 0; /* debug flags */ int netmap_fwd = 0; /* force transparent mode */ +int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); + +NMG_LOCK_T netmap_global_lock; + +/* + * protect against multiple threads using the same ring. + * also check that the ring has not been stopped. 
+ */ +#define NM_KR_BUSY 1 +#define NM_KR_STOPPED 2 +static void nm_kr_put(struct netmap_kring *kr); +static __inline int nm_kr_tryget(struct netmap_kring *kr) +{ + /* check a first time without taking the lock + * to avoid starvation for nm_kr_get() + */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + return NM_KR_STOPPED; + } + if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) + return NM_KR_BUSY; + /* check a second time with lock held */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + nm_kr_put(kr); + return NM_KR_STOPPED; + } + return 0; +} + +static __inline void nm_kr_put(struct netmap_kring *kr) +{ + NM_ATOMIC_CLEAR(&kr->nr_busy); +} + +static void nm_kr_get(struct netmap_kring *kr) +{ + while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) + tsleep(kr, 0, "NM_KR_GET", 4); +} + +static void nm_disable_ring(struct netmap_kring *kr) +{ + kr->nkr_stopped = 1; + nm_kr_get(kr); + mtx_lock(&kr->q_lock); + mtx_unlock(&kr->q_lock); + nm_kr_put(kr); +} -#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */ +void netmap_disable_all_rings(struct ifnet *ifp) +{ + struct netmap_adapter *na; + int i; + + if (!(ifp->if_capenable & IFCAP_NETMAP)) + return; + + na = NA(ifp); + + for (i = 0; i < na->num_tx_rings + 1; i++) { + nm_disable_ring(na->tx_rings + i); + selwakeuppri(&na->tx_rings[i].si, PI_NET); + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + nm_disable_ring(na->rx_rings + i); + selwakeuppri(&na->rx_rings[i].si, PI_NET); + } + selwakeuppri(&na->tx_si, PI_NET); + selwakeuppri(&na->rx_si, PI_NET); +} + +void netmap_enable_all_rings(struct ifnet *ifp) +{ + struct netmap_adapter *na; + int i; + + if (!(ifp->if_capenable & IFCAP_NETMAP)) + return; + + na = NA(ifp); + for (i = 0; i < na->num_tx_rings + 1; i++) { + D("enabling %p", na->tx_rings + i); + na->tx_rings[i].nkr_stopped = 0; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + D("enabling %p", na->rx_rings + i); + na->rx_rings[i].nkr_stopped = 0; + } +} + + +/* + * generic bound_checking function + */ +u_int +nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) +{ + u_int oldv = *v; + const char *op = NULL; + + if (dflt < lo) + dflt = lo; + if (dflt > hi) + dflt = hi; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; +} + +/* + * packet-dump function, user-supplied or static buffer. + * The destination buffer must be at least 30+4*len + */ +const char * +nm_dump_buf(char *p, int len, int lim, char *dst) +{ + static char _dst[8192]; + int i, j, i0; + static char hex[] ="0123456789abcdef"; + char *o; /* output position */ + +#define P_HI(x) hex[((x) & 0xf0)>>4] +#define P_LO(x) hex[((x) & 0xf)] +#define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') + if (!dst) + dst = _dst; + if (lim <= 0 || lim > len) + lim = len; + o = dst; + sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); + o += strlen(o); + /* hexdump routine */ + for (i = 0; i < lim; ) { + sprintf(o, "%5d: ", i); + o += strlen(o); + memset(o, ' ', 48); + i0 = i; + for (j=0; j < 16 && i < lim; i++, j++) { + o[j*3] = P_HI(p[i]); + o[j*3+1] = P_LO(p[i]); + } + i = i0; + for (j=0; j < 16 && i < lim; i++, j++) + o[j + 48] = P_C(p[i]); + o[j+48] = '\n'; + o += j+49; + } + *o = '\0'; +#undef P_HI +#undef P_LO +#undef P_C + return dst; +} /* * system parameters (most of them in netmap_kern.h) * NM_NAME prefix for switch port names, default "vale" - * NM_MAXPORTS number of ports + * NM_BDG_MAXPORTS number of ports * NM_BRIDGES max number of switches in the system. * XXX should become a sysctl or tunable * @@ -149,33 +496,29 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); * for rings and buffers. * The virtual interfaces use per-queue lock instead of core lock. * In the tx loop, we aggregate traffic in batches to make all operations - * faster. The batch size is NM_BDG_BATCH + * faster. The batch size is bridge_batch. */ #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ +#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ #define NM_BDG_HASH 1024 /* forwarding table entries */ #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_MULTISEG 64 /* max size of a chain of bufs */ +/* actual size of the tables */ +#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) +/* NM_FT_NULL terminates a list of slots in the ft */ +#define NM_FT_NULL NM_BDG_BATCH_MAX #define NM_BRIDGES 8 /* number of bridges */ -int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */ -SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); - -#ifdef linux - -#define refcount_acquire(_a) atomic_add(1, (atomic_t *)_a) -#define refcount_release(_a) atomic_dec_and_test((atomic_t *)_a) - -#else /* !linux */ - -#ifdef __FreeBSD__ -#include <sys/endian.h> -#include <sys/refcount.h> -#endif /* __FreeBSD__ */ - -#define prefetch(x) __builtin_prefetch(x) +/* + * bridge_batch is set via sysctl to the max batch size to be + * used in the bridge. The actual value may be larger as the + * last packet in the block may overflow the size. + */ +int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); -#endif /* !linux */ /* * These are used to handle reference counters for bridge ports. @@ -183,54 +526,79 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); #define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) #define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) +/* The bridge references the buffers using the device specific look up table */ +static inline void * +BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) +{ + struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; + uint32_t i = slot->buf_idx; + return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; +} + static void bdg_netmap_attach(struct netmap_adapter *); static int bdg_netmap_reg(struct ifnet *ifp, int onoff); -static int kern_netmap_regif(struct nmreq *nmr); +int kern_netmap_regif(struct nmreq *nmr); -/* per-tx-queue entry */ +/* + * Each transmit queue accumulates a batch of packets into + * a structure before forwarding. 
Packets to the same + * destination are put in a list using ft_next as a link field. + * ft_frags and ft_next are valid only on the first fragment. + */ struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; - uint16_t _ft_dst; /* dst port, unused */ + void *ft_buf; /* netmap or indirect buffer */ + uint8_t ft_frags; /* how many fragments (only on 1st frag) */ + uint8_t _ft_port; /* dst port (unused) */ uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src len */ + uint16_t ft_len; /* src fragment len */ uint16_t ft_next; /* next packet to same destination */ }; -/* We need to build a list of buffers going to each destination. - * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next - * to build the list, and struct nm_bdg_q below for the queue. - * The structure should compact because potentially we have a lot - * of destinations. +/* + * For each output interface, nm_bdg_q is used to construct a list. + * bq_len is the number of output buffers (we can have coalescing + * during the copy). */ struct nm_bdg_q { uint16_t bq_head; uint16_t bq_tail; + uint32_t bq_len; /* number of buffers */ }; +/* XXX revise this */ struct nm_hash_ent { uint64_t mac; /* the top 2 bytes are the epoch */ uint64_t ports; }; /* + * nm_bridge is a descriptor for a VALE switch. * Interfaces for a bridge are all in bdg_ports[]. * The array has fixed size, an empty entry does not terminate - * the search. But lookups only occur on attach/detach so we + * the search, but lookups only occur on attach/detach so we * don't mind if they are slow. * - * The bridge is non blocking on the transmit ports. + * The bridge is non blocking on the transmit ports: excess + * packets are dropped if there is no room on the output port. * * bdg_lock protects accesses to the bdg_ports array. * This is a rw lock (or equivalent). */ struct nm_bridge { - int namelen; /* 0 means free */ - /* XXX what is the proper alignment/layout ? */ - NM_RWLOCK_T bdg_lock; /* protects bdg_ports */ + BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ + int bdg_namelen; + uint32_t bdg_active_ports; /* 0 means free */ + char bdg_basename[IFNAMSIZ]; + + /* Indexes of active ports (up to active_ports) + * and all other remaining ports. + */ + uint8_t bdg_port_index[NM_BDG_MAXPORTS]; + struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; - char basename[IFNAMSIZ]; + /* * The function to decide the destination port. * It returns either of an index of the destination port, @@ -242,41 +610,43 @@ struct nm_bridge { */ bdg_lookup_fn_t nm_bdg_lookup; - /* the forwarding table, MAC+ports */ + /* the forwarding table, MAC+ports. + * XXX should be changed to an argument to be passed to + * the lookup function, and allocated on attach + */ struct nm_hash_ent ht[NM_BDG_HASH]; }; -struct nm_bridge nm_bridges[NM_BRIDGES]; -NM_LOCK_T netmap_bridge_mutex; -/* other OS will have these macros defined in their own glue code. */ +/* + * XXX in principle nm_bridges could be created dynamically + * Right now we have a static array and deletions are protected + * by an exclusive lock. + */ +struct nm_bridge nm_bridges[NM_BRIDGES]; -#ifdef __FreeBSD__ -#define BDG_LOCK() mtx_lock(&netmap_bridge_mutex) -#define BDG_UNLOCK() mtx_unlock(&netmap_bridge_mutex) -#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) -#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) -#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) -#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) -/* set/get variables. 
OS-specific macros may wrap these - * assignments into read/write lock or similar +/* + * A few function to tell which kind of port are we using. + * XXX should we hold a lock ? + * + * nma_is_vp() virtual port + * nma_is_host() port connected to the host stack + * nma_is_hw() port connected to a NIC */ -#define BDG_SET_VAR(lval, p) (lval = p) -#define BDG_GET_VAR(lval) (lval) -#define BDG_FREE(p) free(p, M_DEVBUF) -#endif /* __FreeBSD__ */ - -static __inline int +int nma_is_vp(struct netmap_adapter *na); +int nma_is_vp(struct netmap_adapter *na) { return na->nm_register == bdg_netmap_reg; } + static __inline int nma_is_host(struct netmap_adapter *na) { return na->nm_register == NULL; } + static __inline int nma_is_hw(struct netmap_adapter *na) { @@ -284,11 +654,12 @@ nma_is_hw(struct netmap_adapter *na) return !nma_is_vp(na) && !nma_is_host(na); } + /* - * Regarding holding a NIC, if the NIC is owned by the kernel + * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; * if the NIC is owned by a user, only users can share it. - * Evaluation must be done under NMA_LOCK(). + * Evaluation must be done under NMG_LOCK(). */ #define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) #define NETMAP_OWNED_BY_ANY(ifp) \ @@ -298,14 +669,22 @@ nma_is_hw(struct netmap_adapter *na) * NA(ifp)->bdg_port port index */ -// XXX only for multiples of 64 bytes, non overlapped. + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ static inline void pkt_copy(void *_src, void *_dst, int l) { uint64_t *src = _src; uint64_t *dst = _dst; if (unlikely(l >= 1024)) { - bcopy(src, dst, l); + memcpy(dst, src, l); return; } for (; likely(l > 0); l-=64) { @@ -321,8 +700,116 @@ pkt_copy(void *_src, void *_dst, int l) } +#ifdef TEST_STUFF +struct xxx { + char *name; + void (*fn)(uint32_t); +}; + + +static void +nm_test_defmtx(uint32_t n) +{ + uint32_t i; + struct mtx m; + mtx_init(&m, "test", NULL, MTX_DEF); + for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } + mtx_destroy(&m); + return; +} + +static void +nm_test_spinmtx(uint32_t n) +{ + uint32_t i; + struct mtx m; + mtx_init(&m, "test", NULL, MTX_SPIN); + for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } + mtx_destroy(&m); + return; +} + +static void +nm_test_rlock(uint32_t n) +{ + uint32_t i; + struct rwlock m; + rw_init(&m, "test"); + for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); } + rw_destroy(&m); + return; +} + +static void +nm_test_wlock(uint32_t n) +{ + uint32_t i; + struct rwlock m; + rw_init(&m, "test"); + for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); } + rw_destroy(&m); + return; +} + +static void +nm_test_slock(uint32_t n) +{ + uint32_t i; + struct sx m; + sx_init(&m, "test"); + for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); } + sx_destroy(&m); + return; +} + +static void +nm_test_xlock(uint32_t n) +{ + uint32_t i; + struct sx m; + sx_init(&m, "test"); + for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); } + sx_destroy(&m); + return; +} + + +struct xxx nm_tests[] = { + { "defmtx", nm_test_defmtx }, + { "spinmtx", nm_test_spinmtx }, + { "rlock", nm_test_rlock }, + { "wlock", nm_test_wlock }, + { "slock", nm_test_slock }, + { "xlock", nm_test_xlock }, +}; + +static int +nm_test(struct nmreq *nmr) +{ + uint32_t scale, n, 
test; + static int old_test = -1; + + test = nmr->nr_cmd; + scale = nmr->nr_offset; + n = sizeof(nm_tests) / sizeof(struct xxx) - 1; + if (test > n) { + D("test index too high, max %d", n); + return 0; + } + + if (old_test != test) { + D("test %s scale %d", nm_tests[test].name, scale); + old_test = test; + } + nm_tests[test].fn(scale); + return 0; +} +#endif /* TEST_STUFF */ + /* * locate a bridge among the existing ones. + * MUST BE CALLED WITH NMG_LOCK() + * * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. * We assume that this is called with a name of at least NM_NAME chars. */ @@ -332,8 +819,14 @@ nm_find_bridge(const char *name, int create) int i, l, namelen; struct nm_bridge *b = NULL; + NMG_LOCK_ASSERT(); + namelen = strlen(NM_NAME); /* base length */ - l = strlen(name); /* actual length */ + l = name ? strlen(name) : 0; /* actual length */ + if (l < namelen) { + D("invalid bridge name %s", name ? name : NULL); + return NULL; + } for (i = namelen + 1; i < l; i++) { if (name[i] == ':') { namelen = i; @@ -344,31 +837,35 @@ nm_find_bridge(const char *name, int create) namelen = IFNAMSIZ; ND("--- prefix is '%.*s' ---", namelen, name); - BDG_LOCK(); /* lookup the name, remember empty slot if there is one */ for (i = 0; i < NM_BRIDGES; i++) { struct nm_bridge *x = nm_bridges + i; - if (x->namelen == 0) { + if (x->bdg_active_ports == 0) { if (create && b == NULL) b = x; /* record empty slot */ - } else if (x->namelen != namelen) { + } else if (x->bdg_namelen != namelen) { continue; - } else if (strncmp(name, x->basename, namelen) == 0) { + } else if (strncmp(name, x->bdg_basename, namelen) == 0) { ND("found '%.*s' at %d", namelen, name, i); b = x; break; } } if (i == NM_BRIDGES && b) { /* name not found, can create entry */ - strncpy(b->basename, name, namelen); - b->namelen = namelen; + /* initialize the bridge */ + strncpy(b->bdg_basename, name, namelen); + ND("create new bridge %s with ports %d", b->bdg_basename, + b->bdg_active_ports); + b->bdg_namelen = namelen; + b->bdg_active_ports = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) + b->bdg_port_index[i] = i; /* set the default function */ b->nm_bdg_lookup = netmap_bdg_learning; /* reset the MAC address table */ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); } - BDG_UNLOCK(); return b; } @@ -382,6 +879,7 @@ nm_free_bdgfwd(struct netmap_adapter *na) int nrings, i; struct netmap_kring *kring; + NMG_LOCK_ASSERT(); nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; for (i = 0; i < nrings; i++) { @@ -404,11 +902,12 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) int nrings, l, i, num_dstq; struct netmap_kring *kring; + NMG_LOCK_ASSERT(); /* all port:rings + broadcast */ num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; - l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH; + l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; l += sizeof(struct nm_bdg_q) * num_dstq; - l += sizeof(uint16_t) * NM_BDG_BATCH; + l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; kring = nma_is_vp(na) ? 
na->tx_rings : na->rx_rings; @@ -422,9 +921,11 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) nm_free_bdgfwd(na); return ENOMEM; } - dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH); - for (j = 0; j < num_dstq; j++) - dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH; + dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + for (j = 0; j < num_dstq; j++) { + dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; + dstq[j].bq_len = 0; + } kring[i].nkr_ft = ft; } if (nma_is_hw(na)) @@ -432,8 +933,6 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) return 0; } -#endif /* NM_BRIDGE */ - /* * Fetch configuration from the device, to cope with dynamic @@ -479,9 +978,15 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -/*------------- memory allocator -----------------*/ -#include "netmap_mem2.c" -/*------------ end of memory allocator ----------*/ +static struct netmap_if * +netmap_if_new(const char *ifname, struct netmap_adapter *na) +{ + if (netmap_update_config(na)) { + /* configuration mismatch, report and fail */ + return NULL; + } + return netmap_mem_if_new(ifname, na); +} /* Structure associated to each thread which registered an interface. @@ -507,49 +1012,99 @@ netmap_update_config(struct netmap_adapter *na) * private structure is destroyed. */ struct netmap_priv_d { - struct netmap_if * volatile np_nifp; /* netmap interface descriptor. */ + struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ - struct ifnet *np_ifp; /* device for which we hold a reference */ + struct ifnet *np_ifp; /* device for which we hold a ref. */ int np_ringid; /* from the ioctl */ u_int np_qfirst, np_qlast; /* range of rings to scan */ uint16_t np_txpoll; - unsigned long ref_done; /* use with NMA_LOCK held */ + struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ +#ifdef __FreeBSD__ + int np_refcount; /* use with NMG_LOCK held */ +#endif /* __FreeBSD__ */ }; - +/* grab a reference to the memory allocator, if we don't have one already. The + * reference is taken from the netmap_adapter registered with the priv. + * + */ static int -netmap_get_memory(struct netmap_priv_d* p) +netmap_get_memory_locked(struct netmap_priv_d* p) { + struct netmap_mem_d *nmd; int error = 0; - NMA_LOCK(); - if (!p->ref_done) { - error = netmap_memory_finalize(); + + if (p->np_ifp == NULL) { + if (!netmap_mmap_unreg) + return ENODEV; + /* for compatibility with older versions of the API + * we use the global allocator when no interface has been + * registered + */ + nmd = &nm_mem; + } else { + nmd = NA(p->np_ifp)->nm_mem; + } + if (p->np_mref == NULL) { + error = netmap_mem_finalize(nmd); if (!error) - p->ref_done = 1; + p->np_mref = nmd; + } else if (p->np_mref != nmd) { + /* a virtual port has been registered, but previous + * syscalls already used the global allocator. + * We cannot continue + */ + error = ENODEV; } - NMA_UNLOCK(); return error; } +static int +netmap_get_memory(struct netmap_priv_d* p) +{ + int error; + NMG_LOCK(); + error = netmap_get_memory_locked(p); + NMG_UNLOCK(); + return error; +} + +static int +netmap_have_memory_locked(struct netmap_priv_d* p) +{ + return p->np_mref != NULL; +} + +static void +netmap_drop_memory_locked(struct netmap_priv_d* p) +{ + if (p->np_mref) { + netmap_mem_deref(p->np_mref); + p->np_mref = NULL; + } +} + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and * revert to normal operation. We expect that np_ifp has not gone. + * The second argument is the nifp to work on. 
In some cases it is + * not attached yet to the netmap_priv_d so we need to pass it as + * a separate argument. */ -/* call with NMA_LOCK held */ +/* call with NMG_LOCK held */ static void -netmap_dtor_locked(void *data) +netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { - struct netmap_priv_d *priv = data; struct ifnet *ifp = priv->np_ifp; struct netmap_adapter *na = NA(ifp); - struct netmap_if *nifp = priv->np_nifp; + NMG_LOCK_ASSERT(); na->refcount--; if (na->refcount <= 0) { /* last instance */ - u_int i, j, lim; + u_int i; if (netmap_verbose) D("deleting last instance for %s", ifp->if_xname); @@ -558,59 +1113,54 @@ netmap_dtor_locked(void *data) * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. + * XXX The file may be closed in a thread while + * another thread is using it. + * Linux keeps the file opened until the last reference + * by any outstanding ioctl/poll or mmap is gone. + * FreeBSD does not track mmap()s (but we do) and + * wakes up any sleeping poll(). Need to check what + * happens if the close() occurs while a concurrent + * syscall is running. */ na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ /* Wake up any sleeping threads. netmap_poll will * then return POLLERR + * XXX The wake up now must happen during *_down(), when + * we order all activities to stop. -gl */ - for (i = 0; i < na->num_tx_rings + 1; i++) - selwakeuppri(&na->tx_rings[i].si, PI_NET); - for (i = 0; i < na->num_rx_rings + 1; i++) - selwakeuppri(&na->rx_rings[i].si, PI_NET); - selwakeuppri(&na->tx_si, PI_NET); - selwakeuppri(&na->rx_si, PI_NET); -#ifdef NM_BRIDGE nm_free_bdgfwd(na); -#endif /* NM_BRIDGE */ - /* release all buffers */ for (i = 0; i < na->num_tx_rings + 1; i++) { - struct netmap_ring *ring = na->tx_rings[i].ring; - lim = na->tx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(nifp, ring->slot[j].buf_idx); - /* knlist_destroy(&na->tx_rings[i].si.si_note); */ mtx_destroy(&na->tx_rings[i].q_lock); } for (i = 0; i < na->num_rx_rings + 1; i++) { - struct netmap_ring *ring = na->rx_rings[i].ring; - lim = na->rx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(nifp, ring->slot[j].buf_idx); - /* knlist_destroy(&na->rx_rings[i].si.si_note); */ mtx_destroy(&na->rx_rings[i].q_lock); } /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - netmap_free_rings(na); if (nma_is_hw(na)) SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; } - netmap_if_free(nifp); + /* + * netmap_mem_if_delete() deletes the nifp, and if this is + * the last instance also buffers, rings and krings. + */ + netmap_mem_if_delete(na, nifp); } -/* we assume netmap adapter exists */ +/* we assume netmap adapter exists + * Called with NMG_LOCK held + */ static void nm_if_rele(struct ifnet *ifp) { -#ifndef NM_BRIDGE - if_rele(ifp); -#else /* NM_BRIDGE */ - int i, full = 0, is_hw; + int i, is_hw, hw, sw, lim; struct nm_bridge *b; struct netmap_adapter *na; + uint8_t tmp[NM_BDG_MAXPORTS]; + NMG_LOCK_ASSERT(); /* I can be called not only for get_ifp()-ed references where netmap's * capability is guaranteed, but also for non-netmap-capable NICs. 
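+ * In the latter case we simply drop the plain ifnet reference.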
*/ @@ -618,202 +1168,278 @@ nm_if_rele(struct ifnet *ifp) if_rele(ifp); return; } - if (!DROP_BDG_REF(ifp)) - return; - na = NA(ifp); b = na->na_bdg; is_hw = nma_is_hw(na); - BDG_WLOCK(b); - ND("want to disconnect %s from the bridge", ifp->if_xname); - full = 0; - /* remove the entry from the bridge, also check - * if there are any leftover interfaces - * XXX we should optimize this code, e.g. going directly - * to na->bdg_port, and having a counter of ports that - * are connected. But it is not in a critical path. - * In NIC's case, index of sw na is always higher than hw na + ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); + + if (!DROP_BDG_REF(ifp)) + return; + + /* + New algorithm: + make a copy of bdg_port_index; + lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port + in the array of bdg_port_index, replacing them with + entries from the bottom of the array; + decrement bdg_active_ports; + acquire BDG_WLOCK() and copy back the array. */ - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]); - - if (tmp == na) { - /* disconnect from bridge */ - BDG_SET_VAR(b->bdg_ports[i], NULL); - na->na_bdg = NULL; - if (is_hw && SWNA(ifp)->na_bdg) { - /* disconnect sw adapter too */ - int j = SWNA(ifp)->bdg_port; - BDG_SET_VAR(b->bdg_ports[j], NULL); - SWNA(ifp)->na_bdg = NULL; - } - } else if (tmp != NULL) { - full = 1; + + hw = NA(ifp)->bdg_port; + sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; + lim = b->bdg_active_ports; + + ND("detach %d and %d (lim %d)", hw, sw, lim); + /* make a copy of the list of active ports, update it, + * and then copy back within BDG_WLOCK(). + */ + memcpy(tmp, b->bdg_port_index, sizeof(tmp)); + for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { + if (hw >= 0 && tmp[i] == hw) { + ND("detach hw %d at %d", hw, i); + lim--; /* point to last active port */ + tmp[i] = tmp[lim]; /* swap with i */ + tmp[lim] = hw; /* now this is inactive */ + hw = -1; + } else if (sw >= 0 && tmp[i] == sw) { + ND("detach sw %d at %d", sw, i); + lim--; + tmp[i] = tmp[lim]; + tmp[lim] = sw; + sw = -1; + } else { + i++; } } + if (hw >= 0 || sw >= 0) { + D("XXX delete failed hw %d sw %d, should panic...", hw, sw); + } + hw = NA(ifp)->bdg_port; + sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; + + BDG_WLOCK(b); + b->bdg_ports[hw] = NULL; + na->na_bdg = NULL; + if (sw >= 0) { + b->bdg_ports[sw] = NULL; + SWNA(ifp)->na_bdg = NULL; + } + memcpy(b->bdg_port_index, tmp, sizeof(tmp)); + b->bdg_active_ports = lim; BDG_WUNLOCK(b); - if (full == 0) { - ND("marking bridge %d as free", b - nm_bridges); - b->namelen = 0; + + ND("now %d active ports", lim); + if (lim == 0) { + ND("marking bridge %s as free", b->bdg_basename); b->nm_bdg_lookup = NULL; } - if (na->na_bdg) { /* still attached to the bridge */ - D("ouch, cannot find ifp to remove"); - } else if (is_hw) { + + if (is_hw) { if_rele(ifp); } else { + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); bzero(na, sizeof(*na)); free(na, M_DEVBUF); bzero(ifp, sizeof(*ifp)); free(ifp, M_DEVBUF); } -#endif /* NM_BRIDGE */ } -static void -netmap_dtor(void *data) + +/* + * returns 1 if this is the last instance and we can free priv + */ +static int +netmap_dtor_locked(struct netmap_priv_d *priv) { - struct netmap_priv_d *priv = data; struct ifnet *ifp = priv->np_ifp; - NMA_LOCK(); +#ifdef __FreeBSD__ + /* + * np_refcount is the number of active mmaps on + * this file descriptor + */ + if (--priv->np_refcount > 0) { + return 0; + } +#endif /* __FreeBSD__ */ + if (ifp) { + netmap_do_unregif(priv, priv->np_nifp); + } + netmap_drop_memory_locked(priv); if (ifp) { - struct netmap_adapter *na = NA(ifp); - - if (na->na_bdg) - BDG_WLOCK(na->na_bdg); - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); - netmap_dtor_locked(data); - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if (na->na_bdg) - BDG_WUNLOCK(na->na_bdg); - nm_if_rele(ifp); /* might also destroy *na */ } - if (priv->ref_done) { - netmap_memory_deref(); + return 1; +} + +static void +netmap_dtor(void *data) +{ + struct netmap_priv_d *priv = data; + int last_instance; + + NMG_LOCK(); + last_instance = netmap_dtor_locked(priv); + NMG_UNLOCK(); + if (last_instance) { + bzero(priv, sizeof(*priv)); /* for safety */ + free(priv, M_DEVBUF); } - NMA_UNLOCK(); - bzero(priv, sizeof(*priv)); /* XXX for safety */ - free(priv, M_DEVBUF); } #ifdef __FreeBSD__ -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/vm_object.h> -#include <vm/vm_page.h> -#include <vm/vm_pager.h> -#include <vm/uma.h> /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. - * XXX but then ? Do we really use the information ? - * Need to investigate. 
*/ -static struct cdev_pager_ops saved_cdev_pager_ops; +struct netmap_vm_handle_t { + struct cdev *dev; + struct netmap_priv_d *priv; +}; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { - if (netmap_verbose) - D("first mmap for %p", handle); - return saved_cdev_pager_ops.cdev_pg_ctor(handle, - size, prot, foff, cred, color); + struct netmap_vm_handle_t *vmh = handle; + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); + dev_ref(vmh->dev); + return 0; } static void netmap_dev_pager_dtor(void *handle) { - saved_cdev_pager_ops.cdev_pg_dtor(handle); - ND("ready to release memory for %p", handle); + struct netmap_vm_handle_t *vmh = handle; + struct cdev *dev = vmh->dev; + struct netmap_priv_d *priv = vmh->priv; + D("handle %p", handle); + netmap_dtor(priv); + free(vmh, M_DEVBUF); + dev_rel(dev); +} + +static int +netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, + int prot, vm_page_t *mres) +{ + struct netmap_vm_handle_t *vmh = object->handle; + struct netmap_priv_d *priv = vmh->priv; + vm_paddr_t paddr; + vm_page_t page; + vm_memattr_t memattr; + vm_pindex_t pidx; + + ND("object %p offset %jd prot %d mres %p", + object, (intmax_t)offset, prot, mres); + memattr = object->memattr; + pidx = OFF_TO_IDX(offset); + paddr = netmap_mem_ofstophys(priv->np_mref, offset); + if (paddr == 0) + return VM_PAGER_FAIL; + + if (((*mres)->flags & PG_FICTITIOUS) != 0) { + /* + * If the passed in result page is a fake page, update it with + * the new physical address. + */ + page = *mres; + vm_page_updatefake(page, paddr, memattr); + } else { + /* + * Replace the passed in reqpage page with our own fake page and + * free up the all of the original pages. + */ +#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ +#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK +#define VM_OBJECT_WLOCK VM_OBJECT_LOCK +#endif /* VM_OBJECT_WUNLOCK */ + + VM_OBJECT_WUNLOCK(object); + page = vm_page_getfake(paddr, memattr); + VM_OBJECT_WLOCK(object); + vm_page_lock(*mres); + vm_page_free(*mres); + vm_page_unlock(*mres); + *mres = page; + vm_page_insert(page, object, pidx); + } + page->valid = VM_PAGE_BITS_ALL; + return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, - .cdev_pg_fault = NULL, + .cdev_pg_fault = netmap_dev_pager_fault, }; -// XXX check whether we need netmap_mmap_single _and_ netmap_mmap static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { + int error; + struct netmap_vm_handle_t *vmh; + struct netmap_priv_d *priv; vm_object_t obj; - ND("cdev %p foff %jd size %jd objp %p prot %d", cdev, + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); - obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, - curthread->td_ucred); - ND("returns obj %p", obj); - if (obj == NULL) - return EINVAL; - if (saved_cdev_pager_ops.cdev_pg_fault == NULL) { - ND("initialize cdev_pager_ops"); - saved_cdev_pager_ops = *(obj->un_pager.devp.ops); - netmap_cdev_pager_ops.cdev_pg_fault = - saved_cdev_pager_ops.cdev_pg_fault; - }; - obj->un_pager.devp.ops = &netmap_cdev_pager_ops; - *objp = obj; - return 0; -} -#endif /* __FreeBSD__ */ - - -/* - * mmap(2) support for the "netmap" device. 
- * - * Expose all the memory previously allocated by our custom memory - * allocator: this way the user has only to issue a single mmap(2), and - * can work on all the data structures flawlessly. - * - * Return 0 on success, -1 otherwise. - */ - -#ifdef __FreeBSD__ -static int -netmap_mmap(__unused struct cdev *dev, -#if __FreeBSD_version < 900000 - vm_offset_t offset, vm_paddr_t *paddr, int nprot -#else - vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, - __unused vm_memattr_t *memattr -#endif - ) -{ - int error = 0; - struct netmap_priv_d *priv; + + vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (vmh == NULL) + return ENOMEM; + vmh->dev = cdev; - if (nprot & PROT_EXEC) - return (-1); // XXX -1 or EINVAL ? + NMG_LOCK(); + error = devfs_get_cdevpriv((void**)&priv); + if (error) + goto err_unlock; + vmh->priv = priv; + priv->np_refcount++; + NMG_UNLOCK(); - error = devfs_get_cdevpriv((void **)&priv); - if (error == EBADF) { /* called on fault, memory is initialized */ - ND(5, "handling fault at ofs 0x%x", offset); - error = 0; - } else if (error == 0) /* make sure memory is set */ - error = netmap_get_memory(priv); + error = netmap_get_memory(priv); if (error) - return (error); + goto err_deref; - ND("request for offset 0x%x", (uint32_t)offset); - *paddr = netmap_ofstophys(offset); + obj = cdev_pager_allocate(vmh, OBJT_DEVICE, + &netmap_cdev_pager_ops, objsize, prot, + *foff, NULL); + if (obj == NULL) { + D("cdev_pager_allocate failed"); + error = EINVAL; + goto err_deref; + } + + *objp = obj; + return 0; - return (*paddr ? 0 : ENOMEM); +err_deref: + NMG_LOCK(); + priv->np_refcount--; +err_unlock: + NMG_UNLOCK(); +// err: + free(vmh, M_DEVBUF); + return error; } +// XXX can we remove this ? static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { @@ -830,6 +1456,12 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) struct netmap_priv_d *priv; int error; + (void)dev; + (void)oflags; + (void)devtype; + (void)td; + + // XXX wait or nowait ? priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, M_NOWAIT | M_ZERO); if (priv == NULL) @@ -839,6 +1471,8 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) if (error) return error; + priv->np_refcount = 1; + return 0; } #endif /* __FreeBSD__ */ @@ -900,24 +1534,30 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) * the queue is drained in all cases. * XXX handle reserved */ - int k = kring->ring->cur - kring->ring->reserved; - u_int n, lim = kring->nkr_num_slots - 1; + u_int lim = kring->nkr_num_slots - 1; struct mbuf *m, *tail = q->tail; - - if (k < 0) - k = k + kring->nkr_num_slots; + u_int k = kring->ring->cur, n = kring->ring->reserved; + struct netmap_mem_d *nmd = kring->na->nm_mem; + + /* compute the final position, ring->cur - ring->reserved */ + if (n > 0) { + if (k < n) + k += kring->nkr_num_slots; + k += n; + } for (n = kring->nr_hwcur; n != k;) { struct netmap_slot *slot = &kring->ring->slot[n]; - n = (n == lim) ? 0 : n + 1; + n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { + if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { D("bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? 
- m = m_devget(NMB(slot), slot->len, 0, kring->na->ifp, NULL); + /* XXX adapt to the case of a multisegment packet */ + m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); if (m == NULL) break; @@ -934,9 +1574,17 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) /* - * called under main lock to send packets from the host to the NIC * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down. We scan the tx rings, which have just been + * to be sent down to the NIC. + * We need to use the queue lock on the source (host RX ring) + * to protect against netmap_transmit. + * If the user is well behaved we do not need to acquire locks + * on the destination(s), + * so we only need to make sure that there are no panics because + * of user errors. + * XXX verify + * + * We scan the tx rings, which have just been * flushed so nr_hwcur == cur. Pushing packets down means * increment cur and decrement avail. * XXX to be verified @@ -946,14 +1594,23 @@ netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_kring *k1 = &na->tx_rings[0]; - int i, howmany, src_lim, dst_lim; + u_int i, howmany, src_lim, dst_lim; + + /* XXX we should also check that the carrier is on */ + if (kring->nkr_stopped) + return; + + mtx_lock(&kring->q_lock); + + if (kring->nkr_stopped) + goto out; howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ - src_lim = kring->nkr_num_slots; + src_lim = kring->nkr_num_slots - 1; for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots; + dst_lim = k1->nkr_num_slots - 1; while (howmany > 0 && k1->ring->avail > 0) { struct netmap_slot *src, *dst, tmp; src = &kring->ring->slot[kring->nr_hwcur]; @@ -969,40 +1626,45 @@ netmap_sw_to_nic(struct netmap_adapter *na) dst->len, dst->buf_idx, kring->nr_hwcur, k1->ring->cur); - if (++kring->nr_hwcur >= src_lim) - kring->nr_hwcur = 0; + kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); howmany--; kring->nr_hwavail--; - if (++k1->ring->cur >= dst_lim) - k1->ring->cur = 0; + k1->ring->cur = nm_next(k1->ring->cur, dst_lim); k1->ring->avail--; } kring->ring->cur = kring->nr_hwcur; // XXX - k1++; + k1++; // XXX why? } +out: + mtx_unlock(&kring->q_lock); } /* - * netmap_sync_to_host() passes packets up. We are called from a + * netmap_txsync_to_host() passes packets up. We are called from a * system call in user process context, and the only contention * can be among multiple user threads erroneously calling * this routine concurrently. */ static void -netmap_sync_to_host(struct netmap_adapter *na) +netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; u_int k, lim = kring->nkr_num_slots - 1; - struct mbq q = { NULL, NULL }; + struct mbq q = { NULL, NULL, 0 }; + if (nm_kr_tryget(kring)) { + D("ring %p busy (user error)", kring); + return; + } k = ring->cur; if (k > lim) { + D("invalid ring index in stack TX kring %p", kring); netmap_ring_reinit(kring); + nm_kr_put(kring); return; } - // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); /* Take packets from hwcur to cur and pass them up. * In case of no buffers we give up. 
At the end of the loop, @@ -1011,26 +1673,32 @@ netmap_sync_to_host(struct netmap_adapter *na) netmap_grab_packets(kring, &q, 1); kring->nr_hwcur = k; kring->nr_hwavail = ring->avail = lim; - // na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); + nm_kr_put(kring); netmap_send_up(na->ifp, q.head); } +/* + * This is the 'txsync' handler to send from a software ring to the + * host stack. + */ /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ static int -netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock) +netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) { (void)ring_nr; - (void)do_lock; - netmap_sync_to_host(NA(ifp)); + (void)flags; + if (netmap_verbose > 255) + RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); + netmap_txsync_to_host(NA(ifp)); return 0; } /* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_start() so we + * They have been put in the queue by netmap_transmit() so we * need to protect access to the kring using a lock. * * This routine also does the selrecord if called from the poll handler @@ -1040,7 +1708,7 @@ netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock) * as an additional hidden argument. */ static void -netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) +netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; @@ -1048,10 +1716,19 @@ netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) u_int k = ring->cur, resvd = ring->reserved; (void)pwait; /* disable unused warnings */ - na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); + + if (kring->nkr_stopped) /* check a first time without lock */ + return; + + /* XXX as an optimization we could reuse na->core_lock */ + mtx_lock(&kring->q_lock); + + if (kring->nkr_stopped) /* check again with lock held */ + goto unlock_out; + if (k >= lim) { netmap_ring_reinit(kring); - return; + goto unlock_out; } /* new packets are already set in nr_hwavail */ /* skip past packets that userspace has released */ @@ -1073,17 +1750,23 @@ netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) selrecord(td, &kring->si); if (k && (netmap_verbose & NM_VERB_HOST)) D("%d pkts from stack", k); - na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); +unlock_out: + + mtx_unlock(&kring->q_lock); } /* + * MUST BE CALLED UNDER NMG_LOCK() + * * get a refcounted reference to an interface. + * This is always called in the execution of an ioctl(). + * * Return ENXIO if the interface does not exist, EINVAL if netmap * is not supported by the interface. * If successful, hold a reference. * - * During the NIC is attached to a bridge, reference is managed + * When the NIC is attached to a bridge, reference is managed * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC * is detached from the bridge, then ifp's refcount is dropped (this @@ -1094,143 +1777,166 @@ netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) * is acquired by this function, it must be released using nm_if_rele(). 
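+ *
+ * Assumed call pattern (cf. nm_bdg_attach() below):
+ *
+ *	NMG_LOCK();
+ *	error = get_ifp(nmr, &ifp, 1 /* create */);
+ *	if (error == 0) {
+ *		... use the interface ...
+ *		nm_if_rele(ifp);
+ *	}
+ *	NMG_UNLOCK();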
*/ static int -get_ifp(struct nmreq *nmr, struct ifnet **ifp) +get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) { const char *name = nmr->nr_name; int namelen = strlen(name); -#ifdef NM_BRIDGE struct ifnet *iter = NULL; int no_prefix = 0; - do { - struct nm_bridge *b; - struct netmap_adapter *na; - int i, cand = -1, cand2 = -1; + /* first try to see if this is a bridge port. */ + struct nm_bridge *b; + struct netmap_adapter *na; + int i, j, cand = -1, cand2 = -1; + int needed; + + NMG_LOCK_ASSERT(); + *ifp = NULL; /* default */ + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + no_prefix = 1; /* no VALE prefix */ + goto no_bridge_port; + } - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { - no_prefix = 1; - break; - } - b = nm_find_bridge(name, 1 /* create a new one if no exist */ ); - if (b == NULL) { - D("no bridges available for '%s'", name); - return (ENXIO); - } - /* Now we are sure that name starts with the bridge's name */ - BDG_WLOCK(b); - /* lookup in the local list of ports */ - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - na = BDG_GET_VAR(b->bdg_ports[i]); - if (na == NULL) { - if (cand == -1) - cand = i; /* potential insert point */ - else if (cand2 == -1) - cand2 = i; /* for host stack */ - continue; - } - iter = na->ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(iter->if_xname, name) /* virtual port */ || - (namelen > b->namelen && !strcmp(iter->if_xname, - name + b->namelen + 1)) /* NIC */) { - ADD_BDG_REF(iter); - ND("found existing interface"); - BDG_WUNLOCK(b); - break; - } + b = nm_find_bridge(name, create); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (ENXIO); + } + + /* Now we are sure that name starts with the bridge's name, + * lookup the port in the bridge. We need to scan the entire + * list. It is not important to hold a WLOCK on the bridge + * during the search because NMG_LOCK already guarantees + * that there are no other possible writers. + */ + + /* lookup in the local list of ports */ + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + na = b->bdg_ports[i]; + // KASSERT(na != NULL); + iter = na->ifp; + /* XXX make sure the name only contains one : */ + if (!strcmp(iter->if_xname, name) /* virtual port */ || + (namelen > b->bdg_namelen && !strcmp(iter->if_xname, + name + b->bdg_namelen + 1)) /* NIC */) { + ADD_BDG_REF(iter); + ND("found existing if %s refs %d", name, + NA(iter)->na_bdg_refcount); + *ifp = iter; + /* we are done, this is surely netmap capable */ + return 0; } - if (i < NM_BDG_MAXPORTS) /* already unlocked */ - break; - if (cand == -1) { - D("bridge full, cannot create new port"); -no_port: - BDG_WUNLOCK(b); - *ifp = NULL; + } + /* not found, should we create it? 
*/ + if (!create) + return ENXIO; + /* yes we should, see if we have space to attach entries */ + needed = 2; /* in some cases we only need 1 */ + if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { + D("bridge full %d, cannot create new port", b->bdg_active_ports); + return EINVAL; + } + /* record the next two ports available, but do not allocate yet */ + cand = b->bdg_port_index[b->bdg_active_ports]; + cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; + ND("+++ bridge %s port %s used %d avail %d %d", + b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + + /* + * try see if there is a matching NIC with this name + * (after the bridge's name) + */ + iter = ifunit_ref(name + b->bdg_namelen + 1); + if (!iter) { /* this is a virtual port */ + /* Create a temporary NA with arguments, then + * bdg_netmap_attach() will allocate the real one + * and attach it to the ifp + */ + struct netmap_adapter tmp_na; + + if (nmr->nr_cmd) { + /* nr_cmd must be 0 for a virtual port */ return EINVAL; } - ND("create new bridge port %s", name); - /* - * create a struct ifnet for the new port. - * The forwarding table is attached to the kring(s). + bzero(&tmp_na, sizeof(tmp_na)); + /* bound checking */ + tmp_na.num_tx_rings = nmr->nr_tx_rings; + nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back + tmp_na.num_rx_rings = nmr->nr_rx_rings; + nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back + nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + tmp_na.num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + tmp_na.num_rx_desc = nmr->nr_rx_slots; + + /* create a struct ifnet for the new port. + * need M_NOWAIT as we are under nma_lock */ - /* - * try see if there is a matching NIC with this name - * (after the bridge's name) - */ - iter = ifunit_ref(name + b->namelen + 1); - if (!iter) { /* this is a virtual port */ - /* Create a temporary NA with arguments, then - * bdg_netmap_attach() will allocate the real one - * and attach it to the ifp - */ - struct netmap_adapter tmp_na; - - if (nmr->nr_cmd) /* nr_cmd must be for a NIC */ - goto no_port; - bzero(&tmp_na, sizeof(tmp_na)); - /* bound checking */ - if (nmr->nr_tx_rings < 1) - nmr->nr_tx_rings = 1; - if (nmr->nr_tx_rings > NM_BDG_MAXRINGS) - nmr->nr_tx_rings = NM_BDG_MAXRINGS; - tmp_na.num_tx_rings = nmr->nr_tx_rings; - if (nmr->nr_rx_rings < 1) - nmr->nr_rx_rings = 1; - if (nmr->nr_rx_rings > NM_BDG_MAXRINGS) - nmr->nr_rx_rings = NM_BDG_MAXRINGS; - tmp_na.num_rx_rings = nmr->nr_rx_rings; - - iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!iter) - goto no_port; - strcpy(iter->if_xname, name); - tmp_na.ifp = iter; - /* bdg_netmap_attach creates a struct netmap_adapter */ - bdg_netmap_attach(&tmp_na); - } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ - /* cannot attach the NIC that any user or another - * bridge already holds. 
- */ - if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) { -ifunit_rele: - if_rele(iter); /* don't detach from bridge */ - goto no_port; - } - /* bind the host stack to the bridge */ - if (nmr->nr_arg1 == NETMAP_BDG_HOST) { - BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter)); - SWNA(iter)->bdg_port = cand2; - SWNA(iter)->na_bdg = b; - } - } else /* not a netmap-capable NIC */ - goto ifunit_rele; - na = NA(iter); - na->bdg_port = cand; - /* bind the port to the bridge (virtual ports are not active) */ - BDG_SET_VAR(b->bdg_ports[cand], na); - na->na_bdg = b; - ADD_BDG_REF(iter); - BDG_WUNLOCK(b); - ND("attaching virtual bridge %p", b); - } while (0); + iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!iter) + return ENOMEM; + + strcpy(iter->if_xname, name); + tmp_na.ifp = iter; + /* bdg_netmap_attach creates a struct netmap_adapter */ + bdg_netmap_attach(&tmp_na); + cand2 = -1; /* only need one port */ + } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ + /* make sure the NIC is not already in use */ + if (NETMAP_OWNED_BY_ANY(iter)) { + D("NIC %s busy, cannot attach to bridge", + iter->if_xname); + if_rele(iter); /* don't detach from bridge */ + return EINVAL; + } + if (nmr->nr_arg1 != NETMAP_BDG_HOST) + cand2 = -1; /* only need one port */ + } else { /* not a netmap-capable NIC */ + if_rele(iter); /* don't detach from bridge */ + return EINVAL; + } + na = NA(iter); + + BDG_WLOCK(b); + na->bdg_port = cand; + ND("NIC %p to bridge port %d", NA(iter), cand); + /* bind the port to the bridge (virtual ports are not active) */ + b->bdg_ports[cand] = na; + na->na_bdg = b; + b->bdg_active_ports++; + if (cand2 >= 0) { + /* also bind the host stack to the bridge */ + b->bdg_ports[cand2] = SWNA(iter); + SWNA(iter)->bdg_port = cand2; + SWNA(iter)->na_bdg = b; + b->bdg_active_ports++; + ND("host %p to bridge port %d", SWNA(iter), cand2); + } + ADD_BDG_REF(iter); // XXX one or two ? + ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); + BDG_WUNLOCK(b); + *ifp = iter; + return 0; + +no_bridge_port: *ifp = iter; if (! *ifp) -#endif /* NM_BRIDGE */ - *ifp = ifunit_ref(name); + *ifp = ifunit_ref(name); if (*ifp == NULL) return (ENXIO); - /* can do this if the capability exists and if_pspare[0] - * points to the netmap descriptor. 
- */ + if (NETMAP_CAPABLE(*ifp)) { -#ifdef NM_BRIDGE /* Users cannot use the NIC attached to a bridge directly */ if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { if_rele(*ifp); /* don't detach from bridge */ return EINVAL; } else -#endif /* NM_BRIDGE */ - return 0; /* valid pointer, we hold the refcount */ + return 0; /* valid pointer, we hold the refcount */ } nm_if_rele(*ifp); return EINVAL; // not NETMAP capable @@ -1256,6 +1962,7 @@ netmap_ring_reinit(struct netmap_kring *kring) u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; + // XXX KASSERT nm_kr_tryget RD(10, "called for %s", kring->na->ifp->if_xname); if (ring->cur > lim) errors++; @@ -1267,7 +1974,7 @@ netmap_ring_reinit(struct netmap_kring *kring) D("bad buffer at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; - } else if (len > NETMAP_BUF_SIZE) { + } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { ring->slot[i].len = 0; if (!errors++) D("bad len %d at slot %d idx %d", @@ -1303,8 +2010,7 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) struct netmap_adapter *na = NA(ifp); u_int i = ringid & NETMAP_RING_MASK; /* initially (np_qfirst == np_qlast) we don't want to lock */ - int need_lock = (priv->np_qfirst != priv->np_qlast); - int lim = na->num_rx_rings; + u_int lim = na->num_rx_rings; if (na->num_tx_rings > lim) lim = na->num_tx_rings; @@ -1312,8 +2018,6 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) D("invalid ring id %d", i); return (EINVAL); } - if (need_lock) - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); priv->np_ringid = ringid; if (ringid & NETMAP_SW_RING) { priv->np_qfirst = NETMAP_SW_RING; @@ -1326,8 +2030,6 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) priv->np_qlast = NETMAP_HW_RING ; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (need_lock) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); if (netmap_verbose) { if (ringid & NETMAP_SW_RING) D("ringid %s set to SW RING", ifp->if_xname); @@ -1344,7 +2046,7 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. - * This must be called with NMA_LOCK held. + * This must be called with NMG_LOCK held. 
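+ * It refreshes the ring configuration, grabs the memory allocator
+ * if the file descriptor does not hold one yet and, on the first
+ * registration, switches the card to netmap mode through
+ * na->nm_register(ifp, 1).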
*/ static struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, @@ -1352,63 +2054,89 @@ netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, { struct netmap_adapter *na = NA(ifp); struct netmap_if *nifp = NULL; - int i, error; - - if (na->na_bdg) - BDG_WLOCK(na->na_bdg); - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); + int error, need_mem; + NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); priv->np_ifp = ifp; /* store the reference */ error = netmap_set_ringid(priv, ringid); if (error) goto out; + /* ensure allocators are ready */ + need_mem = !netmap_have_memory_locked(priv); + if (need_mem) { + error = netmap_get_memory_locked(priv); + ND("get_memory returned %d", error); + if (error) + goto out; + } nifp = netmap_if_new(ifp->if_xname, na); if (nifp == NULL) { /* allocation failed */ + /* we should drop the allocator, but only + * if we were the ones who grabbed it + */ + if (need_mem) + netmap_drop_memory_locked(priv); error = ENOMEM; - } else if (ifp->if_capenable & IFCAP_NETMAP) { + goto out; + } + na->refcount++; + if (ifp->if_capenable & IFCAP_NETMAP) { /* was already set */ } else { + u_int i; /* Otherwise set the card in netmap mode * and make it use the shared buffers. + * + * If the interface is attached to a bridge, lock it. */ + if (NETMAP_OWNED_BY_KERN(ifp)) + BDG_WLOCK(NA(ifp)->na_bdg); for (i = 0 ; i < na->num_tx_rings + 1; i++) mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", - MTX_NETWORK_LOCK, MTX_DEF); + NULL, MTX_DEF); for (i = 0 ; i < na->num_rx_rings + 1; i++) { mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", - MTX_NETWORK_LOCK, MTX_DEF); + NULL, MTX_DEF); } if (nma_is_hw(na)) { SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; } + /* + * do not core lock because the race is harmless here, + * there cannot be any traffic to netmap_transmit() + */ error = na->nm_register(ifp, 1); /* mode on */ -#ifdef NM_BRIDGE + // XXX do we need to nm_alloc_bdgfwd() in all cases ? if (!error) error = nm_alloc_bdgfwd(na); -#endif /* NM_BRIDGE */ if (error) { - netmap_dtor_locked(priv); - /* nifp is not yet in priv, so free it separately */ - netmap_if_free(nifp); + netmap_do_unregif(priv, nifp); nifp = NULL; } + if (NETMAP_OWNED_BY_KERN(ifp)) + BDG_WUNLOCK(NA(ifp)->na_bdg); } out: *err = error; - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if (na->na_bdg) - BDG_WUNLOCK(na->na_bdg); + if (nifp != NULL) { + /* + * advertise that the interface is ready bt setting ni_nifp. 
+ * The barrier is needed because readers (poll and *SYNC) + * check for priv->np_nifp != NULL without locking + */ + wmb(); /* make sure previous writes are visible to all CPUs */ + priv->np_nifp = nifp; + } return nifp; } - /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ static int -kern_netmap_regif(struct nmreq *nmr) +nm_bdg_attach(struct nmreq *nmr) { struct ifnet *ifp; struct netmap_if *nifp; @@ -1418,81 +2146,92 @@ kern_netmap_regif(struct nmreq *nmr) npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); if (npriv == NULL) return ENOMEM; - error = netmap_get_memory(npriv); - if (error) { -free_exit: - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - return error; - } - - NMA_LOCK(); - error = get_ifp(nmr, &ifp); - if (error) { /* no device, or another bridge or user owns the device */ - NMA_UNLOCK(); - goto free_exit; - } else if (!NETMAP_OWNED_BY_KERN(ifp)) { + NMG_LOCK(); + error = get_ifp(nmr, &ifp, 1 /* create if not exists */); + if (error) /* no device, or another bridge or user owns the device */ + goto unlock_exit; + /* get_ifp() sets na_bdg if this is a physical interface + * that we can attach to a switch. + */ + if (!NETMAP_OWNED_BY_KERN(ifp)) { /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge's prefix or wrong NIC's name + * perhaps specified no bridge prefix or wrong NIC name */ error = EINVAL; -unref_exit: - nm_if_rele(ifp); - NMA_UNLOCK(); - goto free_exit; + goto unref_exit; } - if (nmr->nr_cmd == NETMAP_BDG_DETACH) { - if (NA(ifp)->refcount == 0) { /* not registered */ - error = EINVAL; - goto unref_exit; - } - NMA_UNLOCK(); - - netmap_dtor(NA(ifp)->na_kpriv); /* unregister */ - NA(ifp)->na_kpriv = NULL; - nm_if_rele(ifp); /* detach from the bridge */ - goto free_exit; - } else if (NA(ifp)->refcount > 0) { /* already registered */ - error = EINVAL; - goto unref_exit; + if (NA(ifp)->refcount > 0) { /* already registered */ + error = EBUSY; + DROP_BDG_REF(ifp); + goto unlock_exit; } nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); - if (!nifp) + if (!nifp) { goto unref_exit; - wmb(); // XXX do we need it ? - npriv->np_nifp = nifp; + } + NA(ifp)->na_kpriv = npriv; - NMA_UNLOCK(); - D("registered %s to netmap-mode", ifp->if_xname); + NMG_UNLOCK(); + ND("registered %s to netmap-mode", ifp->if_xname); return 0; -} +unref_exit: + nm_if_rele(ifp); +unlock_exit: + NMG_UNLOCK(); + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; +} -/* CORE_LOCK is not necessary */ -static void -netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid) +static int +nm_bdg_detach(struct nmreq *nmr) { - struct netmap_adapter *na = SWNA(dev); + struct ifnet *ifp; + int error; + int last_instance; - switch (what) { - case NETMAP_TX_LOCK: - mtx_lock(&na->tx_rings[queueid].q_lock); - break; + NMG_LOCK(); + error = get_ifp(nmr, &ifp, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto unlock_exit; + } + /* XXX do we need to check this ? */ + if (!NETMAP_OWNED_BY_KERN(ifp)) { + /* got reference to a virtual port or direct access to a NIC. 
+ * perhaps specified no bridge's prefix or wrong NIC's name + */ + error = EINVAL; + goto unref_exit; + } - case NETMAP_TX_UNLOCK: - mtx_unlock(&na->tx_rings[queueid].q_lock); - break; + if (NA(ifp)->refcount == 0) { /* not registered */ + error = EINVAL; + goto unref_exit; + } - case NETMAP_RX_LOCK: - mtx_lock(&na->rx_rings[queueid].q_lock); - break; + DROP_BDG_REF(ifp); /* the one from get_ifp */ + last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ + NMG_UNLOCK(); + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; + NA(ifp)->na_kpriv = NULL; - case NETMAP_RX_UNLOCK: - mtx_unlock(&na->rx_rings[queueid].q_lock); - break; + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); } + return error; + +unref_exit: + nm_if_rele(ifp); +unlock_exit: + NMG_UNLOCK(); + return error; } @@ -1511,16 +2250,20 @@ netmap_attach_sw(struct ifnet *ifp) struct netmap_adapter *na = SWNA(ifp); na->ifp = ifp; - na->separate_locks = 1; - na->nm_lock = netmap_swlock_wrapper; na->num_rx_rings = na->num_tx_rings = 1; na->num_tx_desc = hw_na->num_tx_desc; na->num_rx_desc = hw_na->num_rx_desc; na->nm_txsync = netmap_bdg_to_host; + /* we use the same memory allocator as the + * the hw adapter */ + na->nm_mem = hw_na->nm_mem; } -/* exported to kernel callers */ +/* exported to kernel callers, e.g. OVS ? + * Entry point. + * Called without NMG_LOCK. + */ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) { @@ -1533,8 +2276,11 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) switch (cmd) { case NETMAP_BDG_ATTACH: + error = nm_bdg_attach(nmr); + break; + case NETMAP_BDG_DETACH: - error = kern_netmap_regif(nmr); + error = nm_bdg_detach(nmr); break; case NETMAP_BDG_LIST: @@ -1544,26 +2290,30 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) error = EINVAL; break; } + NMG_LOCK(); b = nm_find_bridge(name, 0 /* don't create */); if (!b) { error = ENOENT; + NMG_UNLOCK(); break; } - BDG_RLOCK(b); error = ENOENT; - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - na = BDG_GET_VAR(b->bdg_ports[i]); - if (na == NULL) + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + na = b->bdg_ports[i]; + if (na == NULL) { + D("---AAAAAAAAARGH-------"); continue; + } iter = na->ifp; /* the former and the latter identify a * virtual port and a NIC, respectively */ if (!strcmp(iter->if_xname, name) || - (namelen > b->namelen && + (namelen > b->bdg_namelen && !strcmp(iter->if_xname, - name + b->namelen + 1))) { + name + b->bdg_namelen + 1))) { /* bridge index */ nmr->nr_arg1 = b - nm_bridges; nmr->nr_arg2 = i; /* port index */ @@ -1571,7 +2321,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) break; } } - BDG_RUNLOCK(b); + NMG_UNLOCK(); } else { /* return the first non-empty entry starting from * bridge nr_arg1 and port nr_arg2. 
@@ -1583,23 +2333,23 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) i = nmr->nr_arg1; j = nmr->nr_arg2; - for (error = ENOENT; error && i < NM_BRIDGES; i++) { + NMG_LOCK(); + for (error = ENOENT; i < NM_BRIDGES; i++) { b = nm_bridges + i; - BDG_RLOCK(b); - for (; j < NM_BDG_MAXPORTS; j++) { - na = BDG_GET_VAR(b->bdg_ports[j]); - if (na == NULL) - continue; - iter = na->ifp; - nmr->nr_arg1 = i; - nmr->nr_arg2 = j; - strncpy(name, iter->if_xname, IFNAMSIZ); - error = 0; - break; + if (j >= b->bdg_active_ports) { + j = 0; /* following bridges scan from 0 */ + continue; } - BDG_RUNLOCK(b); - j = 0; /* following bridges scan from 0 */ + nmr->nr_arg1 = i; + nmr->nr_arg2 = j; + j = b->bdg_port_index[j]; + na = b->bdg_ports[j]; + iter = na->ifp; + strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); + error = 0; + break; } + NMG_UNLOCK(); } break; @@ -1612,15 +2362,16 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) error = EINVAL; break; } + NMG_LOCK(); b = nm_find_bridge(name, 0 /* don't create */); if (!b) { error = EINVAL; - break; + } else { + b->nm_bdg_lookup = func; } - BDG_WLOCK(b); - b->nm_bdg_lookup = func; - BDG_WUNLOCK(b); + NMG_UNLOCK(); break; + default: D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); error = EINVAL; @@ -1648,12 +2399,13 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct netmap_priv_d *priv = NULL; - struct ifnet *ifp; + struct ifnet *ifp = NULL; struct nmreq *nmr = (struct nmreq *) data; - struct netmap_adapter *na; + struct netmap_adapter *na = NULL; int error; u_int i, lim; struct netmap_if *nifp; + struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ @@ -1686,6 +2438,13 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, switch (cmd) { case NIOCGINFO: /* return capabilities etc */ if (nmr->nr_version != NETMAP_API) { +#ifdef TEST_STUFF + /* some test code for locks etc */ + if (nmr->nr_version == 666) { + error = nm_test(nmr); + break; + } +#endif /* TEST_STUFF */ D("API mismatch got %d have %d", nmr->nr_version, NETMAP_API); nmr->nr_version = NETMAP_API; @@ -1696,32 +2455,40 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = netmap_bdg_ctl(nmr, NULL); break; } - /* update configuration */ - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - break; - /* memsize is always valid */ - nmr->nr_memsize = nm_mem.nm_totalsize; - nmr->nr_offset = 0; - nmr->nr_rx_slots = nmr->nr_tx_slots = 0; - if (nmr->nr_name[0] == '\0') /* just get memory info */ - break; - /* lock because get_ifp and update_config see na->refcount */ - NMA_LOCK(); - error = get_ifp(nmr, &ifp); /* get a refcount */ - if (error) { - NMA_UNLOCK(); - break; - } - na = NA(ifp); /* retrieve netmap_adapter */ - netmap_update_config(na); - NMA_UNLOCK(); - nmr->nr_rx_rings = na->num_rx_rings; - nmr->nr_tx_rings = na->num_tx_rings; - nmr->nr_rx_slots = na->num_rx_desc; - nmr->nr_tx_slots = na->num_tx_desc; - nm_if_rele(ifp); /* return the refcount */ + + NMG_LOCK(); + do { + /* memsize is always valid */ + struct netmap_mem_d *nmd = &nm_mem; + u_int memflags; + + if (nmr->nr_name[0] != '\0') { + /* get a refcount */ + error = get_ifp(nmr, &ifp, 1 /* create */); + if (error) + break; + na = NA(ifp); /* retrieve the netmap adapter */ + nmd = na->nm_mem; /* and its memory allocator */ + } + + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); + if (error) + break; + if (na == NULL) /* only memory info */ + break; + nmr->nr_offset = 0; + nmr->nr_rx_slots = 
nmr->nr_tx_slots = 0; + netmap_update_config(na); + nmr->nr_rx_rings = na->num_rx_rings; + nmr->nr_tx_rings = na->num_tx_rings; + nmr->nr_rx_slots = na->num_rx_desc; + nmr->nr_tx_slots = na->num_tx_desc; + if (memflags & NETMAP_MEM_PRIVATE) + nmr->nr_ringid |= NETMAP_PRIV_MEM; + } while (0); + if (ifp) + nm_if_rele(ifp); /* return the refcount */ + NMG_UNLOCK(); break; case NIOCREGIF: @@ -1741,52 +2508,50 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - /* ensure allocators are ready */ - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - break; - /* protect access to priv from concurrent NIOCREGIF */ - NMA_LOCK(); - if (priv->np_ifp != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); -unlock_out: - NMA_UNLOCK(); - break; - } - /* find the interface and a reference */ - error = get_ifp(nmr, &ifp); /* keep reference */ - if (error) - goto unlock_out; - else if (NETMAP_OWNED_BY_KERN(ifp)) { - nm_if_rele(ifp); - goto unlock_out; - } - nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); - if (!nifp) { /* reg. failed, release priv and ref */ - nm_if_rele(ifp); /* return the refcount */ - priv->np_ifp = NULL; - priv->np_nifp = NULL; - goto unlock_out; - } + NMG_LOCK(); + do { + u_int memflags; - /* the following assignment is a commitment. - * Readers (i.e., poll and *SYNC) check for - * np_nifp != NULL without locking - */ - wmb(); /* make sure previous writes are visible to all CPUs */ - priv->np_nifp = nifp; - NMA_UNLOCK(); + if (priv->np_ifp != NULL) { /* thread already registered */ + error = netmap_set_ringid(priv, nmr->nr_ringid); + break; + } + /* find the interface and a reference */ + error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ + if (error) + break; + if (NETMAP_OWNED_BY_KERN(ifp)) { + nm_if_rele(ifp); + error = EBUSY; + break; + } + nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); + if (!nifp) { /* reg. 
failed, release priv and ref */ + nm_if_rele(ifp); /* return the refcount */ + priv->np_ifp = NULL; + priv->np_nifp = NULL; + break; + } - /* return the offset of the netmap_if object */ - na = NA(ifp); /* retrieve netmap adapter */ - nmr->nr_rx_rings = na->num_rx_rings; - nmr->nr_tx_rings = na->num_tx_rings; - nmr->nr_rx_slots = na->num_rx_desc; - nmr->nr_tx_slots = na->num_tx_desc; - nmr->nr_memsize = nm_mem.nm_totalsize; - nmr->nr_offset = netmap_if_offset(nifp); + /* return the offset of the netmap_if object */ + na = NA(ifp); /* retrieve netmap adapter */ + nmr->nr_rx_rings = na->num_rx_rings; + nmr->nr_tx_rings = na->num_tx_rings; + nmr->nr_rx_slots = na->num_rx_desc; + nmr->nr_tx_slots = na->num_tx_desc; + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); + if (error) { + nm_if_rele(ifp); + break; + } + if (memflags & NETMAP_MEM_PRIVATE) { + nmr->nr_ringid |= NETMAP_PRIV_MEM; + *(uint32_t *)&nifp->ni_flags |= NI_PRIV_MEM; + } + nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); + } while (0); + NMG_UNLOCK(); break; case NIOCUNREGIF: @@ -1805,7 +2570,6 @@ unlock_out: } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; /* we have a reference */ if (ifp == NULL) { @@ -1817,9 +2581,9 @@ unlock_out: na = NA(ifp); /* retrieve netmap adapter */ if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ if (cmd == NIOCTXSYNC) - netmap_sync_to_host(na); + netmap_txsync_to_host(na); else - netmap_sync_from_host(na, NULL, NULL); + netmap_rxsync_from_host(na, NULL, NULL); break; } /* find the last ring to scan */ @@ -1828,22 +2592,28 @@ unlock_out: lim = (cmd == NIOCTXSYNC) ? na->num_tx_rings : na->num_rx_rings; + krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; for (i = priv->np_qfirst; i < lim; i++) { + struct netmap_kring *kring = krings + i; + if (nm_kr_tryget(kring)) { + error = EBUSY; + goto out; + } if (cmd == NIOCTXSYNC) { - struct netmap_kring *kring = &na->tx_rings[i]; if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(ifp, i, 1 /* do lock */); + na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(ifp, i, 1 /* do lock */); + na->nm_rxsync(ifp, i, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } + nm_kr_put(kring); } break; @@ -1859,14 +2629,19 @@ unlock_out: default: /* allow device-specific ioctls */ { struct socket so; + bzero(&so, sizeof(so)); - error = get_ifp(nmr, &ifp); /* keep reference */ - if (error) + NMG_LOCK(); + error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ + if (error) { + NMG_UNLOCK(); break; + } so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); nm_if_rele(ifp); + NMG_UNLOCK(); break; } @@ -1875,6 +2650,7 @@ unlock_out: error = EOPNOTSUPP; #endif /* linux */ } +out: CURVNET_RESTORE(); return (error); @@ -1887,7 +2663,7 @@ unlock_out: * Can be called for one or more queues. * Return true the event mask corresponding to ready events. * If there are no ready events, do a selrecord on either individual - * selfd or on the global one. + * selinfo or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. 
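(For context, a minimal sketch of the userspace side served by the NIOCREGIF/NIOCTXSYNC handling above and the netmap_poll() path below. It assumes the cur/avail ring layout of this API version and the NETMAP_IF/NETMAP_TXRING/NETMAP_BUF/NETMAP_RING_NEXT macros from net/netmap_user.h; "em0" is only an example interface name and error handling is omitted.)

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

/* send one frame on hardware ring 0 of em0 (sketch, no error checks) */
static void
send_one(const char *frame, unsigned int len)
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct netmap_slot *slot;
	struct pollfd pfd;
	char *mem;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &req);		/* NIOCREGIF path above */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, req.nr_offset);
	ring = NETMAP_TXRING(nifp, 0);

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, -1);			/* sleeps in netmap_poll() below */

	slot = &ring->slot[ring->cur];
	memcpy(NETMAP_BUF(ring, slot->buf_idx), frame, len);
	slot->len = len;
	ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
	ring->avail--;
	ioctl(fd, NIOCTXSYNC, NULL);		/* flush without blocking */
}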
* @@ -1902,12 +2678,13 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct netmap_adapter *na; struct ifnet *ifp; struct netmap_kring *kring; - u_int core_lock, i, check_all, want_tx, want_rx, revents = 0; + u_int i, check_all, want_tx, want_rx, revents = 0; u_int lim_tx, lim_rx, host_forwarded = 0; struct mbq q = { NULL, NULL, 0 }; - enum {NO_CL, NEED_CL, LOCKED_CL }; /* see below */ void *pwait = dev; /* linux compatibility */ + int retry_tx = 1; + (void)pwait; if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) @@ -1933,17 +2710,18 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) lim_tx = na->num_tx_rings; lim_rx = na->num_rx_rings; - /* how many queues we are scanning */ + if (priv->np_qfirst == NETMAP_SW_RING) { + /* handle the host stack ring */ if (priv->np_txpoll || want_tx) { /* push any packets up, then we are always ready */ - netmap_sync_to_host(na); + netmap_txsync_to_host(na); revents |= want_tx; } if (want_rx) { kring = &na->rx_rings[lim_rx]; if (kring->ring->avail == 0) - netmap_sync_from_host(na, td, dev); + netmap_rxsync_from_host(na, td, dev); if (kring->ring->avail > 0) { revents |= want_rx; } @@ -1957,26 +2735,18 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) && want_rx && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { if (kring->ring->avail == 0) - netmap_sync_from_host(na, td, dev); + netmap_rxsync_from_host(na, td, dev); if (kring->ring->avail > 0) revents |= want_rx; } /* - * check_all is set if the card has more than one queue and + * check_all is set if the card has more than one queue AND * the client is polling all of them. If true, we sleep on - * the "global" selfd, otherwise we sleep on individual selfd - * (we can only sleep on one of them per direction). - * The interrupt routine in the driver should always wake on - * the individual selfd, and also on the global one if the card - * has more than one ring. - * - * If the card has only one lock, we just use that. - * If the card has separate ring locks, we just use those - * unless we are doing check_all, in which case the whole - * loop is wrapped by the global lock. - * We acquire locks only when necessary: if poll is called - * when buffers are available, we can just return without locks. + * the "global" selinfo, otherwise we sleep on individual selinfo + * (FreeBSD only allows two selinfo's per file descriptor). + * The interrupt routine in the driver wake one or the other + * (or both) depending on which clients are active. * * rxsync() is only called if we run out of buffers on a POLLIN. * txsync() is called if we run out of buffers on POLLOUT, or @@ -1985,28 +2755,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) */ check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1); - /* - * core_lock indicates what to do with the core lock. - * The core lock is used when either the card has no individual - * locks, or it has individual locks but we are cheking all - * rings so we need the core lock to avoid missing wakeup events. - * - * It has three possible states: - * NO_CL we don't need to use the core lock, e.g. - * because we are protected by individual locks. - * NEED_CL we need the core lock. In this case, when we - * call the lock routine, move to LOCKED_CL - * to remember to release the lock once done. - * LOCKED_CL core lock is set, so we need to release it. - */ - core_lock = (check_all || !na->separate_locks) ? 
NEED_CL : NO_CL; -#ifdef NM_BRIDGE - /* the bridge uses separate locks */ - if (na->nm_register == bdg_netmap_reg) { - ND("not using core lock for %s", ifp->if_xname); - core_lock = NO_CL; - } -#endif /* NM_BRIDGE */ if (priv->np_qlast != NETMAP_HW_RING) { lim_tx = lim_rx = priv->np_qlast; } @@ -2037,28 +2785,33 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * to avoid that the tx rings stall). */ if (priv->np_txpoll || want_tx) { + /* If we really want to be woken up (want_tx), + * do a selrecord, either on the global or on + * the private structure. Then issue the txsync + * so there is no race in the selrecord/selwait + */ flush_tx: for (i = priv->np_qfirst; i < lim_tx; i++) { kring = &na->tx_rings[i]; /* - * Skip the current ring if want_tx == 0 + * Skip this ring if want_tx == 0 * (we have already done a successful sync on * a previous ring) AND kring->cur == kring->hwcur * (there are no pending transmissions for this ring). */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; + /* make sure only one user thread is doing this */ + if (nm_kr_tryget(kring)) { + ND("ring %p busy is %d", kring, (int)kring->nr_busy); + revents |= POLLERR; + goto out; } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_TX_LOCK, i); + if (netmap_verbose & NM_VERB_TXSYNC) D("send %d on %s %d", - kring->ring->cur, - ifp->if_xname, i); - if (na->nm_txsync(ifp, i, 0 /* no lock */)) + kring->ring->cur, ifp->if_xname, i); + if (na->nm_txsync(ifp, i, 0)) revents |= POLLERR; /* Check avail/call selrecord only if called with POLLOUT */ @@ -2069,11 +2822,15 @@ flush_tx: */ revents |= want_tx; want_tx = 0; - } else if (!check_all) - selrecord(td, &kring->si); + } } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_TX_UNLOCK, i); + nm_kr_put(kring); + } + if (want_tx && retry_tx) { + selrecord(td, check_all ? + &na->tx_si : &na->tx_rings[priv->np_qfirst].si); + retry_tx = 0; + goto flush_tx; } } @@ -2082,64 +2839,65 @@ flush_tx: * Do it on all rings because otherwise we starve. */ if (want_rx) { + int retry_rx = 1; +do_retry_rx: for (i = priv->np_qfirst; i < lim_rx; i++) { kring = &na->rx_rings[i]; - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; + + if (nm_kr_tryget(kring)) { + revents |= POLLERR; + goto out; } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_RX_LOCK, i); + + /* XXX NR_FORWARD should only be read on + * physical or NIC ports + */ if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { ND(10, "forwarding some buffers up %d to %d", kring->nr_hwcur, kring->ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(ifp, i, 0 /* no lock */)) + if (na->nm_rxsync(ifp, i, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - if (kring->ring->avail > 0) + if (kring->ring->avail > 0) { revents |= want_rx; - else if (!check_all) - selrecord(td, &kring->si); - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, i); + retry_rx = 0; + } + nm_kr_put(kring); + } + if (retry_rx) { + retry_rx = 0; + selrecord(td, check_all ? + &na->rx_si : &na->rx_rings[priv->np_qfirst].si); + goto do_retry_rx; } - } - if (check_all && revents == 0) { /* signal on the global queue */ - if (want_tx) - selrecord(td, &na->tx_si); - if (want_rx) - selrecord(td, &na->rx_si); } - /* forward host to the netmap ring */ + /* forward host to the netmap ring. 
+ * I am accessing nr_hwavail without lock, but netmap_transmit + * can only increment it, so the operation is safe. + */ kring = &na->rx_rings[lim_rx]; - if (kring->nr_hwavail > 0) - ND("host rx %d has %d packets", lim_rx, kring->nr_hwavail); if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all && (netmap_fwd || kring->ring->flags & NR_FORWARD) && kring->nr_hwavail > 0 && !host_forwarded) { - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; - } netmap_sw_to_nic(na); host_forwarded = 1; /* prevent another pass */ want_rx = 0; goto flush_tx; } - if (core_lock == LOCKED_CL) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); if (q.head) netmap_send_up(na->ifp, q.head); +out: + return (revents); } @@ -2147,48 +2905,6 @@ flush_tx: /* - * default lock wrapper. - */ -static void -netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) -{ - struct netmap_adapter *na = NA(dev); - - switch (what) { -#ifdef linux /* some system do not need lock on register */ - case NETMAP_REG_LOCK: - case NETMAP_REG_UNLOCK: - break; -#endif /* linux */ - - case NETMAP_CORE_LOCK: - mtx_lock(&na->core_lock); - break; - - case NETMAP_CORE_UNLOCK: - mtx_unlock(&na->core_lock); - break; - - case NETMAP_TX_LOCK: - mtx_lock(&na->tx_rings[queueid].q_lock); - break; - - case NETMAP_TX_UNLOCK: - mtx_unlock(&na->tx_rings[queueid].q_lock); - break; - - case NETMAP_RX_LOCK: - mtx_lock(&na->rx_rings[queueid].q_lock); - break; - - case NETMAP_RX_UNLOCK: - mtx_unlock(&na->rx_rings[queueid].q_lock); - break; - } -} - - -/* * Initialize a ``netmap_adapter`` object created by driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number @@ -2203,14 +2919,15 @@ netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) * setups. */ int -netmap_attach(struct netmap_adapter *arg, int num_queues) +netmap_attach(struct netmap_adapter *arg, u_int num_queues) { struct netmap_adapter *na = NULL; struct ifnet *ifp = arg ? arg->ifp : NULL; - int len; + size_t len; if (arg == NULL || ifp == NULL) goto fail; + /* a VALE port uses two endpoints */ len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); if (na == NULL) @@ -2224,10 +2941,6 @@ netmap_attach(struct netmap_adapter *arg, int num_queues) na->refcount = na->na_single = na->na_multi = 0; /* Core lock initialized here, others after netmap_if_new. */ mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); - if (na->nm_lock == NULL) { - ND("using default locks for %s", ifp->if_xname); - na->nm_lock = netmap_lock_wrapper; - } #ifdef linux if (ifp->netdev_ops) { ND("netdev_ops %p", ifp->netdev_ops); @@ -2238,8 +2951,9 @@ netmap_attach(struct netmap_adapter *arg, int num_queues) na->nm_ndo = *ifp->netdev_ops; #endif } - na->nm_ndo.ndo_start_xmit = linux_netmap_start; -#endif + na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; +#endif /* linux */ + na->nm_mem = arg->nm_mem ? 
arg->nm_mem : &nm_mem; if (!nma_is_vp(arg)) netmap_attach_sw(ifp); D("success for %s", ifp->if_xname); @@ -2270,6 +2984,8 @@ netmap_detach(struct ifnet *ifp) D("freeing leftover tx_rings"); free(na->tx_rings, M_DEVBUF); } + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); bzero(na, sizeof(*na)); WNA(ifp) = NULL; free(na, M_DEVBUF); @@ -2277,84 +2993,103 @@ netmap_detach(struct ifnet *ifp) int -nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, u_int ring_nr); - -/* we don't need to lock myself */ -static int -bdg_netmap_start(struct ifnet *ifp, struct mbuf *m) -{ - struct netmap_adapter *na = SWNA(ifp); - struct nm_bdg_fwd *ft = na->rx_rings[0].nkr_ft; - char *buf = NMB(&na->rx_rings[0].ring->slot[0]); - u_int len = MBUF_LEN(m); - - if (!na->na_bdg) /* SWNA is not configured to be attached */ - return EBUSY; - m_copydata(m, 0, len, buf); - ft->ft_flags = 0; // XXX could be indirect ? - ft->ft_len = len; - ft->ft_buf = buf; - ft->ft_next = NM_BDG_BATCH; // XXX is it needed ? - nm_bdg_flush(ft, 1, na, 0); - - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); - - return (0); -} +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, + struct netmap_adapter *na, u_int ring_nr); /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. - * We are not locked when called. + * We rely on the OS to make sure that the ifp and na do not go + * away (typically the caller checks for IFF_DRV_RUNNING or the like). + * In nm_register() or whenever there is a reinitialization, + * we make sure to access the core lock and per-ring locks + * so that IFCAP_NETMAP is visible here. */ int -netmap_start(struct ifnet *ifp, struct mbuf *m) +netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; + struct netmap_kring *kring; u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim = kring->nkr_num_slots - 1; + u_int error = EBUSY, lim; struct netmap_slot *slot; + // XXX [Linux] we do not need this lock + // if we follow the down/configure/up protocol -gl + // mtx_lock(&na->core_lock); + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { + /* interface not in netmap mode anymore */ + error = ENXIO; + goto done; + } + + kring = &na->rx_rings[na->num_rx_rings]; + lim = kring->nkr_num_slots - 1; if (netmap_verbose & NM_VERB_HOST) D("%s packet %d len %d from the stack", ifp->if_xname, kring->nr_hwcur + kring->nr_hwavail, len); - if (len > NETMAP_BUF_SIZE) { /* too long for us */ + // XXX reconsider long packets if we handle fragments + if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ D("%s from_host, drop packet size %d > %d", ifp->if_xname, - len, NETMAP_BUF_SIZE); - m_freem(m); - return EINVAL; + len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); + goto done; + } + if (SWNA(ifp)->na_bdg) { + struct nm_bdg_fwd *ft; + char *dst; + + na = SWNA(ifp); /* we operate on the host port */ + ft = na->rx_rings[0].nkr_ft; + dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); + + /* use slot 0 in the ft, there is nothing queued here */ + /* XXX we can save the copy calling m_copydata in nm_bdg_flush, + * need a special flag for this. 
+ */ + m_copydata(m, 0, (int)len, dst); + ft->ft_flags = 0; + ft->ft_len = len; + ft->ft_buf = dst; + ft->ft_next = NM_FT_NULL; + ft->ft_frags = 1; + if (netmap_verbose & NM_VERB_HOST) + RD(5, "pkt %p size %d to bridge port %d", + dst, len, na->bdg_port); + nm_bdg_flush(ft, 1, na, 0); + na = NA(ifp); /* back to the regular object/lock */ + error = 0; + goto done; } - if (na->na_bdg) - return bdg_netmap_start(ifp, m); - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); + /* protect against other instances of netmap_transmit, + * and userspace invocations of rxsync(). + * XXX could reuse core_lock + */ + // XXX [Linux] there can be no other instances of netmap_transmit + // on this same ring, but we still need this lock to protect + // concurrent access from netmap_sw_to_nic() -gl + mtx_lock(&kring->q_lock); if (kring->nr_hwavail >= lim) { if (netmap_verbose) D("stack ring %s full\n", ifp->if_xname); - goto done; /* no space */ - } - - /* compute the insert position */ - i = kring->nr_hwcur + kring->nr_hwavail; - if (i > lim) - i -= lim + 1; - slot = &kring->ring->slot[i]; - m_copydata(m, 0, len, NMB(slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); - selwakeuppri(&kring->si, PI_NET); - error = 0; + } else { + /* compute the insert position */ + i = nm_kr_rxpos(kring); + slot = &kring->ring->slot[i]; + m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); + slot->len = len; + slot->flags = kring->nkr_slot_flags; + kring->nr_hwavail++; + if (netmap_verbose & NM_VERB_HOST) + D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); + selwakeuppri(&kring->si, PI_NET); + error = 0; + } + mtx_unlock(&kring->q_lock); + done: - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); + // mtx_unlock(&na->core_lock); /* release the mbuf in either cases of success or failure. As an * alternative, put the mbuf in a free list and free the list @@ -2372,17 +3107,29 @@ done: * If netmap mode is not set just return NULL. */ struct netmap_slot * -netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, +netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur) { struct netmap_kring *kring; int new_hwofs, lim; - if (na == NULL) + if (na == NULL) { + D("NULL na, should not happen"); return NULL; /* no netmap support here */ - if (!(na->ifp->if_capenable & IFCAP_NETMAP)) + } + if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { + D("interface not in netmap mode"); return NULL; /* nothing to reinitialize */ + } + /* XXX note- in the new scheme, we are not guaranteed to be + * under lock (e.g. when called on a device reset). + * In this case, we should set a flag and do not trust too + * much the values. In practice: TODO + * - set a RESET flag somewhere in the kring + * - do the processing in a conservative way + * - let the *sync() fixup at the end. + */ if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; @@ -2398,13 +3145,15 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, if (new_hwofs > lim) new_hwofs -= lim + 1; - /* Alwayws set the new offset value and realign the ring. */ + /* Always set the new offset value and realign the ring. */ + D("%s hwofs %d -> %d, hwavail %d -> %d", + tx == NR_TX ? "TX" : "RX", + kring->nkr_hwofs, new_hwofs, + kring->nr_hwavail, + tx == NR_TX ? 
lim : kring->nr_hwavail); kring->nkr_hwofs = new_hwofs; if (tx == NR_TX) - kring->nr_hwavail = kring->nkr_num_slots - 1; - ND(10, "new hwofs %d on %s %s[%d]", - kring->nkr_hwofs, na->ifp->if_xname, - tx == NR_TX ? "TX" : "RX", n); + kring->nr_hwavail = lim; #if 0 // def linux /* XXX check that the mappings are correct */ @@ -2417,7 +3166,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, #endif /* linux */ /* - * Wakeup on the individual and global lock + * Wakeup on the individual and global selwait * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ @@ -2427,43 +3176,79 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, } -/* returns the next position in the ring */ +/* + * Grab packets from a kring, move them into the ft structure + * associated to the tx (input) port. Max one instance per port, + * filtered on input (ioctl, poll or XXX). + * Returns the next position in the ring. + */ static int nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, struct netmap_kring *kring, u_int end) { struct netmap_ring *ring = kring->ring; - struct nm_bdg_fwd *ft = kring->nkr_ft; + struct nm_bdg_fwd *ft; u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; u_int ft_i = 0; /* start from 0 */ + u_int frags = 1; /* how many frags ? */ + struct nm_bridge *b = na->na_bdg; - for (; likely(j != end); j = unlikely(j == lim) ? 0 : j+1) { + /* To protect against modifications to the bridge we acquire a + * shared lock, waiting if we can sleep (if the source port is + * attached to a user process) or with a trylock otherwise (NICs). + */ + ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); + if (na->na_flags & NAF_BDG_MAYSLEEP) + BDG_RLOCK(b); + else if (!BDG_RTRYLOCK(b)) + return 0; + ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); + ft = kring->nkr_ft; + + for (; likely(j != end); j = nm_next(j, lim)) { struct netmap_slot *slot = &ring->slot[j]; - char *buf = NMB(slot); - int len = ft[ft_i].ft_len = slot->len; + char *buf; + ft[ft_i].ft_len = slot->len; ft[ft_i].ft_flags = slot->flags; ND("flags is 0x%x", slot->flags); /* this slot goes into a list so initialize the link field */ - ft[ft_i].ft_next = NM_BDG_BATCH; /* equivalent to NULL */ - if (unlikely(len < 14)) - continue; + ft[ft_i].ft_next = NM_FT_NULL; buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - *((void **)buf) : buf; + (void *)slot->ptr : BDG_NMB(na->nm_mem, slot); prefetch(buf); - if (unlikely(++ft_i == netmap_bridge)) + ++ft_i; + if (slot->flags & NS_MOREFRAG) { + frags++; + continue; + } + if (unlikely(netmap_verbose && frags > 1)) + RD(5, "%d frags at %d", frags, ft_i - frags); + ft[ft_i - frags].ft_frags = frags; + frags = 1; + if (unlikely((int)ft_i >= bridge_batch)) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); } + if (frags > 1) { + D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); + // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG + ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags - 1; + } if (ft_i) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); + BDG_RUNLOCK(b); return j; } /* - * Pass packets from nic to the bridge. Must be called with - * proper locks on the source interface. + * Pass packets from nic to the bridge. + * XXX TODO check locking: this is called from the interrupt + * handler so we should make sure that the interface is not + * disconnected while passing down an interrupt. 
+ * * Note, no user process can access this NIC so we can ignore * the info in the 'ring'. */ @@ -2473,20 +3258,24 @@ netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, k, lim = kring->nkr_num_slots - 1; + u_int j, k; - /* fetch packets that have arrived */ - na->nm_rxsync(ifp, ring_nr, 0); - /* XXX we don't count reserved, but it should be 0 */ - j = kring->nr_hwcur; - k = j + kring->nr_hwavail; - if (k > lim) - k -= lim + 1; - if (k == j && netmap_verbose) { + /* make sure that only one thread is ever in here, + * after which we can unlock. Probably unnecessary XXX. + */ + if (nm_kr_tryget(kring)) + return; + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? + */ + if (na->nm_rxsync(ifp, ring_nr, 0)) + goto put_out; + if (kring->nr_hwavail == 0 && netmap_verbose) { D("how strange, interrupt with no packets on %s", ifp->if_xname); - return; + goto put_out; } + k = nm_kr_rxpos(kring); j = nm_bdg_preflush(na, ring_nr, kring, k); @@ -2497,42 +3286,43 @@ netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) ring->cur = j; ring->avail = 0; na->nm_rxsync(ifp, ring_nr, 0); + +put_out: + nm_kr_put(kring); return; } /* - * Default functions to handle rx/tx interrupts - * we have 4 cases: - * 1 ring, single lock: - * lock(core); wake(i=0); unlock(core) - * N rings, single lock: - * lock(core); wake(i); wake(N+1) unlock(core) - * 1 ring, separate locks: (i=0) - * lock(i); wake(i); unlock(i) - * N rings, separate locks: - * lock(i); wake(i); unlock(i); lock(core) wake(N+1) unlock(core) - * work_done is non-null on the RX path. + * Default functions to handle rx/tx interrupts from a physical device. + * "work_done" is non-null on the RX path, NULL for the TX path. + * We rely on the OS to make sure that there is only one active + * instance per queue, and that there is appropriate locking. + * + * If the card is not in netmap mode, simply return 0, + * so that the caller proceeds with regular processing. * - * The 'q' argument also includes flag to tell whether the queue is - * already locked on enter, and whether it should remain locked on exit. - * This helps adapting to different defaults in drivers and OSes. + * If the card is connected to a netmap file descriptor, + * do a selwakeup on the individual queue, plus one on the global one + * if needed (multiqueue card _and_ there are multiqueue listeners), + * and return 1. + * + * Finally, if called on rx from an interface connected to a switch, + * calls the proper forwarding routine, and return 1. */ int -netmap_rx_irq(struct ifnet *ifp, int q, int *work_done) +netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { struct netmap_adapter *na; - struct netmap_kring *r; - NM_SELINFO_T *main_wq; - int locktype, unlocktype, nic_to_bridge, lock; + struct netmap_kring *kring; if (!(ifp->if_capenable & IFCAP_NETMAP)) return 0; - lock = q & (NETMAP_LOCKED_ENTER | NETMAP_LOCKED_EXIT); - q = q & NETMAP_RING_MASK; + q &= NETMAP_RING_MASK; - ND(5, "received %s queue %d", work_done ? "RX" : "TX" , q); + if (netmap_verbose) + RD(5, "received %s queue %d", work_done ? 
"RX" : "TX" , q); na = NA(ifp); if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); @@ -2542,57 +3332,24 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done) if (work_done) { /* RX path */ if (q >= na->num_rx_rings) return 0; // not a physical queue - r = na->rx_rings + q; - r->nr_kflags |= NKR_PENDINTR; - main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL; - /* set a flag if the NIC is attached to a VALE switch */ - nic_to_bridge = (na->na_bdg != NULL); - locktype = NETMAP_RX_LOCK; - unlocktype = NETMAP_RX_UNLOCK; + kring = na->rx_rings + q; + kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? + if (na->na_bdg != NULL) { + netmap_nic_to_bdg(ifp, q); + } else { + selwakeuppri(&kring->si, PI_NET); + if (na->num_rx_rings > 1 /* or multiple listeners */ ) + selwakeuppri(&na->rx_si, PI_NET); + } + *work_done = 1; /* do not fire napi again */ } else { /* TX path */ if (q >= na->num_tx_rings) return 0; // not a physical queue - r = na->tx_rings + q; - main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL; - work_done = &q; /* dummy */ - nic_to_bridge = 0; - locktype = NETMAP_TX_LOCK; - unlocktype = NETMAP_TX_UNLOCK; - } - if (na->separate_locks) { - if (!(lock & NETMAP_LOCKED_ENTER)) - na->nm_lock(ifp, locktype, q); - /* If a NIC is attached to a bridge, flush packets - * (and no need to wakeup anyone). Otherwise, wakeup - * possible processes waiting for packets. - */ - if (nic_to_bridge) - netmap_nic_to_bdg(ifp, q); - else - selwakeuppri(&r->si, PI_NET); - na->nm_lock(ifp, unlocktype, q); - if (main_wq && !nic_to_bridge) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - selwakeuppri(main_wq, PI_NET); - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); - } - /* lock the queue again if requested */ - if (lock & NETMAP_LOCKED_EXIT) - na->nm_lock(ifp, locktype, q); - } else { - if (!(lock & NETMAP_LOCKED_ENTER)) - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - if (nic_to_bridge) - netmap_nic_to_bdg(ifp, q); - else { - selwakeuppri(&r->si, PI_NET); - if (main_wq) - selwakeuppri(main_wq, PI_NET); - } - if (!(lock & NETMAP_LOCKED_EXIT)) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); + kring = na->tx_rings + q; + selwakeuppri(&kring->si, PI_NET); + if (na->num_tx_rings > 1 /* or multiple listeners */ ) + selwakeuppri(&na->tx_si, PI_NET); } - *work_done = 1; /* do not fire napi again */ return 1; } @@ -2626,11 +3383,10 @@ linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) static int linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) { - int lut_skip, i, j; - int user_skip = 0; - struct lut_entry *l_entry; int error = 0; - unsigned long off, tomap; + unsigned long off, va; + vm_ooffset_t pa; + struct netmap_priv_d *priv = f->private_data; /* * vma->vm_start: start of mapping user address space * vma->vm_end: end of the mapping user address space @@ -2639,69 +3395,45 @@ linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) // XXX security checks - error = netmap_get_memory(f->private_data); + error = netmap_get_memory(priv); ND("get_memory returned %d", error); if (error) return -error; - off = vma->vm_pgoff << PAGE_SHIFT; /* offset in bytes */ - tomap = vma->vm_end - vma->vm_start; - for (i = 0; i < NETMAP_POOLS_NR; i++) { /* loop through obj_pools */ - const struct netmap_obj_pool *p = &nm_mem.pools[i]; - /* - * In each pool memory is allocated in clusters - * of size _clustsize, each containing clustentries - * entries. 
For each object k we already store the - * vtophys mapping in lut[k] so we use that, scanning - * the lut[] array in steps of clustentries, - * and we map each cluster (not individual pages, - * it would be overkill -- XXX slow ? 20130415). - */ - - /* - * We interpret vm_pgoff as an offset into the whole - * netmap memory, as if all clusters where contiguous. - */ - for (lut_skip = 0, j = 0; j < p->_numclusters; j++, lut_skip += p->clustentries) { - unsigned long paddr, mapsize; - if (p->_clustsize <= off) { - off -= p->_clustsize; - continue; - } - l_entry = &p->lut[lut_skip]; /* first obj in the cluster */ - paddr = l_entry->paddr + off; - mapsize = p->_clustsize - off; - off = 0; - if (mapsize > tomap) - mapsize = tomap; - ND("remap_pfn_range(%lx, %lx, %lx)", - vma->vm_start + user_skip, - paddr >> PAGE_SHIFT, mapsize); - if (remap_pfn_range(vma, vma->vm_start + user_skip, - paddr >> PAGE_SHIFT, mapsize, - vma->vm_page_prot)) - return -EAGAIN; // XXX check return value - user_skip += mapsize; - tomap -= mapsize; - if (tomap == 0) - goto done; - } + if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { + ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); + return -EINVAL; } -done: + for (va = vma->vm_start, off = vma->vm_pgoff; + va < vma->vm_end; + va += PAGE_SIZE, off++) + { + pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); + if (pa == 0) + return -EINVAL; + + ND("va %lx pa %p", va, pa); + error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); + if (error) + return error; + } return 0; } +/* + * This one is probably already protected by the netif lock XXX + */ static netdev_tx_t -linux_netmap_start(struct sk_buff *skb, struct net_device *dev) +linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) { - netmap_start(dev, skb); + netmap_transmit(dev, skb); return (NETDEV_TX_OK); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) // XXX was 38 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 #define LIN_IOCTL_NAME .ioctl int linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) @@ -2715,6 +3447,9 @@ linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) struct nmreq nmr; bzero(&nmr, sizeof(nmr)); + if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { + data = 0; /* no argument required here */ + } if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) return -EFAULT; ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); @@ -2792,6 +3527,8 @@ EXPORT_SYMBOL(netmap_rx_irq); // default irq handler EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function +EXPORT_SYMBOL(netmap_disable_all_rings); +EXPORT_SYMBOL(netmap_enable_all_rings); MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); @@ -2805,7 +3542,6 @@ static struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, - .d_mmap = netmap_mmap, .d_mmap_single = netmap_mmap_single, .d_ioctl = netmap_ioctl, .d_poll = netmap_poll, @@ -2813,7 +3549,6 @@ static struct cdevsw netmap_cdevsw = { }; #endif /* __FreeBSD__ */ -#ifdef NM_BRIDGE /* *---- support for virtual bridge ----- */ @@ -2862,20 +3597,14 @@ nm_bridge_rthash(const uint8_t *addr) static int bdg_netmap_reg(struct ifnet *ifp, int onoff) { - // struct nm_bridge *b = NA(ifp)->na_bdg; - /* the interface is already attached to the bridge, * so 
we only need to toggle IFCAP_NETMAP. - * Locking is not necessary (we are already under - * NMA_LOCK, and the port is not in use during this call). */ - /* BDG_WLOCK(b); */ if (onoff) { ifp->if_capenable |= IFCAP_NETMAP; } else { ifp->if_capenable &= ~IFCAP_NETMAP; } - /* BDG_WUNLOCK(b); */ return 0; } @@ -2887,7 +3616,7 @@ bdg_netmap_reg(struct ifnet *ifp, int onoff) * ring in *dst_ring (at the moment, always use ring 0) */ u_int -netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring, +netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, struct netmap_adapter *na) { struct nm_hash_ent *ht = na->na_bdg->ht; @@ -2895,6 +3624,10 @@ netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring, u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; + if (buf_len < 14) { + D("invalid buf length %d", buf_len); + return NM_BDG_NOPORT; + } dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; smac = le64toh(*(uint64_t *)(buf + 4)); smac >>= 16; @@ -2905,7 +3638,7 @@ netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring, */ if ((buf[6] & 1) == 0) { /* valid src */ uint8_t *s = buf+6; - sh = nm_bridge_rthash(buf+6); // XXX hash of source + sh = nm_bridge_rthash(s); // XXX hash of source /* update source port forwarding entry */ ht[sh].mac = smac; /* XXX expire ? */ ht[sh].ports = mysrc; @@ -2931,42 +3664,50 @@ netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring, * number of ports, and lets us replace the learn and dispatch functions. */ int -nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, u_int ring_nr) { struct nm_bdg_q *dst_ents, *brddst; uint16_t num_dsts = 0, *dsts; struct nm_bridge *b = na->na_bdg; - u_int i, me = na->bdg_port; + u_int i, j, me = na->bdg_port; - dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH); + /* + * The work area (pointed by ft) is followed by an array of + * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS + * queues per port plus one for the broadcast traffic. + * Then we have an array of destination indexes. 
+ */ + dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); - BDG_RLOCK(b); - - /* first pass: find a destination */ - for (i = 0; likely(i < n); i++) { - uint8_t *buf = ft[i].ft_buf; - uint8_t dst_ring = ring_nr; + /* first pass: find a destination for each packet in the batch */ + for (i = 0; likely(i < n); i += ft[i].ft_frags) { + uint8_t dst_ring = ring_nr; /* default, same ring as origin */ uint16_t dst_port, d_i; struct nm_bdg_q *d; - dst_port = b->nm_bdg_lookup(buf, ft[i].ft_len, &dst_ring, na); - if (dst_port == NM_BDG_NOPORT) { + ND("slot %d frags %d", i, ft[i].ft_frags); + dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, + &dst_ring, na); + if (netmap_verbose > 255) + RD(5, "slot %d port %d -> %d", i, me, dst_port); + if (dst_port == NM_BDG_NOPORT) continue; /* this packet is identified to be dropped */ - } else if (unlikely(dst_port > NM_BDG_MAXPORTS)) { + else if (unlikely(dst_port > NM_BDG_MAXPORTS)) continue; - } else if (dst_port == NM_BDG_BROADCAST) { + else if (dst_port == NM_BDG_BROADCAST) dst_ring = 0; /* broadcasts always go to ring 0 */ - } else if (unlikely(dst_port == me || - !BDG_GET_VAR(b->bdg_ports[dst_port]))) { + else if (unlikely(dst_port == me || + !b->bdg_ports[dst_port])) continue; - } /* get a position in the scratch pad */ d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; d = dst_ents + d_i; - if (d->bq_head == NM_BDG_BATCH) { /* new destination */ + + /* append the first fragment to the list */ + if (d->bq_head == NM_FT_NULL) { /* new destination */ d->bq_head = d->bq_tail = i; /* remember this position to be scanned later */ if (dst_port != NM_BDG_BROADCAST) @@ -2975,23 +3716,30 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, ft[d->bq_tail].ft_next = i; d->bq_tail = i; } + d->bq_len += ft[i].ft_frags; } - /* if there is a broadcast, set ring 0 of all ports to be scanned - * XXX This would be optimized by recording the highest index of active - * ports. + /* + * Broadcast traffic goes to ring 0 on all destinations. + * So we need to add these rings to the list of ports to scan. + * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is + * expensive. We should keep a compact list of active destinations + * so we could shorten this loop. 
*/ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; - if (brddst->bq_head != NM_BDG_BATCH) { - for (i = 0; likely(i < NM_BDG_MAXPORTS); i++) { - uint16_t d_i = i * NM_BDG_MAXRINGS; - if (unlikely(i == me) || !BDG_GET_VAR(b->bdg_ports[i])) + if (brddst->bq_head != NM_FT_NULL) { + for (j = 0; likely(j < b->bdg_active_ports); j++) { + uint16_t d_i; + i = b->bdg_port_index[j]; + if (unlikely(i == me)) continue; - else if (dst_ents[d_i].bq_head == NM_BDG_BATCH) + d_i = i * NM_BDG_MAXRINGS; + if (dst_ents[d_i].bq_head == NM_FT_NULL) dsts[num_dsts++] = d_i; } } + ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); /* second pass: scan destinations (XXX will be modular somehow) */ for (i = 0; i < num_dsts; i++) { struct ifnet *dst_ifp; @@ -2999,168 +3747,281 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, struct netmap_kring *kring; struct netmap_ring *ring; u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; - int howmany, retry = netmap_txsync_retry; + u_int needed, howmany; + int retry = netmap_txsync_retry; struct nm_bdg_q *d; + uint32_t my_start = 0, lease_idx = 0; + int nrings; d_i = dsts[i]; + ND("second pass %d port %d", i, d_i); d = dst_ents + d_i; - dst_na = BDG_GET_VAR(b->bdg_ports[d_i/NM_BDG_MAXRINGS]); + // XXX fix the division + dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; /* protect from the lookup function returning an inactive * destination port */ if (unlikely(dst_na == NULL)) - continue; - else if (dst_na->na_flags & NAF_SW_ONLY) - continue; + goto cleanup; + if (dst_na->na_flags & NAF_SW_ONLY) + goto cleanup; dst_ifp = dst_na->ifp; /* * The interface may be in !netmap mode in two cases: * - when na is attached but not activated yet; * - when na is being deactivated but is still attached. */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) - continue; + if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { + ND("not in netmap mode!"); + goto cleanup; + } /* there is at least one either unicast or broadcast packet */ brd_next = brddst->bq_head; next = d->bq_head; + /* we need to reserve this many slots. If fewer are + * available, some packets will be dropped. + * Packets may have multiple fragments, so we may not use + * there is a chance that we may not use all of the slots + * we have claimed, so we will need to handle the leftover + * ones when we regain the lock. + */ + needed = d->bq_len + brddst->bq_len; is_vp = nma_is_vp(dst_na); + ND(5, "pass 2 dst %d is %x %s", + i, d_i, is_vp ? "virtual" : "nic/host"); dst_nr = d_i & (NM_BDG_MAXRINGS-1); if (is_vp) { /* virtual port */ - if (dst_nr >= dst_na->num_rx_rings) - dst_nr = dst_nr % dst_na->num_rx_rings; - kring = &dst_na->rx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - dst_na->nm_lock(dst_ifp, NETMAP_RX_LOCK, dst_nr); - j = kring->nr_hwcur + kring->nr_hwavail; - if (j > lim) - j -= kring->nkr_num_slots; - howmany = lim - kring->nr_hwavail; - } else { /* hw or sw adapter */ - if (dst_nr >= dst_na->num_tx_rings) - dst_nr = dst_nr % dst_na->num_tx_rings; - kring = &dst_na->tx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - dst_na->nm_lock(dst_ifp, NETMAP_TX_LOCK, dst_nr); + nrings = dst_na->num_rx_rings; + } else { + nrings = dst_na->num_tx_rings; + } + if (dst_nr >= nrings) + dst_nr = dst_nr % nrings; + kring = is_vp ? 
&dst_na->rx_rings[dst_nr] : + &dst_na->tx_rings[dst_nr]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + retry: + + /* reserve the buffers in the queue and an entry + * to report completion, and drop lock. + * XXX this might become a helper function. + */ + mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) { + mtx_unlock(&kring->q_lock); + goto cleanup; + } + /* on physical interfaces, do a txsync to recover + * slots for packets already transmitted. + * XXX maybe we could be optimistic and rely on a retry + * in case of failure. + */ + if (nma_is_hw(dst_na)) { dst_na->nm_txsync(dst_ifp, dst_nr, 0); - /* see nm_bdg_flush() */ - j = kring->nr_hwcur; - howmany = kring->nr_hwavail; } - while (howmany-- > 0) { + my_start = j = kring->nkr_hwlease; + howmany = nm_kr_space(kring, is_vp); + if (needed < howmany) + howmany = needed; + lease_idx = nm_kr_lease(kring, howmany, is_vp); + mtx_unlock(&kring->q_lock); + + /* only retry if we need more than available slots */ + if (retry && needed <= howmany) + retry = 0; + + /* copy to the destination queue */ + while (howmany > 0) { struct netmap_slot *slot; - struct nm_bdg_fwd *ft_p; + struct nm_bdg_fwd *ft_p, *ft_end; + u_int cnt; - /* our 'NULL' is always higher than valid indexes + /* find the queue from which we pick next packet. + * NM_FT_NULL is always higher than valid indexes * so we never dereference it if the other list - * has packets (and if both are NULL we never + * has packets (and if both are empty we never * get here). */ if (next < brd_next) { ft_p = ft + next; next = ft_p->ft_next; - ND("j %d uni %d next %d %d", - j, ft_p - ft, next, brd_next); } else { /* insert broadcast */ ft_p = ft + brd_next; brd_next = ft_p->ft_next; - ND("j %d brd %d next %d %d", - j, ft_p - ft, next, brd_next); } - slot = &ring->slot[j]; - ND("send %d %d bytes at %s:%d", i, ft_p->ft_len, dst_ifp->if_xname, j); - if (ft_p->ft_flags & NS_INDIRECT) { - ND("copying from INDIRECT source"); - copyin(ft_p->ft_buf, NMB(slot), - (ft_p->ft_len + 63) & ~63); - } else { - pkt_copy(ft_p->ft_buf, NMB(slot), ft_p->ft_len); - } - slot->len = ft_p->ft_len; - j = unlikely(j == lim) ? 0: j + 1; /* XXX to be macro-ed */ - sent++; + cnt = ft_p->ft_frags; // cnt > 0 + if (unlikely(cnt > howmany)) + break; /* no more space */ + howmany -= cnt; + if (netmap_verbose && cnt > 1) + RD(5, "rx %d frags to %d", cnt, j); + ft_end = ft_p + cnt; + do { + void *dst, *src = ft_p->ft_buf; + size_t len = (ft_p->ft_len + 63) & ~63; + + slot = &ring->slot[j]; + dst = BDG_NMB(dst_na->nm_mem, slot); + /* round to a multiple of 64 */ + + ND("send %d %d bytes at %s:%d", + i, ft_p->ft_len, dst_ifp->if_xname, j); + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, len)) { + // invalid user pointer, pretend len is 0 + ft_p->ft_len = 0; + } + } else { + //memcpy(dst, src, len); + pkt_copy(src, dst, (int)len); + } + slot->len = ft_p->ft_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + ft_p++; + sent++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ /* are we done ? 
*/ - if (next == NM_BDG_BATCH && brd_next == NM_BDG_BATCH) + if (next == NM_FT_NULL && brd_next == NM_FT_NULL) break; } - if (netmap_verbose && (howmany < 0)) - D("rx ring full on %s", dst_ifp->if_xname); - if (is_vp) { - if (sent) { - kring->nr_hwavail += sent; - selwakeuppri(&kring->si, PI_NET); + { + /* current position */ + uint32_t *p = kring->nkr_leases; /* shorthand */ + uint32_t update_pos; + int still_locked = 1; + + mtx_lock(&kring->q_lock); + if (unlikely(howmany > 0)) { + /* not used all bufs. If i am the last one + * i can recover the slots, otherwise must + * fill them with 0 to mark empty packets. + */ + ND("leftover %d bufs", howmany); + if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { + /* yes i am the last one */ + ND("roll back nkr_hwlease to %d", j); + kring->nkr_hwlease = j; + } else { + while (howmany-- > 0) { + ring->slot[j].len = 0; + ring->slot[j].flags = 0; + j = nm_next(j, lim); + } } - dst_na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, dst_nr); - } else { - if (sent) { - ring->avail -= sent; + } + p[lease_idx] = j; /* report I am done */ + + update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur; + + if (my_start == update_pos) { + /* all slots before my_start have been reported, + * so scan subsequent leases to see if other ranges + * have been completed, and to a selwakeup or txsync. + */ + while (lease_idx != kring->nkr_lease_idx && + p[lease_idx] != NR_NOSLOT) { + j = p[lease_idx]; + p[lease_idx] = NR_NOSLOT; + lease_idx = nm_next(lease_idx, lim); + } + /* j is the new 'write' position. j != my_start + * means there are new buffers to report + */ + if (likely(j != my_start)) { + if (is_vp) { + uint32_t old_avail = kring->nr_hwavail; + + kring->nr_hwavail = (j >= kring->nr_hwcur) ? + j - kring->nr_hwcur : + j + lim + 1 - kring->nr_hwcur; + if (kring->nr_hwavail < old_avail) { + D("avail shrink %d -> %d", + old_avail, kring->nr_hwavail); + } + still_locked = 0; + mtx_unlock(&kring->q_lock); + selwakeuppri(&kring->si, PI_NET); + } else { ring->cur = j; + /* XXX update avail ? */ + still_locked = 0; dst_na->nm_txsync(dst_ifp, dst_nr, 0); + mtx_unlock(&kring->q_lock); + + /* retry to send more packets */ + if (nma_is_hw(dst_na) && retry--) + goto retry; + } } - /* retry to send more packets */ - if (nma_is_hw(dst_na) && howmany < 0 && retry--) - goto retry; - dst_na->nm_lock(dst_ifp, NETMAP_TX_UNLOCK, dst_nr); + } + if (still_locked) + mtx_unlock(&kring->q_lock); } - /* NM_BDG_BATCH means 'no packet' */ - d->bq_head = d->bq_tail = NM_BDG_BATCH; /* cleanup */ +cleanup: + d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ + d->bq_len = 0; } - brddst->bq_head = brddst->bq_tail = NM_BDG_BATCH; /* cleanup */ - BDG_RUNLOCK(b); + brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ + brddst->bq_len = 0; return 0; } /* - * main dispatch routine + * main dispatch routine for the bridge. + * We already know that only one thread is running this. + * we must run nm_bdg_preflush without lock. 
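(The same NIOCREGIF flow sketched earlier also reaches this dispatch code: binding a name with the VALE prefix, e.g. a hypothetical "vale0:1" for port "1" of switch "vale0", attaches the file descriptor to a switch port whose txsync/rxsync are the bdg_netmap_* routines in this file rather than a NIC driver's.)

	/* sketch, reusing req/fd from the previous example */
	strlcpy(req.nr_name, "vale0:1", sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &req);	/* switch and port are created on demand */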
*/ static int -bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int i, j, k, lim = kring->nkr_num_slots - 1; + u_int j, k, lim = kring->nkr_num_slots - 1; k = ring->cur; if (k > lim) return netmap_ring_reinit(kring); - if (do_lock) - na->nm_lock(ifp, NETMAP_TX_LOCK, ring_nr); - if (netmap_bridge <= 0) { /* testing only */ + if (bridge_batch <= 0) { /* testing only */ j = k; // used all goto done; } - if (netmap_bridge > NM_BDG_BATCH) - netmap_bridge = NM_BDG_BATCH; + if (bridge_batch > NM_BDG_BATCH) + bridge_batch = NM_BDG_BATCH; j = nm_bdg_preflush(na, ring_nr, kring, k); - i = k - j; - if (i < 0) - i += kring->nkr_num_slots; - kring->nr_hwavail = kring->nkr_num_slots - 1 - i; if (j != k) D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); + /* k-j modulo ring size is the number of slots processed */ + if (k < j) + k += kring->nkr_num_slots; + kring->nr_hwavail = lim - (k - j); done: kring->nr_hwcur = j; ring->avail = kring->nr_hwavail; - if (do_lock) - na->nm_lock(ifp, NETMAP_TX_UNLOCK, ring_nr); - if (netmap_verbose) - D("%s ring %d lock %d", ifp->if_xname, ring_nr, do_lock); + D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags); return 0; } +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ static int -bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->rx_rings[ring_nr]; @@ -3169,13 +4030,12 @@ bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) u_int k = ring->cur, resvd = ring->reserved; int n; - ND("%s ring %d lock %d avail %d", - ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail); - - if (k > lim) - return netmap_ring_reinit(kring); - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_LOCK, ring_nr); + mtx_lock(&kring->q_lock); + if (k > lim) { + D("ouch dangerous reset!!!"); + n = netmap_ring_reinit(kring); + goto done; + } /* skip past packets that userspace has released */ j = kring->nr_hwcur; /* netmap ring index */ @@ -3194,27 +4054,24 @@ bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) ND("userspace releases %d packets", n); for (n = 0; likely(j != k); n++) { struct netmap_slot *slot = &ring->slot[j]; - void *addr = NMB(slot); + void *addr = BDG_NMB(na->nm_mem, slot); if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); - return netmap_ring_reinit(kring); + D("bad buffer index %d, ignore ?", + slot->buf_idx); } - /* decrease refcount for buffer */ - slot->flags &= ~NS_BUF_CHANGED; - j = unlikely(j == lim) ? 
0 : j + 1; + j = nm_next(j, lim); } kring->nr_hwavail -= n; kring->nr_hwcur = k; } /* tell userspace that there are new packets */ ring->avail = kring->nr_hwavail - resvd; - - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); - return 0; + n = 0; +done: + mtx_unlock(&kring->q_lock); + return n; } @@ -3227,18 +4084,20 @@ bdg_netmap_attach(struct netmap_adapter *arg) bzero(&na, sizeof(na)); na.ifp = arg->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; na.num_tx_rings = arg->num_tx_rings; na.num_rx_rings = arg->num_rx_rings; - na.num_tx_desc = NM_BRIDGE_RINGSIZE; - na.num_rx_desc = NM_BRIDGE_RINGSIZE; + na.num_tx_desc = arg->num_tx_desc; + na.num_rx_desc = arg->num_rx_desc; na.nm_txsync = bdg_netmap_txsync; na.nm_rxsync = bdg_netmap_rxsync; na.nm_register = bdg_netmap_reg; + na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname, + na.num_tx_rings, na.num_tx_desc, + na.num_rx_rings, na.num_rx_desc); netmap_attach(&na, na.num_tx_rings); } -#endif /* NM_BRIDGE */ static struct cdev *netmap_dev; /* /dev/netmap character device. */ @@ -3254,9 +4113,11 @@ static struct cdev *netmap_dev; /* /dev/netmap character device. */ static int netmap_init(void) { - int error; + int i, error; - error = netmap_memory_init(); + NMG_LOCK_INIT(); + + error = netmap_mem_init(); if (error != 0) { printf("netmap: unable to initialize the memory allocator.\n"); return (error); @@ -3265,16 +4126,9 @@ netmap_init(void) netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); -#ifdef NM_BRIDGE - { - int i; - mtx_init(&netmap_bridge_mutex, "netmap_bridge_mutex", - MTX_NETWORK_LOCK, MTX_DEF); bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ for (i = 0; i < NM_BRIDGES; i++) - rw_init(&nm_bridges[i].bdg_lock, "bdg lock"); - } -#endif + BDG_RWINIT(&nm_bridges[i]); return (error); } @@ -3288,7 +4142,8 @@ static void netmap_fini(void) { destroy_dev(netmap_dev); - netmap_memory_fini(); + netmap_mem_fini(); + NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index e246e146c8e2..12bd882521b3 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -35,26 +35,28 @@ #if defined(__FreeBSD__) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect((long)!!(x), 1L) +#define unlikely(x) __builtin_expect((long)!!(x), 0L) #define NM_LOCK_T struct mtx -#define NM_RWLOCK_T struct rwlock #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) +#define NM_ATOMIC_T volatile int + #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h -#define NM_RWLOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define NM_SEND_UP(ifp, m) netif_rx(m) +#define NM_ATOMIC_T volatile long unsigned int + #ifndef DEV_NETMAP #define DEV_NETMAP -#endif +#endif /* DEV_NETMAP */ /* * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable). @@ -111,6 +113,8 @@ struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; +const char *nm_dump_buf(char *p, int len, int lim, char *dst); + /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. @@ -128,26 +132,120 @@ struct netmap_priv_d; * the next empty buffer as known by the hardware (next_to_check or so). 
* TX rings: hwcur + hwofs coincides with next_to_send * + * Clients cannot issue concurrent syscall on a ring. The system + * detects this and reports an error using two flags, + * NKR_WBUSY and NKR_RBUSY * For received packets, slot->flags is set to nkr_slot_flags * so we can provide a proper initial value (e.g. set NS_FORWARD * when operating in 'transparent' mode). + * + * The following fields are used to implement lock-free copy of packets + * from input to output ports in VALE switch: + * nkr_hwlease buffer after the last one being copied. + * A writer in nm_bdg_flush reserves N buffers + * from nr_hwlease, advances it, then does the + * copy outside the lock. + * In RX rings (used for VALE ports), + * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1 + * In TX rings (used for NIC or host stack ports) + * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail + * nkr_leases array of nkr_num_slots where writers can report + * completion of their block. NR_NOSLOT (~0) indicates + * that the writer has not finished yet + * nkr_lease_idx index of next free slot in nr_leases, to be assigned + * + * The kring is manipulated by txsync/rxsync and generic netmap function. + * q_lock is used to arbitrate access to the kring from within the netmap + * code, and this and other protections guarantee that there is never + * more than 1 concurrent call to txsync or rxsync. So we are free + * to manipulate the kring from within txsync/rxsync without any extra + * locks. */ struct netmap_kring { struct netmap_ring *ring; - u_int nr_hwcur; - int nr_hwavail; - u_int nr_kflags; /* private driver flags */ + uint32_t nr_hwcur; + uint32_t nr_hwavail; + uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. - u_int nkr_num_slots; + uint32_t nkr_num_slots; + int32_t nkr_hwofs; /* offset between NIC and netmap ring */ uint16_t nkr_slot_flags; /* initial value for flags */ - int nkr_hwofs; /* offset between NIC and netmap ring */ struct netmap_adapter *na; struct nm_bdg_fwd *nkr_ft; + uint32_t *nkr_leases; +#define NR_NOSLOT ((uint32_t)~0) + uint32_t nkr_hwlease; + uint32_t nkr_lease_idx; + NM_SELINFO_T si; /* poll/select wait queue */ - NM_LOCK_T q_lock; /* used if no device lock available */ + NM_LOCK_T q_lock; /* protects kring and ring. */ + NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + + volatile int nkr_stopped; } __attribute__((__aligned__(64))); + +/* return the next index, with wraparound */ +static inline uint32_t +nm_next(uint32_t i, uint32_t lim) +{ + return unlikely (i == lim) ? 0 : i + 1; +} + +/* + * + * Here is the layout for the Rx and Tx rings. + + RxRING TxRING + + +-----------------+ +-----------------+ + | | | | + |XXX free slot XXX| |XXX free slot XXX| + +-----------------+ +-----------------+ + | |<-hwcur | |<-hwcur + | reserved h | | (ready | + +----------- w -+ | to be | + cur->| a | | sent) h | + | v | +---------- w | + | a | cur->| (being a | + | i | | prepared) v | + | avail l | | a | + +-----------------+ + a ------ i + + | | ... | v l |<-hwlease + | (being | ... | a | ... + | prepared) | ... | i | ... + +-----------------+ ... | l | ... + | |<-hwlease +-----------------+ + | | | | + | | | | + | | | | + | | | | + +-----------------+ +-----------------+ + + * The cur/avail (user view) and hwcur/hwavail (kernel view) + * are used in the normal operation of the card. 
+ * + * When a ring is the output of a switch port (Rx ring for + * a VALE port, Tx ring for the host stack or NIC), slots + * are reserved in blocks through 'hwlease' which points + * to the next unused slot. + * On an Rx ring, hwlease is always after hwavail, + * and completions cause avail to advance. + * On a Tx ring, hwlease is always between cur and hwavail, + * and completions cause cur to advance. + * + * nm_kr_space() returns the maximum number of slots that + * can be assigned. + * nm_kr_lease() reserves the required number of buffers, + * advances nkr_hwlease and also returns an entry in + * a circular array where completions should be reported. + */ + + + + + /* * This struct extends the 'struct adapter' (or * equivalent) device descriptor. It contains all fields needed to @@ -167,6 +265,13 @@ struct netmap_adapter { * useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ +#define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when + * forwarding packets coming from this + * interface + */ +#define NAF_MEM_OWNER 8 /* the adapter is responsible for the + * deallocation of the memory allocator + */ int refcount; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ @@ -179,9 +284,6 @@ struct netmap_adapter { int na_single; /* threads attached to a single hw queue */ int na_multi; /* threads attached to multiple hw queues */ - int separate_locks; /* set if the interface suports different - locks for rx, tx and core. */ - u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ @@ -210,9 +312,11 @@ struct netmap_adapter { NM_LOCK_T core_lock; /* used if no device lock available */ int (*nm_register)(struct ifnet *, int onoff); - void (*nm_lock)(struct ifnet *, int what, u_int ringid); - int (*nm_txsync)(struct ifnet *, u_int ring, int lock); - int (*nm_rxsync)(struct ifnet *, u_int ring, int lock); + + int (*nm_txsync)(struct ifnet *, u_int ring, int flags); + int (*nm_rxsync)(struct ifnet *, u_int ring, int flags); +#define NAF_FORCE_READ 1 +#define NAF_FORCE_RECLAIM 2 /* return configuration information */ int (*nm_config)(struct ifnet *, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd); @@ -236,12 +340,105 @@ struct netmap_adapter { * This is only done when physical interfaces are attached to a bridge. */ struct netmap_priv_d *na_kpriv; + + /* memory allocator */ + struct netmap_mem_d *nm_mem; #ifdef linux struct net_device_ops nm_ndo; #endif /* linux */ }; /* + * Available space in the ring. 
+ */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwavail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + +/* return update position */ +static inline uint32_t +nm_kr_rxpos(struct netmap_kring *k) +{ + uint32_t pos = k->nr_hwcur + k->nr_hwavail; + if (pos >= k->nkr_num_slots) + pos -= k->nkr_num_slots; +#if 0 + if (pos >= k->nkr_num_slots || + k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwavail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return pos; +} + + +/* make a lease on the kring for N positions. return the + * lease index + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwavail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + + +/* + * XXX NETMAP_DELETING() is unused + * * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP) * and refcount gives the status of the interface, namely: * @@ -256,25 +453,6 @@ struct netmap_adapter { #define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) -/* - * parameters for (*nm_lock)(adapter, what, index) - */ -enum { - NETMAP_NO_LOCK = 0, - NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK, - NETMAP_TX_LOCK, NETMAP_TX_UNLOCK, - NETMAP_RX_LOCK, NETMAP_RX_UNLOCK, -#ifdef __FreeBSD__ -#define NETMAP_REG_LOCK NETMAP_CORE_LOCK -#define NETMAP_REG_UNLOCK NETMAP_CORE_UNLOCK -#else - NETMAP_REG_LOCK, NETMAP_REG_UNLOCK -#endif -}; - -/* How to handle locking support in netmap_rx_irq/netmap_tx_irq */ -#define NETMAP_LOCKED_ENTER 0x10000000 /* already locked on enter */ -#define NETMAP_LOCKED_EXIT 0x20000000 /* keep locked on exit */ /* * The following are support routines used by individual drivers to @@ -285,7 +463,7 @@ enum { * * netmap_detach() frees the memory allocated by netmap_attach(). * - * netmap_start() replaces the if_transmit routine of the interface, + * netmap_transmit() replaces the if_transmit routine of the interface, * and is used to intercept packets coming from the stack. 
* * netmap_load_map/netmap_reload_map are helper routines to set/reset @@ -294,14 +472,16 @@ enum { * netmap_reset() is a helper routine to be called in the driver * when reinitializing a ring. */ -int netmap_attach(struct netmap_adapter *, int); +int netmap_attach(struct netmap_adapter *, u_int); void netmap_detach(struct ifnet *); -int netmap_start(struct ifnet *, struct mbuf *); +int netmap_transmit(struct ifnet *, struct mbuf *); enum txrx { NR_RX = 0, NR_TX = 1 }; struct netmap_slot *netmap_reset(struct netmap_adapter *na, - enum txrx tx, int n, u_int new_cur); + enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); + /* * The following bridge-related interfaces are used by other kernel modules * In the version that only supports unicast or broadcast, the lookup @@ -451,6 +631,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) #endif /* linux */ + /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) */ @@ -515,7 +696,15 @@ PNMB(struct netmap_slot *slot, uint64_t *pp) } /* default functions to handle rx/tx interrupts */ -int netmap_rx_irq(struct ifnet *, int, int *); +int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +#ifdef __FreeBSD__ +MALLOC_DECLARE(M_NETMAP); +#endif /* __FreeBSD__ */ + + +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); + #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index dcf4b06d874d..a78904216057 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -23,108 +23,49 @@ * SUCH DAMAGE. */ -/* - * $FreeBSD$ - * - * (New) memory allocator for netmap - */ - -/* - * This allocator creates three memory pools: - * nm_if_pool for the struct netmap_if - * nm_ring_pool for the struct netmap_ring - * nm_buf_pool for the packet buffers. - * - * that contain netmap objects. Each pool is made of a number of clusters, - * multiple of a page size, each containing an integer number of objects. - * The clusters are contiguous in user space but not in the kernel. - * Only nm_buf_pool needs to be dma-able, - * but for convenience use the same type of allocator for all. - * - * Once mapped, the three pools are exported to userspace - * as a contiguous block, starting from nm_if_pool. Each - * cluster (and pool) is an integral number of pages. - * [ . . . ][ . . . . . .][ . . . . . . . . . .] - * nm_if nm_ring nm_buf - * - * The userspace areas contain offsets of the objects in userspace. - * When (at init time) we write these offsets, we find out the index - * of the object, and from there locate the offset from the beginning - * of the region. - * - * The invididual allocators manage a pool of memory for objects of - * the same size. - * The pool is split into smaller clusters, whose size is a - * multiple of the page size. The cluster size is chosen - * to minimize the waste for a given max cluster size - * (we do it by brute force, as we have relatively few objects - * per cluster). - * - * Objects are aligned to the cache line (64 bytes) rounding up object - * sizes when needed. A bitmap contains the state of each object. - * Allocation scans the bitmap; this is done only on attach, so we are not - * too worried about performance - * - * For each allocator we can define (thorugh sysctl) the size and - * number of each object. 
Memory is allocated at the first use of a - * netmap file descriptor, and can be freed when all such descriptors - * have been released (including unmapping the memory). - * If memory is scarce, the system tries to get as much as possible - * and the sysctl values reflect the actual allocation. - * Together with desired values, the sysctl export also absolute - * min and maximum values that cannot be overridden. - * - * struct netmap_if: - * variable size, max 16 bytes per ring pair plus some fixed amount. - * 1024 bytes should be large enough in practice. - * - * In the worst case we have one netmap_if per ring in the system. - * - * struct netmap_ring - * variable size, 8 byte per slot plus some fixed amount. - * Rings can be large (e.g. 4k slots, or >32Kbytes). - * We default to 36 KB (9 pages), and a few hundred rings. - * - * struct netmap_buffer - * The more the better, both because fast interfaces tend to have - * many slots, and because we may want to use buffers to store - * packets in userspace avoiding copies. - * Must contain a full frame (eg 1518, or more for vlans, jumbo - * frames etc.) plus be nicely aligned, plus some NICs restrict - * the size to multiple of 1K or so. Default to 2K - */ +#ifdef linux +#include "bsd_glue.h" +#endif /* linux */ -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#ifdef __APPLE__ +#include "osx_glue.h" +#endif /* __APPLE__ */ + +#ifdef __FreeBSD__ +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <vm/vm.h> /* vtophys */ +#include <vm/pmap.h> /* vtophys */ +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> +#include <machine/bus.h> /* bus_dmamap_* */ + +#endif /* __FreeBSD__ */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include "netmap_mem2.h" #ifdef linux -// XXX a mtx would suffice here 20130415 lr -// #define NMA_LOCK_T safe_spinlock_t -#define NMA_LOCK_T struct semaphore -#define NMA_LOCK_INIT() sema_init(&nm_mem.nm_mtx, 1) -#define NMA_LOCK_DESTROY() -#define NMA_LOCK() down(&nm_mem.nm_mtx) -#define NMA_UNLOCK() up(&nm_mem.nm_mtx) +#define NMA_LOCK_INIT(n) sema_init(&(n)->nm_mtx, 1) +#define NMA_LOCK_DESTROY(n) +#define NMA_LOCK(n) down(&(n)->nm_mtx) +#define NMA_UNLOCK(n) up(&(n)->nm_mtx) #else /* !linux */ -#define NMA_LOCK_T struct mtx -#define NMA_LOCK_INIT() mtx_init(&nm_mem.nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF) -#define NMA_LOCK_DESTROY() mtx_destroy(&nm_mem.nm_mtx) -#define NMA_LOCK() mtx_lock(&nm_mem.nm_mtx) -#define NMA_UNLOCK() mtx_unlock(&nm_mem.nm_mtx) +#define NMA_LOCK_INIT(n) mtx_init(&(n)->nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF) +#define NMA_LOCK_DESTROY(n) mtx_destroy(&(n)->nm_mtx) +#define NMA_LOCK(n) mtx_lock(&(n)->nm_mtx) +#define NMA_UNLOCK(n) mtx_unlock(&(n)->nm_mtx) #endif /* linux */ -enum { - NETMAP_IF_POOL = 0, - NETMAP_RING_POOL, - NETMAP_BUF_POOL, - NETMAP_POOLS_NR -}; - - -struct netmap_obj_params { - u_int size; - u_int num; -}; - struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { @@ -142,47 +83,15 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }; -struct netmap_obj_pool { - char name[16]; /* name of the allocator */ - u_int objtotal; /* actual total number of objects. */ - u_int objfree; /* number of free objects. 
*/ - u_int clustentries; /* actual objects per cluster */ - - /* limits */ - u_int objminsize; /* minimum object size */ - u_int objmaxsize; /* maximum object size */ - u_int nummin; /* minimum number of objects */ - u_int nummax; /* maximum number of objects */ - - /* the total memory space is _numclusters*_clustsize */ - u_int _numclusters; /* how many clusters */ - u_int _clustsize; /* cluster size */ - u_int _objsize; /* actual object size */ - - u_int _memtotal; /* _numclusters*_clustsize */ - struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ - uint32_t *bitmap; /* one bit per buffer, 1 means free */ - uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ -}; - - -struct netmap_mem_d { - NMA_LOCK_T nm_mtx; /* protect the allocator */ - u_int nm_totalsize; /* shorthand */ - - int finalized; /* !=0 iff preallocation done */ - int lasterr; /* last error for curr config */ - int refcount; /* existing priv structures */ - /* the three allocators */ - struct netmap_obj_pool pools[NETMAP_POOLS_NR]; -}; - /* * nm_mem is the memory allocator used for all physical interfaces * running in netmap mode. * Virtual (VALE) ports will have each its own allocator. */ -static struct netmap_mem_d nm_mem = { /* Our memory allocator. */ +static int netmap_mem_global_config(struct netmap_mem_d *nmd); +static int netmap_mem_global_finalize(struct netmap_mem_d *nmd); +static void netmap_mem_global_deref(struct netmap_mem_d *nmd); +struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .pools = { [NETMAP_IF_POOL] = { .name = "netmap_if", @@ -206,15 +115,55 @@ static struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .nummax = 1000000, /* one million! */ }, }, + .config = netmap_mem_global_config, + .finalize = netmap_mem_global_finalize, + .deref = netmap_mem_global_deref, }; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ +/* blueprint for the private memory allocators */ +static int netmap_mem_private_config(struct netmap_mem_d *nmd); +static int netmap_mem_private_finalize(struct netmap_mem_d *nmd); +static void netmap_mem_private_deref(struct netmap_mem_d *nmd); +const struct netmap_mem_d nm_blueprint = { + .pools = { + [NETMAP_IF_POOL] = { + .name = "%s_if", + .objminsize = sizeof(struct netmap_if), + .objmaxsize = 4096, + .nummin = 1, + .nummax = 10, + }, + [NETMAP_RING_POOL] = { + .name = "%s_ring", + .objminsize = sizeof(struct netmap_ring), + .objmaxsize = 32*PAGE_SIZE, + .nummin = 2, + .nummax = 1024, + }, + [NETMAP_BUF_POOL] = { + .name = "%s_buf", + .objminsize = 64, + .objmaxsize = 65536, + .nummin = 4, + .nummax = 1000000, /* one million! */ + }, + }, + .config = netmap_mem_private_config, + .finalize = netmap_mem_private_finalize, + .deref = netmap_mem_private_deref, + + .flags = NETMAP_MEM_PRIVATE, +}; + /* memory allocator related sysctls */ #define STRINGIFY(x) #x + #define DECLARE_SYSCTLS(id, name) \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ @@ -225,43 +174,71 @@ struct lut_entry *netmap_buffer_lut; /* exported */ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") +SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); /* - * Convert a userspace offset to a physical address. 
- * XXX only called in the FreeBSD's netmap_mmap() - * because in linux we map everything at once. - * * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. */ -static inline vm_paddr_t -netmap_ofstophys(vm_offset_t offset) +vm_paddr_t +netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) { int i; - vm_offset_t o = offset; - struct netmap_obj_pool *p = nm_mem.pools; + vm_ooffset_t o = offset; + vm_paddr_t pa; + struct netmap_obj_pool *p; + + NMA_LOCK(nmd); + p = nmd->pools; - for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i]._memtotal, i++) { - if (offset >= p[i]._memtotal) + for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i].memtotal, i++) { + if (offset >= p[i].memtotal) continue; // now lookup the cluster's address - return p[i].lut[offset / p[i]._objsize].paddr + + pa = p[i].lut[offset / p[i]._objsize].paddr + offset % p[i]._objsize; + NMA_UNLOCK(nmd); + return pa; } /* this is only in case of errors */ D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o, - p[NETMAP_IF_POOL]._memtotal, - p[NETMAP_IF_POOL]._memtotal - + p[NETMAP_RING_POOL]._memtotal, - p[NETMAP_IF_POOL]._memtotal - + p[NETMAP_RING_POOL]._memtotal - + p[NETMAP_BUF_POOL]._memtotal); + p[NETMAP_IF_POOL].memtotal, + p[NETMAP_IF_POOL].memtotal + + p[NETMAP_RING_POOL].memtotal, + p[NETMAP_IF_POOL].memtotal + + p[NETMAP_RING_POOL].memtotal + + p[NETMAP_BUF_POOL].memtotal); + NMA_UNLOCK(nmd); return 0; // XXX bad address } +int +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) +{ + int error = 0; + NMA_LOCK(nmd); + error = nmd->config(nmd); + if (error) + goto out; + if (nmd->flags & NETMAP_MEM_FINALIZED) { + *size = nmd->nm_totalsize; + } else { + int i; + *size = 0; + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = nmd->pools + i; + *size += (p->_numclusters * p->_clustsize); + } + } + *memflags = nmd->flags; +out: + NMA_UNLOCK(nmd); + return error; +} + /* * we store objects by kernel address, need to find the offset * within the pool to export the value to userspace. 
@@ -271,7 +248,7 @@ netmap_ofstophys(vm_offset_t offset) static ssize_t netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) { - int i, k = p->clustentries, n = p->objtotal; + int i, k = p->_clustentries, n = p->objtotal; ssize_t ofs = 0; for (i = 0; i < n; i += k, ofs += p->_clustsize) { @@ -292,25 +269,35 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) } /* Helper functions which convert virtual addresses to offsets */ -#define netmap_if_offset(v) \ - netmap_obj_offset(&nm_mem.pools[NETMAP_IF_POOL], (v)) +#define netmap_if_offset(n, v) \ + netmap_obj_offset(&(n)->pools[NETMAP_IF_POOL], (v)) -#define netmap_ring_offset(v) \ - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ - netmap_obj_offset(&nm_mem.pools[NETMAP_RING_POOL], (v))) +#define netmap_ring_offset(n, v) \ + ((n)->pools[NETMAP_IF_POOL].memtotal + \ + netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) -#define netmap_buf_offset(v) \ - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ - nm_mem.pools[NETMAP_RING_POOL]._memtotal + \ - netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v))) +#define netmap_buf_offset(n, v) \ + ((n)->pools[NETMAP_IF_POOL].memtotal + \ + (n)->pools[NETMAP_RING_POOL].memtotal + \ + netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) +ssize_t +netmap_mem_if_offset(struct netmap_mem_d *nmd, const void *addr) +{ + ssize_t v; + NMA_LOCK(nmd); + v = netmap_if_offset(nmd, addr); + NMA_UNLOCK(nmd); + return v; +} + /* * report the index, and use start position as a hint, * otherwise buffer allocation becomes terribly expensive. */ static void * -netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t *index) +netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ uint32_t mask, j; /* slot counter */ @@ -374,10 +361,10 @@ netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { - int i, j, n = p->_memtotal / p->_clustsize; + u_int i, j, n = p->numclusters; - for (i = 0, j = 0; i < n; i++, j += p->clustentries) { - void *base = p->lut[i * p->clustentries].vaddr; + for (i = 0, j = 0; i < n; i++, j += p->_clustentries) { + void *base = p->lut[i * p->_clustentries].vaddr; ssize_t relofs = (ssize_t) vaddr - (ssize_t) base; /* Given address, is out of the scope of the current cluster.*/ @@ -385,7 +372,7 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) continue; j = j + relofs / p->_objsize; - KASSERT(j != 0, ("Cannot free object 0")); + /* KASSERT(j != 0, ("Cannot free object 0")); */ netmap_obj_free(p, j); return; } @@ -393,32 +380,32 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) vaddr, p->name); } -#define netmap_if_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_IF_POOL], len, NULL, NULL) -#define netmap_if_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_IF_POOL], (v)) -#define netmap_ring_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_RING_POOL], len, NULL, NULL) -#define netmap_ring_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_RING_POOL], (v)) -#define netmap_buf_malloc(_pos, _index) \ - netmap_obj_malloc(&nm_mem.pools[NETMAP_BUF_POOL], NETMAP_BUF_SIZE, _pos, _index) +#define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL) +#define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v)) +#define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL) +#define netmap_ring_free(n, v) 
netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v)) +#define netmap_buf_malloc(n, _pos, _index) \ + netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) /* Return the index associated to the given packet buffer */ -#define netmap_buf_index(v) \ - (netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v)) / nm_mem.pools[NETMAP_BUF_POOL]._objsize) +#define netmap_buf_index(n, v) \ + (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) /* Return nonzero on error */ static int -netmap_new_bufs(struct netmap_if *nifp, +netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_if *nifp, struct netmap_slot *slot, u_int n) { - struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; - int i = 0; /* slot counter */ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { - void *vaddr = netmap_buf_malloc(&pos, &index); + void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { D("unable to locate empty packet buffer"); goto cleanup; @@ -446,10 +433,11 @@ cleanup: static void -netmap_free_buf(struct netmap_if *nifp, uint32_t i) +netmap_free_buf(struct netmap_mem_d *nmd, struct netmap_if *nifp, uint32_t i) { - struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + (void)nifp; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; @@ -460,16 +448,19 @@ netmap_free_buf(struct netmap_if *nifp, uint32_t i) static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { + if (p == NULL) return; if (p->bitmap) free(p->bitmap, M_NETMAP); p->bitmap = NULL; if (p->lut) { - int i; - for (i = 0; i < p->objtotal; i += p->clustentries) { + u_int i; + size_t sz = p->_clustsize; + + for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); + contigfree(p->lut[i].vaddr, sz, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux @@ -479,6 +470,10 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) #endif } p->lut = NULL; + p->objtotal = 0; + p->memtotal = 0; + p->numclusters = 0; + p->objfree = 0; } /* @@ -496,8 +491,7 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) * We receive a request for objtotal objects, of size objsize each. * Internally we may round up both numbers, as we allocate objects * in small clusters multiple of the page size. - * In the allocator we don't need to store the objsize, - * but we do need to keep track of objtotal' and clustentries, + * We need to keep track of objtotal and clustentries, * as they are needed when freeing memory. 
* * XXX note -- userspace needs the buffers to be contiguous, @@ -509,16 +503,21 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) static int netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize) { - int i, n; + int i; u_int clustsize; /* the cluster size, multiple of page size */ u_int clustentries; /* how many objects per entry */ + /* we store the current request, so we can + * detect configuration changes later */ + p->r_objtotal = objtotal; + p->r_objsize = objsize; + #define MAX_CLUSTSIZE (1<<17) #define LINE_ROUND 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); - goto error; + return EINVAL; } /* make sure objsize is a multiple of LINE_ROUND */ i = (objsize & (LINE_ROUND - 1)); @@ -529,12 +528,12 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj if (objsize < p->objminsize || objsize > p->objmaxsize) { D("requested objsize %d out of range [%d, %d]", objsize, p->objminsize, p->objmaxsize); - goto error; + return EINVAL; } if (objtotal < p->nummin || objtotal > p->nummax) { D("requested objtotal %d out of range [%d, %d]", objtotal, p->nummin, p->nummax); - goto error; + return EINVAL; } /* * Compute number of objects using a brute-force approach: @@ -568,22 +567,15 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj * The number of clusters is n = ceil(objtotal/clustentries) * objtotal' = n * clustentries */ - p->clustentries = clustentries; + p->_clustentries = clustentries; p->_clustsize = clustsize; - n = (objtotal + clustentries - 1) / clustentries; - p->_numclusters = n; - p->objtotal = n * clustentries; - p->objfree = p->objtotal - 2; /* obj 0 and 1 are reserved */ - p->_memtotal = p->_numclusters * p->_clustsize; - p->_objsize = objsize; + p->_numclusters = (objtotal + clustentries - 1) / clustentries; - return 0; - -error: + /* actual values (may be larger than requested) */ p->_objsize = objsize; - p->objtotal = objtotal; + p->_objtotal = p->_numclusters * clustentries; - return EINVAL; + return 0; } @@ -591,7 +583,12 @@ error: static int netmap_finalize_obj_allocator(struct netmap_obj_pool *p) { - int i, n; + int i; /* must be signed */ + size_t n; + + /* optimistically assume we have enough memory */ + p->numclusters = p->_numclusters; + p->objtotal = p->_objtotal; n = sizeof(struct lut_entry) * p->objtotal; #ifdef linux @@ -600,7 +597,7 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); #endif if (p->lut == NULL) { - D("Unable to create lookup table (%d bytes) for '%s'", n, p->name); + D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); goto clean; } @@ -608,7 +605,7 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) n = (p->objtotal + 31) / 32; p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_NOWAIT | M_ZERO); if (p->bitmap == NULL) { - D("Unable to create bitmap (%d entries) for allocator '%s'", n, + D("Unable to create bitmap (%d entries) for allocator '%s'", (int)n, p->name); goto clean; } @@ -617,31 +614,34 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) /* * Allocate clusters, init pointers and bitmap */ - for (i = 0; i < p->objtotal;) { - int lim = i + p->clustentries; + + n = p->_clustsize; + for (i = 0; i < (int)p->objtotal;) { + int lim = i + p->_clustentries; char *clust; - clust = contigmalloc(p->_clustsize, M_NETMAP, M_NOWAIT | M_ZERO, - 0, -1UL, PAGE_SIZE, 0); + clust = 
contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, + (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { /* * If we get here, there is a severe memory shortage, * so halve the allocated memory to reclaim some. - * XXX check boundaries */ D("Unable to create cluster at %d for '%s' allocator", i, p->name); + if (i < 2) /* nothing to halve */ + goto out; lim = i / 2; for (i--; i >= lim; i--) { p->bitmap[ (i>>5) ] &= ~( 1 << (i & 31) ); - if (i % p->clustentries == 0 && p->lut[i].vaddr) + if (i % p->_clustentries == 0 && p->lut[i].vaddr) contigfree(p->lut[i].vaddr, - p->_clustsize, M_NETMAP); + n, M_NETMAP); } + out: p->objtotal = i; - p->objfree = p->objtotal - 2; - p->_numclusters = i / p->clustentries; - p->_memtotal = p->_numclusters * p->_clustsize; + /* we may have stopped in the middle of a cluster */ + p->numclusters = (i + p->_clustentries - 1) / p->_clustentries; break; } for (; i < lim; i++, clust += p->_objsize) { @@ -650,11 +650,14 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->lut[i].paddr = vtophys(clust); } } - p->bitmap[0] = ~3; /* objs 0 and 1 is always busy */ + p->objfree = p->objtotal; + p->memtotal = p->numclusters * p->_clustsize; + if (p->objfree == 0) + goto clean; if (netmap_verbose) D("Pre-allocated %d clusters (%d/%dKB) for '%s'", - p->_numclusters, p->_clustsize >> 10, - p->_memtotal >> 10, p->name); + p->numclusters, p->_clustsize >> 10, + p->memtotal >> 10, p->name); return 0; @@ -665,148 +668,275 @@ clean: /* call with lock held */ static int -netmap_memory_config_changed(void) +netmap_memory_config_changed(struct netmap_mem_d *nmd) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { - if (nm_mem.pools[i]._objsize != netmap_params[i].size || - nm_mem.pools[i].objtotal != netmap_params[i].num) + if (nmd->pools[i].r_objsize != netmap_params[i].size || + nmd->pools[i].r_objtotal != netmap_params[i].num) return 1; } return 0; } +static void +netmap_mem_reset_all(struct netmap_mem_d *nmd) +{ + int i; + D("resetting %p", nmd); + for (i = 0; i < NETMAP_POOLS_NR; i++) { + netmap_reset_obj_allocator(&nmd->pools[i]); + } + nmd->flags &= ~NETMAP_MEM_FINALIZED; +} + +static int +netmap_mem_finalize_all(struct netmap_mem_d *nmd) +{ + int i; + if (nmd->flags & NETMAP_MEM_FINALIZED) + return 0; + nmd->lasterr = 0; + nmd->nm_totalsize = 0; + for (i = 0; i < NETMAP_POOLS_NR; i++) { + nmd->lasterr = netmap_finalize_obj_allocator(&nmd->pools[i]); + if (nmd->lasterr) + goto error; + nmd->nm_totalsize += nmd->pools[i].memtotal; + } + /* buffers 0 and 1 are reserved */ + nmd->pools[NETMAP_BUF_POOL].objfree -= 2; + nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; + nmd->flags |= NETMAP_MEM_FINALIZED; + + D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + + + return 0; +error: + netmap_mem_reset_all(nmd); + return nmd->lasterr; +} + + + +void +netmap_mem_private_delete(struct netmap_mem_d *nmd) +{ + if (nmd == NULL) + return; + D("deleting %p", nmd); + if (nmd->refcount > 0) + D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); + D("done deleting %p", nmd); + NMA_LOCK_DESTROY(nmd); + free(nmd, M_DEVBUF); +} + +static int +netmap_mem_private_config(struct netmap_mem_d *nmd) +{ + /* nothing to do, we are configured on creation + * and configuration never changes thereafter + */ + return 0; +} + +static int +netmap_mem_private_finalize(struct 
netmap_mem_d *nmd) +{ + int err; + NMA_LOCK(nmd); + nmd->refcount++; + err = netmap_mem_finalize_all(nmd); + NMA_UNLOCK(nmd); + return err; + +} + +static void netmap_mem_private_deref(struct netmap_mem_d *nmd) +{ + NMA_LOCK(nmd); + if (--nmd->refcount <= 0) + netmap_mem_reset_all(nmd); + NMA_UNLOCK(nmd); +} + +struct netmap_mem_d * +netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd) +{ + struct netmap_mem_d *d = NULL; + struct netmap_obj_params p[NETMAP_POOLS_NR]; + int i; + u_int maxd; + + d = malloc(sizeof(struct netmap_mem_d), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (d == NULL) + return NULL; + + *d = nm_blueprint; + + /* XXX the rest of the code assumes the stack rings are alwasy present */ + txr++; + rxr++; + p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) + + sizeof(ssize_t) * (txr + rxr); + p[NETMAP_IF_POOL].num = 2; + maxd = (txd > rxd) ? txd : rxd; + p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) + + sizeof(struct netmap_slot) * maxd; + p[NETMAP_RING_POOL].num = txr + rxr; + p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */ + p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2); + + D("req if %d*%d ring %d*%d buf %d*%d", + p[NETMAP_IF_POOL].num, + p[NETMAP_IF_POOL].size, + p[NETMAP_RING_POOL].num, + p[NETMAP_RING_POOL].size, + p[NETMAP_BUF_POOL].num, + p[NETMAP_BUF_POOL].size); + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, + nm_blueprint.pools[i].name, + name); + if (netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size)) + goto error; + } + + d->flags &= ~NETMAP_MEM_FINALIZED; + + NMA_LOCK_INIT(d); + + return d; +error: + netmap_mem_private_delete(d); + return NULL; +} + /* call with lock held */ static int -netmap_memory_config(void) +netmap_mem_global_config(struct netmap_mem_d *nmd) { int i; - if (!netmap_memory_config_changed()) + if (nmd->refcount) + /* already in use, we cannot change the configuration */ + goto out; + + if (!netmap_memory_config_changed(nmd)) goto out; D("reconfiguring"); - if (nm_mem.finalized) { + if (nmd->flags & NETMAP_MEM_FINALIZED) { /* reset previous allocation */ for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_reset_obj_allocator(&nm_mem.pools[i]); + netmap_reset_obj_allocator(&nmd->pools[i]); } - nm_mem.finalized = 0; + nmd->flags &= ~NETMAP_MEM_FINALIZED; } for (i = 0; i < NETMAP_POOLS_NR; i++) { - nm_mem.lasterr = netmap_config_obj_allocator(&nm_mem.pools[i], + nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], netmap_params[i].num, netmap_params[i].size); - if (nm_mem.lasterr) + if (nmd->lasterr) goto out; } - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nm_mem.pools[NETMAP_IF_POOL]._memtotal >> 10, - nm_mem.pools[NETMAP_RING_POOL]._memtotal >> 10, - nm_mem.pools[NETMAP_BUF_POOL]._memtotal >> 20); - out: - return nm_mem.lasterr; + return nmd->lasterr; } -/* call with lock held */ static int -netmap_memory_finalize(void) +netmap_mem_global_finalize(struct netmap_mem_d *nmd) { - int i; - u_int totalsize = 0; + int err; + + NMA_LOCK(nmd); - nm_mem.refcount++; - if (nm_mem.refcount > 1) { - ND("busy (refcount %d)", nm_mem.refcount); - goto out; - } /* update configuration if changed */ - if (netmap_memory_config()) + if (netmap_mem_global_config(nmd)) goto out; - if (nm_mem.finalized) { + nmd->refcount++; + + if (nmd->flags & NETMAP_MEM_FINALIZED) { /* may happen if config is not changed */ ND("nothing to do"); goto out; } - for (i = 0; i < NETMAP_POOLS_NR; i++) { - 
nm_mem.lasterr = netmap_finalize_obj_allocator(&nm_mem.pools[i]); - if (nm_mem.lasterr) - goto cleanup; - totalsize += nm_mem.pools[i]._memtotal; - } - nm_mem.nm_totalsize = totalsize; + if (netmap_mem_finalize_all(nmd)) + goto out; /* backward compatibility */ - netmap_buf_size = nm_mem.pools[NETMAP_BUF_POOL]._objsize; - netmap_total_buffers = nm_mem.pools[NETMAP_BUF_POOL].objtotal; - - netmap_buffer_lut = nm_mem.pools[NETMAP_BUF_POOL].lut; - netmap_buffer_base = nm_mem.pools[NETMAP_BUF_POOL].lut[0].vaddr; + netmap_buf_size = nmd->pools[NETMAP_BUF_POOL]._objsize; + netmap_total_buffers = nmd->pools[NETMAP_BUF_POOL].objtotal; - nm_mem.finalized = 1; - nm_mem.lasterr = 0; + netmap_buffer_lut = nmd->pools[NETMAP_BUF_POOL].lut; + netmap_buffer_base = nmd->pools[NETMAP_BUF_POOL].lut[0].vaddr; - /* make sysctl values match actual values in the pools */ - for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_params[i].size = nm_mem.pools[i]._objsize; - netmap_params[i].num = nm_mem.pools[i].objtotal; - } + nmd->lasterr = 0; out: - if (nm_mem.lasterr) - nm_mem.refcount--; + if (nmd->lasterr) + nmd->refcount--; + err = nmd->lasterr; - return nm_mem.lasterr; + NMA_UNLOCK(nmd); -cleanup: - for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_reset_obj_allocator(&nm_mem.pools[i]); - } - nm_mem.refcount--; + return err; - return nm_mem.lasterr; } -static int -netmap_memory_init(void) +int +netmap_mem_init(void) { - NMA_LOCK_INIT(); + NMA_LOCK_INIT(&nm_mem); return (0); } -static void -netmap_memory_fini(void) +void +netmap_mem_fini(void) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_destroy_obj_allocator(&nm_mem.pools[i]); } - NMA_LOCK_DESTROY(); + NMA_LOCK_DESTROY(&nm_mem); } static void netmap_free_rings(struct netmap_adapter *na) { - int i; + u_int i; if (!na->tx_rings) return; for (i = 0; i < na->num_tx_rings + 1; i++) { - netmap_ring_free(na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; + if (na->tx_rings[i].ring) { + netmap_ring_free(na->nm_mem, na->tx_rings[i].ring); + na->tx_rings[i].ring = NULL; + } } for (i = 0; i < na->num_rx_rings + 1; i++) { - netmap_ring_free(na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; + if (na->rx_rings[i].ring) { + netmap_ring_free(na->nm_mem, na->rx_rings[i].ring); + na->rx_rings[i].ring = NULL; + } } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = NULL; @@ -818,50 +948,76 @@ netmap_free_rings(struct netmap_adapter *na) /* * Allocate the per-fd structure netmap_if. * If this is the first instance, also allocate the krings, rings etc. + * + * We assume that the configuration stored in na + * (number of tx/rx rings and descs) does not change while + * the interface is in netmap mode. */ -static void * -netmap_if_new(const char *ifname, struct netmap_adapter *na) +extern int nma_is_vp(struct netmap_adapter *na); +struct netmap_if * +netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) { struct netmap_if *nifp; struct netmap_ring *ring; ssize_t base; /* handy for relative offsets between rings and nifp */ u_int i, len, ndesc, ntx, nrx; struct netmap_kring *kring; + uint32_t *tx_leases = NULL, *rx_leases = NULL; - if (netmap_update_config(na)) { - /* configuration mismatch, report and fail */ - return NULL; - } + /* + * verify whether virtual port need the stack ring + */ ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ /* * the descriptor is followed inline by an array of offsets * to the tx and rx rings in the shared memory region. 
+ * For virtual rx rings we also allocate an array of + * pointers to assign to nkr_leases. */ + + NMA_LOCK(na->nm_mem); + len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); - nifp = netmap_if_malloc(len); + nifp = netmap_if_malloc(na->nm_mem, len); if (nifp == NULL) { + NMA_UNLOCK(na->nm_mem); return NULL; } /* initialize base fields -- override const */ - *(int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; - *(int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, IFNAMSIZ); + *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; + *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); - (na->refcount)++; /* XXX atomic ? we are under lock */ - if (na->refcount > 1) { /* already setup, we are done */ + if (na->refcount) { /* already setup, we are done */ goto final; } len = (ntx + nrx) * sizeof(struct netmap_kring); - na->tx_rings = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); + /* + * Leases are attached to TX rings on NIC/host ports, + * and to RX rings on VALE ports. + */ + if (nma_is_vp(na)) { + len += sizeof(uint32_t) * na->num_rx_desc * na->num_rx_rings; + } else { + len += sizeof(uint32_t) * na->num_tx_desc * ntx; + } + + na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); if (na->tx_rings == NULL) { D("Cannot allocate krings for %s", ifname); goto cleanup; } na->rx_rings = na->tx_rings + ntx; + if (nma_is_vp(na)) { + rx_leases = (uint32_t *)(na->rx_rings + nrx); + } else { + tx_leases = (uint32_t *)(na->rx_rings + nrx); + } + /* * First instance, allocate netmap rings and buffers for this card * The rings are contiguous, but have variable size. @@ -872,7 +1028,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) bzero(kring, sizeof(*kring)); len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); - ring = netmap_ring_malloc(len); + ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { D("Cannot allocate tx_ring[%d] for %s", i, ifname); goto cleanup; @@ -880,11 +1036,15 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) ND("txring[%d] at %p ofs %d", i, ring); kring->na = na; kring->ring = ring; - *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; + if (tx_leases) { + kring->nkr_leases = tx_leases; + tx_leases += ndesc; + } + *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + - nm_mem.pools[NETMAP_RING_POOL]._memtotal) - - netmap_ring_offset(ring); + (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - + netmap_ring_offset(na->nm_mem, ring); /* * IMPORTANT: @@ -894,9 +1054,10 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) */ ring->avail = kring->nr_hwavail = ndesc - 1; ring->cur = kring->nr_hwcur = 0; - *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; + *(uint16_t *)(uintptr_t)&ring->nr_buf_size = + NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for txring[%d]", i); - if (netmap_new_bufs(nifp, ring->slot, ndesc)) { + if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); goto cleanup; } @@ -908,7 +1069,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) bzero(kring, sizeof(*kring)); len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); - ring = netmap_ring_malloc(len); + ring = netmap_ring_malloc(na->nm_mem, len); 
if (ring == NULL) { D("Cannot allocate rx_ring[%d] for %s", i, ifname); goto cleanup; @@ -917,17 +1078,22 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) kring->na = na; kring->ring = ring; - *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; + if (rx_leases && i < na->num_rx_rings) { + kring->nkr_leases = rx_leases; + rx_leases += ndesc; + } + *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + - nm_mem.pools[NETMAP_RING_POOL]._memtotal) - - netmap_ring_offset(ring); + (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - + netmap_ring_offset(na->nm_mem, ring); ring->cur = kring->nr_hwcur = 0; ring->avail = kring->nr_hwavail = 0; /* empty */ - *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; + *(int *)(uintptr_t)&ring->nr_buf_size = + NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for rxring[%d]", i); - if (netmap_new_bufs(nifp, ring->slot, ndesc)) { + if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); goto cleanup; } @@ -947,28 +1113,78 @@ final: * between the ring and nifp, so the information is usable in * userspace to reach the ring from the nifp. */ - base = netmap_if_offset(nifp); + base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < ntx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = - netmap_ring_offset(na->tx_rings[i].ring) - base; + netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < nrx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+ntx] = - netmap_ring_offset(na->rx_rings[i].ring) - base; + netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } + + NMA_UNLOCK(na->nm_mem); + return (nifp); cleanup: netmap_free_rings(na); - netmap_if_free(nifp); - (na->refcount)--; + netmap_if_free(na->nm_mem, nifp); + + NMA_UNLOCK(na->nm_mem); + return NULL; } -/* call with NMA_LOCK held */ +void +netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) +{ + if (nifp == NULL) + /* nothing to do */ + return; + NMA_LOCK(na->nm_mem); + + if (na->refcount <= 0) { + /* last instance, release bufs and rings */ + u_int i, j, lim; + struct netmap_ring *ring; + + for (i = 0; i < na->num_tx_rings + 1; i++) { + ring = na->tx_rings[i].ring; + lim = na->tx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + ring = na->rx_rings[i].ring; + lim = na->rx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); + } + netmap_free_rings(na); + } + netmap_if_free(na->nm_mem, nifp); + + NMA_UNLOCK(na->nm_mem); +} + static void -netmap_memory_deref(void) +netmap_mem_global_deref(struct netmap_mem_d *nmd) { - nm_mem.refcount--; + NMA_LOCK(nmd); + + nmd->refcount--; if (netmap_verbose) - D("refcount = %d", nm_mem.refcount); + D("refcount = %d", nmd->refcount); + + NMA_UNLOCK(nmd); +} + +int netmap_mem_finalize(struct netmap_mem_d *nmd) +{ + return nmd->finalize(nmd); +} + +void netmap_mem_deref(struct netmap_mem_d *nmd) +{ + return nmd->deref(nmd); } diff --git a/sys/dev/re/if_re.c b/sys/dev/re/if_re.c index c5a6bc7befe8..ee5af995f623 100644 --- a/sys/dev/re/if_re.c +++ b/sys/dev/re/if_re.c @@ -2134,8 +2134,7 @@ re_rxeof(struct rl_softc *sc, int *rx_npktsp) ifp = sc->rl_ifp; #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, 0 | 
(NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT), - &rx_npkts)) + if (netmap_rx_irq(ifp, 0, &rx_npkts)) return 0; #endif /* DEV_NETMAP */ if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) @@ -2380,7 +2379,7 @@ re_txeof(struct rl_softc *sc) ifp = sc->rl_ifp; #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ /* Invalidate the TX descriptor list */ diff --git a/sys/net/netmap.h b/sys/net/netmap.h index b5ab6d549084..0f2baebe15fc 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -38,6 +38,8 @@ * Detailed info on netmap is available with "man netmap" or at * * http://info.iet.unipi.it/~luigi/netmap/ + * + * This API is also used to communicate with the VALE software switch */ #ifndef _NET_NETMAP_H_ @@ -46,106 +48,95 @@ /* * --- Netmap data structures --- * - * The data structures used by netmap are shown below. Those in - * capital letters are in an mmapp()ed area shared with userspace, - * while others are private to the kernel. - * Shared structures do not contain pointers but only memory - * offsets, so that addressing is portable between kernel and userspace. + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace. + KERNEL (opaque, obviously) - softc -+----------------+ -| standard fields| -| if_pspare[0] ----------+ -+----------------+ | - | -+----------------+<------+ -|(netmap_adapter)| -| | netmap_kring -| tx_rings *--------------------------------->+---------------+ -| | netmap_kring | ring *---------. -| rx_rings *--------->+---------------+ | nr_hwcur | | -+----------------+ | ring *--------. | nr_hwavail | V - | nr_hwcur | | | selinfo | | - | nr_hwavail | | +---------------+ . - | selinfo | | | ... | . - +---------------+ | |(ntx+1 entries)| - | .... | | | | - |(nrx+1 entries)| | +---------------+ - | | | - KERNEL +---------------+ | - | ==================================================================== | - USERSPACE | NETMAP_RING - +---->+-------------+ - / | cur | - NETMAP_IF (nifp, one per file desc.) 
/ | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +=============+ - | ni_rx_rings | / | buf_idx | slot[0] - | | / | len, flags | - | | / +-------------+ - +===============+ / | buf_idx | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | len, flags | - | txring_ofs[1] | +-------------+ - (num_rings+1 entries) (nr_num_slots entries) - | txring_ofs[n] | | buf_idx | slot[n-1] - +---------------+ | len, flags | - | rxring_ofs[0] | +-------------+ + USERSPACE | struct netmap_ring + +---->+--------------+ + / | cur | + struct netmap_if (nifp, 1 per fd) / | avail | + +---------------+ / | buf_ofs | + | ni_tx_rings | / +==============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +--------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +--------------+ + (ni_tx_rings+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +--------------+ | rxring_ofs[1] | - (num_rings+1 entries) - | txring_ofs[n] | + (ni_rx_rings+1 entries) + | rxring_ofs[r] | +---------------+ - * The private descriptor ('softc' or 'adapter') of each interface - * is extended with a "struct netmap_adapter" containing netmap-related - * info (see description in dev/netmap/netmap_kernel.h. - * Among other things, tx_rings and rx_rings point to the arrays of - * "struct netmap_kring" which in turn reache the various - * "struct netmap_ring", shared with userspace. - - * The NETMAP_RING is the userspace-visible replica of the NIC ring. - * Each slot has the index of a buffer, its length and some flags. + * For each "interface" (NIC, host stack, VALE switch port) attached to a + * file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. + * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for VALE ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * * In user space, the buffer address is computed as * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE - * In the kernel, buffers do not necessarily need to be contiguous, - * and the virtual and physical addresses are derived through - * a lookup table. + */ + +/* + * struct netmap_slot is a buffer descriptor * - * struct netmap_slot: + * buf_idx the index of the buffer associated to the slot. + * len the length of the payload + * flags control operation on the slot, as defined below * - * buf_idx is the index of the buffer associated to the slot. - * len is the length of the payload * NS_BUF_CHANGED must be set whenever userspace wants * to change buf_idx (it might be necessary to - * reprogram the NIC slot) + * reprogram the NIC) + * * NS_REPORT must be set if we want the NIC to generate an interrupt * when this slot is used. Leaving it to 0 improves * performance. 
+ * * NS_FORWARD if set on a receive ring, and the device is in * transparent mode, buffers released with the flag set * will be forwarded to the 'other' side (host stack * or NIC, respectively) on the next select() or ioctl() * - * The following will be supported from NETMAP_API = 5 * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for * this packet. - * NS_INDIRECT the netmap buffer contains a 64-bit pointer to - * the actual userspace buffer. This may be useful - * to reduce copies in a VM environment. + * + * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed + * by the ptr field in the slot. + * * NS_MOREFRAG Part of a multi-segment frame. The last (or only) * segment must not have this flag. + * Only supported on VALE ports. + * * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the * destination port for the VALE switch, overriding * the lookup table. */ struct netmap_slot { - uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length, to be copied to/from the hw ring */ - uint16_t flags; /* buf changed, etc. */ -#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */ + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* packet length */ + uint16_t flags; /* buf changed, etc. */ +#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ #define NS_REPORT 0x0002 /* ask the hardware to report results * e.g. by generating an interrupt */ @@ -157,62 +148,61 @@ struct netmap_slot { #define NS_MOREFRAG 0x0020 #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) + /* + * in rx rings, the high 8 bits + * are the number of fragments. + */ +#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) + uint64_t ptr; /* pointer for indirect buffers */ }; /* + * struct netmap_ring + * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. * At the software level, two fields are important: avail and cur. * * In TX rings: - * avail indicates the number of slots available for transmission. - * It is updated by the kernel after every netmap system call. - * It MUST BE decremented by the application when it appends a - * packet. + * + * avail tells how many slots are available for transmission. + * It is updated by the kernel in each netmap system call. + * It MUST BE decremented by the user when it + * adds a new packet to send. + * * cur indicates the slot to use for the next packet * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the application before + * It MUST BE incremented by the user before * netmap system calls to reflect the number of newly * sent packets. * It is checked by the kernel on netmap system calls * (normally unmodified by the kernel unless invalid). * - * The kernel side of netmap uses two additional fields in its own - * private ring structure, netmap_kring: - * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC. - * nr_hwavail is the number of slots known as available by the - * hardware. It is updated on an INTR (inc by the - * number of packets sent) and on a NIOCTXSYNC - * (decrease by nr_cur - nr_hwcur) - * A special case, nr_hwavail is -1 if the transmit - * side is idle (no pending transmits). - * * In RX rings: + * * avail is the number of packets available (possibly 0). - * It MUST BE decremented by the application when it consumes - * a packet, and it is updated to nr_hwavail on a NIOCRXSYNC + * It is updated by the kernel in each netmap system call. 
+ * It MUST BE decremented by the user when it + * consumes a packet. + * * cur indicates the first slot that contains a packet not - * processed yet (the "head" of the queue). - * It MUST BE incremented by the software when it consumes + * yet processed (the "head" of the queue). + * It MUST BE incremented by the user when it consumes * a packet. + * * reserved indicates the number of buffers before 'cur' - * that the application has still in use. Normally 0, - * it MUST BE incremented by the application when it + * that the user has not released yet. Normally 0, + * it MUST BE incremented by the user when it * does not return the buffer immediately, and decremented * when the buffer is finally freed. * - * The kernel side of netmap uses two additional fields in the kring: - * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC - * nr_hwavail is the number of packets available. It is updated - * on INTR (inc by the number of new packets arrived) - * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur). * * DATA OWNERSHIP/LOCKING: - * The netmap_ring is owned by the user program and it is only - * accessed or modified in the upper half of the kernel during - * a system call. - * - * The netmap_kring is only modified by the upper half of the kernel. + * The netmap_ring, all slots, and buffers in the range + * [reserved-cur , cur+avail[ are owned by the user program, + * and the kernel only touches them in the same thread context + * during a system call. + * Other buffers are reserved for use by the NIC's DMA engines. * * FLAGS * NR_TIMESTAMP updates the 'ts' field on each syscall. This is @@ -228,7 +218,7 @@ struct netmap_slot { */ struct netmap_ring { /* - * nr_buf_base_ofs is meant to be used through macros. + * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. */ @@ -253,23 +243,29 @@ struct netmap_ring { /* * Netmap representation of an interface and its queue(s). + * This is initialized by the kernel when binding a file + * descriptor to a port, and should be considered as readonly + * by user programs. The kernel never uses it. + * * There is one netmap_if for each file descriptor on which we want - * to select/poll. We assume that on each interface has the same number - * of receive and transmit queues. + * to select/poll. * select/poll operates on one or all pairs depending on the value of * nmr_queueid passed on the ioctl. */ struct netmap_if { char ni_name[IFNAMSIZ]; /* name of the interface. */ - const u_int ni_version; /* API version, currently unused */ - const u_int ni_rx_rings; /* number of rx rings */ - const u_int ni_tx_rings; /* if zero, same as ni_rx_rings */ + const uint32_t ni_version; /* API version, currently unused */ + const uint32_t ni_flags; /* properties */ +#define NI_PRIV_MEM 0x1 /* private memory region */ + + const uint32_t ni_rx_rings; /* number of rx rings */ + const uint32_t ni_tx_rings; /* number of tx rings */ /* * The following array contains the offset of each netmap ring - * from this structure. The first ni_tx_queues+1 entries refer - * to the tx rings, the next ni_rx_queues+1 refer to the rx rings + * from this structure. The first ni_tx_rings+1 entries refer + * to the tx rings, the next ni_rx_rings+1 refer to the rx rings * (the last entry in each block refers to the host stack rings). - * The area is filled up by the kernel on NIOCREG, + * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. 
*/ const ssize_t ring_ofs[0]; @@ -282,23 +278,47 @@ struct netmap_if { * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). + * The info returned is only advisory and may change before + * the interface is bound to a file descriptor. * * NIOCREGIF takes an interface name within a struct ifreq, * and activates netmap mode on the interface (if possible). * - * For vale ports, starting with NETMAP_API = 5, - * nr_tx_rings and nr_rx_rings specify how many software rings - * are created (0 means 1). + * nr_name is the name of the interface + * + * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings + * indicate the configuration of the port on return. + * + * On input, non-zero values for nr_tx_rings, nr_tx_slots and the + * rx counterparts may be used to reconfigure the port according + * to the requested values, but this is not guaranteed. + * The actual values are returned on completion of the ioctl(). + * + * nr_ringid + * indicates how rings should be bound to the file descriptors. + * The default (0) means all physical rings of a NIC are bound. + * NETMAP_HW_RING plus a ring number lets you bind just + * a single ring pair. + * NETMAP_SW_RING binds only the host tx/rx rings + * NETMAP_NO_TX_POLL prevents select()/poll() from pushing + * out packets on the tx ring unless POLLOUT is specified. + * + * NETMAP_PRIV_MEM is a return value used to indicate that + * this ring is in a private memory region hence buffer + * swapping cannot be used + * + * nr_cmd is used to configure NICs attached to a VALE switch, + * or to dump the configuration of a VALE switch. + * + * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch, with nr_ringid specifying + * which rings to use - * NIOCREGIF is also used to attach a NIC to a VALE switch. - * In this case the name is vale*:ifname, and "nr_cmd" - * is set to 'NETMAP_BDG_ATTACH' or 'NETMAP_BDG_DETACH'. - * nr_ringid specifies which rings should be attached, 0 means all, - * NETMAP_HW_RING + n means only the n-th ring. - * The process can terminate after the interface has been attached. + * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC - * NIOCUNREGIF unregisters the interface associated to the fd. - * this is deprecated and will go away. + * nr_cmd = NETMAP_BDG_LIST is used to list the configuration + * of VALE switches, with additional arguments. 
* * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, * whose identity is set in NIOCREGIF through nr_ringid @@ -312,7 +332,7 @@ struct netmap_if { struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 4 /* current version */ +#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ @@ -320,6 +340,7 @@ struct nmreq { uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ #define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ #define NETMAP_SW_RING 0x2000 /* process the sw ring */ #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ @@ -343,7 +364,7 @@ struct nmreq { */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* interface unregister */ +#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c index 615360355f88..195b68776c3b 100644 --- a/tools/tools/netmap/nm_util.c +++ b/tools/tools/netmap/nm_util.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Luigi Rizzo. All rights reserved. + * Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -130,20 +130,14 @@ netmap_open(struct my_ring *me, int ringid, int promisc) req.nr_version = NETMAP_API; strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); req.nr_ringid = ringid; - err = ioctl(fd, NIOCGINFO, &req); + err = ioctl(fd, NIOCREGIF, &req); if (err) { - D("cannot get info on %s, errno %d ver %d", - me->ifname, errno, req.nr_version); + D("Unable to register %s", me->ifname); goto error; } me->memsize = l = req.nr_memsize; if (verbose) D("memsize is %d MB", l>>20); - err = ioctl(fd, NIOCREGIF, &req); - if (err) { - D("Unable to register %s", me->ifname); - goto error; - } if (me->mem == NULL) { me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 901175e3a74c..7203eba0e165 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,7 +25,7 @@ /* * $FreeBSD$ - * $Id$ + * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ * * Example program to show how to build a multithreaded packet * source/sink using the netmap device. 
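For reference, the registration sequence just described (NIOCREGIF filling in nr_memsize/nr_offset, followed by mmap() and the cur/avail transmit convention) can be sketched as below. This is only an illustrative sketch, not part of the patch: "em0" is a placeholder interface name, and the NETMAP_IF/NETMAP_TXRING/NETMAP_BUF/NETMAP_RING_NEXT helpers are assumed to come from net/netmap_user.h.

#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

/* Sketch: bind a placeholder interface ("em0") to a netmap fd
 * and push out a single dummy frame on hardware tx ring 0. */
int
main(void)
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	char *mem, *buf;
	uint32_t cur;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_ringid = 0;			/* 0 = bind all hardware rings */
	if (ioctl(fd, NIOCREGIF, &req) == -1)	/* fills nr_memsize, nr_offset */
		return 1;

	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED)
		return 1;
	nifp = NETMAP_IF(mem, req.nr_offset);	/* struct netmap_if in shared region */
	ring = NETMAP_TXRING(nifp, 0);		/* first hardware tx ring */

	if (ring->avail > 0) {			/* cur/avail convention of this API */
		cur = ring->cur;
		buf = NETMAP_BUF(ring, ring->slot[cur].buf_idx);
		memset(buf, 0xff, 60);		/* dummy 60-byte frame */
		ring->slot[cur].len = 60;
		ring->cur = NETMAP_RING_NEXT(ring, cur);
		ring->avail--;			/* user consumed one tx slot */
		ioctl(fd, NIOCTXSYNC, NULL);	/* tell the kernel to transmit */
	}
	close(fd);
	return 0;
}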
@@ -40,7 +40,10 @@ #include <ctype.h> // isprint() -const char *default_payload="netmap pkt-gen payload\n" +const char *default_payload="netmap pkt-gen DIRECT payload\n" + "http://info.iet.unipi.it/~luigi/netmap/ "; + +const char *indirect_payload="netmap pkt-gen indirect payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; int time_second; // support for RD() debugging macro @@ -58,8 +61,8 @@ struct pkt { struct ip_range { char *name; - struct in_addr start, end, cur; - uint16_t port0, port1, cur_p; + uint32_t start, end; /* same as struct in_addr */ + uint16_t port0, port1; }; struct mac_range { @@ -80,6 +83,7 @@ struct glob_arg { int burst; int forever; int npackets; /* total packets to send */ + int frags; /* fragments per packet */ int nthreads; int cpus; int options; /* testing */ @@ -103,6 +107,8 @@ struct glob_arg { void *mmap_addr; int mmap_size; char *ifname; + char *nmr_config; + int dummy_send; }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -137,45 +143,58 @@ struct targ { static void extract_ip_range(struct ip_range *r) { - char *p_lo, *p_hi; - char buf1[16]; // one ip address + char *ap, *pp; + struct in_addr a; D("extract IP range from %s", r->name); - p_lo = index(r->name, ':'); /* do we have ports ? */ - if (p_lo) { - D(" found ports at %s", p_lo); - *p_lo++ = '\0'; - p_hi = index(p_lo, '-'); - if (p_hi) - *p_hi++ = '\0'; - else - p_hi = p_lo; - r->port0 = strtol(p_lo, NULL, 0); - r->port1 = strtol(p_hi, NULL, 0); - if (r->port1 < r->port0) { - r->cur_p = r->port0; - r->port0 = r->port1; - r->port1 = r->cur_p; + r->port0 = r->port1 = 0; + r->start = r->end = 0; + + /* the first - splits start/end of range */ + ap = index(r->name, '-'); /* do we have a range ? */ + if (ap) { + *ap++ = '\0'; + } + /* grab the initial values (mandatory) */ + pp = index(r->name, ':'); + if (pp) { + *pp++ = '\0'; + r->port0 = r->port1 = strtol(pp, NULL, 0); + }; + inet_aton(r->name, &a); + r->start = r->end = ntohl(a.s_addr); + if (ap) { + pp = index(ap, ':'); + if (pp) { + *pp++ = '\0'; + if (*pp) + r->port1 = strtol(pp, NULL, 0); } - r->cur_p = r->port0; - D("ports are %d to %d", r->port0, r->port1); + if (*ap) { + inet_aton(ap, &a); + r->end = ntohl(a.s_addr); + } + } + if (r->port0 > r->port1) { + uint16_t tmp = r->port0; + r->port0 = r->port1; + r->port1 = tmp; } - p_hi = index(r->name, '-'); /* do we have upper ip ? */ - if (p_hi) { - *p_hi++ = '\0'; - } else - p_hi = r->name; - inet_aton(r->name, &r->start); - inet_aton(p_hi, &r->end); - if (r->start.s_addr > r->end.s_addr) { - r->cur = r->start; + if (r->start > r->end) { + uint32_t tmp = r->start; r->start = r->end; - r->end = r->cur; + r->end = tmp; + } + { + struct in_addr a; + char buf1[16]; // one ip address + + a.s_addr = htonl(r->end); + strncpy(buf1, inet_ntoa(a), sizeof(buf1)); + a.s_addr = htonl(r->start); + D("range is %s:%d to %s:%d", + inet_ntoa(a), r->port0, buf1, r->port1); } - r->cur = r->start; - strncpy(buf1, inet_ntoa(r->end), sizeof(buf1)); - D("range is %s %d to %s %d", inet_ntoa(r->start), r->port0, - buf1, r->port1); } static void @@ -256,6 +275,53 @@ system_ncpus(void) /* + * parse the vale configuration in conf and put it in nmr. + * The configuration may consist of 0 to 4 numbers separated + * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. + * Missing numbers or zeroes stand for default values. + * As an additional convenience, if exactly one number + * is specified, then this is assigned to both #tx-slots and #rx-slots. 
+ * If there is no 4th number, then the 3rd is assigned to both #tx-rings + * and #rx-rings. + */ +void parse_nmr_config(const char* conf, struct nmreq *nmr) +{ + char *w, *tok; + int i, v; + + nmr->nr_tx_rings = nmr->nr_rx_rings = 0; + nmr->nr_tx_slots = nmr->nr_rx_slots = 0; + if (conf == NULL || ! *conf) + return; + w = strdup(conf); + for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { + v = atoi(tok); + switch (i) { + case 0: + nmr->nr_tx_slots = nmr->nr_rx_slots = v; + break; + case 1: + nmr->nr_rx_slots = v; + break; + case 2: + nmr->nr_tx_rings = nmr->nr_rx_rings = v; + break; + case 3: + nmr->nr_rx_rings = v; + break; + default: + D("ignored config: %s", tok); + break; + } + } + D("txr %d txd %d rxr %d rxd %d", + nmr->nr_tx_rings, nmr->nr_tx_slots, + nmr->nr_rx_rings, nmr->nr_rx_slots); + free(w); +} + + +/* * locate the src mac address for our interface, put it * into the user-supplied buffer. return 0 if ok, -1 on error. */ @@ -361,7 +427,9 @@ dump_payload(char *p, int len, struct netmap_ring *ring, int cur) /* get the length in ASCII of the length of the packet. */ - printf("ring %p cur %5d len %5d buf %p\n", ring, cur, len, p); + printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", + ring, cur, ring->slot[cur].buf_idx, + ring->slot[cur].flags, len); /* hexdump routine */ for (i = 0; i < len; ) { memset(buf, sizeof(buf), ' '); @@ -389,6 +457,54 @@ dump_payload(char *p, int len, struct netmap_ring *ring, int cur) #define uh_sum check #endif /* linux */ +/* + * increment the addresses in the packet, + * starting from the least significant field. + * DST_IP DST_PORT SRC_IP SRC_PORT + */ +static void +update_addresses(struct pkt *pkt, struct glob_arg *g) +{ + uint32_t a; + uint16_t p; + struct ip *ip = &pkt->ip; + struct udphdr *udp = &pkt->udp; + + p = ntohs(udp->uh_sport); + if (p < g->src_ip.port1) { /* just inc, no wrap */ + udp->uh_sport = htons(p + 1); + return; + } + udp->uh_sport = htons(g->src_ip.port0); + + a = ntohl(ip->ip_src.s_addr); + if (a < g->src_ip.end) { /* just inc, no wrap */ + ip->ip_src.s_addr = htonl(a + 1); + return; + } + ip->ip_src.s_addr = htonl(g->src_ip.start); + + udp->uh_sport = htons(g->src_ip.port0); + p = ntohs(udp->uh_dport); + if (p < g->dst_ip.port1) { /* just inc, no wrap */ + udp->uh_dport = htons(p + 1); + return; + } + udp->uh_dport = htons(g->dst_ip.port0); + + a = ntohl(ip->ip_dst.s_addr); + if (a < g->dst_ip.end) { /* just inc, no wrap */ + ip->ip_dst.s_addr = htonl(a + 1); + return; + } + ip->ip_dst.s_addr = htonl(g->dst_ip.start); + +} + +/* + * initialize one packet and prepare for the next one. + * The copy could be done better instead of repeating it each time. + */ static void initialize_packet(struct targ *targ) { @@ -398,9 +514,10 @@ initialize_packet(struct targ *targ) struct udphdr *udp; uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip); const char *payload = targ->g->options & OPT_INDIRECT ? 
- "XXXXXXXXXXXXXXXXXXXXXX" : default_payload; + indirect_payload : default_payload; int i, l, l0 = strlen(payload); + /* create a nice NUL-terminated string */ for (i = 0; i < paylen;) { l = min(l0, paylen - i); bcopy(payload, pkt->body + i, l); @@ -409,6 +526,7 @@ initialize_packet(struct targ *targ) pkt->body[i-1] = '\0'; ip = &pkt->ip; + /* prepare the headers */ ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_id = 0; @@ -418,22 +536,14 @@ initialize_packet(struct targ *targ) ip->ip_off = htons(IP_DF); /* Don't fragment */ ip->ip_ttl = IPDEFTTL; ip->ip_p = IPPROTO_UDP; - ip->ip_dst.s_addr = targ->g->dst_ip.cur.s_addr; - if (++targ->g->dst_ip.cur.s_addr > targ->g->dst_ip.end.s_addr) - targ->g->dst_ip.cur.s_addr = targ->g->dst_ip.start.s_addr; - ip->ip_src.s_addr = targ->g->src_ip.cur.s_addr; - if (++targ->g->src_ip.cur.s_addr > targ->g->src_ip.end.s_addr) - targ->g->src_ip.cur.s_addr = targ->g->src_ip.start.s_addr; + ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start); + ip->ip_src.s_addr = htonl(targ->g->src_ip.start); ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0)); udp = &pkt->udp; - udp->uh_sport = htons(targ->g->src_ip.cur_p); - if (++targ->g->src_ip.cur_p > targ->g->src_ip.port1) - targ->g->src_ip.cur_p = targ->g->src_ip.port0; - udp->uh_dport = htons(targ->g->dst_ip.cur_p); - if (++targ->g->dst_ip.cur_p > targ->g->dst_ip.port1) - targ->g->dst_ip.cur_p = targ->g->dst_ip.port0; + udp->uh_sport = htons(targ->g->src_ip.port0); + udp->uh_dport = htons(targ->g->dst_ip.port0); udp->uh_ulen = htons(paylen); /* Magic: taken from sbin/dhclient/packet.c */ udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp), @@ -461,13 +571,18 @@ initialize_packet(struct targ *targ) */ static int send_packets(struct netmap_ring *ring, struct pkt *pkt, - int size, u_int count, int options) + struct glob_arg *g, u_int count, int options, u_int nfrags) { u_int sent, cur = ring->cur; + int fcnt; + int size = g->pkt_size; if (ring->avail < count) count = ring->avail; - + if (count < nfrags) { + D("truncating packet, no room for frags %d %d", + count, nfrags); + } #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { for (sent = 0; sent < count; sent++) { @@ -480,25 +595,36 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, cur = ring->cur; } #endif - for (sent = 0; sent < count; sent++) { + for (fcnt = nfrags, sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); slot->flags = 0; - if (options & OPT_DUMP) - dump_payload(p, size, ring, cur); if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - *((struct pkt **)(void *)p) = pkt; - } else if (options & OPT_COPY) + slot->ptr = (uint64_t)pkt; + } else if (options & OPT_COPY) { pkt_copy(pkt, p, size); - else if (options & OPT_MEMCPY) + if (fcnt == 1) + update_addresses(pkt, g); + } else if (options & OPT_MEMCPY) { memcpy(p, pkt, size); - else if (options & OPT_PREFETCH) + if (fcnt == 1) + update_addresses(pkt, g); + } else if (options & OPT_PREFETCH) { prefetch(p); + } + if (options & OPT_DUMP) + dump_payload(p, size, ring, cur); slot->len = size; - if (sent == count - 1) + if (--fcnt > 0) + slot->flags |= NS_MOREFRAG; + else + fcnt = nfrags; + if (sent == count - 1) { + slot->flags &= ~NS_MOREFRAG; slot->flags |= NS_REPORT; + } cur = NETMAP_RING_NEXT(ring, cur); } ring->avail -= sent; @@ -801,6 +927,7 @@ sender_body(void *data) for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (pcap_inject(p, pkt, size) != -1) sent++; + update_addresses(pkt, targ->g); if (i > 10000) { targ->count = 
sent; i = 0; @@ -814,6 +941,7 @@ sender_body(void *data) for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (write(targ->g->main_fd, pkt, size) != -1) sent++; + update_addresses(pkt, targ->g); if (i > 10000) { targ->count = sent; i = 0; @@ -821,6 +949,8 @@ sender_body(void *data) } } else { int tosend = 0; + int frags = targ->g->frags; + while (!targ->cancel && (n == 0 || sent < n)) { if (rate_limit && tosend <= 0) { @@ -855,11 +985,20 @@ sender_body(void *data) txring = NETMAP_TXRING(nifp, i); if (txring->avail == 0) continue; - m = send_packets(txring, &targ->pkt, targ->g->pkt_size, - limit, options); + if (frags > 1) + limit = ((limit + frags - 1) / frags) * frags; + + m = send_packets(txring, &targ->pkt, targ->g, + limit, options, frags); + ND("limit %d avail %d frags %d m %d", + limit, txring->avail, frags, m); sent += m; - tosend -= m; targ->count = sent; + if (rate_limit) { + tosend -= m; + if (tosend <= 0) + break; + } } } /* flush any remaining packets */ @@ -909,7 +1048,6 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump) struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); - slot->flags = OPT_INDIRECT; // XXX if (dump) dump_payload(p, slot->len, ring, cur); @@ -1063,18 +1201,20 @@ usage(void) "\t-n count number of iterations (can be 0)\n" "\t-t pkts_to_send also forces tx mode\n" "\t-r pkts_to_receive also forces rx mode\n" - "\t-l pkts_size in bytes excluding CRC\n" - "\t-d dst-ip end with %%n to sweep n addresses\n" - "\t-s src-ip end with %%n to sweep n addresses\n" - "\t-D dst-mac end with %%n to sweep n addresses\n" - "\t-S src-mac end with %%n to sweep n addresses\n" + "\t-l pkt_size in bytes excluding CRC\n" + "\t-d dst_ip[:port[-dst_ip:port]] single or range\n" + "\t-s src_ip[:port[-src_ip:port]] single or range\n" + "\t-D dst-mac\n" + "\t-S src-mac\n" "\t-a cpu_id use setaffinity\n" "\t-b burst size testing, mostly\n" "\t-c cores cores to use\n" "\t-p threads processes/threads to use\n" "\t-T report_ms milliseconds between reports\n" - "\t-P use libpcap instead of netmap\n" + "\t-P use libpcap instead of netmap\n" "\t-w wait_for_link_time in seconds\n" + "\t-R rate in packets per second\n" + "\t-X dump payload\n" "", cmd); @@ -1112,6 +1252,7 @@ start_threads(struct glob_arg *g) strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name)); tifreq.nr_version = NETMAP_API; tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0; + parse_nmr_config(g->nmr_config, &tifreq); /* * if we are acting as a receiver only, do not touch the transmit ring. @@ -1126,8 +1267,10 @@ start_threads(struct glob_arg *g) D("Unable to register %s", g->ifname); continue; } + D("memsize is %d MB", tifreq.nr_memsize >> 20); targs[i].nmr = tifreq; targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset); + D("nifp flags 0x%x", targs[i].nifp->ni_flags); /* start threads. */ targs[i].qfirst = (g->nthreads > 1) ? i : 0; targs[i].qlast = (g->nthreads > 1) ? 
i+1 : @@ -1343,9 +1486,11 @@ main(int arc, char **argv) g.cpus = 1; g.forever = 1; g.tx_rate = 0; + g.frags = 1; + g.nmr_config = ""; while ( (ch = getopt(arc, argv, - "a:f:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:X")) != -1) { + "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) { struct sf *fn; switch(ch) { @@ -1358,6 +1503,15 @@ main(int arc, char **argv) g.npackets = atoi(optarg); break; + case 'F': + i = atoi(optarg); + if (i < 1 || i > 63) { + D("invalid frags %d [1..63], ignore", i); + break; + } + g.frags = i; + break; + case 'f': for (fn = func; fn->key; fn++) { if (!strcmp(fn->key, optarg)) @@ -1383,6 +1537,8 @@ main(int arc, char **argv) g.dev_type = DEV_TAP; else g.dev_type = DEV_NETMAP; + if (!strcmp(g.ifname, "null")) + g.dummy_send = 1; break; case 'I': @@ -1454,6 +1610,9 @@ main(int arc, char **argv) break; case 'X': g.options |= OPT_DUMP; + break; + case 'C': + g.nmr_config = strdup(optarg); } } @@ -1507,6 +1666,8 @@ main(int arc, char **argv) D("cannot open pcap on %s", g.ifname); usage(); } + } else if (g.dummy_send) { + D("using a dummy send routine"); } else { bzero(&nmr, sizeof(nmr)); nmr.nr_version = NETMAP_API; @@ -1523,20 +1684,36 @@ main(int arc, char **argv) if (g.main_fd == -1) { D("Unable to open /dev/netmap"); // fail later - } else { - if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - D("Unable to get if info without name"); - } else { - D("map size is %d Kb", nmr.nr_memsize >> 10); - } - bzero(&nmr, sizeof(nmr)); - nmr.nr_version = NETMAP_API; - strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name)); - if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - D("Unable to get if info for %s", g.ifname); - } - devqueues = nmr.nr_rx_rings; } + /* + * Register the interface on the netmap device: from now on, + * we can operate on the network interface without any + * interference from the legacy network stack. + * + * We decide to put the first interface registration here to + * give time to cards that take a long time to reset the PHY. + */ + bzero(&nmr, sizeof(nmr)); + nmr.nr_version = NETMAP_API; + strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name)); + nmr.nr_version = NETMAP_API; + parse_nmr_config(g.nmr_config, &nmr); + if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) { + D("Unable to register interface %s", g.ifname); + //continue, fail later + } + ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname, + nmr.nr_tx_rings, nmr.nr_tx_slots, + nmr.nr_rx_rings, nmr.nr_rx_slots); + //if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { + // D("Unable to get if info without name"); + //} else { + // D("map size is %d Kb", nmr.nr_memsize >> 10); + //} + if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { + D("Unable to get if info for %s", g.ifname); + } + devqueues = nmr.nr_rx_rings; /* validate provided nthreads. */ if (g.nthreads < 1 || g.nthreads > devqueues) { @@ -1559,19 +1736,6 @@ main(int arc, char **argv) // continue, fail later } - /* - * Register the interface on the netmap device: from now on, - * we can operate on the network interface without any - * interference from the legacy network stack. - * - * We decide to put the first interface registration here to - * give time to cards that take a long time to reset the PHY. - */ - nmr.nr_version = NETMAP_API; - if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) { - D("Unable to register interface %s", g.ifname); - //continue, fail later - } /* Print some debug information. */ @@ -1595,6 +1759,7 @@ main(int arc, char **argv) } } + if (g.options) { D("--- SPECIAL OPTIONS:%s%s%s%s%s\n", g.options & OPT_PREFETCH ? 
" prefetch" : "", @@ -1603,23 +1768,24 @@ main(int arc, char **argv) g.options & OPT_INDIRECT ? " indirect" : "", g.options & OPT_COPY ? " copy" : ""); } - - if (g.tx_rate == 0) { - g.tx_period.tv_sec = 0; - g.tx_period.tv_nsec = 0; - } else if (g.tx_rate == 1) { - g.tx_period.tv_sec = 1; - g.tx_period.tv_nsec = 0; - } else { - g.tx_period.tv_sec = 0; + + g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; + if (g.tx_rate > 0) { + /* try to have at least something every second, + * reducing the burst size to 0.5s worth of data + * (but no less than one full set of fragments) + */ + if (g.burst > g.tx_rate/2) + g.burst = g.tx_rate/2; + if (g.burst < g.frags) + g.burst = g.frags; g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst; - if (g.tx_period.tv_nsec > 1000000000) { - g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; - g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; - } + g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; + g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } - D("Sending %d packets every %d.%09d ns", - g.burst, (int)g.tx_period.tv_sec, (int)g.tx_period.tv_nsec); + if (g.td_body == sender_body) + D("Sending %d packets every %ld.%09ld s", + g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); /* Wait for PHY reset. */ D("Wait %d secs for phy reset", wait_link); sleep(wait_link); |