diff options
| author | Luigi Rizzo <luigi@FreeBSD.org> | 2013-11-01 21:21:14 +0000 |
|---|---|---|
| committer | Luigi Rizzo <luigi@FreeBSD.org> | 2013-11-01 21:21:14 +0000 |
| commit | ce3ee1e7c4cac5b86bbc15daac68f2129aa42187 (patch) | |
| tree | 62d07ffe9208f3098d5f67c47dd66e29212478b5 /sys/net/netmap.h | |
| parent | a09968c47940d3b0e9e82ce7c06faec3f42cea94 (diff) | |
Notes
Diffstat (limited to 'sys/net/netmap.h')
| -rw-r--r-- | sys/net/netmap.h | 265 |
1 files changed, 143 insertions, 122 deletions
diff --git a/sys/net/netmap.h b/sys/net/netmap.h index b5ab6d549084..0f2baebe15fc 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -38,6 +38,8 @@ * Detailed info on netmap is available with "man netmap" or at * * http://info.iet.unipi.it/~luigi/netmap/ + * + * This API is also used to communicate with the VALE software switch */ #ifndef _NET_NETMAP_H_ @@ -46,106 +48,95 @@ /* * --- Netmap data structures --- * - * The data structures used by netmap are shown below. Those in - * capital letters are in an mmapp()ed area shared with userspace, - * while others are private to the kernel. - * Shared structures do not contain pointers but only memory - * offsets, so that addressing is portable between kernel and userspace. + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace. + KERNEL (opaque, obviously) - softc -+----------------+ -| standard fields| -| if_pspare[0] ----------+ -+----------------+ | - | -+----------------+<------+ -|(netmap_adapter)| -| | netmap_kring -| tx_rings *--------------------------------->+---------------+ -| | netmap_kring | ring *---------. -| rx_rings *--------->+---------------+ | nr_hwcur | | -+----------------+ | ring *--------. | nr_hwavail | V - | nr_hwcur | | | selinfo | | - | nr_hwavail | | +---------------+ . - | selinfo | | | ... | . - +---------------+ | |(ntx+1 entries)| - | .... | | | | - |(nrx+1 entries)| | +---------------+ - | | | - KERNEL +---------------+ | - | ==================================================================== | - USERSPACE | NETMAP_RING - +---->+-------------+ - / | cur | - NETMAP_IF (nifp, one per file desc.) 
/ | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +=============+ - | ni_rx_rings | / | buf_idx | slot[0] - | | / | len, flags | - | | / +-------------+ - +===============+ / | buf_idx | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | len, flags | - | txring_ofs[1] | +-------------+ - (num_rings+1 entries) (nr_num_slots entries) - | txring_ofs[n] | | buf_idx | slot[n-1] - +---------------+ | len, flags | - | rxring_ofs[0] | +-------------+ + USERSPACE | struct netmap_ring + +---->+--------------+ + / | cur | + struct netmap_if (nifp, 1 per fd) / | avail | + +---------------+ / | buf_ofs | + | ni_tx_rings | / +==============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +--------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +--------------+ + (ni_tx_rings+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +--------------+ | rxring_ofs[1] | - (num_rings+1 entries) - | txring_ofs[n] | + (ni_rx_rings+1 entries) + | rxring_ofs[r] | +---------------+ - * The private descriptor ('softc' or 'adapter') of each interface - * is extended with a "struct netmap_adapter" containing netmap-related - * info (see description in dev/netmap/netmap_kernel.h. - * Among other things, tx_rings and rx_rings point to the arrays of - * "struct netmap_kring" which in turn reache the various - * "struct netmap_ring", shared with userspace. - - * The NETMAP_RING is the userspace-visible replica of the NIC ring. - * Each slot has the index of a buffer, its length and some flags. + * For each "interface" (NIC, host stack, VALE switch port) attached to a + * file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. 
+ * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for VALE ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * * In user space, the buffer address is computed as * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE - * In the kernel, buffers do not necessarily need to be contiguous, - * and the virtual and physical addresses are derived through - * a lookup table. + */ + +/* + * struct netmap_slot is a buffer descriptor * - * struct netmap_slot: + * buf_idx the index of the buffer associated to the slot. + * len the length of the payload + * flags control operation on the slot, as defined below * - * buf_idx is the index of the buffer associated to the slot. - * len is the length of the payload * NS_BUF_CHANGED must be set whenever userspace wants * to change buf_idx (it might be necessary to - * reprogram the NIC slot) + * reprogram the NIC) + * * NS_REPORT must be set if we want the NIC to generate an interrupt * when this slot is used. Leaving it to 0 improves * performance. + * * NS_FORWARD if set on a receive ring, and the device is in * transparent mode, buffers released with the flag set * will be forwarded to the 'other' side (host stack * or NIC, respectively) on the next select() or ioctl() * - * The following will be supported from NETMAP_API = 5 * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for * this packet. - * NS_INDIRECT the netmap buffer contains a 64-bit pointer to - * the actual userspace buffer. 
This may be useful - * to reduce copies in a VM environment. + * + * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed + * by the ptr field in the slot. + * * NS_MOREFRAG Part of a multi-segment frame. The last (or only) * segment must not have this flag. + * Only supported on VALE ports. + * * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the * destination port for the VALE switch, overriding * the lookup table. */ struct netmap_slot { - uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length, to be copied to/from the hw ring */ - uint16_t flags; /* buf changed, etc. */ -#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */ + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* packet length */ + uint16_t flags; /* buf changed, etc. */ +#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ #define NS_REPORT 0x0002 /* ask the hardware to report results * e.g. by generating an interrupt */ @@ -157,62 +148,61 @@ struct netmap_slot { #define NS_MOREFRAG 0x0020 #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) + /* + * in rx rings, the high 8 bits + * are the number of fragments. + */ +#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) + uint64_t ptr; /* pointer for indirect buffers */ }; /* + * struct netmap_ring + * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. * At the software level, two fields are important: avail and cur. * * In TX rings: - * avail indicates the number of slots available for transmission. - * It is updated by the kernel after every netmap system call. - * It MUST BE decremented by the application when it appends a - * packet. + * + * avail tells how many slots are available for transmission. + * It is updated by the kernel in each netmap system call. + * It MUST BE decremented by the user when it + * adds a new packet to send. 
+ * * cur indicates the slot to use for the next packet * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the application before + * It MUST BE incremented by the user before * netmap system calls to reflect the number of newly * sent packets. * It is checked by the kernel on netmap system calls * (normally unmodified by the kernel unless invalid). * - * The kernel side of netmap uses two additional fields in its own - * private ring structure, netmap_kring: - * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC. - * nr_hwavail is the number of slots known as available by the - * hardware. It is updated on an INTR (inc by the - * number of packets sent) and on a NIOCTXSYNC - * (decrease by nr_cur - nr_hwcur) - * A special case, nr_hwavail is -1 if the transmit - * side is idle (no pending transmits). - * * In RX rings: + * * avail is the number of packets available (possibly 0). - * It MUST BE decremented by the application when it consumes - * a packet, and it is updated to nr_hwavail on a NIOCRXSYNC + * It is updated by the kernel in each netmap system call. + * It MUST BE decremented by the user when it + * consumes a packet. + * * cur indicates the first slot that contains a packet not - * processed yet (the "head" of the queue). - * It MUST BE incremented by the software when it consumes + * yet processed (the "head" of the queue). + * It MUST BE incremented by the user when it consumes * a packet. + * * reserved indicates the number of buffers before 'cur' - * that the application has still in use. Normally 0, - * it MUST BE incremented by the application when it + * that the user has not released yet. Normally 0, + * it MUST BE incremented by the user when it * does not return the buffer immediately, and decremented * when the buffer is finally freed. * - * The kernel side of netmap uses two additional fields in the kring: - * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC - * nr_hwavail is the number of packets available. 
It is updated - * on INTR (inc by the number of new packets arrived) - * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur). * * DATA OWNERSHIP/LOCKING: - * The netmap_ring is owned by the user program and it is only - * accessed or modified in the upper half of the kernel during - * a system call. - * - * The netmap_kring is only modified by the upper half of the kernel. + * The netmap_ring, all slots, and buffers in the range + * [reserved-cur , cur+avail[ are owned by the user program, + * and the kernel only touches them in the same thread context + * during a system call. + * Other buffers are reserved for use by the NIC's DMA engines. * * FLAGS * NR_TIMESTAMP updates the 'ts' field on each syscall. This is @@ -228,7 +218,7 @@ struct netmap_slot { */ struct netmap_ring { /* - * nr_buf_base_ofs is meant to be used through macros. + * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. */ @@ -253,23 +243,29 @@ struct netmap_ring { /* * Netmap representation of an interface and its queue(s). + * This is initialized by the kernel when binding a file + * descriptor to a port, and should be considered as readonly + * by user programs. The kernel never uses it. + * * There is one netmap_if for each file descriptor on which we want - * to select/poll. We assume that on each interface has the same number - * of receive and transmit queues. + * to select/poll. * select/poll operates on one or all pairs depending on the value of * nmr_queueid passed on the ioctl. */ struct netmap_if { char ni_name[IFNAMSIZ]; /* name of the interface. 
*/ - const u_int ni_version; /* API version, currently unused */ - const u_int ni_rx_rings; /* number of rx rings */ - const u_int ni_tx_rings; /* if zero, same as ni_rx_rings */ + const uint32_t ni_version; /* API version, currently unused */ + const uint32_t ni_flags; /* properties */ +#define NI_PRIV_MEM 0x1 /* private memory region */ + + const uint32_t ni_rx_rings; /* number of rx rings */ + const uint32_t ni_tx_rings; /* number of tx rings */ /* * The following array contains the offset of each netmap ring - * from this structure. The first ni_tx_queues+1 entries refer - * to the tx rings, the next ni_rx_queues+1 refer to the rx rings + * from this structure. The first ni_tx_rings+1 entries refer + * to the tx rings, the next ni_rx_rings+1 refer to the rx rings * (the last entry in each block refers to the host stack rings). - * The area is filled up by the kernel on NIOCREG, + * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; @@ -282,23 +278,47 @@ struct netmap_if { * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). + * The info returned is only advisory and may change before + * the interface is bound to a file descriptor. * * NIOCREGIF takes an interface name within a struct ifreq, * and activates netmap mode on the interface (if possible). * - * For vale ports, starting with NETMAP_API = 5, - * nr_tx_rings and nr_rx_rings specify how many software rings - * are created (0 means 1). + * nr_name is the name of the interface + * + * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings + * indicate the configuration of the port on return. + * + * On input, non-zero values for nr_tx_rings, nr_tx_slots and the + * rx counterparts may be used to reconfigure the port according + * to the requested values, but this is not guaranteed. 
+ * The actual values are returned on completion of the ioctl(). + * + * nr_ringid + * indicates how rings should be bound to the file descriptors. + * The default (0) means all physical rings of a NIC are bound. + * NETMAP_HW_RING plus a ring number lets you bind just + * a single ring pair. + * NETMAP_SW_RING binds only the host tx/rx rings + * NETMAP_NO_TX_POLL prevents select()/poll() from pushing + * out packets on the tx ring unless POLLOUT is specified. + * + * NETMAP_PRIV_MEM is a return value used to indicate that + * this ring is in a private memory region hence buffer + * swapping cannot be used + * + * nr_cmd is used to configure NICs attached to a VALE switch, + * or to dump the configuration of a VALE switch. + * + * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch, with nr_ringid specifying + * which rings to use * - * NIOCREGIF is also used to attach a NIC to a VALE switch. - * In this case the name is vale*:ifname, and "nr_cmd" - * is set to 'NETMAP_BDG_ATTACH' or 'NETMAP_BDG_DETACH'. - * nr_ringid specifies which rings should be attached, 0 means all, - * NETMAP_HW_RING + n means only the n-th ring. - * The process can terminate after the interface has been attached. + * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC * - * NIOCUNREGIF unregisters the interface associated to the fd. - * this is deprecated and will go away. + * nr_cmd = NETMAP_BDG_LIST is used to list the configuration + * of VALE switches, with additional arguments. 
* * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, * whose identity is set in NIOCREGIF through nr_ringid @@ -312,7 +332,7 @@ struct netmap_if { struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 4 /* current version */ +#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ @@ -320,6 +340,7 @@ struct nmreq { uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ #define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ #define NETMAP_SW_RING 0x2000 /* process the sw ring */ #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ @@ -343,7 +364,7 @@ struct nmreq { */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* interface unregister */ +#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ |
