From 37e3a6d349581b4dd0aebf24be7b1b159a698dcf Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Sun, 16 Oct 2016 14:13:32 +0000 Subject: Import the current version of netmap, aligned with the one on github. This commit, long overdue, contains contributions in the last 2 years from Stefano Garzarella, Giuseppe Lettieri, Vincenzo Maffione, including: + fixes on monitor ports + the 'ptnet' virtual device driver, and ptnetmap backend, for high speed virtual passthrough on VMs (bhyve fixes in an upcoming commit) + improved emulated netmap mode + more robust error handling + removal of stale code + various fixes to code and documentation (some mixup between RX and TX parameters, and private and public variables) We also include an additional tool, nmreplay, which is functionally equivalent to tcpreplay but operating on netmap ports. --- sys/dev/netmap/if_ixl_netmap.h | 4 +- sys/dev/netmap/if_lem_netmap.h | 201 +----- sys/dev/netmap/ixgbe_netmap.h | 21 +- sys/dev/netmap/netmap.c | 1252 ++++++++++++++++++++--------------- sys/dev/netmap/netmap_freebsd.c | 762 +++++++++++++++++++-- sys/dev/netmap/netmap_generic.c | 936 ++++++++++++++++++-------- sys/dev/netmap/netmap_kern.h | 658 ++++++++++++++---- sys/dev/netmap/netmap_mbq.c | 9 +- sys/dev/netmap/netmap_mbq.h | 18 +- sys/dev/netmap/netmap_mem2.c | 932 ++++++++++++++++++++++++-- sys/dev/netmap/netmap_mem2.h | 20 +- sys/dev/netmap/netmap_monitor.c | 112 ++-- sys/dev/netmap/netmap_offloadings.c | 260 +++++--- sys/dev/netmap/netmap_pipe.c | 156 +++-- sys/dev/netmap/netmap_vale.c | 665 ++++++++++++++----- 15 files changed, 4404 insertions(+), 1602 deletions(-) (limited to 'sys/dev/netmap') diff --git a/sys/dev/netmap/if_ixl_netmap.h b/sys/dev/netmap/if_ixl_netmap.h index 2c7f9be541b3a..223dc06e36abc 100644 --- a/sys/dev/netmap/if_ixl_netmap.h +++ b/sys/dev/netmap/if_ixl_netmap.h @@ -59,7 +59,7 @@ extern int ixl_rx_miss, ixl_rx_miss_bufs, ixl_crcstrip; /* * device-specific sysctl variables: * - * ixl_crcstrip: 0: keep CRC in rx 
frames (default), 1: strip it. + * ixl_crcstrip: 0: NIC keeps CRC in rx frames, 1: NIC strips it (default). * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. @@ -73,7 +73,7 @@ SYSCTL_DECL(_dev_netmap); */ #if 0 SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_crcstrip, - CTLFLAG_RW, &ixl_crcstrip, 1, "strip CRC on rx frames"); + CTLFLAG_RW, &ixl_crcstrip, 1, "NIC strips CRC on rx frames"); #endif SYSCTL_INT(_dev_netmap, OID_AUTO, ixl_rx_miss, CTLFLAG_RW, &ixl_rx_miss, 0, "potentially missed rx intr"); diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 0ec9b13466091..1c2afbd18f10a 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -81,6 +81,22 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff) } +static void +lem_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + EM_CORE_LOCK(adapter); + if (onoff) { + lem_enable_intr(adapter); + } else { + lem_disable_intr(adapter); + } + EM_CORE_UNLOCK(adapter); +} + + /* * Reconcile kernel and user view of the transmit ring. 
*/ @@ -99,10 +115,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint64_t *csbd = (uint64_t *)(csb + 1); -#endif /* NIC_PARAVIRT */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -113,19 +125,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ -#ifdef NIC_PARAVIRT - int do_kick = 0; - uint64_t t = 0; // timestamp - int n = head - nm_i; - if (n < 0) - n += lim + 1; - if (csb) { - t = rdtsc(); /* last timestamp */ - csbd[16] += t - csbd[0]; /* total Wg */ - csbd[17] += n; /* Wg count */ - csbd[0] = t; - } -#endif /* NIC_PARAVIRT */ nic_i = netmap_idx_k2n(kring, nm_i); while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; @@ -166,38 +165,8 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) { - t = rdtsc(); - if (csb->host_need_txkick == 2) { - /* can compute an update of delta */ - int64_t delta = t - csbd[3]; - if (delta < 0) - delta = -delta; - if (csbd[8] == 0 || delta < csbd[8]) { - csbd[8] = delta; - csbd[9]++; - } - csbd[10]++; - } - csb->guest_tdt = nic_i; - csbd[18] += t - csbd[0]; // total wp - csbd[19] += n; - } - if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1)) - do_kick = 1; - if (do_kick) -#endif /* NIC_PARAVIRT */ /* (re)start the tx unit up to slot nic_i (excluded) */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); -#ifdef NIC_PARAVIRT - if (do_kick) { - uint64_t t1 = rdtsc(); - csbd[20] += t1 - t; // total Np - csbd[21]++; - } -#endif /* NIC_PARAVIRT */ } /* @@ -206,93 +175,6 @@ lem_netmap_txsync(struct netmap_kring *kring, int flags) if (ticks != 
kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { kring->last_reclaim = ticks; /* record completed transmissions using TDH */ -#ifdef NIC_PARAVIRT - /* host updates tdh unconditionally, and we have - * no side effects on reads, so we can read from there - * instead of exiting. - */ - if (csb) { - static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0; - u_int x = adapter->next_tx_to_clean; - csbd[19]++; // XXX count reclaims - nic_i = csb->host_tdh; - if (csb->guest_csb_on) { - if (nic_i == x) { - bad++; - csbd[24]++; // failed reclaims - /* no progress, request kick and retry */ - csb->guest_need_txkick = 1; - mb(); // XXX barrier - nic_i = csb->host_tdh; - } else { - good++; - } - if (nic_i != x) { - csb->guest_need_txkick = 2; - if (nic_i == csb->guest_tdt) - drain++; - else - nodrain++; -#if 1 - if (netmap_adaptive_io) { - /* new mechanism: last half ring (or so) - * released one slot at a time. - * This effectively makes the system spin. - * - * Take next_to_clean + 1 as a reference. - * tdh must be ahead or equal - * On entry, the logical order is - * x < tdh = nic_i - * We first push tdh up to avoid wraps. - * The limit is tdh-ll (half ring). - * if tdh-256 < x we report x; - * else we report tdh-256 - */ - u_int tdh = nic_i; - u_int ll = csbd[15]; - u_int delta = lim/8; - if (netmap_adaptive_io == 2 || ll > delta) - csbd[15] = ll = delta; - else if (netmap_adaptive_io == 1 && ll > 1) { - csbd[15]--; - } - - if (nic_i >= kring->nkr_num_slots) { - RD(5, "bad nic_i %d on input", nic_i); - } - x = nm_next(x, lim); - if (tdh < x) - tdh += lim + 1; - if (tdh <= x + ll) { - nic_i = x; - csbd[25]++; //report n + 1; - } else { - tdh = nic_i; - if (tdh < ll) - tdh += lim + 1; - nic_i = tdh - ll; - csbd[26]++; // report tdh - ll - } - } -#endif - } else { - /* we stop, count whether we are idle or not */ - int bh_active = csb->host_need_txkick & 2 ? 
4 : 0; - csbd[27+ csb->host_need_txkick]++; - if (netmap_adaptive_io == 1) { - if (bh_active && csbd[15] > 1) - csbd[15]--; - else if (!bh_active && csbd[15] < lim/2) - csbd[15]++; - } - bad--; - fail++; - } - } - RD(1, "drain %d nodrain %d good %d retry %d fail %d", - drain, nodrain, good, bad, fail); - } else -#endif /* !NIC_PARAVIRT */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); @@ -324,21 +206,10 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; -#ifdef NIC_PARAVIRT - struct paravirt_csb *csb = adapter->csb; - uint32_t csb_mode = csb && csb->guest_csb_on; - uint32_t do_host_rxkick = 0; -#endif /* NIC_PARAVIRT */ if (head > lim) return netmap_ring_reinit(kring); -#ifdef NIC_PARAVIRT - if (csb_mode) { - force_update = 1; - csb->guest_need_rxkick = 0; - } -#endif /* NIC_PARAVIRT */ /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -357,23 +228,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) uint32_t staterr = le32toh(curr->status); int len; -#ifdef NIC_PARAVIRT - if (csb_mode) { - if ((staterr & E1000_RXD_STAT_DD) == 0) { - /* don't bother to retry if more than 1 pkt */ - if (n > 1) - break; - csb->guest_need_rxkick = 1; - wmb(); - staterr = le32toh(curr->status); - if ((staterr & E1000_RXD_STAT_DD) == 0) { - break; - } else { /* we are good */ - csb->guest_need_rxkick = 0; - } - } - } else -#endif /* NIC_PARAVIRT */ if ((staterr & E1000_RXD_STAT_DD) == 0) break; len = le16toh(curr->length) - 4; // CRC @@ -390,18 +244,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ -#ifdef NIC_PARAVIRT - if (csb_mode) { - if (n > 1) { - /* leave one spare buffer so we avoid rxkicks */ - nm_i = nm_prev(nm_i, lim); - nic_i = 
nm_prev(nic_i, lim); - n--; - } else { - csb->guest_need_rxkick = 1; - } - } -#endif /* NIC_PARAVIRT */ ND("%d new packets at nic %d nm %d tail %d", n, adapter->next_rx_desc_to_check, @@ -440,10 +282,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) curr->status = 0; bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); -#ifdef NIC_PARAVIRT - if (csb_mode && csb->host_rxkick_at == nic_i) - do_host_rxkick = 1; -#endif /* NIC_PARAVIRT */ nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -455,12 +293,6 @@ lem_netmap_rxsync(struct netmap_kring *kring, int flags) * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); -#ifdef NIC_PARAVIRT - /* set unconditionally, then also kick if needed */ - if (csb) - csb->guest_rdt = nic_i; - if (!csb_mode || do_host_rxkick) -#endif /* NIC_PARAVIRT */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } @@ -486,6 +318,7 @@ lem_netmap_attach(struct adapter *adapter) na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; na.num_tx_rings = na.num_rx_rings = 1; + na.nm_intr = lem_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index 0f34e72185032..7986c99651732 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -53,7 +53,7 @@ void ixgbe_netmap_attach(struct adapter *adapter); /* * device-specific sysctl variables: * - * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. + * ix_crcstrip: 0: NIC keeps CRC in rx frames (default), 1: NIC strips it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. 
@@ -65,7 +65,7 @@ SYSCTL_DECL(_dev_netmap); static int ix_rx_miss, ix_rx_miss_bufs; int ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, - CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); + CTLFLAG_RW, &ix_crcstrip, 0, "NIC strips CRC on rx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, @@ -109,6 +109,20 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } +static void +ixgbe_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct adapter *adapter = ifp->if_softc; + + IXGBE_CORE_LOCK(adapter); + if (onoff) { + ixgbe_enable_intr(adapter); // XXX maybe ixgbe_stop ? + } else { + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? + } + IXGBE_CORE_UNLOCK(adapter); +} /* * Register/unregister. We are already under netmap lock. @@ -311,7 +325,7 @@ ixgbe_netmap_txsync(struct netmap_kring *kring, int flags) * good way. */ nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_IS_VF(adapter) ? - IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); + IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -486,6 +500,7 @@ ixgbe_netmap_attach(struct adapter *adapter) na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + na.nm_intr = ixgbe_netmap_intr; netmap_attach(&na); } diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index aff757bdadfec..d92d342af83cd 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,9 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
+ * Copyright (C) 2011-2014 Matteo Landi + * Copyright (C) 2011-2016 Luigi Rizzo + * Copyright (C) 2011-2016 Giuseppe Lettieri + * Copyright (C) 2011-2016 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -133,13 +137,12 @@ ports attached to the switch) * > select()able file descriptor on which events are reported. * * Internally, we allocate a netmap_priv_d structure, that will be - * initialized on ioctl(NIOCREGIF). + * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d + * structure for each open(). * * os-specific: - * FreeBSD: netmap_open (netmap_freebsd.c). The priv is - * per-thread. - * linux: linux_netmap_open (netmap_linux.c). The priv is - * per-open. + * FreeBSD: see netmap_open() (netmap_freebsd.c) + * linux: see linux_netmap_open() (netmap_linux.c) * * > 2. on each descriptor, the process issues an ioctl() to identify * > the interface that should report events to the file descriptor. @@ -299,18 +302,17 @@ ports attached to the switch) * netmap_transmit() * na->nm_notify == netmap_notify() * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat + * kring->nm_sync() == netmap_rxsync_from_host * netmap_rxsync_from_host(na, NULL, NULL) * - tx to host stack * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ?? 
XXX + * nm_os_send_up() + * FreeBSD: na->if_input() == ether_input() * linux: netif_rx() with NM_MAGIC_PRIORITY_RX * * - * * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- * * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach() @@ -319,10 +321,11 @@ ports attached to the switch) * concurrently: * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == generic_netmap_txsync() - * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX - * generic_ndo_start_xmit() - * orig. dev. start_xmit - * FreeBSD: na->if_transmit() == orig. dev if_transmit + * nm_os_generic_xmit_frame() + * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * gna->save_start_xmit == orig. dev. start_xmit + * FreeBSD: na->if_transmit() == orig. dev if_transmit * 2) generic_mbuf_destructor() * na->nm_notify() == netmap_notify() * - rx from netmap userspace: @@ -333,24 +336,15 @@ ports attached to the switch) * generic_rx_handler() * mbq_safe_enqueue() * na->nm_notify() == netmap_notify() - * - rx from host stack: - * concurrently: + * - rx from host stack + * FreeBSD: same as native + * Linux: same as native except: * 1) host stack - * linux: generic_ndo_start_xmit() - * netmap_transmit() - * FreeBSD: ifp->if_input() == netmap_transmit - * both: - * na->nm_notify() == netmap_notify() - * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_rxsync_from_host_compat - * netmap_rxsync_from_host(na, NULL, NULL) - * - tx to host stack: - * ioctl(NIOCTXSYNC)/netmap_poll() in process context - * kring->nm_sync() == netmap_txsync_to_host_compat - * netmap_txsync_to_host(na) - * NM_SEND_UP() - * FreeBSD: na->if_input() == ??? 
XXX - * linux: netif_rx() with NM_MAGIC_PRIORITY_RX + * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX + * ifp->ndo_start_xmit == generic_ndo_start_xmit() + * netmap_transmit() + * na->nm_notify() == netmap_notify() + * - tx to host stack (same as native): * * * -= VALE =- @@ -371,7 +365,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * - system device with generic support: @@ -384,7 +378,7 @@ ports attached to the switch) * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) - * kring->nm_sync() == netmap_rxsync_from_host_compat() + * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * (all cases) --> nm_bdg_flush() @@ -407,7 +401,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync_locked() * * - system device with generic adapter: @@ -418,7 +412,7 @@ ports attached to the switch) * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() - * kring->nm_sync() == netmap_txsync_to_host_compat + * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync() * */ @@ -455,29 +449,19 @@ ports attached to the switch) #include -/* reduce conditional code */ -// linux API, use for the knlist in FreeBSD -/* use a private mutex for the knlist */ -#define init_waitqueue_head(x) do { \ - struct mtx *m = &(x)->m; \ - mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); \ - knlist_init_mtx(&(x)->si.si_note, m); \ - } while (0) - -#define OS_selrecord(a, b) selrecord(a, &((b)->si)) -#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) - #elif defined(linux) #include "bsd_glue.h" - - #elif defined(__APPLE__) #warning OSX support is only partial 
#include "osx_glue.h" +#elif defined (_WIN32) + +#include "win_glue.h" + #else #error Unsupported platform @@ -492,47 +476,72 @@ ports attached to the switch) #include -MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); - /* user-controlled variables */ int netmap_verbose; static int netmap_no_timestamp; /* don't timestamp on rxsync */ - -SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); -SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, - CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); -SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, - CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); int netmap_mitigate = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; -SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, - CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); int netmap_txsync_retry = 2; -SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, - &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); - int netmap_adaptive_io = 0; -SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, - &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); - int netmap_flags = 0; /* debug flags */ -int netmap_fwd = 0; /* force transparent mode */ +static int netmap_fwd = 0; /* force transparent mode */ /* * netmap_admode selects the netmap mode to use. * Invalid values are reset to NETMAP_ADMODE_BEST */ -enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; static int netmap_admode = NETMAP_ADMODE_BEST; -int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ -int netmap_generic_ringsize = 1024; /* Generic ringsize. */ -int netmap_generic_rings = 1; /* number of queues in generic. 
*/ +/* netmap_generic_mit controls mitigation of RX notifications for + * the generic netmap adapter. The value is a time interval in + * nanoseconds. */ +int netmap_generic_mit = 100*1000; + +/* We use by default netmap-aware qdiscs with generic netmap adapters, + * even if there can be a little performance hit with hardware NICs. + * However, using the qdisc is the safer approach, for two reasons: + * 1) it prevents non-fifo qdiscs to break the TX notification + * scheme, which is based on mbuf destructors when txqdisc is + * not used. + * 2) it makes it possible to transmit over software devices that + * change skb->dev, like bridge, veth, ... + * + * Anyway users looking for the best performance should + * use native adapters. + */ +int netmap_generic_txqdisc = 1; + +/* Default number of slots and queues for generic adapters. */ +int netmap_generic_ringsize = 1024; +int netmap_generic_rings = 1; + +/* Non-zero if ptnet devices are allowed to use virtio-net headers. */ +int ptnet_vnet_hdr = 1; + +/* + * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated + * in some other operating systems + */ +SYSBEGIN(main_init); + +SYSCTL_DECL(_dev_netmap); +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, + CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); +SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, + &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); +SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, + &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); @@ -540,19 +549,24 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , ""); + +SYSEND; NMG_LOCK_T netmap_global_lock; -int netmap_use_count = 0; /* number of active netmap instances */ /* * mark the ring as stopped, and run through the locks * to make sure other users get to see it. + * stopped must be either NM_KR_STOPPED (for unbounded stop) + * or NM_KR_LOCKED (brief stop for mutual exclusion purposes) */ static void -netmap_disable_ring(struct netmap_kring *kr) +netmap_disable_ring(struct netmap_kring *kr, int stopped) { - kr->nkr_stopped = 1; - nm_kr_get(kr); + nm_kr_stop(kr, stopped); + // XXX check if nm_kr_stop is sufficient mtx_lock(&kr->q_lock); mtx_unlock(&kr->q_lock); nm_kr_put(kr); @@ -563,7 +577,7 @@ void netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped) { if (stopped) - netmap_disable_ring(NMR(na, t) + ring_id); + netmap_disable_ring(NMR(na, t) + ring_id, stopped); else NMR(na, t)[ring_id].nkr_stopped = 0; } @@ -590,13 +604,14 @@ netmap_set_all_rings(struct netmap_adapter *na, int stopped) * Convenience function used in drivers. Waits for current txsync()s/rxsync()s * to finish and prevents any new one from starting. Call this before turning * netmap mode off, or before removing the hardware rings (e.g., on module - * onload). As a rule of thumb for linux drivers, this should be placed near - * each napi_disable(). + * unload). 
*/ void netmap_disable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 1 /* stopped */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), NM_KR_STOPPED); + } } /* @@ -607,9 +622,34 @@ netmap_disable_all_rings(struct ifnet *ifp) void netmap_enable_all_rings(struct ifnet *ifp) { - netmap_set_all_rings(NA(ifp), 0 /* enabled */); + if (NM_NA_VALID(ifp)) { + netmap_set_all_rings(NA(ifp), 0 /* enabled */); + } +} + +void +netmap_make_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } } +void +netmap_undo_zombie(struct ifnet *ifp) +{ + if (NM_NA_VALID(ifp)) { + struct netmap_adapter *na = NA(ifp); + if (na->na_flags & NAF_ZOMBIE) { + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags &= ~NAF_ZOMBIE; + netmap_set_all_rings(na, 0); + } + } +} /* * generic bound_checking function @@ -727,28 +767,9 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -static void netmap_txsync_to_host(struct netmap_adapter *na); -static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); - -/* kring->nm_sync callback for the host tx ring */ -static int -netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_txsync_to_host(kring->na); - return 0; -} - -/* kring->nm_sync callback for the host rx ring */ -static int -netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) -{ - (void)flags; /* unused */ - netmap_rxsync_from_host(kring->na, NULL, NULL); - return 0; -} - - +/* nm_sync callbacks for the host rings */ +static int netmap_txsync_to_host(struct netmap_kring *kring, int flags); +static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags); /* create the krings array and initialize the fields common to all adapters. 
* The array layout is this: @@ -809,12 +830,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) kring->ring_id = i; kring->tx = t; kring->nkr_num_slots = ndesc; + kring->nr_mode = NKR_NETMAP_OFF; + kring->nr_pending_mode = NKR_NETMAP_OFF; if (i < nma_get_nrings(na, t)) { kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync); - } else if (i == na->num_tx_rings) { + } else { kring->nm_sync = (t == NR_TX ? - netmap_txsync_to_host_compat : - netmap_rxsync_from_host_compat); + netmap_txsync_to_host: + netmap_rxsync_from_host); } kring->nm_notify = na->nm_notify; kring->rhead = kring->rcur = kring->nr_hwcur = 0; @@ -822,14 +845,14 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) * IMPORTANT: Always keep one slot empty. */ kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0); - snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, + snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, nm_txrx2str(t), i); ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF); - init_waitqueue_head(&kring->si); + nm_os_selinfo_init(&kring->si); } - init_waitqueue_head(&na->si[t]); + nm_os_selinfo_init(&na->si[t]); } na->tailroom = na->rx_rings + n[NR_RX]; @@ -838,19 +861,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) } -#ifdef __FreeBSD__ -static void -netmap_knlist_destroy(NM_SELINFO_T *si) -{ - /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ - knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); - knlist_destroy(&si->si.si_note); - /* now we don't need the mutex anymore */ - mtx_destroy(&si->m); -} -#endif /* __FreeBSD__ */ - - /* undo the actions performed by netmap_krings_create */ /* call with NMG_LOCK held */ void @@ -860,12 +870,12 @@ netmap_krings_delete(struct netmap_adapter *na) enum txrx t; for_rx_tx(t) - netmap_knlist_destroy(&na->si[t]); + nm_os_selinfo_uninit(&na->si[t]); /* we rely on the krings layout described above */ for ( ; kring != na->tailroom; kring++) { mtx_destroy(&kring->q_lock); - netmap_knlist_destroy(&kring->si); + nm_os_selinfo_uninit(&kring->si); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; @@ -878,14 +888,14 @@ netmap_krings_delete(struct netmap_adapter *na) * them first. */ /* call with NMG_LOCK held */ -static void +void netmap_hw_krings_delete(struct netmap_adapter *na) { struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; ND("destroy sw mbq with len %d", mbq_len(q)); mbq_purge(q); - mbq_safe_destroy(q); + mbq_safe_fini(q); netmap_krings_delete(na); } @@ -898,29 +908,38 @@ netmap_hw_krings_delete(struct netmap_adapter *na) */ /* call with NMG_LOCK held */ static void netmap_unset_ringid(struct netmap_priv_d *); -static void netmap_rel_exclusive(struct netmap_priv_d *); -static void +static void netmap_krings_put(struct netmap_priv_d *); +void netmap_do_unregif(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; NMG_LOCK_ASSERT(); na->active_fds--; - /* release exclusive use if it was requested on regif */ - netmap_rel_exclusive(priv); - if (na->active_fds <= 0) { /* last instance */ - - if (netmap_verbose) - D("deleting last instance for %s", na->name); + /* unset nr_pending_mode and possibly release exclusive mode */ + netmap_krings_put(priv); #ifdef WITH_MONITOR + /* XXX check whether we have to do something with monitor + * when rings change nr_mode. 
*/ + if (na->active_fds <= 0) { /* walk through all the rings and tell any monitor * that the port is going to exit netmap mode */ netmap_monitor_stop(na); + } #endif + + if (na->active_fds <= 0 || nm_kring_pending(priv)) { + na->nm_register(na, 0); + } + + /* delete rings and buffers that are no longer needed */ + netmap_mem_rings_delete(na); + + if (na->active_fds <= 0) { /* last instance */ /* - * (TO CHECK) This function is only called + * (TO CHECK) We enter here * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. @@ -933,16 +952,16 @@ netmap_do_unregif(struct netmap_priv_d *priv) * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(na, 0); /* off, clear flags */ - /* Wake up any sleeping threads. netmap_poll will - * then return POLLERR - * XXX The wake up now must happen during *_down(), when - * we order all activities to stop. -gl - */ - /* delete rings and buffers */ - netmap_mem_rings_delete(na); + if (netmap_verbose) + D("deleting last instance for %s", na->name); + + if (nm_netmap_on(na)) { + D("BUG: netmap on while going to delete the krings"); + } + na->nm_krings_delete(na); } + /* possibly decrement counter of tx_si/rx_si users */ netmap_unset_ringid(priv); /* delete the nifp */ @@ -962,6 +981,20 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } +struct netmap_priv_d* +netmap_priv_new(void) +{ + struct netmap_priv_d *priv; + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return NULL; + priv->np_refs = 1; + nm_os_get_module(); + return priv; +} + /* * Destructor of the netmap_priv_d, called when the fd is closed * Action: undo all the things done by NIOCREGIF, @@ -971,22 +1004,22 @@ nm_si_user(struct netmap_priv_d *priv, enum txrx t) * */ /* call with NMG_LOCK held */ -int -netmap_dtor_locked(struct 
netmap_priv_d *priv) +void +netmap_priv_delete(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; /* number of active references to this fd */ if (--priv->np_refs > 0) { - return 0; + return; } - netmap_use_count--; - if (!na) { - return 1; //XXX is it correct? + nm_os_put_module(); + if (na) { + netmap_do_unregif(priv); } - netmap_do_unregif(priv); - netmap_adapter_put(na); - return 1; + netmap_unget_na(na, priv->np_ifp); + bzero(priv, sizeof(*priv)); /* for safety */ + free(priv, M_DEVBUF); } @@ -995,15 +1028,10 @@ void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; - int last_instance; NMG_LOCK(); - last_instance = netmap_dtor_locked(priv); + netmap_priv_delete(priv); NMG_UNLOCK(); - if (last_instance) { - bzero(priv, sizeof(*priv)); /* for safety */ - free(priv, M_DEVBUF); - } } @@ -1036,14 +1064,19 @@ static void netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; + struct mbuf *head = NULL, *prev = NULL; /* send packets up, outside the lock */ while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); - NM_SEND_UP(dst, m); + prev = nm_os_send_up(dst, m, prev); + if (head == NULL) + head = prev; } - mbq_destroy(q); + if (head) + nm_os_send_up(dst, NULL, head); + mbq_fini(q); } @@ -1081,6 +1114,27 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) } } +static inline int +_nm_may_forward(struct netmap_kring *kring) +{ + return ((netmap_fwd || kring->ring->flags & NR_FORWARD) && + kring->na->na_flags & NAF_HOST_RINGS && + kring->tx == NR_RX); +} + +static inline int +nm_may_forward_up(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id != kring->na->num_rx_rings; +} + +static inline int +nm_may_forward_down(struct netmap_kring *kring) +{ + return _nm_may_forward(kring) && + kring->ring_id == kring->na->num_rx_rings; +} /* * Send to the NIC rings packets marked NS_FORWARD between @@ -1107,7 +1161,7 
@@ netmap_sw_to_nic(struct netmap_adapter *na) for (; rxcur != head && !nm_ring_empty(rdst); rxcur = nm_next(rxcur, src_lim) ) { struct netmap_slot *src, *dst, tmp; - u_int dst_cur = rdst->cur; + u_int dst_head = rdst->head; src = &rxslot[rxcur]; if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) @@ -1115,7 +1169,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) sent++; - dst = &rdst->slot[dst_cur]; + dst = &rdst->slot[dst_head]; tmp = *src; @@ -1126,7 +1180,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - rdst->cur = nm_next(dst_cur, dst_lim); + rdst->head = rdst->cur = nm_next(dst_head, dst_lim); } /* if (sent) XXX txsync ? */ } @@ -1140,10 +1194,10 @@ netmap_sw_to_nic(struct netmap_adapter *na) * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void -netmap_txsync_to_host(struct netmap_adapter *na) +static int +netmap_txsync_to_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; + struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct mbq q; @@ -1162,6 +1216,7 @@ netmap_txsync_to_host(struct netmap_adapter *na) kring->nr_hwtail -= lim + 1; netmap_send_up(na->ifp, &q); + return 0; } @@ -1171,17 +1226,15 @@ netmap_txsync_to_host(struct netmap_adapter *na) * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler - * (we know because td != NULL). + * (we know because sr != NULL). * - * NOTE: on linux, selrecord() is defined as a macro and uses pwait - * as an additional hidden argument. 
* returns the number of packets delivered to tx queues in * transparent mode, or a negative value if error */ static int -netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) +netmap_rxsync_from_host(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; + struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; @@ -1189,9 +1242,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai int ret = 0; struct mbq *q = &kring->rx_queue, fq; - (void)pwait; /* disable unused warnings */ - (void)td; - mbq_init(&fq); /* fq holds packets to be freed */ mbq_lock(q); @@ -1226,19 +1276,20 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* something was released */ - if (netmap_fwd || kring->ring->flags & NR_FORWARD) + if (nm_may_forward_down(kring)) { ret = netmap_sw_to_nic(na); + if (ret > 0) { + kring->nr_kflags |= NR_FORWARD; + ret = 0; + } + } kring->nr_hwcur = head; } - /* access copies of cur,tail in the kring */ - if (kring->rcur == kring->rtail && td) /* no bufs available */ - OS_selrecord(td, &kring->si); - mbq_unlock(q); mbq_purge(&fq); - mbq_destroy(&fq); + mbq_fini(&fq); return ret; } @@ -1267,17 +1318,14 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * */ - +static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) { /* generic support */ int i = netmap_admode; /* Take a snapshot. 
*/ struct netmap_adapter *prev_na; -#ifdef WITH_GENERIC - struct netmap_generic_adapter *gna; int error = 0; -#endif *na = NULL; /* default */ @@ -1285,7 +1333,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) i = netmap_admode = NETMAP_ADMODE_BEST; - if (NETMAP_CAPABLE(ifp)) { + if (NM_NA_VALID(ifp)) { prev_na = NA(ifp); /* If an adapter already exists, return it if * there are active file descriptors or if @@ -1310,10 +1358,9 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) /* If there isn't native support and netmap is not allowed * to use generic adapters, we cannot satisfy the request. */ - if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE) return EOPNOTSUPP; -#ifdef WITH_GENERIC /* Otherwise, create a generic adapter and return it, * saving the previously used netmap adapter, if any. * @@ -1328,25 +1375,12 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * the branches above. This ensures that we never override * a generic adapter with another generic adapter. */ - prev_na = NA(ifp); error = generic_netmap_attach(ifp); if (error) return error; *na = NA(ifp); - gna = (struct netmap_generic_adapter*)NA(ifp); - gna->prev = prev_na; /* save old na */ - if (prev_na != NULL) { - ifunit_ref(ifp->if_xname); - // XXX add a refcount ? - netmap_adapter_get(prev_na); - } - ND("Created generic NA %p (prev %p)", gna, gna->prev); - return 0; -#else /* !WITH_GENERIC */ - return EOPNOTSUPP; -#endif } @@ -1364,21 +1398,22 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) * could not be allocated. * If successful, hold a reference to the netmap adapter. * - * No reference is kept on the real interface, which may then - * disappear at any time. + * If the interface specified by nmr is a system one, also keep + * a reference to it and return a valid *ifp. 
*/ int -netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create) { - struct ifnet *ifp = NULL; int error = 0; struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ + *ifp = NULL; NMG_LOCK_ASSERT(); - /* we cascade through all possible types of netmap adapter. + /* We cascade through all possible types of netmap adapter. * All netmap_get_*_na() functions return an error and an na, * with the following combinations: * @@ -1389,6 +1424,11 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * !0 !NULL impossible */ + /* try to see if this is a ptnetmap port */ + error = netmap_get_pt_host_na(nmr, na, create); + if (error || *na != NULL) + return error; + /* try to see if this is a monitor port */ error = netmap_get_monitor_na(nmr, na, create); if (error || *na != NULL) @@ -1413,12 +1453,12 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * This may still be a tap, a veth/epair, or even a * persistent VALE port. 
*/ - ifp = ifunit_ref(nmr->nr_name); - if (ifp == NULL) { + *ifp = ifunit_ref(nmr->nr_name); + if (*ifp == NULL) { return ENXIO; } - error = netmap_get_hw_na(ifp, &ret); + error = netmap_get_hw_na(*ifp, &ret); if (error) goto out; @@ -1426,15 +1466,42 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) netmap_adapter_get(ret); out: - if (error && ret != NULL) - netmap_adapter_put(ret); - - if (ifp) - if_rele(ifp); /* allow live unloading of drivers modules */ + if (error) { + if (ret) + netmap_adapter_put(ret); + if (*ifp) { + if_rele(*ifp); + *ifp = NULL; + } + } return error; } +/* undo netmap_get_na() */ +void +netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp) +{ + if (ifp) + if_rele(ifp); + if (na) + netmap_adapter_put(na); +} + + +#define NM_FAIL_ON(t) do { \ + if (unlikely(t)) { \ + RD(5, "%s: fail '" #t "' " \ + "h %d c %d t %d " \ + "rh %d rc %d rt %d " \ + "hc %d ht %d", \ + kring->name, \ + head, cur, ring->tail, \ + kring->rhead, kring->rcur, kring->rtail, \ + kring->nr_hwcur, kring->nr_hwtail); \ + return kring->nkr_num_slots; \ + } \ +} while (0) /* * validate parameters on entry for *_txsync() @@ -1449,11 +1516,9 @@ out: * * hwcur, rhead, rtail and hwtail are reliable */ -static u_int -nm_txsync_prologue(struct netmap_kring *kring) +u_int +nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { -#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; } - struct netmap_ring *ring = kring->ring; u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ u_int n = kring->nkr_num_slots; @@ -1463,35 +1528,34 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->nr_hwcur, kring->nr_hwtail, ring->head, ring->cur, ring->tail); #if 1 /* kernel sanity checks; but we can trust the kring. 
*/ - if (kring->nr_hwcur >= n || kring->rhead >= n || - kring->rtail >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* - * user sanity checks. We only use 'cur', - * A, B, ... are possible positions for cur: + * user sanity checks. We only use head, + * A, B, ... are possible positions for head: * - * 0 A cur B tail C n-1 - * 0 D tail E cur F n-1 + * 0 A rhead B rtail C n-1 + * 0 D rtail E rhead F n-1 * * B, F, D are valid. A, C, E are wrong */ if (kring->rtail >= kring->rhead) { /* want rhead <= head <= rtail */ - NM_ASSERT(head < kring->rhead || head > kring->rtail); + NM_FAIL_ON(head < kring->rhead || head > kring->rtail); /* and also head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* here rtail < rhead */ /* we need head outside rtail .. rhead */ - NM_ASSERT(head > kring->rtail && head < kring->rhead); + NM_FAIL_ON(head > kring->rtail && head < kring->rhead); /* two cases now: head <= rtail or head >= rhead */ if (head <= kring->rtail) { /* want head <= cur <= rtail */ - NM_ASSERT(cur < head || cur > kring->rtail); + NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* head >= rhead */ /* cur must be outside rtail..head */ - NM_ASSERT(cur > kring->rtail && cur < head); + NM_FAIL_ON(cur > kring->rtail && cur < head); } } if (ring->tail != kring->rtail) { @@ -1502,15 +1566,6 @@ nm_txsync_prologue(struct netmap_kring *kring) kring->rhead = head; kring->rcur = cur; return head; - -error: - RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d", - kring->name, - head, cur, ring->tail, - kring->rhead, kring->rcur, kring->rtail, - kring->nr_hwcur, kring->nr_hwtail); - return n; -#undef NM_ASSERT } @@ -1525,10 +1580,9 @@ error: * hwcur and hwtail are reliable. 
* */ -static u_int -nm_rxsync_prologue(struct netmap_kring *kring) +u_int +nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { - struct netmap_ring *ring = kring->ring; uint32_t const n = kring->nkr_num_slots; uint32_t head, cur; @@ -1546,30 +1600,24 @@ nm_rxsync_prologue(struct netmap_kring *kring) cur = kring->rcur = ring->cur; /* read only once */ head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) - goto error; + NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* user sanity checks */ if (kring->nr_hwtail >= kring->nr_hwcur) { /* want hwcur <= rhead <= hwtail */ - if (head < kring->nr_hwcur || head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail); /* and also rhead <= rcur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* we need rhead outside hwtail..hwcur */ - if (head < kring->nr_hwcur && head > kring->nr_hwtail) - goto error; + NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail); /* two cases now: head <= hwtail or head >= hwcur */ if (head <= kring->nr_hwtail) { /* want head <= cur <= hwtail */ - if (cur < head || cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* cur must be outside hwtail..head */ - if (cur < head && cur > kring->nr_hwtail) - goto error; + NM_FAIL_ON(cur < head && cur > kring->nr_hwtail); } } if (ring->tail != kring->rtail) { @@ -1579,13 +1627,6 @@ nm_rxsync_prologue(struct netmap_kring *kring) ring->tail = kring->rtail; } return head; - -error: - RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", - kring->nr_hwcur, - kring->rcur, kring->nr_hwtail, - kring->rhead, kring->rcur, ring->tail); - return n; } @@ -1659,6 +1700,7 @@ netmap_interp_ringid(struct netmap_priv_d 
*priv, uint16_t ringid, uint32_t flags struct netmap_adapter *na = priv->np_na; u_int j, i = ringid & NETMAP_RING_MASK; u_int reg = flags & NR_REG_MASK; + int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY }; enum txrx t; if (reg == NR_REG_DEFAULT) { @@ -1672,48 +1714,58 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags } D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - switch (reg) { - case NR_REG_ALL_NIC: - case NR_REG_PIPE_MASTER: - case NR_REG_PIPE_SLAVE: - for_rx_tx(t) { + + if ((flags & NR_PTNETMAP_HOST) && (reg != NR_REG_ALL_NIC || + flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) { + D("Error: only NR_REG_ALL_NIC supported with netmap passthrough"); + return EINVAL; + } + + for_rx_tx(t) { + if (flags & excluded_direction[t]) { + priv->np_qfirst[t] = priv->np_qlast[t] = 0; + continue; + } + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: priv->np_qfirst[t] = 0; priv->np_qlast[t] = nma_get_nrings(na, t); - } - ND("%s %d %d", "ALL/PIPE", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_SW: - case NR_REG_NIC_SW: - if (!(na->na_flags & NAF_HOST_RINGS)) { - D("host rings not supported"); - return EINVAL; - } - for_rx_tx(t) { + ND("ALL/PIPE: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } priv->np_qfirst[t] = (reg == NR_REG_SW ? nma_get_nrings(na, t) : 0); priv->np_qlast[t] = nma_get_nrings(na, t) + 1; - } - ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", - priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); - break; - case NR_REG_ONE_NIC: - if (i >= na->num_tx_rings && i >= na->num_rx_rings) { - D("invalid ring id %d", i); - return EINVAL; - } - for_rx_tx(t) { + ND("%s: %s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", + nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } /* if not enough rings, use the first one */ j = i; if (j >= nma_get_nrings(na, t)) j = 0; priv->np_qfirst[t] = j; priv->np_qlast[t] = j + 1; + ND("ONE_NIC: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } - break; - default: - D("invalid regif type %d", reg); - return EINVAL; } priv->np_flags = (flags & ~NR_REG_MASK) | reg; @@ -1776,11 +1828,12 @@ netmap_unset_ringid(struct netmap_priv_d *priv) } -/* check that the rings we want to bind are not exclusively owned by a previous - * bind. If exclusive ownership has been requested, we also mark the rings. +/* Set the nr_pending_mode for the requested rings. + * If requested, also try to get exclusive access to the rings, provided + * the rings we want to bind are not exclusively owned by a previous bind. */ static int -netmap_get_exclusive(struct netmap_priv_d *priv) +netmap_krings_get(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1811,16 +1864,16 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } } - /* second round: increment usage cound and possibly - * mark as exclusive + /* second round: increment usage count (possibly marking them + * as exclusive) and set the nr_pending_mode */ - for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; kring->users++; if (excl) kring->nr_kflags |= NKR_EXCLUSIVE; + kring->nr_pending_mode = NKR_NETMAP_ON; } } @@ -1828,9 +1881,11 @@ netmap_get_exclusive(struct netmap_priv_d *priv) } -/* undo netmap_get_ownership() */ +/* Undo netmap_krings_get(). This is done by clearing the exclusive mode + * if was asked on regif, and unset the nr_pending_mode if we are the + * last users of the involved rings. 
*/ static void -netmap_rel_exclusive(struct netmap_priv_d *priv) +netmap_krings_put(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; @@ -1852,6 +1907,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) if (excl) kring->nr_kflags &= ~NKR_EXCLUSIVE; kring->users--; + if (kring->users == 0) + kring->nr_pending_mode = NKR_NETMAP_OFF; } } } @@ -1899,9 +1956,8 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * (put the adapter in netmap mode) * * This may be one of the following: - * (XXX these should be either all *_register or all *_reg 2014-03-15) * - * * netmap_hw_register (hw ports) + * * netmap_hw_reg (hw ports) * checks that the ifp is still there, then calls * the hardware specific callback; * @@ -1919,7 +1975,7 @@ netmap_rel_exclusive(struct netmap_priv_d *priv) * intercept the sync callbacks of the monitored * rings * - * * netmap_bwrap_register (bwraps) + * * netmap_bwrap_reg (bwraps) * cross-link the bwrap and hwna rings, * forward the request to the hwna, override * the hwna notify callback (to get the frames @@ -1948,7 +2004,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (na->active_fds == 0) { /* * If this is the first registration of the adapter, - * also create the netmap rings and their in-kernel view, + * create the in-kernel view of the netmap rings, * the netmap krings. 
*/ @@ -1960,39 +2016,48 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, if (error) goto err_drop_mem; - /* create all missing netmap rings */ - error = netmap_mem_rings_create(na); - if (error) - goto err_del_krings; } - /* now the kring must exist and we can check whether some - * previous bind has exclusive ownership on them + /* now the krings must exist and we can check whether some + * previous bind has exclusive ownership on them, and set + * nr_pending_mode */ - error = netmap_get_exclusive(priv); + error = netmap_krings_get(priv); if (error) - goto err_del_rings; + goto err_del_krings; + + /* create all needed missing netmap rings */ + error = netmap_mem_rings_create(na); + if (error) + goto err_rel_excl; /* in all cases, create a new netmap if */ nifp = netmap_mem_if_new(na); if (nifp == NULL) { error = ENOMEM; - goto err_rel_excl; + goto err_del_rings; } - na->active_fds++; - if (!nm_netmap_on(na)) { - /* Netmap not active, set the card in netmap mode - * and make it use the shared buffers. - */ + if (na->active_fds == 0) { /* cache the allocator info in the na */ - netmap_mem_get_lut(na->nm_mem, &na->na_lut); - ND("%p->na_lut == %p", na, na->na_lut.lut); - error = na->nm_register(na, 1); /* mode on */ - if (error) + error = netmap_mem_get_lut(na->nm_mem, &na->na_lut); + if (error) goto err_del_if; + ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal, + na->na_lut.objsize); + } + + if (nm_kring_pending(priv)) { + /* Some kring is switching mode, tell the adapter to + * react on this. */ + error = na->nm_register(na, 1); + if (error) + goto err_put_lut; } + /* Commit the reference. */ + na->active_fds++; + /* * advertise that the interface is ready by setting np_nifp. 
* The barrier is needed because readers (poll, *SYNC and mmap) @@ -2003,15 +2068,15 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, return 0; +err_put_lut: + if (na->active_fds == 0) + memset(&na->na_lut, 0, sizeof(na->na_lut)); err_del_if: - memset(&na->na_lut, 0, sizeof(na->na_lut)); - na->active_fds--; netmap_mem_if_delete(na, nifp); err_rel_excl: - netmap_rel_exclusive(priv); + netmap_krings_put(priv); err_del_rings: - if (na->active_fds == 0) - netmap_mem_rings_delete(na); + netmap_mem_rings_delete(na); err_del_krings: if (na->active_fds == 0) na->nm_krings_delete(na); @@ -2024,41 +2089,23 @@ err: /* - * update kring and ring at the end of txsync. + * update kring and ring at the end of rxsync/txsync. */ static inline void -nm_txsync_finalize(struct netmap_kring *kring) +nm_sync_finalize(struct netmap_kring *kring) { - /* update ring tail to what the kernel knows */ + /* + * Update ring tail to what the kernel knows + * After txsync: head/rhead/hwcur might be behind cur/rcur + * if no carrier. + */ kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* note, head/rhead/hwcur might be behind cur/rcur - * if no carrier - */ ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail); } - -/* - * update kring and ring at the end of rxsync - */ -static inline void -nm_rxsync_finalize(struct netmap_kring *kring) -{ - /* tell userspace that there might be new packets */ - //struct netmap_ring *ring = kring->ring; - ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, - kring->nr_hwtail); - kring->ring->tail = kring->rtail = kring->nr_hwtail; - /* make a copy of the state for next round */ - kring->rhead = kring->ring->head; - kring->rcur = kring->ring->cur; -} - - - /* * ioctl(2) support for the "netmap" device. * @@ -2072,21 +2119,17 @@ nm_rxsync_finalize(struct netmap_kring *kring) * Return 0 on success, errno otherwise. 
*/ int -netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, - int fflag, struct thread *td) +netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td) { - struct netmap_priv_d *priv = NULL; struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; - int error; + struct ifnet *ifp = NULL; + int error = 0; u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; enum txrx t; - (void)dev; /* UNUSED */ - (void)fflag; /* UNUSED */ - if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; @@ -2101,15 +2144,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return EINVAL; } } - CURVNET_SET(TD_TO_VNET(td)); - - error = devfs_get_cdevpriv((void **)&priv); - if (error) { - CURVNET_RESTORE(); - /* XXX ENOENT should be impossible, since the priv - * is now created in the open */ - return (error == ENOENT ? ENXIO : error); - } switch (cmd) { case NIOCGINFO: /* return capabilities etc */ @@ -2125,10 +2159,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, u_int memflags; if (nmr->nr_name[0] != '\0') { + /* get a refcount */ - error = netmap_get_na(nmr, &na, 1 /* create */); - if (error) + error = netmap_get_na(nmr, &na, &ifp, 1 /* create */); + if (error) { + na = NULL; + ifp = NULL; break; + } nmd = na->nm_mem; /* get memory allocator */ } @@ -2145,8 +2183,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - netmap_adapter_put(na); } while (0); + netmap_unget_na(na, ifp); NMG_UNLOCK(); break; @@ -2156,9 +2194,25 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH || i == NETMAP_BDG_VNET_HDR || i == NETMAP_BDG_NEWIF - || i == NETMAP_BDG_DELIF) { + || i == NETMAP_BDG_DELIF + || i == NETMAP_BDG_POLLING_ON + || i == NETMAP_BDG_POLLING_OFF) { error = 
netmap_bdg_ctl(nmr, NULL); break; + } else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) { + error = ptnetmap_ctl(nmr, priv->np_na); + break; + } else if (i == NETMAP_VNET_HDR_GET) { + struct ifnet *ifp; + + NMG_LOCK(); + error = netmap_get_na(nmr, &na, &ifp, 0); + if (na && !error) { + nmr->nr_arg1 = na->virt_hdr_len; + } + netmap_unget_na(na, ifp); + NMG_UNLOCK(); + break; } else if (i != 0) { D("nr_cmd must be 0 not %d", i); error = EINVAL; @@ -2169,23 +2223,32 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, NMG_LOCK(); do { u_int memflags; + struct ifnet *ifp; if (priv->np_nifp != NULL) { /* thread already registered */ error = EBUSY; break; } /* find the interface and a reference */ - error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, &ifp, + 1 /* create */); /* keep reference */ if (error) break; if (NETMAP_OWNED_BY_KERN(na)) { - netmap_adapter_put(na); + netmap_unget_na(na, ifp); error = EBUSY; break; } + + if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) { + netmap_unget_na(na, ifp); + error = EIO; + break; + } + error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags); if (error) { /* reg. 
failed, release priv and ref */ - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } nifp = priv->np_nifp; @@ -2200,7 +2263,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, &nmr->nr_arg2); if (error) { netmap_do_unregif(priv); - netmap_adapter_put(na); + netmap_unget_na(na, ifp); break; } if (memflags & NETMAP_MEM_PRIVATE) { @@ -2212,12 +2275,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } if (nmr->nr_arg3) { - D("requested %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("requested %d extra buffers", nmr->nr_arg3); nmr->nr_arg3 = netmap_extra_alloc(na, &nifp->ni_bufs_head, nmr->nr_arg3); - D("got %d extra buffers", nmr->nr_arg3); + if (netmap_verbose) + D("got %d extra buffers", nmr->nr_arg3); } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); + + /* store ifp reference so that priv destructor may release it */ + priv->np_ifp = ifp; } while (0); NMG_UNLOCK(); break; @@ -2240,11 +2308,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - if (!nm_netmap_on(na)) { - error = ENXIO; - break; - } - t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX); krings = NMR(na, t); qfirst = priv->np_qfirst[t]; @@ -2252,31 +2315,34 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; - if (nm_kr_tryget(kring)) { - error = EBUSY; - goto out; + struct netmap_ring *ring = kring->ring; + + if (unlikely(nm_kr_tryget(kring, 1, &error))) { + error = (error ? 
EIO : 0); + continue; } + if (cmd == NIOCTXSYNC) { if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) { - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", - i, kring->ring->cur, + i, ring->cur, kring->nr_hwcur); } else { - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) { - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); } - microtime(&na->rx_rings[i].ring->ts); + microtime(&ring->ts); } nm_kr_put(kring); } @@ -2323,9 +2389,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EOPNOTSUPP; #endif /* linux */ } -out: - CURVNET_RESTORE(); return (error); } @@ -2345,17 +2409,15 @@ out: * hidden argument. */ int -netmap_poll(struct cdev *dev, int events, struct thread *td) +netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr) { - struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; struct netmap_kring *kring; + struct netmap_ring *ring; u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; #define want_tx want[NR_TX] #define want_rx want[NR_RX] struct mbq q; /* packets from hw queues to host stack */ - void *pwait = dev; /* linux compatibility */ - int is_kevent = 0; enum txrx t; /* @@ -2365,23 +2427,13 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) */ int retry_tx = 1, retry_rx = 1; - (void)pwait; - mbq_init(&q); - - /* - * XXX kevent has curthread->tp_fop == NULL, - * so devfs_get_cdevpriv() fails. 
We circumvent this by passing - * priv as the first argument, which is also useful to avoid - * the selrecord() which are not necessary in that case. + /* transparent mode: send_down is 1 if we have found some + * packets to forward during the rx scan and we have not + * sent them down to the nic yet */ - if (devfs_get_cdevpriv((void **)&priv) != 0) { - is_kevent = 1; - if (netmap_verbose) - D("called from kevent"); - priv = (struct netmap_priv_d *)dev; - } - if (priv == NULL) - return POLLERR; + int send_down = 0; + + mbq_init(&q); if (priv->np_nifp == NULL) { D("No if registered"); @@ -2399,7 +2451,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - /* * check_all_{tx|rx} are set if the card has more than one queue AND * the file descriptor is bound to all of them. If so, we sleep on @@ -2421,6 +2472,32 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * slots available. If this fails, then lock and call the sync * routines. 
*/ +#if 1 /* new code- call rx if any of the ring needs to release or read buffers */ + if (want_tx) { + t = NR_TX; + for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { + revents |= want[t]; + want[t] = 0; /* also breaks the loop */ + } + } + } + if (want_rx) { + want_rx = 0; /* look for a reason to run the handlers */ + t = NR_RX; + for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { + kring = &NMR(na, t)[i]; + if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */ + || kring->rhead != kring->ring->head /* release buffers */) { + want_rx = 1; + } + } + if (!want_rx) + revents |= events & (POLLIN | POLLRDNORM); /* we have data */ + } +#else /* old code */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; @@ -2431,6 +2508,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } } } +#endif /* old code */ /* * If we want to push packets out (priv->np_txpoll) or @@ -2447,32 +2525,26 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_RX]; i++) { + for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) { int found = 0; kring = &na->tx_rings[i]; - if (!want_tx && kring->ring->cur == kring->nr_hwcur) + ring = kring->ring; + + if (!send_down && !want_tx && ring->cur == kring->nr_hwcur) continue; - /* only one thread does txsync */ - if (nm_kr_tryget(kring)) { - /* either busy or stopped - * XXX if the ring is stopped, sleeping would - * be better. 
In current code, however, we only - * stop the rings for brief intervals (2014-03-14) - */ - if (netmap_verbose) - RD(2, "%p lost race on txring %d, ok", - priv, i); + + if (nm_kr_tryget(kring, 1, &revents)) continue; - } - if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + + if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } else { if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_txsync_finalize(kring); + nm_sync_finalize(kring); } /* @@ -2489,8 +2561,10 @@ flush_tx: kring->nm_notify(kring, 0); } } - if (want_tx && retry_tx && !is_kevent) { - OS_selrecord(td, check_all_tx ? + /* if there were any packet to forward we must have handled them by now */ + send_down = 0; + if (want_tx && retry_tx && sr) { + nm_os_selrecord(sr, check_all_tx ? &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si); retry_tx = 0; goto flush_tx; @@ -2502,22 +2576,18 @@ flush_tx: * Do it on all rings because otherwise we starve. */ if (want_rx) { - int send_down = 0; /* transparent mode */ /* two rounds here for race avoidance */ do_retry_rx: for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) { int found = 0; kring = &na->rx_rings[i]; + ring = kring->ring; - if (nm_kr_tryget(kring)) { - if (netmap_verbose) - RD(2, "%p lost race on rxring %d, ok", - priv, i); + if (unlikely(nm_kr_tryget(kring, 1, &revents))) continue; - } - if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { + if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } @@ -2526,22 +2596,22 @@ do_retry_rx: /* * transparent mode support: collect packets * from the rxring(s). 
- * XXX NR_FORWARD should only be read on - * physical or NIC ports */ - if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { + if (nm_may_forward_up(kring)) { ND(10, "forwarding some buffers up %d to %d", - kring->nr_hwcur, kring->ring->cur); + kring->nr_hwcur, ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } + kring->nr_kflags &= ~NR_FORWARD; if (kring->nm_sync(kring, 0)) revents |= POLLERR; else - nm_rxsync_finalize(kring); + nm_sync_finalize(kring); + send_down |= (kring->nr_kflags & NR_FORWARD); /* host ring only */ if (netmap_no_timestamp == 0 || - kring->ring->flags & NR_TIMESTAMP) { - microtime(&kring->ring->ts); + ring->flags & NR_TIMESTAMP) { + microtime(&ring->ts); } found = kring->rcur != kring->rtail; nm_kr_put(kring); @@ -2552,22 +2622,10 @@ do_retry_rx: } } - /* transparent mode XXX only during first pass ? */ - if (na->na_flags & NAF_HOST_RINGS) { - kring = &na->rx_rings[na->num_rx_rings]; - if (check_all_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { - /* XXX fix to use kring fields */ - if (nm_ring_empty(kring->ring)) - send_down = netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; - } - } - - if (retry_rx && !is_kevent) - OS_selrecord(td, check_all_rx ? + if (retry_rx && sr) { + nm_os_selrecord(sr, check_all_rx ? &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si); + } if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) @@ -2582,15 +2640,14 @@ do_retry_rx: * kring->nr_hwcur and ring->head * are passed to the other endpoint. * - * In this mode we also scan the sw rxring, which in - * turn passes packets up. - * - * XXX Transparent mode at the moment requires to bind all + * Transparent mode requires to bind all * rings to a single file descriptor. 
*/ - if (q.head && na->ifp != NULL) + if (q.head && !nm_kr_tryget(&na->tx_rings[na->num_tx_rings], 1, &revents)) { netmap_send_up(na->ifp, &q); + nm_kr_put(&na->tx_rings[na->num_tx_rings]); + } return (revents); #undef want_tx @@ -2600,8 +2657,6 @@ do_retry_rx: /*-------------------- driver support routines -------------------*/ -static int netmap_hw_krings_create(struct netmap_adapter *); - /* default notify callback */ static int netmap_notify(struct netmap_kring *kring, int flags) @@ -2609,51 +2664,51 @@ netmap_notify(struct netmap_kring *kring, int flags) struct netmap_adapter *na = kring->na; enum txrx t = kring->tx; - OS_selwakeup(&kring->si, PI_NET); + nm_os_selwakeup(&kring->si); /* optimization: avoid a wake up on the global * queue if nobody has registered for more * than one ring */ if (na->si_users[t] > 0) - OS_selwakeup(&na->si[t], PI_NET); + nm_os_selwakeup(&na->si[t]); - return 0; + return NM_IRQ_COMPLETED; } +#if 0 +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, +enum txrx tx, int flags) +{ + if (tx == NR_TX) { + KeSetEvent(notes->TX_EVENT, 0, FALSE); + } + else + { + KeSetEvent(notes->RX_EVENT, 0, FALSE); + } + return 0; +} +#endif /* called by all routines that create netmap_adapters. - * Attach na to the ifp (if any) and provide defaults - * for optional callbacks. Defaults assume that we - * are creating an hardware netmap_adapter. + * provide some defaults and get a reference to the + * memory allocator */ int netmap_attach_common(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { D("%s: invalid rings tx %d rx %d", na->name, na->num_tx_rings, na->num_rx_rings); return EINVAL; } - /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, - * pipes, monitors). For bwrap we actually have a non-null ifp for - * use by the external modules, but that is set after this - * function has been called. 
- XXX this is ugly, maybe split this function in two (2014-03-14) - */ - if (ifp != NULL) { - WNA(ifp) = na; - /* the following is only needed for na that use the host port. - * XXX do we have something similar for linux ? - */ #ifdef __FreeBSD__ - na->if_input = ifp->if_input; /* for netmap_send_up */ -#endif /* __FreeBSD__ */ - - NETMAP_SET_CAPABLE(ifp); + if (na->na_flags & NAF_HOST_RINGS && na->ifp) { + na->if_input = na->ifp->if_input; /* for netmap_send_up */ } +#endif /* __FreeBSD__ */ if (na->nm_krings_create == NULL) { /* we assume that we have been called by a driver, * since other port types all provide their own @@ -2677,6 +2732,7 @@ netmap_attach_common(struct netmap_adapter *na) */ na->nm_bdg_attach = netmap_bwrap_attach; #endif + return 0; } @@ -2685,9 +2741,6 @@ netmap_attach_common(struct netmap_adapter *na) void netmap_detach_common(struct netmap_adapter *na) { - if (na->ifp != NULL) - WNA(na->ifp) = NULL; /* XXX do we need this? */ - if (na->tx_rings) { /* XXX should not happen */ D("freeing leftover tx_rings"); na->nm_krings_delete(na); @@ -2699,31 +2752,52 @@ netmap_detach_common(struct netmap_adapter *na) free(na, M_DEVBUF); } -/* Wrapper for the register callback provided hardware drivers. - * na->ifp == NULL means the driver module has been +/* Wrapper for the register callback provided by netmap-enabled + * hardware drivers. + * nm_iszombie(na) means that the driver module has been * unloaded, so we cannot call into it. - * Note that module unloading, in our patched linux drivers, - * happens under NMG_LOCK and after having stopped all the - * nic rings (see netmap_detach). This provides sufficient - * protection for the other driver-provied callbacks - * (i.e., nm_config and nm_*xsync), that therefore don't need - * to wrapped. + * nm_os_ifnet_lock() must guarantee mutual exclusion with + * module unloading.
*/ static int -netmap_hw_register(struct netmap_adapter *na, int onoff) +netmap_hw_reg(struct netmap_adapter *na, int onoff) { struct netmap_hw_adapter *hwna = (struct netmap_hw_adapter*)na; + int error = 0; + + nm_os_ifnet_lock(); + + if (nm_iszombie(na)) { + if (onoff) { + error = ENXIO; + } else if (na != NULL) { + na->na_flags &= ~NAF_NETMAP_ON; + } + goto out; + } + + error = hwna->nm_hw_register(na, onoff); + +out: + nm_os_ifnet_unlock(); + + return error; +} - if (na->ifp == NULL) - return onoff ? ENXIO : 0; +static void +netmap_hw_dtor(struct netmap_adapter *na) +{ + if (nm_iszombie(na) || na->ifp == NULL) + return; - return hwna->nm_hw_register(na, onoff); + WNA(na->ifp) = NULL; } /* - * Initialize a ``netmap_adapter`` object created by driver on attach. + * Allocate a ``netmap_adapter`` object, and initialize it from the + * 'arg' passed by the driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number * of hardware rings): @@ -2732,29 +2806,31 @@ netmap_hw_register(struct netmap_adapter *na, int onoff) * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. */ -int -netmap_attach(struct netmap_adapter *arg) +static int +_netmap_attach(struct netmap_adapter *arg, size_t size) { struct netmap_hw_adapter *hwna = NULL; - // XXX when is arg == NULL ? - struct ifnet *ifp = arg ? 
arg->ifp : NULL; + struct ifnet *ifp = NULL; - if (arg == NULL || ifp == NULL) + if (arg == NULL || arg->ifp == NULL) goto fail; - hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + ifp = arg->ifp; + hwna = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); if (hwna == NULL) goto fail; hwna->up = *arg; hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE; strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); hwna->nm_hw_register = hwna->up.nm_register; - hwna->up.nm_register = netmap_hw_register; + hwna->up.nm_register = netmap_hw_reg; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; } netmap_adapter_get(&hwna->up); + NM_ATTACH_NA(ifp, &hwna->up); + #ifdef linux if (ifp->netdev_ops) { /* prepare a clone of the netdev ops */ @@ -2762,7 +2838,7 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else hwna->nm_ndo = *ifp->netdev_ops; -#endif +#endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */ } hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; if (ifp->ethtool_ops) { @@ -2771,11 +2847,14 @@ netmap_attach(struct netmap_adapter *arg) hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS hwna->nm_eto.set_channels = linux_netmap_set_channels; -#endif +#endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */ if (arg->nm_config == NULL) { hwna->up.nm_config = netmap_linux_config; } #endif /* linux */ + if (arg->nm_dtor == NULL) { + hwna->up.nm_dtor = netmap_hw_dtor; + } if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n", hwna->up.num_tx_rings, hwna->up.num_tx_desc, @@ -2784,12 +2863,57 @@ netmap_attach(struct netmap_adapter *arg) fail: D("fail, arg %p ifp %p na %p", arg, ifp, hwna); - if (ifp) - netmap_detach(ifp); return (hwna ? 
EINVAL : ENOMEM); } +int +netmap_attach(struct netmap_adapter *arg) +{ + return _netmap_attach(arg, sizeof(struct netmap_hw_adapter)); +} + + +#ifdef WITH_PTNETMAP_GUEST +int +netmap_pt_guest_attach(struct netmap_adapter *arg, + void *csb, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_pt_guest_adapter *ptna; + struct ifnet *ifp = arg ? arg->ifp : NULL; + int error; + + /* get allocator */ + arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, ptctl); + if (arg->nm_mem == NULL) + return ENOMEM; + arg->na_flags |= NAF_MEM_OWNER; + error = _netmap_attach(arg, sizeof(struct netmap_pt_guest_adapter)); + if (error) + return error; + + /* get the netmap_pt_guest_adapter */ + ptna = (struct netmap_pt_guest_adapter *) NA(ifp); + ptna->csb = csb; + + /* Initialize a separate pass-through netmap adapter that is going to + * be used by the ptnet driver only, and so never exposed to netmap + * applications. We only need a subset of the available fields. */ + memset(&ptna->dr, 0, sizeof(ptna->dr)); + ptna->dr.up.ifp = ifp; + ptna->dr.up.nm_mem = ptna->hwup.up.nm_mem; + netmap_mem_get(ptna->dr.up.nm_mem); + ptna->dr.up.nm_config = ptna->hwup.up.nm_config; + + ptna->backend_regifs = 0; + + return 0; +} +#endif /* WITH_PTNETMAP_GUEST */ + + void NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) { @@ -2841,28 +2965,29 @@ void netmap_detach(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); - int skip; if (!na) return; - skip = 0; NMG_LOCK(); - netmap_disable_all_rings(ifp); - na->ifp = NULL; - na->na_flags &= ~NAF_NETMAP_ON; + netmap_set_all_rings(na, NM_KR_LOCKED); + na->na_flags |= NAF_ZOMBIE; /* * if the netmap adapter is not native, somebody * changed it, so we can not release it here. - * The NULL na->ifp will notify the new owner that + * The NAF_ZOMBIE flag will notify the new owner that * the driver is gone. 
*/ if (na->na_flags & NAF_NATIVE) { - skip = netmap_adapter_put(na); + netmap_adapter_put(na); } - /* give them a chance to notice */ - if (skip == 0) - netmap_enable_all_rings(ifp); + /* give active users a chance to notice that NAF_ZOMBIE has been + * turned on, so that they can stop and return an error to userspace. + * Note that this becomes a NOP if there are no active users and, + * therefore, the put() above has deleted the na, since now NA(ifp) is + * NULL. + */ + netmap_enable_all_rings(ifp); NMG_UNLOCK(); } @@ -2883,9 +3008,10 @@ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring; + struct netmap_kring *kring, *tx_kring; u_int len = MBUF_LEN(m); u_int error = ENOBUFS; + unsigned int txr; struct mbq *q; int space; @@ -2900,6 +3026,16 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + txr = MBUF_TXQ(m); + if (txr >= na->num_tx_rings) { + txr %= na->num_tx_rings; + } + tx_kring = &NMR(na, NR_TX)[txr]; + + if (tx_kring->nr_mode == NKR_NETMAP_OFF) { + return MBUF_TRANSMIT(na, ifp, m); + } + q = &kring->rx_queue; // XXX reconsider long packets if we handle fragments @@ -2909,6 +3045,11 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) goto done; } + if (nm_os_mbuf_has_offld(m)) { + RD(1, "%s drop mbuf requiring offloadings", na->name); + goto done; + } + /* protect against rxsync_from_host(), netmap_sw_to_nic() * and maybe other instances of netmap_transmit (the latter * not possible on Linux). @@ -2951,6 +3092,8 @@ done: * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. * If native netmap mode is not set just return NULL. + * If native netmap mode is set, in particular, we have to set nr_mode to + * NKR_NETMAP_ON. 
*/ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, @@ -2975,13 +3118,26 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; + kring = na->tx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; + + if (kring->nr_pending_mode == NKR_NETMAP_OFF) { + kring->nr_mode = NKR_NETMAP_OFF; + return NULL; + } + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; @@ -3018,6 +3174,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ + kring->nr_mode = NKR_NETMAP_ON; kring->nm_notify(kring, 0); return kring->ring->slot; } @@ -3037,10 +3194,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * - for a nic connected to a switch, call the proper forwarding routine * (see netmap_bwrap_intr_notify) */ -void -netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) +int +netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; enum txrx t = (work_done ? NR_RX : NR_TX); @@ -3051,15 +3207,20 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) } if (q >= nma_get_nrings(na, t)) - return; // not a physical queue + return NM_IRQ_PASS; // not a physical queue kring = NMR(na, t) + q; + if (kring->nr_mode == NKR_NETMAP_OFF) { + return NM_IRQ_PASS; + } + if (t == NR_RX) { kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
*work_done = 1; /* do not fire napi again */ } - kring->nm_notify(kring, 0); + + return kring->nm_notify(kring, 0); } @@ -3067,17 +3228,17 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. * - * If the card is not in netmap mode, simply return 0, + * If the card is not in netmap mode, simply return NM_IRQ_PASS, * so that the caller proceeds with regular processing. - * Otherwise call netmap_common_irq() and return 1. + * Otherwise call netmap_common_irq(). * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one * if needed (multiqueue card _and_ there are multiqueue listeners), - * and return 1. + * and return NM_IRQ_COMPLETED. * * Finally, if called on rx from an interface connected to a switch, - * calls the proper forwarding routine, and return 1. + * calls the proper forwarding routine. */ int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) @@ -3091,15 +3252,14 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) * nm_native_on() here.
*/ if (!nm_netmap_on(na)) - return 0; + return NM_IRQ_PASS; if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); - return 0; + return NM_IRQ_PASS; } - netmap_common_irq(ifp, q, work_done); - return 1; + return netmap_common_irq(na, q, work_done); } @@ -3120,9 +3280,11 @@ extern struct cdevsw netmap_cdevsw; void netmap_fini(void) { - netmap_uninit_bridges(); if (netmap_dev) destroy_dev(netmap_dev); + /* we assume that there are no longer netmap users */ + nm_os_ifnet_fini(); + netmap_uninit_bridges(); netmap_mem_fini(); NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); @@ -3155,9 +3317,13 @@ netmap_init(void) goto fail; #ifdef __FreeBSD__ - nm_vi_init_index(); + nm_os_vi_init_index(); #endif + error = nm_os_ifnet_init(); + if (error) + goto fail; + printf("netmap: loaded module\n"); return (0); fail: diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8490ae85670bf..20ea5c8f29726 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -33,8 +33,9 @@ #include /* defines used in kernel.h */ #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ -#include /* DEV_MODULE */ +#include /* DEV_MODULE_ORDERED */ #include +#include /* kern_ioctl() */ #include @@ -50,6 +51,11 @@ #include #include /* sockaddrs */ #include +#include /* kthread_add() */ +#include /* PROC_LOCK() */ +#include /* RFNOWAIT */ +#include /* sched_bind() */ +#include /* mp_maxid */ #include #include #include /* IFT_ETHER */ @@ -61,13 +67,94 @@ #include #include +#include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ +void nm_os_selinfo_init(NM_SELINFO_T *si) { + struct mtx *m = &si->m; + mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); + knlist_init_mtx(&si->si.si_note, m); +} + +void +nm_os_selinfo_uninit(NM_SELINFO_T *si) +{ + /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ + knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); + knlist_destroy(&si->si.si_note); + /* now we don't need the mutex anymore */ + mtx_destroy(&si->m); +} + +void +nm_os_ifnet_lock(void) +{ + IFNET_WLOCK(); +} + +void +nm_os_ifnet_unlock(void) +{ + IFNET_WUNLOCK(); +} + +static int netmap_use_count = 0; + +void +nm_os_get_module(void) +{ + netmap_use_count++; +} + +void +nm_os_put_module(void) +{ + netmap_use_count--; +} + +static void +netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_undo_zombie(ifp); +} + +static void +netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_make_zombie(ifp); +} + +static eventhandler_tag nm_ifnet_ah_tag; +static eventhandler_tag nm_ifnet_dh_tag; + +int +nm_os_ifnet_init(void) +{ + nm_ifnet_ah_tag = + EVENTHANDLER_REGISTER(ifnet_arrival_event, + netmap_ifnet_arrival_handler, + NULL, EVENTHANDLER_PRI_ANY); + nm_ifnet_dh_tag = + EVENTHANDLER_REGISTER(ifnet_departure_event, + netmap_ifnet_departure_handler, + NULL, EVENTHANDLER_PRI_ANY); + return 0; +} + +void +nm_os_ifnet_fini(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + nm_ifnet_ah_tag); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + nm_ifnet_dh_tag); +} + rawsum_t -nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; @@ -87,7 +174,7 @@ nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) * return value is in network byte order. */ uint16_t -nm_csum_fold(rawsum_t cur_sum) +nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. 
*/ while (cur_sum >> 16) @@ -96,17 +183,17 @@ nm_csum_fold(rawsum_t cur_sum) return htobe16((~cur_sum) & 0xFFFF); } -uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else - return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); + return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void -nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET @@ -118,7 +205,7 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -129,12 +216,12 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, } void -nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -144,13 +231,41 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, #endif } +/* on FreeBSD we send up one packet at a time */ +void * +nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) +{ + + NA(ifp)->if_input(ifp, m); + return NULL; +} + +int +nm_os_mbuf_has_offld(struct mbuf *m) +{ + return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | + CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | + CSUM_SCTP_IPV6 | CSUM_TSO); +} + +static void +freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_generic_adapter *gna = + 
(struct netmap_generic_adapter *)NA(ifp); + int stolen = generic_rx_handler(ifp, m); + + if (!stolen) { + gna->save_if_input(ifp, m); + } +} /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int -netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) +nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; @@ -161,7 +276,7 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; - ifp->if_input = generic_rx_handler; + ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); @@ -181,18 +296,20 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ -void -netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +int +nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); - if (enable) { + if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } + + return 0; } @@ -213,40 +330,44 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) * */ int -generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, - void *addr, u_int len, u_int ring_nr) +nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; + u_int len = a->len; + struct ifnet *ifp = a->ifp; + struct mbuf *m = a->m; +#if __FreeBSD_version < 1100000 /* - * The mbuf should be a cluster from our special pool, - * so we do not need to do an m_copyback but just copy - * (and eventually, just reference the netmap buffer) + * Old FreeBSD versions. 
The mbuf has a cluster attached, + * we need to copy from the cluster to the netmap buffer. */ - - if (GET_MBUF_REFCNT(m) != 1) { - D("invalid refcnt %d for %p", - GET_MBUF_REFCNT(m), m); + if (MBUF_REFCNT(m) != 1) { + D("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } - // XXX the ext_size check is unnecessary if we link the netmap buf if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } - if (0) { /* XXX seems to have negligible benefits */ - m->m_ext.ext_buf = m->m_data = addr; - } else { - bcopy(addr, m->m_data, len); - } + bcopy(a->addr, m->m_data, len); +#else /* __FreeBSD_version >= 1100000 */ + /* New FreeBSD versions. Link the external storage to + * the netmap buffer, so that no copy is necessary. */ + m->m_ext.ext_buf = m->m_data = a->addr; + m->m_ext.ext_size = len; +#endif /* __FreeBSD_version >= 1100000 */ + m->m_len = m->m_pkthdr.len = len; - // inc refcount. All ours, we could skip the atomic - atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1); + + /* mbuf refcnt is not contended, no need to use atomic + * (a memory barrier is enough). */ + SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); - m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); - return ret; + return ret ? 
-1 : 0; } @@ -263,7 +384,7 @@ netmap_getna(if_t ifp) * way to extract the info from the ifp */ int -generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; @@ -271,16 +392,23 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) void -generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } +void +nm_os_generic_set_features(struct netmap_generic_adapter *gna) +{ + + gna->rxsg = 1; /* Supported through m_copydata. */ + gna->txqdisc = 0; /* Not supported. */ +} void -netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) +nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; @@ -290,21 +418,21 @@ netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapte void -netmap_mitigation_start(struct nm_generic_mit *mit) +nm_os_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void -netmap_mitigation_restart(struct nm_generic_mit *mit) +nm_os_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int -netmap_mitigation_active(struct nm_generic_mit *mit) +nm_os_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; @@ -312,7 +440,7 @@ netmap_mitigation_active(struct nm_generic_mit *mit) void -netmap_mitigation_cleanup(struct nm_generic_mit *mit) +nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -342,7 +470,7 @@ static struct { } nm_vi_indices; void -nm_vi_init_index(void) +nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) @@ -398,7 +526,7 @@ nm_vi_free_index(uint8_t val) * increment this refcount on if_attach(). 
*/ int -nm_vi_persist(const char *name, struct ifnet **ret) +nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; @@ -438,15 +566,220 @@ nm_vi_persist(const char *name, struct ifnet **ret) *ret = ifp; return 0; } + /* unregister from the system and drop the final refcount */ void -nm_vi_detach(struct ifnet *ifp) +nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } +/* ======================== PTNETMAP SUPPORT ========================== */ + +#ifdef WITH_PTNETMAP_GUEST +#include +#include +#include /* bus_dmamap_* */ +#include +#include +#include +/* + * ptnetmap memory device (memdev) for freebsd guest, + * used to expose host netmap memory to the guest through a PCI BAR. + */ + +/* + * ptnetmap memdev private data structure + */ +struct ptnetmap_memdev { + device_t dev; + struct resource *pci_io; + struct resource *pci_mem; + struct netmap_mem_d *nm_mem; +}; + +static int ptn_memdev_probe(device_t); +static int ptn_memdev_attach(device_t); +static int ptn_memdev_detach(device_t); +static int ptn_memdev_shutdown(device_t); + +static device_method_t ptn_memdev_methods[] = { + DEVMETHOD(device_probe, ptn_memdev_probe), + DEVMETHOD(device_attach, ptn_memdev_attach), + DEVMETHOD(device_detach, ptn_memdev_detach), + DEVMETHOD(device_shutdown, ptn_memdev_shutdown), + DEVMETHOD_END +}; + +static driver_t ptn_memdev_driver = { + PTNETMAP_MEMDEV_NAME, + ptn_memdev_methods, + sizeof(struct ptnetmap_memdev), +}; + +/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation + * below. */ +static devclass_t ptnetmap_devclass; +DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass, + NULL, NULL, SI_ORDER_MIDDLE + 1); + +/* + * I/O port read/write wrappers.
+ * Some are not used, so we keep them commented out until needed + */ +#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg)) +#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg)) +#if 0 +#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg)) +#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val)) +#endif /* unused */ + +/* + * Map host netmap memory through PCI-BAR in the guest OS, + * returning physical (nm_paddr) and virtual (nm_addr) addresses + * of the netmap memory mapped in the guest. + */ +int +nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr) +{ + uint32_t mem_size; + int rid; + + D("ptn_memdev_driver iomap"); + + rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); + mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE); + + /* map memory allocator */ + ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, + &rid, 0, ~0, mem_size, RF_ACTIVE); + if (ptn_dev->pci_mem == NULL) { + *nm_paddr = 0; + *nm_addr = 0; + return ENOMEM; + } + + *nm_paddr = rman_get_start(ptn_dev->pci_mem); + *nm_addr = rman_get_virtual(ptn_dev->pci_mem); + + D("=== BAR %d start %lx len %lx mem_size %x ===", + PTNETMAP_MEM_PCI_BAR, + *nm_paddr, + rman_get_size(ptn_dev->pci_mem), + mem_size); + return (0); +} + +/* Unmap host netmap memory. 
*/ +void +nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) +{ + D("ptn_memdev_driver iounmap"); + + if (ptn_dev->pci_mem) { + bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } +} + +/* Device identification routine, return BUS_PROBE_DEFAULT on success, + * positive on failure */ +static int +ptn_memdev_probe(device_t dev) +{ + char desc[256]; + + if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) + return (ENXIO); + if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) + return (ENXIO); + + snprintf(desc, sizeof(desc), "%s PCI adapter", + PTNETMAP_MEMDEV_NAME); + device_set_desc_copy(dev, desc); + + return (BUS_PROBE_DEFAULT); +} + +/* Device initialization routine. */ +static int +ptn_memdev_attach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + int rid; + uint16_t mem_id; + + D("ptn_memdev_driver attach"); + + ptn_dev = device_get_softc(dev); + ptn_dev->dev = dev; + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); + ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, + RF_ACTIVE); + if (ptn_dev->pci_io == NULL) { + device_printf(dev, "cannot map I/O space\n"); + return (ENXIO); + } + + mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID); + + /* create guest allocator */ + ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); + if (ptn_dev->nm_mem == NULL) { + ptn_memdev_detach(dev); + return (ENOMEM); + } + netmap_mem_get(ptn_dev->nm_mem); + + D("ptn_memdev_driver probe OK - host_id: %d", mem_id); + + return (0); +} + +/* Device removal routine. 
*/ +static int +ptn_memdev_detach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + + D("ptn_memdev_driver detach"); + ptn_dev = device_get_softc(dev); + + if (ptn_dev->nm_mem) { + netmap_mem_put(ptn_dev->nm_mem); + ptn_dev->nm_mem = NULL; + } + if (ptn_dev->pci_mem) { + bus_release_resource(dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } + if (ptn_dev->pci_io) { + bus_release_resource(dev, SYS_RES_IOPORT, + PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); + ptn_dev->pci_io = NULL; + } + + return (0); +} + +static int +ptn_memdev_shutdown(device_t dev) +{ + D("ptn_memdev_driver shutdown"); + return bus_generic_shutdown(dev); +} + +#endif /* WITH_PTNETMAP_GUEST */ + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -606,7 +939,7 @@ err_unlock: * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor - * when the last fd pointing to the device is closed. + * when the last fd pointing to the device is closed. 
* * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to @@ -634,26 +967,275 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) (void)devtype; (void)td; - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - priv->np_refs = 1; + NMG_LOCK(); + priv = netmap_priv_new(); + if (priv == NULL) { + error = ENOMEM; + goto out; + } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { - free(priv, M_DEVBUF); - } else { - NMG_LOCK(); - netmap_use_count++; - NMG_UNLOCK(); + netmap_priv_delete(priv); + } +out: + NMG_UNLOCK(); + return error; +} + +/******************** kthread wrapper ****************/ +#include +u_int +nm_os_ncpus(void) +{ + return mp_maxid + 1; +} + +struct nm_kthread_ctx { + struct thread *user_td; /* thread user-space (kthread creator) to send ioctl */ + /* notification to guest (interrupt) */ + int irq_fd; /* ioctl fd */ + struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */ + + /* notification from guest */ + void *ioevent_file; /* tsleep() argument */ + + /* worker function and parameter */ + nm_kthread_worker_fn_t worker_fn; + void *worker_private; + + struct nm_kthread *nmk; + + /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ + long type; +}; + +struct nm_kthread { + struct thread *worker; + struct mtx worker_lock; + uint64_t scheduled; /* pending wake_up request */ + struct nm_kthread_ctx worker_ctx; + int run; /* used to stop kthread */ + int attach_user; /* kthread attached to user_process */ + int affinity; +}; + +void inline +nm_os_kthread_wakeup_worker(struct nm_kthread *nmk) +{ + /* + * There may be a race between FE and BE, + * which call both this function, and worker kthread, + * that reads nmk->scheduled. + * + * For us it is not important the counter value, + * but simply that it has changed since the last + * time the kthread saw it. 
+ */ + mtx_lock(&nmk->worker_lock); + nmk->scheduled++; + if (nmk->worker_ctx.ioevent_file) { + wakeup(nmk->worker_ctx.ioevent_file); + } + mtx_unlock(&nmk->worker_lock); +} + +void inline +nm_os_kthread_send_irq(struct nm_kthread *nmk) +{ + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + int err; + + if (ctx->user_td && ctx->irq_fd > 0) { + err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix); + if (err) { + D("kern_ioctl error: %d ioctl parameters: fd %d com %lu data %p", + err, ctx->irq_fd, ctx->irq_ioctl.com, &ctx->irq_ioctl.data); + } + } +} + +static void +nm_kthread_worker(void *data) +{ + struct nm_kthread *nmk = data; + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + uint64_t old_scheduled = nmk->scheduled; + + if (nmk->affinity >= 0) { + thread_lock(curthread); + sched_bind(curthread, nmk->affinity); + thread_unlock(curthread); + } + + while (nmk->run) { + /* + * check if the parent process dies + * (when kthread is attached to user process) + */ + if (ctx->user_td) { + PROC_LOCK(curproc); + thread_suspend_check(0); + PROC_UNLOCK(curproc); + } else { + kthread_suspend_check(); + } + + /* + * if ioevent_file is not defined, we don't have notification + * mechanism and we continually execute worker_fn() + */ + if (!ctx->ioevent_file) { + ctx->worker_fn(ctx->worker_private); /* worker body */ + } else { + /* checks if there is a pending notification */ + mtx_lock(&nmk->worker_lock); + if (likely(nmk->scheduled != old_scheduled)) { + old_scheduled = nmk->scheduled; + mtx_unlock(&nmk->worker_lock); + + ctx->worker_fn(ctx->worker_private); /* worker body */ + + continue; + } else if (nmk->run) { + /* wait on event with one second timeout */ + msleep_spin(ctx->ioevent_file, &nmk->worker_lock, + "nmk_ev", hz); + nmk->scheduled++; + } + mtx_unlock(&nmk->worker_lock); + } + } + + kthread_exit(); +} + +static int +nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg) +{ + /* send irq through ioctl 
to bhyve (vmm.ko) */ + if (cfg->event.irqfd) { + nmk->worker_ctx.irq_fd = cfg->event.irqfd; + nmk->worker_ctx.irq_ioctl = cfg->event.ioctl; + } + /* ring.ioeventfd contains the chan where do tsleep to wait events */ + if (cfg->event.ioeventfd) { + nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd; + } + + return 0; +} + +static void +nm_kthread_close_files(struct nm_kthread *nmk) +{ + nmk->worker_ctx.irq_fd = 0; + nmk->worker_ctx.ioevent_file = NULL; +} + +void +nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity) +{ + nmk->affinity = affinity; +} + +struct nm_kthread * +nm_os_kthread_create(struct nm_kthread_cfg *cfg) +{ + struct nm_kthread *nmk = NULL; + int error; + + nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!nmk) + return NULL; + + mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN); + nmk->worker_ctx.worker_fn = cfg->worker_fn; + nmk->worker_ctx.worker_private = cfg->worker_private; + nmk->worker_ctx.type = cfg->type; + nmk->affinity = -1; + + /* attach kthread to user process (ptnetmap) */ + nmk->attach_user = cfg->attach_user; + + /* open event fd */ + error = nm_kthread_open_files(nmk, cfg); + if (error) + goto err; + + return nmk; +err: + free(nmk, M_DEVBUF); + return NULL; +} + +int +nm_os_kthread_start(struct nm_kthread *nmk) +{ + struct proc *p = NULL; + int error = 0; + + if (nmk->worker) { + return EBUSY; + } + + /* check if we want to attach kthread to user process */ + if (nmk->attach_user) { + nmk->worker_ctx.user_td = curthread; + p = curthread->td_proc; + } + + /* enable kthread main loop */ + nmk->run = 1; + /* create kthread */ + if((error = kthread_add(nm_kthread_worker, nmk, p, + &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", + nmk->worker_ctx.type))) { + goto err; } + + D("nm_kthread started td 0x%p", nmk->worker); + + return 0; +err: + D("nm_kthread start failed err %d", error); + nmk->worker = NULL; return error; } +void +nm_os_kthread_stop(struct nm_kthread *nmk) +{ 
+ if (!nmk->worker) { + return; + } + /* tell to kthread to exit from main loop */ + nmk->run = 0; + + /* wake up kthread if it sleeps */ + kthread_resume(nmk->worker); + nm_os_kthread_wakeup_worker(nmk); + + nmk->worker = NULL; +} + +void +nm_os_kthread_delete(struct nm_kthread *nmk) +{ + if (!nmk) + return; + if (nmk->worker) { + nm_os_kthread_stop(nmk); + } + + nm_kthread_close_files(nmk); + + free(nmk, M_DEVBUF); +} + /******************** kqueue support ****************/ /* - * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might @@ -672,17 +1254,23 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) void -freebsd_selwakeup(struct nm_selinfo *si, int pri) +nm_os_selwakeup(struct nm_selinfo *si) { if (netmap_verbose) D("on knote %p", &si->si.si_note); - selwakeuppri(&si->si, pri); + selwakeuppri(&si->si, PI_NET); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } +void +nm_os_selrecord(struct thread *td, struct nm_selinfo *si) +{ + selrecord(td, &si->si); +} + static void netmap_knrdetach(struct knote *kn) { @@ -728,7 +1316,7 @@ netmap_knrw(struct knote *kn, long hint, int events) RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { - revents = netmap_poll((void *)priv, events, curthread); + revents = netmap_poll(priv, events, NULL); return (events & revents) ? 
1 : 0; } } @@ -801,13 +1389,47 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn) return 0; } +static int +freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) +{ + struct netmap_priv_d *priv; + if (devfs_get_cdevpriv((void **)&priv)) { + return POLLERR; + } + return netmap_poll(priv, events, td); +} + +static int +freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, + int ffla __unused, struct thread *td) +{ + int error; + struct netmap_priv_d *priv; + + CURVNET_SET(TD_TO_VNET(rd)); + error = devfs_get_cdevpriv((void **)&priv); + if (error) { + /* XXX ENOENT should be impossible, since the priv + * is now created in the open */ + if (error == ENOENT) + error = ENXIO; + goto out; + } + error = netmap_ioctl(priv, cmd, data, td); +out: + CURVNET_RESTORE(); + + return error; +} + +extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, + .d_ioctl = freebsd_netmap_ioctl, + .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; @@ -852,6 +1474,24 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg) return (error); } - +#ifdef DEV_MODULE_ORDERED +/* + * The netmap module contains three drivers: (i) the netmap character device + * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI + * device driver. The attach() routines of both (ii) and (iii) need the + * lock of the global allocator, and such lock is initialized in netmap_init(), + * which is part of (i). + * Therefore, we make sure that (i) is loaded before (ii) and (iii), using + * the 'order' parameter of driver declaration macros. For (i), we specify + * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED + * macros for (ii) and (iii). 
+ */ +DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); +#else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); +#endif /* DEV_MODULE_ORDERED */ +MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +/* use a private mutex for the knlist */ diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 85a6a9f76ea2f..5cef4a29110a5 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -1,5 +1,7 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Vincenzo Maffione + * Copyright (C) 2013-2016 Luigi Rizzo + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -83,25 +85,25 @@ __FBSDID("$FreeBSD$"); #define rtnl_lock() ND("rtnl_lock called") #define rtnl_unlock() ND("rtnl_unlock called") -#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() /* * FreeBSD mbuf allocator/deallocator in emulation mode: - * + */ +#if __FreeBSD_version < 1100000 + +/* + * For older versions of FreeBSD: + * * We allocate EXT_PACKET mbuf+clusters, but need to set M_NOFREE * so that the destructor, if invoked, will not free the packet. - * In principle we should set the destructor only on demand, + * In principle we should set the destructor only on demand, * but since there might be a race we better do it on allocation. * As a consequence, we also need to set the destructor or we * would leak buffers. 
*/ -/* - * mbuf wrappers - */ - /* mbuf destructor, also need to change the type to EXT_EXTREF, * add an M_NOFREE flag, and then clear the flag and * chain into uma_zfree(zone_pack, mf) @@ -112,35 +114,93 @@ __FBSDID("$FreeBSD$"); (m)->m_ext.ext_type = EXT_EXTREF; \ } while (0) -static void -netmap_default_mbuf_destructor(struct mbuf *m) +static int +void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { /* restore original mbuf */ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_type = EXT_PACKET; m->m_ext.ext_free = NULL; - if (GET_MBUF_REFCNT(m) == 0) + if (MBUF_REFCNT(m) == 0) SET_MBUF_REFCNT(m, 1); uma_zfree(zone_pack, m); + + return 0; } static inline struct mbuf * -netmap_get_mbuf(int len) +nm_os_get_mbuf(struct ifnet *ifp, int len) { struct mbuf *m; + + (void)ifp; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) { - m->m_flags |= M_NOFREE; /* XXXNP: Almost certainly incorrect. */ + /* m_getcl() (mb_ctor_mbuf) has an assert that checks that + * M_NOFREE flag is not specified as third argument, + * so we have to set M_NOFREE after m_getcl(). */ + m->m_flags |= M_NOFREE; m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save - m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor; + m->m_ext.ext_free = (void *)void_mbuf_dtor; m->m_ext.ext_type = EXT_EXTREF; - ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m)); + ND(5, "create m %p refcnt %d", m, MBUF_REFCNT(m)); + } + return m; +} + +#else /* __FreeBSD_version >= 1100000 */ + +/* + * Newer versions of FreeBSD, using a straightforward scheme. + * + * We allocate mbufs with m_gethdr(), since the mbuf header is needed + * by the driver. We also attach a customly-provided external storage, + * which in this case is a netmap buffer. When calling m_extadd(), however + * we pass a NULL address, since the real address (and length) will be + * filled in by nm_os_generic_xmit_frame() right before calling + * if_transmit(). 
+ * + * The dtor function does nothing, however we need it since mb_free_ext() + * has a KASSERT(), checking that the mbuf dtor function is not NULL. + */ + +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ +} while (0) + +static void void_mbuf_dtor(struct mbuf *m, void *arg1, void *arg2) { } + +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + struct mbuf *m; + + (void)ifp; + (void)len; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + return m; } + + m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, + NULL, NULL, 0, EXT_NET_DRV); + return m; } +#endif /* __FreeBSD_version >= 1100000 */ + +#elif defined _WIN32 + +#include "win_glue.h" +#define rtnl_lock() ND("rtnl_lock called") +#define rtnl_unlock() ND("rtnl_unlock called") +#define MBUF_TXQ(m) 0//((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) 0//((m)->m_pkthdr.flowid) +#define smp_mb() //XXX: to be correctly defined #else /* linux */ @@ -150,7 +210,12 @@ netmap_get_mbuf(int len) #include /* struct ethtool_ops, get_ringparam */ #include -//#define REG_RESET +static inline struct mbuf * +nm_os_get_mbuf(struct ifnet *ifp, int len) +{ + return alloc_skb(ifp->needed_headroom + len + + ifp->needed_tailroom, GFP_ATOMIC); +} #endif /* linux */ @@ -161,8 +226,21 @@ netmap_get_mbuf(int len) #include +#define for_each_kring_n(_i, _k, _karr, _n) \ + for (_k=_karr, _i = 0; _i < _n; (_k)++, (_i)++) + +#define for_each_tx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings) +#define for_each_tx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1) -/* ======================== usage stats =========================== */ +#define for_each_rx_kring(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings) +#define for_each_rx_kring_h(_i, _k, _na) \ + for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1) + + +/* ======================== PERFORMANCE 
STATISTICS =========================== */ #ifdef RATE_GENERIC #define IFRATE(x) x @@ -170,6 +248,8 @@ struct rate_stats { unsigned long txpkt; unsigned long txsync; unsigned long txirq; + unsigned long txrepl; + unsigned long txdrop; unsigned long rxpkt; unsigned long rxirq; unsigned long rxsync; @@ -194,6 +274,8 @@ static void rate_callback(unsigned long arg) RATE_PRINTK(txpkt); RATE_PRINTK(txsync); RATE_PRINTK(txirq); + RATE_PRINTK(txrepl); + RATE_PRINTK(txdrop); RATE_PRINTK(rxpkt); RATE_PRINTK(rxsync); RATE_PRINTK(rxirq); @@ -230,94 +312,222 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi) * the poller threads. Differently from netmap_rx_irq(), we check * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq. */ -static void -netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +void +netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { - struct netmap_adapter *na = NA(ifp); if (unlikely(!nm_netmap_on(na))) return; - netmap_common_irq(ifp, q, work_done); + netmap_common_irq(na, q, work_done); +#ifdef RATE_GENERIC + if (work_done) + rate_ctx.new.rxirq++; + else + rate_ctx.new.txirq++; +#endif /* RATE_GENERIC */ } +static int +generic_netmap_unregister(struct netmap_adapter *na) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring = NULL; + int i, r; + + if (na->active_fds == 0) { + D("Generic adapter %p goes off", na); + rtnl_lock(); + + na->na_flags &= ~NAF_NETMAP_ON; + + /* Release packet steering control. */ + nm_os_catch_tx(gna, 0); + + /* Stop intercepting packets on the RX path. 
*/ + nm_os_catch_rx(gna, 0); + + rtnl_unlock(); + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + D("RX ring %d of generic adapter %p goes off", r, na); + kring->nr_mode = NKR_NETMAP_OFF; + } + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + D("TX ring %d of generic adapter %p goes off", r, na); + } + } + + for_each_rx_kring(r, kring, na) { + /* Free the mbufs still pending in the RX queues, + * that did not end up into the corresponding netmap + * RX rings. */ + mbq_safe_purge(&kring->rx_queue); + nm_os_mitigation_cleanup(&gna->mit[r]); + } + + /* Decrement reference counter for the mbufs in the + * TX pools. These mbufs can be still pending in drivers, + * (e.g. this happens with virtio-net driver, which + * does lazy reclaiming of transmitted mbufs). */ + for_each_tx_kring(r, kring, na) { + /* We must remove the destructor on the TX event, + * because the destructor invokes netmap code, and + * the netmap module may disappear before the + * TX event is consumed. */ + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + SET_MBUF_DESTRUCTOR(kring->tx_event, NULL); + } + kring->tx_event = NULL; + mtx_unlock_spin(&kring->tx_event_lock); + } + + if (na->active_fds == 0) { + free(gna->mit, M_DEVBUF); + + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); + } + + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { + continue; + } + + for (i=0; inum_tx_desc; i++) { + if (kring->tx_pool[i]) { + m_freem(kring->tx_pool[i]); + } + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; + } + +#ifdef RATE_GENERIC + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + + return 0; +} /* Enable/disable netmap mode for a generic network interface. 
*/ static int generic_netmap_register(struct netmap_adapter *na, int enable) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - struct mbuf *m; + struct netmap_kring *kring = NULL; int error; int i, r; - if (!na) + if (!na) { return EINVAL; + } -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_stop(ifp); - if (error) { - return error; + if (!enable) { + /* This is actually an unregif. */ + return generic_netmap_unregister(na); } -#endif /* REG_RESET */ - if (enable) { /* Enable netmap mode. */ - /* Init the mitigation support on all the rx queues. */ + if (na->active_fds == 0) { + D("Generic adapter %p goes on", na); + /* Do all memory allocations when (na->active_fds == 0), to + * simplify error management. */ + + /* Allocate memory for mitigation support on all the rx queues. */ gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), - M_DEVBUF, M_NOWAIT | M_ZERO); + M_DEVBUF, M_NOWAIT | M_ZERO); if (!gna->mit) { D("mitigation allocation failed"); error = ENOMEM; goto out; } - for (r=0; rnum_rx_rings; r++) - netmap_mitigation_init(&gna->mit[r], r, na); - /* Initialize the rx queue, as generic_rx_handler() can - * be called as soon as netmap_catch_rx() returns. - */ - for (r=0; rnum_rx_rings; r++) { - mbq_safe_init(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + /* Init mitigation support. */ + nm_os_mitigation_init(&gna->mit[r], r, na); + + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as nm_os_catch_rx() returns. + */ + mbq_safe_init(&kring->rx_queue); } /* - * Preallocate packet buffers for the tx rings. + * Prepare mbuf pools (parallel to the tx rings), for packet + * transmission. Don't preallocate the mbufs here, it's simpler + * to leave this task to txsync. 
*/ - for (r=0; rnum_tx_rings; r++) - na->tx_rings[r].tx_pool = NULL; - for (r=0; rnum_tx_rings; r++) { - na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (!na->tx_rings[r].tx_pool) { + for_each_tx_kring(r, kring, na) { + kring->tx_pool = NULL; + } + for_each_tx_kring(r, kring, na) { + kring->tx_pool = + malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!kring->tx_pool) { D("tx_pool allocation failed"); error = ENOMEM; goto free_tx_pools; } - for (i=0; inum_tx_desc; i++) - na->tx_rings[r].tx_pool[i] = NULL; - for (i=0; inum_tx_desc; i++) { - m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (!m) { - D("tx_pool[%d] allocation failed", i); - error = ENOMEM; - goto free_tx_pools; - } - na->tx_rings[r].tx_pool[i] = m; - } + mtx_init(&kring->tx_event_lock, "tx_event_lock", + NULL, MTX_SPIN); + } + } + + for_each_rx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("RX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; + } + + } + for_each_tx_kring_h(r, kring, na) { + if (nm_kring_pending_on(kring)) { + D("TX ring %d of generic adapter %p goes on", r, na); + kring->nr_mode = NKR_NETMAP_ON; } + } + + for_each_tx_kring(r, kring, na) { + /* Initialize tx_pool and tx_event. */ + for (i=0; inum_tx_desc; i++) { + kring->tx_pool[i] = NULL; + } + + kring->tx_event = NULL; + } + + if (na->active_fds == 0) { rtnl_lock(); + /* Prepare to intercept incoming traffic. */ - error = netmap_catch_rx(gna, 1); + error = nm_os_catch_rx(gna, 1); if (error) { - D("netdev_rx_handler_register() failed (%d)", error); + D("nm_os_catch_rx(1) failed (%d)", error); goto register_handler; } - na->na_flags |= NAF_NETMAP_ON; /* Make netmap control the packet steering. 
*/ - netmap_catch_tx(gna, 1); + error = nm_os_catch_tx(gna, 1); + if (error) { + D("nm_os_catch_tx(1) failed (%d)", error); + goto catch_rx; + } rtnl_unlock(); + na->na_flags |= NAF_NETMAP_ON; + #ifdef RATE_GENERIC if (rate_ctx.refcount == 0) { D("setup_timer()"); @@ -329,73 +539,26 @@ generic_netmap_register(struct netmap_adapter *na, int enable) } rate_ctx.refcount++; #endif /* RATE */ - - } else if (na->tx_rings[0].tx_pool) { - /* Disable netmap mode. We enter here only if the previous - generic_netmap_register(na, 1) was successful. - If it was not, na->tx_rings[0].tx_pool was set to NULL by the - error handling code below. */ - rtnl_lock(); - - na->na_flags &= ~NAF_NETMAP_ON; - - /* Release packet steering control. */ - netmap_catch_tx(gna, 0); - - /* Do not intercept packets on the rx path. */ - netmap_catch_rx(gna, 0); - - rtnl_unlock(); - - /* Free the mbufs going to the netmap rings */ - for (r=0; rnum_rx_rings; r++) { - mbq_safe_purge(&na->rx_rings[r].rx_queue); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); - } - - for (r=0; rnum_rx_rings; r++) - netmap_mitigation_cleanup(&gna->mit[r]); - free(gna->mit, M_DEVBUF); - - for (r=0; rnum_tx_rings; r++) { - for (i=0; inum_tx_desc; i++) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - } - -#ifdef RATE_GENERIC - if (--rate_ctx.refcount == 0) { - D("del_timer()"); - del_timer(&rate_ctx.timer); - } -#endif - } - -#ifdef REG_RESET - error = ifp->netdev_ops->ndo_open(ifp); - if (error) { - goto free_tx_pools; } -#endif return 0; + /* Here (na->active_fds == 0) holds. 
*/ +catch_rx: + nm_os_catch_rx(gna, 0); register_handler: rtnl_unlock(); free_tx_pools: - for (r=0; rnum_tx_rings; r++) { - if (na->tx_rings[r].tx_pool == NULL) + for_each_tx_kring(r, kring, na) { + mtx_destroy(&kring->tx_event_lock); + if (kring->tx_pool == NULL) { continue; - for (i=0; inum_tx_desc; i++) - if (na->tx_rings[r].tx_pool[i]) - m_freem(na->tx_rings[r].tx_pool[i]); - free(na->tx_rings[r].tx_pool, M_DEVBUF); - na->tx_rings[r].tx_pool = NULL; + } + free(kring->tx_pool, M_DEVBUF); + kring->tx_pool = NULL; } - for (r=0; rnum_rx_rings; r++) { - netmap_mitigation_cleanup(&gna->mit[r]); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); + for_each_rx_kring(r, kring, na) { + mbq_safe_fini(&kring->rx_queue); } free(gna->mit, M_DEVBUF); out: @@ -411,13 +574,58 @@ out: static void generic_mbuf_destructor(struct mbuf *m) { - netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); + struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m)); + struct netmap_kring *kring; + unsigned int r = MBUF_TXQ(m); + unsigned int r_orig = r; + + if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) { + D("Error: no netmap adapter on device %p", + GEN_TX_MBUF_IFP(m)); + return; + } + + /* + * First, clear the event mbuf. + * In principle, the event 'm' should match the one stored + * on ring 'r'. However we check it explicitely to stay + * safe against lower layers (qdisc, driver, etc.) changing + * MBUF_TXQ(m) under our feet. If the match is not found + * on 'r', we try to see if it belongs to some other ring. 
+ */ + for (;;) { + bool match = false; + + kring = &na->tx_rings[r]; + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event == m) { + kring->tx_event = NULL; + match = true; + } + mtx_unlock_spin(&kring->tx_event_lock); + + if (match) { + if (r != r_orig) { + RD(1, "event %p migrated: ring %u --> %u", + m, r_orig, r); + } + break; + } + + if (++r == na->num_tx_rings) r = 0; + + if (r == r_orig) { + RD(1, "Cannot match event %p", m); + return; + } + } + + /* Second, wake up clients. They will reclaim the event through + * txsync. */ + netmap_generic_irq(na, r, NULL); #ifdef __FreeBSD__ - if (netmap_verbose) - RD(5, "Tx irq (%p) queue %d index %d" , m, MBUF_TXQ(m), (int)(uintptr_t)m->m_ext.ext_arg1); - netmap_default_mbuf_destructor(m); -#endif /* __FreeBSD__ */ - IFRATE(rate_ctx.new.txirq++); + void_mbuf_dtor(m, NULL, NULL); +#endif } extern int netmap_adaptive_io; @@ -428,7 +636,7 @@ extern int netmap_adaptive_io; * nr_hwcur is the first unsent buffer. */ static u_int -generic_netmap_tx_clean(struct netmap_kring *kring) +generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc) { u_int const lim = kring->nkr_num_slots - 1; u_int nm_i = nm_next(kring->nr_hwtail, lim); @@ -436,20 +644,50 @@ generic_netmap_tx_clean(struct netmap_kring *kring) u_int n = 0; struct mbuf **tx_pool = kring->tx_pool; + ND("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail); + while (nm_i != hwcur) { /* buffers not completed */ struct mbuf *m = tx_pool[nm_i]; - if (unlikely(m == NULL)) { - /* this is done, try to replenish the entry */ - tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na)); + if (txqdisc) { + if (m == NULL) { + /* Nothing to do, this is going + * to be replenished. */ + RD(3, "Is this happening?"); + + } else if (MBUF_QUEUED(m)) { + break; /* Not dequeued yet. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf has been dequeued but is still busy + * (refcount is 2). + * Leave it to the driver and replenish. 
*/ + m_freem(m); + tx_pool[nm_i] = NULL; + } + + } else { if (unlikely(m == NULL)) { - D("mbuf allocation failed, XXX error"); - // XXX how do we proceed ? break ? - return -ENOMEM; + int event_consumed; + + /* This slot was used to place an event. */ + mtx_lock_spin(&kring->tx_event_lock); + event_consumed = (kring->tx_event == NULL); + mtx_unlock_spin(&kring->tx_event_lock); + if (!event_consumed) { + /* The event has not been consumed yet, + * still busy in the driver. */ + break; + } + /* The event has been consumed, we can go + * ahead. */ + + } else if (MBUF_REFCNT(m) != 1) { + /* This mbuf is still busy: its refcnt is 2. */ + break; } - } else if (GET_MBUF_REFCNT(m) != 1) { - break; /* This mbuf is still busy: its refcnt is 2. */ } + n++; nm_i = nm_next(nm_i, lim); #if 0 /* rate adaptation */ @@ -476,23 +714,17 @@ generic_netmap_tx_clean(struct netmap_kring *kring) return n; } - -/* - * We have pending packets in the driver between nr_hwtail +1 and hwcur. - * Compute a position in the middle, to be used to generate - * a notification. - */ +/* Compute a slot index in the middle between inf and sup. */ static inline u_int -generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) +ring_middle(u_int inf, u_int sup, u_int lim) { - u_int n = kring->nkr_num_slots; - u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int n = lim + 1; u_int e; - if (hwcur >= ntc) { - e = (hwcur + ntc) / 2; + if (sup >= inf) { + e = (sup + inf) / 2; } else { /* wrap around */ - e = (hwcur + n + ntc) / 2; + e = (sup + n + inf) / 2; if (e >= n) { e -= n; } @@ -506,35 +738,59 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) return e; } -/* - * We have pending packets in the driver between nr_hwtail+1 and hwcur. - * Schedule a notification approximately in the middle of the two. - * There is a race but this is only called within txsync which does - * a double check. 
- */ static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { + u_int lim = kring->nkr_num_slots - 1; struct mbuf *m; u_int e; + u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */ - if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + if (ntc == hwcur) { return; /* all buffers are free */ } - e = generic_tx_event_middle(kring, hwcur); + + /* + * We have pending packets in the driver between hwtail+1 + * and hwcur, and we have to chose one of these slot to + * generate a notification. + * There is a race but this is only called within txsync which + * does a double check. + */ +#if 0 + /* Choose a slot in the middle, so that we don't risk ending + * up in a situation where the client continuously wake up, + * fills one or a few TX slots and go to sleep again. */ + e = ring_middle(ntc, hwcur, lim); +#else + /* Choose the first pending slot, to be safe against driver + * reordering mbuf transmissions. */ + e = ntc; +#endif m = kring->tx_pool[e]; - ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? GET_MBUF_REFCNT(m) : -2 ); if (m == NULL) { - /* This can happen if there is already an event on the netmap - slot 'e': There is nothing to do. */ + /* An event is already in place. */ return; } - kring->tx_pool[e] = NULL; + + mtx_lock_spin(&kring->tx_event_lock); + if (kring->tx_event) { + /* An event is already in place. */ + mtx_unlock_spin(&kring->tx_event_lock); + return; + } + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + kring->tx_event = m; + mtx_unlock_spin(&kring->tx_event_lock); + + kring->tx_pool[e] = NULL; + + ND(5, "Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 ); - // XXX wmb() ? - /* Decrement the refcount an free it if we have the last one. */ + /* Decrement the refcount. This will free it if we lose the race + * with the driver. 
*/ m_freem(m); smp_mb(); } @@ -551,6 +807,7 @@ static int generic_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ // j @@ -560,8 +817,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) IFRATE(rate_ctx.new.txsync++); - // TODO: handle the case of mbuf allocation failure - rmb(); /* @@ -569,72 +824,121 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ + struct nm_os_gen_arg a; + u_int event = -1; + + if (gna->txqdisc && nm_kr_txempty(kring)) { + /* In txqdisc mode, we ask for a delayed notification, + * but only when cur == hwtail, which means that the + * client is going to block. */ + event = ring_middle(nm_i, head, lim); + ND(3, "Place txqdisc event (hwcur=%u,event=%u," + "head=%u,hwtail=%u)", nm_i, event, head, + kring->nr_hwtail); + } + + a.ifp = ifp; + a.ring_nr = ring_nr; + a.head = a.tail = NULL; + while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; void *addr = NMB(na, slot); - /* device-specific */ struct mbuf *m; int tx_ret; NM_CHECK_ADDR_LEN(na, addr, len); - /* Tale a mbuf from the tx pool and copy in the user packet. */ + /* Tale a mbuf from the tx pool (replenishing the pool + * entry if necessary) and copy in the user packet. 
*/ m = kring->tx_pool[nm_i]; - if (unlikely(!m)) { - RD(5, "This should never happen"); - kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); - if (unlikely(m == NULL)) { - D("mbuf allocation failed"); + if (unlikely(m == NULL)) { + kring->tx_pool[nm_i] = m = + nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na)); + if (m == NULL) { + RD(2, "Failed to replenish mbuf"); + /* Here we could schedule a timer which + * retries to replenish after a while, + * and notifies the client when it + * manages to replenish some slots. In + * any case we break early to avoid + * crashes. */ break; } + IFRATE(rate_ctx.new.txrepl++); } - /* XXX we should ask notifications when NS_REPORT is set, - * or roughly every half frame. We can optimize this - * by lazily requesting notifications only when a - * transmission fails. Probably the best way is to - * break on failures and set notifications when - * ring->cur == ring->tail || nm_i != cur + + a.m = m; + a.addr = addr; + a.len = len; + a.qevent = (nm_i == event); + /* When not in txqdisc mode, we should ask + * notifications when NS_REPORT is set, or roughly + * every half ring. To optimize this, we set a + * notification event when the client runs out of + * TX ring space, or when transmission fails. In + * the latter case we also break early. */ - tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + tx_ret = nm_os_generic_xmit_frame(&a); if (unlikely(tx_ret)) { - ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", - tx_ret, nm_i, head, kring->nr_hwtail); - /* - * No room for this mbuf in the device driver. - * Request a notification FOR A PREVIOUS MBUF, - * then call generic_netmap_tx_clean(kring) to do the - * double check and see if we can free more buffers. - * If there is space continue, else break; - * NOTE: the double check is necessary if the problem - * occurs in the txsync call after selrecord(). 
- * Also, we need some way to tell the caller that not - * all buffers were queued onto the device (this was - * not a problem with native netmap driver where space - * is preallocated). The bridge has a similar problem - * and we solve it there by dropping the excess packets. - */ - generic_set_tx_event(kring, nm_i); - if (generic_netmap_tx_clean(kring)) { /* space now available */ - continue; - } else { - break; + if (!gna->txqdisc) { + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring, gna->txqdisc)) { + /* space now available */ + continue; + } else { + break; + } } + + /* In txqdisc mode, the netmap-aware qdisc + * queue has the same length as the number of + * netmap slots (N). Since tail is advanced + * only when packets are dequeued, qdisc + * queue overrun cannot happen, so + * nm_os_generic_xmit_frame() did not fail + * because of that. + * However, packets can be dropped because + * carrier is off, or because our qdisc is + * being deactivated, or possibly for other + * reasons. In these cases, we just let the + * packet to be dropped. */ + IFRATE(rate_ctx.new.txdrop++); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); - IFRATE(rate_ctx.new.txpkt ++); + IFRATE(rate_ctx.new.txpkt++); } - - /* Update hwcur to the next slot to transmit. 
*/ - kring->nr_hwcur = nm_i; /* not head, we could break early */ + if (a.head != NULL) { + a.addr = NULL; + nm_os_generic_xmit_frame(&a); + } + /* Update hwcur to the next slot to transmit. Here nm_i + * is not necessarily head, we could break early. */ + kring->nr_hwcur = nm_i; } /* * Second, reclaim completed buffers */ - if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) { /* No more available slots? Set a notification event * on a netmap slot that will be cleaned in the future. * No doublecheck is performed, since txsync() will be @@ -642,58 +946,74 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags) */ generic_set_tx_event(kring, nm_i); } - ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); - generic_netmap_tx_clean(kring); + generic_netmap_tx_clean(kring, gna->txqdisc); return 0; } /* - * This handler is registered (through netmap_catch_rx()) + * This handler is registered (through nm_os_catch_rx()) * within the attached network interface * in the RX subsystem, so that every mbuf passed up by * the driver can be stolen to the network stack. * Stolen packets are put in a queue where the * generic_netmap_rxsync() callback can extract them. + * Returns 1 if the packet was stolen, 0 otherwise. */ -void +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct netmap_kring *kring; u_int work_done; - u_int rr = MBUF_RXQ(m); // receive ring number + u_int r = MBUF_RXQ(m); /* receive ring number */ - if (rr >= na->num_rx_rings) { - rr = rr % na->num_rx_rings; // XXX expensive... + if (r >= na->num_rx_rings) { + r = r % na->num_rx_rings; + } + + kring = &na->rx_rings[r]; + + if (kring->nr_mode == NKR_NETMAP_OFF) { + /* We must not intercept this mbuf. 
*/ + return 0; } /* limit the size of the queue */ - if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) { + /* This may happen when GRO/LRO features are enabled for + * the NIC driver when the generic adapter does not + * support RX scatter-gather. */ + RD(2, "Warning: driver pushed up big packet " + "(size=%d)", (int)MBUF_LEN(m)); + m_freem(m); + } else if (unlikely(mbq_len(&kring->rx_queue) > 1024)) { m_freem(m); } else { - mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + mbq_safe_enqueue(&kring->rx_queue, m); } if (netmap_generic_mit < 32768) { /* no rx mitigation, pass notification up */ - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); + netmap_generic_irq(na, r, &work_done); } else { /* same as send combining, filter notification if there is a * pending timer, otherwise pass it up and start a timer. */ - if (likely(netmap_mitigation_active(&gna->mit[rr]))) { + if (likely(nm_os_mitigation_active(&gna->mit[r]))) { /* Record that there is some pending work. */ - gna->mit[rr].mit_pending = 1; + gna->mit[r].mit_pending = 1; } else { - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(&gna->mit[rr]); + netmap_generic_irq(na, r, &work_done); + nm_os_mitigation_start(&gna->mit[r]); } } + + /* We have intercepted the mbuf. */ + return 1; } /* @@ -713,54 +1033,23 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + /* Adapter-specific variables. */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int nm_buf_len = NETMAP_BUF_SIZE(na); + struct mbq tmpq; + struct mbuf *m; + int avail; /* in bytes */ + int mlen; + int copy; + if (head > lim) return netmap_ring_reinit(kring); - /* - * First part: import newly received packets. 
- */ - if (netmap_no_pendintr || force_update) { - /* extract buffers from the rx queue, stop at most one - * slot before nr_hwcur (stop_i) - */ - uint16_t slot_flags = kring->nkr_slot_flags; - u_int stop_i = nm_prev(kring->nr_hwcur, lim); - - nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ - for (n = 0; nm_i != stop_i; n++) { - int len; - void *addr = NMB(na, &ring->slot[nm_i]); - struct mbuf *m; - - /* we only check the address here on generic rx rings */ - if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ - return netmap_ring_reinit(kring); - } - /* - * Call the locked version of the function. - * XXX Ideally we could grab a batch of mbufs at once - * and save some locking overhead. - */ - m = mbq_safe_dequeue(&kring->rx_queue); - if (!m) /* no more data */ - break; - len = MBUF_LEN(m); - m_copydata(m, 0, len, addr); - ring->slot[nm_i].len = len; - ring->slot[nm_i].flags = slot_flags; - m_freem(m); - nm_i = nm_next(nm_i, lim); - } - if (n) { - kring->nr_hwtail = nm_i; - IFRATE(rate_ctx.new.rxpkt += n); - } - kring->nr_kflags &= ~NKR_PENDINTR; - } + IFRATE(rate_ctx.new.rxsync++); - // XXX should we invert the order ? /* - * Second part: skip past packets that userspace has released. + * First part: skip past packets that userspace has released. + * This can possibly make room for the second part. */ nm_i = kring->nr_hwcur; if (nm_i != head) { @@ -773,7 +1062,106 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) } kring->nr_hwcur = head; } - IFRATE(rate_ctx.new.rxsync++); + + /* + * Second part: import newly received packets. + */ + if (!netmap_no_pendintr && !force_update) { + return 0; + } + + nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */ + + /* Compute the available space (in bytes) in this netmap ring. + * The first slot that is not considered in is the one before + * nr_hwcur. 
*/ + + avail = nm_prev(kring->nr_hwcur, lim) - nm_i; + if (avail < 0) + avail += lim + 1; + avail *= nm_buf_len; + + /* First pass: While holding the lock on the RX mbuf queue, + * extract as many mbufs as they fit the available space, + * and put them in a temporary queue. + * To avoid performing a per-mbuf division (mlen / nm_buf_len) to + * to update avail, we do the update in a while loop that we + * also use to set the RX slots, but without performing the copy. */ + mbq_init(&tmpq); + mbq_lock(&kring->rx_queue); + for (n = 0;; n++) { + m = mbq_peek(&kring->rx_queue); + if (!m) { + /* No more packets from the driver. */ + break; + } + + mlen = MBUF_LEN(m); + if (mlen > avail) { + /* No more space in the ring. */ + break; + } + + mbq_dequeue(&kring->rx_queue); + + while (mlen) { + copy = nm_buf_len; + if (mlen < copy) { + copy = mlen; + } + mlen -= copy; + avail -= nm_buf_len; + + ring->slot[nm_i].len = copy; + ring->slot[nm_i].flags = slot_flags | (mlen ? NS_MOREFRAG : 0); + nm_i = nm_next(nm_i, lim); + } + + mbq_enqueue(&tmpq, m); + } + mbq_unlock(&kring->rx_queue); + + /* Second pass: Drain the temporary queue, going over the used RX slots, + * and perform the copy out of the RX queue lock. */ + nm_i = kring->nr_hwtail; + + for (;;) { + void *nmaddr; + int ofs = 0; + int morefrag; + + m = mbq_dequeue(&tmpq); + if (!m) { + break; + } + + do { + nmaddr = NMB(na, &ring->slot[nm_i]); + /* We only check the address here on generic rx rings. 
*/ + if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ + m_freem(m); + mbq_purge(&tmpq); + mbq_fini(&tmpq); + return netmap_ring_reinit(kring); + } + + copy = ring->slot[nm_i].len; + m_copydata(m, ofs, copy, nmaddr); + ofs += copy; + morefrag = ring->slot[nm_i].flags & NS_MOREFRAG; + nm_i = nm_next(nm_i, lim); + } while (morefrag); + + m_freem(m); + } + + mbq_fini(&tmpq); + + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; return 0; } @@ -787,9 +1175,8 @@ generic_netmap_dtor(struct netmap_adapter *na) if (prev_na != NULL) { D("Released generic NA %p", gna); - if_rele(ifp); netmap_adapter_put(prev_na); - if (na->ifp == NULL) { + if (nm_iszombie(na)) { /* * The driver has been removed without releasing * the reference so we need to do it here. @@ -797,9 +1184,13 @@ generic_netmap_dtor(struct netmap_adapter *na) netmap_adapter_put(prev_na); } } - WNA(ifp) = prev_na; - D("Restored native NA %p", prev_na); + NM_ATTACH_NA(ifp, prev_na); + /* + * netmap_detach_common(), that it's called after this function, + * overrides WNA(ifp) if na->ifp is not NULL. 
+ */ na->ifp = NULL; + D("Restored native NA %p", prev_na); } /* @@ -823,7 +1214,7 @@ generic_netmap_attach(struct ifnet *ifp) num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ - generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ + nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */ ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); if (num_tx_desc == 0 || num_rx_desc == 0) { D("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc); @@ -855,12 +1246,23 @@ generic_netmap_attach(struct ifnet *ifp) ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", ifp->num_rx_queues, ifp->real_num_rx_queues); - generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); retval = netmap_attach_common(na); if (retval) { free(gna, M_DEVBUF); + return retval; } + gna->prev = NA(ifp); /* save old na */ + if (gna->prev != NULL) { + netmap_adapter_get(gna->prev); + } + NM_ATTACH_NA(ifp, na); + + nm_os_generic_set_features(gna); + + D("Created generic NA %p (prev %p)", gna, gna->prev); + return retval; } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 4aead85285fd6..de21f29585e06 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,7 @@ /* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -48,24 +49,34 @@ #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif -#if defined(CONFIG_NETMAP_V1000) -#define WITH_V1000 +#if defined(CONFIG_NETMAP_PTNETMAP_GUEST) +#define WITH_PTNETMAP_GUEST +#endif +#if defined(CONFIG_NETMAP_PTNETMAP_HOST) +#define WITH_PTNETMAP_HOST #endif -#else /* not linux */ +#elif defined (_WIN32) +#define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES +#define WITH_MONITOR +#define WITH_GENERIC +#else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC +#define WITH_PTNETMAP_HOST /* ptnetmap host support */ +#define WITH_PTNETMAP_GUEST /* ptnetmap guest support */ #endif #if defined(__FreeBSD__) -#include #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) +#define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ @@ -77,9 +88,11 @@ #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo +#define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) -#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) +#define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ @@ -98,23 +111,20 @@ struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? 
*((m)->m_ext.ext_cnt) : -1) -#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt) +#define MBUF_REFCNT(m) ((m)->m_ext.ext_count) +#define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x #else -#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) +#define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x -#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt) #endif -MALLOC_DECLARE(M_NETMAP); +#define MBUF_QUEUED(m) 1 struct nm_selinfo { struct selinfo si; struct mtx m; }; -void freebsd_selwakeup(struct nm_selinfo *si, int pri); // XXX linux struct, not used in FreeBSD struct net_device_ops { @@ -131,12 +141,16 @@ struct hrtimer { #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) -#define MBUF_IFP(m) ((m)->dev) -#define NM_SEND_UP(ifp, m) \ - do { \ - m->priority = NM_MAGIC_PRIORITY_RX; \ - netif_rx(m); \ - } while (0) +#define MBUF_TRANSMIT(na, ifp, m) \ + ({ \ + /* Avoid infinite recursion with generic. */ \ + m->priority = NM_MAGIC_PRIORITY_TX; \ + (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ + 0; \ + }) + +/* See explanation in nm_os_generic_xmit_frame. 
*/ +#define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int @@ -159,7 +173,51 @@ struct hrtimer { #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) + +#elif defined (_WIN32) +#include "../../../WINDOWS/win_glue.h" + +#define NM_SELRECORD_T IO_STACK_LOCATION +#define NM_SELINFO_T win_SELINFO // see win_glue.h +#define NM_LOCK_T win_spinlock_t // see win_glue.h +#define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ + +#define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); +#define NM_MTX_DESTROY(m) do { (void)(m); } while (0) +#define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) +#define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) +#define NM_MTX_ASSERT(m) assert(&m.Count>0) + +//These linknames are for the NDIS driver +#define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" +#define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" + +//Definition of internal driver-to-driver ioctl codes +#define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) +#define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) + +//Empty data structures are not permitted by MSVC compiler +//XXX_ale, try to solve this problem +struct net_device_ops{ + char data[1]; +}; +typedef struct ethtool_ops{ + char data[1]; +}; +typedef struct hrtimer{ + KTIMER timer; + BOOLEAN active; + KDPC deferred_proc; +}; + +/* MSVC does not have likely/unlikely support */ +#ifdef _MSC_VER +#define likely(x) (x) +#define unlikely(x) (x) +#else +#define likely(x) __builtin_expect((long)!!(x), 1L) +#define unlikely(x) __builtin_expect((long)!!(x), 0L) +#endif //_MSC_VER #else @@ -167,6 +225,13 @@ struct hrtimer { #endif /* end - platform-specific code */ +#ifndef _WIN32 /* support for emulated sysctl */ +#define SYSBEGIN(x) +#define SYSEND +#endif /* _WIN32 */ + +#define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + 
#define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) @@ -201,8 +266,36 @@ struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; +/* os-specific NM_SELINFO_T initialzation/destruction functions */ +void nm_os_selinfo_init(NM_SELINFO_T *); +void nm_os_selinfo_uninit(NM_SELINFO_T *); + const char *nm_dump_buf(char *p, int len, int lim, char *dst); +void nm_os_selwakeup(NM_SELINFO_T *si); +void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); + +int nm_os_ifnet_init(void); +void nm_os_ifnet_fini(void); +void nm_os_ifnet_lock(void); +void nm_os_ifnet_unlock(void); + +void nm_os_get_module(void); +void nm_os_put_module(void); + +void netmap_make_zombie(struct ifnet *); +void netmap_undo_zombie(struct ifnet *); + +/* passes a packet up to the host stack. + * If the packet is sent (or dropped) immediately it returns NULL, + * otherwise it links the packet to prev and returns m. + * In this case, a final call with m=NULL and prev != NULL will send up + * the entire chain to the host stack. + */ +void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); + +int nm_os_mbuf_has_offld(struct mbuf *m); + #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; @@ -299,6 +392,19 @@ struct netmap_kring { uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. #define NKR_EXCLUSIVE 0x2 /* exclusive binding */ +#define NKR_FORWARD 0x4 /* (host ring only) there are + packets to forward + */ +#define NKR_NEEDRING 0x8 /* ring needed even if users==0 + * (used internally by pipes and + * by ptnetmap host ports) + */ + + uint32_t nr_mode; + uint32_t nr_pending_mode; +#define NKR_NETMAP_OFF 0x0 +#define NKR_NETMAP_ON 0x1 + uint32_t nkr_num_slots; /* @@ -344,13 +450,14 @@ struct netmap_kring { * store incoming mbufs in a queue that is drained by * a rxsync. 
*/ - struct mbuf **tx_pool; - // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ - struct mbq rx_queue; /* intercepted rx mbufs. */ + struct mbuf **tx_pool; + struct mbuf *tx_event; /* TX event used as a notification */ + NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ + struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ - uint32_t ring_id; /* debugging */ + uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ @@ -372,9 +479,6 @@ struct netmap_kring { struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ - struct netmap_ring *save_ring; /* pointer to hidden rings - * (see netmap_pipe.c for details) - */ #endif /* WITH_PIPES */ #ifdef WITH_VALE @@ -397,8 +501,28 @@ struct netmap_kring { uint32_t mon_tail; /* last seen slot on rx */ uint32_t mon_pos; /* index of this ring in the monitored ring array */ #endif -} __attribute__((__aligned__(64))); +} +#ifdef _WIN32 +__declspec(align(64)); +#else +__attribute__((__aligned__(64))); +#endif + +/* return 1 iff the kring needs to be turned on */ +static inline int +nm_kring_pending_on(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_ON && + kring->nr_mode == NKR_NETMAP_OFF; +} +/* return 1 iff the kring needs to be turned off */ +static inline int +nm_kring_pending_off(struct netmap_kring *kring) +{ + return kring->nr_pending_mode == NKR_NETMAP_OFF && + kring->nr_mode == NKR_NETMAP_ON; +} /* return the next index, with wraparound */ static inline uint32_t @@ -514,6 +638,8 @@ struct netmap_adapter { */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ +#define NAF_PTNETMAP_HOST 256 /* the adapter supports ptnetmap in the host */ +#define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used 
internally and * cannot be registered from userspace */ @@ -592,10 +718,14 @@ struct netmap_adapter { * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. + * This callback pointer is actually used only to initialize + * kring->nm_notify. + * Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); + void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); @@ -640,14 +770,14 @@ struct netmap_adapter { /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating - * buffer addresses, and the total number of buffers. + * buffer addresses, the total number of buffers and the buffer size. */ struct netmap_mem_d *nm_mem; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by - * bwrap and LINUX/v1000. + * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; @@ -656,6 +786,9 @@ struct netmap_adapter { int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + char name[64]; }; @@ -721,8 +854,6 @@ struct netmap_vp_adapter { /* VALE software port */ struct nm_bridge *na_bdg; int retry; - /* Offset of ethernet header for each packet. */ - u_int virt_hdr_len; /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ @@ -767,6 +898,13 @@ struct netmap_generic_adapter { /* emulated device */ #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif + /* Is the adapter able to use multiple RX slots to scatter + * each packet pushed up by the driver? 
*/ + int rxsg; + + /* Is the transmission path controlled by a netmap-aware + * device queue (i.e. qdisc on linux)? */ + int txqdisc; }; #endif /* WITH_GENERIC */ @@ -777,7 +915,7 @@ netmap_real_rings(struct netmap_adapter *na, enum txrx t) } #ifdef WITH_VALE - +struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * @@ -827,9 +965,6 @@ struct netmap_bwrap_adapter { struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ - /* backup of the hwna memory allocator */ - struct netmap_mem_d *save_nmd; - /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need @@ -838,10 +973,10 @@ struct netmap_bwrap_adapter { * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; + struct nm_bdg_polling_state *na_polling_state; }; int netmap_bwrap_attach(const char *name, struct netmap_adapter *); - #endif /* WITH_VALE */ #ifdef WITH_PIPES @@ -876,56 +1011,122 @@ nm_kr_rxspace(struct netmap_kring *k) return space; } +/* return slots reserved to tx clients */ +#define nm_kr_txspace(_k) nm_kr_rxspace(_k) + -/* True if no space in the tx ring. only valid after txsync_prologue */ +/* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { return kring->rcur == kring->nr_hwtail; } +/* True if no more completed slots in the rx ring, only valid after + * rxsync_prologue */ +#define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. - * We only care for 0 or !=0 as a return code. 
+ * also check that the ring has not been stopped or locked */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 +#define NM_KR_BUSY 1 /* some other thread is syncing the ring */ +#define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ +#define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ +/* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } -static __inline int nm_kr_tryget(struct netmap_kring *kr) +/* true if the ifp that backed the adapter has disappeared (e.g., the + * driver has been unloaded) + */ +static inline int nm_iszombie(struct netmap_adapter *na); + +/* try to obtain exclusive right to issue the *sync() operations on the ring. + * The right is obtained and must be later relinquished via nm_kr_put() if and + * only if nm_kr_tryget() returns 0. + * If can_sleep is 1 there are only two other possible outcomes: + * - the function returns NM_KR_BUSY + * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr + * (if non-null) + * In both cases the caller will typically skip the ring, possibly collecting + * errors along the way. + * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. + * In the latter case, the function may also return NM_KR_LOCKED and leave *perr + * untouched: ideally, the caller should try again at a later time. 
+ */ +static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { + int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; +retry: + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; + } + busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); + /* we should not return NM_KR_BUSY if the ring was + * actually stopped, so check another time after + * the barrier provided by the atomic operation + */ + stopped = kr->nkr_stopped; + if (unlikely(stopped)) { + goto stop; } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + + if (unlikely(nm_iszombie(kr->na))) { + stopped = NM_KR_STOPPED; + goto stop; + } + + return unlikely(busy) ? NM_KR_BUSY : 0; + +stop: + if (!busy) nm_kr_put(kr); - return NM_KR_STOPPED; + if (stopped == NM_KR_STOPPED) { +/* if POLLERR is defined we want to use it to simplify netmap_poll(). + * Otherwise, any non-zero value will do. + */ +#ifdef POLLERR +#define NM_POLLERR POLLERR +#else +#define NM_POLLERR 1 +#endif /* POLLERR */ + if (perr) + *perr |= NM_POLLERR; +#undef NM_POLLERR + } else if (can_sleep) { + tsleep(kr, 0, "NM_KR_TRYGET", 4); + goto retry; } - return 0; + return stopped; } -static __inline void nm_kr_get(struct netmap_kring *kr) +/* put the ring in the 'stopped' state and wait for the current user (if any) to + * notice. 
stopped must be either NM_KR_STOPPED or NM_KR_LOCKED + */ +static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { + kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } +/* restart a ring after a stop */ +static __inline void nm_kr_start(struct netmap_kring *kr) +{ + kr->nkr_stopped = 0; + nm_kr_put(kr); +} + /* * The following functions are used by individual drivers to @@ -953,10 +1154,26 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* Return codes for netmap_*x_irq. */ +enum { + /* Driver should do normal interrupt processing, e.g. because + * the interface is not in netmap mode. */ + NM_IRQ_PASS = 0, + /* Port is in netmap mode, and the interrupt work has been + * completed. The driver does not have to notify netmap + * again before the next interrupt. */ + NM_IRQ_COMPLETED = -1, + /* Port is in netmap mode, but the interrupt work has not been + * completed. The driver has to make sure netmap will be + * notified again soon, even if no more interrupts come (e.g. + * on Linux the driver should not call napi_complete()). */ + NM_IRQ_RESCHED = -2, +}; + /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) -void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); +int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE @@ -986,35 +1203,74 @@ nm_native_on(struct netmap_adapter *na) return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } +static inline int +nm_iszombie(struct netmap_adapter *na) +{ + return na == NULL || (na->na_flags & NAF_ZOMBIE); +} + +static inline void +nm_update_hostrings_mode(struct netmap_adapter *na) +{ + /* Process nr_mode and nr_pending_mode for host rings. 
*/ + na->tx_rings[na->num_tx_rings].nr_mode = + na->tx_rings[na->num_tx_rings].nr_pending_mode; + na->rx_rings[na->num_rx_rings].nr_mode = + na->rx_rings[na->num_rx_rings].nr_pending_mode; +} + /* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; + /* We do the setup for intercepting packets only if we are the + * first user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + na->na_flags |= NAF_NETMAP_ON; #ifdef IFCAP_NETMAP /* or FreeBSD ? */ ifp->if_capenable |= IFCAP_NETMAP; #endif -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; +#elif defined (_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //na->if_transmit = ifp->if_transmit; + //ifp->if_transmit = netmap_transmit; #else na->if_transmit = (void *)ifp->netdev_ops; ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops; ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto; #endif + nm_update_hostrings_mode(na); } - static inline void nm_clear_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; -#ifdef __FreeBSD__ + /* We undo the setup for intercepting packets only if we are the + * last user of this adapapter. */ + if (na->active_fds > 0) { + return; + } + + nm_update_hostrings_mode(na); + +#if defined(__FreeBSD__) ifp->if_transmit = na->if_transmit; +#elif defined(_WIN32) + (void)ifp; /* prevent a warning */ + //XXX_ale can we just comment those? + //ifp->if_transmit = na->if_transmit; #else ifp->netdev_ops = (void *)na->if_transmit; ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool; @@ -1025,6 +1281,28 @@ nm_clear_native_flags(struct netmap_adapter *na) #endif } +/* + * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap + * kthreads. 
+ * We need netmap_ring* parameter, because in ptnetmap it is decoupled + * from host kring. + * The user-space ring pointers (head/cur/tail) are shared through + * CSB between host and guest. + */ + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. + */ +uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); + + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); + /* check/fix address and len in tx rings */ #if 1 /* debug version */ @@ -1080,6 +1358,9 @@ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); */ void netmap_krings_delete(struct netmap_adapter *na); +int netmap_hw_krings_create(struct netmap_adapter *na); +void netmap_hw_krings_delete(struct netmap_adapter *na); + /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify @@ -1088,16 +1369,18 @@ void netmap_krings_delete(struct netmap_adapter *na); void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. 
*/ void netmap_set_all_rings(struct netmap_adapter *, int stopped); -/* convenience wrappers for netmap_set_all_rings, used in drivers */ +/* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, uint32_t flags); - +void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); -int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, + struct ifnet **ifp, int create); +void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); @@ -1124,12 +1407,11 @@ struct netmap_bdg_ops { u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *); +#define NM_BRIDGES 8 /* number of bridges */ #define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) -#define NM_NAME "vale" /* prefix for bridge port name */ - /* these are redefined in case of no VALE support */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); struct nm_bridge *netmap_init_bridges2(u_int); @@ -1181,14 +1463,13 @@ void netmap_bns_getbridges(struct nm_bridge **, u_int *); #endif /* Various prototypes */ -int netmap_poll(struct cdev *dev, int events, struct thread *td); +int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); -int netmap_dtor_locked(struct netmap_priv_d *priv); -int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); +int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t 
data, struct thread *); /* netmap_adapter creation/destruction */ @@ -1228,8 +1509,8 @@ int netmap_adapter_put(struct netmap_adapter *na); /* * module variables */ -#define NETMAP_BUF_BASE(na) ((na)->na_lut.lut[0].vaddr) -#define NETMAP_BUF_SIZE(na) ((na)->na_lut.objsize) +#define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) +#define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; extern int netmap_verbose; // XXX debugging @@ -1245,10 +1526,12 @@ enum { /* verbose flags */ }; extern int netmap_txsync_retry; +extern int netmap_adaptive_io; +extern int netmap_flags; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; -extern int netmap_use_count; +extern int netmap_generic_txqdisc; /* * NA returns a pointer to the struct netmap adapter from the ifp, @@ -1257,37 +1540,27 @@ extern int netmap_use_count; #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* - * Macros to determine if an interface is netmap capable or netmap enabled. - * See the magic field in struct netmap_adapter. - */ -#ifdef __FreeBSD__ -/* - * on FreeBSD just use if_capabilities and if_capenable. - */ -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ - (ifp)->if_capabilities & IFCAP_NETMAP ) - -#define NETMAP_SET_CAPABLE(ifp) \ - (ifp)->if_capabilities |= IFCAP_NETMAP - -#else /* linux */ - -/* - * on linux: - * we check if NA(ifp) is set and its first element has a related + * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we + * overload another pointer in the netdev. + * + * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. 
*/ #define NETMAP_MAGIC 0x52697a7a -#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ +#define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) -#define NETMAP_SET_CAPABLE(ifp) \ - NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC +#define NM_ATTACH_NA(ifp, na) do { \ + WNA(ifp) = na; \ + if (NA(ifp)) \ + NA(ifp)->magic = \ + ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ +} while(0) -#endif /* linux */ +#define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ @@ -1331,6 +1604,8 @@ netmap_reload_map(struct netmap_adapter *na, } } +#elif defined(_WIN32) + #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); @@ -1341,8 +1616,8 @@ netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (0 && map) { - *map = dma_map_single(na->pdev, buf, na->na_lut.objsize, - DMA_BIDIRECTIONAL); + *map = dma_map_single(na->pdev, buf, NETMAP_BUF_SIZE(na), + DMA_BIDIRECTIONAL); } } @@ -1350,11 +1625,11 @@ static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, - DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); } } @@ -1362,7 +1637,7 @@ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { - u_int sz = na->na_lut.objsize; + u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, @@ -1473,7 +1748,11 @@ PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) struct lut_entry *lut = na->na_lut.lut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; +#ifndef _WIN32 *pp = (i >= na->na_lut.objtotal) ? 
lut[0].paddr : lut[i].paddr; +#else + *pp = (i >= na->na_lut.objtotal) ? (uint64_t)lut[0].paddr.QuadPart : (uint64_t)lut[i].paddr.QuadPart; +#endif return ret; } @@ -1497,8 +1776,9 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; + struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ - u_int np_qfirst[NR_TXRX], + u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ @@ -1512,6 +1792,26 @@ struct netmap_priv_d { struct thread *np_td; /* kqueue, just debugging */ }; +struct netmap_priv_d *netmap_priv_new(void); +void netmap_priv_delete(struct netmap_priv_d *); + +static inline int nm_kring_pending(struct netmap_priv_d *np) +{ + struct netmap_adapter *na = np->np_na; + enum txrx t; + int i; + + for_rx_tx(t) { + for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + if (kring->nr_mode != kring->nr_pending_mode) { + return 1; + } + } + } + return 0; +} + #ifdef WITH_MONITOR struct netmap_monitor_adapter { @@ -1530,13 +1830,36 @@ struct netmap_monitor_adapter { * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); +int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; + +int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); +int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); + +/* + * the generic transmit routine is passed a structure to optionally + * build a queue of descriptors, in an OS-specific way. + * The payload is at addr, if non-null, and the routine should send or queue + * the packet, returning 0 if successful, 1 on failure. + * + * At the end, if head is non-null, there will be an additional call + * to the function with addr = NULL; this should tell the OS-specific + * routine to send the queue and free any resources. Failure is ignored. 
+ */ +struct nm_os_gen_arg { + struct ifnet *ifp; + void *m; /* os-specific mbuf-like object */ + void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ + void *addr; /* payload of current packet */ + u_int len; /* packet length */ + u_int ring_nr; /* transmit ring index */ + u_int qevent; /* in txqdisc mode, place an event on this mbuf */ +}; + +int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); +int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); +void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); +void nm_os_generic_set_features(struct netmap_generic_adapter *gna); -int netmap_catch_rx(struct netmap_generic_adapter *na, int intercept); -void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; -void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); -int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); -int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); -void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { @@ -1546,6 +1869,8 @@ netmap_generic_getifp(struct netmap_generic_adapter *gna) return gna->up.up.ifp; } +void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); + //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); @@ -1558,16 +1883,16 @@ void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets.
*/ -void netmap_mitigation_init(struct nm_generic_mit *mit, int idx, +void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); -void netmap_mitigation_start(struct nm_generic_mit *mit); -void netmap_mitigation_restart(struct nm_generic_mit *mit); -int netmap_mitigation_active(struct nm_generic_mit *mit); -void netmap_mitigation_cleanup(struct nm_generic_mit *mit); +void nm_os_mitigation_start(struct nm_generic_mit *mit); +void nm_os_mitigation_restart(struct nm_generic_mit *mit); +int nm_os_mitigation_active(struct nm_generic_mit *mit); +void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); +#else /* !WITH_GENERIC */ +#define generic_netmap_attach(ifp) (EOPNOTSUPP) #endif /* WITH_GENERIC */ - - /* Shared declarations for the VALE switch. */ /* @@ -1656,22 +1981,111 @@ struct nm_ipv6hdr { */ #define rawsum_t uint32_t -rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); -uint16_t nm_csum_ipv4(struct nm_iphdr *iph); -void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph); +void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check); -void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check); -uint16_t nm_csum_fold(rawsum_t cur_sum); +uint16_t nm_os_csum_fold(rawsum_t cur_sum); void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, u_int *j, u_int lim, u_int *howmany); /* persistent virtual port routines */ -int nm_vi_persist(const char *, struct ifnet **); -void nm_vi_detach(struct ifnet *); -void nm_vi_init_index(void); +int nm_os_vi_persist(const char *, struct ifnet **); +void nm_os_vi_detach(struct 
ifnet *); +void nm_os_vi_init_index(void); + +/* + * kernel thread routines + */ +struct nm_kthread; /* OS-specific kthread - opaque */ +typedef void (*nm_kthread_worker_fn_t)(void *data); + +/* kthread configuration */ +struct nm_kthread_cfg { + long type; /* kthread type/identifier */ + struct ptnet_ring_cfg event; /* event/ioctl fd */ + nm_kthread_worker_fn_t worker_fn; /* worker function */ + void *worker_private;/* worker parameter */ + int attach_user; /* attach kthread to user process */ +}; +/* kthread configuration */ +struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg); +int nm_os_kthread_start(struct nm_kthread *); +void nm_os_kthread_stop(struct nm_kthread *); +void nm_os_kthread_delete(struct nm_kthread *); +void nm_os_kthread_wakeup_worker(struct nm_kthread *nmk); +void nm_os_kthread_send_irq(struct nm_kthread *); +void nm_os_kthread_set_affinity(struct nm_kthread *, int); +u_int nm_os_ncpus(void); + +#ifdef WITH_PTNETMAP_HOST +/* + * netmap adapter for host ptnetmap ports + */ +struct netmap_pt_host_adapter { + struct netmap_adapter up; + + struct netmap_adapter *parent; + int (*parent_nm_notify)(struct netmap_kring *kring, int flags); + void *ptns; +}; +/* ptnetmap HOST routines */ +int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na); +static inline int +nm_ptnetmap_host_on(struct netmap_adapter *na) +{ + return na && na->na_flags & NAF_PTNETMAP_HOST; +} +#else /* !WITH_PTNETMAP_HOST */ +#define netmap_get_pt_host_na(nmr, _2, _3) \ + ((nmr)->nr_flags & (NR_PTNETMAP_HOST) ? 
EOPNOTSUPP : 0) +#define ptnetmap_ctl(_1, _2) EINVAL +#define nm_ptnetmap_host_on(_1) EINVAL +#endif /* !WITH_PTNETMAP_HOST */ + +#ifdef WITH_PTNETMAP_GUEST +/* ptnetmap GUEST routines */ + +typedef uint32_t (*nm_pt_guest_ptctl_t)(struct ifnet *, uint32_t); + +/* + * netmap adapter for guest ptnetmap ports + */ +struct netmap_pt_guest_adapter { + /* The netmap adapter to be used by netmap applications. + * This field must be the first, to allow upcast. */ + struct netmap_hw_adapter hwup; + + /* The netmap adapter to be used by the driver. */ + struct netmap_hw_adapter dr; + + void *csb; + + /* Reference counter to track users of backend netmap port: the + * network stack and netmap clients. + * Used to decide when we need (de)allocate krings/rings and + * start (stop) ptnetmap kthreads. */ + int backend_regifs; + +}; + +int netmap_pt_guest_attach(struct netmap_adapter *, void *, + unsigned int, nm_pt_guest_ptctl_t); +struct ptnet_ring; +bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring, + int flags); +int ptnet_nm_krings_create(struct netmap_adapter *na); +void ptnet_nm_krings_delete(struct netmap_adapter *na); +void ptnet_nm_dtor(struct netmap_adapter *na); +#endif /* WITH_PTNETMAP_GUEST */ #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c index 503f5a13aa95a..3eb971b745611 100644 --- a/sys/dev/netmap/netmap_mbq.c +++ b/sys/dev/netmap/netmap_mbq.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,6 +31,8 @@ #ifdef linux #include "bsd_glue.h" +#elif defined (_WIN32) +#include "win_glue.h" #else /* __FreeBSD__ */ #include #include @@ -152,12 +155,12 @@ void mbq_safe_purge(struct mbq *q) } -void mbq_safe_destroy(struct mbq *q) +void mbq_safe_fini(struct mbq *q) { mtx_destroy(&q->lock); } -void mbq_destroy(struct mbq *q) +void mbq_fini(struct mbq *q) { } diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index 455ca8a2c3acd..9dafa8b1149b3 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -40,6 +41,8 @@ /* XXX probably rely on a previous definition of SPINLOCK_T */ #ifdef linux #define SPINLOCK_T safe_spinlock_t +#elif defined (_WIN32) +#define SPINLOCK_T win_spinlock_t #else #define SPINLOCK_T struct mtx #endif @@ -52,16 +55,21 @@ struct mbq { SPINLOCK_T lock; }; -/* XXX "destroy" does not match "init" as a name. - * We should also clarify whether init can be used while +/* We should clarify whether init can be used while * holding a lock, and whether mbq_safe_destroy() is a NOP. */ void mbq_init(struct mbq *q); -void mbq_destroy(struct mbq *q); +void mbq_fini(struct mbq *q); void mbq_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_dequeue(struct mbq *q); void mbq_purge(struct mbq *q); +static inline struct mbuf * +mbq_peek(struct mbq *q) +{ + return q->head ? 
q->head : NULL; +} + static inline void mbq_lock(struct mbq *q) { @@ -76,7 +84,7 @@ mbq_unlock(struct mbq *q) void mbq_safe_init(struct mbq *q); -void mbq_safe_destroy(struct mbq *q); +void mbq_safe_fini(struct mbq *q); void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); struct mbuf *mbq_safe_dequeue(struct mbq *q); void mbq_safe_purge(struct mbq *q); diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index fd0c06bb8b576..b54c9813c33fd 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include /* MALLOC_DEFINE */ #include #include /* vtophys */ #include /* vtophys */ @@ -48,13 +52,26 @@ __FBSDID("$FreeBSD$"); #include #include /* bus_dmamap_* */ +/* M_NETMAP only used in here */ +MALLOC_DECLARE(M_NETMAP); +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + #endif /* __FreeBSD__ */ +#ifdef _WIN32 +#include +#endif + #include #include +#include #include "netmap_mem2.h" -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#ifdef _WIN32_USE_SMALL_GENERIC_DEVICES_MEMORY +#define NETMAP_BUF_MAX_NUM 8*4096 /* if too big takes too much time to allocate */ +#else +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#endif #define NETMAP_POOL_MAX_NAMSZ 32 @@ -111,7 +128,7 @@ struct netmap_obj_pool { struct netmap_mem_ops { - void (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*); + int (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*); int (*nmd_get_info)(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); @@ -130,6 
+147,39 @@ struct netmap_mem_ops { typedef uint16_t nm_memid_t; +/* + * Shared info for netmap allocator + * + * Each allocator contains this structure as first netmap_if. + * In this way, we can share some details about the allocator + * with the VM. + * Used in ptnetmap. + */ +struct netmap_mem_shared_info { +#ifndef _WIN32 + struct netmap_if up; /* ends with a 0-sized array, which VSC does not like */ +#else /* !_WIN32 */ + char up[sizeof(struct netmap_if)]; +#endif /* !_WIN32 */ + uint64_t features; +#define NMS_FEAT_BUF_POOL 0x0001 +#define NMS_FEAT_MEMSIZE 0x0002 + + uint32_t buf_pool_offset; + uint32_t buf_pool_objtotal; + uint32_t buf_pool_objsize; + uint32_t totalsize; +}; + +#define NMS_NAME "nms_info" +#define NMS_VERSION 1 +static const struct netmap_if nms_if_blueprint = { + .ni_name = NMS_NAME, + .ni_version = NMS_VERSION, + .ni_tx_rings = 0, + .ni_rx_rings = 0 +}; + struct netmap_mem_d { NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ @@ -151,6 +201,9 @@ struct netmap_mem_d { struct netmap_mem_ops *ops; }; +/* + * XXX need to fix the case of t0 == void + */ #define NMD_DEFCB(t0, name) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd) \ @@ -186,7 +239,7 @@ netmap_mem_##name(struct netmap_adapter *na, t1 a1) \ return na->nm_mem->ops->nmd_##name(na, a1); \ } -NMD_DEFCB1(void, get_lut, struct netmap_lut *); +NMD_DEFCB1(int, get_lut, struct netmap_lut *); NMD_DEFCB3(int, get_info, u_int *, u_int *, uint16_t *); NMD_DEFCB1(vm_paddr_t, ofstophys, vm_ooffset_t); static int netmap_mem_config(struct netmap_mem_d *); @@ -201,7 +254,7 @@ NMD_DEFNACB(void, rings_delete); static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *); static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *); -static int nm_mem_assign_group(struct netmap_mem_d *, device_t); +static int nm_mem_assign_group(struct netmap_mem_d *, struct device *); #define NMA_LOCK_INIT(n) NM_MTX_INIT((n)->nm_mtx) #define
NMA_LOCK_DESTROY(n) NM_MTX_DESTROY((n)->nm_mtx) @@ -248,7 +301,9 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) if (nm_mem_assign_group(nmd, na->pdev) < 0) { return ENOMEM; } else { - nmd->ops->nmd_finalize(nmd); + NMA_LOCK(nmd); + nmd->lasterr = nmd->ops->nmd_finalize(nmd); + NMA_UNLOCK(nmd); } if (!nmd->lasterr && na->pdev) @@ -257,26 +312,83 @@ netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) return nmd->lasterr; } +static int netmap_mem_init_shared_info(struct netmap_mem_d *nmd); + void netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na) { NMA_LOCK(nmd); netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na); + if (nmd->active == 1) { + u_int i; + + /* + * Reset the allocator when it falls out of use so that any + * pool resources leaked by unclean application exits are + * reclaimed. + */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p; + u_int j; + + p = &nmd->pools[i]; + p->objfree = p->objtotal; + /* + * Reproduce the net effect of the M_ZERO malloc() + * and marking of free entries in the bitmap that + * occur in finalize_obj_allocator() + */ + memset(p->bitmap, + '\0', + sizeof(uint32_t) * ((p->objtotal + 31) / 32)); + + /* + * Set all the bits in the bitmap that have + * corresponding buffers to 1 to indicate they are + * free. + */ + for (j = 0; j < p->objtotal; j++) { + if (p->lut[j].vaddr != NULL) { + p->bitmap[ (j>>5) ] |= ( 1 << (j & 31) ); + } + } + } + + /* + * Per netmap_mem_finalize_all(), + * buffers 0 and 1 are reserved + */ + nmd->pools[NETMAP_BUF_POOL].objfree -= 2; + if (nmd->pools[NETMAP_BUF_POOL].bitmap) { + /* XXX This check is a workaround that prevents a + * NULL pointer crash which currently happens only + * with ptnetmap guests. Also, + * netmap_mem_init_shared_info must not be called + * by ptnetmap guest. 
*/ + nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; + + /* expose info to the ptnetmap guest */ + netmap_mem_init_shared_info(nmd); + } + } + nmd->ops->nmd_deref(nmd); + NMA_UNLOCK(nmd); - return nmd->ops->nmd_deref(nmd); } /* accessor functions */ -static void +static int netmap_mem2_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) { lut->lut = nmd->pools[NETMAP_BUF_POOL].lut; lut->objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; lut->objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + + return 0; } -struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, .num = 100, @@ -291,10 +403,10 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; -struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { +static struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, - .num = 1, + .num = 2, }, [NETMAP_RING_POOL] = { .size = 5*PAGE_SIZE, @@ -348,11 +460,12 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ }; -struct netmap_mem_d *netmap_last_mem_d = &nm_mem; +static struct netmap_mem_d *netmap_last_mem_d = &nm_mem; /* blueprint for the private memory allocators */ extern struct netmap_mem_ops netmap_mem_private_ops; /* forward */ -const struct netmap_mem_d nm_blueprint = { +/* XXX clang is not happy about using name as a print format */ +static const struct netmap_mem_d nm_blueprint = { .pools = { [NETMAP_IF_POOL] = { .name = "%s_if", @@ -388,6 +501,8 @@ const struct netmap_mem_d nm_blueprint = { #define DECLARE_SYSCTLS(id, name) \ + SYSBEGIN(mem2_ ## name); \ + SYSCTL_DECL(_dev_netmap); /* leave it here, easier for porting */ \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ @@ -401,22 +516,21 @@ const struct netmap_mem_d nm_blueprint = { "Default size of private netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ - "Default number of private netmap " STRINGIFY(name) "s") + "Default number of private netmap " STRINGIFY(name) "s"); \ + SYSEND -SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +/* call with NMA_LOCK(&nm_mem) held */ static int -nm_mem_assign_id(struct netmap_mem_d *nmd) +nm_mem_assign_id_locked(struct netmap_mem_d *nmd) { nm_memid_t id; struct netmap_mem_d *scan = netmap_last_mem_d; int error = ENOMEM; - NMA_LOCK(&nm_mem); - do { /* we rely on unsigned wrap around */ id = scan->nm_id + 1; @@ -435,10 +549,22 @@ nm_mem_assign_id(struct netmap_mem_d *nmd) } } while (scan != netmap_last_mem_d); - NMA_UNLOCK(&nm_mem); return error; } +/* call with NMA_LOCK(&nm_mem) *not* held */ +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + int ret; + + NMA_LOCK(&nm_mem); + ret = nm_mem_assign_id_locked(nmd); + NMA_UNLOCK(&nm_mem); + 
+ + return ret; +} + static void nm_mem_release_id(struct netmap_mem_d *nmd) { @@ -456,7 +582,7 @@ nm_mem_release_id(struct netmap_mem_d *nmd) } static int -nm_mem_assign_group(struct netmap_mem_d *nmd, device_t dev) +nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev) { int err = 0, id; id = nm_iommu_group_id(dev); @@ -494,8 +620,13 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) if (offset >= p[i].memtotal) continue; // now lookup the cluster's address +#ifndef _WIN32 pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) + offset % p[i]._objsize; +#else + pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr); + pa.QuadPart += offset % p[i]._objsize; +#endif NMA_UNLOCK(nmd); return pa; } @@ -508,7 +639,110 @@ netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) + p[NETMAP_RING_POOL].memtotal + p[NETMAP_BUF_POOL].memtotal); NMA_UNLOCK(nmd); +#ifndef _WIN32 return 0; // XXX bad address +#else + vm_paddr_t res; + res.QuadPart = 0; + return res; +#endif +} + +#ifdef _WIN32 + +/* + * win32_build_user_vm_map + * + * This function gets all the objects that are part of the pools and maps + * a contiguous virtual memory space for the userspace + * It works this way + * 1 - allocate a Memory Descriptor List wide as the sum + * of the memory needed for the pools + * 2 - cycle all the objects in every pool and for every object do + * + * 2a - cycle all the objects in every pool, get the list + * of the physical address descriptors + * 2b - calculate the offset in the array of page descriptors in the + * main MDL + * 2c - copy the descriptors of the object in the main MDL + * + * 3 - return the resulting MDL that needs to be mapped in userland + * + * In this way we will have an MDL that describes all the memory for the + * objects in a single object +*/ + +PMDL +win32_build_user_vm_map(struct netmap_mem_d* nmd) +{ + int i, j; + u_int memsize, memflags, ofs = 0; + PMDL mainMdl, tempMdl; + + if
(netmap_mem_get_info(nmd, &memsize, &memflags, NULL)) { + D("memory not finalised yet"); + return NULL; + } + + mainMdl = IoAllocateMdl(NULL, memsize, FALSE, FALSE, NULL); + if (mainMdl == NULL) { + D("failed to allocate mdl"); + return NULL; + } + + NMA_LOCK(nmd); + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = &nmd->pools[i]; + int clsz = p->_clustsize; + int clobjs = p->_clustentries; /* objects per cluster */ + int mdl_len = sizeof(PFN_NUMBER) * BYTES_TO_PAGES(clsz); + PPFN_NUMBER pSrc, pDst; + + /* each pool has a different cluster size so we need to reallocate */ + tempMdl = IoAllocateMdl(p->lut[0].vaddr, clsz, FALSE, FALSE, NULL); + if (tempMdl == NULL) { + NMA_UNLOCK(nmd); + D("fail to allocate tempMdl"); + IoFreeMdl(mainMdl); + return NULL; + } + pSrc = MmGetMdlPfnArray(tempMdl); + /* create one entry per cluster, the lut[] has one entry per object */ + for (j = 0; j < p->numclusters; j++, ofs += clsz) { + pDst = &MmGetMdlPfnArray(mainMdl)[BYTES_TO_PAGES(ofs)]; + MmInitializeMdl(tempMdl, p->lut[j*clobjs].vaddr, clsz); + MmBuildMdlForNonPagedPool(tempMdl); /* compute physical page addresses */ + RtlCopyMemory(pDst, pSrc, mdl_len); /* copy the page descriptors */ + mainMdl->MdlFlags = tempMdl->MdlFlags; /* XXX what is in here ? */ + } + IoFreeMdl(tempMdl); + } + NMA_UNLOCK(nmd); + return mainMdl; +} + +#endif /* _WIN32 */ + +/* + * helper function for OS-specific mmap routines (currently only windows). + * Given an nmd and a pool index, returns the cluster size and number of clusters. + * Returns 0 if memory is finalised and the pool is valid, otherwise 1. + * It should be called under NMA_LOCK(nmd) otherwise the underlying info can change. 
+ */ + +int +netmap_mem2_get_pool_info(struct netmap_mem_d* nmd, u_int pool, u_int *clustsize, u_int *numclusters) +{ + if (!nmd || !clustsize || !numclusters || pool >= NETMAP_POOLS_NR) + return 1; /* invalid arguments */ + // NMA_LOCK_ASSERT(nmd); + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + *clustsize = *numclusters = 0; + return 1; /* not ready yet */ + } + *clustsize = nmd->pools[pool]._clustsize; + *numclusters = nmd->pools[pool].numclusters; + return 0; /* success */ } static int @@ -578,12 +812,6 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) ((n)->pools[NETMAP_IF_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) -#define netmap_buf_offset(n, v) \ - ((n)->pools[NETMAP_IF_POOL].memtotal + \ - (n)->pools[NETMAP_RING_POOL].memtotal + \ - netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) - - static ssize_t netmap_mem2_if_offset(struct netmap_mem_d *nmd, const void *addr) { @@ -602,7 +830,7 @@ static void * netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ - uint32_t mask, j; /* slot counter */ + uint32_t mask, j = 0; /* slot counter */ void *vaddr = NULL; if (len > p->_objsize) { @@ -636,7 +864,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ if (index) *index = i * 32 + j; } - ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr); + ND("%s allocator: allocated object @ [%d][%d]: vaddr %p",p->name, i, j, vaddr); if (start) *start = i; @@ -733,7 +961,7 @@ netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) *head = cur; /* restore */ break; } - RD(5, "allocate buffer %d -> %d", *head, cur); + ND(5, "allocate buffer %d -> %d", *head, cur); *p = cur; /* link to previous head */ } @@ -750,7 +978,7 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; uint32_t i, cur, *buf; - D("freeing the 
extra list"); + ND("freeing the extra list"); for (i = 0; head >=2 && head < p->objtotal; i++) { cur = head; buf = lut[head].vaddr; @@ -761,7 +989,8 @@ netmap_extra_free(struct netmap_adapter *na, uint32_t head) } if (head != 0) D("breaking with head %d", head); - D("freed %d buffers", i); + if (netmap_verbose) + D("freed %d buffers", i); } @@ -846,7 +1075,6 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) p->bitmap = NULL; if (p->lut) { u_int i; - size_t sz = p->_clustsize; /* * Free each cluster allocated in @@ -856,7 +1084,7 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) */ for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, sz, M_NETMAP); + contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux @@ -973,6 +1201,18 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj return 0; } +static struct lut_entry * +nm_alloc_lut(u_int nobj) +{ + size_t n = sizeof(struct lut_entry) * nobj; + struct lut_entry *lut; +#ifdef linux + lut = vmalloc(n); +#else + lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); +#endif + return lut; +} /* call with NMA_LOCK held */ static int @@ -985,14 +1225,9 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->numclusters = p->_numclusters; p->objtotal = p->_objtotal; - n = sizeof(struct lut_entry) * p->objtotal; -#ifdef linux - p->lut = vmalloc(n); -#else - p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); -#endif + p->lut = nm_alloc_lut(p->objtotal); if (p->lut == NULL) { - D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); + D("Unable to create lookup table for '%s'", p->name); goto clean; } @@ -1015,6 +1250,13 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) int lim = i + p->_clustentries; char *clust; + /* + * XXX Note, we only need contigmalloc() for buffers attached + * to native interfaces. 
In all other cases (nifp, netmap rings + * and even buffers for VALE ports or emulated interfaces) we + * can live with standard malloc, because the hardware will not + * access the pages directly. + */ clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { @@ -1108,10 +1350,15 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) if (na->pdev == NULL) return 0; -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) (void)i; (void)lim; D("unsupported on FreeBSD"); + +#elif defined(_WIN32) + (void)i; + (void)lim; + D("unsupported on Windows"); //XXX_ale, really? #else /* linux */ for (i = 2; i < lim; i++) { netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr); @@ -1124,8 +1371,10 @@ netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) static int netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) { -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) D("unsupported on FreeBSD"); +#elif defined(_WIN32) + D("unsupported on Windows"); //XXX_ale, really? 
#else /* linux */ int i, lim = p->_objtotal; @@ -1141,6 +1390,30 @@ netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) return 0; } +static int +netmap_mem_init_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_shared_info *nms_info; + ssize_t base; + + /* Use the first slot in IF_POOL */ + nms_info = netmap_if_malloc(nmd, sizeof(*nms_info)); + if (nms_info == NULL) { + return ENOMEM; + } + + base = netmap_if_offset(nmd, nms_info); + + memcpy(&nms_info->up, &nms_if_blueprint, sizeof(nms_if_blueprint)); + nms_info->buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + nmd->pools[NETMAP_RING_POOL].memtotal; + nms_info->buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; + nms_info->buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + nms_info->totalsize = nmd->nm_totalsize; + nms_info->features = NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE; + + return 0; +} + static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { @@ -1160,6 +1433,11 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; + /* expose info to the ptnetmap guest */ + nmd->lasterr = netmap_mem_init_shared_info(nmd); + if (nmd->lasterr) + goto error; + if (netmap_verbose) D("interfaces %d KB, rings %d KB, buffers %d MB", nmd->pools[NETMAP_IF_POOL].memtotal >> 10, @@ -1207,10 +1485,9 @@ static int netmap_mem_private_finalize(struct netmap_mem_d *nmd) { int err; - NMA_LOCK(nmd); - nmd->active++; err = netmap_mem_finalize_all(nmd); - NMA_UNLOCK(nmd); + if (!err) + nmd->active++; return err; } @@ -1218,10 +1495,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd) static void netmap_mem_private_deref(struct netmap_mem_d *nmd) { - NMA_LOCK(nmd); if (--nmd->active <= 0) netmap_mem_reset_all(nmd); - NMA_UNLOCK(nmd); } @@ -1238,7 +1513,7 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), - M_DEVBUF, M_NOWAIT | M_ZERO); + 
M_DEVBUF, M_NOWAIT | M_ZERO); if (d == NULL) { err = ENOMEM; goto error; @@ -1357,10 +1632,10 @@ static int netmap_mem_global_finalize(struct netmap_mem_d *nmd) { int err; - + /* update configuration if changed */ if (netmap_mem_global_config(nmd)) - goto out; + return nmd->lasterr; nmd->active++; @@ -1417,13 +1692,17 @@ netmap_free_rings(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < netmap_real_rings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; struct netmap_ring *ring = kring->ring; - if (ring == NULL) + if (ring == NULL || kring->users > 0 || (kring->nr_kflags & NKR_NEEDRING)) { + ND("skipping ring %s (ring %p, users %d)", + kring->name, ring, kring->users); continue; - netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + } + if (i != nma_get_nrings(na, t) || na->na_flags & NAF_HOST_RINGS) + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); netmap_ring_free(na->nm_mem, ring); kring->ring = NULL; } @@ -1452,9 +1731,10 @@ netmap_mem2_rings_create(struct netmap_adapter *na) struct netmap_ring *ring = kring->ring; u_int len, ndesc; - if (ring) { - ND("%s already created", kring->name); - continue; /* already created by somebody else */ + if (ring || (!kring->users && !(kring->nr_kflags & NKR_NEEDRING))) { + /* uneeded, or already created by somebody else */ + ND("skipping ring %s", kring->name); + continue; } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + @@ -1569,10 +1849,22 @@ netmap_mem2_if_new(struct netmap_adapter *na) */ base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < n[NR_TX]; i++) { + if (na->tx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? 
+ *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < n[NR_RX]; i++) { + if (na->rx_rings[i].ring == NULL) { + // XXX maybe use the offset of an error ring, + // like we do for buffers? + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = 0; + continue; + } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } @@ -1636,3 +1928,531 @@ struct netmap_mem_ops netmap_mem_private_ops = { .nmd_rings_create = netmap_mem2_rings_create, .nmd_rings_delete = netmap_mem2_rings_delete }; + +#ifdef WITH_PTNETMAP_GUEST +struct mem_pt_if { + struct mem_pt_if *next; + struct ifnet *ifp; + unsigned int nifp_offset; + nm_pt_guest_ptctl_t ptctl; +}; + +/* Netmap allocator for ptnetmap guests. */ +struct netmap_mem_ptg { + struct netmap_mem_d up; + + vm_paddr_t nm_paddr; /* physical address in the guest */ + void *nm_addr; /* virtual address in the guest */ + struct netmap_lut buf_lut; /* lookup table for BUF pool in the guest */ + nm_memid_t nm_host_id; /* allocator identifier in the host */ + struct ptnetmap_memdev *ptn_dev; + struct mem_pt_if *pt_ifs; /* list of interfaces in passthrough */ +}; + +/* Link a passthrough interface to a passthrough netmap allocator. 
*/ +static int +netmap_mem_pt_guest_ifp_add(struct netmap_mem_d *nmd, struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *ptif = malloc(sizeof(*ptif), M_NETMAP, + M_NOWAIT | M_ZERO); + + if (!ptif) { + return ENOMEM; + } + + NMA_LOCK(nmd); + + ptif->ifp = ifp; + ptif->nifp_offset = nifp_offset; + ptif->ptctl = ptctl; + + if (ptnmd->pt_ifs) { + ptif->next = ptnmd->pt_ifs; + } + ptnmd->pt_ifs = ptif; + + NMA_UNLOCK(nmd); + + D("added (ifp=%p,nifp_offset=%u)", ptif->ifp, ptif->nifp_offset); + + return 0; +} + +/* Called with NMA_LOCK(nmd) held. */ +static struct mem_pt_if * +netmap_mem_pt_guest_ifp_lookup(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *curr; + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + return curr; + } + } + + return NULL; +} + +/* Unlink a passthrough interface from a passthrough netmap allocator. 
*/ +int +netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *nmd, struct ifnet *ifp) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct mem_pt_if *prev = NULL; + struct mem_pt_if *curr; + int ret = -1; + + NMA_LOCK(nmd); + + for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { + if (curr->ifp == ifp) { + if (prev) { + prev->next = curr->next; + } else { + ptnmd->pt_ifs = curr->next; + } + D("removed (ifp=%p,nifp_offset=%u)", + curr->ifp, curr->nifp_offset); + free(curr, M_NETMAP); + ret = 0; + break; + } + prev = curr; + } + + NMA_UNLOCK(nmd); + + return ret; +} + +/* Read allocator info from the first netmap_if (only on finalize) */ +static int +netmap_mem_pt_guest_read_shared_info(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + struct netmap_mem_shared_info *nms_info; + uint32_t bufsize; + uint32_t nbuffers; + char *vaddr; + vm_paddr_t paddr; + int i; + + nms_info = (struct netmap_mem_shared_info *)ptnmd->nm_addr; + if (strncmp(nms_info->up.ni_name, NMS_NAME, sizeof(NMS_NAME)) != 0) { + D("error, the first slot does not contain shared info"); + return EINVAL; + } + /* check features mem_shared info */ + if ((nms_info->features & (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) != + (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) { + D("error, the shared info does not contain BUF_POOL and MEMSIZE"); + return EINVAL; + } + + bufsize = nms_info->buf_pool_objsize; + nbuffers = nms_info->buf_pool_objtotal; + + /* allocate the lut */ + if (ptnmd->buf_lut.lut == NULL) { + D("allocating lut"); + ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers); + if (ptnmd->buf_lut.lut == NULL) { + D("lut allocation failed"); + return ENOMEM; + } + } + + /* we have physically contiguous memory mapped through PCI BAR */ + vaddr = (char *)(ptnmd->nm_addr) + nms_info->buf_pool_offset; + paddr = ptnmd->nm_paddr + nms_info->buf_pool_offset; + + for (i = 0; i < nbuffers; i++) { + ptnmd->buf_lut.lut[i].vaddr = vaddr; + ptnmd->buf_lut.lut[i].paddr = 
paddr; + vaddr += bufsize; + paddr += bufsize; + } + + ptnmd->buf_lut.objtotal = nbuffers; + ptnmd->buf_lut.objsize = bufsize; + + nmd->nm_totalsize = nms_info->totalsize; + + return 0; +} + +static int +netmap_mem_pt_guest_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { + return EINVAL; + } + + *lut = ptnmd->buf_lut; + return 0; +} + +static int +netmap_mem_pt_guest_get_info(struct netmap_mem_d *nmd, u_int *size, + u_int *memflags, uint16_t *id) +{ + int error = 0; + + NMA_LOCK(nmd); + + error = nmd->ops->nmd_config(nmd); + if (error) + goto out; + + if (size) + *size = nmd->nm_totalsize; + if (memflags) + *memflags = nmd->flags; + if (id) + *id = nmd->nm_id; + +out: + NMA_UNLOCK(nmd); + + return error; +} + +static vm_paddr_t +netmap_mem_pt_guest_ofstophys(struct netmap_mem_d *nmd, vm_ooffset_t off) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + vm_paddr_t paddr; + /* if the offset is valid, just return csb->base_addr + off */ + paddr = (vm_paddr_t)(ptnmd->nm_paddr + off); + ND("off %lx padr %lx", off, (unsigned long)paddr); + return paddr; +} + +static int +netmap_mem_pt_guest_config(struct netmap_mem_d *nmd) +{ + /* nothing to do, we are configured on creation + * and configuration never changes thereafter + */ + return 0; +} + +static int +netmap_mem_pt_guest_finalize(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + int error = 0; + + nmd->active++; + + if (nmd->flags & NETMAP_MEM_FINALIZED) + goto out; + + if (ptnmd->ptn_dev == NULL) { + D("ptnetmap memdev not attached"); + error = ENOMEM; + goto err; + } + /* map memory through ptnetmap-memdev BAR */ + error = nm_os_pt_memdev_iomap(ptnmd->ptn_dev, &ptnmd->nm_paddr, + &ptnmd->nm_addr); + if (error) + goto err; + + /* read allcator info and create lut */ + error = netmap_mem_pt_guest_read_shared_info(nmd); + if (error) + 
goto err; + + nmd->flags |= NETMAP_MEM_FINALIZED; +out: + return 0; +err: + nmd->active--; + return error; +} + +static void +netmap_mem_pt_guest_deref(struct netmap_mem_d *nmd) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + nmd->active--; + if (nmd->active <= 0 && + (nmd->flags & NETMAP_MEM_FINALIZED)) { + nmd->flags &= ~NETMAP_MEM_FINALIZED; + /* unmap ptnetmap-memdev memory */ + if (ptnmd->ptn_dev) { + nm_os_pt_memdev_iounmap(ptnmd->ptn_dev); + } + ptnmd->nm_addr = 0; + ptnmd->nm_paddr = 0; + } +} + +static ssize_t +netmap_mem_pt_guest_if_offset(struct netmap_mem_d *nmd, const void *vaddr) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + + return (const char *)(vaddr) - (char *)(ptnmd->nm_addr); +} + +static void +netmap_mem_pt_guest_delete(struct netmap_mem_d *nmd) +{ + if (nmd == NULL) + return; + if (netmap_verbose) + D("deleting %p", nmd); + if (nmd->active > 0) + D("bug: deleting mem allocator with active=%d!", nmd->active); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); + NMA_LOCK_DESTROY(nmd); + free(nmd, M_DEVBUF); +} + +static struct netmap_if * +netmap_mem_pt_guest_if_new(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp = NULL; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + nifp = (struct netmap_if *)((char *)(ptnmd->nm_addr) + + ptif->nifp_offset); + NMA_UNLOCK(na->nm_mem); +out: + return nifp; +} + +static void +netmap_mem_pt_guest_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) +{ + struct mem_pt_if *ptif; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + ptif->ptctl(na->ifp, 
PTNETMAP_PTCTL_IFDELETE); +out: + NMA_UNLOCK(na->nm_mem); +} + +static int +netmap_mem_pt_guest_rings_create(struct netmap_adapter *na) +{ + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif; + struct netmap_if *nifp; + int i, error = -1; + + NMA_LOCK(na->nm_mem); + + ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); + if (ptif == NULL) { + D("Error: interface %p is not in passthrough", na->ifp); + goto out; + } + + + /* point each kring to the corresponding backend ring */ + nifp = (struct netmap_if *)((char *)ptnmd->nm_addr + ptif->nifp_offset); + for (i = 0; i <= na->num_tx_rings; i++) { + struct netmap_kring *kring = na->tx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + nifp->ring_ofs[i]); + } + for (i = 0; i <= na->num_rx_rings; i++) { + struct netmap_kring *kring = na->rx_rings + i; + if (kring->ring) + continue; + kring->ring = (struct netmap_ring *) + ((char *)nifp + + nifp->ring_ofs[i + na->num_tx_rings + 1]); + } + + //error = ptif->ptctl->nm_ptctl(ifp, PTNETMAP_PTCTL_RINGSCREATE); + error = 0; +out: + NMA_UNLOCK(na->nm_mem); + + return error; +} + +static void +netmap_mem_pt_guest_rings_delete(struct netmap_adapter *na) +{ + /* TODO: remove?? 
*/ +#if 0 + struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; + struct mem_pt_if *ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, + na->ifp); +#endif +} + +static struct netmap_mem_ops netmap_mem_pt_guest_ops = { + .nmd_get_lut = netmap_mem_pt_guest_get_lut, + .nmd_get_info = netmap_mem_pt_guest_get_info, + .nmd_ofstophys = netmap_mem_pt_guest_ofstophys, + .nmd_config = netmap_mem_pt_guest_config, + .nmd_finalize = netmap_mem_pt_guest_finalize, + .nmd_deref = netmap_mem_pt_guest_deref, + .nmd_if_offset = netmap_mem_pt_guest_if_offset, + .nmd_delete = netmap_mem_pt_guest_delete, + .nmd_if_new = netmap_mem_pt_guest_if_new, + .nmd_if_delete = netmap_mem_pt_guest_if_delete, + .nmd_rings_create = netmap_mem_pt_guest_rings_create, + .nmd_rings_delete = netmap_mem_pt_guest_rings_delete +}; + +/* Called with NMA_LOCK(&nm_mem) held. */ +static struct netmap_mem_d * +netmap_mem_pt_guest_find_hostid(nm_memid_t host_id) +{ + struct netmap_mem_d *mem = NULL; + struct netmap_mem_d *scan = netmap_last_mem_d; + + do { + /* find ptnetmap allocator through host ID */ + if (scan->ops->nmd_deref == netmap_mem_pt_guest_deref && + ((struct netmap_mem_ptg *)(scan))->nm_host_id == host_id) { + mem = scan; + break; + } + scan = scan->next; + } while (scan != netmap_last_mem_d); + + return mem; +} + +/* Called with NMA_LOCK(&nm_mem) held. 
*/ +static struct netmap_mem_d * +netmap_mem_pt_guest_create(nm_memid_t host_id) +{ + struct netmap_mem_ptg *ptnmd; + int err = 0; + + ptnmd = malloc(sizeof(struct netmap_mem_ptg), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (ptnmd == NULL) { + err = ENOMEM; + goto error; + } + + ptnmd->up.ops = &netmap_mem_pt_guest_ops; + ptnmd->nm_host_id = host_id; + ptnmd->pt_ifs = NULL; + + /* Assign new id in the guest (We have the lock) */ + err = nm_mem_assign_id_locked(&ptnmd->up); + if (err) + goto error; + + ptnmd->up.flags &= ~NETMAP_MEM_FINALIZED; + ptnmd->up.flags |= NETMAP_MEM_IO; + + NMA_LOCK_INIT(&ptnmd->up); + + return &ptnmd->up; +error: + netmap_mem_pt_guest_delete(&ptnmd->up); + return NULL; +} + +/* + * find host id in guest allocators and create guest allocator + * if it is not there + */ +static struct netmap_mem_d * +netmap_mem_pt_guest_get(nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + + NMA_LOCK(&nm_mem); + nmd = netmap_mem_pt_guest_find_hostid(host_id); + if (nmd == NULL) { + nmd = netmap_mem_pt_guest_create(host_id); + } + NMA_UNLOCK(&nm_mem); + + return nmd; +} + +/* + * The guest allocator can be created by ptnetmap_memdev (during the device + * attach) or by ptnetmap device (e1000/virtio), during the netmap_attach. + * + * The order is not important (we have different order in LINUX and FreeBSD). + * The first one, creates the device, and the second one simply attaches it. 
+ */ + +/* Called when ptnetmap_memdev is attaching, to attach a new allocator in + * the guest */ +struct netmap_mem_d * +netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t host_id) +{ + struct netmap_mem_d *nmd; + struct netmap_mem_ptg *ptnmd; + + nmd = netmap_mem_pt_guest_get(host_id); + + /* assign this device to the guest allocator */ + if (nmd) { + ptnmd = (struct netmap_mem_ptg *)nmd; + ptnmd->ptn_dev = ptn_dev; + } + + return nmd; +} + +/* Called when ptnetmap device (virtio/e1000) is attaching */ +struct netmap_mem_d * +netmap_mem_pt_guest_new(struct ifnet *ifp, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t ptctl) +{ + struct netmap_mem_d *nmd; + nm_memid_t host_id; + + if (ifp == NULL || ptctl == NULL) { + return NULL; + } + + /* Get the host id allocator. */ + host_id = ptctl(ifp, PTNETMAP_PTCTL_HOSTMEMID); + + nmd = netmap_mem_pt_guest_get(host_id); + + if (nmd) { + netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset, + ptctl); + } + + return nmd; +} + +#endif /* WITH_PTNETMAP_GUEST */ diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index ef0ff96d8e7f3..7f4c5e9e96244 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,8 @@ /* - * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi + * Copyright (C) 2012-2016 Luigi Rizzo + * Copyright (C) 2012-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -117,8 +120,11 @@ extern struct netmap_mem_d nm_mem; -void netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); +int netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); +#ifdef _WIN32 +PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd); +#endif int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_init(void); void netmap_mem_fini(void); @@ -127,6 +133,7 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *); +int netmap_mem2_get_pool_info(struct netmap_mem_d *, u_int, u_int *, u_int *); int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, @@ -157,6 +164,15 @@ void netmap_mem_put(struct netmap_mem_d *); #endif /* !NM_DEBUG_PUTGET */ +#ifdef WITH_PTNETMAP_GUEST +struct netmap_mem_d* netmap_mem_pt_guest_new(struct ifnet *, + unsigned int nifp_offset, + nm_pt_guest_ptctl_t); +struct ptnetmap_memdev; +struct netmap_mem_d* netmap_mem_pt_guest_attach(struct ptnetmap_memdev *, uint16_t); +int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *, struct ifnet *); +#endif /* WITH_PTNETMAP_GUEST */ + #define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */ #define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */ diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c index c303952417ff7..5b4f9cdf61c0b 100644 --- a/sys/dev/netmap/netmap_monitor.c +++ b/sys/dev/netmap/netmap_monitor.c @@ -1,5 
+1,6 @@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,8 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" #else #error Unsupported platform @@ -151,13 +154,17 @@ netmap_monitor_rxsync(struct netmap_kring *kring, int flags) } /* nm_krings_create callbacks for monitors. - * We could use the default netmap_hw_krings_zmon, but - * we don't need the mbq. */ static int netmap_monitor_krings_create(struct netmap_adapter *na) { - return netmap_krings_create(na, 0); + int error = netmap_krings_create(na, 0); + if (error) + return error; + /* override the host rings callbacks */ + na->tx_rings[na->num_tx_rings].nm_sync = netmap_monitor_txsync; + na->rx_rings[na->num_rx_rings].nm_sync = netmap_monitor_rxsync; + return 0; } /* nm_krings_delete callback for monitors */ @@ -186,7 +193,11 @@ nm_monitor_alloc(struct netmap_kring *kring, u_int n) return 0; len = sizeof(struct netmap_kring *) * n; +#ifndef _WIN32 nm = realloc(kring->monitors, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + nm = realloc(kring->monitors, len, sizeof(struct netmap_kring *)*kring->max_monitors); +#endif if (nm == NULL) return ENOMEM; @@ -229,10 +240,10 @@ static int netmap_monitor_parent_notify(struct netmap_kring *, int); static int netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int zcopy) { - int error = 0; + int error = NM_IRQ_COMPLETED; /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); /* make sure the monitor array exists and is big enough */ error = nm_monitor_alloc(kring, kring->n_monitors + 1); if (error) @@ -242,7 +253,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int 
kring->n_monitors++; if (kring->n_monitors == 1) { /* this is the first monitor, intercept callbacks */ - D("%s: intercept callbacks on %s", mkring->name, kring->name); + ND("%s: intercept callbacks on %s", mkring->name, kring->name); kring->mon_sync = kring->nm_sync; /* zcopy monitors do not override nm_notify(), but * we save the original one regardless, so that @@ -265,7 +276,7 @@ netmap_monitor_add(struct netmap_kring *mkring, struct netmap_kring *kring, int } out: - nm_kr_put(kring); + nm_kr_start(kring); return error; } @@ -277,7 +288,7 @@ static void netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) { /* sinchronize with concurrently running nm_sync()s */ - nm_kr_get(kring); + nm_kr_stop(kring, NM_KR_LOCKED); kring->n_monitors--; if (mkring->mon_pos != kring->n_monitors) { kring->monitors[mkring->mon_pos] = kring->monitors[kring->n_monitors]; @@ -286,18 +297,18 @@ netmap_monitor_del(struct netmap_kring *mkring, struct netmap_kring *kring) kring->monitors[kring->n_monitors] = NULL; if (kring->n_monitors == 0) { /* this was the last monitor, restore callbacks and delete monitor array */ - D("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); + ND("%s: restoring sync on %s: %p", mkring->name, kring->name, kring->mon_sync); kring->nm_sync = kring->mon_sync; kring->mon_sync = NULL; if (kring->tx == NR_RX) { - D("%s: restoring notify on %s: %p", + ND("%s: restoring notify on %s: %p", mkring->name, kring->name, kring->mon_notify); kring->nm_notify = kring->mon_notify; kring->mon_notify = NULL; } nm_monitor_dealloc(kring); } - nm_kr_put(kring); + nm_kr_start(kring); } @@ -316,7 +327,7 @@ netmap_monitor_stop(struct netmap_adapter *na) for_rx_tx(t) { u_int i; - for (i = 0; i < nma_get_nrings(na, t); i++) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; u_int j; @@ -360,23 +371,32 @@ netmap_monitor_reg_common(struct netmap_adapter *na, int onoff, int zmon) for (i = 
priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_add(mkring, kring, zmon); + if (nm_kring_pending_on(mkring)) { + netmap_monitor_add(mkring, kring, zmon); + mkring->nr_mode = NKR_NETMAP_ON; + } } } } na->na_flags |= NAF_NETMAP_ON; } else { - if (pna == NULL) { - D("%s: parent left netmap mode, nothing to restore", na->name); - return 0; - } - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; for_rx_tx(t) { if (mna->flags & nm_txrx2flag(t)) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { - kring = &NMR(pna, t)[i]; mkring = &na->rx_rings[i]; - netmap_monitor_del(mkring, kring); + if (nm_kring_pending_off(mkring)) { + mkring->nr_mode = NKR_NETMAP_OFF; + /* we cannot access the parent krings if the parent + * has left netmap mode. This is signaled by a NULL + * pna pointer + */ + if (pna) { + kring = &NMR(pna, t)[i]; + netmap_monitor_del(mkring, kring); + } + } } } } @@ -652,17 +672,27 @@ netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags) static int netmap_monitor_parent_notify(struct netmap_kring *kring, int flags) { + int (*notify)(struct netmap_kring*, int); ND(5, "%s %x", kring->name, flags); /* ?xsync callbacks have tryget called by their callers * (NIOCREGIF and poll()), but here we have to call it * by ourself */ - if (nm_kr_tryget(kring)) - goto out; - netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + if (nm_kr_tryget(kring, 0, NULL)) { + /* in all cases, just skip the sync */ + return NM_IRQ_COMPLETED; + } + if (kring->n_monitors > 0) { + netmap_monitor_parent_rxsync(kring, NAF_FORCE_READ); + notify = kring->mon_notify; + } else { + /* we are no longer monitoring this ring, so both + * mon_sync and mon_notify are NULL + */ + notify = kring->nm_notify; + } nm_kr_put(kring); -out: - return kring->mon_notify(kring, flags); + return notify(kring, flags); } @@ -691,18 +721,25 @@ netmap_get_monitor_na(struct nmreq *nmr, 
struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_monitor_adapter *mna; + struct ifnet *ifp = NULL; int i, error; enum txrx t; int zcopy = (nmr->nr_flags & NR_ZCOPY_MON); char monsuff[10] = ""; if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) { + if (nmr->nr_flags & NR_ZCOPY_MON) { + /* the flag makes no sense unless you are + * creating a monitor + */ + return EINVAL; + } ND("not a monitor"); return 0; } /* this is a request for a monitor adapter */ - D("flags %x", nmr->nr_flags); + ND("flags %x", nmr->nr_flags); mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); if (mna == NULL) { @@ -716,13 +753,14 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * except other monitors. */ memcpy(&pnmr, nmr, sizeof(pnmr)); - pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX); - error = netmap_get_na(&pnmr, &pna, create); + pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX | NR_ZCOPY_MON); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { D("parent lookup failed: %d", error); + free(mna, M_DEVBUF); return error; } - D("found parent: %s", pna->name); + ND("found parent: %s", pna->name); if (!nm_netmap_on(pna)) { /* parent not in netmap mode */ @@ -829,19 +867,17 @@ netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) *na = &mna->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = mna->up.num_tx_rings; - nmr->nr_rx_rings = mna->up.num_rx_rings; - nmr->nr_tx_slots = mna->up.num_tx_desc; - nmr->nr_rx_slots = mna->up.num_rx_desc; - /* keep the reference to the parent */ - D("monitor ok"); + ND("monitor ok"); + + /* drop the reference to the ifp, if any */ + if (ifp) + if_rele(ifp); return 0; put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); free(mna, M_DEVBUF); return error; } diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c index 
dadc1dcbc14cc..f8da672ffa53c 100644 --- a/sys/dev/netmap/netmap_offloadings.c +++ b/sys/dev/netmap/netmap_offloadings.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2014-2015 Vincenzo Maffione + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,9 +32,9 @@ #include #include #include /* defines used in kernel.h */ -#include /* types used in module initialization */ #include /* types used in module initialization */ #include +#include #include /* struct socket */ #include /* sockaddrs */ #include @@ -64,21 +65,21 @@ /* This routine is called by bdg_mismatch_datapath() when it finishes * accumulating bytes for a segment, in order to fix some fields in the * segment headers (which still contain the same content as the header - * of the original GSO packet). 'buf' points to the beginning (e.g. - * the ethernet header) of the segment, and 'len' is its length. + * of the original GSO packet). 'pkt' points to the beginning of the IP + * header of the segment, while 'len' is the length of the IP packet. */ -static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, - u_int segmented_bytes, u_int last_segment, - u_int tcp, u_int iphlen) +static void +gso_fix_segment(uint8_t *pkt, size_t len, u_int ipv4, u_int iphlen, u_int tcp, + u_int idx, u_int segmented_bytes, u_int last_segment) { - struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); - struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + struct nm_iphdr *iph = (struct nm_iphdr *)(pkt); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(pkt); uint16_t *check = NULL; uint8_t *check_data = NULL; - if (iphlen == 20) { + if (ipv4) { /* Set the IPv4 "Total Length" field. */ - iph->tot_len = htobe16(len-14); + iph->tot_len = htobe16(len); ND("ip total length %u", be16toh(ip->tot_len)); /* Set the IPv4 "Identification" field. 
*/ @@ -87,15 +88,15 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert the IPv4 header checksum. */ iph->check = 0; - iph->check = nm_csum_ipv4(iph); + iph->check = nm_os_csum_ipv4(iph); ND("IP csum %x", be16toh(iph->check)); - } else {/* if (iphlen == 40) */ + } else { /* Set the IPv6 "Payload Len" field. */ - ip6h->payload_len = htobe16(len-14-iphlen); + ip6h->payload_len = htobe16(len-iphlen); } if (tcp) { - struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(pkt + iphlen); /* Set the TCP sequence number. */ tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); @@ -110,10 +111,10 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, check = &tcph->check; check_data = (uint8_t *)tcph; } else { /* UDP */ - struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + struct nm_udphdr *udph = (struct nm_udphdr *)(pkt + iphlen); /* Set the UDP 'Length' field. */ - udph->len = htobe16(len-14-iphlen); + udph->len = htobe16(len-iphlen); check = &udph->check; check_data = (uint8_t *)udph; @@ -121,48 +122,80 @@ static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, /* Compute and insert TCP/UDP checksum. 
*/ *check = 0; - if (iphlen == 20) - nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + if (ipv4) + nm_os_csum_tcpudp_ipv4(iph, check_data, len-iphlen, check); else - nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + nm_os_csum_tcpudp_ipv6(ip6h, check_data, len-iphlen, check); ND("TCP/UDP csum %x", be16toh(*check)); } +static int +vnet_hdr_is_bad(struct nm_vnet_hdr *vh) +{ + uint8_t gso_type = vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + + return ( + (gso_type != VIRTIO_NET_HDR_GSO_NONE && + gso_type != VIRTIO_NET_HDR_GSO_TCPV4 && + gso_type != VIRTIO_NET_HDR_GSO_UDP && + gso_type != VIRTIO_NET_HDR_GSO_TCPV6) + || + (vh->flags & ~(VIRTIO_NET_HDR_F_NEEDS_CSUM + | VIRTIO_NET_HDR_F_DATA_VALID)) + ); +} /* The VALE mismatch datapath implementation. */ -void bdg_mismatch_datapath(struct netmap_vp_adapter *na, - struct netmap_vp_adapter *dst_na, - struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, - u_int *j, u_int lim, u_int *howmany) +void +bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + const struct nm_bdg_fwd *ft_p, + struct netmap_ring *dst_ring, + u_int *j, u_int lim, u_int *howmany) { - struct netmap_slot *slot = NULL; + struct netmap_slot *dst_slot = NULL; struct nm_vnet_hdr *vh = NULL; - /* Number of source slots to process. */ - u_int frags = ft_p->ft_frags; - struct nm_bdg_fwd *ft_end = ft_p + frags; + const struct nm_bdg_fwd *ft_end = ft_p + ft_p->ft_frags; /* Source and destination pointers. */ uint8_t *dst, *src; size_t src_len, dst_len; + /* Indices and counters for the destination ring. */ u_int j_start = *j; + u_int j_cur = j_start; u_int dst_slots = 0; - /* If the source port uses the offloadings, while destination doesn't, - * we grab the source virtio-net header and do the offloadings here. 
- */ - if (na->virt_hdr_len && !dst_na->virt_hdr_len) { - vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + if (unlikely(ft_p == ft_end)) { + RD(3, "No source slots to process"); + return; } /* Init source and dest pointers. */ src = ft_p->ft_buf; src_len = ft_p->ft_len; - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); dst_len = src_len; + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->up.virt_hdr_len && !dst_na->up.virt_hdr_len) { + vh = (struct nm_vnet_hdr *)src; + /* Initial sanity check on the source virtio-net header. If + * something seems wrong, just drop the packet. */ + if (src_len < na->up.virt_hdr_len) { + RD(3, "Short src vnet header, dropping"); + return; + } + if (vnet_hdr_is_bad(vh)) { + RD(3, "Bad src vnet header, dropping"); + return; + } + } + /* We are processing the first input slot and there is a mismatch * between source and destination virt_hdr_len (SHL and DHL). * When the a client is using virtio-net headers, the header length @@ -185,14 +218,14 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, * 12 | 0 | doesn't exist * 12 | 10 | copied from the first 10 bytes of source header */ - bzero(dst, dst_na->virt_hdr_len); - if (na->virt_hdr_len && dst_na->virt_hdr_len) + bzero(dst, dst_na->up.virt_hdr_len); + if (na->up.virt_hdr_len && dst_na->up.virt_hdr_len) memcpy(dst, src, sizeof(struct nm_vnet_hdr)); /* Skip the virtio-net headers. */ - src += na->virt_hdr_len; - src_len -= na->virt_hdr_len; - dst += dst_na->virt_hdr_len; - dst_len = dst_na->virt_hdr_len + src_len; + src += na->up.virt_hdr_len; + src_len -= na->up.virt_hdr_len; + dst += dst_na->up.virt_hdr_len; + dst_len = dst_na->up.virt_hdr_len + src_len; /* Here it could be dst_len == 0 (which implies src_len == 0), * so we avoid passing a zero length fragment. 
@@ -214,16 +247,27 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, u_int gso_idx = 0; /* Payload data bytes segmented so far (e.g. TCP data bytes). */ u_int segmented_bytes = 0; + /* Is this an IPv4 or IPv6 GSO packet? */ + u_int ipv4 = 0; /* Length of the IP header (20 if IPv4, 40 if IPv6). */ u_int iphlen = 0; + /* Length of the Ethernet header (18 if 802.1q, otherwise 14). */ + u_int ethhlen = 14; /* Is this a TCP or an UDP GSO packet? */ u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; /* Segment the GSO packet contained into the input slots (frags). */ - while (ft_p != ft_end) { + for (;;) { size_t copy; + if (dst_slots >= *howmany) { + /* We still have work to do, but we've run out of + * dst slots, so we have to drop the packet. */ + RD(3, "Not enough slots, dropping GSO packet"); + return; + } + /* Grab the GSO header if we don't have it. */ if (!gso_hdr) { uint16_t ethertype; @@ -231,28 +275,75 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, gso_hdr = src; /* Look at the 'Ethertype' field to see if this packet - * is IPv4 or IPv6. - */ - ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); - if (ethertype == 0x0800) - iphlen = 20; - else /* if (ethertype == 0x86DD) */ - iphlen = 40; + * is IPv4 or IPv6, taking into account VLAN + * encapsulation. 
*/ + for (;;) { + if (src_len < ethhlen) { + RD(3, "Short GSO fragment [eth], dropping"); + return; + } + ethertype = be16toh(*((uint16_t *) + (gso_hdr + ethhlen - 2))); + if (ethertype != 0x8100) /* not 802.1q */ + break; + ethhlen += 4; + } + switch (ethertype) { + case 0x0800: /* IPv4 */ + { + struct nm_iphdr *iph = (struct nm_iphdr *) + (gso_hdr + ethhlen); + + if (src_len < ethhlen + 20) { + RD(3, "Short GSO fragment " + "[IPv4], dropping"); + return; + } + ipv4 = 1; + iphlen = 4 * (iph->version_ihl & 0x0F); + break; + } + case 0x86DD: /* IPv6 */ + ipv4 = 0; + iphlen = 40; + break; + default: + RD(3, "Unsupported ethertype, " + "dropping GSO packet"); + return; + } ND(3, "type=%04x", ethertype); + if (src_len < ethhlen + iphlen) { + RD(3, "Short GSO fragment [IP], dropping"); + return; + } + /* Compute gso_hdr_len. For TCP we need to read the * content of the 'Data Offset' field. */ if (tcp) { - struct nm_tcphdr *tcph = - (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + struct nm_tcphdr *tcph = (struct nm_tcphdr *) + (gso_hdr + ethhlen + iphlen); + + if (src_len < ethhlen + iphlen + 20) { + RD(3, "Short GSO fragment " + "[TCP], dropping"); + return; + } + gso_hdr_len = ethhlen + iphlen + + 4 * (tcph->doff >> 4); + } else { + gso_hdr_len = ethhlen + iphlen + 8; /* UDP */ + } - gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); - } else - gso_hdr_len = 14 + iphlen + 8; /* UDP */ + if (src_len < gso_hdr_len) { + RD(3, "Short GSO fragment [TCP/UDP], dropping"); + return; + } ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, - dst_na->mfs); + dst_na->mfs); /* Advance source pointers. */ src += gso_hdr_len; @@ -263,7 +354,6 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, break; src = ft_p->ft_buf; src_len = ft_p->ft_len; - continue; } } @@ -289,25 +379,24 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* After raw segmentation, we must fix some header * fields and compute checksums, in a protocol dependent * way. 
*/ - gso_fix_segment(dst, gso_bytes, gso_idx, - segmented_bytes, - src_len == 0 && ft_p + 1 == ft_end, - tcp, iphlen); + gso_fix_segment(dst + ethhlen, gso_bytes - ethhlen, + ipv4, iphlen, tcp, + gso_idx, segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end); ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); - slot->len = gso_bytes; - slot->flags = 0; - segmented_bytes += gso_bytes - gso_hdr_len; - + dst_slot->len = gso_bytes; + dst_slot->flags = 0; dst_slots++; - - /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + segmented_bytes += gso_bytes - gso_hdr_len; gso_bytes = 0; gso_idx++; + + /* Next destination slot. */ + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); } /* Next input slot. */ @@ -342,10 +431,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* Init/update the packet checksum if needed. */ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { if (!dst_slots) - csum = nm_csum_raw(src + vh->csum_start, + csum = nm_os_csum_raw(src + vh->csum_start, src_len - vh->csum_start, 0); else - csum = nm_csum_raw(src, src_len, csum); + csum = nm_os_csum_raw(src, src_len, csum); } /* Round to a multiple of 64 */ @@ -359,44 +448,43 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, } else { memcpy(dst, src, (int)src_len); } - slot->len = dst_len; - + dst_slot->len = dst_len; dst_slots++; /* Next destination slot. */ - *j = nm_next(*j, lim); - slot = &ring->slot[*j]; - dst = NMB(&dst_na->up, slot); + j_cur = nm_next(j_cur, lim); + dst_slot = &dst_ring->slot[j_cur]; + dst = NMB(&dst_na->up, dst_slot); /* Next source slot. */ ft_p++; src = ft_p->ft_buf; dst_len = src_len = ft_p->ft_len; - } /* Finalize (fold) the checksum if needed. 
*/ if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { - *check = nm_csum_fold(csum); + *check = nm_os_csum_fold(csum); } ND(3, "using %u dst_slots", dst_slots); - /* A second pass on the desitations slots to set the slot flags, + /* A second pass on the destination slots to set the slot flags, * using the right number of destination slots. */ - while (j_start != *j) { - slot = &ring->slot[j_start]; - slot->flags = (dst_slots << 8)| NS_MOREFRAG; + while (j_start != j_cur) { + dst_slot = &dst_ring->slot[j_start]; + dst_slot->flags = (dst_slots << 8)| NS_MOREFRAG; j_start = nm_next(j_start, lim); } /* Clear NS_MOREFRAG flag on last entry. */ - slot->flags = (dst_slots << 8); + dst_slot->flags = (dst_slots << 8); } - /* Update howmany. */ + /* Update howmany and j. This is to commit the use of + * those slots in the destination ring. */ if (unlikely(dst_slots > *howmany)) { - dst_slots = *howmany; - D("Slot allocation error: Should never happen"); + D("Slot allocation error: This is a bug"); } + *j = j_cur; *howmany -= dst_slots; } diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c index 67e840248c884..f0f1b524300a1 100644 --- a/sys/dev/netmap/netmap_pipe.c +++ b/sys/dev/netmap/netmap_pipe.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2014-2016 Giuseppe Lettieri + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -54,6 +55,9 @@ #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -72,9 +76,11 @@ #define NM_PIPE_MAXSLOTS 4096 -int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +static int netmap_default_pipes = 0; /* ignored, kept for compatibility */ +SYSBEGIN(vars_pipes); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); +SYSEND; /* allocate the pipe array in the parent adapter */ static int @@ -91,7 +97,11 @@ nm_pipe_alloc(struct netmap_adapter *na, u_int npipes) return EINVAL; len = sizeof(struct netmap_pipe_adapter *) * npipes; +#ifndef _WIN32 npa = realloc(na->na_pipes, len, M_DEVBUF, M_NOWAIT | M_ZERO); +#else + npa = realloc(na->na_pipes, len, sizeof(struct netmap_pipe_adapter *)*na->na_max_pipes); +#endif if (npa == NULL) return ENOMEM; @@ -199,7 +209,7 @@ netmap_pipe_txsync(struct netmap_kring *txkring, int flags) } while (limit-- > 0) { - struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *rs = &rxkring->ring->slot[j]; struct netmap_slot *ts = &txkring->ring->slot[k]; struct netmap_slot tmp; @@ -295,7 +305,7 @@ netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags) * usr1 --> e1 --> e2 * * and we are e2. e1 is certainly registered and our - * krings already exist, but they may be hidden. + * krings already exist. Nothing to do. */ static int netmap_pipe_krings_create(struct netmap_adapter *na) @@ -310,65 +320,28 @@ netmap_pipe_krings_create(struct netmap_adapter *na) int i; /* case 1) above */ - ND("%p: case 1, create everything", na); + D("%p: case 1, create both ends", na); error = netmap_krings_create(na, 0); if (error) goto err; - /* we also create all the rings, since we need to - * update the save_ring pointers. 
- * netmap_mem_rings_create (called by our caller) - * will not create the rings again - */ - - error = netmap_mem_rings_create(na); - if (error) - goto del_krings1; - - /* update our hidden ring pointers */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].save_ring = NMR(na, t)[i].ring; - } - - /* now, create krings and rings of the other end */ + /* create the krings of the other end */ error = netmap_krings_create(ona, 0); if (error) - goto del_rings1; - - error = netmap_mem_rings_create(ona); - if (error) - goto del_krings2; - - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].save_ring = NMR(ona, t)[i].ring; - } + goto del_krings1; /* cross link the krings */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ for (i = 0; i < nma_get_nrings(na, t); i++) { NMR(na, t)[i].pipe = NMR(&pna->peer->up, r) + i; NMR(&pna->peer->up, r)[i].pipe = NMR(na, t) + i; } } - } else { - int i; - /* case 2) above */ - /* recover the hidden rings */ - ND("%p: case 2, hidden rings", na); - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) - NMR(na, t)[i].ring = NMR(na, t)[i].save_ring; - } + } return 0; -del_krings2: - netmap_krings_delete(ona); -del_rings1: - netmap_mem_rings_delete(na); del_krings1: netmap_krings_delete(na); err: @@ -383,7 +356,8 @@ err: * * usr1 --> e1 --> e2 * - * and we are e1. Nothing special to do. + * and we are e1. Create the needed rings of the + * other end. 
* * 1.b) state is * @@ -412,14 +386,65 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) { struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int i, error = 0; enum txrx t; ND("%p: onoff %d", na, onoff); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + /* mark the partner ring as needed */ + kring->pipe->nr_kflags |= NKR_NEEDRING; + } + } + } + + /* create all missing needed rings on the other end */ + error = netmap_mem_rings_create(ona); + if (error) + return error; + + /* In case of no error we put our rings in netmap mode */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) { + kring->nr_mode = NKR_NETMAP_ON; + } + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) { + kring->nr_mode = NKR_NETMAP_OFF; + /* mark the peer ring as no longer needed by us + * (it may still be kept if somebody else is using it) + */ + kring->pipe->nr_kflags &= ~NKR_NEEDRING; + } + } + } + /* delete all the peer rings that are no longer needed */ + netmap_mem_rings_delete(ona); + } + + if (na->active_fds) { + D("active_fds %d", na->active_fds); + return 0; } + if (pna->peer_ref) { ND("%p: case 1.a or 2.a, nothing to do", na); return 0; @@ -429,18 +454,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) pna->peer->peer_ref = 0; netmap_adapter_put(na); } else { - int i; ND("%p: case 2.b, grab peer", na); netmap_adapter_get(na); pna->peer->peer_ref = 1; - /* hide our rings from 
netmap_mem_rings_delete */ - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { - NMR(na, t)[i].ring = NULL; - } - } } - return 0; + return error; } /* netmap_pipe_krings_delete. @@ -470,8 +488,6 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; struct netmap_adapter *ona; /* na of the other end */ - int i; - enum txrx t; if (!pna->peer_ref) { ND("%p: case 2, kept alive by peer", na); @@ -480,18 +496,12 @@ netmap_pipe_krings_delete(struct netmap_adapter *na) /* case 1) above */ ND("%p: case 1, deleting everyhing", na); netmap_krings_delete(na); /* also zeroes tx_rings etc. */ - /* restore the ring to be deleted on the peer */ ona = &pna->peer->up; if (ona->tx_rings == NULL) { /* already deleted, we must be on an * cleanup-after-error path */ return; } - for_rx_tx(t) { - for (i = 0; i < nma_get_nrings(ona, t) + 1; i++) - NMR(ona, t)[i].ring = NMR(ona, t)[i].save_ring; - } - netmap_mem_rings_delete(ona); netmap_krings_delete(ona); } @@ -519,6 +529,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp = NULL; u_int pipe_id; int role = nmr->nr_flags & NR_REG_MASK; int error; @@ -536,7 +547,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); /* pass to parent the requested number of pipes */ pnmr.nr_arg1 = nmr->nr_arg1; - error = netmap_get_na(&pnmr, &pna, create); + error = netmap_get_na(&pnmr, &pna, &ifp, create); if (error) { ND("parent lookup failed: %d", error); return error; @@ -652,16 +663,15 @@ found: *na = &req->up; netmap_adapter_get(*na); - /* write the configuration back */ - nmr->nr_tx_rings = req->up.num_tx_rings; - nmr->nr_rx_rings = req->up.num_rx_rings; - nmr->nr_tx_slots = req->up.num_tx_desc; - nmr->nr_rx_slots = 
req->up.num_rx_desc; - /* keep the reference to the parent. * It will be released by the req destructor */ + /* drop the ifp reference, if any */ + if (ifp) { + if_rele(ifp); + } + return 0; free_sna: @@ -671,7 +681,7 @@ unregister_mna: free_mna: free(mna, M_DEVBUF); put_out: - netmap_adapter_put(pna); + netmap_unget_na(pna, ifp); return error; } diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index ddd7334a8378b..2d2c807681d2b 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2016 Universita` di Pisa + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -101,6 +102,9 @@ __FBSDID("$FreeBSD$"); #warning OSX support is only partial #include "osx_glue.h" +#elif defined(_WIN32) +#include "win_glue.h" + #else #error Unsupported platform @@ -119,7 +123,7 @@ __FBSDID("$FreeBSD$"); /* * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_NAME prefix for switch port names, default "vale" * NM_BDG_MAXPORTS number of ports * NM_BRIDGES max number of switches in the system. * XXX should become a sysctl or tunable @@ -144,7 +148,6 @@ __FBSDID("$FreeBSD$"); #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) /* NM_FT_NULL terminates a list of slots in the ft */ #define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ /* @@ -152,14 +155,15 @@ __FBSDID("$FreeBSD$"); * used in the bridge. The actual value may be larger as the * last packet in the block may overflow the size. 
*/ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSBEGIN(vars_vale); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); - +SYSEND; static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **); static int netmap_vp_reg(struct netmap_adapter *na, int onoff); -static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +static int netmap_bwrap_reg(struct netmap_adapter *, int onoff); /* * For each output interface, nm_bdg_q is used to construct a list. @@ -213,7 +217,7 @@ struct nm_bridge { * forward this packet. ring_nr is the source ring index, and the * function may overwrite this value to forward this packet to a * different ring index. - * This function must be set by netmap_bdgctl(). + * This function must be set by netmap_bdg_ctl(). */ struct netmap_bdg_ops bdg_ops; @@ -244,7 +248,7 @@ netmap_bdg_name(struct netmap_vp_adapter *vp) * Right now we have a static array and deletions are protected * by an exclusive lock. */ -struct nm_bridge *nm_bridges; +static struct nm_bridge *nm_bridges; #endif /* !CONFIG_NET_NS */ @@ -278,6 +282,45 @@ pkt_copy(void *_src, void *_dst, int l) } +static int +nm_is_id_char(const char c) +{ + return (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + (c == '_'); +} + +/* Validate the name of a VALE bridge port and return the + * position of the ":" character. */ +static int +nm_vale_name_validate(const char *name) +{ + int colon_pos = -1; + int i; + + if (!name || strlen(name) < strlen(NM_BDG_NAME)) { + return -1; + } + + for (i = 0; name[i]; i++) { + if (name[i] == ':') { + if (colon_pos != -1) { + return -1; + } + colon_pos = i; + } else if (!nm_is_id_char(name[i])) { + return -1; + } + } + + if (i >= IFNAMSIZ) { + return -1; + } + + return colon_pos; +} + /* * locate a bridge among the existing ones. 
* MUST BE CALLED WITH NMG_LOCK() @@ -288,7 +331,7 @@ pkt_copy(void *_src, void *_dst, int l) static struct nm_bridge * nm_find_bridge(const char *name, int create) { - int i, l, namelen; + int i, namelen; struct nm_bridge *b = NULL, *bridges; u_int num_bridges; @@ -296,21 +339,11 @@ nm_find_bridge(const char *name, int create) netmap_bns_getbridges(&bridges, &num_bridges); - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { + namelen = nm_vale_name_validate(name); + if (namelen < 0) { D("invalid bridge name %s", name ? name : NULL); return NULL; } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; - } - } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); /* lookup the name, remember empty slot if there is one */ for (i = 0; i < num_bridges; i++) { @@ -479,6 +512,7 @@ netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; struct nm_bridge *b = vpna->na_bdg; + (void)nmr; // XXX merge ? 
if (attach) return 0; /* nothing to do */ if (b) { @@ -518,7 +552,7 @@ nm_vi_destroy(const char *name) return ENXIO; NMG_LOCK(); /* make sure this is actually a VALE port */ - if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { + if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { error = EINVAL; goto err; } @@ -535,7 +569,7 @@ nm_vi_destroy(const char *name) */ if_rele(ifp); netmap_detach(ifp); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return 0; err: @@ -556,14 +590,14 @@ nm_vi_create(struct nmreq *nmr) int error; /* don't include VALE prefix */ - if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME))) + if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) return EINVAL; ifp = ifunit_ref(nmr->nr_name); if (ifp) { /* already exist, cannot create new one */ if_rele(ifp); return EEXIST; } - error = nm_vi_persist(nmr->nr_name, &ifp); + error = nm_os_vi_persist(nmr->nr_name, &ifp); if (error) return error; @@ -572,12 +606,13 @@ nm_vi_create(struct nmreq *nmr) error = netmap_vp_create(nmr, ifp, &vpna); if (error) { D("error %d", error); - nm_vi_detach(ifp); + nm_os_vi_detach(ifp); return error; } /* persist-specific routines */ vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; netmap_adapter_get(&vpna->up); + NM_ATTACH_NA(ifp, &vpna->up); NMG_UNLOCK(); D("created %s", ifp->if_xname); return 0; @@ -608,7 +643,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) /* first try to see if this is a bridge port. 
*/ NMG_LOCK_ASSERT(); - if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) { + if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) { return 0; /* no error, but no VALE prefix */ } @@ -693,7 +728,6 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) goto out; vpna = hw->na_vp; hostna = hw->na_hostvp; - if_rele(ifp); if (nmr->nr_arg1 != NETMAP_BDG_HOST) hostna = NULL; } @@ -768,6 +802,11 @@ unlock_exit: return error; } +static inline int +nm_is_bwrap(struct netmap_adapter *na) +{ + return na->nm_register == netmap_bwrap_reg; +} /* process NETMAP_BDG_DETACH */ static int @@ -785,8 +824,13 @@ nm_bdg_ctl_detach(struct nmreq *nmr) if (na == NULL) { /* VALE prefix missing */ error = EINVAL; goto unlock_exit; + } else if (nm_is_bwrap(na) && + ((struct netmap_bwrap_adapter *)na)->na_polling_state) { + /* Don't detach a NIC with polling */ + error = EBUSY; + netmap_adapter_put(na); + goto unlock_exit; } - if (na->nm_bdg_ctl) { /* remove the port from bridge. The bwrap * also needs to put the hwna in normal mode @@ -801,6 +845,267 @@ unlock_exit: } +struct nm_bdg_polling_state; +struct +nm_bdg_kthread { + struct nm_kthread *nmk; + u_int qfirst; + u_int qlast; + struct nm_bdg_polling_state *bps; +}; + +struct nm_bdg_polling_state { + bool configured; + bool stopped; + struct netmap_bwrap_adapter *bna; + u_int reg; + u_int qfirst; + u_int qlast; + u_int cpu_from; + u_int ncpus; + struct nm_bdg_kthread *kthreads; +}; + +static void +netmap_bwrap_polling(void *data) +{ + struct nm_bdg_kthread *nbk = data; + struct netmap_bwrap_adapter *bna; + u_int qfirst, qlast, i; + struct netmap_kring *kring0, *kring; + + if (!nbk) + return; + qfirst = nbk->qfirst; + qlast = nbk->qlast; + bna = nbk->bps->bna; + kring0 = NMR(bna->hwna, NR_RX); + + for (i = qfirst; i < qlast; i++) { + kring = kring0 + i; + kring->nm_notify(kring, 0); + } +} + +static int +nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps) +{ + struct nm_kthread_cfg kcfg; + int i, j; + + 
bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus, + M_DEVBUF, M_NOWAIT | M_ZERO); + if (bps->kthreads == NULL) + return ENOMEM; + + bzero(&kcfg, sizeof(kcfg)); + kcfg.worker_fn = netmap_bwrap_polling; + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC); + int affinity = bps->cpu_from + i; + + t->bps = bps; + t->qfirst = all ? bps->qfirst /* must be 0 */: affinity; + t->qlast = all ? bps->qlast : t->qfirst + 1; + D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst, + t->qlast); + + kcfg.type = i; + kcfg.worker_private = t; + t->nmk = nm_os_kthread_create(&kcfg); + if (t->nmk == NULL) { + goto cleanup; + } + nm_os_kthread_set_affinity(t->nmk, affinity); + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_delete(t->nmk); + } + free(bps->kthreads, M_DEVBUF); + return EFAULT; +} + +/* a version of ptnetmap_start_kthreads() */ +static int +nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps) +{ + int error, i, j; + + if (!bps) { + D("polling is not configured"); + return EFAULT; + } + bps->stopped = false; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + error = nm_os_kthread_start(t->nmk); + if (error) { + D("error in nm_kthread_start()"); + goto cleanup; + } + } + return 0; + +cleanup: + for (j = 0; j < i; j++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + } + bps->stopped = true; + return error; +} + +static void +nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps) +{ + int i; + + if (!bps) + return; + + for (i = 0; i < bps->ncpus; i++) { + struct nm_bdg_kthread *t = bps->kthreads + i; + nm_os_kthread_stop(t->nmk); + nm_os_kthread_delete(t->nmk); + } + bps->stopped = true; +} + +static int +get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na, + struct nm_bdg_polling_state *bps) 
+{ + int req_cpus, avail_cpus, core_from; + u_int reg, i, qfirst, qlast; + + avail_cpus = nm_os_ncpus(); + req_cpus = nmr->nr_arg1; + + if (req_cpus == 0) { + D("req_cpus must be > 0"); + return EINVAL; + } else if (req_cpus >= avail_cpus) { + D("for safety, we need at least one core left in the system"); + return EINVAL; + } + reg = nmr->nr_flags & NR_REG_MASK; + i = nmr->nr_ringid & NETMAP_RING_MASK; + /* + * ONE_NIC: dedicate one core to one ring. If multiple cores + * are specified, consecutive rings are also polled. + * For example, if ringid=2 and 2 cores are given, + * ring 2 and 3 are polled by core 2 and 3, respectively. + * ALL_NIC: poll all the rings using a core specified by ringid. + * the number of cores must be 1. + */ + if (reg == NR_REG_ONE_NIC) { + if (i + req_cpus > nma_get_nrings(na, NR_RX)) { + D("only %d rings exist (ring %u-%u is given)", + nma_get_nrings(na, NR_RX), i, i+req_cpus); + return EINVAL; + } + qfirst = i; + qlast = qfirst + req_cpus; + core_from = qfirst; + } else if (reg == NR_REG_ALL_NIC) { + if (req_cpus != 1) { + D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus); + return EINVAL; + } + qfirst = 0; + qlast = nma_get_nrings(na, NR_RX); + core_from = i; + } else { + D("reg must be ALL_NIC or ONE_NIC"); + return EINVAL; + } + + bps->reg = reg; + bps->qfirst = qfirst; + bps->qlast = qlast; + bps->cpu_from = core_from; + bps->ncpus = req_cpus; + D("%s qfirst %u qlast %u cpu_from %u ncpus %u", + reg == NR_REG_ALL_NIC ? 
"REG_ALL_NIC" : "REG_ONE_NIC", + qfirst, qlast, core_from, req_cpus); + return 0; +} + +static int +nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct nm_bdg_polling_state *bps; + struct netmap_bwrap_adapter *bna; + int error; + + bna = (struct netmap_bwrap_adapter *)na; + if (bna->na_polling_state) { + D("ERROR adapter already in polling mode"); + return EFAULT; + } + + bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!bps) + return ENOMEM; + bps->configured = false; + bps->stopped = true; + + if (get_polling_cfg(nmr, na, bps)) { + free(bps, M_DEVBUF); + return EINVAL; + } + + if (nm_bdg_create_kthreads(bps)) { + free(bps, M_DEVBUF); + return EFAULT; + } + + bps->configured = true; + bna->na_polling_state = bps; + bps->bna = bna; + + /* disable interrupt if possible */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 0); + /* start kthread now */ + error = nm_bdg_polling_start_kthreads(bps); + if (error) { + D("ERROR nm_bdg_polling_start_kthread()"); + free(bps->kthreads, M_DEVBUF); + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + } + return error; +} + +static int +nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; + struct nm_bdg_polling_state *bps; + + if (!bna->na_polling_state) { + D("ERROR adapter is not in polling mode"); + return EFAULT; + } + bps = bna->na_polling_state; + nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); + bps->configured = false; + free(bps, M_DEVBUF); + bna->na_polling_state = NULL; + /* reenable interrupt */ + if (bna->hwna->nm_intr) + bna->hwna->nm_intr(bna->hwna, 1); + return 0; +} /* Called by either user's context (netmap_ioctl()) * or external kernel modules (e.g., Openvswitch). 
@@ -843,7 +1148,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) case NETMAP_BDG_LIST: /* this is used to enumerate bridges and ports */ if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) { error = EINVAL; break; } @@ -855,7 +1160,9 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) break; } - error = ENOENT; + error = 0; + nmr->nr_arg1 = b - bridges; /* bridge index */ + nmr->nr_arg2 = NM_BDG_NOPORT; for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; @@ -867,10 +1174,7 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) * virtual port and a NIC, respectively */ if (!strcmp(vpna->up.name, name)) { - /* bridge index */ - nmr->nr_arg1 = b - bridges; nmr->nr_arg2 = i; /* port index */ - error = 0; break; } } @@ -937,10 +1241,34 @@ netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; - vpna->virt_hdr_len = nmr->nr_arg1; - if (vpna->virt_hdr_len) + na->virt_hdr_len = nmr->nr_arg1; + if (na->virt_hdr_len) { vpna->mfs = NETMAP_BUF_SIZE(na); - D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + } + D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na); + netmap_adapter_put(na); + } else if (!na) { + error = ENXIO; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_POLLING_ON: + case NETMAP_BDG_POLLING_OFF: + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + if (!nm_is_bwrap(na)) { + error = EOPNOTSUPP; + } else if (cmd == NETMAP_BDG_POLLING_ON) { + error = nm_bdg_ctl_polling_start(nmr, na); + if (!error) + netmap_adapter_get(na); + } else { + error = nm_bdg_ctl_polling_stop(nmr, na); + if (!error) + netmap_adapter_put(na); + } netmap_adapter_put(na); } NMG_UNLOCK(); @@ -1097,10 +1425,12 @@ nm_bdg_preflush(struct 
netmap_kring *kring, u_int end) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); } if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; + /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we + * have to fix frags count. */ + frags--; + ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags; + D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags); } if (ft_i) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); @@ -1157,6 +1487,8 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + enum txrx t; + int i; /* persistent ports may be put in netmap mode * before being attached to a bridge @@ -1164,12 +1496,30 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) if (vpna->na_bdg) BDG_WLOCK(vpna->na_bdg); if (onoff) { - na->na_flags |= NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_on(kring)) + kring->nr_mode = NKR_NETMAP_ON; + } + } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; /* XXX on FreeBSD, persistent VALE ports should also * toggle IFCAP_NETMAP in na->ifp (2014-03-16) */ } else { - na->na_flags &= ~NAF_NETMAP_ON; + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { + struct netmap_kring *kring = &NMR(na, t)[i]; + + if (nm_kring_pending_off(kring)) + kring->nr_mode = NKR_NETMAP_OFF; + } + } } if (vpna->na_bdg) BDG_WUNLOCK(vpna->na_bdg); @@ -1193,13 +1543,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, uint32_t sh, dh; u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; + uint8_t indbuf[12]; /* safety check, unfortunately we have many cases */ - if (buf_len >= 14 + na->virt_hdr_len) { + if (buf_len 
>= 14 + na->up.virt_hdr_len) { /* virthdr + mac_hdr in the same slot */ - buf += na->virt_hdr_len; - buf_len -= na->virt_hdr_len; - } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { + buf += na->up.virt_hdr_len; + buf_len -= na->up.virt_hdr_len; + } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { /* only header in first fragment */ ft++; buf = ft->ft_buf; @@ -1208,6 +1559,14 @@ netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, RD(5, "invalid buf format, length %d", buf_len); return NM_BDG_NOPORT; } + + if (ft->ft_flags & NS_INDIRECT) { + if (copyin(buf, indbuf, sizeof(indbuf))) { + return NM_BDG_NOPORT; + } + buf = indbuf; + } + dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; smac = le64toh(*(uint64_t *)(buf + 4)); smac >>= 16; @@ -1321,7 +1680,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, struct nm_bdg_q *dst_ents, *brddst; uint16_t num_dsts = 0, *dsts; struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; + u_int i, me = na->bdg_port; /* * The work area (pointed by ft) is followed by an array of @@ -1341,7 +1700,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, ND("slot %d frags %d", i, ft[i].ft_frags); /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ - if (unlikely(na->virt_hdr_len > ft[i].ft_len)) + if (unlikely(na->up.virt_hdr_len > ft[i].ft_len)) continue; dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); if (netmap_verbose > 255) @@ -1382,6 +1741,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; if (brddst->bq_head != NM_FT_NULL) { + u_int j; for (j = 0; likely(j < b->bdg_active_ports); j++) { uint16_t d_i; i = b->bdg_port_index[j]; @@ -1441,8 +1801,9 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ needed = d->bq_len + brddst->bq_len; - if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { - RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len); + if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { + RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, + dst_na->up.virt_hdr_len); /* There is a virtio-net header/offloadings mismatch between * source and destination. The slower mismatch datapath will * be used to cope with all the mismatches. @@ -1803,7 +2164,6 @@ netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter nm_bound_var(&nmr->nr_arg3, 0, 0, 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; - vpna->virt_hdr_len = 0; vpna->mfs = 1514; vpna->last_smac = ~0llu; /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? @@ -1880,19 +2240,17 @@ netmap_bwrap_dtor(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } ND("na %p", na); - /* drop reference to hwna->ifp. 
- * If we don't do this, netmap_detach_common(na) - * will think it has set NA(na->ifp) to NULL - */ na->ifp = NULL; - /* for safety, also drop the possible reference - * in the hostna - */ bna->host.up.ifp = NULL; - - hwna->nm_mem = bna->save_nmd; hwna->na_private = NULL; hwna->na_vp = hwna->na_hostvp = NULL; hwna->na_flags &= ~NAF_BUSY; @@ -1916,7 +2274,8 @@ netmap_bwrap_dtor(struct netmap_adapter *na) * (part as a receive ring, part as a transmit ring). * * callback that overwrites the hwna notify callback. - * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * Packets come from the outside or from the host stack and are put on an + * hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ static int @@ -1927,19 +2286,18 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) struct netmap_kring *bkring; struct netmap_vp_adapter *vpna = &bna->up; u_int ring_nr = kring->ring_id; - int error = 0; + int ret = NM_IRQ_COMPLETED; + int error; if (netmap_verbose) D("%s %s 0x%x", na->name, kring->name, flags); - if (!nm_netmap_on(na)) - return 0; - bkring = &vpna->up.tx_rings[ring_nr]; /* make sure the ring is not disabled */ - if (nm_kr_tryget(kring)) - return 0; + if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { + return EIO; + } if (netmap_verbose) D("%s head %d cur %d tail %d", na->name, @@ -1951,9 +2309,10 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) error = kring->nm_sync(kring, 0); if (error) goto put_out; - if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - na->name); + if (kring->nr_hwcur == kring->nr_hwtail) { + if (netmap_verbose) + D("how strange, interrupt with no packets on %s", + na->name); goto put_out; } @@ -1970,28 +2329,32 @@ netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) /* another call to actually release the buffers */ error = kring->nm_sync(kring, 0); + /* The second 
rxsync may have further advanced hwtail. If this happens, + * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ + if (kring->rcur != kring->nr_hwtail) { + ret = NM_IRQ_RESCHED; + } put_out: nm_kr_put(kring); - return error; + + return error ? error : ret; } /* nm_register callback for bwrap */ static int -netmap_bwrap_register(struct netmap_adapter *na, int onoff) +netmap_bwrap_reg(struct netmap_adapter *na, int onoff) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; struct netmap_vp_adapter *hostna = &bna->host; - int error; + int error, i; enum txrx t; ND("%s %s", na->name, onoff ? "on" : "off"); if (onoff) { - int i; - /* netmap_do_regif has been called on the bwrap na. * We need to pass the information about the * memory allocator down to the hwna before @@ -2010,16 +2373,32 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* cross-link the netmap rings * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. - * We need to do this now, after the initialization - * of the kring->ring pointers */ for_rx_tx(t) { - enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ - for (i = 0; i < nma_get_nrings(na, r) + 1; i++) { - NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots; - NMR(hwna, t)[i].ring = NMR(na, r)[i].ring; + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(hwna, r)[i].ring = NMR(na, t)[i].ring; } } + + if (na->na_flags & NAF_HOST_RINGS) { + struct netmap_adapter *hna = &hostna->up; + /* the hostna rings are the host rings of the bwrap. 
+ * The corresponding krings must point back to the + * hostna + */ + hna->tx_rings = &na->tx_rings[na->num_tx_rings]; + hna->tx_rings[0].na = hna; + hna->rx_rings = &na->rx_rings[na->num_rx_rings]; + hna->rx_rings[0].na = hna; + } + } + + /* pass down the pending ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(hwna, t)[i].nr_pending_mode = + NMR(na, t)[i].nr_pending_mode; } /* forward the request to the hwna */ @@ -2027,6 +2406,13 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) if (error) return error; + /* copy up the current ring state information */ + for_rx_tx(t) { + for (i = 0; i < nma_get_nrings(na, t) + 1; i++) + NMR(na, t)[i].nr_mode = + NMR(hwna, t)[i].nr_mode; + } + /* impersonate a netmap_vp_adapter */ netmap_vp_reg(na, onoff); if (hostna->na_bdg) @@ -2046,8 +2432,14 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) /* also intercept the host ring notify */ hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; } + if (na->active_fds == 0) + na->na_flags |= NAF_NETMAP_ON; } else { u_int i; + + if (na->active_fds == 0) + na->na_flags &= ~NAF_NETMAP_ON; + /* reset all notify callbacks (including host ring) */ for (i = 0; i <= hwna->num_rx_rings; i++) { hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; @@ -2089,8 +2481,8 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; - struct netmap_adapter *hostna = &bna->host.up; - int error; + int i, error = 0; + enum txrx t; ND("%s", na->name); @@ -2102,26 +2494,23 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { - netmap_vp_krings_delete(na); - return error; + goto err_del_vp_rings; } - /* the connection between the bwrap krings and the hwna krings - * will be perfomed later, in the nm_register callback, since - * now 
the kring->ring pointers have not been initialized yet - */ - if (na->na_flags & NAF_HOST_RINGS) { - /* the hostna rings are the host rings of the bwrap. - * The corresponding krings must point back to the - * hostna - */ - hostna->tx_rings = &na->tx_rings[na->num_tx_rings]; - hostna->tx_rings[0].na = hostna; - hostna->rx_rings = &na->rx_rings[na->num_rx_rings]; - hostna->rx_rings[0].na = hostna; + /* get each ring slot number from the corresponding hwna ring */ + for_rx_tx(t) { + enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ + for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { + NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; + } } return 0; + +err_del_vp_rings: + netmap_vp_krings_delete(na); + + return error; } @@ -2149,7 +2538,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) u_int ring_n = kring->ring_id; u_int lim = kring->nkr_num_slots - 1; struct netmap_kring *hw_kring; - int error = 0; + int error; ND("%s: na %s hwna %s", (kring ? kring->name : "NULL!"), @@ -2157,11 +2546,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) (hwna ? 
hwna->name : "NULL!")); hw_kring = &hwna->tx_rings[ring_n]; - if (nm_kr_tryget(hw_kring)) - return 0; + if (nm_kr_tryget(hw_kring, 0, NULL)) { + return ENXIO; + } - if (!nm_netmap_on(hwna)) - return 0; /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(kring, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", @@ -2175,7 +2563,7 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; error = hw_kring->nm_sync(hw_kring, flags); if (error) - goto out; + goto put_out; /* third step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ @@ -2188,9 +2576,10 @@ netmap_bwrap_notify(struct netmap_kring *kring, int flags) kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); -out: +put_out: nm_kr_put(hw_kring); - return error; + + return error ? error : NM_IRQ_COMPLETED; } @@ -2217,44 +2606,23 @@ netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) /* nothing to do */ return 0; } - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + npriv = netmap_priv_new(); if (npriv == NULL) return ENOMEM; - error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags); + npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ + error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW); if (error) { - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); + netmap_priv_delete(npriv); return error; } bna->na_kpriv = npriv; na->na_flags |= NAF_BUSY; } else { - int last_instance; - if (na->active_fds == 0) /* not registered */ return EINVAL; - last_instance = netmap_dtor_locked(bna->na_kpriv); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct nm_bridge *b = bna->up.na_bdg, - *bh = bna->host.na_bdg; - npriv = bna->na_kpriv; - bna->na_kpriv = 
NULL; - D("deleting priv"); - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - if (b) { - /* XXX the bwrap dtor should take care - * of this (2014-06-16) - */ - netmap_bdg_detach_common(b, bna->up.bdg_port, - (bh ? bna->host.bdg_port : -1)); - } - na->na_flags &= ~NAF_BUSY; - } + netmap_priv_delete(bna->na_kpriv); + bna->na_kpriv = NULL; + na->na_flags &= ~NAF_BUSY; } return error; @@ -2282,6 +2650,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) } na = &bna->up.up; + /* make bwrap ifp point to the real ifp */ + na->ifp = hwna->ifp; na->na_private = bna; strncpy(na->name, nr_name, sizeof(na->name)); /* fill the ring data for the bwrap adapter with rx/tx meanings @@ -2294,7 +2664,7 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); } na->nm_dtor = netmap_bwrap_dtor; - na->nm_register = netmap_bwrap_register; + na->nm_register = netmap_bwrap_reg; // na->nm_txsync = netmap_bwrap_txsync; // na->nm_rxsync = netmap_bwrap_rxsync; na->nm_config = netmap_bwrap_config; @@ -2303,13 +2673,8 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) na->nm_notify = netmap_bwrap_notify; na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; na->pdev = hwna->pdev; - na->nm_mem = netmap_mem_private_new(na->name, - na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc, - 0, 0, &error); - na->na_flags |= NAF_MEM_OWNER; - if (na->nm_mem == NULL) - goto err_put; + na->nm_mem = hwna->nm_mem; + na->virt_hdr_len = hwna->virt_hdr_len; bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ bna->hwna = hwna; @@ -2349,24 +2714,10 @@ netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) if (error) { goto err_free; } - /* make bwrap ifp point to the real ifp - * NOTE: netmap_attach_common() interprets a non-NULL na->ifp - * as a request to make the ifp point to the na. 
Since we - * do not want to change the na already pointed to by hwna->ifp, - * the following assignment has to be delayed until now - */ - na->ifp = hwna->ifp; hwna->na_flags |= NAF_BUSY; - /* make hwna point to the allocator we are actually using, - * so that monitors will be able to find it - */ - bna->save_nmd = hwna->nm_mem; - hwna->nm_mem = na->nm_mem; return 0; err_free: - netmap_mem_delete(na->nm_mem); -err_put: hwna->na_vp = hwna->na_hostvp = NULL; netmap_adapter_put(hwna); free(bna, M_DEVBUF); -- cgit v1.3