diff options
| author | Luigi Rizzo <luigi@FreeBSD.org> | 2011-11-17 12:17:39 +0000 |
|---|---|---|
| committer | Luigi Rizzo <luigi@FreeBSD.org> | 2011-11-17 12:17:39 +0000 |
| commit | 68b8534bdfeb5078e84d668124e7585e43b03502 (patch) | |
| tree | 8be7a4f824011375a281269e79b86fa172e84386 /sys/dev/netmap | |
| parent | a93c40bb620da1a24e2e07b9bc0734736c1dae77 (diff) | |
Notes
Diffstat (limited to 'sys/dev/netmap')
| -rw-r--r-- | sys/dev/netmap/head.diff | 654 | ||||
| -rw-r--r-- | sys/dev/netmap/if_em_netmap.h | 383 | ||||
| -rw-r--r-- | sys/dev/netmap/if_igb_netmap.h | 378 | ||||
| -rw-r--r-- | sys/dev/netmap/if_lem_netmap.h | 344 | ||||
| -rw-r--r-- | sys/dev/netmap/if_re_netmap.h | 415 | ||||
| -rw-r--r-- | sys/dev/netmap/ixgbe_netmap.h | 376 | ||||
| -rw-r--r-- | sys/dev/netmap/netmap.c | 1762 | ||||
| -rw-r--r-- | sys/dev/netmap/netmap_kern.h | 221 |
8 files changed, 4533 insertions, 0 deletions
diff --git a/sys/dev/netmap/head.diff b/sys/dev/netmap/head.diff new file mode 100644 index 0000000000000..51a8e34e74d12 --- /dev/null +++ b/sys/dev/netmap/head.diff @@ -0,0 +1,654 @@ +Index: conf/NOTES +=================================================================== +--- conf/NOTES (revision 227552) ++++ conf/NOTES (working copy) +@@ -799,6 +799,12 @@ + # option. DHCP requires bpf. + device bpf + ++# The `netmap' device implements memory-mapped access to network ++# devices from userspace, enabling wire-speed packet capture and ++# generation even at 10Gbit/s. Requires support in the device ++# driver. Supported drivers are ixgbe, e1000, re. ++device netmap ++ + # The `disc' device implements a minimal network interface, + # which throws away all packets sent and never receives any. It is + # included for testing and benchmarking purposes. +Index: conf/files +=================================================================== +--- conf/files (revision 227552) ++++ conf/files (working copy) +@@ -1507,6 +1507,7 @@ + dev/my/if_my.c optional my + dev/ncv/ncr53c500.c optional ncv + dev/ncv/ncr53c500_pccard.c optional ncv pccard ++dev/netmap/netmap.c optional netmap + dev/nge/if_nge.c optional nge + dev/nxge/if_nxge.c optional nxge + dev/nxge/xgehal/xgehal-device.c optional nxge +Index: conf/options +=================================================================== +--- conf/options (revision 227552) ++++ conf/options (working copy) +@@ -689,6 +689,7 @@ + + # various 'device presence' options. 
+ DEV_BPF opt_bpf.h ++DEV_NETMAP opt_global.h + DEV_MCA opt_mca.h + DEV_CARP opt_carp.h + DEV_SPLASH opt_splash.h +Index: dev/e1000/if_igb.c +=================================================================== +--- dev/e1000/if_igb.c (revision 227552) ++++ dev/e1000/if_igb.c (working copy) +@@ -369,6 +369,9 @@ + &igb_rx_process_limit, 0, + "Maximum number of received packets to process at a time, -1 means unlimited"); + ++#ifdef DEV_NETMAP ++#include <dev/netmap/if_igb_netmap.h> ++#endif /* DEV_NETMAP */ + /********************************************************************* + * Device identification routine + * +@@ -664,6 +667,9 @@ + adapter->led_dev = led_create(igb_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ igb_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("igb_attach: end"); + + return (0); +@@ -742,6 +748,9 @@ + + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + igb_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3212,6 +3221,10 @@ + struct adapter *adapter = txr->adapter; + struct igb_tx_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor contents */ + IGB_TX_LOCK(txr); +@@ -3231,6 +3244,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3626,6 +3646,19 @@ + + IGB_TX_LOCK_ASSERT(txr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IGB_TX_UNLOCK(txr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ 
IGB_CORE_UNLOCK(adapter); ++ IGB_TX_LOCK(txr); // the caller is supposed to own the lock ++ return FALSE; ++ } ++#endif /* DEV_NETMAP */ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IGB_QUEUE_IDLE; + return FALSE; +@@ -3949,6 +3982,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); ++#endif + + adapter = rxr->adapter; + dev = adapter->dev; +@@ -3974,6 +4011,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + if (rxr->hdr_split == FALSE) + goto skip_head; + +@@ -4436,6 +4485,19 @@ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IGB_RX_UNLOCK(rxr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IGB_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + /* Main clean loop */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; +Index: dev/e1000/if_lem.c +=================================================================== +--- dev/e1000/if_lem.c (revision 227552) ++++ dev/e1000/if_lem.c (working copy) +@@ -316,6 +316,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include <dev/netmap/if_lem_netmap.h> ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification 
routine + * +@@ -646,6 +650,9 @@ + adapter->led_dev = led_create(lem_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ lem_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("lem_attach: end"); + + return (0); +@@ -724,6 +731,9 @@ + callout_drain(&adapter->timer); + callout_drain(&adapter->tx_fifo_timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + lem_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -2637,6 +2647,9 @@ + lem_setup_transmit_structures(struct adapter *adapter) + { + struct em_buffer *tx_buffer; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_TX, 0, 0); ++#endif + + /* Clear the old ring contents */ + bzero(adapter->tx_desc_base, +@@ -2650,6 +2663,15 @@ + bus_dmamap_unload(adapter->txtag, tx_buffer->map); + m_freem(tx_buffer->m_head); + tx_buffer->m_head = NULL; ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(adapter->txtag, ++ tx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + tx_buffer->next_eop = -1; + } + +@@ -2951,6 +2973,12 @@ + + EM_TX_LOCK_ASSERT(adapter); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + if (adapter->num_tx_desc_avail == adapter->num_tx_desc) + return; + +@@ -3181,6 +3209,9 @@ + { + struct em_buffer *rx_buffer; + int i, error; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_RX, 0, 0); ++#endif + + /* Reset descriptor ring */ + bzero(adapter->rx_desc_base, +@@ -3200,6 +3231,18 @@ + + /* Allocate new ones. 
*/ + for (i = 0; i < adapter->num_rx_desc; i++) { ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(adapter->rxtag, ++ rx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ /* Update descriptor */ ++ adapter->rx_desc_base[i].buffer_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + error = lem_get_buf(adapter, i); + if (error) + return (error); +@@ -3407,6 +3450,14 @@ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[0].si, PI_NET); ++ EM_RX_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + if (!((current_desc->status) & E1000_RXD_STAT_DD)) { + if (done != NULL) + *done = rx_sent; +Index: dev/e1000/if_em.c +=================================================================== +--- dev/e1000/if_em.c (revision 227552) ++++ dev/e1000/if_em.c (working copy) +@@ -399,6 +399,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include <dev/netmap/if_em_netmap.h> ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -714,6 +718,9 @@ + + adapter->led_dev = led_create(em_led_func, adapter, + device_get_nameunit(dev)); ++#ifdef DEV_NETMAP ++ em_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + + INIT_DEBUGOUT("em_attach: end"); + +@@ -785,6 +792,10 @@ + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ ++ + em_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3213,6 +3224,10 @@ + struct adapter *adapter = txr->adapter; + struct em_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor 
contents */ + EM_TX_LOCK(txr); +@@ -3232,6 +3247,16 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(txr->txtag, ++ txbuf->map, NMB(slot), ++ adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ ++ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3682,6 +3707,12 @@ + struct ifnet *ifp = adapter->ifp; + + EM_TX_LOCK_ASSERT(txr); ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[txr->me].si, PI_NET); ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ + + /* No work, make sure watchdog is off */ + if (txr->tx_avail == adapter->num_tx_desc) { +@@ -3978,6 +4009,33 @@ + if (++j == adapter->num_rx_desc) + j = 0; + } ++#ifdef DEV_NETMAP ++ { ++ /* slot is NULL if we are not in netmap mode */ ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_RX, rxr->me, rxr->next_to_check); ++ /* ++ * we need to restore all buffer addresses in the ring as they might ++ * be in the wrong state if we are exiting from netmap mode. ++ */ ++ for (j = 0; j != adapter->num_rx_desc; ++j) { ++ void *addr; ++ rxbuf = &rxr->rx_buffers[j]; ++ if (rxbuf->m_head == NULL && !slot) ++ continue; ++ addr = slot ? NMB(slot) : rxbuf->m_head->m_data; ++ // XXX load or reload ? 
++ netmap_load_map(rxr->rxtag, rxbuf->map, addr, adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].buffer_addr = htole64(vtophys(addr)); ++ bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); ++ if (slot) ++ slot++; ++ } ++ /* Setup our descriptor indices */ ++ NA(adapter->ifp)->rx_rings[rxr->me].nr_hwcur = rxr->next_to_check; ++ } ++#endif /* DEV_NETMAP */ + + fail: + rxr->next_to_refresh = i; +@@ -4247,6 +4305,14 @@ + + EM_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[rxr->me].si, PI_NET); ++ EM_RX_UNLOCK(rxr); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + for (i = rxr->next_to_check, processed = 0; count != 0;) { + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) +Index: dev/re/if_re.c +=================================================================== +--- dev/re/if_re.c (revision 227552) ++++ dev/re/if_re.c (working copy) +@@ -291,6 +291,10 @@ + static void re_setwol (struct rl_softc *); + static void re_clrwol (struct rl_softc *); + ++#ifdef DEV_NETMAP ++#include <dev/netmap/if_re_netmap.h> ++#endif /* !DEV_NETMAP */ ++ + #ifdef RE_DIAG + static int re_diag (struct rl_softc *); + #endif +@@ -1583,6 +1587,9 @@ + */ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ++#ifdef DEV_NETMAP ++ re_netmap_attach(sc); ++#endif /* DEV_NETMAP */ + #ifdef RE_DIAG + /* + * Perform hardware diagnostic on the original RTL8169. +@@ -1778,6 +1785,9 @@ + bus_dma_tag_destroy(sc->rl_ldata.rl_stag); + } + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + if (sc->rl_parent_tag) + bus_dma_tag_destroy(sc->rl_parent_tag); + +@@ -1952,6 +1962,9 @@ + sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc)); + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) + sc->rl_ldata.rl_tx_desc[i].tx_m = NULL; ++#ifdef DEV_NETMAP ++ re_netmap_tx_init(sc); ++#endif /* DEV_NETMAP */ + /* Set EOR. 
*/ + desc = &sc->rl_ldata.rl_tx_list[sc->rl_ldata.rl_tx_desc_cnt - 1]; + desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOR); +@@ -1979,6 +1992,9 @@ + if ((error = re_newbuf(sc, i)) != 0) + return (error); + } ++#ifdef DEV_NETMAP ++ re_netmap_rx_init(sc); ++#endif /* DEV_NETMAP */ + + /* Flush the RX descriptors */ + +@@ -2035,6 +2051,12 @@ + RL_LOCK_ASSERT(sc); + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings->si, PI_NET); ++ return 0; ++ } ++#endif /* DEV_NETMAP */ + if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) + jumbo = 1; + else +@@ -2276,6 +2298,12 @@ + return; + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + /* Invalidate the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, +@@ -2794,6 +2822,20 @@ + + sc = ifp->if_softc; + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_kring *kring = &NA(ifp)->tx_rings[0]; ++ if (sc->rl_ldata.rl_tx_prodidx != kring->nr_hwcur) { ++ /* kick the tx unit */ ++ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); ++#ifdef RE_TX_MODERATION ++ CSR_WRITE_4(sc, RL_TIMERCNT, 1); ++#endif ++ sc->rl_watchdog_timer = 5; ++ } ++ return; ++ } ++#endif /* DEV_NETMAP */ + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || (sc->rl_flags & RL_FLAG_LINK) == 0) + return; +Index: dev/ixgbe/ixgbe.c +=================================================================== +--- dev/ixgbe/ixgbe.c (revision 227552) ++++ dev/ixgbe/ixgbe.c (working copy) +@@ -313,6 +313,10 @@ + static int fdir_pballoc = 1; + #endif + ++#ifdef DEV_NETMAP ++#include <dev/netmap/ixgbe_netmap.h> ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -578,6 +582,9 
@@ + + ixgbe_add_hw_stats(adapter); + ++#ifdef DEV_NETMAP ++ ixgbe_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("ixgbe_attach: end"); + return (0); + err_late: +@@ -652,6 +659,9 @@ + + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + ixgbe_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(adapter->ifp); +@@ -1719,6 +1729,7 @@ + if (++i == adapter->num_tx_desc) + i = 0; + ++ // XXX should we sync each buffer ? + txbuf->m_head = NULL; + txbuf->eop_index = -1; + } +@@ -2813,6 +2824,10 @@ + struct adapter *adapter = txr->adapter; + struct ixgbe_tx_buf *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old ring contents */ + IXGBE_TX_LOCK(txr); +@@ -2832,6 +2847,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* Clear the EOP index */ + txbuf->eop_index = -1; + } +@@ -3310,6 +3332,20 @@ + + mtx_assert(&txr->tx_mtx, MA_OWNED); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IXGBE_TX_UNLOCK(txr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ IXGBE_TX_LOCK(txr); // the caller is supposed to own the lock ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ ++ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IXGBE_QUEUE_IDLE; + return FALSE; +@@ -3698,6 +3734,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); 
++#endif /* DEV_NETMAP */ + + adapter = rxr->adapter; + ifp = adapter->ifp; +@@ -3721,6 +3761,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + /* + ** Don't allocate mbufs if not + ** doing header split, its wasteful +@@ -4148,6 +4200,18 @@ + + IXGBE_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IXGBE_RX_UNLOCK(rxr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; + u32 rsc, ptype; diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h new file mode 100644 index 0000000000000..0e220e755d68d --- /dev/null +++ b/sys/dev/netmap/if_em_netmap.h @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_em_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap changes for if_em. + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +#include <vm/vm.h> +#include <vm/pmap.h> /* vtophys ? */ +#include <dev/netmap/netmap_kern.h> + +static void em_netmap_block_tasks(struct adapter *); +static void em_netmap_unblock_tasks(struct adapter *); +static int em_netmap_reg(struct ifnet *, int onoff); +static int em_netmap_txsync(void *, u_int, int); +static int em_netmap_rxsync(void *, u_int, int); +static void em_netmap_lock_wrapper(void *, int, u_int); + +static void +em_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = em_netmap_txsync; + na.nm_rxsync = em_netmap_rxsync; + na.nm_lock = em_netmap_lock_wrapper; + na.nm_register = em_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. 
+ */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +em_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +static void +em_netmap_block_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { /* MSIX */ + int i; + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + + for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + taskqueue_block(txr->tq); + taskqueue_drain(txr->tq, &txr->tx_task); + taskqueue_block(rxr->tq); + taskqueue_drain(rxr->tq, &rxr->rx_task); + } + } else { /* legacy */ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->link_task); + taskqueue_drain(adapter->tq, &adapter->que_task); + } +} + + +static void +em_netmap_unblock_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + taskqueue_unblock(txr->tq); + taskqueue_unblock(rxr->tq); + } + } else { /* legacy */ + taskqueue_unblock(adapter->tq); + } +} + +/* + * register-unregister routine + */ +static int +em_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (na == NULL) + return EINVAL; /* no netmap support here */ + + em_disable_intr(adapter); + + /* Tell 
the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + em_netmap_block_tasks(adapter); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit for later restore. + * XXX also if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + em_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + em_init_locked(adapter); /* also enable intr */ + + } + em_netmap_unblock_tasks(adapter); + return (error); +} + +/* + * Reconcile hardware and user view of the transmit ring, see + * ixgbe.c for details. + */ +static int +em_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. 
*/ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &txr->tx_base[j]; + struct em_buffer *txbuf = &txr->tx_buffers[j]; + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + void *addr = NMB(slot); + int len = slot->len; + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->upper.data = 0; + curr->lower.data = + htole32( + adapter->txd_cmd | + (E1000_TXD_CMD_EOP | flags) | + slot->len); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), + ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(txr); + return 0; +} + +/* + * Reconcile kernel and user view of the receive ring, see ixgbe.c + */ +static int +em_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowledge all the received packets. */ + j = rxr->next_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->length); + /* sync through rxr->rxtag, the tag used with the rx_buffers[] maps below */ + bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed: + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also decrease nr_hwavail by the number of released slots + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets.
*/ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + struct em_buffer *rxbuf = &rxr->rx_buffers[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(rxr->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h new file mode 100644 index 0000000000000..0c147063b2112 --- /dev/null +++ b/sys/dev/netmap/if_igb_netmap.h @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_igb_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for igb + * contributed by Ahmed Kooli + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +#include <vm/vm.h> +#include <vm/pmap.h> /* vtophys ? */ +#include <dev/netmap/netmap_kern.h> + +static int igb_netmap_reg(struct ifnet *, int onoff); +static int igb_netmap_txsync(void *, u_int, int); +static int igb_netmap_rxsync(void *, u_int, int); +static void igb_netmap_lock_wrapper(void *, int, u_int); + + +static void +igb_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = igb_netmap_txsync; + na.nm_rxsync = igb_netmap_rxsync; + na.nm_lock = igb_netmap_lock_wrapper; + na.nm_register = igb_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
+ */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +igb_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IGB_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IGB_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IGB_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IGB_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IGB_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IGB_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregister. We are already under core lock. + * only called on the first init or the last unregister. + */ +static int +igb_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + igb_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + igb_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + igb_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). 
+ * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. + * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. + */ +static int +igb_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions. TODO + * + * Instead of reading from the TDH register, we could try to check + * the status bit of descriptor packets. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) /* XXX can it happen ? 
*/ + j -= kring->nkr_num_slots; + int delta = j - txr->next_to_clean; + if (delta) { + /* new tx were completed */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + u32 olinfo_status = 0; + n = 0; + + /* 82575 needs the queue index added */ + if (adapter->hw.mac.type == e1000_82575) + olinfo_status |= txr->me << 4; + + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct igb_tx_buffer *txbuf = &txr->tx_buffers[j]; + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_ADVTXD_DCMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IGB_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = + htole32(olinfo_status | + (len<< E1000_ADVTXD_PAYLEN_SHIFT)); + curr->read.cmd_type_len = + htole32(len | E1000_ADVTXD_DTYP_DATA | + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + /* Set the watchdog */ + txr->queue_status = IGB_QUEUE_WORKING; + txr->watchdog_time = ticks; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), k); + } + if (do_lock) + IGB_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +igb_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_RX_LOCK(rxr); + + /* Sync the ring. 
*/ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + struct igb_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IGB_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? 
lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IGB_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h new file mode 100644 index 0000000000000..a8f34989bcc4c --- /dev/null +++ b/sys/dev/netmap/if_lem_netmap.h @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $FreeBSD$ + * $Id: if_lem_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_lem.c + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +#include <vm/vm.h> +#include <vm/pmap.h> /* vtophys ? */ +#include <dev/netmap/netmap_kern.h> + +static int lem_netmap_reg(struct ifnet *, int onoff); +static int lem_netmap_txsync(void *, u_int, int); +static int lem_netmap_rxsync(void *, u_int, int); +static void lem_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, lem, CTLFLAG_RW, 0, "lem card"); + +static void +lem_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = lem_netmap_txsync; + na.nm_rxsync = lem_netmap_rxsync; + na.nm_lock = lem_netmap_lock_wrapper; + na.nm_register = lem_netmap_reg; + na.buff_size = MCLBYTES; + netmap_attach(&na, 1); +} + + +static void +lem_netmap_lock_wrapper(void *_a, int what, u_int ringid) +{ + struct adapter *adapter = _a; + + /* only one ring here so ignore the ringid */ + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(adapter); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(adapter); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(adapter); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(adapter); + break; + } +} + + +/* + * Reconcile kernel and user view of the transmit ring. 
see ixgbe.c + */ +static int +lem_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(adapter); + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (j >= kring->nkr_num_slots) { /* can it happen ? */ + D("bad TDH %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - adapter->next_tx_to_clean; + if (delta) { + if (delta < 0) + delta += kring->nkr_num_slots; + adapter->next_tx_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &adapter->tx_desc_base[j]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + + curr->upper.data = 0; + /* always interrupt. 
XXX make it conditional */ + curr->lower.data = + htole32( adapter->txd_cmd | len | + (E1000_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(adapter); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. see ixgbe.c + */ +static int +lem_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(adapter); + /* XXX check sync modes */ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowledge all the received packets. */ + j = adapter->next_rx_desc_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + int len = le16toh(adapter->rx_desc_base[j].length) - 4; // CRC + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + + if (len < 0) { + D("bogus pkt size at %d", j); + len = 0; + } + ring->slot[j].len = len; + bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 
0 : j + 1; + } + if (n) { + adapter->next_rx_desc_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. We don't need to set + * the length as it is the same for all slots. + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + curr = &adapter->rx_desc_base[j]; + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? 
lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), j); + } + + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(adapter); + return 0; +} + + +/* + * Register/unregister routine + */ +static int +lem_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + lem_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + /* lem_netmap_block_tasks(adapter); */ +#ifndef EM_LEGACY_IRQ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->rxtx_task); + taskqueue_drain(adapter->tq, &adapter->link_task); +#endif /* !EM_LEGACY_IRQ */ + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it when exiting. + * XXX what about if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + lem_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore non-netmap mode */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + lem_init_locked(adapter); /* also enables intr */ + } + +#ifndef EM_LEGACY_IRQ + taskqueue_unblock(adapter->tq); +#endif /* !EM_LEGACY_IRQ */ + + return (error); +} diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h new file mode 100644 index 0000000000000..efccf3a795bc7 --- /dev/null +++ b/sys/dev/netmap/if_re_netmap.h @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_re_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_re + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +#include <vm/vm.h> +#include <vm/pmap.h> /* vtophys ? 
*/ +#include <dev/netmap/netmap_kern.h> + +static int re_netmap_reg(struct ifnet *, int onoff); +static int re_netmap_txsync(void *, u_int, int); +static int re_netmap_rxsync(void *, u_int, int); +static void re_netmap_lock_wrapper(void *, int, u_int); + +static void +re_netmap_attach(struct rl_softc *sc) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = sc->rl_ifp; + na.separate_locks = 0; + na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt; + na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt; + na.nm_txsync = re_netmap_txsync; + na.nm_rxsync = re_netmap_rxsync; + na.nm_lock = re_netmap_lock_wrapper; + na.nm_register = re_netmap_reg; + na.buff_size = MCLBYTES; + netmap_attach(&na, 1); +} + + +/* + * wrapper to export locks to the generic code + * We should not use the tx/rx locks + */ +static void +re_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct rl_softc *adapter = _a; + + switch (what) { + case NETMAP_CORE_LOCK: + RL_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + RL_UNLOCK(adapter); + break; + + case NETMAP_TX_LOCK: + case NETMAP_RX_LOCK: + case NETMAP_TX_UNLOCK: + case NETMAP_RX_UNLOCK: + D("invalid lock call %d, no tx/rx locks here", what); + break; + } +} + + +/* + * support for netmap register/unregister. We are already under core lock. + * only called on the first register or the last unregister. + */ +static int +re_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct rl_softc *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + re_stop(adapter); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit and restore it */ + na->if_transmit = ifp->if_transmit; + /* XXX if_start and if_qflush ??? 
*/ + ifp->if_transmit = netmap_start; + + re_init_locked(adapter); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + re_init_locked(adapter); /* also enables intr */ + } + return (error); + +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows (translating the -1 to nkr_num_slots - 1), + * subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + */ +static int +re_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + + /* Sync the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* record completed transmissions */ + for (n = 0, j = sc->rl_ldata.rl_tx_considx; + j != sc->rl_ldata.rl_tx_prodidx; + n++, j = RL_TX_DESC_NXT(sc, j)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[j].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = j; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwavail += n; + } + + 
/* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + /* we trust prodidx, not hwcur */ + j = kring->nr_hwcur = sc->rl_ldata.rl_tx_prodidx; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[j]; + int cmd = slot->len | RL_TDESC_CMD_EOF | + RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; + void *addr = NMB(slot); + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_TDESC_CMD_EOR; + + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + slot->flags &= ~NS_REPORT; + desc->rl_cmdstat = htole32(cmd); + bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + sc->rl_ldata.rl_tx_prodidx = kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + + /* start ? */ + CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); + } + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. 
+ * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +re_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + /* XXX check sync modes */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* + * The device uses all the buffers in the ring, so we need + * another termination condition in addition to RL_RDESC_STAT_OWN + * cleared (all buffers could have it cleared). The easiest one + * is to limit the amount of data reported up to 'lim' + */ + j = sc->rl_ldata.rl_rx_prodidx; + for (n = kring->nr_hwavail; n < lim ; n++) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[j]; + uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); + uint32_t total_len; + + if ((rxstat & RL_RDESC_STAT_OWN) != 0) + break; + total_len = rxstat & sc->rl_rxlenmask; + /* XXX subtract crc */ + total_len = (total_len < 4) ? 0 : total_len - 4; + kring->ring->slot[j].len = total_len; + /* sync was in re_newbuf() */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_POSTREAD); + j = RL_RX_DESC_NXT(sc, j); + } + if (n != kring->nr_hwavail) { + sc->rl_ldata.rl_rx_prodidx = j; + sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; + kring->nr_hwavail = n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. 
+ * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[j]; + int cmd = na->buff_size | RL_RDESC_CMD_OWN; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_RDESC_CMD_EOR; + + desc->rl_cmdstat = htole32(cmd); + slot->flags &= ~NS_REPORT; + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_PREREAD); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + /* Flush the RX DMA ring */ + + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + +static void +re_netmap_tx_init(struct rl_softc *sc) +{ + struct rl_txdesc *txd; + struct rl_desc *desc; + int i; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + + /* slot is NULL if we are not in netmap mode */ + if (!slot) + return; + /* in netmap mode, overwrite addresses and maps */ + txd = sc->rl_ldata.rl_tx_desc; + desc = sc->rl_ldata.rl_tx_list; + + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_load_map(sc->rl_ldata.rl_tx_mtag, + txd[i].tx_dmamap, addr, na->buff_size); + } +} + +static void +re_netmap_rx_init(struct rl_softc *sc) +{ + /* slot is NULL if we are not in netmap mode */ + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); + struct rl_desc *desc = sc->rl_ldata.rl_rx_list; + uint32_t cmdstat; + int i; + + if (!slot) + return; + + for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + cmdstat = slot[i].len = na->buff_size; // XXX + if (i == sc->rl_ldata.rl_rx_desc_cnt - 1) + cmdstat |= RL_RDESC_CMD_EOR; + desc[i].rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); + + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + sc->rl_ldata.rl_rx_desc[i].rx_dmamap, + addr, na->buff_size); + } +} diff --git a/sys/dev/netmap/ixgbe_netmap.h 
b/sys/dev/netmap/ixgbe_netmap.h new file mode 100644 index 0000000000000..a4d5491d67f12 --- /dev/null +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: ixgbe_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for ixgbe + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +// #include <vm/vm.h> +// #include <vm/pmap.h> /* vtophys ? 
*/ +#include <dev/netmap/netmap_kern.h> + +static int ixgbe_netmap_reg(struct ifnet *, int onoff); +static int ixgbe_netmap_txsync(void *, u_int, int); +static int ixgbe_netmap_rxsync(void *, u_int, int); +static void ixgbe_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, ixgbe, CTLFLAG_RW, 0, "ixgbe card"); + +static void +ixgbe_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = ixgbe_netmap_txsync; + na.nm_rxsync = ixgbe_netmap_rxsync; + na.nm_lock = ixgbe_netmap_lock_wrapper; + na.nm_register = ixgbe_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +ixgbe_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IXGBE_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IXGBE_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IXGBE_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IXGBE_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first init or the last unregister. 
+ */ +static int +ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + ixgbe_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + ixgbe_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + ixgbe_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. + * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. 
+ */ +static int +ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n = 0, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[j]; + union ixgbe_adv_tx_desc *curr = &txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + IXGBE_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = 0; + curr->read.cmd_type_len = + htole32(txr->txd_cmd | len | + (IXGBE_ADVTXD_DTYP_DATA | + IXGBE_ADVTXD_DCMD_IFCS | + IXGBE_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), k); + } + + if (n == 0 || kring->nr_hwavail < 1) { + /* record completed transmissions. TODO + * + * The datasheet discourages the use of TDH to find out the + * number of sent packets; the right way to do so, is to check + * the DD bit inside the status of a packet descriptor. On the + * other hand, we avoid to set the `report status' bit for + * *all* outgoing packets (kind of interrupt mitigation), + * consequently the DD bit is not guaranteed to be set for all + * the packets: thats way, for the moment we continue to use + * TDH. + */ + j = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + ring->avail = kring->nr_hwavail; + } + } + + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. 
+ */ +static int +ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & IXGBE_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. 
*/ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + struct ixgbe_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c new file mode 100644 index 0000000000000..7645a4e6e32bd --- /dev/null +++ b/sys/dev/netmap/netmap.c @@ -0,0 +1,1762 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap.c 9662 2011-11-16 13:18:06Z luigi $ + * + * This module supports memory mapped access to network devices, + * see netmap(4). + * + * The module uses a large, memory pool allocated by the kernel + * and accessible as mmapped memory by multiple userspace threads/processes. + * The memory pool contains packet buffers and "netmap rings", + * i.e. user-accessible copies of the interface's queues. + * + * Access to the network card works like this: + * 1. a process/thread issues one or more open() on /dev/netmap, to create + * select()able file descriptor on which events are reported. + * 2. on each descriptor, the process issues an ioctl() to identify + * the interface that should report events to the file descriptor. + * 3. on each descriptor, the process issues an mmap() request to + * map the shared memory region within the process' address space. + * The list of interesting queues is indicated by a location in + * the shared memory region. + * 4. using the functions in the netmap(4) userspace API, a process + * can look up the occupation state of a queue, access memory buffers, + * and retrieve received packets or enqueue packets to transmit. + * 5. 
using some ioctl()s the process can synchronize the userspace view + * of the queue with the actual status in the kernel. This includes both + * receiving the notification of new packets, and transmitting new + * packets on the output interface. + * 6. select() or poll() can be used to wait for events on individual + * transmit or receive queues (or all queues for a given interface). + */ + +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/module.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* cdevsw struct */ +#include <sys/uio.h> /* uio struct */ +#include <sys/sockio.h> +#include <sys/socketvar.h> /* struct socket */ +#include <sys/malloc.h> +#include <sys/mman.h> /* PROT_EXEC */ +#include <sys/poll.h> +#include <vm/vm.h> /* vtophys */ +#include <vm/pmap.h> /* vtophys */ +#include <sys/socket.h> /* sockaddrs */ +#include <machine/bus.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/bpf.h> /* BIOCIMMEDIATE */ +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <machine/bus.h> /* bus_dmamap_* */ + +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + +/* + * lock and unlock for the netmap memory allocator + */ +#define NMA_LOCK() mtx_lock(&netmap_mem_d->nm_mtx); +#define NMA_UNLOCK() mtx_unlock(&netmap_mem_d->nm_mtx); + +/* + * Default amount of memory pre-allocated by the module. + * We start with a large size and then shrink our demand + * according to what is avalable when the module is loaded. 
+ * At the moment the block is contiguous, but we can easily + * restrict our demand to smaller units (16..64k) + */ +#define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE) +static void * netmap_malloc(size_t size, const char *msg); +static void netmap_free(void *addr, const char *msg); + +/* + * Allocator for a pool of packet buffers. For each buffer we have + * one entry in the bitmap to signal the state. Allocation scans + * the bitmap, but since this is done only on attach, we are not + * too worried about performance + * XXX if we need to allocate small blocks, a translation + * table is used both for kernel virtual address and physical + * addresses. + */ +struct netmap_buf_pool { + u_int total_buffers; /* total buffers. */ + u_int free; + u_int bufsize; + char *base; /* buffer base address */ + uint32_t *bitmap; /* one bit per buffer, 1 means free */ +}; +struct netmap_buf_pool nm_buf_pool; +/* XXX move these two vars back into netmap_buf_pool */ +u_int netmap_total_buffers; +char *netmap_buffer_base; + +/* user-controlled variables */ +int netmap_verbose; + +static int no_timestamp; /* don't timestamp on rxsync */ + +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers, + CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers"); +SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers, + CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers"); + +/* + * Allocate n buffers from the ring, and fill the slot. + * Buffer 0 is the 'junk' buffer. 
+ */ +static void +netmap_new_bufs(struct netmap_buf_pool *p, struct netmap_slot *slot, u_int n) +{ + uint32_t bi = 0; /* index in the bitmap */ + uint32_t mask, j, i = 0; /* slot counter */ + + if (n > p->free) { + D("only %d out of %d buffers available", i, n); + return; + } + /* termination is guaranteed by p->free */ + while (i < n && p->free > 0) { + uint32_t cur = p->bitmap[bi]; + if (cur == 0) { /* bitmask is fully used */ + bi++; + continue; + } + /* locate a slot */ + for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ; + p->bitmap[bi] &= ~mask; /* slot in use */ + p->free--; + slot[i].buf_idx = bi*32+j; + slot[i].len = p->bufsize; + slot[i].flags = NS_BUF_CHANGED; + i++; + } + ND("allocated %d buffers, %d available", n, p->free); +} + + +static void +netmap_free_buf(struct netmap_buf_pool *p, uint32_t i) +{ + uint32_t pos, mask; + if (i >= p->total_buffers) { + D("invalid free index %d", i); + return; + } + pos = i / 32; + mask = 1 << (i % 32); + if (p->bitmap[pos] & mask) { + D("slot %d already free", i); + return; + } + p->bitmap[pos] |= mask; + p->free++; +} + + +/* Descriptor of the memory objects handled by our memory allocator. */ +struct netmap_mem_obj { + TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the + chain. */ + int nmo_used; /* flag set on used memory objects. */ + size_t nmo_size; /* size of the memory area reserved for the + object. */ + void *nmo_data; /* pointer to the memory area. */ +}; + +/* Wrap our memory objects to make them ``chainable``. */ +TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj); + + +/* Descriptor of our custom memory allocator. */ +struct netmap_mem_d { + struct mtx nm_mtx; /* lock used to handle the chain of memory + objects. */ + struct netmap_mem_obj_h nm_molist; /* list of memory objects */ + size_t nm_size; /* total amount of memory used for rings etc. 
*/ + size_t nm_totalsize; /* total amount of allocated memory + (the difference is used for buffers) */ + size_t nm_buf_start; /* offset of packet buffers. + This is page-aligned. */ + size_t nm_buf_len; /* total memory for buffers */ + void *nm_buffer; /* pointer to the whole pre-allocated memory + area. */ +}; + + +/* Structure associated to each thread which registered an interface. */ +struct netmap_priv_d { + struct netmap_if *np_nifp; /* netmap interface descriptor. */ + + struct ifnet *np_ifp; /* device for which we hold a reference */ + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; +}; + + +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */ + + +static d_mmap_t netmap_mmap; +static d_ioctl_t netmap_ioctl; +static d_poll_t netmap_poll; + +#ifdef NETMAP_KEVENT +static d_kqfilter_t netmap_kqfilter; +#endif + +static struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_mmap = netmap_mmap, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, +#ifdef NETMAP_KEVENT + .d_kqfilter = netmap_kqfilter, +#endif +}; + +#ifdef NETMAP_KEVENT +static int netmap_kqread(struct knote *, long); +static int netmap_kqwrite(struct knote *, long); +static void netmap_kqdetach(struct knote *); + +static struct filterops netmap_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqread, +}; + +static struct filterops netmap_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqwrite, +}; + +/* + * support for the kevent() system call. + * + * This is the kevent filter, and is executed each time a new event + * is triggered on the device. This function execute some operation + * depending on the received filter. 
+ * + * The implementation should test the filters and should implement + * filter operations we are interested on (a full list in /sys/event.h). + * + * On a match we should: + * - set kn->kn_fop + * - set kn->kn_hook + * - call knlist_add() to deliver the event to the application. + * + * Return 0 if the event should be delivered to the application. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + /* declare variables needed to read/write */ + + switch(kn->kn_filter) { + case EVFILT_READ: + if (netmap_verbose) + D("%s kqfilter: EVFILT_READ" ifp->if_xname); + + /* read operations */ + kn->kn_fop = &netmap_read_filterops; + break; + + case EVFILT_WRITE: + if (netmap_verbose) + D("%s kqfilter: EVFILT_WRITE" ifp->if_xname); + + /* write operations */ + kn->kn_fop = &netmap_write_filterops; + break; + + default: + if (netmap_verbose) + D("%s kqfilter: invalid filter" ifp->if_xname); + return(EINVAL); + } + + kn->kn_hook = 0;// + knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0); + + return (0); +} +#endif /* NETMAP_KEVENT */ + +/* + * File descriptor's private data destructor. + * + * Call nm_register(ifp,0) to stop netmap mode on the interface and + * revert to normal operation. We expect that np_ifp has not gone. + */ +static void +netmap_dtor(void *data) +{ + struct netmap_priv_d *priv = data; + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + struct netmap_if *nifp = priv->np_nifp; + + if (0) + printf("%s starting for %p ifp %p\n", __FUNCTION__, priv, + priv ? priv->np_ifp : NULL); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + + na->refcount--; + if (na->refcount <= 0) { /* last instance */ + u_int i; + + D("deleting last netmap instance for %s", ifp->if_xname); + /* + * there is a race here with *_netmap_task() and + * netmap_poll(), which don't run under NETMAP_CORE_LOCK. 
+ * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP + * (aka NETMAP_DELETING(na)) are a unique marker that the + * device is dying. + * Before destroying stuff we sleep a bit, and then complete + * the job. NIOCREG should realize the condition and + * loop until they can continue; the other routines + * should check the condition at entry and quit if + * they cannot run. + */ + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCUNREG", 4); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + /* Wake up any sleeping threads. netmap_poll will + * then return POLLERR + */ + for (i = 0; i < na->num_queues + 2; i++) { + selwakeuppri(&na->tx_rings[i].si, PI_NET); + selwakeuppri(&na->rx_rings[i].si, PI_NET); + } + /* release all buffers */ + NMA_LOCK(); + for (i = 0; i < na->num_queues + 1; i++) { + int j, lim; + struct netmap_ring *ring; + + ND("tx queue %d", i); + ring = na->tx_rings[i].ring; + lim = na->tx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + + ND("rx queue %d", i); + ring = na->rx_rings[i].ring; + lim = na->rx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + } + NMA_UNLOCK(); + netmap_free(na->tx_rings[0].ring, "shadow rings"); + wakeup(na); + } + netmap_free(nifp, "nifp"); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + if_rele(ifp); + + bzero(priv, sizeof(*priv)); /* XXX for safety */ + free(priv, M_DEVBUF); +} + + + +/* + * Create and return a new ``netmap_if`` object, and possibly also + * rings and packet buffors. + * + * Return NULL on failure. 
+ */ +static void * +netmap_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + struct netmap_ring *ring; + char *buff; + u_int i, len, ofs; + u_int n = na->num_queues + 1; /* shorthand, include stack queue */ + + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t); + nifp = netmap_malloc(len, "nifp"); + if (nifp == NULL) + return (NULL); + + /* initialize base fields */ + *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues; + strncpy(nifp->ni_name, ifname, IFNAMSIZ); + + (na->refcount)++; /* XXX atomic ? we are under lock */ + if (na->refcount > 1) + goto final; + + /* + * If this is the first instance, allocate the shadow rings and + * buffers for this card (one for each hw queue, one for the host). + * The rings are contiguous, but have variable size. + * The entire block is reachable at + * na->tx_rings[0].ring + */ + + len = n * (2 * sizeof(struct netmap_ring) + + (na->num_tx_desc + na->num_rx_desc) * + sizeof(struct netmap_slot) ); + buff = netmap_malloc(len, "shadow rings"); + if (buff == NULL) { + D("failed to allocate %d bytes for %s shadow ring", + len, ifname); +error: + (na->refcount)--; + netmap_free(nifp, "nifp, rings failed"); + return (NULL); + } + /* do we have the bufers ? we are in need of num_tx_desc buffers for + * each tx ring and num_tx_desc buffers for each rx ring. */ + len = n * (na->num_tx_desc + na->num_rx_desc); + NMA_LOCK(); + if (nm_buf_pool.free < len) { + NMA_UNLOCK(); + netmap_free(buff, "not enough bufs"); + goto error; + } + /* + * in the kring, store the pointers to the shared rings + * and initialize the rings. We are under NMA_LOCK(). 
+ */ + ofs = 0; + for (i = 0; i < n; i++) { + struct netmap_kring *kring; + int numdesc; + + /* Transmit rings */ + kring = &na->tx_rings[i]; + numdesc = na->num_tx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + + /* + * IMPORTANT: + * Always keep one slot empty, so we can detect new + * transmissions comparing cur and nr_hwcur (they are + * the same only if there are no new transmissions). + */ + ring->avail = kring->nr_hwavail = numdesc - 1; + ring->cur = kring->nr_hwcur = 0; + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + + /* Receive rings */ + kring = &na->rx_rings[i]; + numdesc = na->num_rx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + ring->cur = kring->nr_hwcur = 0; + ring->avail = kring->nr_hwavail = 0; /* empty */ + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + } + NMA_UNLOCK(); + for (i = 0; i < n+1; i++) { + // XXX initialize the selrecord structs. + } +final: + /* + * fill the slots for the rx and tx queues. They contain the offset + * between the ring and nifp, so the information is usable in + * userspace to reach the ring from the nifp. 
+ */ + for (i = 0; i < n; i++) { + char *base = (char *)nifp; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = + (char *)na->tx_rings[i].ring - base; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] = + (char *)na->rx_rings[i].ring - base; + } + return (nifp); +} + + +/* + * mmap(2) support for the "netmap" device. + * + * Expose all the memory previously allocated by our custom memory + * allocator: this way the user has only to issue a single mmap(2), and + * can work on all the data structures flawlessly. + * + * Return 0 on success, -1 otherwise. + */ +static int +#if __FreeBSD_version < 900000 +netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, + int nprot) +#else +netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, __unused vm_memattr_t *memattr) +#endif +{ + if (nprot & PROT_EXEC) + return (-1); // XXX -1 or EINVAL ? + ND("request for offset 0x%x", (uint32_t)offset); + *paddr = vtophys(netmap_mem_d->nm_buffer) + offset; + + return (0); +} + + +/* + * handler for synchronization of the queues from/to the host + */ +static void +netmap_sync_to_host(struct netmap_adapter *na) +{ + struct netmap_kring *kring = &na->tx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + struct mbuf *head = NULL, *tail = NULL, *m; + u_int n, lim = kring->nkr_num_slots - 1; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* Take packets from hwcur to cur and pass them up. + * In case of no buffers we give up. At the end of the loop, + * the queue is drained in all cases. + */ + for (n = kring->nr_hwcur; n != ring->cur;) { + struct netmap_slot *slot = &ring->slot[n]; + + n = (n == lim) ? 
0 : n + 1; + if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { + D("bad pkt at %d len %d", n, slot->len); + continue; + } + m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL); + + if (m == NULL) + break; + if (tail) + tail->m_nextpkt = m; + else + head = m; + tail = m; + m->m_nextpkt = NULL; + } + kring->nr_hwcur = ring->cur; + kring->nr_hwavail = ring->avail = lim; + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* send packets up, outside the lock */ + while ((m = head) != NULL) { + head = head->m_nextpkt; + m->m_nextpkt = NULL; + m->m_pkthdr.rcvif = na->ifp; + if (netmap_verbose & NM_VERB_HOST) + D("sending up pkt %p size %d", m, m->m_pkthdr.len); + (na->ifp->if_input)(na->ifp, m); + } +} + +/* + * This routine also does the selrecord if called from the poll handler + * (we know because td != NULL). + */ +static void +netmap_sync_from_host(struct netmap_adapter *na, struct thread *td) +{ + struct netmap_kring *kring = &na->rx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + int delta; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* skip past packets processed by userspace, + * and then sync cur/avail with hwcur/hwavail + */ + delta = ring->cur - kring->nr_hwcur; + if (delta < 0) + delta += kring->nkr_num_slots; + kring->nr_hwavail -= delta; + kring->nr_hwcur = ring->cur; + ring->avail = kring->nr_hwavail; + if (ring->avail == 0 && td) + selrecord(td, &kring->si); + if (ring->avail && (netmap_verbose & NM_VERB_HOST)) + D("%d pkts from stack", ring->avail); + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); +} + + +/* + * get a refcounted reference to an interface. + * Return ENXIO if the interface does not exist, EINVAL if netmap + * is not supported by the interface. + * If successful, hold a reference. 
+ */ +static int +get_ifp(const char *name, struct ifnet **ifp) +{ + *ifp = ifunit_ref(name); + if (*ifp == NULL) + return (ENXIO); + /* can do this if the capability exists and if_pspare[0] + * points to the netmap descriptor. + */ + if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) + return 0; /* valid pointer, we hold the refcount */ + if_rele(*ifp); + return EINVAL; // not NETMAP capable +} + + +/* + * Error routine called when txsync/rxsync detects an error. + * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Return 1 on reinit. + */ +int +netmap_ring_reinit(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int i, lim = kring->nkr_num_slots - 1; + int errors = 0; + + D("called for %s", kring->na->ifp->if_xname); + if (ring->cur > lim) + errors++; + for (i = 0; i <= lim; i++) { + u_int idx = ring->slot[i].buf_idx; + u_int len = ring->slot[i].len; + if (idx < 2 || idx >= netmap_total_buffers) { + if (!errors++) + D("bad buffer at slot %d idx %d len %d ", i, idx, len); + ring->slot[i].buf_idx = 0; + ring->slot[i].len = 0; + } else if (len > NETMAP_BUF_SIZE) { + ring->slot[i].len = 0; + if (!errors++) + D("bad len %d at slot %d idx %d", + len, i, idx); + } + } + if (errors) { + int pos = kring - kring->na->tx_rings; + int n = kring->na->num_queues + 2; + + D("total %d errors", errors); + errors++; + D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", + kring->na->ifp->if_xname, + pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + ring->cur, kring->nr_hwcur, + ring->avail, kring->nr_hwavail); + ring->cur = kring->nr_hwcur; + ring->avail = kring->nr_hwavail; + ring->flags |= NR_REINIT; + kring->na->flags |= NR_REINIT; + } + return (errors ? 1 : 0); +} + +/* + * Clean the reinit flag for our rings. 
+ * XXX at the moment, clear for all rings + */ +static void +netmap_clean_reinit(struct netmap_adapter *na) +{ + //struct netmap_kring *kring; + u_int i; + + na->flags &= ~NR_REINIT; + D("--- NR_REINIT reset on %s", na->ifp->if_xname); + for (i = 0; i < na->num_queues + 1; i++) { + na->tx_rings[i].ring->flags &= ~NR_REINIT; + na->rx_rings[i].ring->flags &= ~NR_REINIT; + } +} + +/* + * Set the ring ID. For devices with a single queue, a request + * for all rings is the same as a single ring. + */ +static int +netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +{ + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + void *adapter = na->ifp->if_softc; /* shorthand */ + u_int i = ringid & NETMAP_RING_MASK; + /* first time we don't lock */ + int need_lock = (priv->np_qfirst != priv->np_qlast); + + if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) { + D("invalid ring id %d", i); + return (EINVAL); + } + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + priv->np_ringid = ringid; + if (ringid & NETMAP_SW_RING) { + priv->np_qfirst = na->num_queues; + priv->np_qlast = na->num_queues + 1; + } else if (ringid & NETMAP_HW_RING) { + priv->np_qfirst = i; + priv->np_qlast = i + 1; + } else { + priv->np_qfirst = 0; + priv->np_qlast = na->num_queues; + } + priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + if (ringid & NETMAP_SW_RING) + D("ringid %s set to SW RING", ifp->if_xname); + else if (ringid & NETMAP_HW_RING) + D("ringid %s set to HW RING %d", ifp->if_xname, + priv->np_qfirst); + else + D("ringid %s set to all %d HW RINGS", ifp->if_xname, + priv->np_qlast); + return 0; +} + +/* + * ioctl(2) support for the "netmap" device. + * + * Following a list of accepted commands: + * - NIOCGINFO + * - SIOCGIFADDR just for convenience + * - NIOCREGIF + * - NIOCUNREGIF + * - NIOCTXSYNC + * - NIOCRXSYNC + * + * Return 0 on success, errno otherwise. 
+ */ +static int +netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, + __unused int fflag, __unused struct thread *td) +{ + struct netmap_priv_d *priv = NULL; + struct ifnet *ifp; + struct nmreq *nmr = (struct nmreq *) data; + struct netmap_adapter *na; + void *adapter; + int error; + u_int i; + struct netmap_if *nifp; + + error = devfs_get_cdevpriv((void **)&priv); + if (error != ENOENT && error != 0) + return (error); + + error = 0; /* Could be ENOENT */ + switch (cmd) { + case NIOCGINFO: /* return capabilities etc */ + /* memsize is always valid */ + nmr->nr_memsize = netmap_mem_d->nm_totalsize; + nmr->nr_offset = 0; + nmr->nr_numrings = 0; + nmr->nr_numslots = 0; + if (nmr->nr_name[0] == '\0') /* just get memory info */ + break; + error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */ + if (error) + break; + na = NA(ifp); /* retrieve netmap_adapter */ + nmr->nr_numrings = na->num_queues; + nmr->nr_numslots = na->num_tx_desc; + if_rele(ifp); /* return the refcount */ + break; + + case NIOCREGIF: + if (priv != NULL) /* thread already registered */ + return netmap_set_ringid(priv, nmr->nr_ringid); + /* find the interface and a reference */ + error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ + if (error) + break; + na = NA(ifp); /* retrieve netmap adapter */ + adapter = na->ifp->if_softc; /* shorthand */ + /* + * Allocate the private per-thread structure. + * XXX perhaps we can use a blocking malloc ? 
+ */ + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) { + error = ENOMEM; + if_rele(ifp); /* return the refcount */ + break; + } + + + for (i = 10; i > 0; i--) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + if (!NETMAP_DELETING(na)) + break; + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCREGIF", hz/10); + } + if (i == 0) { + D("too many NIOCREGIF attempts, give up"); + error = EINVAL; + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + priv->np_ifp = ifp; /* store the reference */ + error = netmap_set_ringid(priv, nmr->nr_ringid); + if (error) + goto error; + priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na); + if (nifp == NULL) { /* allocation failed */ + error = ENOMEM; + } else if (ifp->if_capenable & IFCAP_NETMAP) { + /* was already set */ + } else { + /* Otherwise set the card in netmap mode + * and make it use the shared buffers. + */ + error = na->nm_register(ifp, 1); /* mode on */ + if (error) { + /* + * do something similar to netmap_dtor(). + */ + netmap_free(na->tx_rings[0].ring, "rings, reg.failed"); + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = NULL; + na->refcount--; + netmap_free(nifp, "nifp, rings failed"); + nifp = NULL; + } + } + + if (error) { /* reg. failed, release priv and ref */ +error: + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + error = devfs_set_cdevpriv(priv, netmap_dtor); + + if (error != 0) { + /* could not assign the private storage for the + * thread, call the destructor explicitly. 
+ */ + netmap_dtor(priv); + break; + } + + /* return the offset of the netmap_if object */ + nmr->nr_numrings = na->num_queues; + nmr->nr_numslots = na->num_tx_desc; + nmr->nr_memsize = netmap_mem_d->nm_totalsize; + nmr->nr_offset = + ((char *) nifp - (char *) netmap_mem_d->nm_buffer); + break; + + case NIOCUNREGIF: + if (priv == NULL) + return (ENXIO); + + /* the interface is unregistered inside the + destructor of the private data. */ + devfs_clear_cdevpriv(); + break; + + case NIOCTXSYNC: + case NIOCRXSYNC: + if (priv == NULL) + return (ENXIO); + ifp = priv->np_ifp; /* we have a reference */ + na = NA(ifp); /* retrieve netmap adapter */ + adapter = ifp->if_softc; /* shorthand */ + + if (na->flags & NR_REINIT) + netmap_clean_reinit(na); + + if (priv->np_qfirst == na->num_queues) { + /* queues to/from host */ + if (cmd == NIOCTXSYNC) + netmap_sync_to_host(na); + else + netmap_sync_from_host(na, NULL); + return error; + } + + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + if (cmd == NIOCTXSYNC) { + struct netmap_kring *kring = &na->tx_rings[i]; + if (netmap_verbose & NM_VERB_TXSYNC) + D("sync tx ring %d cur %d hwcur %d", + i, kring->ring->cur, + kring->nr_hwcur); + na->nm_txsync(adapter, i, 1 /* do lock */); + if (netmap_verbose & NM_VERB_TXSYNC) + D("after sync tx ring %d cur %d hwcur %d", + i, kring->ring->cur, + kring->nr_hwcur); + } else { + na->nm_rxsync(adapter, i, 1 /* do lock */); + microtime(&na->rx_rings[i].ring->ts); + } + } + + break; + + case BIOCIMMEDIATE: + case BIOCGHDRCMPLT: + case BIOCSHDRCMPLT: + case BIOCSSEESENT: + D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); + break; + + default: + { + /* + * allow device calls + */ + struct socket so; + bzero(&so, sizeof(so)); + error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ + if (error) + break; + so.so_vnet = ifp->if_vnet; + // so->so_proto not null. 
+ error = ifioctl(&so, cmd, data, td); + if_rele(ifp); + } + } + + return (error); +} + + +/* + * select(2) and poll(2) handlers for the "netmap" device. + * + * Can be called for one or more queues. + * Return true the event mask corresponding to ready events. + * If there are no ready events, do a selrecord on either individual + * selfd or on the global one. + * Device-dependent parts (locking and sync of tx/rx rings) + * are done through callbacks. + */ +static int +netmap_poll(__unused struct cdev *dev, int events, struct thread *td) +{ + struct netmap_priv_d *priv = NULL; + struct netmap_adapter *na; + struct ifnet *ifp; + struct netmap_kring *kring; + u_int i, check_all, want_tx, want_rx, revents = 0; + void *adapter; + + if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + return POLLERR; + + ifp = priv->np_ifp; + // XXX check for deleting() ? + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) + return POLLERR; + + if (netmap_verbose & 0x8000) + D("device %s events 0x%x", ifp->if_xname, events); + want_tx = events & (POLLOUT | POLLWRNORM); + want_rx = events & (POLLIN | POLLRDNORM); + + adapter = ifp->if_softc; + na = NA(ifp); /* retrieve netmap adapter */ + + /* pending reinit, report up as a poll error. Pending + * reads and writes are lost. + */ + if (na->flags & NR_REINIT) { + netmap_clean_reinit(na); + revents |= POLLERR; + } + /* how many queues we are scanning */ + i = priv->np_qfirst; + if (i == na->num_queues) { /* from/to host */ + if (priv->np_txpoll || want_tx) { + /* push any packets up, then we are always ready */ + kring = &na->tx_rings[i]; + netmap_sync_to_host(na); + revents |= want_tx; + } + if (want_rx) { + kring = &na->rx_rings[i]; + if (kring->ring->avail == 0) + netmap_sync_from_host(na, td); + if (kring->ring->avail > 0) { + revents |= want_rx; + } + } + return (revents); + } + + /* + * check_all is set if the card has more than one queue and + * the client is polling all of them. 
If true, we sleep on + * the "global" selfd, otherwise we sleep on individual selfd + * (we can only sleep on one of them per direction). + * The interrupt routine in the driver should always wake on + * the individual selfd, and also on the global one if the card + * has more than one ring. + * + * If the card has only one lock, we just use that. + * If the card has separate ring locks, we just use those + * unless we are doing check_all, in which case the whole + * loop is wrapped by the global lock. + * We acquire locks only when necessary: if poll is called + * when buffers are available, we can just return without locks. + * + * rxsync() is only called if we run out of buffers on a POLLIN. + * txsync() is called if we run out of buffers on POLLOUT, or + * there are pending packets to send. The latter can be disabled + * passing NETMAP_NO_TX_POLL in the NIOCREG call. + */ + check_all = (i + 1 != priv->np_qlast); + + /* + * core_lock indicates what to do with the core lock. + * The core lock is used when either the card has no individual + * locks, or it has individual locks but we are cheking all + * rings so we need the core lock to avoid missing wakeup events. + * + * It has three possible states: + * NO_CL we don't need to use the core lock, e.g. + * because we are protected by individual locks. + * NEED_CL we need the core lock. In this case, when we + * call the lock routine, move to LOCKED_CL + * to remember to release the lock once done. + * LOCKED_CL core lock is set, so we need to release it. + */ + enum {NO_CL, NEED_CL, LOCKED_CL }; + int core_lock = (check_all || !na->separate_locks) ? + NEED_CL:NO_CL; + /* + * We start with a lock free round which is good if we have + * data available. If this fails, then lock and call the sync + * routines. 
+ */ + for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_rx; + want_rx = 0; /* also breaks the loop */ + } + } + for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_tx; + want_tx = 0; /* also breaks the loop */ + } + } + + /* + * If we to push packets out (priv->np_txpoll) or want_tx is + * still set, we do need to run the txsync calls (on all rings, + * to avoid that the tx rings stall). + */ + if (priv->np_txpoll || want_tx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (!want_tx && kring->ring->cur == kring->nr_hwcur) + continue; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_LOCK, i); + if (netmap_verbose & NM_VERB_TXSYNC) + D("send %d on %s %d", + kring->ring->cur, + ifp->if_xname, i); + if (na->nm_txsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + + if (want_tx) { + if (kring->ring->avail > 0) { + /* stop at the first ring. We don't risk + * starvation. + */ + revents |= want_tx; + want_tx = 0; + } else if (!check_all) + selrecord(td, &kring->si); + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_UNLOCK, i); + } + } + + /* + * now if want_rx is still set we need to lock and rxsync. + * Do it on all rings because otherwise we starve. 
+ */ + if (want_rx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_LOCK, i); + + if (na->nm_rxsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + if (no_timestamp == 0 || + kring->ring->flags & NR_TIMESTAMP) + microtime(&kring->ring->ts); + + if (kring->ring->avail > 0) + revents |= want_rx; + else if (!check_all) + selrecord(td, &kring->si); + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_UNLOCK, i); + } + } + if (check_all && revents == 0) { + i = na->num_queues + 1; /* the global queue */ + if (want_tx) + selrecord(td, &na->tx_rings[i].si); + if (want_rx) + selrecord(td, &na->rx_rings[i].si); + } + if (core_lock == LOCKED_CL) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + + return (revents); +} + +/*------- driver support routines ------*/ + +/* + * Initialize a ``netmap_adapter`` object created by driver on attach. + * We allocate a block of memory with room for a struct netmap_adapter + * plus two sets of N+2 struct netmap_kring (where N is the number + * of hardware rings): + * krings 0..N-1 are for the hardware queues. + * kring N is for the host stack queue + * kring N+1 is only used for the selinfo for all queues. + * Return 0 on success, ENOMEM otherwise. 
+ */ +int +netmap_attach(struct netmap_adapter *na, int num_queues) +{ + int n = num_queues + 2; + int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring); + void *buf; + struct ifnet *ifp = na->ifp; + + if (ifp == NULL) { + D("ifp not set, giving up"); + return EINVAL; + } + na->refcount = 0; + na->num_queues = num_queues; + + buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); + if (buf) { + ifp->if_pspare[0] = buf; + na->tx_rings = (void *)((char *)buf + sizeof(*na)); + na->rx_rings = na->tx_rings + n; + bcopy(na, buf, sizeof(*na)); + ifp->if_capabilities |= IFCAP_NETMAP; + } + D("%s for %s", buf ? "ok" : "failed", ifp->if_xname); + + return (buf ? 0 : ENOMEM); +} + + +/* + * Free the allocated memory linked to the given ``netmap_adapter`` + * object. + */ +void +netmap_detach(struct ifnet *ifp) +{ + u_int i; + struct netmap_adapter *na = NA(ifp); + + if (!na) + return; + + for (i = 0; i < na->num_queues + 2; i++) { + knlist_destroy(&na->tx_rings[i].si.si_note); + knlist_destroy(&na->rx_rings[i].si.si_note); + } + bzero(na, sizeof(*na)); + ifp->if_pspare[0] = NULL; + free(na, M_DEVBUF); +} + + +/* + * intercept packets coming from the network stack and present + * them to netmap as incoming packets on a separate ring. + * We are not locked when called. 
+ */ +int +netmap_start(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + u_int i, len, n = na->num_queues; + int error = EBUSY; + struct netmap_kring *kring = &na->rx_rings[n]; + struct netmap_slot *slot; + + len = m->m_pkthdr.len; + if (netmap_verbose & NM_VERB_HOST) + D("%s packet %d len %d from the stack", ifp->if_xname, + kring->nr_hwcur + kring->nr_hwavail, len); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + if (kring->nr_hwavail >= (int)kring->nkr_num_slots - 1) { + D("stack ring %s full\n", ifp->if_xname); + goto done; /* no space */ + } + if (len > na->buff_size) { + D("drop packet size %d > %d", len, na->buff_size); + goto done; /* too long for us */ + } + + /* compute the insert position */ + i = kring->nr_hwcur + kring->nr_hwavail; + if (i >= kring->nkr_num_slots) + i -= kring->nkr_num_slots; + slot = &kring->ring->slot[i]; + m_copydata(m, 0, len, NMB(slot)); + slot->len = len; + kring->nr_hwavail++; + if (netmap_verbose & NM_VERB_HOST) + D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues); + selwakeuppri(&kring->si, PI_NET); + error = 0; +done: + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* release the mbuf in either cases of success or failure. As an + * alternative, put the mbuf in a free list and free the list + * only when really necessary. + */ + m_freem(m); + + return (error); +} + + +/* + * netmap_reset() is called by the driver routines when reinitializing + * a ring. The driver is in charge of locking to protect the kring. + * If netmap mode is not set just return NULL. + * Otherwise set NR_REINIT (in the ring and in na) to signal + * that a ring has been reinitialized, + * set cur = hwcur = 0 and avail = hwavail = num_slots - 1 . + * IT IS IMPORTANT to leave one slot free even in the tx ring because + * we rely on cur=hwcur only for empty rings. 
+ * These are good defaults but can be overridden later in the device + * specific code if, after a reinit, the ring does not start from 0 + * (e.g. if_em.c does this). + * + * XXX we shouldn't be touching the ring, but there is a + * race anyways and this is our best option. + * + * XXX setting na->flags makes the syscall code faster, as there is + * only one place to check. On the other hand, we will need a better + * way to notify multiple threads that rings have been reset. + * One way is to increment na->rst_count at each ring reset. + * Each thread in its own priv structure will keep a matching counter, + * and on a reset will acknowledge and clean its own rings. + */ +struct netmap_slot * +netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, + u_int new_cur) +{ + struct netmap_kring *kring; + struct netmap_ring *ring; + struct netmap_slot *slot; + u_int i; + + if (na == NULL) + return NULL; /* no netmap support here */ + if (!(na->ifp->if_capenable & IFCAP_NETMAP)) + return NULL; /* nothing to reinitialize */ + kring = tx == NR_TX ? na->tx_rings + n : na->rx_rings + n; + ring = kring->ring; + if (tx == NR_TX) { + /* + * The last argument is the new value of next_to_clean. + * + * In the TX ring, we have P pending transmissions (from + * next_to_clean to nr_hwcur) followed by nr_hwavail free slots. + * Generally we can use all the slots in the ring so + * P = ring_size - nr_hwavail hence (modulo ring_size): + * next_to_clean == nr_hwcur + nr_hwavail + * + * If, upon a reset, nr_hwavail == ring_size and next_to_clean + * does not change we have nothing to report. Otherwise some + * pending packets may be lost, or newly injected packets will. + */ + /* if hwcur does not change, nothing to report. 
+ * otherwise remember the change so perhaps we can + * shift the block at the next reinit + */ + if (new_cur == kring->nr_hwcur && + kring->nr_hwavail == kring->nkr_num_slots - 1) { + /* all ok */ + D("+++ NR_REINIT ok on %s TX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s TX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = kring->nkr_num_slots - 1; + ring->cur = kring->nr_hwcur = new_cur; + } else { + /* + * The last argument is the next free slot. + * In the RX ring we have nr_hwavail full buffers starting + * from nr_hwcur. + * If nr_hwavail == 0 and nr_hwcur does not change we are ok + * otherwise we might be in trouble as the buffers are + * changing. + */ + if (new_cur == kring->nr_hwcur && kring->nr_hwavail == 0) { + /* all ok */ + D("+++ NR_REINIT ok on %s RX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s RX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = 0; /* no data */ + ring->cur = kring->nr_hwcur = new_cur; + } + + slot = ring->slot; + /* + * Check that buffer indexes are correct. If we find a + * bogus value we are a bit in trouble because we cannot + * recover easily. Best we can do is (probably) persistently + * reset the ring. + */ + for (i = 0; i < kring->nkr_num_slots; i++) { + if (slot[i].buf_idx >= netmap_total_buffers) { + D("invalid buf_idx %d at slot %d", slot[i].buf_idx, i); + slot[i].buf_idx = 0; /* XXX reset */ + } + /* XXX we don't really need to set the length */ + slot[i].len = 0; + } + /* wakeup possible waiters, both on the ring and on the global + * selfd. Perhaps a bit early now but the device specific + * routine is locked so hopefully we won't have a race. 
+ */ + selwakeuppri(&kring->si, PI_NET); + selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET); + return kring->ring->slot; +} + +static void +ns_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, + __unused int nseg, __unused int error) +{ +} + +/* unload a bus_dmamap and create a new one. Used when the + * buffer in the slot is changed. + * XXX buflen is probably not needed, buffers have constant size. + */ +void +netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_unload(tag, map); + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +void +netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +/*------ netmap memory allocator -------*/ +/* + * Request for a chunk of memory. + * + * Memory objects are arranged into a list, hence we need to walk this + * list until we find an object with the needed amount of data free. + * This sounds like a completely inefficient implementation, but given + * the fact that data allocation is done once, we can handle it + * flawlessly. + * + * Return NULL on failure. 
+ */ +static void * +netmap_malloc(size_t size, __unused const char *msg) +{ + struct netmap_mem_obj *mem_obj, *new_mem_obj; + void *ret = NULL; + + NMA_LOCK(); + TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) { + if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size) + continue; + + new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP, + M_WAITOK | M_ZERO); + TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next); + + new_mem_obj->nmo_used = 1; + new_mem_obj->nmo_size = size; + new_mem_obj->nmo_data = mem_obj->nmo_data; + memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size); + + mem_obj->nmo_size -= size; + mem_obj->nmo_data = (char *) mem_obj->nmo_data + size; + if (mem_obj->nmo_size == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, + nmo_next); + free(mem_obj, M_NETMAP); + } + + ret = new_mem_obj->nmo_data; + + break; + } + NMA_UNLOCK(); + ND("%s: %d bytes at %p", msg, size, ret); + + return (ret); +} + +/* + * Return the memory to the allocator. + * + * While freeing a memory object, we try to merge adjacent chunks in + * order to reduce memory fragmentation. + */ +static void +netmap_free(void *addr, const char *msg) +{ + size_t size; + struct netmap_mem_obj *cur, *prev, *next; + + if (addr == NULL) { + D("NULL addr for %s", msg); + return; + } + + NMA_LOCK(); + TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) { + if (cur->nmo_data == addr && cur->nmo_used) + break; + } + if (cur == NULL) { + NMA_UNLOCK(); + D("invalid addr %s %p", msg, addr); + return; + } + + size = cur->nmo_size; + cur->nmo_used = 0; + + /* merge current chunk of memory with the previous one, + if present. 
*/ + prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next); + if (prev && prev->nmo_used == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next); + prev->nmo_size += cur->nmo_size; + free(cur, M_NETMAP); + cur = prev; + } + + /* merge with the next one */ + next = TAILQ_NEXT(cur, nmo_next); + if (next && next->nmo_used == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next); + cur->nmo_size += next->nmo_size; + free(next, M_NETMAP); + } + NMA_UNLOCK(); + ND("freed %s %d bytes at %p", msg, size, addr); +} + + +/* + * Initialize the memory allocator. + * + * Create the descriptor for the memory , allocate the pool of memory + * and initialize the list of memory objects with a single chunk + * containing the whole pre-allocated memory marked as free. + * + * Start with a large size, then halve as needed if we fail to + * allocate the block. While halving, always add one extra page + * because buffers 0 and 1 are used for special purposes. + * Return 0 on success, errno otherwise. + */ +static int +netmap_memory_init(void) +{ + struct netmap_mem_obj *mem_obj; + void *buf = NULL; + int i, n, sz = NETMAP_MEMORY_SIZE; + int extra_sz = 0; // space for rings and two spare buffers + + for (; !buf && sz >= 1<<20; sz >>=1) { + extra_sz = sz/200; + extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + buf = contigmalloc(sz + extra_sz, + M_NETMAP, + M_WAITOK | M_ZERO, + 0, /* low address */ + -1UL, /* high address */ + PAGE_SIZE, /* alignment */ + 0 /* boundary */ + ); + } + if (buf == NULL) + return (ENOMEM); + sz += extra_sz; + netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP, + M_WAITOK | M_ZERO); + mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL, + MTX_DEF); + TAILQ_INIT(&netmap_mem_d->nm_molist); + netmap_mem_d->nm_buffer = buf; + netmap_mem_d->nm_totalsize = sz; + + /* + * A buffer takes 2k, a slot takes 8 bytes + ring overhead, + * so the ratio is 200:1. 
In other words, we can use 1/200 of + * the memory for the rings, and the rest for the buffers, + * and be sure we never run out. + */ + netmap_mem_d->nm_size = sz/200; + netmap_mem_d->nm_buf_start = + (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start; + + nm_buf_pool.base = netmap_mem_d->nm_buffer; + nm_buf_pool.base += netmap_mem_d->nm_buf_start; + netmap_buffer_base = nm_buf_pool.base; + D("netmap_buffer_base %p (offset %d)", + netmap_buffer_base, netmap_mem_d->nm_buf_start); + /* number of buffers, they all start as free */ + + netmap_total_buffers = nm_buf_pool.total_buffers = + netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE; + nm_buf_pool.bufsize = NETMAP_BUF_SIZE; + + D("Have %d MB, use %dKB for rings, %d buffers at %p", + (sz >> 20), (netmap_mem_d->nm_size >> 10), + nm_buf_pool.total_buffers, nm_buf_pool.base); + + /* allocate and initialize the bitmap. Entry 0 is considered + * always busy (used as default when there are no buffers left). + */ + n = (nm_buf_pool.total_buffers + 31) / 32; + nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, + M_WAITOK | M_ZERO); + nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */ + for (i = 1; i < n; i++) + nm_buf_pool.bitmap[i] = ~0; + nm_buf_pool.free = nm_buf_pool.total_buffers - 2; + + mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP, + M_WAITOK | M_ZERO); + TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + mem_obj->nmo_used = 0; + mem_obj->nmo_size = netmap_mem_d->nm_size; + mem_obj->nmo_data = netmap_mem_d->nm_buffer; + + return (0); +} + + +/* + * Finalize the memory allocator. + * + * Free all the memory objects contained inside the list, and deallocate + * the pool of memory; finally free the memory allocator descriptor. 
+ */ +static void +netmap_memory_fini(void) +{ + struct netmap_mem_obj *mem_obj; + + while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) { + mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist); + TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + if (mem_obj->nmo_used == 1) { + printf("netmap: leaked %d bytes at %p\n", + mem_obj->nmo_size, + mem_obj->nmo_data); + } + free(mem_obj, M_NETMAP); + } + contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP); + // XXX mutex_destroy(nm_mtx); + free(netmap_mem_d, M_NETMAP); +} + + +/* + * Module loader. + * + * Create the /dev/netmap device and initialize all global + * variables. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_init(void) +{ + int error; + + + error = netmap_memory_init(); + if (error != 0) { + printf("netmap: unable to initialize the memory allocator."); + return (error); + } + printf("netmap: loaded module with %d Mbytes\n", + netmap_mem_d->nm_totalsize >> 20); + + netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, + "netmap"); + + return (0); +} + + +/* + * Module unloader. + * + * Free all the memory, and destroy the ``/dev/netmap`` device. + */ +static void +netmap_fini(void) +{ + destroy_dev(netmap_dev); + + netmap_memory_fini(); + + printf("netmap: unloaded module.\n"); +} + + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. 
+ */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h new file mode 100644 index 0000000000000..5434609c447b1 --- /dev/null +++ b/sys/dev/netmap/netmap_kern.h @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/

/*
 * $FreeBSD$
 * $Id: netmap_kern.h 9662 2011-11-16 13:18:06Z luigi $
 *
 * The header contains the definitions of constants and function
 * prototypes used only in kernelspace.
 */

#ifndef _NET_NETMAP_KERN_H_
#define _NET_NETMAP_KERN_H_

#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_NETMAP);
#endif

/*
 * Debugging helpers.
 * ND() expands to nothing: it is used to keep a disabled message
 * in the source.
 * D() prints a timestamp (seconds modulo 1000 and microseconds),
 * the calling function, the source line and the formatted message,
 * followed by a newline.
 */
#define ND(format, ...)
#define D(format, ...) \
	do { \
		struct timeval __xxts; \
		microtime(&__xxts); \
		printf("%03d.%06d %s [%d] " format "\n",\
		(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
		__FUNCTION__, __LINE__, ##__VA_ARGS__); \
	} while (0)

struct netmap_adapter;

/*
 * private, kernel view of a ring.
 *
 * nr_hwcur and nr_hwavail are the kernel copies of the cur and avail
 * fields of the shared ring: the sync routines bring the two views
 * in line, and netmap_ring_reinit() restores cur/avail from them.
 * As described in netmap_reset(): in an RX ring there are nr_hwavail
 * full buffers starting from nr_hwcur; in a TX ring nr_hwcur is
 * followed by nr_hwavail free slots.
 *
 * XXX 20110627-todo
 * The index in the NIC and netmap ring is offset by nkr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not
 * modified by the kernel. In particular:
 * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides
 * with the next empty buffer as known by the hardware (next_to_check
 * or so).
 * TX rings: hwcur + hwofs coincides with next_to_send
 */
struct netmap_kring {
	struct netmap_ring *ring;	/* the ring shared with userspace */
	u_int nr_hwcur;
	int nr_hwavail;
	u_int nr_kflags;
	u_int nkr_num_slots;	/* number of slots in the ring */

	u_int	nkr_hwofs;	/* offset between NIC and netmap ring */
	struct netmap_adapter *na;	 // debugging
	struct selinfo si; /* poll/select wait queue */
};

/*
 * This struct is part of and extends the 'struct adapter' (or
 * equivalent) device descriptor. It contains all fields needed to
 * support netmap operation.
 */
struct netmap_adapter {
	int refcount; /* number of user-space descriptors using this
			 interface, which is equal to the number of
			 struct netmap_if objs in the mapped region. */

	int separate_locks; /* set if the interface supports different
			       locks for rx, tx and core. */

	u_int num_queues; /* number of tx/rx queue pairs: this is
			   a duplicate field needed to simplify the
			   signature of ``netmap_detach``. */

	u_int num_tx_desc; /* number of descriptor in each queue */
	u_int num_rx_desc;
	u_int buff_size;	/* netmap_start() drops longer packets */

	u_int	flags;	/* NR_REINIT */
	/* tx_rings and rx_rings are private but allocated
	 * as a contiguous chunk of memory. Each array has
	 * N+2 entries (see netmap_attach()): the N adapter queues,
	 * the host queue, and one more entry used only for the
	 * selinfo that covers all queues.
	 */
	struct netmap_kring *tx_rings; /* array of TX rings. */
	struct netmap_kring *rx_rings; /* array of RX rings. */

	/* copy of if_qflush and if_transmit pointers, to intercept
	 * packets from the network stack when netmap is active.
	 * XXX probably if_qflush is not necessary.
	 */
	void    (*if_qflush)(struct ifnet *);
	int     (*if_transmit)(struct ifnet *, struct mbuf *);

	/* references to the ifnet and device routines, used by
	 * the generic netmap functions.
	 */
	struct ifnet *ifp; /* adapter is ifp->if_softc */

	/* switch netmap mode on (onoff != 0) or off; 0 on success */
	int (*nm_register)(struct ifnet *, int onoff);
	/* 'what' is one of the NETMAP_*_LOCK / NETMAP_*_UNLOCK values */
	void (*nm_lock)(void *, int what, u_int ringid);
	/* sync one ring with the hardware; 'lock' tells the routine to
	 * take the lock itself. netmap_poll() reports a non-zero
	 * return value as POLLERR.
	 */
	int (*nm_txsync)(void *, u_int ring, int lock);
	int (*nm_rxsync)(void *, u_int ring, int lock);
};

/*
 * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP)
 * and refcount gives the status of the interface, namely:
 *
 *	enable	refcount	Status
 *
 *	FALSE	0		normal operation
 *	FALSE	!= 0		-- (impossible)
 *	TRUE	1		netmap mode
 *	TRUE	0		being deleted.
 */

#define NETMAP_DELETING(_na)  (  ((_na)->refcount == 0) &&	\
	( (_na)->ifp->if_capenable & IFCAP_NETMAP) )

/*
 * parameters for (*nm_lock)(adapter, what, index)
 */
enum {
	NETMAP_NO_LOCK = 0,
	NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK,
	NETMAP_TX_LOCK, NETMAP_TX_UNLOCK,
	NETMAP_RX_LOCK, NETMAP_RX_UNLOCK,
};

/*
 * The following are support routines used by individual drivers to
 * support netmap operation.
 *
 * netmap_attach() initializes a struct netmap_adapter, allocating the
 * struct netmap_ring's and the struct selinfo.
 *
 * netmap_detach() frees the memory allocated by netmap_attach().
+ *
+ * netmap_start() replaces the if_transmit routine of the interface,
+ *	and is used to intercept packets coming from the stack.
+ *
+ * netmap_load_map/netmap_reload_map are helper routines to set/reset
+ *	the dmamap for a packet buffer
+ *
+ * netmap_reset() is a helper routine to be called in the driver
+ *	when reinitializing a ring.
+ */
+int netmap_attach(struct netmap_adapter *, int);
+void netmap_detach(struct ifnet *);
+int netmap_start(struct ifnet *, struct mbuf *);
+enum txrx { NR_RX = 0, NR_TX = 1 };
+struct netmap_slot *netmap_reset(struct netmap_adapter *na,
+	enum txrx tx, int n, u_int new_cur);
+void netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map,
+	void *buf, bus_size_t buflen);
+void netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map,
+	void *buf, bus_size_t buflen);
+int netmap_ring_reinit(struct netmap_kring *);
+
+/*
+ * XXX eventually, get rid of netmap_total_buffers and netmap_buffer_base
+ * in favour of the (still commented out) struct netmap_buf_pool below
+ */
+// struct netmap_buf_pool;
+// extern struct netmap_buf_pool nm_buf_pool;
+extern u_int netmap_total_buffers;
+extern char *netmap_buffer_base;
+extern int netmap_verbose;	// XXX debugging
+enum {					/* bits for the verbose flags */
+	NM_VERB_ON = 1,			/* generic verbose */
+	NM_VERB_HOST = 0x2,		/* verbose host stack */
+	NM_VERB_RXSYNC = 0x10,		/* verbose on rxsync/txsync */
+	NM_VERB_TXSYNC = 0x20,
+	NM_VERB_RXINTR = 0x100,		/* verbose on rx/tx intr (driver) */
+	NM_VERB_TXINTR = 0x200,
+	NM_VERB_NIC_RXSYNC = 0x1000,	/* verbose on NIC rxsync/txsync, going by the names */
+	NM_VERB_NIC_TXSYNC = 0x2000,
+};
+
+/*
+ * fetch the struct netmap_adapter pointer kept in if_pspare[0] of the ifp
+ */
+#define NA(_ifp) ((struct netmap_adapter *)(_ifp)->if_pspare[0])
+
+/*
+ * NMB() gives the address of the buffer selected by the buf_idx of a slot.
+ * XXX buffers of 2k are special-cased, the offset is computed with a shift.
+ * A buf_idx out of range yields netmap_buffer_base, detected as a bad pointer.
+ */
+static inline char *
+NMB(struct netmap_slot *slot)
+{
+	uint32_t idx = slot->buf_idx;
+	if (idx >= netmap_total_buffers)	/* invalid index */
+		return netmap_buffer_base;
+#if NETMAP_BUF_SIZE == 2048
+	return netmap_buffer_base + (idx << 11);	/* idx * 2048 */
+#else
+	return netmap_buffer_base + (idx *NETMAP_BUF_SIZE);
+#endif
+}
+
+#endif /* _NET_NETMAP_KERN_H_ */
|
