From 37e3a6d349581b4dd0aebf24be7b1b159a698dcf Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Sun, 16 Oct 2016 14:13:32 +0000 Subject: Import the current version of netmap, aligned with the one on github. This commit, long overdue, contains contributions in the last 2 years from Stefano Garzarella, Giuseppe Lettieri, Vincenzo Maffione, including: + fixes on monitor ports + the 'ptnet' virtual device driver, and ptnetmap backend, for high speed virtual passthrough on VMs (bhyve fixes in an upcoming commit) + improved emulated netmap mode + more robust error handling + removal of stale code + various fixes to code and documentation (some mixup between RX and TX parameters, and private and public variables) We also include an additional tool, nmreplay, which is functionally equivalent to tcpreplay but operating on netmap ports. --- sys/dev/netmap/netmap_freebsd.c | 762 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 701 insertions(+), 61 deletions(-) (limited to 'sys/dev/netmap/netmap_freebsd.c') diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8490ae85670b..20ea5c8f2972 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -33,8 +33,9 @@ #include /* defines used in kernel.h */ #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ -#include /* DEV_MODULE */ +#include /* DEV_MODULE_ORDERED */ #include +#include /* kern_ioctl() */ #include @@ -50,6 +51,11 @@ #include #include /* sockaddrs */ #include +#include /* kthread_add() */ +#include /* PROC_LOCK() */ +#include /* RFNOWAIT */ +#include /* sched_bind() */ +#include /* mp_maxid */ #include #include #include /* IFT_ETHER */ @@ -61,13 +67,94 @@ #include #include +#include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ +void nm_os_selinfo_init(NM_SELINFO_T *si) { + struct mtx *m = &si->m; + mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); + knlist_init_mtx(&si->si.si_note, m); +} + 
+void +nm_os_selinfo_uninit(NM_SELINFO_T *si) +{ + /* XXX kqueue(9) needed; these will mirror knlist_init. */ + knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); + knlist_destroy(&si->si.si_note); + /* now we don't need the mutex anymore */ + mtx_destroy(&si->m); +} + +void +nm_os_ifnet_lock(void) +{ + IFNET_WLOCK(); +} + +void +nm_os_ifnet_unlock(void) +{ + IFNET_WUNLOCK(); +} + +static int netmap_use_count = 0; + +void +nm_os_get_module(void) +{ + netmap_use_count++; +} + +void +nm_os_put_module(void) +{ + netmap_use_count--; +} + +static void +netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_undo_zombie(ifp); +} + +static void +netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) +{ + netmap_make_zombie(ifp); +} + +static eventhandler_tag nm_ifnet_ah_tag; +static eventhandler_tag nm_ifnet_dh_tag; + +int +nm_os_ifnet_init(void) +{ + nm_ifnet_ah_tag = + EVENTHANDLER_REGISTER(ifnet_arrival_event, + netmap_ifnet_arrival_handler, + NULL, EVENTHANDLER_PRI_ANY); + nm_ifnet_dh_tag = + EVENTHANDLER_REGISTER(ifnet_departure_event, + netmap_ifnet_departure_handler, + NULL, EVENTHANDLER_PRI_ANY); + return 0; +} + +void +nm_os_ifnet_fini(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + nm_ifnet_ah_tag); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + nm_ifnet_dh_tag); +} + rawsum_t -nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; @@ -87,7 +174,7 @@ nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) * return value is in network byte order. */ uint16_t -nm_csum_fold(rawsum_t cur_sum) +nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. 
*/ while (cur_sum >> 16) @@ -96,17 +183,17 @@ nm_csum_fold(rawsum_t cur_sum) return htobe16((~cur_sum) & 0xFFFF); } -uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else - return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); + return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void -nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, +nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET @@ -118,7 +205,7 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -129,12 +216,12 @@ nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, } void -nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, +nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); - *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); + *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { @@ -144,13 +231,41 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, #endif } +/* on FreeBSD we send up one packet at a time */ +void * +nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) +{ + + NA(ifp)->if_input(ifp, m); + return NULL; +} + +int +nm_os_mbuf_has_offld(struct mbuf *m) +{ + return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | + CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | + CSUM_SCTP_IPV6 | CSUM_TSO); +} + +static void +freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_generic_adapter *gna = + 
(struct netmap_generic_adapter *)NA(ifp); + int stolen = generic_rx_handler(ifp, m); + + if (!stolen) { + gna->save_if_input(ifp, m); + } +} /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int -netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) +nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; @@ -161,7 +276,7 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; - ifp->if_input = generic_rx_handler; + ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); @@ -181,18 +296,20 @@ netmap_catch_rx(struct netmap_generic_adapter *gna, int intercept) * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ -void -netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +int +nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); - if (enable) { + if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } + + return 0; } @@ -213,40 +330,44 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) * */ int -generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, - void *addr, u_int len, u_int ring_nr) +nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; + u_int len = a->len; + struct ifnet *ifp = a->ifp; + struct mbuf *m = a->m; +#if __FreeBSD_version < 1100000 /* - * The mbuf should be a cluster from our special pool, - * so we do not need to do an m_copyback but just copy - * (and eventually, just reference the netmap buffer) + * Old FreeBSD versions. 
The mbuf has a cluster attached, + * we need to copy from the cluster to the netmap buffer. */ - - if (GET_MBUF_REFCNT(m) != 1) { - D("invalid refcnt %d for %p", - GET_MBUF_REFCNT(m), m); + if (MBUF_REFCNT(m) != 1) { + D("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } - // XXX the ext_size check is unnecessary if we link the netmap buf if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } - if (0) { /* XXX seems to have negligible benefits */ - m->m_ext.ext_buf = m->m_data = addr; - } else { - bcopy(addr, m->m_data, len); - } + bcopy(a->addr, m->m_data, len); +#else /* __FreeBSD_version >= 1100000 */ + /* New FreeBSD versions. Link the external storage to + * the netmap buffer, so that no copy is necessary. */ + m->m_ext.ext_buf = m->m_data = a->addr; + m->m_ext.ext_size = len; +#endif /* __FreeBSD_version >= 1100000 */ + m->m_len = m->m_pkthdr.len = len; - // inc refcount. All ours, we could skip the atomic - atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1); + + /* mbuf refcnt is not contended, no need to use atomic + * (a memory barrier is enough). */ + SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); - m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); - return ret; + return ret ? 
-1 : 0; } @@ -263,7 +384,7 @@ netmap_getna(if_t ifp) * way to extract the info from the ifp */ int -generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; @@ -271,16 +392,23 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) void -generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } +void +nm_os_generic_set_features(struct netmap_generic_adapter *gna) +{ + + gna->rxsg = 1; /* Supported through m_copydata. */ + gna->txqdisc = 0; /* Not supported. */ +} void -netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) +nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; @@ -290,21 +418,21 @@ netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapte void -netmap_mitigation_start(struct nm_generic_mit *mit) +nm_os_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void -netmap_mitigation_restart(struct nm_generic_mit *mit) +nm_os_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int -netmap_mitigation_active(struct nm_generic_mit *mit) +nm_os_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; @@ -312,7 +440,7 @@ netmap_mitigation_active(struct nm_generic_mit *mit) void -netmap_mitigation_cleanup(struct nm_generic_mit *mit) +nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -342,7 +470,7 @@ static struct { } nm_vi_indices; void -nm_vi_init_index(void) +nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) @@ -398,7 +526,7 @@ nm_vi_free_index(uint8_t val) * increment this refcount on if_attach(). 
*/ int -nm_vi_persist(const char *name, struct ifnet **ret) +nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; @@ -438,15 +566,220 @@ nm_vi_persist(const char *name, struct ifnet **ret) *ret = ifp; return 0; } + /* unregister from the system and drop the final refcount */ void -nm_vi_detach(struct ifnet *ifp) +nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } +/* ======================== PTNETMAP SUPPORT ========================== */ + +#ifdef WITH_PTNETMAP_GUEST +#include +#include +#include /* bus_dmamap_* */ +#include +#include +#include +/* + * ptnetmap memory device (memdev) for freebsd guest, + * used to expose host netmap memory to the guest through a PCI BAR. + */ + +/* + * ptnetmap memdev private data structure + */ +struct ptnetmap_memdev { + device_t dev; + struct resource *pci_io; + struct resource *pci_mem; + struct netmap_mem_d *nm_mem; +}; + +static int ptn_memdev_probe(device_t); +static int ptn_memdev_attach(device_t); +static int ptn_memdev_detach(device_t); +static int ptn_memdev_shutdown(device_t); + +static device_method_t ptn_memdev_methods[] = { + DEVMETHOD(device_probe, ptn_memdev_probe), + DEVMETHOD(device_attach, ptn_memdev_attach), + DEVMETHOD(device_detach, ptn_memdev_detach), + DEVMETHOD(device_shutdown, ptn_memdev_shutdown), + DEVMETHOD_END +}; + +static driver_t ptn_memdev_driver = { + PTNETMAP_MEMDEV_NAME, + ptn_memdev_methods, + sizeof(struct ptnetmap_memdev), +}; + +/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation + * below. */ +static devclass_t ptnetmap_devclass; +DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass, + NULL, NULL, SI_ORDER_MIDDLE + 1); + +/* + * I/O port read/write wrappers. 
+ * Some are not used, so we keep them commented out until needed + */ +#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg)) +#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg)) +#if 0 +#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg)) +#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val)) +#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val)) +#endif /* unused */ + +/* + * Map host netmap memory through PCI-BAR in the guest OS, + * returning physical (nm_paddr) and virtual (nm_addr) addresses + * of the netmap memory mapped in the guest. + */ +int +nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr) +{ + uint32_t mem_size; + int rid; + + D("ptn_memdev_driver iomap"); + + rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); + mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE); + + /* map memory allocator */ + ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, + &rid, 0, ~0, mem_size, RF_ACTIVE); + if (ptn_dev->pci_mem == NULL) { + *nm_paddr = 0; + *nm_addr = 0; + return ENOMEM; + } + + *nm_paddr = rman_get_start(ptn_dev->pci_mem); + *nm_addr = rman_get_virtual(ptn_dev->pci_mem); + + D("=== BAR %d start %lx len %lx mem_size %x ===", + PTNETMAP_MEM_PCI_BAR, + *nm_paddr, + rman_get_size(ptn_dev->pci_mem), + mem_size); + return (0); +} + +/* Unmap host netmap memory. 
*/ +void +nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) +{ + D("ptn_memdev_driver iounmap"); + + if (ptn_dev->pci_mem) { + bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } +} + +/* Device identification routine, return BUS_PROBE_DEFAULT on success, + * positive on failure */ +static int +ptn_memdev_probe(device_t dev) +{ + char desc[256]; + + if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) + return (ENXIO); + if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) + return (ENXIO); + + snprintf(desc, sizeof(desc), "%s PCI adapter", + PTNETMAP_MEMDEV_NAME); + device_set_desc_copy(dev, desc); + + return (BUS_PROBE_DEFAULT); +} + +/* Device initialization routine. */ +static int +ptn_memdev_attach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + int rid; + uint16_t mem_id; + + D("ptn_memdev_driver attach"); + + ptn_dev = device_get_softc(dev); + ptn_dev->dev = dev; + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); + ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, + RF_ACTIVE); + if (ptn_dev->pci_io == NULL) { + device_printf(dev, "cannot map I/O space\n"); + return (ENXIO); + } + + mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID); + + /* create guest allocator */ + ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); + if (ptn_dev->nm_mem == NULL) { + ptn_memdev_detach(dev); + return (ENOMEM); + } + netmap_mem_get(ptn_dev->nm_mem); + + D("ptn_memdev_driver probe OK - host_id: %d", mem_id); + + return (0); +} + +/* Device removal routine. 
*/ +static int +ptn_memdev_detach(device_t dev) +{ + struct ptnetmap_memdev *ptn_dev; + + D("ptn_memdev_driver detach"); + ptn_dev = device_get_softc(dev); + + if (ptn_dev->nm_mem) { + netmap_mem_put(ptn_dev->nm_mem); + ptn_dev->nm_mem = NULL; + } + if (ptn_dev->pci_mem) { + bus_release_resource(dev, SYS_RES_MEMORY, + PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); + ptn_dev->pci_mem = NULL; + } + if (ptn_dev->pci_io) { + bus_release_resource(dev, SYS_RES_IOPORT, + PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); + ptn_dev->pci_io = NULL; + } + + return (0); +} + +static int +ptn_memdev_shutdown(device_t dev) +{ + D("ptn_memdev_driver shutdown"); + return bus_generic_shutdown(dev); +} + +#endif /* WITH_PTNETMAP_GUEST */ + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -606,7 +939,7 @@ err_unlock: * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor - * when the last fd pointing to the device is closed. + * when the last fd pointing to the device is closed. 
* * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to @@ -634,26 +967,275 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) (void)devtype; (void)td; - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - priv->np_refs = 1; + NMG_LOCK(); + priv = netmap_priv_new(); + if (priv == NULL) { + error = ENOMEM; + goto out; + } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { - free(priv, M_DEVBUF); - } else { - NMG_LOCK(); - netmap_use_count++; - NMG_UNLOCK(); + netmap_priv_delete(priv); + } +out: + NMG_UNLOCK(); + return error; +} + +/******************** kthread wrapper ****************/ +#include +u_int +nm_os_ncpus(void) +{ + return mp_maxid + 1; +} + +struct nm_kthread_ctx { + struct thread *user_td; /* thread user-space (kthread creator) to send ioctl */ + /* notification to guest (interrupt) */ + int irq_fd; /* ioctl fd */ + struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */ + + /* notification from guest */ + void *ioevent_file; /* tsleep() argument */ + + /* worker function and parameter */ + nm_kthread_worker_fn_t worker_fn; + void *worker_private; + + struct nm_kthread *nmk; + + /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ + long type; +}; + +struct nm_kthread { + struct thread *worker; + struct mtx worker_lock; + uint64_t scheduled; /* pending wake_up request */ + struct nm_kthread_ctx worker_ctx; + int run; /* used to stop kthread */ + int attach_user; /* kthread attached to user_process */ + int affinity; +}; + +void inline +nm_os_kthread_wakeup_worker(struct nm_kthread *nmk) +{ + /* + * There may be a race between FE and BE, + * which call both this function, and worker kthread, + * that reads nmk->scheduled. + * + * For us it is not important the counter value, + * but simply that it has changed since the last + * time the kthread saw it. 
+ */ + mtx_lock(&nmk->worker_lock); + nmk->scheduled++; + if (nmk->worker_ctx.ioevent_file) { + wakeup(nmk->worker_ctx.ioevent_file); + } + mtx_unlock(&nmk->worker_lock); +} + +void inline +nm_os_kthread_send_irq(struct nm_kthread *nmk) +{ + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + int err; + + if (ctx->user_td && ctx->irq_fd > 0) { + err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix); + if (err) { + D("kern_ioctl error: %d ioctl parameters: fd %d com %lu data %p", + err, ctx->irq_fd, ctx->irq_ioctl.com, &ctx->irq_ioctl.data); + } + } +} + +static void +nm_kthread_worker(void *data) +{ + struct nm_kthread *nmk = data; + struct nm_kthread_ctx *ctx = &nmk->worker_ctx; + uint64_t old_scheduled = nmk->scheduled; + + if (nmk->affinity >= 0) { + thread_lock(curthread); + sched_bind(curthread, nmk->affinity); + thread_unlock(curthread); + } + + while (nmk->run) { + /* + * check if the parent process dies + * (when kthread is attached to user process) + */ + if (ctx->user_td) { + PROC_LOCK(curproc); + thread_suspend_check(0); + PROC_UNLOCK(curproc); + } else { + kthread_suspend_check(); + } + + /* + * if ioevent_file is not defined, we don't have notification + * mechanism and we continually execute worker_fn() + */ + if (!ctx->ioevent_file) { + ctx->worker_fn(ctx->worker_private); /* worker body */ + } else { + /* checks if there is a pending notification */ + mtx_lock(&nmk->worker_lock); + if (likely(nmk->scheduled != old_scheduled)) { + old_scheduled = nmk->scheduled; + mtx_unlock(&nmk->worker_lock); + + ctx->worker_fn(ctx->worker_private); /* worker body */ + + continue; + } else if (nmk->run) { + /* wait on event with one second timeout */ + msleep_spin(ctx->ioevent_file, &nmk->worker_lock, + "nmk_ev", hz); + nmk->scheduled++; + } + mtx_unlock(&nmk->worker_lock); + } + } + + kthread_exit(); +} + +static int +nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg) +{ + /* send irq through ioctl 
to bhyve (vmm.ko) */ + if (cfg->event.irqfd) { + nmk->worker_ctx.irq_fd = cfg->event.irqfd; + nmk->worker_ctx.irq_ioctl = cfg->event.ioctl; + } + /* ring.ioeventfd contains the chan where do tsleep to wait events */ + if (cfg->event.ioeventfd) { + nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd; + } + + return 0; +} + +static void +nm_kthread_close_files(struct nm_kthread *nmk) +{ + nmk->worker_ctx.irq_fd = 0; + nmk->worker_ctx.ioevent_file = NULL; +} + +void +nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity) +{ + nmk->affinity = affinity; +} + +struct nm_kthread * +nm_os_kthread_create(struct nm_kthread_cfg *cfg) +{ + struct nm_kthread *nmk = NULL; + int error; + + nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!nmk) + return NULL; + + mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN); + nmk->worker_ctx.worker_fn = cfg->worker_fn; + nmk->worker_ctx.worker_private = cfg->worker_private; + nmk->worker_ctx.type = cfg->type; + nmk->affinity = -1; + + /* attach kthread to user process (ptnetmap) */ + nmk->attach_user = cfg->attach_user; + + /* open event fd */ + error = nm_kthread_open_files(nmk, cfg); + if (error) + goto err; + + return nmk; +err: + free(nmk, M_DEVBUF); + return NULL; +} + +int +nm_os_kthread_start(struct nm_kthread *nmk) +{ + struct proc *p = NULL; + int error = 0; + + if (nmk->worker) { + return EBUSY; + } + + /* check if we want to attach kthread to user process */ + if (nmk->attach_user) { + nmk->worker_ctx.user_td = curthread; + p = curthread->td_proc; + } + + /* enable kthread main loop */ + nmk->run = 1; + /* create kthread */ + if((error = kthread_add(nm_kthread_worker, nmk, p, + &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", + nmk->worker_ctx.type))) { + goto err; } + + D("nm_kthread started td 0x%p", nmk->worker); + + return 0; +err: + D("nm_kthread start failed err %d", error); + nmk->worker = NULL; return error; } +void +nm_os_kthread_stop(struct nm_kthread *nmk) +{ 
+ if (!nmk->worker) { + return; + } + /* tell to kthread to exit from main loop */ + nmk->run = 0; + + /* wake up kthread if it sleeps */ + kthread_resume(nmk->worker); + nm_os_kthread_wakeup_worker(nmk); + + nmk->worker = NULL; +} + +void +nm_os_kthread_delete(struct nm_kthread *nmk) +{ + if (!nmk) + return; + if (nmk->worker) { + nm_os_kthread_stop(nmk); + } + + nm_kthread_close_files(nmk); + + free(nmk, M_DEVBUF); +} + /******************** kqueue support ****************/ /* - * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might @@ -672,17 +1254,23 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) void -freebsd_selwakeup(struct nm_selinfo *si, int pri) +nm_os_selwakeup(struct nm_selinfo *si) { if (netmap_verbose) D("on knote %p", &si->si.si_note); - selwakeuppri(&si->si, pri); + selwakeuppri(&si->si, PI_NET); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } +void +nm_os_selrecord(struct thread *td, struct nm_selinfo *si) +{ + selrecord(td, &si->si); +} + static void netmap_knrdetach(struct knote *kn) { @@ -728,7 +1316,7 @@ netmap_knrw(struct knote *kn, long hint, int events) RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { - revents = netmap_poll((void *)priv, events, curthread); + revents = netmap_poll(priv, events, NULL); return (events & revents) ? 
1 : 0; } } @@ -801,13 +1389,47 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn) return 0; } +static int +freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) +{ + struct netmap_priv_d *priv; + if (devfs_get_cdevpriv((void **)&priv)) { + return POLLERR; + } + return netmap_poll(priv, events, td); +} + +static int +freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, + int ffla __unused, struct thread *td) +{ + int error; + struct netmap_priv_d *priv; + + CURVNET_SET(TD_TO_VNET(rd)); + error = devfs_get_cdevpriv((void **)&priv); + if (error) { + /* XXX ENOENT should be impossible, since the priv + * is now created in the open */ + if (error == ENOENT) + error = ENXIO; + goto out; + } + error = netmap_ioctl(priv, cmd, data, td); +out: + CURVNET_RESTORE(); + + return error; +} + +extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, + .d_ioctl = freebsd_netmap_ioctl, + .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; @@ -852,6 +1474,24 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg) return (error); } - +#ifdef DEV_MODULE_ORDERED +/* + * The netmap module contains three drivers: (i) the netmap character device + * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI + * device driver. The attach() routines of both (ii) and (iii) need the + * lock of the global allocator, and such lock is initialized in netmap_init(), + * which is part of (i). + * Therefore, we make sure that (i) is loaded before (ii) and (iii), using + * the 'order' parameter of driver declaration macros. For (i), we specify + * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED + * macros for (ii) and (iii). 
+ */ +DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); +#else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); +#endif /* DEV_MODULE_ORDERED */ +MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +/* use a private mutex for the knlist */ -- cgit v1.3