From ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 5 Apr 2023 12:12:30 -0400 Subject: netmap: Fix queue stalls with generic interfaces In emulated mode, the FreeBSD netmap port attempts to perform zero-copy transmission. This works as follows: the kernel ring is populated with mbuf headers to which netmap buffers are attached. When transmitting, the mbuf refcount is initialized to 2, and when the counter value has been decremented to 1 netmap infers that the driver has freed the mbuf and thus transmission is complete. This scheme does not generalize to the situation where netmap is attaching to a software interface which may transmit packets among multiple "queues", as is the case with bridge or lagg interfaces. In that case, we would be relying on backing hardware drivers to free transmitted mbufs promptly, but this isn't guaranteed; a driver may reasonably defer freeing a small number of transmitted buffers indefinitely. If such a buffer ends up at the tail of a netmap transmit ring, further transmits can end up blocked indefinitely. Fix the problem by removing the zero-copy scheme (which is also not implemented in the Linux port of netmap). Instead, the kernel ring is populated with regular mbuf clusters into which netmap buffers are copied by nm_os_generic_xmit_frame(). The refcounting scheme is preserved, and this lets us avoid allocating a fresh cluster per transmitted packet in the common case. If the transmit ring is full, a callout is used to free the "stuck" mbuf, avoiding the queue deadlock described above. Furthermore, when recycling mbuf clusters, be sure to fully reinitialize the mbuf header instead of simply re-setting M_PKTHDR. Some software interfaces, like if_vlan, may set fields in the header which should be reset before the mbuf is reused. Reviewed by: vmaffione MFC after: 1 month Sponsored by: Zenarmor Sponsored by: OPNsense Sponsored by: Klara, Inc. Differential Revision: https://reviews.freebsd.org/D38065 --- sys/dev/netmap/netmap_kern.h | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'sys/dev/netmap/netmap_kern.h') diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 7c68c79c61ef..04b9c199e1dc 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -501,6 +501,9 @@ struct netmap_kring { struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ +#ifdef __FreeBSD__ + struct callout tx_event_callout; +#endif struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ @@ -2381,39 +2384,59 @@ ptnet_sync_tail(struct nm_csb_ktoa *ktoa, struct netmap_kring *kring) * * We allocate mbufs with m_gethdr(), since the mbuf header is needed * by the driver. We also attach a customly-provided external storage, - * which in this case is a netmap buffer. When calling m_extadd(), however - * we pass a NULL address, since the real address (and length) will be - * filled in by nm_os_generic_xmit_frame() right before calling - * if_transmit(). + * which in this case is a netmap buffer. * * The dtor function does nothing, however we need it since mb_free_ext() * has a KASSERT(), checking that the mbuf dtor function is not NULL. */ -static void void_mbuf_dtor(struct mbuf *m) { } +static inline void +nm_generic_mbuf_dtor(struct mbuf *m) +{ + uma_zfree(zone_clust, m->m_ext.ext_buf); +} #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (fn != NULL) ? \ - (void *)fn : (void *)void_mbuf_dtor; \ + (void *)fn : (void *)nm_generic_mbuf_dtor; \ } while (0) static inline struct mbuf * -nm_os_get_mbuf(if_t ifp, int len) +nm_os_get_mbuf(if_t ifp __unused, int len) { struct mbuf *m; + void *buf; - (void)ifp; - (void)len; + KASSERT(len <= MCLBYTES, ("%s: len %d", __func__, len)); m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) { - return m; + if (__predict_false(m == NULL)) + return (NULL); + buf = uma_zalloc(zone_clust, M_NOWAIT); + if (__predict_false(buf == NULL)) { + m_free(m); + return (NULL); } + m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0, + EXT_NET_DRV); + return (m); +} - m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, - NULL, NULL, 0, EXT_NET_DRV); - - return m; +static inline void +nm_os_mbuf_reinit(struct mbuf *m) +{ + void *buf; + + KASSERT((m->m_flags & M_EXT) != 0, + ("%s: mbuf %p has no external storage", __func__, m)); + KASSERT(m->m_ext.ext_size == MCLBYTES, + ("%s: mbuf %p has wrong external storage size %u", __func__, m, + m->m_ext.ext_size)); + + buf = m->m_ext.ext_buf; + m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); + m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0, + EXT_NET_DRV); } #endif /* __FreeBSD__ */ -- cgit v1.3