Diffstat (limited to 'sys/dev/gve/gve_tx.c')
-rw-r--r--  sys/dev/gve/gve_tx.c  269
1 file changed, 211 insertions(+), 58 deletions(-)
diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c
index 1e62e1226be1..84e3a4c4eb9f 100644
--- a/sys/dev/gve/gve_tx.c
+++ b/sys/dev/gve/gve_tx.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
*/
#include "gve.h"
#include "gve_adminq.h"
+#include "gve_dqo.h"
#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182
@@ -48,61 +49,112 @@ gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
}
static void
-gve_tx_free_ring(struct gve_priv *priv, int i)
+gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- /* Safe to call even if never alloced */
- gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
-
- if (tx->br != NULL) {
- buf_ring_free(tx->br, M_DEVBUF);
- tx->br = NULL;
+ if (tx->desc_ring != NULL) {
+ gve_dma_free_coherent(&tx->desc_ring_mem);
+ tx->desc_ring = NULL;
}
- if (mtx_initialized(&tx->ring_mtx))
- mtx_destroy(&tx->ring_mtx);
-
if (tx->info != NULL) {
free(tx->info, M_GVE);
tx->info = NULL;
}
- if (tx->desc_ring != NULL) {
- gve_dma_free_coherent(&tx->desc_ring_mem);
- tx->desc_ring = NULL;
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
}
+}
+
+static void
+gve_tx_free_ring(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+
+ /* Safe to call even if never alloced */
+ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
+
+ if (mtx_initialized(&tx->ring_mtx))
+ mtx_destroy(&tx->ring_mtx);
if (com->q_resources != NULL) {
gve_dma_free_coherent(&com->q_resources_mem);
com->q_resources = NULL;
}
+
+ if (tx->br != NULL) {
+ buf_ring_free(tx->br, M_DEVBUF);
+ tx->br = NULL;
+ }
+
+ if (gve_is_gqi(priv))
+ gve_tx_free_ring_gqi(priv, i);
+ else
+ gve_tx_free_ring_dqo(priv, i);
}
static int
-gve_tx_alloc_ring(struct gve_priv *priv, int i)
+gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- char mtx_name[16];
int err;
- com->priv = priv;
- com->id = i;
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->desc_ring = tx->desc_ring_mem.cpu_addr;
- com->qpl = &priv->qpls[i];
+ com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
+ /*single_kva=*/true);
if (com->qpl == NULL) {
- device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
- return (ENOMEM);
+ device_printf(priv->dev,
+ "Failed to alloc QPL for tx ring %d\n", i);
+ err = ENOMEM;
+ goto abort;
}
err = gve_tx_fifo_init(priv, tx);
if (err != 0)
goto abort;
- tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
+ tx->info = malloc(
+ sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
M_GVE, M_WAITOK | M_ZERO);
+ return (0);
+
+abort:
+ gve_tx_free_ring_gqi(priv, i);
+ return (err);
+}
+
+static int
+gve_tx_alloc_ring(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+ char mtx_name[16];
+ int err;
+
+ com->priv = priv;
+ com->id = i;
+
+ if (gve_is_gqi(priv))
+ err = gve_tx_alloc_ring_gqi(priv, i);
+ else
+ err = gve_tx_alloc_ring_dqo(priv, i);
+ if (err != 0)
+ goto abort;
sprintf(mtx_name, "gvetx%d", i);
mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);
@@ -115,19 +167,13 @@ gve_tx_alloc_ring(struct gve_priv *priv, int i)
err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
PAGE_SIZE, &com->q_resources_mem);
if (err != 0) {
- device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i);
+ device_printf(priv->dev,
+ "Failed to alloc queue resources for tx ring %d", i);
goto abort;
}
com->q_resources = com->q_resources_mem.cpu_addr;
- err = gve_dma_alloc_coherent(priv,
- sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
- CACHE_LINE_SIZE, &tx->desc_ring_mem);
- if (err != 0) {
- device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i);
- goto abort;
- }
- tx->desc_ring = tx->desc_ring_mem.cpu_addr;
+ tx->last_kicked = 0;
return (0);
@@ -137,39 +183,32 @@ abort:
}
int
-gve_alloc_tx_rings(struct gve_priv *priv)
+gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
- int err = 0;
int i;
+ int err;
- priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
- M_GVE, M_WAITOK | M_ZERO);
+ KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));
- for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ for (i = start_idx; i < stop_idx; i++) {
err = gve_tx_alloc_ring(priv, i);
if (err != 0)
goto free_rings;
-
}
return (0);
-
free_rings:
- while (i--)
- gve_tx_free_ring(priv, i);
- free(priv->tx, M_GVE);
+ gve_free_tx_rings(priv, start_idx, i);
return (err);
}
void
-gve_free_tx_rings(struct gve_priv *priv)
+gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
int i;
- for (i = 0; i < priv->tx_cfg.num_queues; i++)
+ for (i = start_idx; i < stop_idx; i++)
gve_tx_free_ring(priv, i);
-
- free(priv->tx, M_GVE);
}
static void
@@ -181,6 +220,7 @@ gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
for (i = 0; i < com->priv->tx_desc_cnt; i++) {
tx->desc_ring[i] = (union gve_tx_desc){};
tx->info[i] = (struct gve_tx_buffer_state){};
+ gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec);
}
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
@@ -209,7 +249,11 @@ gve_start_tx_ring(struct gve_priv *priv, int i)
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
+ atomic_store_bool(&tx->stopped, false);
+ if (gve_is_gqi(priv))
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
+ else
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
taskqueue_thread_enqueue, &com->cleanup_tq);
taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
@@ -233,8 +277,12 @@ gve_create_tx_rings(struct gve_priv *priv)
if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
return (0);
- for (i = 0; i < priv->tx_cfg.num_queues; i++)
- gve_clear_tx_ring(priv, i);
+ for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ if (gve_is_gqi(priv))
+ gve_clear_tx_ring(priv, i);
+ else
+ gve_clear_tx_ring_dqo(priv, i);
+ }
err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
if (err != 0)
@@ -300,6 +348,30 @@ gve_destroy_tx_rings(struct gve_priv *priv)
}
int
+gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx)
+{
+ struct gve_tx_buffer_state *info;
+ uint32_t pkt_idx;
+ int num_timeouts;
+
+ num_timeouts = 0;
+
+ for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) {
+ info = &tx->info[pkt_idx];
+
+ if (!gve_timestamp_valid(&info->enqueue_time_sec))
+ continue;
+
+ if (__predict_false(
+ gve_seconds_since(&info->enqueue_time_sec) >
+ GVE_TX_TIMEOUT_PKT_SEC))
+ num_timeouts += 1;
+ }
+
+ return (num_timeouts);
+}
+
+int
gve_tx_intr(void *arg)
{
struct gve_tx_ring *tx = arg;
@@ -351,7 +423,10 @@ gve_tx_cleanup_tq(void *arg, int pending)
if (mbuf == NULL)
continue;
+ gve_invalidate_timestamp(&info->enqueue_time_sec);
+
info->mbuf = NULL;
+
counter_enter();
counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
counter_u64_add_protected(tx->stats.tpackets, 1);
@@ -375,7 +450,7 @@ gve_tx_cleanup_tq(void *arg, int pending)
* interrupt but they will still be handled by the enqueue below.
* Completions born after the barrier WILL trigger an interrupt.
*/
- mb();
+ atomic_thread_fence_seq_cst();
nic_done = gve_tx_load_event_counter(priv, tx);
todo = nic_done - tx->done;
@@ -383,6 +458,11 @@ gve_tx_cleanup_tq(void *arg, int pending)
gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
}
+
+ if (atomic_load_bool(&tx->stopped) && space_freed) {
+ atomic_store_bool(&tx->stopped, false);
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ }
}
static void
@@ -627,8 +707,7 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
if (__predict_false(!gve_can_tx(tx, bytes_required))) {
counter_enter();
- counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1);
- counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
+ counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
counter_exit();
return (ENOBUFS);
}
@@ -636,6 +715,8 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
/* So that the cleanup taskqueue can free the mbuf eventually. */
info->mbuf = mbuf;
+ gve_set_timestamp(&info->enqueue_time_sec);
+
/*
* We don't want to split the header, so if necessary, pad to the end
* of the fifo and then put the header at the beginning of the fifo.
@@ -689,19 +770,86 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
return (0);
}
+static int
+gve_xmit_mbuf(struct gve_tx_ring *tx,
+ struct mbuf **mbuf)
+{
+ if (gve_is_gqi(tx->com.priv))
+ return (gve_xmit(tx, *mbuf));
+
+ if (gve_is_qpl(tx->com.priv))
+ return (gve_xmit_dqo_qpl(tx, *mbuf));
+
+ /*
+ * gve_xmit_dqo might attempt to defrag the mbuf chain.
+ * The reference is passed in so that in the case of
+ * errors, the new mbuf chain is what's put back on the br.
+ */
+ return (gve_xmit_dqo(tx, mbuf));
+}
+
+/*
+ * Has the side-effect of stopping the xmit queue by setting tx->stopped
+ */
+static int
+gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
+ struct mbuf **mbuf)
+{
+ int err;
+
+ atomic_store_bool(&tx->stopped, true);
+
+ /*
+ * Room made in the queue BEFORE the barrier will be seen by the
+ * gve_xmit_mbuf retry below.
+ *
+ * If room is made in the queue AFTER the barrier, the cleanup tq
+ * iteration creating the room will either see a tx->stopped value
+ * of 0 or the 1 we just wrote:
+ *
+ * If it sees a 1, then it would enqueue the xmit tq. Enqueue
+ * implies a retry on the waiting pkt.
+ *
+ * If it sees a 0, then that implies a previous iteration overwrote
+ * our 1, and that iteration would enqueue the xmit tq. Enqueue
+ * implies a retry on the waiting pkt.
+ */
+ atomic_thread_fence_seq_cst();
+
+ err = gve_xmit_mbuf(tx, mbuf);
+ if (err == 0)
+ atomic_store_bool(&tx->stopped, false);
+
+ return (err);
+}
+
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
struct gve_priv *priv = tx->com.priv;
struct ifnet *ifp = priv->ifp;
struct mbuf *mbuf;
+ int err;
while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
(mbuf = drbr_peek(ifp, tx->br)) != NULL) {
+ err = gve_xmit_mbuf(tx, &mbuf);
- if (__predict_false(gve_xmit(tx, mbuf) != 0)) {
- drbr_putback(ifp, tx->br, mbuf);
- taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ /*
+ * We need to stop this taskqueue when we can't xmit the pkt due
+ * to lack of space in the NIC ring (ENOBUFS). The retry exists
+ * to guard against a TOCTTOU bug that could end up freezing the
+ * queue forever.
+ */
+ if (__predict_false(mbuf != NULL && err == ENOBUFS))
+ err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);
+
+ if (__predict_false(err != 0 && mbuf != NULL)) {
+ if (err == EINVAL) {
+ drbr_advance(ifp, tx->br);
+ m_freem(mbuf);
+ } else
+ drbr_putback(ifp, tx->br, mbuf);
break;
}
@@ -710,7 +858,12 @@ gve_xmit_br(struct gve_tx_ring *tx)
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
BUS_DMASYNC_PREWRITE);
- gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
+
+ if (gve_is_gqi(priv))
+ gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
+ else
+ gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
+ tx->dqo.desc_tail);
}
}
@@ -763,7 +916,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
is_br_empty = drbr_empty(ifp, tx->br);
err = drbr_enqueue(ifp, tx->br, mbuf);
if (__predict_false(err != 0)) {
- taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ if (!atomic_load_bool(&tx->stopped))
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
counter_enter();
counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
@@ -778,9 +932,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
gve_xmit_br(tx);
GVE_RING_UNLOCK(tx);
- } else {
+ } else if (!atomic_load_bool(&tx->stopped))
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
- }
return (0);
}
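
For readers following the new ENOBUFS handling: the comments added in gve_xmit_retry_enobuf_mbuf() and at the tail of gve_tx_cleanup_tq() describe a store/fence/load handshake between the transmit path and the cleanup taskqueue. The fragment below is a minimal, self-contained C11-atomics sketch of that handshake; the names (toy_txq, free_slots) and the ring model are simplified and hypothetical, so treat it as an illustration of the pattern rather than the driver's implementation.

/*
 * Illustrative sketch only -- not driver code. A minimal C11-atomics model
 * of the stop/wake handshake this diff adds around tx->stopped and
 * atomic_thread_fence_seq_cst().
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>

struct toy_txq {
	atomic_bool stopped;	/* mirrors tx->stopped */
	atomic_int free_slots;	/* stands in for space in the NIC ring */
};

/* Transmit side: the shape of gve_xmit_retry_enobuf_mbuf(). */
static int
toy_xmit_retry_enobuf(struct toy_txq *q)
{
	atomic_store(&q->stopped, true);

	/*
	 * Full fence: either the cleanup side sees stopped == true after it
	 * frees space (and wakes the xmit task), or the space it freed is
	 * visible to the retry below. Either way the waiting packet is not
	 * stranded.
	 */
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load(&q->free_slots) > 0) {	/* the retried "xmit" */
		atomic_store(&q->stopped, false);
		return (0);
	}
	return (ENOBUFS);
}

/* Completion side: the shape of the tail of gve_tx_cleanup_tq(). */
static void
toy_cleanup(struct toy_txq *q, int slots_freed)
{
	atomic_fetch_add(&q->free_slots, slots_freed);

	atomic_thread_fence(memory_order_seq_cst);

	if (slots_freed > 0 && atomic_load(&q->stopped)) {
		atomic_store(&q->stopped, false);
		/* here the driver enqueues tx->xmit_task to retry */
	}
}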