Diffstat (limited to 'sys/dev/gve')
-rw-r--r--  sys/dev/gve/gve.h         |  400
-rw-r--r--  sys/dev/gve/gve_adminq.c  |  215
-rw-r--r--  sys/dev/gve/gve_adminq.h  |   84
-rw-r--r--  sys/dev/gve/gve_desc.h    |    4
-rw-r--r--  sys/dev/gve/gve_dqo.h     |  337
-rw-r--r--  sys/dev/gve/gve_main.c    |  381
-rw-r--r--  sys/dev/gve/gve_plat.h    |    3
-rw-r--r--  sys/dev/gve/gve_qpl.c     |  187
-rw-r--r--  sys/dev/gve/gve_rx.c      |  163
-rw-r--r--  sys/dev/gve/gve_rx_dqo.c  | 1035
-rw-r--r--  sys/dev/gve/gve_sysctl.c  |  252
-rw-r--r--  sys/dev/gve/gve_tx.c      |  269
-rw-r--r--  sys/dev/gve/gve_tx_dqo.c  | 1149
-rw-r--r--  sys/dev/gve/gve_utils.c   |   95
14 files changed, 4228 insertions(+), 346 deletions(-)
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
index c446199dff2d..64c2a0481817 100644
--- a/sys/dev/gve/gve.h
+++ b/sys/dev/gve/gve.h
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -47,12 +47,31 @@
#define GVE_TX_MAX_DESCS 4
#define GVE_TX_BUFRING_ENTRIES 4096
+#define GVE_TX_TIMEOUT_PKT_SEC 5
+#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5
+/*
+ * If the driver finds timed out packets on a tx queue it first kicks it and
+ * records the time. If the driver again finds a timeout on the same queue
+ * before the end of the cooldown period, only then will it reset. Thus, for a
+ * reset to be able to occur at all, the cooldown must be at least as long
+ * as the tx timeout checking cadence multiplied by the number of queues.
+ */
+#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16
+#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \
+ (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES)
+
+#define GVE_TIMESTAMP_INVALID -1
+
#define ADMINQ_SIZE PAGE_SIZE
#define GVE_DEFAULT_RX_BUFFER_SIZE 2048
+#define GVE_4K_RX_BUFFER_SIZE_DQO 4096
/* Each RX bounce buffer page can fit two packet buffers. */
#define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2)
+/* PTYPEs are always 10 bits. */
+#define GVE_NUM_PTYPES 1024
+
/*
* Number of descriptors per queue page list.
* Page count AKA QPL size can be derived by dividing the number of elements in
@@ -60,8 +79,17 @@
*/
#define GVE_QPL_DIVISOR 16
+/* Ring Size Limits */
+#define GVE_DEFAULT_MIN_RX_RING_SIZE 512
+#define GVE_DEFAULT_MIN_TX_RING_SIZE 256
+
static MALLOC_DEFINE(M_GVE, "gve", "gve allocations");
+_Static_assert(MCLBYTES >= GVE_DEFAULT_RX_BUFFER_SIZE,
+ "gve: bad MCLBYTES length");
+_Static_assert(MJUMPAGESIZE >= GVE_4K_RX_BUFFER_SIZE_DQO,
+ "gve: bad MJUMPAGESIZE length");
+
struct gve_dma_handle {
bus_addr_t bus_addr;
void *cpu_addr;
@@ -102,6 +130,7 @@ enum gve_queue_format {
GVE_GQI_RDA_FORMAT = 0x1,
GVE_GQI_QPL_FORMAT = 0x2,
GVE_DQO_RDA_FORMAT = 0x3,
+ GVE_DQO_QPL_FORMAT = 0x4,
};
enum gve_state_flags_bit {
@@ -223,31 +252,93 @@ struct gve_rxq_stats {
counter_u64_t rx_frag_flip_cnt;
counter_u64_t rx_frag_copy_cnt;
counter_u64_t rx_dropped_pkt_desc_err;
+ counter_u64_t rx_dropped_pkt_buf_post_fail;
counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
+ counter_u64_t rx_mbuf_dmamap_err;
+ counter_u64_t rx_mbuf_mclget_null;
};
#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))
+union gve_rx_qpl_buf_id_dqo {
+ struct {
+ uint16_t buf_id:11; /* Index into rx->dqo.bufs */
+ uint8_t frag_num:5; /* Which frag in the QPL page */
+ };
+ uint16_t all;
+} __packed;
+_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2,
+ "gve: bad dqo qpl rx buf id length");
+
+struct gve_rx_buf_dqo {
+ union {
+ /* RDA */
+ struct {
+ struct mbuf *mbuf;
+ bus_dmamap_t dmamap;
+ uint64_t addr;
+ bool mapped;
+ };
+ /* QPL */
+ struct {
+ uint8_t num_nic_frags; /* number of pending completions */
+ uint8_t next_idx; /* index of the next frag to post */
+ /* for chaining rx->dqo.used_bufs */
+ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry;
+ };
+ };
+ /* for chaining rx->dqo.free_bufs */
+ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry;
+};
+
/* power-of-2 sized receive ring */
struct gve_rx_ring {
struct gve_ring_com com;
struct gve_dma_handle desc_ring_mem;
- struct gve_dma_handle data_ring_mem;
-
- /* accessed in the receive hot path */
- struct {
- struct gve_rx_desc *desc_ring;
- union gve_rx_data_slot *data_ring;
- struct gve_rx_slot_page_info *page_info;
-
- struct gve_rx_ctx ctx;
- struct lro_ctrl lro;
- uint8_t seq_no; /* helps traverse the descriptor ring */
- uint32_t cnt; /* free-running total number of completed packets */
- uint32_t fill_cnt; /* free-running total number of descs and buffs posted */
- uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */
- struct gve_rxq_stats stats;
- } __aligned(CACHE_LINE_SIZE);
+ uint32_t cnt; /* free-running total number of completed packets */
+ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */
+
+ union {
+ /* GQI-only fields */
+ struct {
+ struct gve_dma_handle data_ring_mem;
+
+ /* accessed in the GQ receive hot path */
+ struct gve_rx_desc *desc_ring;
+ union gve_rx_data_slot *data_ring;
+ struct gve_rx_slot_page_info *page_info;
+ uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */
+ uint8_t seq_no; /* helps traverse the descriptor ring */
+ };
+
+ /* DQO-only fields */
+ struct {
+ struct gve_dma_handle compl_ring_mem;
+
+ struct gve_rx_compl_desc_dqo *compl_ring;
+ struct gve_rx_desc_dqo *desc_ring;
+ struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */
+ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */
+
+ uint32_t buf_cnt; /* Size of the bufs array */
+ uint32_t mask; /* One less than the sizes of the desc and compl rings */
+ uint32_t head; /* The index at which to post the next buffer */
+ uint32_t tail; /* The index at which to receive the next compl */
+ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */
+ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs;
+
+ /*
+ * Only used in QPL mode. Pages referred to by if_input-ed mbufs
+ * stay parked here till their wire count comes back to 1.
+ * Pages are moved here after there aren't any pending completions.
+ */
+ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs;
+ } dqo;
+ };
+
+ struct lro_ctrl lro;
+ struct gve_rx_ctx ctx;
+ struct gve_rxq_stats stats;
} __aligned(CACHE_LINE_SIZE);
@@ -267,6 +358,14 @@ struct gve_tx_fifo {
struct gve_tx_buffer_state {
struct mbuf *mbuf;
+
+ /*
+ * Time at which the xmit tq places descriptors for mbuf's payload on a
+ * tx queue. This timestamp is invalidated when the mbuf is freed and
+ * must be checked for validity when read.
+ */
+ int64_t enqueue_time_sec;
+
struct gve_tx_iovec iov[GVE_TX_MAX_DESCS];
};
@@ -275,13 +374,50 @@ struct gve_txq_stats {
counter_u64_t tpackets;
counter_u64_t tso_packet_cnt;
counter_u64_t tx_dropped_pkt;
- counter_u64_t tx_dropped_pkt_nospace_device;
+ counter_u64_t tx_delayed_pkt_nospace_device;
counter_u64_t tx_dropped_pkt_nospace_bufring;
+ counter_u64_t tx_delayed_pkt_nospace_descring;
+ counter_u64_t tx_delayed_pkt_nospace_compring;
+ counter_u64_t tx_delayed_pkt_nospace_qpl_bufs;
+ counter_u64_t tx_delayed_pkt_tsoerr;
counter_u64_t tx_dropped_pkt_vlan;
+ counter_u64_t tx_mbuf_collapse;
+ counter_u64_t tx_mbuf_defrag;
+ counter_u64_t tx_mbuf_defrag_err;
+ counter_u64_t tx_mbuf_dmamap_enomem_err;
+ counter_u64_t tx_mbuf_dmamap_err;
+ counter_u64_t tx_timeout;
};
#define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t))
+struct gve_tx_pending_pkt_dqo {
+ struct mbuf *mbuf;
+
+ /*
+ * Time at which the xmit tq places descriptors for mbuf's payload on a
+ * tx queue. This timestamp is invalidated when the mbuf is freed and
+ * must be checked for validity when read.
+ */
+ int64_t enqueue_time_sec;
+
+ union {
+ /* RDA */
+ bus_dmamap_t dmamap;
+ /* QPL */
+ struct {
+ /*
+ * A linked list of entries from qpl_bufs that served
+ * as the bounce buffer for this packet.
+ */
+ int32_t qpl_buf_head;
+ uint32_t num_qpl_bufs;
+ };
+ };
+ uint8_t state; /* the gve_packet_state enum */
+ int next; /* To chain the free_pending_pkts lists */
+};
+
/* power-of-2 sized transmit ring */
struct gve_tx_ring {
struct gve_ring_com com;
@@ -289,23 +425,134 @@ struct gve_tx_ring {
struct task xmit_task;
struct taskqueue *xmit_tq;
+ bool stopped;
+
+ /* Accessed when writing descriptors */
+ struct buf_ring *br;
+ struct mtx ring_mtx;
+
+ uint32_t req; /* free-running total number of packets written to the nic */
+ uint32_t done; /* free-running total number of completed packets */
+
+ int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */
+
+ union {
+ /* GQI specific stuff */
+ struct {
+ union gve_tx_desc *desc_ring;
+ struct gve_tx_buffer_state *info;
+
+ struct gve_tx_fifo fifo;
+
+ uint32_t mask; /* masks the req and done to the size of the ring */
+ };
+
+ /* DQO specific stuff */
+ struct {
+ struct gve_dma_handle compl_ring_mem;
+
+ /* Accessed when writing descriptors */
+ struct {
+ union gve_tx_desc_dqo *desc_ring;
+ uint32_t desc_mask; /* masks head and tail to the size of desc_ring */
+ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */
+ uint32_t desc_tail; /* last desc written by driver */
+ uint32_t last_re_idx; /* desc which last had "report event" set */
+
+ /*
+ * The head index of a singly linked list containing pending packet objects
+ * to park mbufs till the NIC sends completions. Once this list is depleted,
+ * the "_prd" suffixed producer list, grown by the completion taskqueue,
+ * is stolen.
+ */
+ int32_t free_pending_pkts_csm;
+
+ /*
+ * The head index of a singly linked list representing QPL page fragments
+ * to copy mbuf payload into for the NIC to see. Once this list is depleted,
+ * the "_prd" suffixed producer list, grown by the completion taskqueue,
+ * is stolen.
+ *
+ * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+ */
+ int32_t free_qpl_bufs_csm;
+ uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */
+ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */
+
+ /* DMA params for mapping Tx mbufs. Only used in RDA mode. */
+ bus_dma_tag_t buf_dmatag;
+ } __aligned(CACHE_LINE_SIZE);
+
+ /* Accessed when processing completions */
+ struct {
+ struct gve_tx_compl_desc_dqo *compl_ring;
+ uint32_t compl_mask; /* masks head to the size of compl_ring */
+ uint32_t compl_head; /* last completion read by driver */
+ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */
+ uint32_t hw_tx_head; /* last desc read by NIC */
+
+ /*
+ * The completion taskqueue moves pending-packet objects to this
+ * list after freeing the mbuf. The "_prd" denotes that this is
+ * a producer list. The transmit taskqueue steals this list once
+ * its consumer list, with the "_csm" suffix, is depleted.
+ */
+ int32_t free_pending_pkts_prd;
+
+ /*
+ * The completion taskqueue moves the QPL pages corresponding to a
+ * completed packet into this list. It is only used in QPL mode.
+ * The "_prd" denotes that this is a producer list. The transmit
+ * taskqueue steals this list once its consumer list, with the "_csm"
+ * suffix, is depleted.
+ *
+ * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+ */
+ int32_t free_qpl_bufs_prd;
+ uint32_t qpl_bufs_produced;
+ } __aligned(CACHE_LINE_SIZE);
+
+ /* Accessed by both the completion and xmit loops */
+ struct {
+ /* completion tags index into this array */
+ struct gve_tx_pending_pkt_dqo *pending_pkts;
+ uint16_t num_pending_pkts;
+
+ /*
+ * Represents QPL page fragments. An index into this array
+ * always represents the same QPL page fragment. The value
+ * is also an index into this array and serves as a means
+ * to chain buffers into linked lists whose heads are
+ * either free_qpl_bufs_prd or free_qpl_bufs_csm or
+ * qpl_bufs_head.
+ */
+ int32_t *qpl_bufs;
+ } __aligned(CACHE_LINE_SIZE);
+ } dqo;
+ };
+ struct gve_txq_stats stats;
+} __aligned(CACHE_LINE_SIZE);
- /* accessed in the transmit hot path */
- struct {
- union gve_tx_desc *desc_ring;
- struct gve_tx_buffer_state *info;
- struct buf_ring *br;
-
- struct gve_tx_fifo fifo;
- struct mtx ring_mtx;
+enum gve_packet_state {
+ /*
+ * Packet does not yet have a dmamap created.
+ * This should always be zero since state is not explicitly initialized.
+ */
+ GVE_PACKET_STATE_UNALLOCATED,
+ /* Packet has a dmamap and is in free list, available to be allocated. */
+ GVE_PACKET_STATE_FREE,
+ /* Packet is expecting a regular data completion */
+ GVE_PACKET_STATE_PENDING_DATA_COMPL,
+};
- uint32_t req; /* free-running total number of packets written to the nic */
- uint32_t done; /* free-running total number of completed packets */
- uint32_t mask; /* masks the req and done to the size of the ring */
- struct gve_txq_stats stats;
- } __aligned(CACHE_LINE_SIZE);
+struct gve_ptype {
+ uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */
+ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */
+};
-} __aligned(CACHE_LINE_SIZE);
+struct gve_ptype_lut {
+ struct gve_ptype ptypes[GVE_NUM_PTYPES];
+};
struct gve_priv {
if_t ifp;
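The consumer/producer split described in the comments of the hunk above lets the transmit taskqueue pop free pending-packet slots without contending with the completion taskqueue: allocation pops from the "_csm" list, and only when that list runs dry is the "_prd" list, grown by the completion path, atomically stolen. The following is only an illustrative sketch of that allocation step, assuming the lists are chained through the pending packets' next fields and terminated with -1; the helper name is hypothetical and the real gve_tx_dqo.c code differs in detail.

static int32_t
gve_alloc_pending_pkt_sketch(struct gve_tx_ring *tx)
{
	int32_t index = tx->dqo.free_pending_pkts_csm;

	if (index == -1) {
		/*
		 * Consumer list exhausted: atomically steal the whole
		 * producer list grown by the completion taskqueue.
		 */
		index = (int32_t)atomic_swap_32(
		    (volatile uint32_t *)&tx->dqo.free_pending_pkts_prd,
		    (uint32_t)-1);
		if (index == -1)
			return (-1);	/* No free pending-packet slots */
	}

	/* Pop the head; next chains indices within dqo.pending_pkts[]. */
	tx->dqo.free_pending_pkts_csm = tx->dqo.pending_pkts[index].next;
	return (index);
}

The completion side would push freed slots onto the "_prd" head, for example with a compare-and-swap loop, so the two taskqueues never fight over a single list head; the same stealing pattern applies to the QPL bounce-buffer lists (free_qpl_bufs_csm and free_qpl_bufs_prd).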
@@ -326,12 +573,17 @@ struct gve_priv {
uint16_t num_event_counters;
uint16_t default_num_queues;
uint16_t tx_desc_cnt;
+ uint16_t max_tx_desc_cnt;
+ uint16_t min_tx_desc_cnt;
uint16_t rx_desc_cnt;
+ uint16_t max_rx_desc_cnt;
+ uint16_t min_rx_desc_cnt;
uint16_t rx_pages_per_qpl;
uint64_t max_registered_pages;
uint64_t num_registered_pages;
uint32_t supported_features;
uint16_t max_mtu;
+ bool modify_ringsize_enabled;
struct gve_dma_handle counter_array_mem;
__be32 *counters;
@@ -339,7 +591,6 @@ struct gve_priv {
struct gve_irq_db *irq_db_indices;
enum gve_queue_format queue_format;
- struct gve_queue_page_list *qpls;
struct gve_queue_config tx_cfg;
struct gve_queue_config rx_cfg;
uint32_t num_queues;
@@ -348,6 +599,8 @@ struct gve_priv {
struct gve_tx_ring *tx;
struct gve_rx_ring *rx;
+ struct gve_ptype_lut *ptype_lut_dqo;
+
/*
* Admin queue - see gve_adminq.h
* Since AQ cmds do not run in steady state, 32 bit counters suffice
@@ -370,6 +623,7 @@ struct gve_priv {
uint32_t adminq_dcfg_device_resources_cnt;
uint32_t adminq_set_driver_parameter_cnt;
uint32_t adminq_verify_driver_compatibility_cnt;
+ uint32_t adminq_get_ptype_map_cnt;
uint32_t interface_up_cnt;
uint32_t interface_down_cnt;
@@ -380,6 +634,12 @@ struct gve_priv {
struct gve_state_flags state_flags;
struct sx gve_iface_lock;
+
+ struct callout tx_timeout_service;
+ /* The index of tx queue that the timer service will check on its next invocation */
+ uint16_t check_tx_queue_idx;
+
+ uint16_t rx_buf_size_dqo;
};
static inline bool
@@ -400,39 +660,89 @@ gve_clear_state_flag(struct gve_priv *priv, int pos)
BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags);
}
+static inline bool
+gve_is_gqi(struct gve_priv *priv)
+{
+ return (priv->queue_format == GVE_GQI_QPL_FORMAT);
+}
+
+static inline bool
+gve_is_qpl(struct gve_priv *priv)
+{
+ return (priv->queue_format == GVE_GQI_QPL_FORMAT ||
+ priv->queue_format == GVE_DQO_QPL_FORMAT);
+}
+
+static inline bool
+gve_is_4k_rx_buf(struct gve_priv *priv)
+{
+ return (priv->rx_buf_size_dqo == GVE_4K_RX_BUFFER_SIZE_DQO);
+}
+
+static inline bus_size_t
+gve_rx_dqo_mbuf_segment_size(struct gve_priv *priv)
+{
+ return (gve_is_4k_rx_buf(priv) ? MJUMPAGESIZE : MCLBYTES);
+}
+
/* Defined in gve_main.c */
void gve_schedule_reset(struct gve_priv *priv);
+int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt);
+int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt);
+int gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx);
/* Register access functions defined in gve_utils.c */
uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset);
void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
+void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
/* QPL (Queue Page List) functions defined in gve_qpl.c */
-int gve_alloc_qpls(struct gve_priv *priv);
-void gve_free_qpls(struct gve_priv *priv);
+struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id,
+ int npages, bool single_kva);
+void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl);
int gve_register_qpls(struct gve_priv *priv);
int gve_unregister_qpls(struct gve_priv *priv);
+void gve_mextadd_free(struct mbuf *mbuf);
/* TX functions defined in gve_tx.c */
-int gve_alloc_tx_rings(struct gve_priv *priv);
-void gve_free_tx_rings(struct gve_priv *priv);
+int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
+void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
int gve_create_tx_rings(struct gve_priv *priv);
int gve_destroy_tx_rings(struct gve_priv *priv);
+int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx);
int gve_tx_intr(void *arg);
int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf);
void gve_qflush(if_t ifp);
void gve_xmit_tq(void *arg, int pending);
void gve_tx_cleanup_tq(void *arg, int pending);
+/* TX functions defined in gve_tx_dqo.c */
+int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i);
+void gve_tx_free_ring_dqo(struct gve_priv *priv, int i);
+void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i);
+int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx);
+int gve_tx_intr_dqo(void *arg);
+int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr);
+int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf);
+void gve_tx_cleanup_tq_dqo(void *arg, int pending);
+
/* RX functions defined in gve_rx.c */
-int gve_alloc_rx_rings(struct gve_priv *priv);
-void gve_free_rx_rings(struct gve_priv *priv);
+int gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
+void gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
int gve_create_rx_rings(struct gve_priv *priv);
int gve_destroy_rx_rings(struct gve_priv *priv);
int gve_rx_intr(void *arg);
void gve_rx_cleanup_tq(void *arg, int pending);
+/* RX functions defined in gve_rx_dqo.c */
+int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i);
+void gve_rx_free_ring_dqo(struct gve_priv *priv, int i);
+void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx);
+void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i);
+int gve_rx_intr_dqo(void *arg);
+void gve_rx_cleanup_tq_dqo(void *arg, int pending);
+
/* DMA functions defined in gve_utils.c */
int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align,
struct gve_dma_handle *dma);
@@ -447,7 +757,17 @@ int gve_alloc_irqs(struct gve_priv *priv);
void gve_unmask_all_queue_irqs(struct gve_priv *priv);
void gve_mask_all_queue_irqs(struct gve_priv *priv);
-/* Systcl functions defined in gve_sysctl.c*/
+/* Miscellaneous functions defined in gve_utils.c */
+void gve_invalidate_timestamp(int64_t *timestamp_sec);
+int64_t gve_seconds_since(int64_t *timestamp_sec);
+void gve_set_timestamp(int64_t *timestamp_sec);
+bool gve_timestamp_valid(int64_t *timestamp_sec);
+
+/* Sysctl functions defined in gve_sysctl.c */
+extern bool gve_disable_hw_lro;
+extern bool gve_allow_4k_rx_buffers;
+extern char gve_queue_format[8];
+extern char gve_version[8];
void gve_setup_sysctl(struct gve_priv *priv);
void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets,
uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets,
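The four timestamp helpers declared above underpin the new TX timeout service: enqueue_time_sec fields are stamped when a packet's descriptors are posted, invalidated when the mbuf is freed, and later compared against GVE_TX_TIMEOUT_PKT_SEC by the timeout callout. Their bodies live in gve_utils.c and are not part of this diff hunk; what follows is only a sketch of plausible implementations, assuming the helpers sit on the kernel's time_uptime seconds counter and 64-bit atomics.

/* Hypothetical sketch only; the real gve_utils.c bodies may differ. */
void
gve_set_timestamp(int64_t *timestamp_sec)
{
	/* Record "now" in whole seconds since boot. */
	atomic_store_rel_64((volatile uint64_t *)timestamp_sec,
	    (uint64_t)time_uptime);
}

void
gve_invalidate_timestamp(int64_t *timestamp_sec)
{
	atomic_store_rel_64((volatile uint64_t *)timestamp_sec,
	    (uint64_t)GVE_TIMESTAMP_INVALID);
}

bool
gve_timestamp_valid(int64_t *timestamp_sec)
{
	return ((int64_t)atomic_load_acq_64(
	    (volatile uint64_t *)timestamp_sec) != GVE_TIMESTAMP_INVALID);
}

int64_t
gve_seconds_since(int64_t *timestamp_sec)
{
	/* Only meaningful when the stamp is valid. */
	return ((int64_t)time_uptime - (int64_t)atomic_load_acq_64(
	    (volatile uint64_t *)timestamp_sec));
}

gve_handle_tx_timeout() in the gve_main.c hunks below then uses gve_seconds_since(&tx->last_kicked) to decide between kicking the cleanup taskqueue and scheduling a full device reset.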
diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c
index 3c332607ebd4..9b59570a2af4 100644
--- a/sys/dev/gve/gve_adminq.c
+++ b/sys/dev/gve/gve_adminq.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -57,6 +57,9 @@ void gve_parse_device_option(struct gve_priv *priv,
struct gve_device_descriptor *device_descriptor,
struct gve_device_option *option,
struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
+ struct gve_device_option_dqo_rda **dev_op_dqo_rda,
+ struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
+ struct gve_device_option_modify_ring **dev_op_modify_ring,
struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
{
uint32_t req_feat_mask = be32toh(option->required_features_mask);
@@ -85,6 +88,68 @@ void gve_parse_device_option(struct gve_priv *priv,
*dev_op_gqi_qpl = (void *)(option + 1);
break;
+ case GVE_DEV_OPT_ID_DQO_RDA:
+ if (option_length < sizeof(**dev_op_dqo_rda) ||
+ req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+ "DQO RDA", (int)sizeof(**dev_op_dqo_rda),
+ GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA,
+ option_length, req_feat_mask);
+ break;
+ }
+
+ if (option_length > sizeof(**dev_op_dqo_rda)) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
+ "DQO RDA");
+ }
+ *dev_op_dqo_rda = (void *)(option + 1);
+ break;
+
+ case GVE_DEV_OPT_ID_DQO_QPL:
+ if (option_length < sizeof(**dev_op_dqo_qpl) ||
+ req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+ "DQO QPL", (int)sizeof(**dev_op_dqo_qpl),
+ GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL,
+ option_length, req_feat_mask);
+ break;
+ }
+
+ if (option_length > sizeof(**dev_op_dqo_qpl)) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
+ "DQO QPL");
+ }
+ *dev_op_dqo_qpl = (void *)(option + 1);
+ break;
+
+ case GVE_DEV_OPT_ID_MODIFY_RING:
+ if (option_length < (sizeof(**dev_op_modify_ring) -
+ sizeof(struct gve_ring_size_bound)) ||
+ req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+ "Modify Ring", (int)sizeof(**dev_op_modify_ring),
+ GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING,
+ option_length, req_feat_mask);
+ break;
+ }
+
+ if (option_length > sizeof(**dev_op_modify_ring)) {
+ device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
+ "Modify Ring");
+ }
+ *dev_op_modify_ring = (void *)(option + 1);
+
+ /* Min ring size included; set the minimum ring size. */
+ if (option_length == sizeof(**dev_op_modify_ring)) {
+ priv->min_rx_desc_cnt = max(
+ be16toh((*dev_op_modify_ring)->min_ring_size.rx),
+ GVE_DEFAULT_MIN_RX_RING_SIZE);
+ priv->min_tx_desc_cnt = max(
+ be16toh((*dev_op_modify_ring)->min_ring_size.tx),
+ GVE_DEFAULT_MIN_TX_RING_SIZE);
+ }
+ break;
+
case GVE_DEV_OPT_ID_JUMBO_FRAMES:
if (option_length < sizeof(**dev_op_jumbo_frames) ||
req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) {
@@ -117,6 +182,9 @@ static int
gve_process_device_options(struct gve_priv *priv,
struct gve_device_descriptor *descriptor,
struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
+ struct gve_device_option_dqo_rda **dev_op_dqo_rda,
+ struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
+ struct gve_device_option_modify_ring **dev_op_modify_ring,
struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
{
char *desc_end = (char *)descriptor + be16toh(descriptor->total_length);
@@ -130,12 +198,16 @@ gve_process_device_options(struct gve_priv *priv,
if ((char *)(dev_opt + 1) > desc_end ||
(char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) {
device_printf(priv->dev,
- "options exceed device_descriptor's total length.\n");
+ "options exceed device descriptor's total length.\n");
return (EINVAL);
}
gve_parse_device_option(priv, descriptor, dev_opt,
- dev_op_gqi_qpl, dev_op_jumbo_frames);
+ dev_op_gqi_qpl,
+ dev_op_dqo_rda,
+ dev_op_dqo_qpl,
+ dev_op_modify_ring,
+ dev_op_jumbo_frames);
dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length));
}
@@ -221,16 +293,38 @@ gve_adminq_create_rx_queue(struct gve_priv *priv, uint32_t queue_index)
cmd.opcode = htobe32(GVE_ADMINQ_CREATE_RX_QUEUE);
cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
.queue_id = htobe32(queue_index),
- .index = htobe32(queue_index),
.ntfy_id = htobe32(rx->com.ntfy_id),
.queue_resources_addr = htobe64(qres_dma->bus_addr),
- .rx_desc_ring_addr = htobe64(rx->desc_ring_mem.bus_addr),
- .rx_data_ring_addr = htobe64(rx->data_ring_mem.bus_addr),
- .queue_page_list_id = htobe32((rx->com.qpl)->id),
.rx_ring_size = htobe16(priv->rx_desc_cnt),
- .packet_buffer_size = htobe16(GVE_DEFAULT_RX_BUFFER_SIZE),
};
+ if (gve_is_gqi(priv)) {
+ cmd.create_rx_queue.rx_desc_ring_addr =
+ htobe64(rx->desc_ring_mem.bus_addr);
+ cmd.create_rx_queue.rx_data_ring_addr =
+ htobe64(rx->data_ring_mem.bus_addr);
+ cmd.create_rx_queue.index =
+ htobe32(queue_index);
+ cmd.create_rx_queue.queue_page_list_id =
+ htobe32((rx->com.qpl)->id);
+ cmd.create_rx_queue.packet_buffer_size =
+ htobe16(GVE_DEFAULT_RX_BUFFER_SIZE);
+ } else {
+ cmd.create_rx_queue.queue_page_list_id =
+ htobe32(GVE_RAW_ADDRESSING_QPL_ID);
+ cmd.create_rx_queue.rx_desc_ring_addr =
+ htobe64(rx->dqo.compl_ring_mem.bus_addr);
+ cmd.create_rx_queue.rx_data_ring_addr =
+ htobe64(rx->desc_ring_mem.bus_addr);
+ cmd.create_rx_queue.rx_buff_ring_size =
+ htobe16(priv->rx_desc_cnt);
+ cmd.create_rx_queue.enable_rsc =
+ !!((if_getcapenable(priv->ifp) & IFCAP_LRO) &&
+ !gve_disable_hw_lro);
+ cmd.create_rx_queue.packet_buffer_size =
+ htobe16(priv->rx_buf_size_dqo);
+ }
+
return (gve_adminq_execute_cmd(priv, &cmd));
}
@@ -272,11 +366,21 @@ gve_adminq_create_tx_queue(struct gve_priv *priv, uint32_t queue_index)
.queue_id = htobe32(queue_index),
.queue_resources_addr = htobe64(qres_dma->bus_addr),
.tx_ring_addr = htobe64(tx->desc_ring_mem.bus_addr),
- .queue_page_list_id = htobe32((tx->com.qpl)->id),
.ntfy_id = htobe32(tx->com.ntfy_id),
.tx_ring_size = htobe16(priv->tx_desc_cnt),
};
+ if (gve_is_gqi(priv)) {
+ cmd.create_tx_queue.queue_page_list_id =
+ htobe32((tx->com.qpl)->id);
+ } else {
+ cmd.create_tx_queue.queue_page_list_id =
+ htobe32(GVE_RAW_ADDRESSING_QPL_ID);
+ cmd.create_tx_queue.tx_comp_ring_addr =
+ htobe64(tx->dqo.compl_ring_mem.bus_addr);
+ cmd.create_tx_queue.tx_comp_ring_size =
+ htobe16(priv->tx_desc_cnt);
+ }
return (gve_adminq_execute_cmd(priv, &cmd));
}
@@ -320,8 +424,18 @@ gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu) {
static void
gve_enable_supported_features(struct gve_priv *priv,
uint32_t supported_features_mask,
+ const struct gve_device_option_modify_ring *dev_op_modify_ring,
const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames)
{
+ if (dev_op_modify_ring &&
+ (supported_features_mask & GVE_SUP_MODIFY_RING_MASK)) {
+ if (bootverbose)
+ device_printf(priv->dev, "MODIFY RING device option enabled.\n");
+ priv->modify_ringsize_enabled = true;
+ priv->max_rx_desc_cnt = be16toh(dev_op_modify_ring->max_ring_size.rx);
+ priv->max_tx_desc_cnt = be16toh(dev_op_modify_ring->max_ring_size.tx);
+ }
+
if (dev_op_jumbo_frames &&
(supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) {
if (bootverbose)
@@ -338,6 +452,9 @@ gve_adminq_describe_device(struct gve_priv *priv)
struct gve_device_descriptor *desc;
struct gve_dma_handle desc_mem;
struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
+ struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
+ struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL;
+ struct gve_device_option_modify_ring *dev_op_modify_ring = NULL;
struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
uint32_t supported_features_mask = 0;
int rc;
@@ -366,12 +483,40 @@ gve_adminq_describe_device(struct gve_priv *priv)
bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD);
- rc = gve_process_device_options(priv, desc, &dev_op_gqi_qpl,
+ /* Default min in case device options don't have min values */
+ priv->min_rx_desc_cnt = GVE_DEFAULT_MIN_RX_RING_SIZE;
+ priv->min_tx_desc_cnt = GVE_DEFAULT_MIN_TX_RING_SIZE;
+
+ rc = gve_process_device_options(priv, desc,
+ &dev_op_gqi_qpl,
+ &dev_op_dqo_rda,
+ &dev_op_dqo_qpl,
+ &dev_op_modify_ring,
&dev_op_jumbo_frames);
if (rc != 0)
goto free_device_descriptor;
- if (dev_op_gqi_qpl != NULL) {
+ if (dev_op_dqo_rda != NULL) {
+ snprintf(gve_queue_format, sizeof(gve_queue_format),
+ "%s", "DQO RDA");
+ priv->queue_format = GVE_DQO_RDA_FORMAT;
+ supported_features_mask = be32toh(
+ dev_op_dqo_rda->supported_features_mask);
+ if (bootverbose)
+ device_printf(priv->dev,
+ "Driver is running with DQO RDA queue format.\n");
+ } else if (dev_op_dqo_qpl != NULL) {
+ snprintf(gve_queue_format, sizeof(gve_queue_format),
+ "%s", "DQO QPL");
+ priv->queue_format = GVE_DQO_QPL_FORMAT;
+ supported_features_mask = be32toh(
+ dev_op_dqo_qpl->supported_features_mask);
+ if (bootverbose)
+ device_printf(priv->dev,
+ "Driver is running with DQO QPL queue format.\n");
+ } else if (dev_op_gqi_qpl != NULL) {
+ snprintf(gve_queue_format, sizeof(gve_queue_format),
+ "%s", "GQI QPL");
priv->queue_format = GVE_GQI_QPL_FORMAT;
supported_features_mask = be32toh(
dev_op_gqi_qpl->supported_features_mask);
@@ -380,7 +525,7 @@ gve_adminq_describe_device(struct gve_priv *priv)
"Driver is running with GQI QPL queue format.\n");
} else {
device_printf(priv->dev, "No compatible queue formats\n");
- rc = (EINVAL);
+ rc = EINVAL;
goto free_device_descriptor;
}
@@ -394,8 +539,12 @@ gve_adminq_describe_device(struct gve_priv *priv)
priv->default_num_queues = be16toh(desc->default_num_queues);
priv->supported_features = supported_features_mask;
+ /* Default max to current in case modify ring size option is disabled */
+ priv->max_rx_desc_cnt = priv->rx_desc_cnt;
+ priv->max_tx_desc_cnt = priv->tx_desc_cnt;
+
gve_enable_supported_features(priv, supported_features_mask,
- dev_op_jumbo_frames);
+ dev_op_modify_ring, dev_op_jumbo_frames);
for (i = 0; i < ETHER_ADDR_LEN; i++)
priv->mac[i] = desc->mac[i];
@@ -507,6 +656,41 @@ gve_adminq_verify_driver_compatibility(struct gve_priv *priv,
}
int
+gve_adminq_get_ptype_map_dqo(struct gve_priv *priv,
+ struct gve_ptype_lut *ptype_lut_dqo)
+{
+ struct gve_adminq_command aq_cmd = (struct gve_adminq_command){};
+ struct gve_ptype_map *ptype_map;
+ struct gve_dma_handle dma;
+ int err = 0;
+ int i;
+
+ err = gve_dma_alloc_coherent(priv, sizeof(*ptype_map), PAGE_SIZE, &dma);
+ if (err)
+ return (err);
+ ptype_map = dma.cpu_addr;
+
+ aq_cmd.opcode = htobe32(GVE_ADMINQ_GET_PTYPE_MAP);
+ aq_cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) {
+ .ptype_map_len = htobe64(sizeof(*ptype_map)),
+ .ptype_map_addr = htobe64(dma.bus_addr),
+ };
+
+ err = gve_adminq_execute_cmd(priv, &aq_cmd);
+ if (err)
+ goto err;
+
+ /* Populate ptype_lut_dqo. */
+ for (i = 0; i < GVE_NUM_PTYPES; i++) {
+ ptype_lut_dqo->ptypes[i].l3_type = ptype_map->ptypes[i].l3_type;
+ ptype_lut_dqo->ptypes[i].l4_type = ptype_map->ptypes[i].l4_type;
+ }
+err:
+ gve_dma_free_coherent(&dma);
+ return (err);
+}
+
+int
gve_adminq_alloc(struct gve_priv *priv)
{
int rc;
@@ -543,6 +727,7 @@ gve_adminq_alloc(struct gve_priv *priv)
priv->adminq_destroy_rx_queue_cnt = 0;
priv->adminq_dcfg_device_resources_cnt = 0;
priv->adminq_set_driver_parameter_cnt = 0;
+ priv->adminq_get_ptype_map_cnt = 0;
gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR,
priv->adminq_bus_addr / ADMINQ_SIZE);
@@ -772,6 +957,10 @@ gve_adminq_issue_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig)
priv->adminq_verify_driver_compatibility_cnt++;
break;
+ case GVE_ADMINQ_GET_PTYPE_MAP:
+ priv->adminq_get_ptype_map_cnt++;
+ break;
+
default:
device_printf(priv->dev, "Unknown AQ command opcode %d\n", opcode);
}
diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h
index 5923e5f353d1..531a844f7d90 100644
--- a/sys/dev/gve/gve_adminq.h
+++ b/sys/dev/gve/gve_adminq.h
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -137,18 +137,37 @@ _Static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4,
struct gve_device_option_dqo_rda {
__be32 supported_features_mask;
+ __be16 tx_comp_ring_entries;
+ __be16 rx_buff_ring_entries;
};
-_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 4,
+_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8,
+ "gve: bad admin queue struct length");
+
+struct gve_device_option_dqo_qpl {
+ __be32 supported_features_mask;
+ __be16 tx_comp_ring_entries;
+ __be16 rx_buff_ring_entries;
+};
+
+_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8,
+ "gve: bad admin queue struct length");
+
+struct gve_ring_size_bound {
+ __be16 rx;
+ __be16 tx;
+};
+
+_Static_assert(sizeof(struct gve_ring_size_bound) == 4,
"gve: bad admin queue struct length");
struct gve_device_option_modify_ring {
__be32 supported_features_mask;
- __be16 max_rx_ring_size;
- __be16 max_tx_ring_size;
+ struct gve_ring_size_bound max_ring_size;
+ struct gve_ring_size_bound min_ring_size;
};
-_Static_assert(sizeof(struct gve_device_option_modify_ring) == 8,
+_Static_assert(sizeof(struct gve_device_option_modify_ring) == 12,
"gve: bad admin queue struct length");
struct gve_device_option_jumbo_frames {
@@ -166,6 +185,7 @@ enum gve_dev_opt_id {
GVE_DEV_OPT_ID_GQI_QPL = 0x3,
GVE_DEV_OPT_ID_DQO_RDA = 0x4,
GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
+ GVE_DEV_OPT_ID_DQO_QPL = 0x7,
GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
};
@@ -180,6 +200,7 @@ enum gve_dev_opt_req_feat_mask {
GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0,
+ GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
};
@@ -194,9 +215,8 @@ enum gve_sup_feature_mask {
enum gve_driver_capability {
gve_driver_capability_gqi_qpl = 0,
gve_driver_capability_gqi_rda = 1,
- gve_driver_capability_dqo_qpl = 2, /* reserved for future use */
+ gve_driver_capability_dqo_qpl = 2,
gve_driver_capability_dqo_rda = 3,
- gve_driver_capability_alt_miss_compl = 4,
};
#define GVE_CAP1(a) BIT((int) a)
@@ -209,7 +229,10 @@ enum gve_driver_capability {
* Only a few bits (as shown in `gve_driver_compatibility`) are currently
* defined. The rest are reserved for future use.
*/
-#define GVE_DRIVER_CAPABILITY_FLAGS1 (GVE_CAP1(gve_driver_capability_gqi_qpl))
+#define GVE_DRIVER_CAPABILITY_FLAGS1 \
+ (GVE_CAP1(gve_driver_capability_gqi_qpl) | \
+ GVE_CAP1(gve_driver_capability_dqo_qpl) | \
+ GVE_CAP1(gve_driver_capability_dqo_rda))
#define GVE_DRIVER_CAPABILITY_FLAGS2 0x0
#define GVE_DRIVER_CAPABILITY_FLAGS3 0x0
#define GVE_DRIVER_CAPABILITY_FLAGS4 0x0
@@ -282,6 +305,8 @@ struct gve_adminq_create_tx_queue {
_Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48,
"gve: bad admin queue struct length");
+#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF
+
struct gve_adminq_create_rx_queue {
__be32 queue_id;
__be32 index;
@@ -352,6 +377,24 @@ struct stats {
_Static_assert(sizeof(struct stats) == 16,
"gve: bad admin queue struct length");
+/*
+ * These are control path types for PTYPE which are the same as the data path
+ * types.
+ */
+struct gve_ptype_entry {
+ uint8_t l3_type;
+ uint8_t l4_type;
+};
+
+struct gve_ptype_map {
+ struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */
+};
+
+struct gve_adminq_get_ptype_map {
+ __be64 ptype_map_len;
+ __be64 ptype_map_addr;
+};
+
struct gve_adminq_command {
__be32 opcode;
__be32 status;
@@ -368,6 +411,7 @@ struct gve_adminq_command {
struct gve_adminq_set_driver_parameter set_driver_param;
struct gve_adminq_verify_driver_compatibility
verify_driver_compatibility;
+ struct gve_adminq_get_ptype_map get_ptype_map;
uint8_t reserved[56];
};
};
@@ -375,6 +419,24 @@ struct gve_adminq_command {
_Static_assert(sizeof(struct gve_adminq_command) == 64,
"gve: bad admin queue struct length");
+enum gve_l3_type {
+ /* Must be zero so zero initialized LUT is unknown. */
+ GVE_L3_TYPE_UNKNOWN = 0,
+ GVE_L3_TYPE_OTHER,
+ GVE_L3_TYPE_IPV4,
+ GVE_L3_TYPE_IPV6,
+};
+
+enum gve_l4_type {
+ /* Must be zero so zero initialized LUT is unknown. */
+ GVE_L4_TYPE_UNKNOWN = 0,
+ GVE_L4_TYPE_OTHER,
+ GVE_L4_TYPE_TCP,
+ GVE_L4_TYPE_UDP,
+ GVE_L4_TYPE_ICMP,
+ GVE_L4_TYPE_SCTP,
+};
+
int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues);
int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues);
int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues);
@@ -387,8 +449,10 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv);
int gve_adminq_deconfigure_device_resources(struct gve_priv *priv);
void gve_release_adminq(struct gve_priv *priv);
int gve_adminq_register_page_list(struct gve_priv *priv,
- struct gve_queue_page_list *qpl);
+ struct gve_queue_page_list *qpl);
int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id);
int gve_adminq_verify_driver_compatibility(struct gve_priv *priv,
- uint64_t driver_info_len, vm_paddr_t driver_info_addr);
+ uint64_t driver_info_len, vm_paddr_t driver_info_addr);
+int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv,
+ struct gve_ptype_lut *ptype_lut);
#endif /* _GVE_AQ_H_ */
diff --git a/sys/dev/gve/gve_desc.h b/sys/dev/gve/gve_desc.h
index 5f09cc8b77b8..48c4ac27596b 100644
--- a/sys/dev/gve/gve_desc.h
+++ b/sys/dev/gve/gve_desc.h
@@ -130,10 +130,10 @@ union gve_rx_data_slot {
__be64 addr;
};
-/* GVE Recive Packet Descriptor Seq No */
+/* GVE Receive Packet Descriptor Seq No */
#define GVE_SEQNO(x) (be16toh(x) & 0x7)
-/* GVE Recive Packet Descriptor Flags */
+/* GVE Receive Packet Descriptor Flags */
#define GVE_RXFLG(x) htobe16(1 << (3 + (x)))
#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */
#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */
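GVE_SEQNO() extracts the low three bits of a GQI RX descriptor's flags_seq field. The device stamps every descriptor it writes with an incrementing 3-bit sequence number that runs 1 through 7, so a descriptor whose sequence number matches the driver's expected seq_no (kept in struct gve_rx_ring) is newly written; skipping 0 ensures a zeroed descriptor is never mistaken for a new one. A sketch of the usual idiom, hedged because the actual gve_rx.c loop is not shown in this diff and the helper names here are illustrative:

/* Illustrative only: detect a freshly written GQI RX descriptor. */
static inline bool
gve_rx_desc_is_new_sketch(const struct gve_rx_ring *rx, uint32_t idx)
{
	return (GVE_SEQNO(rx->desc_ring[idx].flags_seq) == rx->seq_no);
}

/* The 3-bit sequence counter wraps from 7 back to 1, skipping 0. */
static inline uint8_t
gve_next_seqno_sketch(uint8_t seq)
{
	return ((seq + 1) == 8 ? 1 : (seq + 1));
}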
diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h
new file mode 100644
index 000000000000..542f8ff7d888
--- /dev/null
+++ b/sys/dev/gve/gve_dqo.h
@@ -0,0 +1,337 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2024 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* GVE DQO Descriptor formats */
+
+#ifndef _GVE_DESC_DQO_H_
+#define _GVE_DESC_DQO_H_
+
+#include "gve_plat.h"
+
+#define GVE_ITR_ENABLE_BIT_DQO BIT(0)
+#define GVE_ITR_NO_UPDATE_DQO (3 << 3)
+#define GVE_ITR_INTERVAL_DQO_SHIFT 5
+#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1)
+#define GVE_TX_IRQ_RATELIMIT_US_DQO 50
+#define GVE_RX_IRQ_RATELIMIT_US_DQO 20
+
+#define GVE_TX_MAX_HDR_SIZE_DQO 255
+#define GVE_TX_MIN_TSO_MSS_DQO 88
+
+/*
+ * Ringing the doorbell too often can hurt performance.
+ *
+ * HW requires this value to be at least 8.
+ */
+#define GVE_RX_BUF_THRESH_DQO 32
+
+/*
+ * Start dropping RX fragments if at least this many
+ * buffers cannot be posted to the NIC.
+ */
+#define GVE_RX_DQO_MIN_PENDING_BUFS 128
+
+/*
+ * gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total
+ * number of pages per QPL to 2048.
+ */
+#define GVE_RX_NUM_QPL_PAGES_DQO 2048
+
+/* 2K TX buffers for DQO-QPL */
+#define GVE_TX_BUF_SHIFT_DQO 11
+#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO)
+#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO)
+
+#define GVE_TX_NUM_QPL_PAGES_DQO 512
+
+/* Basic TX descriptor (DTYPE 0x0C) */
+struct gve_tx_pkt_desc_dqo {
+ __le64 buf_addr;
+
+ /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */
+ uint8_t dtype:5;
+
+ /* Denotes the last descriptor of a packet. */
+ uint8_t end_of_packet:1;
+ uint8_t checksum_offload_enable:1;
+
+ /* If set, will generate a descriptor completion for this descriptor. */
+ uint8_t report_event:1;
+ uint8_t reserved0;
+ __le16 reserved1;
+
+ /* The TX completion for this packet will contain this tag. */
+ __le16 compl_tag;
+ uint16_t buf_size:14;
+ uint16_t reserved2:2;
+} __packed;
+_Static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16,
+ "gve: bad dqo desc struct length");
+
+#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc
+
+/*
+ * Maximum number of data descriptors allowed per packet, or per-TSO segment.
+ */
+#define GVE_TX_MAX_DATA_DESCS_DQO 10
+#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1)
+#define GVE_TSO_MAXSIZE_DQO IP_MAXPACKET
+
+_Static_assert(GVE_TX_MAX_BUF_SIZE_DQO * GVE_TX_MAX_DATA_DESCS_DQO >=
+ GVE_TSO_MAXSIZE_DQO,
+ "gve: bad tso parameters");
+
+/*
+ * "report_event" on TX packet descriptors may only be reported on the last
+ * descriptor of a TX packet, and they must be spaced apart with at least this
+ * value.
+ */
+#define GVE_TX_MIN_RE_INTERVAL 32
+
+struct gve_tx_context_cmd_dtype {
+ uint8_t dtype:5;
+ uint8_t tso:1;
+ uint8_t reserved1:2;
+ uint8_t reserved2;
+};
+
+_Static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2,
+ "gve: bad dqo desc struct length");
+
+/*
+ * TX Native TSO Context DTYPE (0x05)
+ *
+ * "flex" fields allow the driver to send additional packet context to HW.
+ */
+struct gve_tx_tso_context_desc_dqo {
+ /* The L4 payload bytes that should be segmented. */
+ uint32_t tso_total_len:24;
+ uint32_t flex10:8;
+
+ /* Max segment size in TSO excluding headers. */
+ uint16_t mss:14;
+ uint16_t reserved:2;
+
+ uint8_t header_len; /* Header length to use for TSO offload */
+ uint8_t flex11;
+ struct gve_tx_context_cmd_dtype cmd_dtype;
+ uint8_t flex0;
+ uint8_t flex5;
+ uint8_t flex6;
+ uint8_t flex7;
+ uint8_t flex8;
+ uint8_t flex9;
+} __packed;
+_Static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16,
+ "gve: bad dqo desc struct length");
+
+#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5
+
+/* General context descriptor for sending metadata. */
+struct gve_tx_general_context_desc_dqo {
+ uint8_t flex4;
+ uint8_t flex5;
+ uint8_t flex6;
+ uint8_t flex7;
+ uint8_t flex8;
+ uint8_t flex9;
+ uint8_t flex10;
+ uint8_t flex11;
+ struct gve_tx_context_cmd_dtype cmd_dtype;
+ uint16_t reserved;
+ uint8_t flex0;
+ uint8_t flex1;
+ uint8_t flex2;
+ uint8_t flex3;
+} __packed;
+_Static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16,
+ "gve: bad dqo desc struct length");
+
+#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4
+
+/*
+ * Logical structure of metadata which is packed into context descriptor flex
+ * fields.
+ */
+struct gve_tx_metadata_dqo {
+ union {
+ struct {
+ uint8_t version;
+
+ /*
+ * A zero value means no l4_hash was associated with the
+ * mbuf.
+ */
+ uint16_t path_hash:15;
+
+ /*
+ * Should be set to 1 if the flow associated with the
+ * mbuf had a rehash from the TCP stack.
+ */
+ uint16_t rehash_event:1;
+ } __packed;
+ uint8_t bytes[12];
+ };
+} __packed;
+_Static_assert(sizeof(struct gve_tx_metadata_dqo) == 12,
+ "gve: bad dqo desc struct length");
+
+#define GVE_TX_METADATA_VERSION_DQO 0
+
+/* Used to access the generation bit within a TX completion descriptor. */
+#define GVE_TX_DESC_DQO_GEN_BYTE_OFFSET 1
+#define GVE_TX_DESC_DQO_GEN_BIT_MASK 0x80
+
+/* TX completion descriptor */
+struct gve_tx_compl_desc_dqo {
+ /*
+ * For types 0-4 this is the TX queue ID associated with this
+ * completion.
+ */
+ uint16_t id:11;
+
+ /* See: GVE_COMPL_TYPE_DQO* */
+ uint16_t type:3;
+ uint16_t reserved0:1;
+
+ /* Flipped by HW to notify the descriptor is populated. */
+ uint16_t generation:1;
+ union {
+ /*
+ * For descriptor completions, this is the last index fetched
+ * by HW + 1.
+ */
+ __le16 tx_head;
+
+ /*
+ * For packet completions, this is the completion tag set on the
+ * TX packet descriptors.
+ */
+ __le16 completion_tag;
+ };
+ __le32 reserved1;
+} __packed;
+_Static_assert(sizeof(struct gve_tx_compl_desc_dqo) == 8,
+ "gve: bad dqo desc struct length");
+
+union gve_tx_desc_dqo {
+ struct gve_tx_pkt_desc_dqo pkt;
+ struct gve_tx_tso_context_desc_dqo tso_ctx;
+ struct gve_tx_general_context_desc_dqo general_ctx;
+};
+
+#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */
+#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */
+
+/* Descriptor to post buffers to HW on buffer queue. */
+struct gve_rx_desc_dqo {
+ __le16 buf_id; /* ID returned in Rx completion descriptor */
+ __le16 reserved0;
+ __le32 reserved1;
+ __le64 buf_addr; /* DMA address of the buffer */
+ __le64 header_buf_addr;
+ __le64 reserved2;
+} __packed;
+_Static_assert(sizeof(struct gve_rx_desc_dqo) == 32,
+ "gve: bad dqo desc struct length");
+
+/* Used to access the generation bit within an RX completion descriptor. */
+#define GVE_RX_DESC_DQO_GEN_BYTE_OFFSET 5
+#define GVE_RX_DESC_DQO_GEN_BIT_MASK 0x40
+
+/* Descriptor for HW to notify SW of new packets received on RX queue. */
+struct gve_rx_compl_desc_dqo {
+ /* Must be 1 */
+ uint8_t rxdid:4;
+ uint8_t reserved0:4;
+
+ /* Packet originated from this system rather than the network. */
+ uint8_t loopback:1;
+ /*
+ * Set when IPv6 packet contains a destination options header or routing
+ * header.
+ */
+ uint8_t ipv6_ex_add:1;
+ /* Invalid packet was received. */
+ uint8_t rx_error:1;
+ uint8_t reserved1:5;
+
+ uint16_t packet_type:10;
+ uint16_t ip_hdr_err:1;
+ uint16_t udp_len_err:1;
+ uint16_t raw_cs_invalid:1;
+ uint16_t reserved2:3;
+
+ uint16_t packet_len:14;
+ /* Flipped by HW to notify the descriptor is populated. */
+ uint16_t generation:1;
+ /* Should be zero. */
+ uint16_t buffer_queue_id:1;
+
+ uint16_t header_len:10;
+ uint16_t rsc:1;
+ uint16_t split_header:1;
+ uint16_t reserved3:4;
+
+ uint8_t descriptor_done:1;
+ uint8_t end_of_packet:1;
+ uint8_t header_buffer_overflow:1;
+ uint8_t l3_l4_processed:1;
+ uint8_t csum_ip_err:1;
+ uint8_t csum_l4_err:1;
+ uint8_t csum_external_ip_err:1;
+ uint8_t csum_external_udp_err:1;
+
+ uint8_t status_error1;
+
+ __le16 reserved5;
+ __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */
+
+ union {
+ /* Packet checksum. */
+ __le16 raw_cs;
+ /* Segment length for RSC packets. */
+ __le16 rsc_seg_len;
+ };
+ __le32 hash;
+ __le32 reserved6;
+ __le64 reserved7;
+} __packed;
+
+_Static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32,
+ "gve: bad dqo desc struct length");
+
+static inline uint8_t
+gve_get_dq_num_frags_in_page(struct gve_priv *priv)
+{
+ return (PAGE_SIZE / priv->rx_buf_size_dqo);
+}
+#endif /* _GVE_DESC_DQO_H_ */
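The packet_type field in the RX completion descriptor above is an index into the 1024-entry ptype LUT that gve_adminq_get_ptype_map_dqo() fills in, and the l3_l4_processed and csum_*_err bits indicate whether the NIC's checksum verdict can be trusted. The following is a hedged sketch of how the DQO RX path could combine the two to set mbuf checksum flags; the helper name is illustrative and the real gve_rx_dqo.c logic differs in detail.

/* Illustrative only: translate a DQO RX completion into mbuf csum flags. */
static void
gve_rx_set_csum_flags_sketch(struct gve_priv *priv, struct mbuf *mbuf,
    struct gve_rx_compl_desc_dqo *desc)
{
	struct gve_ptype ptype =
	    priv->ptype_lut_dqo->ptypes[desc->packet_type];

	if (!desc->l3_l4_processed)
		return;		/* NIC did not parse this packet. */

	if (ptype.l3_type == GVE_L3_TYPE_IPV4 && !desc->csum_ip_err)
		mbuf->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
		if (!desc->csum_l4_err) {
			mbuf->m_pkthdr.csum_flags |=
			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
			mbuf->m_pkthdr.csum_data = 0xffff;
		}
		break;
	default:
		break;
	}
}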
diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c
index cd7849778bce..10197a8e15f8 100644
--- a/sys/dev/gve/gve_main.c
+++ b/sys/dev/gve/gve_main.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -30,11 +30,12 @@
*/
#include "gve.h"
#include "gve_adminq.h"
+#include "gve_dqo.h"
-#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.1\n"
+#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.5\n"
#define GVE_VERSION_MAJOR 1
-#define GVE_VERSION_MINOR 0
-#define GVE_VERSION_SUB 1
+#define GVE_VERSION_MINOR 3
+#define GVE_VERSION_SUB 5
#define GVE_DEFAULT_RX_COPYBREAK 256
@@ -49,6 +50,9 @@ static struct gve_dev {
struct sx gve_global_lock;
+static void gve_start_tx_timeout_service(struct gve_priv *priv);
+static void gve_stop_tx_timeout_service(struct gve_priv *priv);
+
static int
gve_verify_driver_compatibility(struct gve_priv *priv)
{
@@ -98,6 +102,72 @@ gve_verify_driver_compatibility(struct gve_priv *priv)
return (err);
}
+static void
+gve_handle_tx_timeout(struct gve_priv *priv, struct gve_tx_ring *tx,
+ int num_timeout_pkts)
+{
+ int64_t time_since_last_kick;
+
+ counter_u64_add_protected(tx->stats.tx_timeout, 1);
+
+ /* last_kicked is never GVE_TIMESTAMP_INVALID so we can skip checking */
+ time_since_last_kick = gve_seconds_since(&tx->last_kicked);
+
+ /* Try kicking first in case the timeout is due to a missed interrupt */
+ if (time_since_last_kick > GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) {
+ device_printf(priv->dev,
+ "Found %d timed out packet(s) on txq%d, kicking it for completions\n",
+ num_timeout_pkts, tx->com.id);
+ gve_set_timestamp(&tx->last_kicked);
+ taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
+ } else {
+ device_printf(priv->dev,
+ "Found %d timed out packet(s) on txq%d with its last kick %jd sec ago which is less than the cooldown period %d. Resetting device\n",
+ num_timeout_pkts, tx->com.id,
+ (intmax_t)time_since_last_kick,
+ GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC);
+ gve_schedule_reset(priv);
+ }
+}
+
+static void
+gve_tx_timeout_service_callback(void *data)
+{
+ struct gve_priv *priv = (struct gve_priv *)data;
+ struct gve_tx_ring *tx;
+ uint16_t num_timeout_pkts;
+
+ tx = &priv->tx[priv->check_tx_queue_idx];
+
+ num_timeout_pkts = gve_is_gqi(priv) ?
+ gve_check_tx_timeout_gqi(priv, tx) :
+ gve_check_tx_timeout_dqo(priv, tx);
+ if (num_timeout_pkts)
+ gve_handle_tx_timeout(priv, tx, num_timeout_pkts);
+
+ priv->check_tx_queue_idx = (priv->check_tx_queue_idx + 1) %
+ priv->tx_cfg.num_queues;
+ callout_reset_sbt(&priv->tx_timeout_service,
+ SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0,
+ gve_tx_timeout_service_callback, (void *)priv, 0);
+}
+
+static void
+gve_start_tx_timeout_service(struct gve_priv *priv)
+{
+ priv->check_tx_queue_idx = 0;
+ callout_init(&priv->tx_timeout_service, true);
+ callout_reset_sbt(&priv->tx_timeout_service,
+ SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0,
+ gve_tx_timeout_service_callback, (void *)priv, 0);
+}
+
+static void
+gve_stop_tx_timeout_service(struct gve_priv *priv)
+{
+ callout_drain(&priv->tx_timeout_service);
+}
+
static int
gve_up(struct gve_priv *priv)
{
@@ -124,9 +194,11 @@ gve_up(struct gve_priv *priv)
if (if_getcapenable(ifp) & IFCAP_TSO6)
if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
- err = gve_register_qpls(priv);
- if (err != 0)
- goto reset;
+ if (gve_is_qpl(priv)) {
+ err = gve_register_qpls(priv);
+ if (err != 0)
+ goto reset;
+ }
err = gve_create_rx_rings(priv);
if (err != 0)
@@ -146,6 +218,9 @@ gve_up(struct gve_priv *priv)
gve_unmask_all_queue_irqs(priv);
gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP);
priv->interface_up_cnt++;
+
+ gve_start_tx_timeout_service(priv);
+
return (0);
reset:
@@ -161,6 +236,8 @@ gve_down(struct gve_priv *priv)
if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP))
return;
+ gve_stop_tx_timeout_service(priv);
+
if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) {
if_link_state_change(priv->ifp, LINK_STATE_DOWN);
gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP);
@@ -174,10 +251,13 @@ gve_down(struct gve_priv *priv)
if (gve_destroy_tx_rings(priv) != 0)
goto reset;
- if (gve_unregister_qpls(priv) != 0)
- goto reset;
+ if (gve_is_qpl(priv)) {
+ if (gve_unregister_qpls(priv) != 0)
+ goto reset;
+ }
- gve_mask_all_queue_irqs(priv);
+ if (gve_is_gqi(priv))
+ gve_mask_all_queue_irqs(priv);
gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP);
priv->interface_down_cnt++;
return;
@@ -186,10 +266,143 @@ reset:
gve_schedule_reset(priv);
}
+int
+gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt)
+{
+ int err;
+
+ GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock);
+
+ gve_down(priv);
+
+ if (new_queue_cnt < priv->rx_cfg.num_queues) {
+ /*
+ * Freeing a ring still preserves its ntfy_id,
+ * which is needed if we create the ring again.
+ */
+ gve_free_rx_rings(priv, new_queue_cnt, priv->rx_cfg.num_queues);
+ } else {
+ err = gve_alloc_rx_rings(priv, priv->rx_cfg.num_queues, new_queue_cnt);
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to allocate new queues");
+ /* Failed to allocate rings, start back up with old ones */
+ gve_up(priv);
+ return (err);
+
+ }
+ }
+ priv->rx_cfg.num_queues = new_queue_cnt;
+
+ err = gve_up(priv);
+ if (err != 0)
+ gve_schedule_reset(priv);
+
+ return (err);
+}
+
+int
+gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt)
+{
+ int err;
+
+ GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock);
+
+ gve_down(priv);
+
+ if (new_queue_cnt < priv->tx_cfg.num_queues) {
+ /*
+ * Freeing a ring still preserves its ntfy_id,
+ * which is needed if we create the ring again.
+ */
+ gve_free_tx_rings(priv, new_queue_cnt, priv->tx_cfg.num_queues);
+ } else {
+ err = gve_alloc_tx_rings(priv, priv->tx_cfg.num_queues, new_queue_cnt);
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to allocate new queues");
+ /* Failed to allocate rings, start back up with old ones */
+ gve_up(priv);
+ return (err);
+
+ }
+ }
+ priv->tx_cfg.num_queues = new_queue_cnt;
+
+ err = gve_up(priv);
+ if (err != 0)
+ gve_schedule_reset(priv);
+
+ return (err);
+}
+
+int
+gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx)
+{
+ int err;
+ uint16_t prev_desc_cnt;
+
+ GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock);
+
+ gve_down(priv);
+
+ if (is_rx) {
+ gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ prev_desc_cnt = priv->rx_desc_cnt;
+ priv->rx_desc_cnt = new_desc_cnt;
+ err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to allocate rings. Trying to start back up with previous ring size.");
+ priv->rx_desc_cnt = prev_desc_cnt;
+ err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ }
+ } else {
+ gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues);
+ prev_desc_cnt = priv->tx_desc_cnt;
+ priv->tx_desc_cnt = new_desc_cnt;
+ err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to allocate rings. Trying to start back up with previous ring size.");
+ priv->tx_desc_cnt = prev_desc_cnt;
+ err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues);
+ }
+ }
+
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to allocate rings! Cannot start device back up!");
+ return (err);
+ }
+
+ err = gve_up(priv);
+ if (err != 0) {
+ gve_schedule_reset(priv);
+ return (err);
+ }
+
+ return (0);
+}
+
+static int
+gve_get_dqo_rx_buf_size(struct gve_priv *priv, uint16_t mtu)
+{
+ /*
+ * Use 4k buffers only if mode is DQ, 4k buffers flag is on,
+ * and either hw LRO is enabled or mtu is greater than 2048
+ */
+ if (!gve_is_gqi(priv) && gve_allow_4k_rx_buffers &&
+ (!gve_disable_hw_lro || mtu > GVE_DEFAULT_RX_BUFFER_SIZE))
+ return (GVE_4K_RX_BUFFER_SIZE_DQO);
+
+ return (GVE_DEFAULT_RX_BUFFER_SIZE);
+}
+
static int
gve_set_mtu(if_t ifp, uint32_t new_mtu)
{
struct gve_priv *priv = if_getsoftc(ifp);
+ const uint32_t max_problem_range = 8227;
+ const uint32_t min_problem_range = 7822;
+ uint16_t new_rx_buf_size = gve_get_dqo_rx_buf_size(priv, new_mtu);
int err;
if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) {
@@ -198,11 +411,32 @@ gve_set_mtu(if_t ifp, uint32_t new_mtu)
return (EINVAL);
}
+ /*
+ * When hardware LRO is enabled in DQ mode, MTUs within the range
+ * [7822, 8227] trigger hardware issues which cause a drastic drop
+ * in throughput.
+ */
+ if (!gve_is_gqi(priv) && !gve_disable_hw_lro &&
+ new_mtu >= min_problem_range && new_mtu <= max_problem_range &&
+ new_rx_buf_size != GVE_4K_RX_BUFFER_SIZE_DQO) {
+ device_printf(priv->dev,
+ "Cannot set to MTU to %d within the range [%d, %d] while HW LRO is enabled and not using 4k RX Buffers\n",
+ new_mtu, min_problem_range, max_problem_range);
+ return (EINVAL);
+ }
+
err = gve_adminq_set_mtu(priv, new_mtu);
if (err == 0) {
if (bootverbose)
device_printf(priv->dev, "MTU set to %d\n", new_mtu);
if_setmtu(ifp, new_mtu);
+ /* Need to re-alloc RX queues if RX buffer size changed */
+ if (!gve_is_gqi(priv) &&
+ new_rx_buf_size != priv->rx_buf_size_dqo) {
+ gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ priv->rx_buf_size_dqo = new_rx_buf_size;
+ gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ }
} else {
device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu);
}
@@ -352,18 +586,13 @@ gve_get_counter(if_t ifp, ift_counter cnt)
}
}
-static int
+static void
gve_setup_ifnet(device_t dev, struct gve_priv *priv)
{
int caps = 0;
if_t ifp;
ifp = priv->ifp = if_alloc(IFT_ETHER);
- if (ifp == NULL) {
- device_printf(priv->dev, "Failed to allocate ifnet struct\n");
- return (ENXIO);
- }
-
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
if_setsoftc(ifp, priv);
if_setdev(ifp, dev);
@@ -372,6 +601,18 @@ gve_setup_ifnet(device_t dev, struct gve_priv *priv)
if_settransmitfn(ifp, gve_xmit_ifp);
if_setqflushfn(ifp, gve_qflush);
+ /*
+ * Set TSO limits; these must match the arguments to bus_dma_tag_create
+ * when creating tx->dqo.buf_dmatag. They only apply to RDA mode
+ * because in QPL mode the entire packet is copied into the bounce
+ * buffer, so it does not matter how fragmented the mbuf is.
+ */
+ if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) {
+ if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO);
+ if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO);
+ }
+ if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO);
+
#if __FreeBSD_version >= 1400086
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
#else
@@ -401,8 +642,6 @@ gve_setup_ifnet(device_t dev, struct gve_priv *priv)
ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
-
- return (0);
}
static int
@@ -454,9 +693,14 @@ static void
gve_free_rings(struct gve_priv *priv)
{
gve_free_irqs(priv);
- gve_free_tx_rings(priv);
- gve_free_rx_rings(priv);
- gve_free_qpls(priv);
+
+ gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues);
+ free(priv->tx, M_GVE);
+ priv->tx = NULL;
+
+ gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues);
+ free(priv->rx, M_GVE);
+ priv->rx = NULL;
}
static int
@@ -464,15 +708,15 @@ gve_alloc_rings(struct gve_priv *priv)
{
int err;
- err = gve_alloc_qpls(priv);
- if (err != 0)
- goto abort;
-
- err = gve_alloc_rx_rings(priv);
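+ /*
+ * The ring arrays are sized for the maximum queue counts so that the
+ * number of active queues can later be changed without reallocating
+ * them.
+ */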
+ priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.max_queues,
+ M_GVE, M_WAITOK | M_ZERO);
+ err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues);
if (err != 0)
goto abort;
- err = gve_alloc_tx_rings(priv);
+ priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.max_queues,
+ M_GVE, M_WAITOK | M_ZERO);
+ err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues);
if (err != 0)
goto abort;
@@ -488,7 +732,7 @@ abort:
}
static void
-gve_deconfigure_resources(struct gve_priv *priv)
+gve_deconfigure_and_free_device_resources(struct gve_priv *priv)
{
int err;
@@ -506,10 +750,15 @@ gve_deconfigure_resources(struct gve_priv *priv)
gve_free_irq_db_array(priv);
gve_free_counter_array(priv);
+
+ if (priv->ptype_lut_dqo) {
+ free(priv->ptype_lut_dqo, M_GVE);
+ priv->ptype_lut_dqo = NULL;
+ }
}
static int
-gve_configure_resources(struct gve_priv *priv)
+gve_alloc_and_configure_device_resources(struct gve_priv *priv)
{
int err;
@@ -532,13 +781,25 @@ gve_configure_resources(struct gve_priv *priv)
goto abort;
}
+ if (!gve_is_gqi(priv)) {
+ priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE,
+ M_WAITOK | M_ZERO);
+
+ err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n",
+ err);
+ goto abort;
+ }
+ }
+
gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK);
if (bootverbose)
device_printf(priv->dev, "Configured device resources\n");
return (0);
abort:
- gve_deconfigure_resources(priv);
+ gve_deconfigure_and_free_device_resources(priv);
return (err);
}
@@ -557,7 +818,7 @@ gve_set_queue_cnts(struct gve_priv *priv)
priv->rx_cfg.num_queues);
}
- priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues;
+ priv->num_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues;
priv->mgmt_msix_idx = priv->num_queues;
}
@@ -603,7 +864,7 @@ static void
gve_destroy(struct gve_priv *priv)
{
gve_down(priv);
- gve_deconfigure_resources(priv);
+ gve_deconfigure_and_free_device_resources(priv);
gve_release_adminq(priv);
}
@@ -616,9 +877,21 @@ gve_restore(struct gve_priv *priv)
if (err != 0)
goto abort;
- err = gve_configure_resources(priv);
- if (err != 0)
+ err = gve_adminq_configure_device_resources(priv);
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to configure device resources: err=%d\n",
+ err);
+ err = (ENXIO);
goto abort;
+ }
+ if (!gve_is_gqi(priv)) {
+ err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
+ if (err != 0) {
+ device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n",
+ err);
+ goto abort;
+ }
+ }
err = gve_up(priv);
if (err != 0)
@@ -632,6 +905,25 @@ abort:
}
static void
+gve_clear_device_resources(struct gve_priv *priv)
+{
+ int i;
+
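+ /*
+ * Zero the counter array, the irq doorbell array, and the ptype LUT
+ * shared with the device so no stale state survives the reset.
+ */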
+ for (i = 0; i < priv->num_event_counters; i++)
+ priv->counters[i] = 0;
+ bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
+ BUS_DMASYNC_PREWRITE);
+
+ for (i = 0; i < priv->num_queues; i++)
+ priv->irq_db_indices[i] = (struct gve_irq_db){};
+ bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
+ BUS_DMASYNC_PREWRITE);
+
+ if (priv->ptype_lut_dqo)
+ *priv->ptype_lut_dqo = (struct gve_ptype_lut){0};
+}
+
+static void
gve_handle_reset(struct gve_priv *priv)
{
if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET))
@@ -662,6 +954,8 @@ gve_handle_reset(struct gve_priv *priv)
gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
gve_down(priv);
+ gve_clear_device_resources(priv);
+
gve_restore(priv);
GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock);
@@ -749,6 +1043,9 @@ gve_attach(device_t dev)
int rid;
int err;
+ snprintf(gve_version, sizeof(gve_version), "%d.%d.%d",
+ GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB);
+
priv = device_get_softc(dev);
priv->dev = dev;
GVE_IFACE_LOCK_INIT(priv->gve_iface_lock);
@@ -786,17 +1083,16 @@ gve_attach(device_t dev)
if (err != 0)
goto abort;
- err = gve_configure_resources(priv);
+ err = gve_alloc_and_configure_device_resources(priv);
if (err != 0)
goto abort;
+ priv->rx_buf_size_dqo = gve_get_dqo_rx_buf_size(priv, priv->max_mtu);
err = gve_alloc_rings(priv);
if (err != 0)
goto abort;
- err = gve_setup_ifnet(dev, priv);
- if (err != 0)
- goto abort;
+ gve_setup_ifnet(dev, priv);
priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK;
@@ -817,7 +1113,7 @@ gve_attach(device_t dev)
abort:
gve_free_rings(priv);
- gve_deconfigure_resources(priv);
+ gve_deconfigure_and_free_device_resources(priv);
gve_release_adminq(priv);
gve_free_sys_res_mem(priv);
GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock);
@@ -829,6 +1125,11 @@ gve_detach(device_t dev)
{
struct gve_priv *priv = device_get_softc(dev);
if_t ifp = priv->ifp;
+ int error;
+
+ error = bus_generic_detach(dev);
+ if (error != 0)
+ return (error);
ether_ifdetach(ifp);
@@ -845,7 +1146,7 @@ gve_detach(device_t dev)
taskqueue_free(priv->service_tq);
if_free(ifp);
- return (bus_generic_detach(dev));
+ return (0);
}
static device_method_t gve_methods[] = {
diff --git a/sys/dev/gve/gve_plat.h b/sys/dev/gve/gve_plat.h
index ad6bc1c92b36..3185656c5e04 100644
--- a/sys/dev/gve/gve_plat.h
+++ b/sys/dev/gve/gve_plat.h
@@ -85,6 +85,9 @@
typedef uint16_t __be16;
typedef uint32_t __be32;
typedef uint64_t __be64;
+typedef uint16_t __le16;
+typedef uint32_t __le32;
+typedef uint64_t __le64;
#define BIT(nr) (1UL << (nr))
#define FBSD_VERSION_MAJOR (__FreeBSD_version / 100000)
diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c
index 3c6d9af6feee..0e7098dcd4a1 100644
--- a/sys/dev/gve/gve_qpl.c
+++ b/sys/dev/gve/gve_qpl.c
@@ -32,31 +32,13 @@
#include "gve.h"
#include "gve_adminq.h"
+#include "gve_dqo.h"
static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations");
-static uint32_t
-gve_num_tx_qpls(struct gve_priv *priv)
-{
- if (priv->queue_format != GVE_GQI_QPL_FORMAT)
- return (0);
-
- return (priv->tx_cfg.max_queues);
-}
-
-static uint32_t
-gve_num_rx_qpls(struct gve_priv *priv)
-{
- if (priv->queue_format != GVE_GQI_QPL_FORMAT)
- return (0);
-
- return (priv->rx_cfg.max_queues);
-}
-
-static void
-gve_free_qpl(struct gve_priv *priv, uint32_t id)
+void
+gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl)
{
- struct gve_queue_page_list *qpl = &priv->qpls[id];
int i;
for (i = 0; i < qpl->num_dmas; i++) {
@@ -91,12 +73,14 @@ gve_free_qpl(struct gve_priv *priv, uint32_t id)
if (qpl->dmas != NULL)
free(qpl->dmas, M_GVE_QPL);
+
+ free(qpl, M_GVE_QPL);
}
-static int
+struct gve_queue_page_list *
gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva)
{
- struct gve_queue_page_list *qpl = &priv->qpls[id];
+ struct gve_queue_page_list *qpl;
int err;
int i;
@@ -104,9 +88,12 @@ gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva)
device_printf(priv->dev, "Reached max number of registered pages %ju > %ju\n",
(uintmax_t)npages + priv->num_registered_pages,
(uintmax_t)priv->max_registered_pages);
- return (EINVAL);
+ return (NULL);
}
+ qpl = malloc(sizeof(struct gve_queue_page_list), M_GVE_QPL,
+ M_WAITOK | M_ZERO);
+
qpl->id = id;
qpl->num_pages = 0;
qpl->num_dmas = 0;
@@ -162,123 +149,111 @@ gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva)
priv->num_registered_pages++;
}
- return (0);
+ return (qpl);
abort:
- gve_free_qpl(priv, id);
- return (err);
+ gve_free_qpl(priv, qpl);
+ return (NULL);
}
-void
-gve_free_qpls(struct gve_priv *priv)
-{
- int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
- int i;
-
- if (num_qpls == 0)
- return;
-
- if (priv->qpls != NULL) {
- for (i = 0; i < num_qpls; i++)
- gve_free_qpl(priv, i);
- free(priv->qpls, M_GVE_QPL);
- }
-}
-
-int gve_alloc_qpls(struct gve_priv *priv)
+int
+gve_register_qpls(struct gve_priv *priv)
{
- int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
+ struct gve_ring_com *com;
+ struct gve_tx_ring *tx;
+ struct gve_rx_ring *rx;
int err;
int i;
- if (num_qpls == 0)
+ if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK))
return (0);
- priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL,
- M_WAITOK | M_ZERO);
-
- for (i = 0; i < gve_num_tx_qpls(priv); i++) {
- err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
- /*single_kva=*/true);
- if (err != 0)
- goto abort;
- }
-
- for (; i < num_qpls; i++) {
- err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false);
- if (err != 0)
- goto abort;
- }
-
- return (0);
-
-abort:
- gve_free_qpls(priv);
- return (err);
-}
-
-static int
-gve_unregister_n_qpls(struct gve_priv *priv, int n)
-{
- int err;
- int i;
-
- for (i = 0; i < n; i++) {
- err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
+ /* Register TX qpls */
+ for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ tx = &priv->tx[i];
+ com = &tx->com;
+ err = gve_adminq_register_page_list(priv, com->qpl);
if (err != 0) {
device_printf(priv->dev,
- "Failed to unregister qpl %d, err: %d\n",
- priv->qpls[i].id, err);
+ "Failed to register qpl %d, err: %d\n",
+ com->qpl->id, err);
+ /* Caller schedules a reset when this fails */
+ return (err);
}
}
- if (err != 0)
- return (err);
-
- return (0);
-}
-
-int
-gve_register_qpls(struct gve_priv *priv)
-{
- int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
- int err;
- int i;
-
- if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK))
- return (0);
-
- for (i = 0; i < num_qpls; i++) {
- err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
+ /* Register RX qpls */
+ for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ rx = &priv->rx[i];
+ com = &rx->com;
+ err = gve_adminq_register_page_list(priv, com->qpl);
if (err != 0) {
device_printf(priv->dev,
"Failed to register qpl %d, err: %d\n",
- priv->qpls[i].id, err);
- goto abort;
+ com->qpl->id, err);
+ /* Caller schedules a reset when this fails */
+ return (err);
}
}
-
gve_set_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK);
return (0);
-
-abort:
- gve_unregister_n_qpls(priv, i);
- return (err);
}
int
gve_unregister_qpls(struct gve_priv *priv)
{
- int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
int err;
+ int i;
+ struct gve_ring_com *com;
+ struct gve_tx_ring *tx;
+ struct gve_rx_ring *rx;
if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK))
return (0);
- err = gve_unregister_n_qpls(priv, num_qpls);
+ for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ tx = &priv->tx[i];
+ com = &tx->com;
+ err = gve_adminq_unregister_page_list(priv, com->qpl->id);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to unregister qpl %d, err: %d\n",
+ com->qpl->id, err);
+ }
+ }
+
+ for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ rx = &priv->rx[i];
+ com = &rx->com;
+ err = gve_adminq_unregister_page_list(priv, com->qpl->id);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to unregister qpl %d, err: %d\n",
+ com->qpl->id, err);
+ }
+ }
+
if (err != 0)
return (err);
gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK);
return (0);
}
+
+void
+gve_mextadd_free(struct mbuf *mbuf)
+{
+ vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1;
+ vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2;
+
+ /*
+ * Free the page only if this is the last ref.
+ * The interface might no longer exist by the time
+ * this callback is called, see gve_free_qpl.
+ */
+ if (__predict_false(vm_page_unwire_noq(page))) {
+ pmap_qremove(va, 1);
+ kva_free(va, PAGE_SIZE);
+ vm_page_free(page);
+ }
+}
diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c
index 9be96cf1ee3a..de64375ac4f3 100644
--- a/sys/dev/gve/gve_rx.c
+++ b/sys/dev/gve/gve_rx.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -30,16 +30,14 @@
*/
#include "gve.h"
#include "gve_adminq.h"
+#include "gve_dqo.h"
static void
-gve_rx_free_ring(struct gve_priv *priv, int i)
+gve_rx_free_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_rx_ring *rx = &priv->rx[i];
struct gve_ring_com *com = &rx->com;
- /* Safe to call even if never allocated */
- gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
-
if (rx->page_info != NULL) {
free(rx->page_info, M_GVE);
rx->page_info = NULL;
@@ -55,6 +53,26 @@ gve_rx_free_ring(struct gve_priv *priv, int i)
rx->desc_ring = NULL;
}
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
+ }
+}
+
+static void
+gve_rx_free_ring(struct gve_priv *priv, int i)
+{
+ struct gve_rx_ring *rx = &priv->rx[i];
+ struct gve_ring_com *com = &rx->com;
+
+ /* Safe to call even if never allocated */
+ gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
+
+ if (gve_is_gqi(priv))
+ gve_rx_free_ring_gqi(priv, i);
+ else
+ gve_rx_free_ring_dqo(priv, i);
+
if (com->q_resources != NULL) {
gve_dma_free_coherent(&com->q_resources_mem);
com->q_resources = NULL;
@@ -83,55 +101,82 @@ gve_prefill_rx_slots(struct gve_rx_ring *rx)
}
static int
-gve_rx_alloc_ring(struct gve_priv *priv, int i)
+gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_rx_ring *rx = &priv->rx[i];
struct gve_ring_com *com = &rx->com;
int err;
- com->priv = priv;
- com->id = i;
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(struct gve_rx_desc) * priv->rx_desc_cnt,
+ CACHE_LINE_SIZE, &rx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for rx ring %d", i);
+ goto abort;
+ }
rx->mask = priv->rx_pages_per_qpl - 1;
+ rx->desc_ring = rx->desc_ring_mem.cpu_addr;
- com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i];
+ com->qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues,
+ priv->rx_desc_cnt, /*single_kva=*/false);
if (com->qpl == NULL) {
- device_printf(priv->dev, "No QPL left for rx ring %d", i);
- return (ENOMEM);
+ device_printf(priv->dev,
+ "Failed to alloc QPL for rx ring %d", i);
+ err = ENOMEM;
+ goto abort;
}
- rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE,
- M_WAITOK | M_ZERO);
+ rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info),
+ M_GVE, M_WAITOK | M_ZERO);
+
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt,
+ CACHE_LINE_SIZE, &rx->data_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc data ring for rx ring %d", i);
+ goto abort;
+ }
+ rx->data_ring = rx->data_ring_mem.cpu_addr;
+
+ gve_prefill_rx_slots(rx);
+ return (0);
+
+abort:
+ gve_rx_free_ring_gqi(priv, i);
+ return (err);
+}
+
+static int
+gve_rx_alloc_ring(struct gve_priv *priv, int i)
+{
+ struct gve_rx_ring *rx = &priv->rx[i];
+ struct gve_ring_com *com = &rx->com;
+ int err;
+
+ com->priv = priv;
+ com->id = i;
gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
PAGE_SIZE, &com->q_resources_mem);
if (err != 0) {
- device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i);
+ device_printf(priv->dev,
+ "Failed to alloc queue resources for rx ring %d", i);
goto abort;
}
com->q_resources = com->q_resources_mem.cpu_addr;
- err = gve_dma_alloc_coherent(priv,
- sizeof(struct gve_rx_desc) * priv->rx_desc_cnt,
- CACHE_LINE_SIZE, &rx->desc_ring_mem);
- if (err != 0) {
- device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i);
- goto abort;
- }
- rx->desc_ring = rx->desc_ring_mem.cpu_addr;
-
- err = gve_dma_alloc_coherent(priv,
- sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt,
- CACHE_LINE_SIZE, &rx->data_ring_mem);
- if (err != 0) {
- device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i);
+ if (gve_is_gqi(priv))
+ err = gve_rx_alloc_ring_gqi(priv, i);
+ else
+ err = gve_rx_alloc_ring_dqo(priv, i);
+ if (err != 0)
goto abort;
- }
- rx->data_ring = rx->data_ring_mem.cpu_addr;
- gve_prefill_rx_slots(rx);
return (0);
abort:
@@ -140,38 +185,32 @@ abort:
}
int
-gve_alloc_rx_rings(struct gve_priv *priv)
+gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
- int err = 0;
int i;
+ int err;
- priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues,
- M_GVE, M_WAITOK | M_ZERO);
+ KASSERT(priv->rx != NULL, ("priv->rx is NULL!"));
- for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ for (i = start_idx; i < stop_idx; i++) {
err = gve_rx_alloc_ring(priv, i);
if (err != 0)
goto free_rings;
}
return (0);
-
free_rings:
- while (i--)
- gve_rx_free_ring(priv, i);
- free(priv->rx, M_GVE);
+ gve_free_rx_rings(priv, start_idx, i);
return (err);
}
void
-gve_free_rx_rings(struct gve_priv *priv)
+gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
int i;
- for (i = 0; i < priv->rx_cfg.num_queues; i++)
+ for (i = start_idx; i < stop_idx; i++)
gve_rx_free_ring(priv, i);
-
- free(priv->rx, M_GVE);
}
static void
@@ -217,6 +256,11 @@ gve_clear_rx_ring(struct gve_priv *priv, int i)
{
struct gve_rx_ring *rx = &priv->rx[i];
+ if (!gve_is_gqi(priv)) {
+ gve_clear_rx_ring_dqo(priv, i);
+ return;
+ }
+
rx->seq_no = 1;
rx->cnt = 0;
rx->fill_cnt = 0;
@@ -238,14 +282,21 @@ gve_start_rx_ring(struct gve_priv *priv, int i)
rx->lro.ifp = priv->ifp;
}
- NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx);
+ if (gve_is_gqi(priv))
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx);
+ else
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq_dqo, rx);
com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK,
taskqueue_thread_enqueue, &com->cleanup_tq);
taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET,
"%s rxq %d", device_get_nameunit(priv->dev), i);
- gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt);
+ if (gve_is_gqi(priv)) {
+ /* GQ RX bufs are prefilled at ring alloc time */
+ gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt);
+ } else {
+ gve_rx_prefill_buffers_dqo(rx);
+ }
}
int
@@ -362,24 +413,6 @@ gve_set_rss_type(__be16 flag, struct mbuf *mbuf)
}
static void
-gve_mextadd_free(struct mbuf *mbuf)
-{
- vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1;
- vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2;
-
- /*
- * Free the page only if this is the last ref.
- * The interface might no longer exist by the time
- * this callback is called, see gve_free_qpl.
- */
- if (__predict_false(vm_page_unwire_noq(page))) {
- pmap_qremove(va, 1);
- kva_free(va, PAGE_SIZE);
- vm_page_free(page);
- }
-}
-
-static void
gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET);
@@ -676,7 +709,7 @@ gve_rx_cleanup_tq(void *arg, int pending)
* interrupt but they will still be handled by the enqueue below.
* Fragments received after the barrier WILL trigger an interrupt.
*/
- mb();
+ atomic_thread_fence_seq_cst();
if (gve_rx_work_pending(rx)) {
gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK);
diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c
new file mode 100644
index 000000000000..cf914913da09
--- /dev/null
+++ b/sys/dev/gve/gve_rx_dqo.c
@@ -0,0 +1,1035 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2024 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "gve.h"
+#include "gve_adminq.h"
+#include "gve_dqo.h"
+
+static void
+gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx)
+{
+ struct gve_rx_buf_dqo *buf;
+ int i;
+
+ if (gve_is_qpl(rx->com.priv))
+ return;
+
+ for (i = 0; i < rx->dqo.buf_cnt; i++) {
+ buf = &rx->dqo.bufs[i];
+ if (!buf->mbuf)
+ continue;
+
+ bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
+ BUS_DMASYNC_POSTREAD);
+ bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
+ m_freem(buf->mbuf);
+ buf->mbuf = NULL;
+ }
+}
+
+void
+gve_rx_free_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_rx_ring *rx = &priv->rx[i];
+ struct gve_ring_com *com = &rx->com;
+ int j;
+
+ if (rx->dqo.compl_ring != NULL) {
+ gve_dma_free_coherent(&rx->dqo.compl_ring_mem);
+ rx->dqo.compl_ring = NULL;
+ }
+
+ if (rx->dqo.desc_ring != NULL) {
+ gve_dma_free_coherent(&rx->desc_ring_mem);
+ rx->dqo.desc_ring = NULL;
+ }
+
+ if (rx->dqo.bufs != NULL) {
+ gve_free_rx_mbufs_dqo(rx);
+
+ if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) {
+ for (j = 0; j < rx->dqo.buf_cnt; j++)
+ if (rx->dqo.bufs[j].mapped)
+ bus_dmamap_destroy(rx->dqo.buf_dmatag,
+ rx->dqo.bufs[j].dmamap);
+ }
+
+ free(rx->dqo.bufs, M_GVE);
+ rx->dqo.bufs = NULL;
+ }
+
+ if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag)
+ bus_dma_tag_destroy(rx->dqo.buf_dmatag);
+
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
+ }
+}
+
+int
+gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_rx_ring *rx = &priv->rx[i];
+ int err;
+ int j;
+
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt,
+ CACHE_LINE_SIZE, &rx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for rx ring %d", i);
+ goto abort;
+ }
+ rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr;
+ rx->dqo.mask = priv->rx_desc_cnt - 1;
+
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt,
+ CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc compl ring for rx ring %d", i);
+ goto abort;
+ }
+ rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr;
+ rx->dqo.mask = priv->rx_desc_cnt - 1;
+
+ rx->dqo.buf_cnt = gve_is_qpl(priv) ? GVE_RX_NUM_QPL_PAGES_DQO :
+ priv->rx_desc_cnt;
+ rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo),
+ M_GVE, M_WAITOK | M_ZERO);
+
+ if (gve_is_qpl(priv)) {
+ rx->com.qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues,
+ GVE_RX_NUM_QPL_PAGES_DQO, /*single_kva=*/false);
+ if (rx->com.qpl == NULL) {
+ device_printf(priv->dev,
+ "Failed to alloc QPL for rx ring %d", i);
+ err = ENOMEM;
+ goto abort;
+ }
+ return (0);
+ }
+
+ bus_size_t max_seg_size = gve_rx_dqo_mbuf_segment_size(priv);
+
+ err = bus_dma_tag_create(
+ bus_get_dma_tag(priv->dev), /* parent */
+ 1, 0, /* alignment, bounds */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ max_seg_size, /* maxsize */
+ 1, /* nsegments */
+ max_seg_size, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockarg */
+ &rx->dqo.buf_dmatag);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "%s: bus_dma_tag_create failed: %d\n",
+ __func__, err);
+ goto abort;
+ }
+
+ for (j = 0; j < rx->dqo.buf_cnt; j++) {
+ err = bus_dmamap_create(rx->dqo.buf_dmatag, 0,
+ &rx->dqo.bufs[j].dmamap);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "err in creating rx buf dmamap %d: %d",
+ j, err);
+ goto abort;
+ }
+ rx->dqo.bufs[j].mapped = true;
+ }
+
+ return (0);
+
+abort:
+ gve_rx_free_ring_dqo(priv, i);
+ return (err);
+}
+
+static void
+gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx)
+{
+ struct gve_ring_com *com = &rx->com;
+ int entries;
+ int i;
+
+ entries = com->priv->rx_desc_cnt;
+ for (i = 0; i < entries; i++)
+ rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){};
+
+ bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+static void
+gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx)
+{
+ struct gve_ring_com *com = &rx->com;
+ int i;
+
+ for (i = 0; i < com->priv->rx_desc_cnt; i++)
+ rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){};
+
+ bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+void
+gve_clear_rx_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_rx_ring *rx = &priv->rx[i];
+ int j;
+
+ rx->fill_cnt = 0;
+ rx->cnt = 0;
+ rx->dqo.mask = priv->rx_desc_cnt - 1;
+ rx->dqo.head = 0;
+ rx->dqo.tail = 0;
+ rx->dqo.cur_gen_bit = 0;
+
+ gve_rx_clear_desc_ring_dqo(rx);
+ gve_rx_clear_compl_ring_dqo(rx);
+
+ gve_free_rx_mbufs_dqo(rx);
+
+ if (gve_is_qpl(priv)) {
+ SLIST_INIT(&rx->dqo.free_bufs);
+ STAILQ_INIT(&rx->dqo.used_bufs);
+
+ for (j = 0; j < rx->dqo.buf_cnt; j++) {
+ struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j];
+
+ vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs];
+ u_int ref_count = atomic_load_int(&page->ref_count);
+
+ /*
+ * An ifconfig down+up might see pages still in flight
+ * from the previous innings.
+ */
+ if (VPRC_WIRE_COUNT(ref_count) == 1)
+ SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+ buf, slist_entry);
+ else
+ STAILQ_INSERT_TAIL(&rx->dqo.used_bufs,
+ buf, stailq_entry);
+
+ buf->num_nic_frags = 0;
+ buf->next_idx = 0;
+ }
+ } else {
+ SLIST_INIT(&rx->dqo.free_bufs);
+ for (j = 0; j < rx->dqo.buf_cnt; j++)
+ SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+ &rx->dqo.bufs[j], slist_entry);
+ }
+}
+
+int
+gve_rx_intr_dqo(void *arg)
+{
+ struct gve_rx_ring *rx = arg;
+ struct gve_priv *priv = rx->com.priv;
+ struct gve_ring_com *com = &rx->com;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return (FILTER_STRAY);
+
+ /* Interrupts are automatically masked */
+ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
+ return (FILTER_HANDLED);
+}
+
+static void
+gve_rx_advance_head_dqo(struct gve_rx_ring *rx)
+{
+ rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask;
+ rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */
+
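+ /*
+ * Write the doorbell only once every GVE_RX_BUF_THRESH_DQO newly
+ * posted buffers to amortize the cost of the BAR write.
+ */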
+ if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) {
+ bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+ gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset,
+ rx->dqo.head);
+ }
+}
+
+static void
+gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
+{
+ struct gve_rx_desc_dqo *desc;
+
+ bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
+ BUS_DMASYNC_PREREAD);
+
+ desc = &rx->dqo.desc_ring[rx->dqo.head];
+ desc->buf_id = htole16(buf - rx->dqo.bufs);
+ desc->buf_addr = htole64(buf->addr);
+
+ gve_rx_advance_head_dqo(rx);
+}
+
+static int
+gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how)
+{
+ struct gve_rx_buf_dqo *buf;
+ bus_dma_segment_t segs[1];
+ int nsegs;
+ int err;
+
+ buf = SLIST_FIRST(&rx->dqo.free_bufs);
+ if (__predict_false(!buf)) {
+ device_printf(rx->com.priv->dev,
+ "Unexpected empty free bufs list\n");
+ return (ENOBUFS);
+ }
+ SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
+
+ bus_size_t segment_size = gve_rx_dqo_mbuf_segment_size(rx->com.priv);
+ buf->mbuf = m_getjcl(how, MT_DATA, M_PKTHDR, segment_size);
+ if (__predict_false(!buf->mbuf)) {
+ err = ENOMEM;
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1);
+ counter_exit();
+ goto abort_with_buf;
+ }
+ buf->mbuf->m_len = segment_size;
+
+ err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap,
+ buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
+ if (__predict_false(err != 0)) {
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1);
+ counter_exit();
+ goto abort_with_mbuf;
+ }
+ KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1"));
+ buf->addr = segs[0].ds_addr;
+
+ gve_rx_post_buf_dqo(rx, buf);
+ return (0);
+
+abort_with_mbuf:
+ m_freem(buf->mbuf);
+ buf->mbuf = NULL;
+abort_with_buf:
+ SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
+ return (err);
+}
+
+static struct gve_dma_handle *
+gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
+{
+ return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs]));
+}
+
+static void
+gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
+ uint8_t frag_num)
+{
+ struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head];
+ union gve_rx_qpl_buf_id_dqo composed_id;
+ struct gve_dma_handle *page_dma_handle;
+
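+ /*
+ * The 16-bit buffer id given to the NIC packs both the QPL page index
+ * and the fragment index within that page; completions echo it back
+ * so the exact fragment can be located again.
+ */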
+ composed_id.buf_id = buf - rx->dqo.bufs;
+ composed_id.frag_num = frag_num;
+ desc->buf_id = htole16(composed_id.all);
+
+ page_dma_handle = gve_get_page_dma_handle(rx, buf);
+ bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
+ BUS_DMASYNC_PREREAD);
+ desc->buf_addr = htole64(page_dma_handle->bus_addr +
+ frag_num * rx->com.priv->rx_buf_size_dqo);
+
+ buf->num_nic_frags++;
+ gve_rx_advance_head_dqo(rx);
+}
+
+static void
+gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one)
+{
+ struct gve_rx_buf_dqo *hol_blocker = NULL;
+ struct gve_rx_buf_dqo *buf;
+ u_int ref_count;
+ vm_page_t page;
+
+ while (true) {
+ buf = STAILQ_FIRST(&rx->dqo.used_bufs);
+ if (__predict_false(buf == NULL))
+ break;
+
+ page = rx->com.qpl->pages[buf - rx->dqo.bufs];
+ ref_count = atomic_load_int(&page->ref_count);
+
+ if (VPRC_WIRE_COUNT(ref_count) != 1) {
+ /* Account for one head-of-line blocker */
+ if (hol_blocker != NULL)
+ break;
+ hol_blocker = buf;
+ STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
+ stailq_entry);
+ continue;
+ }
+
+ STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
+ stailq_entry);
+ SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+ buf, slist_entry);
+ if (just_one)
+ break;
+ }
+
+ if (hol_blocker != NULL)
+ STAILQ_INSERT_HEAD(&rx->dqo.used_bufs,
+ hol_blocker, stailq_entry);
+}
+
+static int
+gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx)
+{
+ struct gve_rx_buf_dqo *buf;
+
+ buf = SLIST_FIRST(&rx->dqo.free_bufs);
+ if (__predict_false(buf == NULL)) {
+ gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true);
+ buf = SLIST_FIRST(&rx->dqo.free_bufs);
+ if (__predict_false(buf == NULL))
+ return (ENOBUFS);
+ }
+
+ gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx);
+ if (buf->next_idx == gve_get_dq_num_frags_in_page(rx->com.priv) - 1)
+ buf->next_idx = 0;
+ else
+ buf->next_idx++;
+
+ /*
+ * We have posted all the frags in this buf to the NIC.
+ * - buf will enter used_bufs once the last completion arrives.
+ * - It will re-enter free_bufs in gve_rx_maybe_extract_from_used_bufs
+ * when its wire count drops back to 1.
+ */
+ if (buf->next_idx == 0)
+ SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
+ return (0);
+}
+
+static void
+gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
+{
+ uint32_t num_pending_bufs;
+ uint32_t num_to_post;
+ uint32_t i;
+ int err;
+
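+ /*
+ * Fill the ring to at most one less than its size so that head never
+ * catches up with tail, which would make a full ring look empty.
+ */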
+ num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
+ num_to_post = rx->dqo.mask - num_pending_bufs;
+
+ for (i = 0; i < num_to_post; i++) {
+ if (gve_is_qpl(rx->com.priv))
+ err = gve_rx_post_new_dqo_qpl_buf(rx);
+ else
+ err = gve_rx_post_new_mbuf_dqo(rx, how);
+ if (err)
+ break;
+ }
+}
+
+void
+gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx)
+{
+ gve_rx_post_buffers_dqo(rx, M_WAITOK);
+}
+
+static void
+gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp)
+{
+ switch (ptype->l3_type) {
+ case GVE_L3_TYPE_IPV4:
+ switch (ptype->l4_type) {
+ case GVE_L4_TYPE_TCP:
+ *is_tcp = true;
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
+ break;
+ case GVE_L4_TYPE_UDP:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
+ break;
+ default:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
+ }
+ break;
+ case GVE_L3_TYPE_IPV6:
+ switch (ptype->l4_type) {
+ case GVE_L4_TYPE_TCP:
+ *is_tcp = true;
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
+ break;
+ case GVE_L4_TYPE_UDP:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
+ break;
+ default:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
+ }
+ break;
+ default:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
+ }
+}
+
+static void
+gve_rx_set_csum_flags_dqo(struct mbuf *mbuf,
+ struct gve_rx_compl_desc_dqo *desc,
+ struct gve_ptype *ptype)
+{
+ /* HW did not identify and process L3 and L4 headers. */
+ if (__predict_false(!desc->l3_l4_processed))
+ return;
+
+ if (ptype->l3_type == GVE_L3_TYPE_IPV4) {
+ if (__predict_false(desc->csum_ip_err ||
+ desc->csum_external_ip_err))
+ return;
+ } else if (ptype->l3_type == GVE_L3_TYPE_IPV6) {
+ /* Checksum should be skipped if this flag is set. */
+ if (__predict_false(desc->ipv6_ex_add))
+ return;
+ }
+
+ if (__predict_false(desc->csum_l4_err))
+ return;
+
+ switch (ptype->l4_type) {
+ case GVE_L4_TYPE_TCP:
+ case GVE_L4_TYPE_UDP:
+ case GVE_L4_TYPE_ICMP:
+ case GVE_L4_TYPE_SCTP:
+ mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
+ CSUM_IP_VALID |
+ CSUM_DATA_VALID |
+ CSUM_PSEUDO_HDR;
+ mbuf->m_pkthdr.csum_data = 0xffff;
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx,
+ struct gve_rx_compl_desc_dqo *compl_desc)
+{
+ struct mbuf *mbuf = rx->ctx.mbuf_head;
+ if_t ifp = rx->com.priv->ifp;
+ struct gve_ptype *ptype;
+ bool do_if_input = true;
+ bool is_tcp = false;
+
+ ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type];
+ gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp);
+ mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash);
+ gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype);
+
+ mbuf->m_pkthdr.rcvif = ifp;
+ mbuf->m_pkthdr.len = rx->ctx.total_size;
+
+ if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) &&
+ is_tcp &&
+ (rx->lro.lro_cnt != 0) &&
+ (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
+ do_if_input = false;
+
+ if (do_if_input)
+ if_input(ifp, mbuf);
+
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size);
+ counter_u64_add_protected(rx->stats.rpackets, 1);
+ counter_exit();
+
+ rx->ctx = (struct gve_rx_ctx){};
+}
+
+static int
+gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va,
+ struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len)
+{
+ struct mbuf *mbuf;
+
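+ /*
+ * Copybreak: copy small frames into a fresh mbuf so the (much larger)
+ * receive buffer can be reposted to the NIC right away.
+ */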
+ mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (__predict_false(mbuf == NULL))
+ return (ENOMEM);
+
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
+ counter_exit();
+
+ m_copyback(mbuf, 0, frag_len, va);
+ mbuf->m_len = frag_len;
+
+ rx->ctx.mbuf_head = mbuf;
+ rx->ctx.mbuf_tail = mbuf;
+ rx->ctx.total_size += frag_len;
+
+ gve_rx_input_mbuf_dqo(rx, compl_desc);
+ return (0);
+}
+
+static void
+gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
+ struct gve_rx_compl_desc_dqo *compl_desc,
+ int *work_done)
+{
+ bool is_last_frag = compl_desc->end_of_packet != 0;
+ struct gve_rx_ctx *ctx = &rx->ctx;
+ struct gve_rx_buf_dqo *buf;
+ uint32_t num_pending_bufs;
+ uint16_t frag_len;
+ uint16_t buf_id;
+ int err;
+
+ buf_id = le16toh(compl_desc->buf_id);
+ if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
+ device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
+ buf_id, rx->com.id);
+ gve_schedule_reset(priv);
+ goto drop_frag_clear_ctx;
+ }
+ buf = &rx->dqo.bufs[buf_id];
+ if (__predict_false(buf->mbuf == NULL)) {
+ device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n",
+ buf_id, rx->com.id);
+ gve_schedule_reset(priv);
+ goto drop_frag_clear_ctx;
+ }
+
+ if (__predict_false(ctx->drop_pkt))
+ goto drop_frag;
+
+ if (__predict_false(compl_desc->rx_error)) {
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
+ counter_exit();
+ goto drop_frag;
+ }
+
+ bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
+ BUS_DMASYNC_POSTREAD);
+
+ frag_len = compl_desc->packet_len;
+ if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
+ err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*),
+ compl_desc, frag_len);
+ if (__predict_false(err != 0))
+ goto drop_frag;
+ (*work_done)++;
+ gve_rx_post_buf_dqo(rx, buf);
+ return;
+ }
+
+ /*
+ * Although buffer completions may arrive out of order, buffer
+ * descriptors are consumed by the NIC in order. That is, the
+ * buffer at desc_ring[tail] might not be the buffer we got the
+ * completion compl_ring[tail] for: but we know that desc_ring[tail]
+ * has already been read by the NIC.
+ */
+ num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
+
+ /*
+ * For every fragment received, try to post a new buffer.
+ *
+ * Failures are okay but only so long as the number of outstanding
+ * buffers is above a threshold.
+ *
+ * Beyond that we drop new packets to reuse their buffers.
+ * Without ensuring a minimum number of buffers for the NIC to
+ * put packets in, we run the risk of getting the queue stuck
+ * for good.
+ */
+ err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT);
+ if (__predict_false(err != 0 &&
+ num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
+ counter_enter();
+ counter_u64_add_protected(
+ rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
+ counter_exit();
+ goto drop_frag;
+ }
+
+ buf->mbuf->m_len = frag_len;
+ ctx->total_size += frag_len;
+ if (ctx->mbuf_tail == NULL) {
+ ctx->mbuf_head = buf->mbuf;
+ ctx->mbuf_tail = buf->mbuf;
+ } else {
+ buf->mbuf->m_flags &= ~M_PKTHDR;
+ ctx->mbuf_tail->m_next = buf->mbuf;
+ ctx->mbuf_tail = buf->mbuf;
+ }
+
+ /*
+ * Disassociate the mbuf from buf and surrender buf to the free list to
+ * be used by a future mbuf.
+ */
+ bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
+ buf->mbuf = NULL;
+ buf->addr = 0;
+ SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
+
+ if (is_last_frag) {
+ gve_rx_input_mbuf_dqo(rx, compl_desc);
+ (*work_done)++;
+ }
+ return;
+
+drop_frag:
+ /* Clear the earlier frags if there were any */
+ m_freem(ctx->mbuf_head);
+ rx->ctx = (struct gve_rx_ctx){};
+ /* Drop the rest of the pkt if there are more frags */
+ ctx->drop_pkt = true;
+ /* Reuse the dropped frag's buffer */
+ gve_rx_post_buf_dqo(rx, buf);
+
+ if (is_last_frag)
+ goto drop_frag_clear_ctx;
+ return;
+
+drop_frag_clear_ctx:
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
+ counter_exit();
+ m_freem(ctx->mbuf_head);
+ rx->ctx = (struct gve_rx_ctx){};
+}
+
+static void *
+gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx,
+ struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num)
+{
+ int page_idx = buf - rx->dqo.bufs;
+ void *va = rx->com.qpl->dmas[page_idx].cpu_addr;
+
+ va = (char *)va + (buf_frag_num * rx->com.priv->rx_buf_size_dqo);
+ return (va);
+}
+
+static int
+gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx,
+ struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+ uint8_t buf_frag_num, uint16_t frag_len)
+{
+ void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
+ struct mbuf *mbuf;
+ bus_size_t segment_size = gve_rx_dqo_mbuf_segment_size(rx->com.priv);
+
+ if (ctx->mbuf_tail == NULL) {
+ mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, segment_size);
+ if (mbuf == NULL)
+ return (ENOMEM);
+ ctx->mbuf_head = mbuf;
+ ctx->mbuf_tail = mbuf;
+ } else {
+ mbuf = m_getjcl(M_NOWAIT, MT_DATA, 0, segment_size);
+ if (mbuf == NULL)
+ return (ENOMEM);
+ ctx->mbuf_tail->m_next = mbuf;
+ ctx->mbuf_tail = mbuf;
+ }
+
+ mbuf->m_len = frag_len;
+ ctx->total_size += frag_len;
+
+ m_copyback(mbuf, 0, frag_len, va);
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
+ counter_exit();
+ return (0);
+}
+
+static int
+gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx,
+ struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+ uint8_t buf_frag_num, uint16_t frag_len)
+{
+ struct mbuf *mbuf;
+ void *page_addr;
+ vm_page_t page;
+ int page_idx;
+ void *va;
+
+ if (ctx->mbuf_tail == NULL) {
+ mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+ if (mbuf == NULL)
+ return (ENOMEM);
+ ctx->mbuf_head = mbuf;
+ ctx->mbuf_tail = mbuf;
+ } else {
+ mbuf = m_get(M_NOWAIT, MT_DATA);
+ if (mbuf == NULL)
+ return (ENOMEM);
+ ctx->mbuf_tail->m_next = mbuf;
+ ctx->mbuf_tail = mbuf;
+ }
+
+ mbuf->m_len = frag_len;
+ ctx->total_size += frag_len;
+
+ page_idx = buf - rx->dqo.bufs;
+ page = rx->com.qpl->pages[page_idx];
+ page_addr = rx->com.qpl->dmas[page_idx].cpu_addr;
+ va = (char *)page_addr + (buf_frag_num * rx->com.priv->rx_buf_size_dqo);
+
+ /*
+ * Grab an extra ref to the page so that gve_mextadd_free
+ * does not end up freeing the page while the interface exists.
+ */
+ vm_page_wire(page);
+
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1);
+ counter_exit();
+
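+ /*
+ * Attach the QPL page fragment to the mbuf as external storage;
+ * gve_mextadd_free drops the wiring when the last reference goes.
+ */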
+ MEXTADD(mbuf, va, frag_len,
+ gve_mextadd_free, page, page_addr,
+ 0, EXT_NET_DRV);
+ return (0);
+}
+
+static void
+gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx,
+ struct gve_rx_compl_desc_dqo *compl_desc,
+ int *work_done)
+{
+ bool is_last_frag = compl_desc->end_of_packet != 0;
+ union gve_rx_qpl_buf_id_dqo composed_id;
+ struct gve_dma_handle *page_dma_handle;
+ struct gve_rx_ctx *ctx = &rx->ctx;
+ struct gve_rx_buf_dqo *buf;
+ uint32_t num_pending_bufs;
+ uint8_t buf_frag_num;
+ uint16_t frag_len;
+ uint16_t buf_id;
+ int err;
+
+ composed_id.all = le16toh(compl_desc->buf_id);
+ buf_id = composed_id.buf_id;
+ buf_frag_num = composed_id.frag_num;
+
+ if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
+ device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
+ buf_id, rx->com.id);
+ gve_schedule_reset(priv);
+ goto drop_frag_clear_ctx;
+ }
+ buf = &rx->dqo.bufs[buf_id];
+ if (__predict_false(buf->num_nic_frags == 0 ||
+ buf_frag_num > gve_get_dq_num_frags_in_page(priv) - 1)) {
+ device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d "
+ "with buf_frag_num %d and num_nic_frags %d, issuing reset\n",
+ buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags);
+ gve_schedule_reset(priv);
+ goto drop_frag_clear_ctx;
+ }
+
+ buf->num_nic_frags--;
+
+ if (__predict_false(ctx->drop_pkt))
+ goto drop_frag;
+
+ if (__predict_false(compl_desc->rx_error)) {
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
+ counter_exit();
+ goto drop_frag;
+ }
+
+ page_dma_handle = gve_get_page_dma_handle(rx, buf);
+ bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
+ BUS_DMASYNC_POSTREAD);
+
+ frag_len = compl_desc->packet_len;
+ if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
+ void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
+
+ err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len);
+ if (__predict_false(err != 0))
+ goto drop_frag;
+ (*work_done)++;
+ gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
+ return;
+ }
+
+ num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
+ err = gve_rx_post_new_dqo_qpl_buf(rx);
+ if (__predict_false(err != 0 &&
+ num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
+ /*
+ * Resort to copying this fragment into a cluster mbuf
+ * when the above threshold is breached and repost the
+ * incoming buffer. If we cannot find cluster mbufs,
+ * just drop the packet (to repost its buffer).
+ */
+ err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf,
+ buf_frag_num, frag_len);
+ if (err != 0) {
+ counter_enter();
+ counter_u64_add_protected(
+ rx->stats.rx_dropped_pkt_buf_post_fail, 1);
+ counter_exit();
+ goto drop_frag;
+ }
+ gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
+ } else {
+ err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf,
+ buf_frag_num, frag_len);
+ if (__predict_false(err != 0)) {
+ counter_enter();
+ counter_u64_add_protected(
+ rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
+ counter_exit();
+ goto drop_frag;
+ }
+ }
+
+ /*
+ * Both the counts need to be checked.
+ *
+ * num_nic_frags == 0 implies no pending completions
+ * but not all frags may have yet been posted.
+ *
+ * next_idx == 0 implies all frags have been posted
+ * but there might be pending completions.
+ */
+ if (buf->num_nic_frags == 0 && buf->next_idx == 0)
+ STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry);
+
+ if (is_last_frag) {
+ gve_rx_input_mbuf_dqo(rx, compl_desc);
+ (*work_done)++;
+ }
+ return;
+
+drop_frag:
+ /* Clear the earlier frags if there were any */
+ m_freem(ctx->mbuf_head);
+ rx->ctx = (struct gve_rx_ctx){};
+ /* Drop the rest of the pkt if there are more frags */
+ ctx->drop_pkt = true;
+ /* Reuse the dropped frag's buffer */
+ gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
+
+ if (is_last_frag)
+ goto drop_frag_clear_ctx;
+ return;
+
+drop_frag_clear_ctx:
+ counter_enter();
+ counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
+ counter_exit();
+ m_freem(ctx->mbuf_head);
+ rx->ctx = (struct gve_rx_ctx){};
+}
+
+static uint8_t
+gve_rx_get_gen_bit(uint8_t *desc)
+{
+ uint8_t byte;
+
+ /*
+ * Prevent generation bit from being read after the rest of the
+ * descriptor.
+ */
+ byte = atomic_load_acq_8(desc + GVE_RX_DESC_DQO_GEN_BYTE_OFFSET);
+ return ((byte & GVE_RX_DESC_DQO_GEN_BIT_MASK) != 0);
+}
+
+static bool
+gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
+{
+ struct gve_rx_compl_desc_dqo *compl_desc;
+ uint32_t work_done = 0;
+
+ NET_EPOCH_ASSERT();
+
+ while (work_done < budget) {
+ bus_dmamap_sync(rx->dqo.compl_ring_mem.tag,
+ rx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_POSTREAD);
+
+ compl_desc = &rx->dqo.compl_ring[rx->dqo.tail];
+ if (gve_rx_get_gen_bit((uint8_t *)compl_desc) ==
+ rx->dqo.cur_gen_bit)
+ break;
+
+ rx->cnt++;
+ rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask;
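+ /*
+ * The device alternates the generation bit it writes on each pass
+ * over the completion ring, so flip the expected value whenever the
+ * tail wraps back to zero.
+ */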
+ rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0);
+
+ if (gve_is_qpl(priv))
+ gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done);
+ else
+ gve_rx_dqo(priv, rx, compl_desc, &work_done);
+ }
+
+ if (work_done != 0)
+ tcp_lro_flush_all(&rx->lro);
+
+ gve_rx_post_buffers_dqo(rx, M_NOWAIT);
+ if (gve_is_qpl(priv))
+ gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false);
+ return (work_done == budget);
+}
+
+void
+gve_rx_cleanup_tq_dqo(void *arg, int pending)
+{
+ struct gve_rx_ring *rx = arg;
+ struct gve_priv *priv = rx->com.priv;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return;
+
+ if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) {
+ taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
+ return;
+ }
+
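+ /* Budget was not exhausted: re-arm the interrupt for this queue. */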
+ gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset,
+ GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
+}
diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c
index 924654f62adc..a3874cc921ee 100644
--- a/sys/dev/gve/gve_sysctl.c
+++ b/sys/dev/gve/gve_sysctl.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -30,6 +30,25 @@
*/
#include "gve.h"
+static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "GVE driver parameters");
+
+bool gve_disable_hw_lro = false;
+SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN,
+ &gve_disable_hw_lro, 0, "Controls if hardware LRO is used");
+
+bool gve_allow_4k_rx_buffers = false;
+SYSCTL_BOOL(_hw_gve, OID_AUTO, allow_4k_rx_buffers, CTLFLAG_RDTUN,
+ &gve_allow_4k_rx_buffers, 0, "Controls if 4K RX Buffers are allowed");
+
+char gve_queue_format[8];
+SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD,
+ &gve_queue_format, 0, "Queue format being used by the iface");
+
+char gve_version[8];
+SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD,
+ &gve_version, 0, "Driver version");
+
static void
gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
struct sysctl_oid_list *child, struct gve_rx_ring *rxq)
@@ -69,9 +88,21 @@ gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
&stats->rx_dropped_pkt_desc_err,
"Packets dropped due to descriptor error");
SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+ "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD,
+ &stats->rx_dropped_pkt_buf_post_fail,
+ "Packets dropped due to failure to post enough buffers");
+ SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
"rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD,
&stats->rx_dropped_pkt_mbuf_alloc_fail,
"Packets dropped due to failed mbuf allocation");
+ SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+ "rx_mbuf_dmamap_err", CTLFLAG_RD,
+ &stats->rx_mbuf_dmamap_err,
+ "Number of rx mbufs which could not be dma mapped");
+ SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+ "rx_mbuf_mclget_null", CTLFLAG_RD,
+ &stats->rx_mbuf_mclget_null,
+ "Number of times when there were no cluster mbufs");
SYSCTL_ADD_U32(ctx, list, OID_AUTO,
"rx_completed_desc", CTLFLAG_RD,
&rxq->cnt, 0, "Number of descriptors completed");
@@ -113,9 +144,9 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
"tx_bytes", CTLFLAG_RD,
&stats->tbytes, "Bytes transmitted");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
- "tx_dropped_pkt_nospace_device", CTLFLAG_RD,
- &stats->tx_dropped_pkt_nospace_device,
- "Packets dropped due to no space in device");
+ "tx_delayed_pkt_nospace_device", CTLFLAG_RD,
+ &stats->tx_delayed_pkt_nospace_device,
+ "Packets delayed due to no space in device");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"tx_dropped_pkt_nospace_bufring", CTLFLAG_RD,
&stats->tx_dropped_pkt_nospace_bufring,
@@ -124,6 +155,46 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
"tx_dropped_pkt_vlan", CTLFLAG_RD,
&stats->tx_dropped_pkt_vlan,
"Dropped VLAN packets");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_delayed_pkt_nospace_descring", CTLFLAG_RD,
+ &stats->tx_delayed_pkt_nospace_descring,
+ "Packets delayed due to no space in desc ring");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_delayed_pkt_nospace_compring", CTLFLAG_RD,
+ &stats->tx_delayed_pkt_nospace_compring,
+ "Packets delayed due to no space in comp ring");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD,
+ &stats->tx_delayed_pkt_nospace_qpl_bufs,
+ "Packets delayed due to not enough qpl bufs");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_delayed_pkt_tsoerr", CTLFLAG_RD,
+ &stats->tx_delayed_pkt_tsoerr,
+ "TSO packets delayed due to err in prep errors");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_mbuf_collapse", CTLFLAG_RD,
+ &stats->tx_mbuf_collapse,
+ "tx mbufs that had to be collapsed");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_mbuf_defrag", CTLFLAG_RD,
+ &stats->tx_mbuf_defrag,
+ "tx mbufs that had to be defragged");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_mbuf_defrag_err", CTLFLAG_RD,
+ &stats->tx_mbuf_defrag_err,
+ "tx mbufs that failed defrag");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD,
+ &stats->tx_mbuf_dmamap_enomem_err,
+ "tx mbufs that could not be dma-mapped due to low mem");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_mbuf_dmamap_err", CTLFLAG_RD,
+ &stats->tx_mbuf_dmamap_err,
+ "tx mbufs that could not be dma-mapped");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "tx_timeout", CTLFLAG_RD,
+ &stats->tx_timeout,
+ "detections of timed out packets on tx queues");
}
static void
@@ -185,6 +256,9 @@ gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx,
SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt",
CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0,
"adminq_destroy_rx_queue_cnt");
+ SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt",
+ CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0,
+ "adminq_get_ptype_map_cnt");
SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
"adminq_dcfg_device_resources_cnt", CTLFLAG_RD,
&priv->adminq_dcfg_device_resources_cnt, 0,
@@ -219,6 +293,175 @@ gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx,
&priv->reset_cnt, 0, "Times reset");
}
+static int
+gve_check_num_queues(struct gve_priv *priv, int val, bool is_rx)
+{
+ if (val < 1) {
+ device_printf(priv->dev,
+ "Requested num queues (%u) must be a positive integer\n", val);
+ return (EINVAL);
+ }
+
+ if (val > (is_rx ? priv->rx_cfg.max_queues : priv->tx_cfg.max_queues)) {
+ device_printf(priv->dev,
+ "Requested num queues (%u) is too large\n", val);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int
+gve_sysctl_num_tx_queues(SYSCTL_HANDLER_ARGS)
+{
+ struct gve_priv *priv = arg1;
+ int val;
+ int err;
+
+ val = priv->tx_cfg.num_queues;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ err = gve_check_num_queues(priv, val, /*is_rx=*/false);
+ if (err != 0)
+ return (err);
+
+ if (val != priv->tx_cfg.num_queues) {
+ GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock);
+ err = gve_adjust_tx_queues(priv, val);
+ GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock);
+ }
+
+ return (err);
+}
+
+static int
+gve_sysctl_num_rx_queues(SYSCTL_HANDLER_ARGS)
+{
+ struct gve_priv *priv = arg1;
+ int val;
+ int err;
+
+ val = priv->rx_cfg.num_queues;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ err = gve_check_num_queues(priv, val, /*is_rx=*/true);
+
+ if (err != 0)
+ return (err);
+
+ if (val != priv->rx_cfg.num_queues) {
+ GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock);
+ err = gve_adjust_rx_queues(priv, val);
+ GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock);
+ }
+
+ return (err);
+}
+
+static int
+gve_check_ring_size(struct gve_priv *priv, int val, bool is_rx)
+{
+ if (!powerof2(val) || val == 0) {
+ device_printf(priv->dev,
+ "Requested ring size (%u) must be a power of 2\n", val);
+ return (EINVAL);
+ }
+
+ if (val < (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt)) {
+ device_printf(priv->dev,
+ "Requested ring size (%u) cannot be less than %d\n", val,
+ (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt));
+ return (EINVAL);
+ }
+
+ if (val > (is_rx ? priv->max_rx_desc_cnt : priv->max_tx_desc_cnt)) {
+ device_printf(priv->dev,
+ "Requested ring size (%u) cannot be greater than %d\n", val,
+ (is_rx ? priv->max_rx_desc_cnt : priv->max_tx_desc_cnt));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int
+gve_sysctl_tx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+ struct gve_priv *priv = arg1;
+ int val;
+ int err;
+
+ val = priv->tx_desc_cnt;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ err = gve_check_ring_size(priv, val, /*is_rx=*/false);
+ if (err != 0)
+ return (err);
+
+ if (val != priv->tx_desc_cnt) {
+ GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock);
+ err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/false);
+ GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock);
+ }
+
+ return (err);
+}
+
+static int
+gve_sysctl_rx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+ struct gve_priv *priv = arg1;
+ int val;
+ int err;
+
+ val = priv->rx_desc_cnt;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ err = gve_check_ring_size(priv, val, /*is_rx=*/true);
+ if (err != 0)
+ return (err);
+
+ if (val != priv->rx_desc_cnt) {
+ GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock);
+ err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/true);
+ GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock);
+ }
+
+ return (err);
+}
+
+static void
+gve_setup_sysctl_writables(struct sysctl_ctx_list *ctx,
+ struct sysctl_oid_list *child, struct gve_priv *priv)
+{
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_tx_queues",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+ gve_sysctl_num_tx_queues, "I", "Number of TX queues");
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_rx_queues",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+ gve_sysctl_num_rx_queues, "I", "Number of RX queues");
+
+ if (priv->modify_ringsize_enabled) {
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_ring_size",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+ gve_sysctl_tx_ring_size, "I", "TX ring size");
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ring_size",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0,
+ gve_sysctl_rx_ring_size, "I", "RX ring size");
+ }
+}
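The nodes registered above land in the device's sysctl tree, so they can be inspected and changed at runtime without reloading the driver. A minimal userland sketch using sysctlbyname(3); the dev.gve.0 node path and unit number are illustrative assumptions about the attached device:

	/*
	 * Illustrative only: read the current RX queue count, then ask the
	 * driver to resize to 4 queues. The write is handled by
	 * gve_sysctl_num_rx_queues(), which validates the value and calls
	 * gve_adjust_rx_queues() under the interface lock.
	 */
	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int want = 4;
		unsigned int cur;
		size_t len = sizeof(cur);

		if (sysctlbyname("dev.gve.0.num_rx_queues", &cur, &len,
		    NULL, 0) == 0)
			printf("current rx queues: %u\n", cur);

		if (sysctlbyname("dev.gve.0.num_rx_queues", NULL, NULL,
		    &want, sizeof(want)) != 0)
			perror("set num_rx_queues");
		return (0);
	}

The same pattern applies to num_tx_queues and, when modify_ringsize_enabled is set, to tx_ring_size and rx_ring_size; values rejected by gve_check_num_queues() or gve_check_ring_size() return EINVAL to the caller.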
+
void gve_setup_sysctl(struct gve_priv *priv)
{
device_t dev;
@@ -234,6 +477,7 @@ void gve_setup_sysctl(struct gve_priv *priv)
gve_setup_queue_stat_sysctl(ctx, child, priv);
gve_setup_adminq_stat_sysctl(ctx, child, priv);
gve_setup_main_stat_sysctl(ctx, child, priv);
+ gve_setup_sysctl_writables(ctx, child, priv);
}
void
diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c
index 1e62e1226be1..84e3a4c4eb9f 100644
--- a/sys/dev/gve/gve_tx.c
+++ b/sys/dev/gve/gve_tx.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
*/
#include "gve.h"
#include "gve_adminq.h"
+#include "gve_dqo.h"
#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182
@@ -48,61 +49,112 @@ gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
}
static void
-gve_tx_free_ring(struct gve_priv *priv, int i)
+gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- /* Safe to call even if never alloced */
- gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
-
- if (tx->br != NULL) {
- buf_ring_free(tx->br, M_DEVBUF);
- tx->br = NULL;
+ if (tx->desc_ring != NULL) {
+ gve_dma_free_coherent(&tx->desc_ring_mem);
+ tx->desc_ring = NULL;
}
- if (mtx_initialized(&tx->ring_mtx))
- mtx_destroy(&tx->ring_mtx);
-
if (tx->info != NULL) {
free(tx->info, M_GVE);
tx->info = NULL;
}
- if (tx->desc_ring != NULL) {
- gve_dma_free_coherent(&tx->desc_ring_mem);
- tx->desc_ring = NULL;
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
}
+}
+
+static void
+gve_tx_free_ring(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+
+ /* Safe to call even if never alloced */
+ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
+
+ if (mtx_initialized(&tx->ring_mtx))
+ mtx_destroy(&tx->ring_mtx);
if (com->q_resources != NULL) {
gve_dma_free_coherent(&com->q_resources_mem);
com->q_resources = NULL;
}
+
+ if (tx->br != NULL) {
+ buf_ring_free(tx->br, M_DEVBUF);
+ tx->br = NULL;
+ }
+
+ if (gve_is_gqi(priv))
+ gve_tx_free_ring_gqi(priv, i);
+ else
+ gve_tx_free_ring_dqo(priv, i);
}
static int
-gve_tx_alloc_ring(struct gve_priv *priv, int i)
+gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- char mtx_name[16];
int err;
- com->priv = priv;
- com->id = i;
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->desc_ring = tx->desc_ring_mem.cpu_addr;
- com->qpl = &priv->qpls[i];
+ com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
+ /*single_kva=*/true);
if (com->qpl == NULL) {
- device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
- return (ENOMEM);
+ device_printf(priv->dev,
+ "Failed to alloc QPL for tx ring %d\n", i);
+ err = ENOMEM;
+ goto abort;
}
err = gve_tx_fifo_init(priv, tx);
if (err != 0)
goto abort;
- tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
+ tx->info = malloc(
+ sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
M_GVE, M_WAITOK | M_ZERO);
+ return (0);
+
+abort:
+ gve_tx_free_ring_gqi(priv, i);
+ return (err);
+}
+
+static int
+gve_tx_alloc_ring(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+ char mtx_name[16];
+ int err;
+
+ com->priv = priv;
+ com->id = i;
+
+ if (gve_is_gqi(priv))
+ err = gve_tx_alloc_ring_gqi(priv, i);
+ else
+ err = gve_tx_alloc_ring_dqo(priv, i);
+ if (err != 0)
+ goto abort;
sprintf(mtx_name, "gvetx%d", i);
mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);
@@ -115,19 +167,13 @@ gve_tx_alloc_ring(struct gve_priv *priv, int i)
err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
PAGE_SIZE, &com->q_resources_mem);
if (err != 0) {
- device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i);
+ device_printf(priv->dev,
+ "Failed to alloc queue resources for tx ring %d", i);
goto abort;
}
com->q_resources = com->q_resources_mem.cpu_addr;
- err = gve_dma_alloc_coherent(priv,
- sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
- CACHE_LINE_SIZE, &tx->desc_ring_mem);
- if (err != 0) {
- device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i);
- goto abort;
- }
- tx->desc_ring = tx->desc_ring_mem.cpu_addr;
+ tx->last_kicked = 0;
return (0);
@@ -137,39 +183,32 @@ abort:
}
int
-gve_alloc_tx_rings(struct gve_priv *priv)
+gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
- int err = 0;
int i;
+ int err;
- priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
- M_GVE, M_WAITOK | M_ZERO);
+ KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));
- for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ for (i = start_idx; i < stop_idx; i++) {
err = gve_tx_alloc_ring(priv, i);
if (err != 0)
goto free_rings;
-
}
return (0);
-
free_rings:
- while (i--)
- gve_tx_free_ring(priv, i);
- free(priv->tx, M_GVE);
+ gve_free_tx_rings(priv, start_idx, i);
return (err);
}
void
-gve_free_tx_rings(struct gve_priv *priv)
+gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
int i;
- for (i = 0; i < priv->tx_cfg.num_queues; i++)
+ for (i = start_idx; i < stop_idx; i++)
gve_tx_free_ring(priv, i);
-
- free(priv->tx, M_GVE);
}
static void
@@ -181,6 +220,7 @@ gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
for (i = 0; i < com->priv->tx_desc_cnt; i++) {
tx->desc_ring[i] = (union gve_tx_desc){};
tx->info[i] = (struct gve_tx_buffer_state){};
+ gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec);
}
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
@@ -209,7 +249,11 @@ gve_start_tx_ring(struct gve_priv *priv, int i)
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
- NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
+ atomic_store_bool(&tx->stopped, false);
+ if (gve_is_gqi(priv))
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
+ else
+ NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
taskqueue_thread_enqueue, &com->cleanup_tq);
taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
@@ -233,8 +277,12 @@ gve_create_tx_rings(struct gve_priv *priv)
if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
return (0);
- for (i = 0; i < priv->tx_cfg.num_queues; i++)
- gve_clear_tx_ring(priv, i);
+ for (i = 0; i < priv->tx_cfg.num_queues; i++) {
+ if (gve_is_gqi(priv))
+ gve_clear_tx_ring(priv, i);
+ else
+ gve_clear_tx_ring_dqo(priv, i);
+ }
err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
if (err != 0)
@@ -300,6 +348,30 @@ gve_destroy_tx_rings(struct gve_priv *priv)
}
int
+gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx)
+{
+ struct gve_tx_buffer_state *info;
+ uint32_t pkt_idx;
+ int num_timeouts;
+
+ num_timeouts = 0;
+
+ for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) {
+ info = &tx->info[pkt_idx];
+
+ if (!gve_timestamp_valid(&info->enqueue_time_sec))
+ continue;
+
+ if (__predict_false(
+ gve_seconds_since(&info->enqueue_time_sec) >
+ GVE_TX_TIMEOUT_PKT_SEC))
+ num_timeouts += 1;
+ }
+
+ return (num_timeouts);
+}
+
+int
gve_tx_intr(void *arg)
{
struct gve_tx_ring *tx = arg;
@@ -351,7 +423,10 @@ gve_tx_cleanup_tq(void *arg, int pending)
if (mbuf == NULL)
continue;
+ gve_invalidate_timestamp(&info->enqueue_time_sec);
+
info->mbuf = NULL;
+
counter_enter();
counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
counter_u64_add_protected(tx->stats.tpackets, 1);
@@ -375,7 +450,7 @@ gve_tx_cleanup_tq(void *arg, int pending)
* interrupt but they will still be handled by the enqueue below.
* Completions born after the barrier WILL trigger an interrupt.
*/
- mb();
+ atomic_thread_fence_seq_cst();
nic_done = gve_tx_load_event_counter(priv, tx);
todo = nic_done - tx->done;
@@ -383,6 +458,11 @@ gve_tx_cleanup_tq(void *arg, int pending)
gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
}
+
+ if (atomic_load_bool(&tx->stopped) && space_freed) {
+ atomic_store_bool(&tx->stopped, false);
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ }
}
static void
@@ -627,8 +707,7 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
if (__predict_false(!gve_can_tx(tx, bytes_required))) {
counter_enter();
- counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1);
- counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
+ counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
counter_exit();
return (ENOBUFS);
}
@@ -636,6 +715,8 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
/* So that the cleanup taskqueue can free the mbuf eventually. */
info->mbuf = mbuf;
+ gve_set_timestamp(&info->enqueue_time_sec);
+
/*
* We don't want to split the header, so if necessary, pad to the end
* of the fifo and then put the header at the beginning of the fifo.
@@ -689,19 +770,86 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
return (0);
}
+static int
+gve_xmit_mbuf(struct gve_tx_ring *tx,
+ struct mbuf **mbuf)
+{
+ if (gve_is_gqi(tx->com.priv))
+ return (gve_xmit(tx, *mbuf));
+
+ if (gve_is_qpl(tx->com.priv))
+ return (gve_xmit_dqo_qpl(tx, *mbuf));
+
+ /*
+ * gve_xmit_dqo might attempt to defrag the mbuf chain.
+ * The reference is passed in so that in the case of
+ * errors, the new mbuf chain is what's put back on the br.
+ */
+ return (gve_xmit_dqo(tx, mbuf));
+}
+
+/*
+ * Has the side-effect of stopping the xmit queue by setting tx->stopped.
+ */
+static int
+gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
+ struct mbuf **mbuf)
+{
+ int err;
+
+ atomic_store_bool(&tx->stopped, true);
+
+ /*
+ * Room made in the queue BEFORE the barrier will be seen by the
+ * gve_xmit_mbuf retry below.
+ *
+ * If room is made in the queue AFTER the barrier, the cleanup tq
+ * iteration creating the room will either see a tx->stopped value
+ * of 0 or the 1 we just wrote:
+ *
+ * If it sees a 1, then it would enqueue the xmit tq. Enqueue
+ * implies a retry on the waiting pkt.
+ *
+ * If it sees a 0, then that implies a previous iteration overwrote
+ * our 1, and that iteration would enqueue the xmit tq. Enqueue
+ * implies a retry on the waiting pkt.
+ */
+ atomic_thread_fence_seq_cst();
+
+ err = gve_xmit_mbuf(tx, mbuf);
+ if (err == 0)
+ atomic_store_bool(&tx->stopped, false);
+
+ return (err);
+}
+
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
struct gve_priv *priv = tx->com.priv;
struct ifnet *ifp = priv->ifp;
struct mbuf *mbuf;
+ int err;
while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
(mbuf = drbr_peek(ifp, tx->br)) != NULL) {
+ err = gve_xmit_mbuf(tx, &mbuf);
- if (__predict_false(gve_xmit(tx, mbuf) != 0)) {
- drbr_putback(ifp, tx->br, mbuf);
- taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ /*
+ * We need to stop this taskqueue when we can't xmit the pkt due
+ * to lack of space in the NIC ring (ENOBUFS). The retry exists
+ * to guard against a TOCTTOU bug that could end up freezing the
+ * queue forever.
+ */
+ if (__predict_false(mbuf != NULL && err == ENOBUFS))
+ err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);
+
+ if (__predict_false(err != 0 && mbuf != NULL)) {
+ if (err == EINVAL) {
+ drbr_advance(ifp, tx->br);
+ m_freem(mbuf);
+ } else
+ drbr_putback(ifp, tx->br, mbuf);
break;
}
@@ -710,7 +858,12 @@ gve_xmit_br(struct gve_tx_ring *tx)
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
BUS_DMASYNC_PREWRITE);
- gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
+
+ if (gve_is_gqi(priv))
+ gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
+ else
+ gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
+ tx->dqo.desc_tail);
}
}
@@ -763,7 +916,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
is_br_empty = drbr_empty(ifp, tx->br);
err = drbr_enqueue(ifp, tx->br, mbuf);
if (__predict_false(err != 0)) {
- taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ if (!atomic_load_bool(&tx->stopped))
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
counter_enter();
counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
@@ -778,9 +932,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
gve_xmit_br(tx);
GVE_RING_UNLOCK(tx);
- } else {
+ } else if (!atomic_load_bool(&tx->stopped))
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
- }
return (0);
}
diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c
new file mode 100644
index 000000000000..551a7e308d19
--- /dev/null
+++ b/sys/dev/gve/gve_tx_dqo.c
@@ -0,0 +1,1149 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2024 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_inet6.h"
+
+#include "gve.h"
+#include "gve_dqo.h"
+
+static void
+gve_unmap_packet(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap,
+ BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap);
+}
+
+static void
+gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ pending_pkt->qpl_buf_head = -1;
+ pending_pkt->num_qpl_bufs = 0;
+}
+
+static void
+gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int i;
+
+ for (i = 0; i < tx->dqo.num_pending_pkts; i++) {
+ pending_pkt = &tx->dqo.pending_pkts[i];
+ if (!pending_pkt->mbuf)
+ continue;
+
+ if (gve_is_qpl(tx->com.priv))
+ gve_clear_qpl_pending_pkt(pending_pkt);
+ else
+ gve_unmap_packet(tx, pending_pkt);
+
+ m_freem(pending_pkt->mbuf);
+ pending_pkt->mbuf = NULL;
+ }
+}
+
+void
+gve_tx_free_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+ int j;
+
+ if (tx->dqo.desc_ring != NULL) {
+ gve_dma_free_coherent(&tx->desc_ring_mem);
+ tx->dqo.desc_ring = NULL;
+ }
+
+ if (tx->dqo.compl_ring != NULL) {
+ gve_dma_free_coherent(&tx->dqo.compl_ring_mem);
+ tx->dqo.compl_ring = NULL;
+ }
+
+ if (tx->dqo.pending_pkts != NULL) {
+ gve_free_tx_mbufs_dqo(tx);
+
+ if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) {
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++)
+ if (tx->dqo.pending_pkts[j].state !=
+ GVE_PACKET_STATE_UNALLOCATED)
+ bus_dmamap_destroy(tx->dqo.buf_dmatag,
+ tx->dqo.pending_pkts[j].dmamap);
+ }
+
+ free(tx->dqo.pending_pkts, M_GVE);
+ tx->dqo.pending_pkts = NULL;
+ }
+
+ if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag)
+ bus_dma_tag_destroy(tx->dqo.buf_dmatag);
+
+ if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) {
+ free(tx->dqo.qpl_bufs, M_GVE);
+ tx->dqo.qpl_bufs = NULL;
+ }
+
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
+ }
+}
+
+static int
+gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_priv *priv = tx->com.priv;
+ int err;
+ int j;
+
+ /*
+	 * DMA tag for mapping Tx mbufs.
+ * The maxsize, nsegments, and maxsegsize params should match
+ * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c.
+ */
+ err = bus_dma_tag_create(
+ bus_get_dma_tag(priv->dev), /* parent */
+ 1, 0, /* alignment, bounds */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ GVE_TSO_MAXSIZE_DQO, /* maxsize */
+ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */
+ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */
+ BUS_DMA_ALLOCNOW, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockarg */
+ &tx->dqo.buf_dmatag);
+ if (err != 0) {
+ device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n",
+ __func__, err);
+ return (err);
+ }
+
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
+ err = bus_dmamap_create(tx->dqo.buf_dmatag, 0,
+ &tx->dqo.pending_pkts[j].dmamap);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "err in creating pending pkt dmamap %d: %d",
+ j, err);
+ return (err);
+ }
+ tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
+ }
+
+ return (0);
+}
+
+int
+gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ uint16_t num_pending_pkts;
+ int err;
+
+ /* Descriptor ring */
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr;
+
+ /* Completion ring */
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc compl ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr;
+
+ /*
+ * pending_pkts array
+ *
+ * The max number of pending packets determines the maximum number of
+	 * descriptors that may be written to the completion queue.
+ *
+ * We must set the number small enough to make sure we never overrun the
+ * completion queue.
+ */
+ num_pending_pkts = priv->tx_desc_cnt;
+ /*
+ * Reserve space for descriptor completions, which will be reported at
+ * most every GVE_TX_MIN_RE_INTERVAL packets.
+ */
+ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL;
+
+ tx->dqo.num_pending_pkts = num_pending_pkts;
+ tx->dqo.pending_pkts = malloc(
+ sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts,
+ M_GVE, M_WAITOK | M_ZERO);
+
+ if (gve_is_qpl(priv)) {
+ int qpl_buf_cnt;
+
+ tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO,
+ /*single_kva*/false);
+ if (tx->com.qpl == NULL) {
+ device_printf(priv->dev,
+ "Failed to alloc QPL for tx ring %d", i);
+ err = ENOMEM;
+ goto abort;
+ }
+
+ qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+ tx->com.qpl->num_pages;
+
+ tx->dqo.qpl_bufs = malloc(
+ sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt,
+ M_GVE, M_WAITOK | M_ZERO);
+ } else
+ gve_tx_alloc_rda_fields_dqo(tx);
+ return (0);
+
+abort:
+ gve_tx_free_ring_dqo(priv, i);
+ return (err);
+}
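To make the sizing above concrete: each pending packet can generate one packet completion, and descriptor completions are requested at most once every GVE_TX_MIN_RE_INTERVAL packet descriptors, so both kinds together must fit in the tx_desc_cnt completion-ring entries allocated above. Taking tx_desc_cnt = 1024 and, purely for illustration, GVE_TX_MIN_RE_INTERVAL = 32, the code reserves 1024 / 32 = 32 entries for descriptor completions and caps num_pending_pkts at 1024 - 32 = 992.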
+
+static void
+gve_extract_tx_metadata_dqo(const struct mbuf *mbuf,
+ struct gve_tx_metadata_dqo *metadata)
+{
+ uint32_t hash = mbuf->m_pkthdr.flowid;
+ uint16_t path_hash;
+
+ metadata->version = GVE_TX_METADATA_VERSION_DQO;
+ if (hash) {
+ path_hash = hash ^ (hash >> 16);
+
+ path_hash &= (1 << 15) - 1;
+ if (__predict_false(path_hash == 0))
+ path_hash = ~path_hash;
+
+ metadata->path_hash = path_hash;
+ }
+}
+
+static void
+gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx,
+ uint32_t *desc_idx, uint32_t len, uint64_t addr,
+ int16_t compl_tag, bool eop, bool csum_enabled)
+{
+ while (len > 0) {
+ struct gve_tx_pkt_desc_dqo *desc =
+ &tx->dqo.desc_ring[*desc_idx].pkt;
+ uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO);
+ bool cur_eop = eop && cur_len == len;
+
+ *desc = (struct gve_tx_pkt_desc_dqo){
+ .buf_addr = htole64(addr),
+ .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
+ .end_of_packet = cur_eop,
+ .checksum_offload_enable = csum_enabled,
+ .compl_tag = htole16(compl_tag),
+ .buf_size = cur_len,
+ };
+
+ addr += cur_len;
+ len -= cur_len;
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ }
+}
+
+static void
+gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
+ const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata,
+ int header_len)
+{
+ *desc = (struct gve_tx_tso_context_desc_dqo){
+ .header_len = header_len,
+ .cmd_dtype = {
+ .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
+ .tso = 1,
+ },
+ .flex0 = metadata->bytes[0],
+ .flex5 = metadata->bytes[5],
+ .flex6 = metadata->bytes[6],
+ .flex7 = metadata->bytes[7],
+ .flex8 = metadata->bytes[8],
+ .flex9 = metadata->bytes[9],
+ .flex10 = metadata->bytes[10],
+ .flex11 = metadata->bytes[11],
+ };
+ desc->tso_total_len = mbuf->m_pkthdr.len - header_len;
+ desc->mss = mbuf->m_pkthdr.tso_segsz;
+}
+
+static void
+gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
+ const struct gve_tx_metadata_dqo *metadata)
+{
+ *desc = (struct gve_tx_general_context_desc_dqo){
+ .flex0 = metadata->bytes[0],
+ .flex1 = metadata->bytes[1],
+ .flex2 = metadata->bytes[2],
+ .flex3 = metadata->bytes[3],
+ .flex4 = metadata->bytes[4],
+ .flex5 = metadata->bytes[5],
+ .flex6 = metadata->bytes[6],
+ .flex7 = metadata->bytes[7],
+ .flex8 = metadata->bytes[8],
+ .flex9 = metadata->bytes[9],
+ .flex10 = metadata->bytes[10],
+ .flex11 = metadata->bytes[11],
+ .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
+ };
+}
+
+#define PULLUP_HDR(m, len) \
+do { \
+ if (__predict_false((m)->m_len < (len))) { \
+ (m) = m_pullup((m), (len)); \
+ if ((m) == NULL) \
+ return (EINVAL); \
+ } \
+} while (0)
+
+static int
+gve_prep_tso(struct mbuf *mbuf, int *header_len)
+{
+ uint8_t l3_off, l4_off = 0;
+ struct ether_header *eh;
+ struct tcphdr *th;
+ u_short csum;
+
+ PULLUP_HDR(mbuf, sizeof(*eh));
+ eh = mtod(mbuf, struct ether_header *);
+ KASSERT(eh->ether_type != ETHERTYPE_VLAN,
+ ("VLAN-tagged packets not supported"));
+ l3_off = ETHER_HDR_LEN;
+
+#ifdef INET6
+ if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) {
+ struct ip6_hdr *ip6;
+
+ PULLUP_HDR(mbuf, l3_off + sizeof(*ip6));
+ ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off));
+ l4_off = l3_off + sizeof(struct ip6_hdr);
+ csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP,
+ /*csum=*/0);
+ } else
+#endif
+ if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
+ struct ip *ip;
+
+ PULLUP_HDR(mbuf, l3_off + sizeof(*ip));
+ ip = (struct ip *)(mtodo(mbuf, l3_off));
+ l4_off = l3_off + (ip->ip_hl << 2);
+ csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(IPPROTO_TCP));
+ }
+
+	PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr));
+ th = (struct tcphdr *)(mtodo(mbuf, l4_off));
+ *header_len = l4_off + (th->th_off << 2);
+
+ /*
+ * Hardware requires the th->th_sum to not include the TCP payload,
+ * hence we recompute the csum with it excluded.
+ */
+ th->th_sum = csum;
+
+ return (0);
+}
+
+static int
+gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf,
+ bool is_tso, uint32_t *desc_idx)
+{
+ struct gve_tx_general_context_desc_dqo *gen_desc;
+ struct gve_tx_tso_context_desc_dqo *tso_desc;
+ struct gve_tx_metadata_dqo metadata;
+ int header_len;
+ int err;
+
+ metadata = (struct gve_tx_metadata_dqo){0};
+ gve_extract_tx_metadata_dqo(mbuf, &metadata);
+
+ if (is_tso) {
+ err = gve_prep_tso(mbuf, &header_len);
+ if (__predict_false(err)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_tsoerr, 1);
+ counter_exit();
+ return (err);
+ }
+
+ tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx;
+ gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len);
+
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
+ counter_exit();
+ }
+
+ gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx;
+ gve_tx_fill_general_ctx_desc(gen_desc, &metadata);
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ return (0);
+}
+
+static int
+gve_map_mbuf_dqo(struct gve_tx_ring *tx,
+ struct mbuf **mbuf, bus_dmamap_t dmamap,
+ bus_dma_segment_t *segs, int *nsegs, int attempt)
+{
+ struct mbuf *m_new = NULL;
+ int err;
+
+ err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap,
+ *mbuf, segs, nsegs, BUS_DMA_NOWAIT);
+
+ switch (err) {
+ case __predict_true(0):
+ break;
+ case EFBIG:
+ if (__predict_false(attempt > 0))
+ goto abort;
+
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_collapse, 1);
+ counter_exit();
+
+ /* Try m_collapse before m_defrag */
+ m_new = m_collapse(*mbuf, M_NOWAIT,
+ GVE_TX_MAX_DATA_DESCS_DQO);
+ if (m_new == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_defrag, 1);
+ counter_exit();
+ m_new = m_defrag(*mbuf, M_NOWAIT);
+ }
+
+ if (__predict_false(m_new == NULL)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_defrag_err, 1);
+ counter_exit();
+
+ m_freem(*mbuf);
+ *mbuf = NULL;
+ err = ENOMEM;
+ goto abort;
+ } else {
+ *mbuf = m_new;
+ return (gve_map_mbuf_dqo(tx, mbuf, dmamap,
+ segs, nsegs, ++attempt));
+ }
+ case ENOMEM:
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_dmamap_enomem_err, 1);
+ counter_exit();
+ goto abort;
+ default:
+ goto abort;
+ }
+
+ return (0);
+
+abort:
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1);
+ counter_exit();
+ return (err);
+}
+
+static uint32_t
+num_avail_desc_ring_slots(const struct gve_tx_ring *tx)
+{
+ uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) &
+ tx->dqo.desc_mask;
+
+ return (tx->dqo.desc_mask - num_used);
+}
+
+static struct gve_tx_pending_pkt_dqo *
+gve_alloc_pending_packet(struct gve_tx_ring *tx)
+{
+ int32_t index = tx->dqo.free_pending_pkts_csm;
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+
+ /*
+ * No pending packets available in the consumer list,
+ * try to steal the producer list.
+ */
+ if (__predict_false(index == -1)) {
+ tx->dqo.free_pending_pkts_csm = atomic_swap_32(
+ &tx->dqo.free_pending_pkts_prd, -1);
+
+ index = tx->dqo.free_pending_pkts_csm;
+ if (__predict_false(index == -1))
+ return (NULL);
+ }
+
+ pending_pkt = &tx->dqo.pending_pkts[index];
+
+ /* Remove pending_pkt from the consumer list */
+ tx->dqo.free_pending_pkts_csm = pending_pkt->next;
+ pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
+
+ gve_set_timestamp(&pending_pkt->enqueue_time_sec);
+
+ return (pending_pkt);
+}
+
+static void
+gve_free_pending_packet(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ int index = pending_pkt - tx->dqo.pending_pkts;
+ int32_t old_head;
+
+ pending_pkt->state = GVE_PACKET_STATE_FREE;
+
+ gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec);
+
+ /* Add pending_pkt to the producer list */
+ while (true) {
+ old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd);
+
+ pending_pkt->next = old_head;
+ if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd,
+ old_head, index))
+ break;
+ }
+}
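gve_alloc_pending_packet() and gve_free_pending_packet() above split one free list into two heads: free_pending_pkts_csm is private to the transmit path and popped without atomics, while completion processing pushes freed slots onto free_pending_pkts_prd with a CAS loop; when the private list runs dry, the transmit path steals the whole shared list with a single atomic swap. A minimal user-space sketch of that scheme, using C11 atomics in place of the kernel atomic(9) API and an arbitrary table size:

	#include <stdatomic.h>
	#include <stdint.h>

	#define NITEMS	64

	struct freelist {
		int32_t next[NITEMS];		/* next[i] = index after i, or -1 */
		int32_t csm_head;		/* consumer-private list head */
		_Atomic int32_t prd_head;	/* shared producer list head */
	};

	/* Consumer side: pop one index, stealing the producer list if empty. */
	static int32_t
	freelist_pop(struct freelist *fl)
	{
		int32_t idx = fl->csm_head;

		if (idx == -1) {
			/* Take the entire producer list in a single swap. */
			idx = atomic_exchange(&fl->prd_head, -1);
			if (idx == -1)
				return (-1);	/* truly empty */
		}
		fl->csm_head = fl->next[idx];
		return (idx);
	}

	/* Producer side: push one index onto the shared list with a CAS loop. */
	static void
	freelist_push(struct freelist *fl, int32_t idx)
	{
		int32_t old_head = atomic_load(&fl->prd_head);

		do {
			fl->next[idx] = old_head;
		} while (!atomic_compare_exchange_weak(&fl->prd_head,
		    &old_head, idx));
	}

The C11 defaults are sequentially consistent, which is stronger than the acquire/release pairing the driver uses, but the structure of the pop, steal, and push paths is the same.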
+
+/*
+ * Has the side-effect of retrieving the value of the last desc index
+ * processed by the NIC. hw_tx_head is written to by the completions-processing
+ * taskqueue upon receiving descriptor-completions.
+ */
+static bool
+gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs)
+{
+ if (needed_descs <= num_avail_desc_ring_slots(tx))
+ return (true);
+
+ tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head);
+ if (needed_descs > num_avail_desc_ring_slots(tx)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_descring, 1);
+ counter_exit();
+ return (false);
+ }
+
+	return (true);
+}
+
+static void
+gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx)
+{
+ uint32_t last_report_event_interval;
+ uint32_t last_desc_idx;
+
+ last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask;
+ last_report_event_interval =
+ (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask;
+
+ if (__predict_false(last_report_event_interval >=
+ GVE_TX_MIN_RE_INTERVAL)) {
+ tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true;
+ tx->dqo.last_re_idx = last_desc_idx;
+ }
+}
+
+static bool
+gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs)
+{
+ uint32_t available = tx->dqo.qpl_bufs_produced_cached -
+ tx->dqo.qpl_bufs_consumed;
+
+ if (__predict_true(available >= num_bufs))
+ return (true);
+
+ tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32(
+ &tx->dqo.qpl_bufs_produced);
+ available = tx->dqo.qpl_bufs_produced_cached -
+ tx->dqo.qpl_bufs_consumed;
+
+ if (__predict_true(available >= num_bufs))
+ return (true);
+ return (false);
+}
+
+static int32_t
+gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx)
+{
+ int32_t buf = tx->dqo.free_qpl_bufs_csm;
+
+ if (__predict_false(buf == -1)) {
+ tx->dqo.free_qpl_bufs_csm = atomic_swap_32(
+ &tx->dqo.free_qpl_bufs_prd, -1);
+ buf = tx->dqo.free_qpl_bufs_csm;
+ if (__predict_false(buf == -1))
+ return (-1);
+ }
+
+ tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf];
+ tx->dqo.qpl_bufs_consumed++;
+ return (buf);
+}
+
+/*
+ * Tx buffer i corresponds to
+ * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
+ * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
+ */
+static void
+gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx,
+ int32_t index, void **va, bus_addr_t *dma_addr)
+{
+ int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+ int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) <<
+ GVE_TX_BUF_SHIFT_DQO;
+
+ *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset;
+ *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset;
+}
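As a concrete instance of the mapping in the comment above, assume PAGE_SIZE = 4096 and GVE_TX_BUF_SHIFT_DQO = 11 (2 KiB buffers, hence GVE_TX_BUFS_PER_PAGE_DQO = 2): buffer index 5 gives page_id = 5 >> 1 = 2 and offset = (5 & 1) << 11 = 2048, i.e. the second half of the third QPL page. The shift-and-mask form is simply the divide/modulo of the comment specialized to power-of-two buffer sizes.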
+
+static struct gve_dma_handle *
+gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index)
+{
+ int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+
+ return (&tx->com.qpl->dmas[page_id]);
+}
+
+static void
+gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx,
+ struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt,
+ bool csum_enabled, int16_t completion_tag,
+ uint32_t *desc_idx)
+{
+ int32_t pkt_len = mbuf->m_pkthdr.len;
+ struct gve_dma_handle *dma;
+ uint32_t copy_offset = 0;
+ int32_t prev_buf = -1;
+ uint32_t copy_len;
+ bus_addr_t addr;
+ int32_t buf;
+ void *va;
+
+ MPASS(pkt->num_qpl_bufs == 0);
+ MPASS(pkt->qpl_buf_head == -1);
+
+ while (copy_offset < pkt_len) {
+ buf = gve_tx_alloc_qpl_buf(tx);
+ /* We already checked for availability */
+ MPASS(buf != -1);
+
+ gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr);
+ copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset);
+ m_copydata(mbuf, copy_offset, copy_len, va);
+ copy_offset += copy_len;
+
+ dma = gve_get_page_dma_handle(tx, buf);
+ bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
+
+ gve_tx_fill_pkt_desc_dqo(tx, desc_idx,
+ copy_len, addr, completion_tag,
+ /*eop=*/copy_offset == pkt_len,
+ csum_enabled);
+
+ /* Link all the qpl bufs for a packet */
+ if (prev_buf == -1)
+ pkt->qpl_buf_head = buf;
+ else
+ tx->dqo.qpl_bufs[prev_buf] = buf;
+
+ prev_buf = buf;
+ pkt->num_qpl_bufs++;
+ }
+
+ tx->dqo.qpl_bufs[buf] = -1;
+}
+
+int
+gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf)
+{
+ uint32_t desc_idx = tx->dqo.desc_tail;
+ struct gve_tx_pending_pkt_dqo *pkt;
+ int total_descs_needed;
+ int16_t completion_tag;
+ bool has_csum_flag;
+ int csum_flags;
+ bool is_tso;
+ int nsegs;
+ int err;
+
+ csum_flags = mbuf->m_pkthdr.csum_flags;
+ has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
+ is_tso = csum_flags & CSUM_TSO;
+
+ nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO);
+ /* Check if we have enough room in the desc ring */
+ total_descs_needed = 1 + /* general_ctx_desc */
+ nsegs + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
+ return (ENOBUFS);
+
+ if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+
+ pkt = gve_alloc_pending_packet(tx);
+ if (pkt == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_compring, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+ completion_tag = pkt - tx->dqo.pending_pkts;
+ pkt->mbuf = mbuf;
+
+ err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+ if (err)
+ goto abort;
+
+ gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt,
+ has_csum_flag, completion_tag, &desc_idx);
+
+ /* Remember the index of the last desc written */
+ tx->dqo.desc_tail = desc_idx;
+
+ /*
+ * Request a descriptor completion on the last descriptor of the
+ * packet if we are allowed to by the HW enforced interval.
+ */
+ gve_tx_request_desc_compl(tx, desc_idx);
+
+ tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
+ return (0);
+
+abort:
+ pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pkt);
+ return (err);
+}
+
+int
+gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr)
+{
+ bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO];
+ uint32_t desc_idx = tx->dqo.desc_tail;
+ struct gve_tx_pending_pkt_dqo *pkt;
+ struct mbuf *mbuf = *mbuf_ptr;
+ int total_descs_needed;
+ int16_t completion_tag;
+ bool has_csum_flag;
+ int csum_flags;
+ bool is_tso;
+ int nsegs;
+ int err;
+ int i;
+
+ csum_flags = mbuf->m_pkthdr.csum_flags;
+ has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
+ is_tso = csum_flags & CSUM_TSO;
+
+ /*
+ * This mbuf might end up needing more than 1 pkt desc.
+ * The actual number, `nsegs` is known only after the
+ * expensive gve_map_mbuf_dqo call. This check beneath
+ * exists to fail early when the desc ring is really full.
+ */
+ total_descs_needed = 1 + /* general_ctx_desc */
+ 1 + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
+ return (ENOBUFS);
+
+ pkt = gve_alloc_pending_packet(tx);
+ if (pkt == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_compring, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+ completion_tag = pkt - tx->dqo.pending_pkts;
+
+ err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap,
+ segs, &nsegs, /*attempt=*/0);
+ if (err)
+ goto abort;
+ mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */
+ pkt->mbuf = mbuf;
+
+ total_descs_needed = 1 + /* general_ctx_desc */
+ nsegs + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(
+ !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) {
+ err = ENOBUFS;
+ goto abort_with_dma;
+ }
+
+ err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+ if (err)
+ goto abort_with_dma;
+
+ bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE);
+ for (i = 0; i < nsegs; i++) {
+ gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
+ segs[i].ds_len, segs[i].ds_addr,
+ completion_tag, /*eop=*/i == (nsegs - 1),
+ has_csum_flag);
+ }
+
+ /* Remember the index of the last desc written */
+ tx->dqo.desc_tail = desc_idx;
+
+ /*
+ * Request a descriptor completion on the last descriptor of the
+ * packet if we are allowed to by the HW enforced interval.
+ */
+ gve_tx_request_desc_compl(tx, desc_idx);
+
+ tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
+ return (0);
+
+abort_with_dma:
+ gve_unmap_packet(tx, pkt);
+abort:
+ pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pkt);
+ return (err);
+}
+
+static void
+gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pkt)
+{
+ int32_t buf = pkt->qpl_buf_head;
+ struct gve_dma_handle *dma;
+ int32_t qpl_buf_tail;
+ int32_t old_head;
+ int i;
+
+ for (i = 0; i < pkt->num_qpl_bufs; i++) {
+ dma = gve_get_page_dma_handle(tx, buf);
+ bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE);
+ qpl_buf_tail = buf;
+ buf = tx->dqo.qpl_bufs[buf];
+ }
+ MPASS(buf == -1);
+ buf = qpl_buf_tail;
+
+ while (true) {
+ old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd);
+ tx->dqo.qpl_bufs[buf] = old_head;
+
+ /*
+ * The "rel" ensures that the update to dqo.free_qpl_bufs_prd
+ * is visible only after the linked list from this pkt is
+ * attached above to old_head.
+ */
+ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd,
+ old_head, pkt->qpl_buf_head))
+ break;
+ }
+ /*
+ * The "rel" ensures that the update to dqo.qpl_bufs_produced is
+	 * visible only after the update to dqo.free_qpl_bufs_prd above.
+ */
+ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs);
+
+ gve_clear_qpl_pending_pkt(pkt);
+}
+
+static uint64_t
+gve_handle_packet_completion(struct gve_priv *priv,
+ struct gve_tx_ring *tx, uint16_t compl_tag)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int32_t pkt_len;
+
+ if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) {
+ device_printf(priv->dev, "Invalid TX completion tag: %d\n",
+ compl_tag);
+ return (0);
+ }
+
+ pending_pkt = &tx->dqo.pending_pkts[compl_tag];
+
+ /* Packet is allocated but not pending data completion. */
+ if (__predict_false(pending_pkt->state !=
+ GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+ device_printf(priv->dev,
+ "No pending data completion: %d\n", compl_tag);
+ return (0);
+ }
+
+ pkt_len = pending_pkt->mbuf->m_pkthdr.len;
+
+ if (gve_is_qpl(priv))
+ gve_reap_qpl_bufs_dqo(tx, pending_pkt);
+ else
+ gve_unmap_packet(tx, pending_pkt);
+
+ m_freem(pending_pkt->mbuf);
+ pending_pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pending_pkt);
+ return (pkt_len);
+}
+
+int
+gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int num_timeouts;
+ uint16_t pkt_idx;
+
+ num_timeouts = 0;
+ for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) {
+ pending_pkt = &tx->dqo.pending_pkts[pkt_idx];
+
+ if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec))
+ continue;
+
+ if (__predict_false(
+ gve_seconds_since(&pending_pkt->enqueue_time_sec) >
+ GVE_TX_TIMEOUT_PKT_SEC))
+ num_timeouts += 1;
+ }
+
+ return (num_timeouts);
+}
+
+int
+gve_tx_intr_dqo(void *arg)
+{
+ struct gve_tx_ring *tx = arg;
+ struct gve_priv *priv = tx->com.priv;
+ struct gve_ring_com *com = &tx->com;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return (FILTER_STRAY);
+
+ /* Interrupts are automatically masked */
+ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
+ return (FILTER_HANDLED);
+}
+
+static void
+gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_ring_com *com = &tx->com;
+ int i;
+
+ for (i = 0; i < com->priv->tx_desc_cnt; i++)
+ tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){};
+
+ bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+static void
+gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_ring_com *com = &tx->com;
+ int entries;
+ int i;
+
+ entries = com->priv->tx_desc_cnt;
+ for (i = 0; i < entries; i++)
+ tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){};
+
+ bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+void
+gve_clear_tx_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ int j;
+
+ tx->dqo.desc_head = 0;
+ tx->dqo.desc_tail = 0;
+ tx->dqo.desc_mask = priv->tx_desc_cnt - 1;
+ tx->dqo.last_re_idx = 0;
+
+ tx->dqo.compl_head = 0;
+ tx->dqo.compl_mask = priv->tx_desc_cnt - 1;
+ atomic_store_32(&tx->dqo.hw_tx_head, 0);
+ tx->dqo.cur_gen_bit = 0;
+
+ gve_free_tx_mbufs_dqo(tx);
+
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
+ if (gve_is_qpl(tx->com.priv))
+ gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]);
+ gve_invalidate_timestamp(
+ &tx->dqo.pending_pkts[j].enqueue_time_sec);
+ tx->dqo.pending_pkts[j].next =
+ (j == tx->dqo.num_pending_pkts - 1) ? -1 : j + 1;
+ tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
+ }
+ tx->dqo.free_pending_pkts_csm = 0;
+ atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1);
+
+ if (gve_is_qpl(priv)) {
+ int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+ tx->com.qpl->num_pages;
+
+ for (j = 0; j < qpl_buf_cnt - 1; j++)
+ tx->dqo.qpl_bufs[j] = j + 1;
+ tx->dqo.qpl_bufs[j] = -1;
+
+ tx->dqo.free_qpl_bufs_csm = 0;
+ atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1);
+ atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt);
+ tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt;
+ tx->dqo.qpl_bufs_consumed = 0;
+ }
+
+ gve_tx_clear_desc_ring_dqo(tx);
+ gve_tx_clear_compl_ring_dqo(tx);
+}
+
+static uint8_t
+gve_tx_get_gen_bit(uint8_t *desc)
+{
+ uint8_t byte;
+
+ /*
+ * Prevent generation bit from being read after the rest of the
+ * descriptor.
+ */
+ byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET);
+ return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0);
+}
+
+static bool
+gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget)
+{
+ struct gve_tx_compl_desc_dqo *compl_desc;
+ uint64_t bytes_done = 0;
+ uint64_t pkts_done = 0;
+ uint16_t compl_tag;
+ int work_done = 0;
+ uint16_t tx_head;
+ uint16_t type;
+
+ while (work_done < budget) {
+ bus_dmamap_sync(tx->dqo.compl_ring_mem.tag,
+ tx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_POSTREAD);
+
+ compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head];
+ if (gve_tx_get_gen_bit((uint8_t *)compl_desc) ==
+ tx->dqo.cur_gen_bit)
+ break;
+
+ type = compl_desc->type;
+ if (type == GVE_COMPL_TYPE_DQO_DESC) {
+ /* This is the last descriptor fetched by HW plus one */
+ tx_head = le16toh(compl_desc->tx_head);
+ atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head);
+ } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
+ compl_tag = le16toh(compl_desc->completion_tag);
+ bytes_done += gve_handle_packet_completion(priv,
+ tx, compl_tag);
+ pkts_done++;
+ }
+
+ tx->dqo.compl_head = (tx->dqo.compl_head + 1) &
+ tx->dqo.compl_mask;
+ /* Flip the generation bit when we wrap around */
+ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0;
+ work_done++;
+ }
+
+ /*
+ * Waking the xmit taskqueue has to occur after room has been made in
+ * the queue.
+ */
+ atomic_thread_fence_seq_cst();
+ if (atomic_load_bool(&tx->stopped) && work_done) {
+ atomic_store_bool(&tx->stopped, false);
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ }
+
+ tx->done += work_done; /* tx->done is just a sysctl counter */
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tbytes, bytes_done);
+ counter_u64_add_protected(tx->stats.tpackets, pkts_done);
+ counter_exit();
+
+ return (work_done == budget);
+}
+
+void
+gve_tx_cleanup_tq_dqo(void *arg, int pending)
+{
+ struct gve_tx_ring *tx = arg;
+ struct gve_priv *priv = tx->com.priv;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return;
+
+ if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) {
+ taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
+ return;
+ }
+
+ gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset,
+ GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
+}
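The cleanup loop above never receives a completion-ring head index from the device; it polls, and ownership is conveyed by a generation bit that the hardware flips on every pass over the ring while the driver flips its expected value (cur_gen_bit) whenever compl_head wraps to 0. A stripped-down sketch of that polling discipline, with an invented entry layout standing in for the real GVE_TX_DESC_DQO_GEN_BYTE_OFFSET/GVE_TX_DESC_DQO_GEN_BIT_MASK extraction (the real code also reads the generation byte with acquire semantics, which this sketch omits):

	#include <stdint.h>

	struct compl_entry {
		uint16_t data;
		uint8_t gen;	/* illustrative one-bit generation field */
	};

	/*
	 * Returns the payload of the next completion, or -1 if the entry at
	 * *head still carries the generation we have already consumed.
	 */
	static int
	compl_poll_one(struct compl_entry *ring, uint32_t mask, uint32_t *head,
	    uint8_t *cur_gen)
	{
		struct compl_entry *e = &ring[*head];

		if (e->gen == *cur_gen)
			return (-1);		/* nothing new yet */

		*head = (*head + 1) & mask;
		if (*head == 0)
			*cur_gen ^= 1;		/* wrapped: expect the other bit */
		return (e->data);
	}

Starting from a zeroed ring and cur_gen = 0, the first lap of hardware writes carries gen = 1, the second lap carries gen = 0 again, and so on, so a stale entry can never be mistaken for a fresh one.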
diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c
index c05488770dbd..707b8f039d88 100644
--- a/sys/dev/gve/gve_utils.c
+++ b/sys/dev/gve/gve_utils.c
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "gve.h"
+#include "gve_dqo.h"
uint32_t
gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset)
@@ -49,6 +50,12 @@ gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val)
}
void
+gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val)
+{
+ bus_write_4(priv->db_bar, offset, val);
+}
+
+void
gve_alloc_counters(counter_u64_t *stat, int num_stats)
{
int i;
@@ -227,7 +234,7 @@ gve_free_irqs(struct gve_priv *priv)
return;
}
- num_irqs = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues + 1;
+ num_irqs = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues + 1;
for (i = 0; i < num_irqs; i++) {
irq = &priv->irq_tbl[i];
@@ -261,8 +268,8 @@ gve_free_irqs(struct gve_priv *priv)
int
gve_alloc_irqs(struct gve_priv *priv)
{
- int num_tx = priv->tx_cfg.num_queues;
- int num_rx = priv->rx_cfg.num_queues;
+ int num_tx = priv->tx_cfg.max_queues;
+ int num_rx = priv->rx_cfg.max_queues;
int req_nvecs = num_tx + num_rx + 1;
int got_nvecs = req_nvecs;
struct gve_irq *irq;
@@ -307,7 +314,8 @@ gve_alloc_irqs(struct gve_priv *priv)
}
err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE,
- gve_tx_intr, NULL, &priv->tx[i], &irq->cookie);
+ gve_is_gqi(priv) ? gve_tx_intr : gve_tx_intr_dqo, NULL,
+ &priv->tx[i], &irq->cookie);
if (err != 0) {
device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, "
"err: %d\n", rid, i, err);
@@ -334,7 +342,8 @@ gve_alloc_irqs(struct gve_priv *priv)
}
err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE,
- gve_rx_intr, NULL, &priv->rx[j], &irq->cookie);
+ gve_is_gqi(priv) ? gve_rx_intr : gve_rx_intr_dqo, NULL,
+ &priv->rx[j], &irq->cookie);
if (err != 0) {
device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, "
"err: %d\n", rid, j, err);
@@ -374,6 +383,24 @@ abort:
return (err);
}
+/*
+ * Builds the register value to write to the DQO IRQ doorbell to enable the
+ * interrupt with the specified ITR interval.
+ */
+static uint32_t
+gve_setup_itr_interval_dqo(uint32_t interval_us)
+{
+ uint32_t result = GVE_ITR_ENABLE_BIT_DQO;
+
+ /* Interval has 2us granularity. */
+ interval_us >>= 1;
+
+ interval_us &= GVE_ITR_INTERVAL_DQO_MASK;
+ result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT);
+
+ return (result);
+}
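For example, a requested interval of 20 us becomes 20 >> 1 = 10 ticks of the 2 us granularity, so the value written to the doorbell is GVE_ITR_ENABLE_BIT_DQO | (10 << GVE_ITR_INTERVAL_DQO_SHIFT). Intervals too large for the field are not rejected; the masking step simply drops the high bits.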
+
void
gve_unmask_all_queue_irqs(struct gve_priv *priv)
{
@@ -383,11 +410,20 @@ gve_unmask_all_queue_irqs(struct gve_priv *priv)
for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
tx = &priv->tx[idx];
- gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0);
+ if (gve_is_gqi(priv))
+ gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0);
+ else
+ gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset,
+ gve_setup_itr_interval_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO));
}
+
for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
rx = &priv->rx[idx];
- gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0);
+ if (gve_is_gqi(priv))
+ gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0);
+ else
+ gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset,
+ gve_setup_itr_interval_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO));
}
}
@@ -403,3 +439,46 @@ gve_mask_all_queue_irqs(struct gve_priv *priv)
gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK);
}
}
+
+/*
+ * In some cases, such as tracking timeout events, we must mark a timestamp as
+ * invalid when we do not want to consider its value. Such timestamps must be
+ * checked for validity before reading them.
+ */
+void
+gve_invalidate_timestamp(int64_t *timestamp_sec)
+{
+ atomic_store_64(timestamp_sec, GVE_TIMESTAMP_INVALID);
+}
+
+/*
+ * Returns 0 if the timestamp is invalid, otherwise returns the elapsed seconds
+ * since the timestamp was set.
+ */
+int64_t
+gve_seconds_since(int64_t *timestamp_sec)
+{
+ struct bintime curr_time;
+ int64_t enqueued_time;
+
+ getbintime(&curr_time);
+ enqueued_time = atomic_load_64(timestamp_sec);
+ if (enqueued_time == GVE_TIMESTAMP_INVALID)
+ return (0);
+ return ((int64_t)(curr_time.sec - enqueued_time));
+}
+
+void
+gve_set_timestamp(int64_t *timestamp_sec)
+{
+ struct bintime curr_time;
+
+ getbintime(&curr_time);
+ atomic_store_64(timestamp_sec, curr_time.sec);
+}
+
+bool
+gve_timestamp_valid(int64_t *timestamp_sec)
+{
+ return (atomic_load_64(timestamp_sec) != GVE_TIMESTAMP_INVALID);
+}