Diffstat (limited to 'sys/dev/gve/gve.h')
-rw-r--r--	sys/dev/gve/gve.h	400
1 file changed, 360 insertions(+), 40 deletions(-)
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
index c446199dff2d..64c2a0481817 100644
--- a/sys/dev/gve/gve.h
+++ b/sys/dev/gve/gve.h
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
- * Copyright (c) 2023 Google LLC
+ * Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
@@ -47,12 +47,31 @@
#define GVE_TX_MAX_DESCS 4
#define GVE_TX_BUFRING_ENTRIES 4096
+#define GVE_TX_TIMEOUT_PKT_SEC 5
+#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5
+/*
+ * If the driver finds timed-out packets on a tx queue, it first kicks the
+ * queue and records the time. Only if the driver finds another timeout on
+ * the same queue before the end of the cooldown period does it reset. Thus,
+ * for a reset to be able to occur at all, the cooldown must be at least as
+ * long as the tx timeout checking cadence multiplied by the number of
+ * queues.
+ */
+#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16
+#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \
+ (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES)
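The kick-then-reset policy above might look like the sketch below. The scan
predicate gve_found_timed_out_pkt() is hypothetical; the timestamp helpers
and gve_schedule_reset() are the ones declared later in this header. This is
illustrative only, not the driver's actual code.

static void
gve_tx_timeout_sketch(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	if (!gve_found_timed_out_pkt(tx))	/* hypothetical predicate */
		return;

	if (gve_timestamp_valid(&tx->last_kicked) &&
	    gve_seconds_since(&tx->last_kicked) <
	    GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) {
		/* Second timeout within the cooldown: escalate to a reset. */
		gve_schedule_reset(priv);
		return;
	}

	/* First offense: kick the queue and start the cooldown clock. */
	gve_set_timestamp(&tx->last_kicked);
	taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
}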
+
+#define GVE_TIMESTAMP_INVALID -1
+
#define ADMINQ_SIZE PAGE_SIZE
#define GVE_DEFAULT_RX_BUFFER_SIZE 2048
+#define GVE_4K_RX_BUFFER_SIZE_DQO 4096
/* Each RX bounce buffer page can fit two packet buffers. */
#define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2)
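Because the offset is exactly half a page, a slot can hop between the two
packet buffers in its bounce page with a single XOR. A minimal sketch (the
helper name is invented):

static inline void
gve_flip_rx_offset_sketch(uint32_t *offset)
{
	/* Toggle between offset 0 and PAGE_SIZE / 2 within the page. */
	*offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
}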
+/* PTYPEs are always 10 bits. */
+#define GVE_NUM_PTYPES 1024
+
/*
* Number of descriptors per queue page list.
* Page count AKA QPL size can be derived by dividing the number of elements in
@@ -60,8 +79,17 @@
*/
#define GVE_QPL_DIVISOR 16
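Following the derivation in the (partly elided) comment above, the page count
of a QPL falls out directly; a hedged one-liner:

static inline int
gve_qpl_npages_sketch(int num_qpl_entries)
{
	/* e.g. a 4096-entry queue page list spans 4096 / 16 = 256 pages. */
	return (num_qpl_entries / GVE_QPL_DIVISOR);
}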
+/* Ring Size Limits */
+#define GVE_DEFAULT_MIN_RX_RING_SIZE 512
+#define GVE_DEFAULT_MIN_TX_RING_SIZE 256
+
static MALLOC_DEFINE(M_GVE, "gve", "gve allocations");
+_Static_assert(MCLBYTES >= GVE_DEFAULT_RX_BUFFER_SIZE,
+ "gve: bad MCLBYTES length");
+_Static_assert(MJUMPAGESIZE >= GVE_4K_RX_BUFFER_SIZE_DQO,
+ "gve: bad MJUMPAGESIZE length");
+
struct gve_dma_handle {
bus_addr_t bus_addr;
void *cpu_addr;
@@ -102,6 +130,7 @@ enum gve_queue_format {
GVE_GQI_RDA_FORMAT = 0x1,
GVE_GQI_QPL_FORMAT = 0x2,
GVE_DQO_RDA_FORMAT = 0x3,
+ GVE_DQO_QPL_FORMAT = 0x4,
};
enum gve_state_flags_bit {
@@ -223,31 +252,93 @@ struct gve_rxq_stats {
counter_u64_t rx_frag_flip_cnt;
counter_u64_t rx_frag_copy_cnt;
counter_u64_t rx_dropped_pkt_desc_err;
+ counter_u64_t rx_dropped_pkt_buf_post_fail;
counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
+ counter_u64_t rx_mbuf_dmamap_err;
+ counter_u64_t rx_mbuf_mclget_null;
};
#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))
+union gve_rx_qpl_buf_id_dqo {
+ struct {
+ uint16_t buf_id:11; /* Index into rx->dqo.bufs */
+ uint8_t frag_num:5; /* Which frag in the QPL page */
+ };
+ uint16_t all;
+} __packed;
+_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2,
+ "gve: bad dqo qpl rx buf id length");
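As a hedged illustration of the layout, the 16-bit tag packs an 11-bit buffer
index with a 5-bit fragment number; the helper below is invented for clarity:

static inline uint16_t
gve_rx_qpl_buf_id_pack_sketch(uint16_t buf_idx, uint8_t frag)
{
	union gve_rx_qpl_buf_id_dqo id;

	id.buf_id = buf_idx;	/* low 11 bits: index into rx->dqo.bufs */
	id.frag_num = frag;	/* high 5 bits: fragment within the QPL page */
	return (id.all);
}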
+
+struct gve_rx_buf_dqo {
+ union {
+ /* RDA */
+ struct {
+ struct mbuf *mbuf;
+ bus_dmamap_t dmamap;
+ uint64_t addr;
+ bool mapped;
+ };
+ /* QPL */
+ struct {
+ uint8_t num_nic_frags; /* number of pending completions */
+ uint8_t next_idx; /* index of the next frag to post */
+ /* for chaining rx->dqo.used_bufs */
+ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry;
+ };
+ };
+ /* for chaining rx->dqo.free_bufs */
+ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry;
+};
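The two list entries support the buffer life cycle: free buffers sit on an
SLIST, used QPL buffers on a STAILQ. A sketch of the free-list pop (using the
stock sys/queue.h macros), assuming the rx->dqo fields declared further down:

static inline struct gve_rx_buf_dqo *
gve_rx_buf_pop_sketch(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_dqo *buf;

	buf = SLIST_FIRST(&rx->dqo.free_bufs);
	if (buf != NULL)
		SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
	return (buf);
}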
+
/* power-of-2 sized receive ring */
struct gve_rx_ring {
struct gve_ring_com com;
struct gve_dma_handle desc_ring_mem;
- struct gve_dma_handle data_ring_mem;
-
- /* accessed in the receive hot path */
- struct {
- struct gve_rx_desc *desc_ring;
- union gve_rx_data_slot *data_ring;
- struct gve_rx_slot_page_info *page_info;
-
- struct gve_rx_ctx ctx;
- struct lro_ctrl lro;
- uint8_t seq_no; /* helps traverse the descriptor ring */
- uint32_t cnt; /* free-running total number of completed packets */
- uint32_t fill_cnt; /* free-running total number of descs and buffs posted */
- uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */
- struct gve_rxq_stats stats;
- } __aligned(CACHE_LINE_SIZE);
+ uint32_t cnt; /* free-running total number of completed packets */
+ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */
+
+ union {
+ /* GQI-only fields */
+ struct {
+ struct gve_dma_handle data_ring_mem;
+
+ /* accessed in the GQ receive hot path */
+ struct gve_rx_desc *desc_ring;
+ union gve_rx_data_slot *data_ring;
+ struct gve_rx_slot_page_info *page_info;
+ uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */
+ uint8_t seq_no; /* helps traverse the descriptor ring */
+ };
+
+ /* DQO-only fields */
+ struct {
+ struct gve_dma_handle compl_ring_mem;
+
+ struct gve_rx_compl_desc_dqo *compl_ring;
+ struct gve_rx_desc_dqo *desc_ring;
+ struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */
+			bus_dma_tag_t buf_dmatag; /* Tag used to dmamap posted mbufs */
+
+ uint32_t buf_cnt; /* Size of the bufs array */
+ uint32_t mask; /* One less than the sizes of the desc and compl rings */
+			uint32_t head; /* The index at which to post the next buffer */
+			uint32_t tail; /* The index at which to receive the next compl */
+ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */
+ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs;
+
+			/*
+			 * Only used in QPL mode. A buffer is moved here once none of
+			 * its fragments has a pending completion; pages referred to
+			 * by if_input-ed mbufs then stay parked here until their
+			 * wire count drops back to 1.
+			 */
+ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs;
+ } dqo;
+ };
+
+ struct lro_ctrl lro;
+ struct gve_rx_ctx ctx;
+ struct gve_rxq_stats stats;
} __aligned(CACHE_LINE_SIZE);
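The cur_gen_bit convention implies a poll like the sketch below: a completion
descriptor is fresh only while its generation bit matches the ring's current
one, which flips each time tail wraps. The descriptor field name "generation"
is an assumption here, not taken from the driver sources:

static inline bool
gve_rx_have_compl_sketch(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_desc_dqo *desc = &rx->dqo.compl_ring[rx->dqo.tail];

	return (desc->generation == rx->dqo.cur_gen_bit); /* field name assumed */
}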
@@ -267,6 +358,14 @@ struct gve_tx_fifo {
struct gve_tx_buffer_state {
struct mbuf *mbuf;
+
+ /*
+	/*
+	 * Time at which the xmit tq places descriptors for the mbuf's payload
+	 * on a tx queue. This timestamp is invalidated when the mbuf is freed
+	 * and must be checked for validity when read.
+	 */
+
struct gve_tx_iovec iov[GVE_TX_MAX_DESCS];
};
@@ -275,13 +374,50 @@ struct gve_txq_stats {
counter_u64_t tpackets;
counter_u64_t tso_packet_cnt;
counter_u64_t tx_dropped_pkt;
- counter_u64_t tx_dropped_pkt_nospace_device;
+ counter_u64_t tx_delayed_pkt_nospace_device;
counter_u64_t tx_dropped_pkt_nospace_bufring;
+ counter_u64_t tx_delayed_pkt_nospace_descring;
+ counter_u64_t tx_delayed_pkt_nospace_compring;
+ counter_u64_t tx_delayed_pkt_nospace_qpl_bufs;
+ counter_u64_t tx_delayed_pkt_tsoerr;
counter_u64_t tx_dropped_pkt_vlan;
+ counter_u64_t tx_mbuf_collapse;
+ counter_u64_t tx_mbuf_defrag;
+ counter_u64_t tx_mbuf_defrag_err;
+ counter_u64_t tx_mbuf_dmamap_enomem_err;
+ counter_u64_t tx_mbuf_dmamap_err;
+ counter_u64_t tx_timeout;
};
#define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t))
+struct gve_tx_pending_pkt_dqo {
+ struct mbuf *mbuf;
+
+	/*
+	 * Time at which the xmit tq places descriptors for the mbuf's payload
+	 * on a tx queue. This timestamp is invalidated when the mbuf is freed
+	 * and must be checked for validity when read.
+	 */
+ int64_t enqueue_time_sec;
+
+ union {
+ /* RDA */
+ bus_dmamap_t dmamap;
+ /* QPL */
+ struct {
+ /*
+ * A linked list of entries from qpl_bufs that served
+ * as the bounce buffer for this packet.
+ */
+ int32_t qpl_buf_head;
+ uint32_t num_qpl_bufs;
+ };
+ };
+ uint8_t state; /* the gve_packet_state enum */
+ int next; /* To chain the free_pending_pkts lists */
+};
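Note that `next` chains array indices rather than pointers, with -1 as the
terminator (completion tags index into the pending_pkts array declared
further down). A hedged pop from such an index-chained list:

static inline int32_t
gve_pending_pkt_pop_sketch(struct gve_tx_pending_pkt_dqo *pkts, int32_t *head)
{
	int32_t idx = *head;

	if (idx != -1)
		*head = pkts[idx].next;	/* -1 terminates the chain */
	return (idx);
}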
+
/* power-of-2 sized transmit ring */
struct gve_tx_ring {
struct gve_ring_com com;
@@ -289,23 +425,134 @@ struct gve_tx_ring {
struct task xmit_task;
struct taskqueue *xmit_tq;
+ bool stopped;
+
+ /* Accessed when writing descriptors */
+ struct buf_ring *br;
+ struct mtx ring_mtx;
+
+	uint32_t req; /* free-running total number of packets written to the NIC */
+ uint32_t done; /* free-running total number of completed packets */
+
+ int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */
+
+ union {
+ /* GQI specific stuff */
+ struct {
+ union gve_tx_desc *desc_ring;
+ struct gve_tx_buffer_state *info;
+
+ struct gve_tx_fifo fifo;
+
+ uint32_t mask; /* masks the req and done to the size of the ring */
+ };
+
+ /* DQO specific stuff */
+ struct {
+ struct gve_dma_handle compl_ring_mem;
+
+ /* Accessed when writing descriptors */
+ struct {
+ union gve_tx_desc_dqo *desc_ring;
+ uint32_t desc_mask; /* masks head and tail to the size of desc_ring */
+ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */
+ uint32_t desc_tail; /* last desc written by driver */
+ uint32_t last_re_idx; /* desc which last had "report event" set */
+
+ /*
+ * The head index of a singly linked list containing pending packet objects
+ * to park mbufs till the NIC sends completions. Once this list is depleted,
+ * the "_prd" suffixed producer list, grown by the completion taskqueue,
+ * is stolen.
+ */
+ int32_t free_pending_pkts_csm;
+
+ /*
+ * The head index of a singly linked list representing QPL page fragments
+ * to copy mbuf payload into for the NIC to see. Once this list is depleted,
+ * the "_prd" suffixed producer list, grown by the completion taskqueue,
+ * is stolen.
+ *
+ * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+ */
+ int32_t free_qpl_bufs_csm;
+ uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */
+ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */
+
+ /* DMA params for mapping Tx mbufs. Only used in RDA mode. */
+ bus_dma_tag_t buf_dmatag;
+ } __aligned(CACHE_LINE_SIZE);
+
+ /* Accessed when processing completions */
+ struct {
+ struct gve_tx_compl_desc_dqo *compl_ring;
+ uint32_t compl_mask; /* masks head to the size of compl_ring */
+ uint32_t compl_head; /* last completion read by driver */
+ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */
+ uint32_t hw_tx_head; /* last desc read by NIC */
+
+ /*
+ * The completion taskqueue moves pending-packet objects to this
+ * list after freeing the mbuf. The "_prd" denotes that this is
+ * a producer list. The transmit taskqueue steals this list once
+ * its consumer list, with the "_csm" suffix, is depleted.
+ */
+ int32_t free_pending_pkts_prd;
+
+ /*
+ * The completion taskqueue moves the QPL pages corresponding to a
+ * completed packet into this list. It is only used in QPL mode.
+ * The "_prd" denotes that this is a producer list. The transmit
+ * taskqueue steals this list once its consumer list, with the "_csm"
+ * suffix, is depleted.
+ *
+ * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+ */
+ int32_t free_qpl_bufs_prd;
+ uint32_t qpl_bufs_produced;
+ } __aligned(CACHE_LINE_SIZE);
+
+ /* Accessed by both the completion and xmit loops */
+ struct {
+ /* completion tags index into this array */
+ struct gve_tx_pending_pkt_dqo *pending_pkts;
+ uint16_t num_pending_pkts;
+
+ /*
+ * Represents QPL page fragments. An index into this array
+ * always represents the same QPL page fragment. The value
+			 * is also an index into this array and serves as a means
+			 * to chain buffers into linked lists whose heads are
+			 * free_qpl_bufs_prd, free_qpl_bufs_csm, or
+			 * qpl_bufs_head.
+ */
+ int32_t *qpl_bufs;
+ } __aligned(CACHE_LINE_SIZE);
+ } dqo;
+ };
+ struct gve_txq_stats stats;
+} __aligned(CACHE_LINE_SIZE);
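The producer/consumer handoff described in the comments above amounts to an
atomic steal: when a "_csm" consumer list runs dry, the xmit taskqueue swaps
the "_prd" producer head with -1 and claims the whole chain in one shot,
which is why the heads are int32_t. A minimal sketch using atomic_swap_32()
from machine/atomic.h:

static inline int32_t
gve_steal_prd_list_sketch(int32_t *prd_head)
{
	/* Claim the entire producer chain, leaving an empty (-1) list. */
	return ((int32_t)atomic_swap_32((volatile uint32_t *)prd_head,
	    (uint32_t)-1));
}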
- /* accessed in the transmit hot path */
- struct {
- union gve_tx_desc *desc_ring;
- struct gve_tx_buffer_state *info;
- struct buf_ring *br;
-
- struct gve_tx_fifo fifo;
- struct mtx ring_mtx;
+enum gve_packet_state {
+ /*
+ * Packet does not yet have a dmamap created.
+ * This should always be zero since state is not explicitly initialized.
+ */
+ GVE_PACKET_STATE_UNALLOCATED,
+ /* Packet has a dmamap and is in free list, available to be allocated. */
+ GVE_PACKET_STATE_FREE,
+ /* Packet is expecting a regular data completion */
+ GVE_PACKET_STATE_PENDING_DATA_COMPL,
+};
- uint32_t req; /* free-running total number of packets written to the nic */
- uint32_t done; /* free-running total number of completed packets */
- uint32_t mask; /* masks the req and done to the size of the ring */
- struct gve_txq_stats stats;
- } __aligned(CACHE_LINE_SIZE);
+struct gve_ptype {
+ uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */
+ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */
+};
-} __aligned(CACHE_LINE_SIZE);
+struct gve_ptype_lut {
+ struct gve_ptype ptypes[GVE_NUM_PTYPES];
+};
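Given that PTYPEs are 10 bits, an RX completion's packet type indexes the LUT
directly; a hedged lookup:

static inline struct gve_ptype
gve_ptype_lookup_sketch(const struct gve_ptype_lut *lut, uint16_t ptype)
{
	/* GVE_NUM_PTYPES - 1 == 0x3ff masks the 10-bit packet type. */
	return (lut->ptypes[ptype & (GVE_NUM_PTYPES - 1)]);
}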
struct gve_priv {
if_t ifp;
@@ -326,12 +573,17 @@ struct gve_priv {
uint16_t num_event_counters;
uint16_t default_num_queues;
uint16_t tx_desc_cnt;
+ uint16_t max_tx_desc_cnt;
+ uint16_t min_tx_desc_cnt;
uint16_t rx_desc_cnt;
+ uint16_t max_rx_desc_cnt;
+ uint16_t min_rx_desc_cnt;
uint16_t rx_pages_per_qpl;
uint64_t max_registered_pages;
uint64_t num_registered_pages;
uint32_t supported_features;
uint16_t max_mtu;
+ bool modify_ringsize_enabled;
struct gve_dma_handle counter_array_mem;
__be32 *counters;
@@ -339,7 +591,6 @@ struct gve_priv {
struct gve_irq_db *irq_db_indices;
enum gve_queue_format queue_format;
- struct gve_queue_page_list *qpls;
struct gve_queue_config tx_cfg;
struct gve_queue_config rx_cfg;
uint32_t num_queues;
@@ -348,6 +599,8 @@ struct gve_priv {
struct gve_tx_ring *tx;
struct gve_rx_ring *rx;
+ struct gve_ptype_lut *ptype_lut_dqo;
+
/*
* Admin queue - see gve_adminq.h
* Since AQ cmds do not run in steady state, 32 bit counters suffice
@@ -370,6 +623,7 @@ struct gve_priv {
uint32_t adminq_dcfg_device_resources_cnt;
uint32_t adminq_set_driver_parameter_cnt;
uint32_t adminq_verify_driver_compatibility_cnt;
+ uint32_t adminq_get_ptype_map_cnt;
uint32_t interface_up_cnt;
uint32_t interface_down_cnt;
@@ -380,6 +634,12 @@ struct gve_priv {
struct gve_state_flags state_flags;
struct sx gve_iface_lock;
+
+ struct callout tx_timeout_service;
+	/* The index of the tx queue that the timeout service will check on its next invocation */
+ uint16_t check_tx_queue_idx;
+
+ uint16_t rx_buf_size_dqo;
};
static inline bool
@@ -400,39 +660,89 @@ gve_clear_state_flag(struct gve_priv *priv, int pos)
BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags);
}
+static inline bool
+gve_is_gqi(struct gve_priv *priv)
+{
+ return (priv->queue_format == GVE_GQI_QPL_FORMAT);
+}
+
+static inline bool
+gve_is_qpl(struct gve_priv *priv)
+{
+ return (priv->queue_format == GVE_GQI_QPL_FORMAT ||
+ priv->queue_format == GVE_DQO_QPL_FORMAT);
+}
+
+static inline bool
+gve_is_4k_rx_buf(struct gve_priv *priv)
+{
+ return (priv->rx_buf_size_dqo == GVE_4K_RX_BUFFER_SIZE_DQO);
+}
+
+static inline bus_size_t
+gve_rx_dqo_mbuf_segment_size(struct gve_priv *priv)
+{
+ return (gve_is_4k_rx_buf(priv) ? MJUMPAGESIZE : MCLBYTES);
+}
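A typical use of the helper above when refilling the DQO RX ring; m_getjcl()
is the stock sys/mbuf.h API, while the surrounding helper is only a sketch:

static inline struct mbuf *
gve_rx_alloc_mbuf_sketch(struct gve_priv *priv)
{
	/* Grab a 2K or 4K cluster to match the configured NIC buffer size. */
	return (m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
	    gve_rx_dqo_mbuf_segment_size(priv)));
}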
+
/* Defined in gve_main.c */
void gve_schedule_reset(struct gve_priv *priv);
+int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt);
+int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt);
+int gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx);
/* Register access functions defined in gve_utils.c */
uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset);
void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
+void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
/* QPL (Queue Page List) functions defined in gve_qpl.c */
-int gve_alloc_qpls(struct gve_priv *priv);
-void gve_free_qpls(struct gve_priv *priv);
+struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id,
+ int npages, bool single_kva);
+void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl);
int gve_register_qpls(struct gve_priv *priv);
int gve_unregister_qpls(struct gve_priv *priv);
+void gve_mextadd_free(struct mbuf *mbuf);
/* TX functions defined in gve_tx.c */
-int gve_alloc_tx_rings(struct gve_priv *priv);
-void gve_free_tx_rings(struct gve_priv *priv);
+int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
+void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
int gve_create_tx_rings(struct gve_priv *priv);
int gve_destroy_tx_rings(struct gve_priv *priv);
+int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx);
int gve_tx_intr(void *arg);
int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf);
void gve_qflush(if_t ifp);
void gve_xmit_tq(void *arg, int pending);
void gve_tx_cleanup_tq(void *arg, int pending);
+/* TX functions defined in gve_tx_dqo.c */
+int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i);
+void gve_tx_free_ring_dqo(struct gve_priv *priv, int i);
+void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i);
+int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx);
+int gve_tx_intr_dqo(void *arg);
+int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr);
+int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf);
+void gve_tx_cleanup_tq_dqo(void *arg, int pending);
+
/* RX functions defined in gve_rx.c */
-int gve_alloc_rx_rings(struct gve_priv *priv);
-void gve_free_rx_rings(struct gve_priv *priv);
+int gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
+void gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx);
int gve_create_rx_rings(struct gve_priv *priv);
int gve_destroy_rx_rings(struct gve_priv *priv);
int gve_rx_intr(void *arg);
void gve_rx_cleanup_tq(void *arg, int pending);
+/* RX functions defined in gve_rx_dqo.c */
+int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i);
+void gve_rx_free_ring_dqo(struct gve_priv *priv, int i);
+void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx);
+void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i);
+int gve_rx_intr_dqo(void *arg);
+void gve_rx_cleanup_tq_dqo(void *arg, int pending);
+
/* DMA functions defined in gve_utils.c */
int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align,
struct gve_dma_handle *dma);
@@ -447,7 +757,17 @@ int gve_alloc_irqs(struct gve_priv *priv);
void gve_unmask_all_queue_irqs(struct gve_priv *priv);
void gve_mask_all_queue_irqs(struct gve_priv *priv);
-/* Systcl functions defined in gve_sysctl.c*/
+/* Miscellaneous functions defined in gve_utils.c */
+void gve_invalidate_timestamp(int64_t *timestamp_sec);
+int64_t gve_seconds_since(int64_t *timestamp_sec);
+void gve_set_timestamp(int64_t *timestamp_sec);
+bool gve_timestamp_valid(int64_t *timestamp_sec);
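Plausible shapes for these helpers, given GVE_TIMESTAMP_INVALID above and the
kernel's time_uptime global; the real definitions live in gve_utils.c and may
differ (for instance, in how atomically they read the clock):

static inline void
gve_set_timestamp_sketch(int64_t *ts)
{
	*ts = (int64_t)time_uptime;	/* seconds since boot */
}

static inline int64_t
gve_seconds_since_sketch(int64_t *ts)
{
	return ((int64_t)time_uptime - *ts);
}

static inline bool
gve_timestamp_valid_sketch(int64_t *ts)
{
	return (*ts != GVE_TIMESTAMP_INVALID);
}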
+
+/* Sysctl functions defined in gve_sysctl.c */
+extern bool gve_disable_hw_lro;
+extern bool gve_allow_4k_rx_buffers;
+extern char gve_queue_format[8];
+extern char gve_version[8];
void gve_setup_sysctl(struct gve_priv *priv);
void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets,
uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets,