Diffstat (limited to 'sys/dev/gve')
-rw-r--r-- | sys/dev/gve/gve.h        |  400
-rw-r--r-- | sys/dev/gve/gve_adminq.c |  215
-rw-r--r-- | sys/dev/gve/gve_adminq.h |   84
-rw-r--r-- | sys/dev/gve/gve_desc.h   |    4
-rw-r--r-- | sys/dev/gve/gve_dqo.h    |  337
-rw-r--r-- | sys/dev/gve/gve_main.c   |  381
-rw-r--r-- | sys/dev/gve/gve_plat.h   |    3
-rw-r--r-- | sys/dev/gve/gve_qpl.c    |  187
-rw-r--r-- | sys/dev/gve/gve_rx.c     |  163
-rw-r--r-- | sys/dev/gve/gve_rx_dqo.c | 1035
-rw-r--r-- | sys/dev/gve/gve_sysctl.c |  252
-rw-r--r-- | sys/dev/gve/gve_tx.c     |  269
-rw-r--r-- | sys/dev/gve/gve_tx_dqo.c | 1149
-rw-r--r-- | sys/dev/gve/gve_utils.c  |   95
14 files changed, 4228 insertions, 346 deletions
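
A note for readers of the DQO additions below: several new gve_tx_ring fields (dqo.free_qpl_bufs_csm/_prd, dqo.free_pending_pkts_csm/_prd, and the per-packet "next" field) implement singly linked lists that chain array indices instead of pointers, which is what lets the xmit and completion taskqueues hand whole lists back and forth with a single atomic swap of the head. The following is a minimal illustrative sketch of that index-chaining pattern only; the helper names and the -1 list terminator are assumptions for the example and are not code from this patch.

#include <stdint.h>

/*
 * Illustrative sketch: an index-chained free list over an int32_t array,
 * in the spirit of the comments on gve_tx_ring's dqo.qpl_bufs. Each array
 * slot holds the index of the next free entry; -1 (assumed here) ends the
 * list.
 */
static void
example_list_push(int32_t *chain, int32_t *head, int32_t idx)
{
	chain[idx] = *head;	/* new entry points at the old head */
	*head = idx;
}

static int32_t
example_list_pop(int32_t *chain, int32_t *head)
{
	int32_t idx = *head;

	if (idx == -1)		/* list is empty */
		return (-1);
	*head = chain[idx];	/* advance the head to the chained index */
	return (idx);
}
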
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index c446199dff2d..64c2a0481817 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -47,12 +47,31 @@ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 +#define GVE_TX_TIMEOUT_PKT_SEC 5 +#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5 +/* + * If the driver finds timed out packets on a tx queue it first kicks it and + * records the time. If the driver again finds a timeout on the same queue + * before the end of the cooldown period, only then will it reset. Thus, for a + * reset to be able to occur at all, the cooldown must be at least as long + * as the tx timeout checking cadence multiplied by the number of queues. + */ +#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16 +#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \ + (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES) + +#define GVE_TIMESTAMP_INVALID -1 + #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 +#define GVE_4K_RX_BUFFER_SIZE_DQO 4096 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) +/* PTYPEs are always 10 bits. */ +#define GVE_NUM_PTYPES 1024 + /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in @@ -60,8 +79,17 @@ */ #define GVE_QPL_DIVISOR 16 +/* Ring Size Limits */ +#define GVE_DEFAULT_MIN_RX_RING_SIZE 512 +#define GVE_DEFAULT_MIN_TX_RING_SIZE 256 + static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); +_Static_assert(MCLBYTES >= GVE_DEFAULT_RX_BUFFER_SIZE, + "gve: bad MCLBYTES length"); +_Static_assert(MJUMPAGESIZE >= GVE_4K_RX_BUFFER_SIZE_DQO, + "gve: bad MJUMPAGESIZE length"); + struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; @@ -102,6 +130,7 @@ enum gve_queue_format { GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, + GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { @@ -223,31 +252,93 @@ struct gve_rxq_stats { counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; + counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; + counter_u64_t rx_mbuf_dmamap_err; + counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) +union gve_rx_qpl_buf_id_dqo { + struct { + uint16_t buf_id:11; /* Index into rx->dqo.bufs */ + uint8_t frag_num:5; /* Which frag in the QPL page */ + }; + uint16_t all; +} __packed; +_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, + "gve: bad dqo qpl rx buf id length"); + +struct gve_rx_buf_dqo { + union { + /* RDA */ + struct { + struct mbuf *mbuf; + bus_dmamap_t dmamap; + uint64_t addr; + bool mapped; + }; + /* QPL */ + struct { + uint8_t num_nic_frags; /* number of pending completions */ + uint8_t next_idx; /* index of the next frag to post */ + /* for chaining rx->dqo.used_bufs */ + STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; + }; + }; + /* for chaining rx->dqo.free_bufs */ + SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; +}; + /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; - struct gve_dma_handle data_ring_mem; - - /* accessed in the receive 
hot path */ - struct { - struct gve_rx_desc *desc_ring; - union gve_rx_data_slot *data_ring; - struct gve_rx_slot_page_info *page_info; - - struct gve_rx_ctx ctx; - struct lro_ctrl lro; - uint8_t seq_no; /* helps traverse the descriptor ring */ - uint32_t cnt; /* free-running total number of completed packets */ - uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ - uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ - struct gve_rxq_stats stats; - } __aligned(CACHE_LINE_SIZE); + uint32_t cnt; /* free-running total number of completed packets */ + uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ + + union { + /* GQI-only fields */ + struct { + struct gve_dma_handle data_ring_mem; + + /* accessed in the GQ receive hot path */ + struct gve_rx_desc *desc_ring; + union gve_rx_data_slot *data_ring; + struct gve_rx_slot_page_info *page_info; + uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ + uint8_t seq_no; /* helps traverse the descriptor ring */ + }; + + /* DQO-only fields */ + struct { + struct gve_dma_handle compl_ring_mem; + + struct gve_rx_compl_desc_dqo *compl_ring; + struct gve_rx_desc_dqo *desc_ring; + struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ + bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ + + uint32_t buf_cnt; /* Size of the bufs array */ + uint32_t mask; /* One less than the sizes of the desc and compl rings */ + uint32_t head; /* The index at which to post the next buffer at */ + uint32_t tail; /* The index at which to receive the next compl at */ + uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ + SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; + + /* + * Only used in QPL mode. Pages referred to by if_input-ed mbufs + * stay parked here till their wire count comes back to 1. + * Pages are moved here after there aren't any pending completions. + */ + STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; + } dqo; + }; + + struct lro_ctrl lro; + struct gve_rx_ctx ctx; + struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); @@ -267,6 +358,14 @@ struct gve_tx_fifo { struct gve_tx_buffer_state { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; @@ -275,13 +374,50 @@ struct gve_txq_stats { counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; - counter_u64_t tx_dropped_pkt_nospace_device; + counter_u64_t tx_delayed_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; + counter_u64_t tx_delayed_pkt_nospace_descring; + counter_u64_t tx_delayed_pkt_nospace_compring; + counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; + counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; + counter_u64_t tx_mbuf_collapse; + counter_u64_t tx_mbuf_defrag; + counter_u64_t tx_mbuf_defrag_err; + counter_u64_t tx_mbuf_dmamap_enomem_err; + counter_u64_t tx_mbuf_dmamap_err; + counter_u64_t tx_timeout; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) +struct gve_tx_pending_pkt_dqo { + struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. 
+ */ + int64_t enqueue_time_sec; + + union { + /* RDA */ + bus_dmamap_t dmamap; + /* QPL */ + struct { + /* + * A linked list of entries from qpl_bufs that served + * as the bounce buffer for this packet. + */ + int32_t qpl_buf_head; + uint32_t num_qpl_bufs; + }; + }; + uint8_t state; /* the gve_packet_state enum */ + int next; /* To chain the free_pending_pkts lists */ +}; + /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; @@ -289,23 +425,134 @@ struct gve_tx_ring { struct task xmit_task; struct taskqueue *xmit_tq; + bool stopped; + + /* Accessed when writing descriptors */ + struct buf_ring *br; + struct mtx ring_mtx; + + uint32_t req; /* free-running total number of packets written to the nic */ + uint32_t done; /* free-running total number of completed packets */ + + int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */ + + union { + /* GQI specific stuff */ + struct { + union gve_tx_desc *desc_ring; + struct gve_tx_buffer_state *info; + + struct gve_tx_fifo fifo; + + uint32_t mask; /* masks the req and done to the size of the ring */ + }; + + /* DQO specific stuff */ + struct { + struct gve_dma_handle compl_ring_mem; + + /* Accessed when writing descriptors */ + struct { + union gve_tx_desc_dqo *desc_ring; + uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ + uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ + uint32_t desc_tail; /* last desc written by driver */ + uint32_t last_re_idx; /* desc which last had "report event" set */ + + /* + * The head index of a singly linked list containing pending packet objects + * to park mbufs till the NIC sends completions. Once this list is depleted, + * the "_prd" suffixed producer list, grown by the completion taskqueue, + * is stolen. + */ + int32_t free_pending_pkts_csm; + + /* + * The head index of a singly linked list representing QPL page fragments + * to copy mbuf payload into for the NIC to see. Once this list is depleted, + * the "_prd" suffixed producer list, grown by the completion taskqueue, + * is stolen. + * + * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. + */ + int32_t free_qpl_bufs_csm; + uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ + uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ + + /* DMA params for mapping Tx mbufs. Only used in RDA mode. */ + bus_dma_tag_t buf_dmatag; + } __aligned(CACHE_LINE_SIZE); + + /* Accessed when processing completions */ + struct { + struct gve_tx_compl_desc_dqo *compl_ring; + uint32_t compl_mask; /* masks head to the size of compl_ring */ + uint32_t compl_head; /* last completion read by driver */ + uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ + uint32_t hw_tx_head; /* last desc read by NIC */ + + /* + * The completion taskqueue moves pending-packet objects to this + * list after freeing the mbuf. The "_prd" denotes that this is + * a producer list. The transmit taskqueue steals this list once + * its consumer list, with the "_csm" suffix, is depleted. + */ + int32_t free_pending_pkts_prd; + + /* + * The completion taskqueue moves the QPL pages corresponding to a + * completed packet into this list. It is only used in QPL mode. + * The "_prd" denotes that this is a producer list. The transmit + * taskqueue steals this list once its consumer list, with the "_csm" + * suffix, is depleted. + * + * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. 
+ */ + int32_t free_qpl_bufs_prd; + uint32_t qpl_bufs_produced; + } __aligned(CACHE_LINE_SIZE); + + /* Accessed by both the completion and xmit loops */ + struct { + /* completion tags index into this array */ + struct gve_tx_pending_pkt_dqo *pending_pkts; + uint16_t num_pending_pkts; + + /* + * Represents QPL page fragments. An index into this array + * always represents the same QPL page fragment. The value + * is also an index into this array and servers as a means + * to chain buffers into linked lists whose heads are + * either free_qpl_bufs_prd or free_qpl_bufs_csm or + * qpl_bufs_head. + */ + int32_t *qpl_bufs; + } __aligned(CACHE_LINE_SIZE); + } dqo; + }; + struct gve_txq_stats stats; +} __aligned(CACHE_LINE_SIZE); - /* accessed in the transmit hot path */ - struct { - union gve_tx_desc *desc_ring; - struct gve_tx_buffer_state *info; - struct buf_ring *br; - - struct gve_tx_fifo fifo; - struct mtx ring_mtx; +enum gve_packet_state { + /* + * Packet does not yet have a dmamap created. + * This should always be zero since state is not explicitly initialized. + */ + GVE_PACKET_STATE_UNALLOCATED, + /* Packet has a dmamap and is in free list, available to be allocated. */ + GVE_PACKET_STATE_FREE, + /* Packet is expecting a regular data completion */ + GVE_PACKET_STATE_PENDING_DATA_COMPL, +}; - uint32_t req; /* free-running total number of packets written to the nic */ - uint32_t done; /* free-running total number of completed packets */ - uint32_t mask; /* masks the req and done to the size of the ring */ - struct gve_txq_stats stats; - } __aligned(CACHE_LINE_SIZE); +struct gve_ptype { + uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ + uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ +}; -} __aligned(CACHE_LINE_SIZE); +struct gve_ptype_lut { + struct gve_ptype ptypes[GVE_NUM_PTYPES]; +}; struct gve_priv { if_t ifp; @@ -326,12 +573,17 @@ struct gve_priv { uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; + uint16_t max_tx_desc_cnt; + uint16_t min_tx_desc_cnt; uint16_t rx_desc_cnt; + uint16_t max_rx_desc_cnt; + uint16_t min_rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; + bool modify_ringsize_enabled; struct gve_dma_handle counter_array_mem; __be32 *counters; @@ -339,7 +591,6 @@ struct gve_priv { struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; - struct gve_queue_page_list *qpls; struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; uint32_t num_queues; @@ -348,6 +599,8 @@ struct gve_priv { struct gve_tx_ring *tx; struct gve_rx_ring *rx; + struct gve_ptype_lut *ptype_lut_dqo; + /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice @@ -370,6 +623,7 @@ struct gve_priv { uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; + uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; @@ -380,6 +634,12 @@ struct gve_priv { struct gve_state_flags state_flags; struct sx gve_iface_lock; + + struct callout tx_timeout_service; + /* The index of tx queue that the timer service will check on its next invocation */ + uint16_t check_tx_queue_idx; + + uint16_t rx_buf_size_dqo; }; static inline bool @@ -400,39 +660,89 @@ gve_clear_state_flag(struct gve_priv *priv, int pos) BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } +static inline bool +gve_is_gqi(struct 
gve_priv *priv) +{ + return (priv->queue_format == GVE_GQI_QPL_FORMAT); +} + +static inline bool +gve_is_qpl(struct gve_priv *priv) +{ + return (priv->queue_format == GVE_GQI_QPL_FORMAT || + priv->queue_format == GVE_DQO_QPL_FORMAT); +} + +static inline bool +gve_is_4k_rx_buf(struct gve_priv *priv) +{ + return (priv->rx_buf_size_dqo == GVE_4K_RX_BUFFER_SIZE_DQO); +} + +static inline bus_size_t +gve_rx_dqo_mbuf_segment_size(struct gve_priv *priv) +{ + return (gve_is_4k_rx_buf(priv) ? MJUMPAGESIZE : MCLBYTES); +} + /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); +int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); +int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); +int gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); +void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions defined in gve_qpl.c */ -int gve_alloc_qpls(struct gve_priv *priv); -void gve_free_qpls(struct gve_priv *priv); +struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id, + int npages, bool single_kva); +void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); +void gve_mextadd_free(struct mbuf *mbuf); /* TX functions defined in gve_tx.c */ -int gve_alloc_tx_rings(struct gve_priv *priv); -void gve_free_tx_rings(struct gve_priv *priv); +int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); +void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); +/* TX functions defined in gve_tx_dqo.c */ +int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); +void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); +void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); +int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx); +int gve_tx_intr_dqo(void *arg); +int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); +int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); +void gve_tx_cleanup_tq_dqo(void *arg, int pending); + /* RX functions defined in gve_rx.c */ -int gve_alloc_rx_rings(struct gve_priv *priv); -void gve_free_rx_rings(struct gve_priv *priv); +int gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); +void gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); +/* RX functions defined in gve_rx_dqo.c */ +int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); +void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); +void gve_rx_prefill_buffers_dqo(struct 
gve_rx_ring *rx); +void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); +int gve_rx_intr_dqo(void *arg); +void gve_rx_cleanup_tq_dqo(void *arg, int pending); + /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); @@ -447,7 +757,17 @@ int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); -/* Systcl functions defined in gve_sysctl.c*/ +/* Miscellaneous functions defined in gve_utils.c */ +void gve_invalidate_timestamp(int64_t *timestamp_sec); +int64_t gve_seconds_since(int64_t *timestamp_sec); +void gve_set_timestamp(int64_t *timestamp_sec); +bool gve_timestamp_valid(int64_t *timestamp_sec); + +/* Systcl functions defined in gve_sysctl.c */ +extern bool gve_disable_hw_lro; +extern bool gve_allow_4k_rx_buffers; +extern char gve_queue_format[8]; +extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c index 3c332607ebd4..9b59570a2af4 100644 --- a/sys/dev/gve/gve_adminq.c +++ b/sys/dev/gve/gve_adminq.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -57,6 +57,9 @@ void gve_parse_device_option(struct gve_priv *priv, struct gve_device_descriptor *device_descriptor, struct gve_device_option *option, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, + struct gve_device_option_modify_ring **dev_op_modify_ring, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { uint32_t req_feat_mask = be32toh(option->required_features_mask); @@ -85,6 +88,68 @@ void gve_parse_device_option(struct gve_priv *priv, *dev_op_gqi_qpl = (void *)(option + 1); break; + case GVE_DEV_OPT_ID_DQO_RDA: + if (option_length < sizeof(**dev_op_dqo_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO RDA", (int)sizeof(**dev_op_dqo_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_rda)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "DQO RDA"); + } + *dev_op_dqo_rda = (void *)(option + 1); + break; + + case GVE_DEV_OPT_ID_DQO_QPL: + if (option_length < sizeof(**dev_op_dqo_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO QPL", (int)sizeof(**dev_op_dqo_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_qpl)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "DQO QPL"); + } + *dev_op_dqo_qpl = (void *)(option + 1); + break; + + case GVE_DEV_OPT_ID_MODIFY_RING: + if (option_length < (sizeof(**dev_op_modify_ring) - + sizeof(struct gve_ring_size_bound)) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Modify Ring", (int)sizeof(**dev_op_modify_ring), + 
GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_modify_ring)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "Modify Ring"); + } + *dev_op_modify_ring = (void *)(option + 1); + + /* Min ring size included; set the minimum ring size. */ + if (option_length == sizeof(**dev_op_modify_ring)) { + priv->min_rx_desc_cnt = max( + be16toh((*dev_op_modify_ring)->min_ring_size.rx), + GVE_DEFAULT_MIN_RX_RING_SIZE); + priv->min_tx_desc_cnt = max( + be16toh((*dev_op_modify_ring)->min_ring_size.tx), + GVE_DEFAULT_MIN_TX_RING_SIZE); + } + break; + case GVE_DEV_OPT_ID_JUMBO_FRAMES: if (option_length < sizeof(**dev_op_jumbo_frames) || req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { @@ -117,6 +182,9 @@ static int gve_process_device_options(struct gve_priv *priv, struct gve_device_descriptor *descriptor, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, + struct gve_device_option_modify_ring **dev_op_modify_ring, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); @@ -130,12 +198,16 @@ gve_process_device_options(struct gve_priv *priv, if ((char *)(dev_opt + 1) > desc_end || (char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) { device_printf(priv->dev, - "options exceed device_descriptor's total length.\n"); + "options exceed device descriptor's total length.\n"); return (EINVAL); } gve_parse_device_option(priv, descriptor, dev_opt, - dev_op_gqi_qpl, dev_op_jumbo_frames); + dev_op_gqi_qpl, + dev_op_dqo_rda, + dev_op_dqo_qpl, + dev_op_modify_ring, + dev_op_jumbo_frames); dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); } @@ -221,16 +293,38 @@ gve_adminq_create_rx_queue(struct gve_priv *priv, uint32_t queue_index) cmd.opcode = htobe32(GVE_ADMINQ_CREATE_RX_QUEUE); cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { .queue_id = htobe32(queue_index), - .index = htobe32(queue_index), .ntfy_id = htobe32(rx->com.ntfy_id), .queue_resources_addr = htobe64(qres_dma->bus_addr), - .rx_desc_ring_addr = htobe64(rx->desc_ring_mem.bus_addr), - .rx_data_ring_addr = htobe64(rx->data_ring_mem.bus_addr), - .queue_page_list_id = htobe32((rx->com.qpl)->id), .rx_ring_size = htobe16(priv->rx_desc_cnt), - .packet_buffer_size = htobe16(GVE_DEFAULT_RX_BUFFER_SIZE), }; + if (gve_is_gqi(priv)) { + cmd.create_rx_queue.rx_desc_ring_addr = + htobe64(rx->desc_ring_mem.bus_addr); + cmd.create_rx_queue.rx_data_ring_addr = + htobe64(rx->data_ring_mem.bus_addr); + cmd.create_rx_queue.index = + htobe32(queue_index); + cmd.create_rx_queue.queue_page_list_id = + htobe32((rx->com.qpl)->id); + cmd.create_rx_queue.packet_buffer_size = + htobe16(GVE_DEFAULT_RX_BUFFER_SIZE); + } else { + cmd.create_rx_queue.queue_page_list_id = + htobe32(GVE_RAW_ADDRESSING_QPL_ID); + cmd.create_rx_queue.rx_desc_ring_addr = + htobe64(rx->dqo.compl_ring_mem.bus_addr); + cmd.create_rx_queue.rx_data_ring_addr = + htobe64(rx->desc_ring_mem.bus_addr); + cmd.create_rx_queue.rx_buff_ring_size = + htobe16(priv->rx_desc_cnt); + cmd.create_rx_queue.enable_rsc = + !!((if_getcapenable(priv->ifp) & IFCAP_LRO) && + !gve_disable_hw_lro); + cmd.create_rx_queue.packet_buffer_size = + htobe16(priv->rx_buf_size_dqo); + } + return (gve_adminq_execute_cmd(priv, &cmd)); } @@ -272,11 +366,21 @@ gve_adminq_create_tx_queue(struct gve_priv *priv, uint32_t 
queue_index) .queue_id = htobe32(queue_index), .queue_resources_addr = htobe64(qres_dma->bus_addr), .tx_ring_addr = htobe64(tx->desc_ring_mem.bus_addr), - .queue_page_list_id = htobe32((tx->com.qpl)->id), .ntfy_id = htobe32(tx->com.ntfy_id), .tx_ring_size = htobe16(priv->tx_desc_cnt), }; + if (gve_is_gqi(priv)) { + cmd.create_tx_queue.queue_page_list_id = + htobe32((tx->com.qpl)->id); + } else { + cmd.create_tx_queue.queue_page_list_id = + htobe32(GVE_RAW_ADDRESSING_QPL_ID); + cmd.create_tx_queue.tx_comp_ring_addr = + htobe64(tx->dqo.compl_ring_mem.bus_addr); + cmd.create_tx_queue.tx_comp_ring_size = + htobe16(priv->tx_desc_cnt); + } return (gve_adminq_execute_cmd(priv, &cmd)); } @@ -320,8 +424,18 @@ gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu) { static void gve_enable_supported_features(struct gve_priv *priv, uint32_t supported_features_mask, + const struct gve_device_option_modify_ring *dev_op_modify_ring, const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames) { + if (dev_op_modify_ring && + (supported_features_mask & GVE_SUP_MODIFY_RING_MASK)) { + if (bootverbose) + device_printf(priv->dev, "MODIFY RING device option enabled.\n"); + priv->modify_ringsize_enabled = true; + priv->max_rx_desc_cnt = be16toh(dev_op_modify_ring->max_ring_size.rx); + priv->max_tx_desc_cnt = be16toh(dev_op_modify_ring->max_ring_size.tx); + } + if (dev_op_jumbo_frames && (supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) { if (bootverbose) @@ -338,6 +452,9 @@ gve_adminq_describe_device(struct gve_priv *priv) struct gve_device_descriptor *desc; struct gve_dma_handle desc_mem; struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; + struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL; + struct gve_device_option_modify_ring *dev_op_modify_ring = NULL; struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; uint32_t supported_features_mask = 0; int rc; @@ -366,12 +483,40 @@ gve_adminq_describe_device(struct gve_priv *priv) bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); - rc = gve_process_device_options(priv, desc, &dev_op_gqi_qpl, + /* Default min in case device options don't have min values */ + priv->min_rx_desc_cnt = GVE_DEFAULT_MIN_RX_RING_SIZE; + priv->min_tx_desc_cnt = GVE_DEFAULT_MIN_TX_RING_SIZE; + + rc = gve_process_device_options(priv, desc, + &dev_op_gqi_qpl, + &dev_op_dqo_rda, + &dev_op_dqo_qpl, + &dev_op_modify_ring, &dev_op_jumbo_frames); if (rc != 0) goto free_device_descriptor; - if (dev_op_gqi_qpl != NULL) { + if (dev_op_dqo_rda != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "DQO RDA"); + priv->queue_format = GVE_DQO_RDA_FORMAT; + supported_features_mask = be32toh( + dev_op_dqo_rda->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with DQO RDA queue format.\n"); + } else if (dev_op_dqo_qpl != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "DQO QPL"); + priv->queue_format = GVE_DQO_QPL_FORMAT; + supported_features_mask = be32toh( + dev_op_dqo_qpl->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with DQO QPL queue format.\n"); + } else if (dev_op_gqi_qpl != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "GQI QPL"); priv->queue_format = GVE_GQI_QPL_FORMAT; supported_features_mask = be32toh( dev_op_gqi_qpl->supported_features_mask); @@ -380,7 +525,7 @@ gve_adminq_describe_device(struct gve_priv *priv) "Driver is running 
with GQI QPL queue format.\n"); } else { device_printf(priv->dev, "No compatible queue formats\n"); - rc = (EINVAL); + rc = EINVAL; goto free_device_descriptor; } @@ -394,8 +539,12 @@ gve_adminq_describe_device(struct gve_priv *priv) priv->default_num_queues = be16toh(desc->default_num_queues); priv->supported_features = supported_features_mask; + /* Default max to current in case modify ring size option is disabled */ + priv->max_rx_desc_cnt = priv->rx_desc_cnt; + priv->max_tx_desc_cnt = priv->tx_desc_cnt; + gve_enable_supported_features(priv, supported_features_mask, - dev_op_jumbo_frames); + dev_op_modify_ring, dev_op_jumbo_frames); for (i = 0; i < ETHER_ADDR_LEN; i++) priv->mac[i] = desc->mac[i]; @@ -507,6 +656,41 @@ gve_adminq_verify_driver_compatibility(struct gve_priv *priv, } int +gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut_dqo) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + struct gve_ptype_map *ptype_map; + struct gve_dma_handle dma; + int err = 0; + int i; + + err = gve_dma_alloc_coherent(priv, sizeof(*ptype_map), PAGE_SIZE, &dma); + if (err) + return (err); + ptype_map = dma.cpu_addr; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_GET_PTYPE_MAP); + aq_cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) { + .ptype_map_len = htobe64(sizeof(*ptype_map)), + .ptype_map_addr = htobe64(dma.bus_addr), + }; + + err = gve_adminq_execute_cmd(priv, &aq_cmd); + if (err) + goto err; + + /* Populate ptype_lut_dqo. */ + for (i = 0; i < GVE_NUM_PTYPES; i++) { + ptype_lut_dqo->ptypes[i].l3_type = ptype_map->ptypes[i].l3_type; + ptype_lut_dqo->ptypes[i].l4_type = ptype_map->ptypes[i].l4_type; + } +err: + gve_dma_free_coherent(&dma); + return (err); +} + +int gve_adminq_alloc(struct gve_priv *priv) { int rc; @@ -543,6 +727,7 @@ gve_adminq_alloc(struct gve_priv *priv) priv->adminq_destroy_rx_queue_cnt = 0; priv->adminq_dcfg_device_resources_cnt = 0; priv->adminq_set_driver_parameter_cnt = 0; + priv->adminq_get_ptype_map_cnt = 0; gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, priv->adminq_bus_addr / ADMINQ_SIZE); @@ -772,6 +957,10 @@ gve_adminq_issue_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig) priv->adminq_verify_driver_compatibility_cnt++; break; + case GVE_ADMINQ_GET_PTYPE_MAP: + priv->adminq_get_ptype_map_cnt++; + break; + default: device_printf(priv->dev, "Unknown AQ command opcode %d\n", opcode); } diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h index 5923e5f353d1..531a844f7d90 100644 --- a/sys/dev/gve/gve_adminq.h +++ b/sys/dev/gve/gve_adminq.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -137,18 +137,37 @@ _Static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4, struct gve_device_option_dqo_rda { __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; }; -_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 4, +_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8, + "gve: bad admin queue struct length"); + +struct gve_device_option_dqo_qpl { + __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; +}; + +_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8, + "gve: bad admin queue struct length"); + +struct gve_ring_size_bound { + __be16 rx; + __be16 tx; 
+}; + +_Static_assert(sizeof(struct gve_ring_size_bound) == 4, "gve: bad admin queue struct length"); struct gve_device_option_modify_ring { __be32 supported_features_mask; - __be16 max_rx_ring_size; - __be16 max_tx_ring_size; + struct gve_ring_size_bound max_ring_size; + struct gve_ring_size_bound min_ring_size; }; -_Static_assert(sizeof(struct gve_device_option_modify_ring) == 8, +_Static_assert(sizeof(struct gve_device_option_modify_ring) == 12, "gve: bad admin queue struct length"); struct gve_device_option_jumbo_frames { @@ -166,6 +185,7 @@ enum gve_dev_opt_id { GVE_DEV_OPT_ID_GQI_QPL = 0x3, GVE_DEV_OPT_ID_DQO_RDA = 0x4, GVE_DEV_OPT_ID_MODIFY_RING = 0x6, + GVE_DEV_OPT_ID_DQO_QPL = 0x7, GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, }; @@ -180,6 +200,7 @@ enum gve_dev_opt_req_feat_mask { GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, }; @@ -194,9 +215,8 @@ enum gve_sup_feature_mask { enum gve_driver_capability { gve_driver_capability_gqi_qpl = 0, gve_driver_capability_gqi_rda = 1, - gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ + gve_driver_capability_dqo_qpl = 2, gve_driver_capability_dqo_rda = 3, - gve_driver_capability_alt_miss_compl = 4, }; #define GVE_CAP1(a) BIT((int) a) @@ -209,7 +229,10 @@ enum gve_driver_capability { * Only a few bits (as shown in `gve_driver_compatibility`) are currently * defined. The rest are reserved for future use. */ -#define GVE_DRIVER_CAPABILITY_FLAGS1 (GVE_CAP1(gve_driver_capability_gqi_qpl)) +#define GVE_DRIVER_CAPABILITY_FLAGS1 \ + (GVE_CAP1(gve_driver_capability_gqi_qpl) | \ + GVE_CAP1(gve_driver_capability_dqo_qpl) | \ + GVE_CAP1(gve_driver_capability_dqo_rda)) #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS4 0x0 @@ -282,6 +305,8 @@ struct gve_adminq_create_tx_queue { _Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48, "gve: bad admin queue struct length"); +#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF + struct gve_adminq_create_rx_queue { __be32 queue_id; __be32 index; @@ -352,6 +377,24 @@ struct stats { _Static_assert(sizeof(struct stats) == 16, "gve: bad admin queue struct length"); +/* + * These are control path types for PTYPE which are the same as the data path + * types. + */ +struct gve_ptype_entry { + uint8_t l3_type; + uint8_t l4_type; +}; + +struct gve_ptype_map { + struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */ +}; + +struct gve_adminq_get_ptype_map { + __be64 ptype_map_len; + __be64 ptype_map_addr; +}; + struct gve_adminq_command { __be32 opcode; __be32 status; @@ -368,6 +411,7 @@ struct gve_adminq_command { struct gve_adminq_set_driver_parameter set_driver_param; struct gve_adminq_verify_driver_compatibility verify_driver_compatibility; + struct gve_adminq_get_ptype_map get_ptype_map; uint8_t reserved[56]; }; }; @@ -375,6 +419,24 @@ struct gve_adminq_command { _Static_assert(sizeof(struct gve_adminq_command) == 64, "gve: bad admin queue struct length"); +enum gve_l3_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L3_TYPE_UNKNOWN = 0, + GVE_L3_TYPE_OTHER, + GVE_L3_TYPE_IPV4, + GVE_L3_TYPE_IPV6, +}; + +enum gve_l4_type { + /* Must be zero so zero initialized LUT is unknown. 
*/ + GVE_L4_TYPE_UNKNOWN = 0, + GVE_L4_TYPE_OTHER, + GVE_L4_TYPE_TCP, + GVE_L4_TYPE_UDP, + GVE_L4_TYPE_ICMP, + GVE_L4_TYPE_SCTP, +}; + int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues); @@ -387,8 +449,10 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv); int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); void gve_release_adminq(struct gve_priv *priv); int gve_adminq_register_page_list(struct gve_priv *priv, - struct gve_queue_page_list *qpl); + struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id); int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, - uint64_t driver_info_len, vm_paddr_t driver_info_addr); + uint64_t driver_info_len, vm_paddr_t driver_info_addr); +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut); #endif /* _GVE_AQ_H_ */ diff --git a/sys/dev/gve/gve_desc.h b/sys/dev/gve/gve_desc.h index 5f09cc8b77b8..48c4ac27596b 100644 --- a/sys/dev/gve/gve_desc.h +++ b/sys/dev/gve/gve_desc.h @@ -130,10 +130,10 @@ union gve_rx_data_slot { __be64 addr; }; -/* GVE Recive Packet Descriptor Seq No */ +/* GVE Receive Packet Descriptor Seq No */ #define GVE_SEQNO(x) (be16toh(x) & 0x7) -/* GVE Recive Packet Descriptor Flags */ +/* GVE Receive Packet Descriptor Flags */ #define GVE_RXFLG(x) htobe16(1 << (3 + (x))) #define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ #define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h new file mode 100644 index 000000000000..542f8ff7d888 --- /dev/null +++ b/sys/dev/gve/gve_dqo.h @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* GVE DQO Descriptor formats */ + +#ifndef _GVE_DESC_DQO_H_ +#define _GVE_DESC_DQO_H_ + +#include "gve_plat.h" + +#define GVE_ITR_ENABLE_BIT_DQO BIT(0) +#define GVE_ITR_NO_UPDATE_DQO (3 << 3) +#define GVE_ITR_INTERVAL_DQO_SHIFT 5 +#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) +#define GVE_TX_IRQ_RATELIMIT_US_DQO 50 +#define GVE_RX_IRQ_RATELIMIT_US_DQO 20 + +#define GVE_TX_MAX_HDR_SIZE_DQO 255 +#define GVE_TX_MIN_TSO_MSS_DQO 88 + +/* + * Ringing the doorbell too often can hurt performance. + * + * HW requires this value to be at least 8. + */ +#define GVE_RX_BUF_THRESH_DQO 32 + +/* + * Start dropping RX fragments if at least these many + * buffers cannot be posted to the NIC. + */ +#define GVE_RX_DQO_MIN_PENDING_BUFS 128 + +/* + * gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total + * number of pages per QPL to 2048. + */ +#define GVE_RX_NUM_QPL_PAGES_DQO 2048 + +/* 2K TX buffers for DQO-QPL */ +#define GVE_TX_BUF_SHIFT_DQO 11 +#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO) +#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO) + +#define GVE_TX_NUM_QPL_PAGES_DQO 512 + +/* Basic TX descriptor (DTYPE 0x0C) */ +struct gve_tx_pkt_desc_dqo { + __le64 buf_addr; + + /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ + uint8_t dtype:5; + + /* Denotes the last descriptor of a packet. */ + uint8_t end_of_packet:1; + uint8_t checksum_offload_enable:1; + + /* If set, will generate a descriptor completion for this descriptor. */ + uint8_t report_event:1; + uint8_t reserved0; + __le16 reserved1; + + /* The TX completion for this packet will contain this tag. */ + __le16 compl_tag; + uint16_t buf_size:14; + uint16_t reserved2:2; +} __packed; +_Static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc + +/* + * Maximum number of data descriptors allowed per packet, or per-TSO segment. + */ +#define GVE_TX_MAX_DATA_DESCS_DQO 10 +#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) +#define GVE_TSO_MAXSIZE_DQO IP_MAXPACKET + +_Static_assert(GVE_TX_MAX_BUF_SIZE_DQO * GVE_TX_MAX_DATA_DESCS_DQO >= + GVE_TSO_MAXSIZE_DQO, + "gve: bad tso parameters"); + +/* + * "report_event" on TX packet descriptors may only be reported on the last + * descriptor of a TX packet, and they must be spaced apart with at least this + * value. + */ +#define GVE_TX_MIN_RE_INTERVAL 32 + +struct gve_tx_context_cmd_dtype { + uint8_t dtype:5; + uint8_t tso:1; + uint8_t reserved1:2; + uint8_t reserved2; +}; + +_Static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2, + "gve: bad dqo desc struct length"); + +/* + * TX Native TSO Context DTYPE (0x05) + * + * "flex" fields allow the driver to send additional packet context to HW. + */ +struct gve_tx_tso_context_desc_dqo { + /* The L4 payload bytes that should be segmented. */ + uint32_t tso_total_len:24; + uint32_t flex10:8; + + /* Max segment size in TSO excluding headers. */ + uint16_t mss:14; + uint16_t reserved:2; + + uint8_t header_len; /* Header length to use for TSO offload */ + uint8_t flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + uint8_t flex0; + uint8_t flex5; + uint8_t flex6; + uint8_t flex7; + uint8_t flex8; + uint8_t flex9; +} __packed; +_Static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 + +/* General context descriptor for sending metadata. 
*/ +struct gve_tx_general_context_desc_dqo { + uint8_t flex4; + uint8_t flex5; + uint8_t flex6; + uint8_t flex7; + uint8_t flex8; + uint8_t flex9; + uint8_t flex10; + uint8_t flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + uint16_t reserved; + uint8_t flex0; + uint8_t flex1; + uint8_t flex2; + uint8_t flex3; +} __packed; +_Static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 + +/* + * Logical structure of metadata which is packed into context descriptor flex + * fields. + */ +struct gve_tx_metadata_dqo { + union { + struct { + uint8_t version; + + /* + * A zero value means no l4_hash was associated with the + * mbuf. + */ + uint16_t path_hash:15; + + /* + * Should be set to 1 if the flow associated with the + * mbuf had a rehash from the TCP stack. + */ + uint16_t rehash_event:1; + } __packed; + uint8_t bytes[12]; + }; +} __packed; +_Static_assert(sizeof(struct gve_tx_metadata_dqo) == 12, + "gve: bad dqo desc struct length"); + +#define GVE_TX_METADATA_VERSION_DQO 0 + +/* Used to access the generation bit within a TX completion descriptor. */ +#define GVE_TX_DESC_DQO_GEN_BYTE_OFFSET 1 +#define GVE_TX_DESC_DQO_GEN_BIT_MASK 0x80 + +/* TX completion descriptor */ +struct gve_tx_compl_desc_dqo { + /* + * For types 0-4 this is the TX queue ID associated with this + * completion. + */ + uint16_t id:11; + + /* See: GVE_COMPL_TYPE_DQO* */ + uint16_t type:3; + uint16_t reserved0:1; + + /* Flipped by HW to notify the descriptor is populated. */ + uint16_t generation:1; + union { + /* + * For descriptor completions, this is the last index fetched + * by HW + 1. + */ + __le16 tx_head; + + /* + * For packet completions, this is the completion tag set on the + * TX packet descriptors. + */ + __le16 completion_tag; + }; + __le32 reserved1; +} __packed; +_Static_assert(sizeof(struct gve_tx_compl_desc_dqo) == 8, + "gve: bad dqo desc struct length"); + +union gve_tx_desc_dqo { + struct gve_tx_pkt_desc_dqo pkt; + struct gve_tx_tso_context_desc_dqo tso_ctx; + struct gve_tx_general_context_desc_dqo general_ctx; +}; + +#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ +#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ + +/* Descriptor to post buffers to HW on buffer queue. */ +struct gve_rx_desc_dqo { + __le16 buf_id; /* ID returned in Rx completion descriptor */ + __le16 reserved0; + __le32 reserved1; + __le64 buf_addr; /* DMA address of the buffer */ + __le64 header_buf_addr; + __le64 reserved2; +} __packed; +_Static_assert(sizeof(struct gve_rx_desc_dqo) == 32, + "gve: bad dqo desc struct length"); + +/* Used to access the generation bit within an RX completion descriptor. */ +#define GVE_RX_DESC_DQO_GEN_BYTE_OFFSET 5 +#define GVE_RX_DESC_DQO_GEN_BIT_MASK 0x40 + +/* Descriptor for HW to notify SW of new packets received on RX queue. */ +struct gve_rx_compl_desc_dqo { + /* Must be 1 */ + uint8_t rxdid:4; + uint8_t reserved0:4; + + /* Packet originated from this system rather than the network. */ + uint8_t loopback:1; + /* + * Set when IPv6 packet contains a destination options header or routing + * header. + */ + uint8_t ipv6_ex_add:1; + /* Invalid packet was received. */ + uint8_t rx_error:1; + uint8_t reserved1:5; + + uint16_t packet_type:10; + uint16_t ip_hdr_err:1; + uint16_t udp_len_err:1; + uint16_t raw_cs_invalid:1; + uint16_t reserved2:3; + + uint16_t packet_len:14; + /* Flipped by HW to notify the descriptor is populated. */ + uint16_t generation:1; + /* Should be zero. 
*/ + uint16_t buffer_queue_id:1; + + uint16_t header_len:10; + uint16_t rsc:1; + uint16_t split_header:1; + uint16_t reserved3:4; + + uint8_t descriptor_done:1; + uint8_t end_of_packet:1; + uint8_t header_buffer_overflow:1; + uint8_t l3_l4_processed:1; + uint8_t csum_ip_err:1; + uint8_t csum_l4_err:1; + uint8_t csum_external_ip_err:1; + uint8_t csum_external_udp_err:1; + + uint8_t status_error1; + + __le16 reserved5; + __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ + + union { + /* Packet checksum. */ + __le16 raw_cs; + /* Segment length for RSC packets. */ + __le16 rsc_seg_len; + }; + __le32 hash; + __le32 reserved6; + __le64 reserved7; +} __packed; + +_Static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32, + "gve: bad dqo desc struct length"); + +static inline uint8_t +gve_get_dq_num_frags_in_page(struct gve_priv *priv) +{ + return (PAGE_SIZE / priv->rx_buf_size_dqo); +} +#endif /* _GVE_DESC_DQO_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index cd7849778bce..10197a8e15f8 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,11 +30,12 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.1\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.4\n" #define GVE_VERSION_MAJOR 1 -#define GVE_VERSION_MINOR 0 -#define GVE_VERSION_SUB 1 +#define GVE_VERSION_MINOR 3 +#define GVE_VERSION_SUB 5 #define GVE_DEFAULT_RX_COPYBREAK 256 @@ -49,6 +50,9 @@ static struct gve_dev { struct sx gve_global_lock; +static void gve_start_tx_timeout_service(struct gve_priv *priv); +static void gve_stop_tx_timeout_service(struct gve_priv *priv); + static int gve_verify_driver_compatibility(struct gve_priv *priv) { @@ -98,6 +102,72 @@ gve_verify_driver_compatibility(struct gve_priv *priv) return (err); } +static void +gve_handle_tx_timeout(struct gve_priv *priv, struct gve_tx_ring *tx, + int num_timeout_pkts) +{ + int64_t time_since_last_kick; + + counter_u64_add_protected(tx->stats.tx_timeout, 1); + + /* last_kicked is never GVE_TIMESTAMP_INVALID so we can skip checking */ + time_since_last_kick = gve_seconds_since(&tx->last_kicked); + + /* Try kicking first in case the timeout is due to a missed interrupt */ + if (time_since_last_kick > GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d, kicking it for completions\n", + num_timeout_pkts, tx->com.id); + gve_set_timestamp(&tx->last_kicked); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } else { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d with its last kick %jd sec ago which is less than the cooldown period %d. Resetting device\n", + num_timeout_pkts, tx->com.id, + (intmax_t)time_since_last_kick, + GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC); + gve_schedule_reset(priv); + } +} + +static void +gve_tx_timeout_service_callback(void *data) +{ + struct gve_priv *priv = (struct gve_priv *)data; + struct gve_tx_ring *tx; + uint16_t num_timeout_pkts; + + tx = &priv->tx[priv->check_tx_queue_idx]; + + num_timeout_pkts = gve_is_gqi(priv) ? 
+ gve_check_tx_timeout_gqi(priv, tx) : + gve_check_tx_timeout_dqo(priv, tx); + if (num_timeout_pkts) + gve_handle_tx_timeout(priv, tx, num_timeout_pkts); + + priv->check_tx_queue_idx = (priv->check_tx_queue_idx + 1) % + priv->tx_cfg.num_queues; + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_start_tx_timeout_service(struct gve_priv *priv) +{ + priv->check_tx_queue_idx = 0; + callout_init(&priv->tx_timeout_service, true); + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_stop_tx_timeout_service(struct gve_priv *priv) +{ + callout_drain(&priv->tx_timeout_service); +} + static int gve_up(struct gve_priv *priv) { @@ -124,9 +194,11 @@ gve_up(struct gve_priv *priv) if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); - err = gve_register_qpls(priv); - if (err != 0) - goto reset; + if (gve_is_qpl(priv)) { + err = gve_register_qpls(priv); + if (err != 0) + goto reset; + } err = gve_create_rx_rings(priv); if (err != 0) @@ -146,6 +218,9 @@ gve_up(struct gve_priv *priv) gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; + + gve_start_tx_timeout_service(priv); + return (0); reset: @@ -161,6 +236,8 @@ gve_down(struct gve_priv *priv) if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; + gve_stop_tx_timeout_service(priv); + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); @@ -174,10 +251,13 @@ gve_down(struct gve_priv *priv) if (gve_destroy_tx_rings(priv) != 0) goto reset; - if (gve_unregister_qpls(priv) != 0) - goto reset; + if (gve_is_qpl(priv)) { + if (gve_unregister_qpls(priv) != 0) + goto reset; + } - gve_mask_all_queue_irqs(priv); + if (gve_is_gqi(priv)) + gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; @@ -186,10 +266,143 @@ reset: gve_schedule_reset(priv); } +int +gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) +{ + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + gve_down(priv); + + if (new_queue_cnt < priv->rx_cfg.num_queues) { + /* + * Freeing a ring still preserves its ntfy_id, + * which is needed if we create the ring again. + */ + gve_free_rx_rings(priv, new_queue_cnt, priv->rx_cfg.num_queues); + } else { + err = gve_alloc_rx_rings(priv, priv->rx_cfg.num_queues, new_queue_cnt); + if (err != 0) { + device_printf(priv->dev, "Failed to allocate new queues"); + /* Failed to allocate rings, start back up with old ones */ + gve_up(priv); + return (err); + + } + } + priv->rx_cfg.num_queues = new_queue_cnt; + + err = gve_up(priv); + if (err != 0) + gve_schedule_reset(priv); + + return (err); +} + +int +gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) +{ + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + gve_down(priv); + + if (new_queue_cnt < priv->tx_cfg.num_queues) { + /* + * Freeing a ring still preserves its ntfy_id, + * which is needed if we create the ring again. 
+ */ + gve_free_tx_rings(priv, new_queue_cnt, priv->tx_cfg.num_queues); + } else { + err = gve_alloc_tx_rings(priv, priv->tx_cfg.num_queues, new_queue_cnt); + if (err != 0) { + device_printf(priv->dev, "Failed to allocate new queues"); + /* Failed to allocate rings, start back up with old ones */ + gve_up(priv); + return (err); + + } + } + priv->tx_cfg.num_queues = new_queue_cnt; + + err = gve_up(priv); + if (err != 0) + gve_schedule_reset(priv); + + return (err); +} + +int +gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx) +{ + int err; + uint16_t prev_desc_cnt; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + gve_down(priv); + + if (is_rx) { + gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); + prev_desc_cnt = priv->rx_desc_cnt; + priv->rx_desc_cnt = new_desc_cnt; + err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); + if (err != 0) { + device_printf(priv->dev, + "Failed to allocate rings. Trying to start back up with previous ring size."); + priv->rx_desc_cnt = prev_desc_cnt; + err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); + } + } else { + gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues); + prev_desc_cnt = priv->tx_desc_cnt; + priv->tx_desc_cnt = new_desc_cnt; + err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); + if (err != 0) { + device_printf(priv->dev, + "Failed to allocate rings. Trying to start back up with previous ring size."); + priv->tx_desc_cnt = prev_desc_cnt; + err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); + } + } + + if (err != 0) { + device_printf(priv->dev, "Failed to allocate rings! Cannot start device back up!"); + return (err); + } + + err = gve_up(priv); + if (err != 0) { + gve_schedule_reset(priv); + return (err); + } + + return (0); +} + +static int +gve_get_dqo_rx_buf_size(struct gve_priv *priv, uint16_t mtu) +{ + /* + * Use 4k buffers only if mode is DQ, 4k buffers flag is on, + * and either hw LRO is enabled or mtu is greater than 2048 + */ + if (!gve_is_gqi(priv) && gve_allow_4k_rx_buffers && + (!gve_disable_hw_lro || mtu > GVE_DEFAULT_RX_BUFFER_SIZE)) + return (GVE_4K_RX_BUFFER_SIZE_DQO); + + return (GVE_DEFAULT_RX_BUFFER_SIZE); +} + static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); + const uint32_t max_problem_range = 8227; + const uint32_t min_problem_range = 7822; + uint16_t new_rx_buf_size = gve_get_dqo_rx_buf_size(priv, new_mtu); int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { @@ -198,11 +411,32 @@ gve_set_mtu(if_t ifp, uint32_t new_mtu) return (EINVAL); } + /* + * When hardware LRO is enabled in DQ mode, MTUs within the range + * [7822, 8227] trigger hardware issues which cause a drastic drop + * in throughput. 
+ */ + if (!gve_is_gqi(priv) && !gve_disable_hw_lro && + new_mtu >= min_problem_range && new_mtu <= max_problem_range && + new_rx_buf_size != GVE_4K_RX_BUFFER_SIZE_DQO) { + device_printf(priv->dev, + "Cannot set to MTU to %d within the range [%d, %d] while HW LRO is enabled and not using 4k RX Buffers\n", + new_mtu, min_problem_range, max_problem_range); + return (EINVAL); + } + err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); + /* Need to re-alloc RX queues if RX buffer size changed */ + if (!gve_is_gqi(priv) && + new_rx_buf_size != priv->rx_buf_size_dqo) { + gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); + priv->rx_buf_size_dqo = new_rx_buf_size; + gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); + } } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } @@ -352,18 +586,13 @@ gve_get_counter(if_t ifp, ift_counter cnt) } } -static int +static void gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) { - device_printf(priv->dev, "Failed to allocate ifnet struct\n"); - return (ENXIO); - } - if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); @@ -372,6 +601,18 @@ gve_setup_ifnet(device_t dev, struct gve_priv *priv) if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); + /* + * Set TSO limits, must match the arguments to bus_dma_tag_create + * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode + * because in QPL we copy the entire packet into the bounce buffer + * and thus it does not matter how fragmented the mbuf is. + */ + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { + if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); + if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); + } + if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); + #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else @@ -401,8 +642,6 @@ gve_setup_ifnet(device_t dev, struct gve_priv *priv) ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); - - return (0); } static int @@ -454,9 +693,14 @@ static void gve_free_rings(struct gve_priv *priv) { gve_free_irqs(priv); - gve_free_tx_rings(priv); - gve_free_rx_rings(priv); - gve_free_qpls(priv); + + gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues); + free(priv->tx, M_GVE); + priv->tx = NULL; + + gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); + free(priv->rx, M_GVE); + priv->rx = NULL; } static int @@ -464,15 +708,15 @@ gve_alloc_rings(struct gve_priv *priv) { int err; - err = gve_alloc_qpls(priv); - if (err != 0) - goto abort; - - err = gve_alloc_rx_rings(priv); + priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.max_queues, + M_GVE, M_WAITOK | M_ZERO); + err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); if (err != 0) goto abort; - err = gve_alloc_tx_rings(priv); + priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.max_queues, + M_GVE, M_WAITOK | M_ZERO); + err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); if (err != 0) goto abort; @@ -488,7 +732,7 @@ abort: } static void -gve_deconfigure_resources(struct gve_priv *priv) +gve_deconfigure_and_free_device_resources(struct gve_priv *priv) { int err; @@ -506,10 +750,15 @@ gve_deconfigure_resources(struct gve_priv *priv) gve_free_irq_db_array(priv); gve_free_counter_array(priv); + + if 
(priv->ptype_lut_dqo) { + free(priv->ptype_lut_dqo, M_GVE); + priv->ptype_lut_dqo = NULL; + } } static int -gve_configure_resources(struct gve_priv *priv) +gve_alloc_and_configure_device_resources(struct gve_priv *priv) { int err; @@ -532,13 +781,25 @@ gve_configure_resources(struct gve_priv *priv) goto abort; } + if (!gve_is_gqi(priv)) { + priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, + M_WAITOK | M_ZERO); + + err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); + if (err != 0) { + device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", + err); + goto abort; + } + } + gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: - gve_deconfigure_resources(priv); + gve_deconfigure_and_free_device_resources(priv); return (err); } @@ -557,7 +818,7 @@ gve_set_queue_cnts(struct gve_priv *priv) priv->rx_cfg.num_queues); } - priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; + priv->num_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues; priv->mgmt_msix_idx = priv->num_queues; } @@ -603,7 +864,7 @@ static void gve_destroy(struct gve_priv *priv) { gve_down(priv); - gve_deconfigure_resources(priv); + gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); } @@ -616,9 +877,21 @@ gve_restore(struct gve_priv *priv) if (err != 0) goto abort; - err = gve_configure_resources(priv); - if (err != 0) + err = gve_adminq_configure_device_resources(priv); + if (err != 0) { + device_printf(priv->dev, "Failed to configure device resources: err=%d\n", + err); + err = (ENXIO); goto abort; + } + if (!gve_is_gqi(priv)) { + err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); + if (err != 0) { + device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", + err); + goto abort; + } + } err = gve_up(priv); if (err != 0) @@ -632,6 +905,25 @@ abort: } static void +gve_clear_device_resources(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->num_event_counters; i++) + priv->counters[i] = 0; + bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, + BUS_DMASYNC_PREWRITE); + + for (i = 0; i < priv->num_queues; i++) + priv->irq_db_indices[i] = (struct gve_irq_db){}; + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_PREWRITE); + + if (priv->ptype_lut_dqo) + *priv->ptype_lut_dqo = (struct gve_ptype_lut){0}; +} + +static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) @@ -662,6 +954,8 @@ gve_handle_reset(struct gve_priv *priv) gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); + gve_clear_device_resources(priv); + gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); @@ -749,6 +1043,9 @@ gve_attach(device_t dev) int rid; int err; + snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", + GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); + priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); @@ -786,17 +1083,16 @@ gve_attach(device_t dev) if (err != 0) goto abort; - err = gve_configure_resources(priv); + err = gve_alloc_and_configure_device_resources(priv); if (err != 0) goto abort; + priv->rx_buf_size_dqo = gve_get_dqo_rx_buf_size(priv, priv->max_mtu); err = gve_alloc_rings(priv); if (err != 0) goto abort; - err = gve_setup_ifnet(dev, priv); - if (err != 0) - goto abort; + gve_setup_ifnet(dev, priv); priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; @@ 
-817,7 +1113,7 @@ gve_attach(device_t dev) abort: gve_free_rings(priv); - gve_deconfigure_resources(priv); + gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); @@ -829,6 +1125,11 @@ gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; + int error; + + error = bus_generic_detach(dev); + if (error != 0) + return (error); ether_ifdetach(ifp); @@ -845,7 +1146,7 @@ gve_detach(device_t dev) taskqueue_free(priv->service_tq); if_free(ifp); - return (bus_generic_detach(dev)); + return (0); } static device_method_t gve_methods[] = { diff --git a/sys/dev/gve/gve_plat.h b/sys/dev/gve/gve_plat.h index ad6bc1c92b36..3185656c5e04 100644 --- a/sys/dev/gve/gve_plat.h +++ b/sys/dev/gve/gve_plat.h @@ -85,6 +85,9 @@ typedef uint16_t __be16; typedef uint32_t __be32; typedef uint64_t __be64; +typedef uint16_t __le16; +typedef uint32_t __le32; +typedef uint64_t __le64; #define BIT(nr) (1UL << (nr)) #define FBSD_VERSION_MAJOR (__FreeBSD_version / 100000) diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c index 3c6d9af6feee..0e7098dcd4a1 100644 --- a/sys/dev/gve/gve_qpl.c +++ b/sys/dev/gve/gve_qpl.c @@ -32,31 +32,13 @@ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); -static uint32_t -gve_num_tx_qpls(struct gve_priv *priv) -{ - if (priv->queue_format != GVE_GQI_QPL_FORMAT) - return (0); - - return (priv->tx_cfg.max_queues); -} - -static uint32_t -gve_num_rx_qpls(struct gve_priv *priv) -{ - if (priv->queue_format != GVE_GQI_QPL_FORMAT) - return (0); - - return (priv->rx_cfg.max_queues); -} - -static void -gve_free_qpl(struct gve_priv *priv, uint32_t id) +void +gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl) { - struct gve_queue_page_list *qpl = &priv->qpls[id]; int i; for (i = 0; i < qpl->num_dmas; i++) { @@ -91,12 +73,14 @@ gve_free_qpl(struct gve_priv *priv, uint32_t id) if (qpl->dmas != NULL) free(qpl->dmas, M_GVE_QPL); + + free(qpl, M_GVE_QPL); } -static int +struct gve_queue_page_list * gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) { - struct gve_queue_page_list *qpl = &priv->qpls[id]; + struct gve_queue_page_list *qpl; int err; int i; @@ -104,9 +88,12 @@ gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) device_printf(priv->dev, "Reached max number of registered pages %ju > %ju\n", (uintmax_t)npages + priv->num_registered_pages, (uintmax_t)priv->max_registered_pages); - return (EINVAL); + return (NULL); } + qpl = malloc(sizeof(struct gve_queue_page_list), M_GVE_QPL, + M_WAITOK | M_ZERO); + qpl->id = id; qpl->num_pages = 0; qpl->num_dmas = 0; @@ -162,123 +149,111 @@ gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) priv->num_registered_pages++; } - return (0); + return (qpl); abort: - gve_free_qpl(priv, id); - return (err); + gve_free_qpl(priv, qpl); + return (NULL); } -void -gve_free_qpls(struct gve_priv *priv) -{ - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); - int i; - - if (num_qpls == 0) - return; - - if (priv->qpls != NULL) { - for (i = 0; i < num_qpls; i++) - gve_free_qpl(priv, i); - free(priv->qpls, M_GVE_QPL); - } -} - -int gve_alloc_qpls(struct gve_priv *priv) +int +gve_register_qpls(struct gve_priv *priv) { - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + struct gve_ring_com *com; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; 
int err; int i; - if (num_qpls == 0) + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) return (0); - priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, - M_WAITOK | M_ZERO); - - for (i = 0; i < gve_num_tx_qpls(priv); i++) { - err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, - /*single_kva=*/true); - if (err != 0) - goto abort; - } - - for (; i < num_qpls; i++) { - err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false); - if (err != 0) - goto abort; - } - - return (0); - -abort: - gve_free_qpls(priv); - return (err); -} - -static int -gve_unregister_n_qpls(struct gve_priv *priv, int n) -{ - int err; - int i; - - for (i = 0; i < n; i++) { - err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + /* Register TX qpls */ + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + err = gve_adminq_register_page_list(priv, com->qpl); if (err != 0) { device_printf(priv->dev, - "Failed to unregister qpl %d, err: %d\n", - priv->qpls[i].id, err); + "Failed to register qpl %d, err: %d\n", + com->qpl->id, err); + /* Caller schedules a reset when this fails */ + return (err); } } - if (err != 0) - return (err); - - return (0); -} - -int -gve_register_qpls(struct gve_priv *priv) -{ - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); - int err; - int i; - - if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) - return (0); - - for (i = 0; i < num_qpls; i++) { - err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + /* Register RX qpls */ + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + err = gve_adminq_register_page_list(priv, com->qpl); if (err != 0) { device_printf(priv->dev, "Failed to register qpl %d, err: %d\n", - priv->qpls[i].id, err); - goto abort; + com->qpl->id, err); + /* Caller schedules a reset when this fails */ + return (err); } } - gve_set_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); - -abort: - gve_unregister_n_qpls(priv, i); - return (err); } int gve_unregister_qpls(struct gve_priv *priv) { - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); int err; + int i; + struct gve_ring_com *com; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) return (0); - err = gve_unregister_n_qpls(priv, num_qpls); + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + err = gve_adminq_unregister_page_list(priv, com->qpl->id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + com->qpl->id, err); + } + } + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + err = gve_adminq_unregister_page_list(priv, com->qpl->id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + com->qpl->id, err); + } + } + if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); } + +void +gve_mextadd_free(struct mbuf *mbuf) +{ + vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; + vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; + + /* + * Free the page only if this is the last ref. + * The interface might no longer exist by the time + * this callback is called, see gve_free_qpl. 
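gve_mextadd_free above must cope with the external mbuf outliving the interface, so it frees the wired QPL page only when it drops the last reference. The userspace sketch below models that ownership rule with an atomic reference count; vm_page_unwire_noq, pmap_qremove and kva_free are kernel primitives, so free() and a plain counter stand in for them here.

```c
/*
 * Userspace analogue of the "free the backing page only on the last
 * reference" rule used by gve_mextadd_free. The kernel primitives are
 * replaced by an atomic refcount and free(); only the ownership logic is
 * modeled here.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_buf {
	atomic_int refs;	/* analogue of the page wire count */
	void *data;
};

static struct shared_buf *
buf_alloc(size_t len)
{
	struct shared_buf *b = malloc(sizeof(*b));

	atomic_init(&b->refs, 1);	/* creator's reference */
	b->data = malloc(len);
	return (b);
}

static void
buf_hold(struct shared_buf *b)
{
	atomic_fetch_add(&b->refs, 1);	/* analogue of wiring the page again */
}

static void
buf_release(struct shared_buf *b)
{
	/* Free only if this call dropped the last reference. */
	if (atomic_fetch_sub(&b->refs, 1) == 1) {
		free(b->data);
		free(b);
	}
}

int
main(void)
{
	struct shared_buf *b = buf_alloc(4096);

	buf_hold(b);	/* e.g. handed to an external consumer */
	buf_release(b);	/* owner drops its reference */
	buf_release(b);	/* consumer's release actually frees */
	printf("released\n");
	return (0);
}
```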
+ */ + if (__predict_false(vm_page_unwire_noq(page))) { + pmap_qremove(va, 1); + kva_free(va, PAGE_SIZE); + vm_page_free(page); + } +} diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c index 9be96cf1ee3a..de64375ac4f3 100644 --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,16 +30,14 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" static void -gve_rx_free_ring(struct gve_priv *priv, int i) +gve_rx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; - /* Safe to call even if never allocated */ - gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); - if (rx->page_info != NULL) { free(rx->page_info, M_GVE); rx->page_info = NULL; @@ -55,6 +53,26 @@ gve_rx_free_ring(struct gve_priv *priv, int i) rx->desc_ring = NULL; } + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } +} + +static void +gve_rx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + /* Safe to call even if never allocated */ + gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + if (gve_is_gqi(priv)) + gve_rx_free_ring_gqi(priv, i); + else + gve_rx_free_ring_dqo(priv, i); + if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; @@ -83,55 +101,82 @@ gve_prefill_rx_slots(struct gve_rx_ring *rx) } static int -gve_rx_alloc_ring(struct gve_priv *priv, int i) +gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; - com->priv = priv; - com->id = i; + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } rx->mask = priv->rx_pages_per_qpl - 1; + rx->desc_ring = rx->desc_ring_mem.cpu_addr; - com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + com->qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, + priv->rx_desc_cnt, /*single_kva=*/false); if (com->qpl == NULL) { - device_printf(priv->dev, "No QPL left for rx ring %d", i); - return (ENOMEM); + device_printf(priv->dev, + "Failed to alloc QPL for rx ring %d", i); + err = ENOMEM; + goto abort; } - rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, - M_WAITOK | M_ZERO); + rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), + M_GVE, M_WAITOK | M_ZERO); + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->data_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc data ring for rx ring %d", i); + goto abort; + } + rx->data_ring = rx->data_ring_mem.cpu_addr; + + gve_prefill_rx_slots(rx); + return (0); + +abort: + gve_rx_free_ring_gqi(priv, i); + return (err); +} + +static int +gve_rx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + int err; + + com->priv = priv; + com->id = i; gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); err = gve_dma_alloc_coherent(priv, 
sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { - device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); + device_printf(priv->dev, + "Failed to alloc queue resources for rx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; - err = gve_dma_alloc_coherent(priv, - sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->desc_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); - goto abort; - } - rx->desc_ring = rx->desc_ring_mem.cpu_addr; - - err = gve_dma_alloc_coherent(priv, - sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->data_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); + if (gve_is_gqi(priv)) + err = gve_rx_alloc_ring_gqi(priv, i); + else + err = gve_rx_alloc_ring_dqo(priv, i); + if (err != 0) goto abort; - } - rx->data_ring = rx->data_ring_mem.cpu_addr; - gve_prefill_rx_slots(rx); return (0); abort: @@ -140,38 +185,32 @@ abort: } int -gve_alloc_rx_rings(struct gve_priv *priv) +gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { - int err = 0; int i; + int err; - priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues, - M_GVE, M_WAITOK | M_ZERO); + KASSERT(priv->rx != NULL, ("priv->rx is NULL!")); - for (i = 0; i < priv->rx_cfg.num_queues; i++) { + for (i = start_idx; i < stop_idx; i++) { err = gve_rx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); - free_rings: - while (i--) - gve_rx_free_ring(priv, i); - free(priv->rx, M_GVE); + gve_free_rx_rings(priv, start_idx, i); return (err); } void -gve_free_rx_rings(struct gve_priv *priv) +gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; - for (i = 0; i < priv->rx_cfg.num_queues; i++) + for (i = start_idx; i < stop_idx; i++) gve_rx_free_ring(priv, i); - - free(priv->rx, M_GVE); } static void @@ -217,6 +256,11 @@ gve_clear_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; + if (!gve_is_gqi(priv)) { + gve_clear_rx_ring_dqo(priv, i); + return; + } + rx->seq_no = 1; rx->cnt = 0; rx->fill_cnt = 0; @@ -238,14 +282,21 @@ gve_start_rx_ring(struct gve_priv *priv, int i) rx->lro.ifp = priv->ifp; } - NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); + if (gve_is_gqi(priv)) + NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); + else + NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq_dqo, rx); com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(priv->dev), i); - gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); + if (gve_is_gqi(priv)) { + /* GQ RX bufs are prefilled at ring alloc time */ + gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); + } else + gve_rx_prefill_buffers_dqo(rx); } int @@ -362,24 +413,6 @@ gve_set_rss_type(__be16 flag, struct mbuf *mbuf) } static void -gve_mextadd_free(struct mbuf *mbuf) -{ - vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; - vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; - - /* - * Free the page only if this is the last ref. - * The interface might no longer exist by the time - * this callback is called, see gve_free_qpl. 
- */ - if (__predict_false(vm_page_unwire_noq(page))) { - pmap_qremove(va, 1); - kva_free(va, PAGE_SIZE); - vm_page_free(page); - } -} - -static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); @@ -676,7 +709,7 @@ gve_rx_cleanup_tq(void *arg, int pending) * interrupt but they will still be handled by the enqueue below. * Fragments received after the barrier WILL trigger an interrupt. */ - mb(); + atomic_thread_fence_seq_cst(); if (gve_rx_work_pending(rx)) { gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c new file mode 100644 index 000000000000..cf914913da09 --- /dev/null +++ b/sys/dev/gve/gve_rx_dqo.c @@ -0,0 +1,1035 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "gve.h" +#include "gve_adminq.h" +#include "gve_dqo.h" + +static void +gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_dqo *buf; + int i; + + if (gve_is_qpl(rx->com.priv)) + return; + + for (i = 0; i < rx->dqo.buf_cnt; i++) { + buf = &rx->dqo.bufs[i]; + if (!buf->mbuf) + continue; + + bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, + BUS_DMASYNC_POSTREAD); + bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); + m_freem(buf->mbuf); + buf->mbuf = NULL; + } +} + +void +gve_rx_free_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + int j; + + if (rx->dqo.compl_ring != NULL) { + gve_dma_free_coherent(&rx->dqo.compl_ring_mem); + rx->dqo.compl_ring = NULL; + } + + if (rx->dqo.desc_ring != NULL) { + gve_dma_free_coherent(&rx->desc_ring_mem); + rx->dqo.desc_ring = NULL; + } + + if (rx->dqo.bufs != NULL) { + gve_free_rx_mbufs_dqo(rx); + + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) { + for (j = 0; j < rx->dqo.buf_cnt; j++) + if (rx->dqo.bufs[j].mapped) + bus_dmamap_destroy(rx->dqo.buf_dmatag, + rx->dqo.bufs[j].dmamap); + } + + free(rx->dqo.bufs, M_GVE); + rx->dqo.bufs = NULL; + } + + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) + bus_dma_tag_destroy(rx->dqo.buf_dmatag); + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } +} + +int +gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + int err; + int j; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } + rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for rx ring %d", i); + goto abort; + } + rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + rx->dqo.buf_cnt = gve_is_qpl(priv) ? 
GVE_RX_NUM_QPL_PAGES_DQO : + priv->rx_desc_cnt; + rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), + M_GVE, M_WAITOK | M_ZERO); + + if (gve_is_qpl(priv)) { + rx->com.qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, + GVE_RX_NUM_QPL_PAGES_DQO, /*single_kva=*/false); + if (rx->com.qpl == NULL) { + device_printf(priv->dev, + "Failed to alloc QPL for rx ring %d", i); + err = ENOMEM; + goto abort; + } + return (0); + } + + bus_size_t max_seg_size = gve_rx_dqo_mbuf_segment_size(priv); + + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + max_seg_size, /* maxsize */ + 1, /* nsegments */ + max_seg_size, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &rx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, + "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto abort; + } + + for (j = 0; j < rx->dqo.buf_cnt; j++) { + err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, + &rx->dqo.bufs[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating rx buf dmamap %d: %d", + j, err); + goto abort; + } + rx->dqo.bufs[j].mapped = true; + } + + return (0); + +abort: + gve_rx_free_ring_dqo(priv, i); + return (err); +} + +static void +gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + int entries; + int i; + + entries = com->priv->rx_desc_cnt; + for (i = 0; i < entries; i++) + rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){}; + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + int i; + + for (i = 0; i < com->priv->rx_desc_cnt; i++) + rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){}; + + bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +void +gve_clear_rx_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + int j; + + rx->fill_cnt = 0; + rx->cnt = 0; + rx->dqo.mask = priv->rx_desc_cnt - 1; + rx->dqo.head = 0; + rx->dqo.tail = 0; + rx->dqo.cur_gen_bit = 0; + + gve_rx_clear_desc_ring_dqo(rx); + gve_rx_clear_compl_ring_dqo(rx); + + gve_free_rx_mbufs_dqo(rx); + + if (gve_is_qpl(priv)) { + SLIST_INIT(&rx->dqo.free_bufs); + STAILQ_INIT(&rx->dqo.used_bufs); + + for (j = 0; j < rx->dqo.buf_cnt; j++) { + struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j]; + + vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + u_int ref_count = atomic_load_int(&page->ref_count); + + /* + * An ifconfig down+up might see pages still in flight + * from the previous innings. 
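gve_clear_rx_ring_dqo above has to tolerate QPL pages that are still referenced by mbufs handed out before an ifconfig down/up, so it sorts buffers into an immediately reusable free list and a deferred used list by inspecting the page wire count. A hedged <sys/queue.h> sketch of the same partitioning follows; the integer refs field is an illustrative stand-in for the wire count.

```c
/*
 * Sketch of sorting buffers into "free" and "still in use" lists based on
 * an outstanding-reference count, as the ring-clear path does with the QPL
 * page wire counts. The refcount here is a plain integer stand-in.
 */
#include <sys/queue.h>
#include <stdio.h>

#define NBUFS 8

struct buf {
	int refs;	/* 1 == only the driver holds the page */
	SLIST_ENTRY(buf) free_entry;
	STAILQ_ENTRY(buf) used_entry;
};

int
main(void)
{
	SLIST_HEAD(, buf) free_bufs = SLIST_HEAD_INITIALIZER(free_bufs);
	STAILQ_HEAD(, buf) used_bufs = STAILQ_HEAD_INITIALIZER(used_bufs);
	struct buf bufs[NBUFS];
	int i, nfree = 0;

	for (i = 0; i < NBUFS; i++)
		bufs[i].refs = (i % 3 == 0) ? 2 : 1;	/* every third buf still referenced */

	for (i = 0; i < NBUFS; i++) {
		if (bufs[i].refs == 1) {
			SLIST_INSERT_HEAD(&free_bufs, &bufs[i], free_entry);
			nfree++;
		} else {
			/* Reclaimed later, once the extra references go away. */
			STAILQ_INSERT_TAIL(&used_bufs, &bufs[i], used_entry);
		}
	}

	printf("%d of %d buffers immediately reusable\n", nfree, NBUFS);
	return (0);
}
```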
+ */ + if (VPRC_WIRE_COUNT(ref_count) == 1) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + else + STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, + buf, stailq_entry); + + buf->num_nic_frags = 0; + buf->next_idx = 0; + } + } else { + SLIST_INIT(&rx->dqo.free_bufs); + for (j = 0; j < rx->dqo.buf_cnt; j++) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + &rx->dqo.bufs[j], slist_entry); + } +} + +int +gve_rx_intr_dqo(void *arg) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + struct gve_ring_com *com = &rx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + /* Interrupts are automatically masked */ + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static void +gve_rx_advance_head_dqo(struct gve_rx_ring *rx) +{ + rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; + rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ + + if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, + rx->dqo.head); + } +} + +static void +gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) +{ + struct gve_rx_desc_dqo *desc; + + bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, + BUS_DMASYNC_PREREAD); + + desc = &rx->dqo.desc_ring[rx->dqo.head]; + desc->buf_id = htole16(buf - rx->dqo.bufs); + desc->buf_addr = htole64(buf->addr); + + gve_rx_advance_head_dqo(rx); +} + +static int +gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how) +{ + struct gve_rx_buf_dqo *buf; + bus_dma_segment_t segs[1]; + int nsegs; + int err; + + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(!buf)) { + device_printf(rx->com.priv->dev, + "Unexpected empty free bufs list\n"); + return (ENOBUFS); + } + SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); + + bus_size_t segment_size = gve_rx_dqo_mbuf_segment_size(rx->com.priv); + buf->mbuf = m_getjcl(how, MT_DATA, M_PKTHDR, segment_size); + if (__predict_false(!buf->mbuf)) { + err = ENOMEM; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1); + counter_exit(); + goto abort_with_buf; + } + buf->mbuf->m_len = segment_size; + + err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap, + buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); + KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1")); + if (__predict_false(err != 0)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1); + counter_exit(); + goto abort_with_mbuf; + } + buf->addr = segs[0].ds_addr; + + gve_rx_post_buf_dqo(rx, buf); + return (0); + +abort_with_mbuf: + m_freem(buf->mbuf); + buf->mbuf = NULL; +abort_with_buf: + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); + return (err); +} + +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) +{ + return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs])); +} + +static void +gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, + uint8_t frag_num) +{ + struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head]; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + + composed_id.buf_id = buf - rx->dqo.bufs; + composed_id.frag_num = frag_num; + desc->buf_id = htole16(composed_id.all); + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + 
BUS_DMASYNC_PREREAD); + desc->buf_addr = htole64(page_dma_handle->bus_addr + + frag_num * rx->com.priv->rx_buf_size_dqo); + + buf->num_nic_frags++; + gve_rx_advance_head_dqo(rx); +} + +static void +gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one) +{ + struct gve_rx_buf_dqo *hol_blocker = NULL; + struct gve_rx_buf_dqo *buf; + u_int ref_count; + vm_page_t page; + + while (true) { + buf = STAILQ_FIRST(&rx->dqo.used_bufs); + if (__predict_false(buf == NULL)) + break; + + page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + ref_count = atomic_load_int(&page->ref_count); + + if (VPRC_WIRE_COUNT(ref_count) != 1) { + /* Account for one head-of-line blocker */ + if (hol_blocker != NULL) + break; + hol_blocker = buf; + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + continue; + } + + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + if (just_one) + break; + } + + if (hol_blocker != NULL) + STAILQ_INSERT_HEAD(&rx->dqo.used_bufs, + hol_blocker, stailq_entry); +} + +static int +gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_dqo *buf; + + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) { + gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true); + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) + return (ENOBUFS); + } + + gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx); + if (buf->next_idx == gve_get_dq_num_frags_in_page(rx->com.priv) - 1) + buf->next_idx = 0; + else + buf->next_idx++; + + /* + * We have posted all the frags in this buf to the NIC. + * - buf will enter used_bufs once the last completion arrives. + * - It will renter free_bufs in gve_rx_maybe_extract_from_used_bufs + * when its wire count drops back to 1. + */ + if (buf->next_idx == 0) + SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); + return (0); +} + +static void +gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how) +{ + uint32_t num_pending_bufs; + uint32_t num_to_post; + uint32_t i; + int err; + + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + num_to_post = rx->dqo.mask - num_pending_bufs; + + for (i = 0; i < num_to_post; i++) { + if (gve_is_qpl(rx->com.priv)) + err = gve_rx_post_new_dqo_qpl_buf(rx); + else + err = gve_rx_post_new_mbuf_dqo(rx, how); + if (err) + break; + } +} + +void +gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx) +{ + gve_rx_post_buffers_dqo(rx, M_WAITOK); +} + +static void +gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp) +{ + switch (ptype->l3_type) { + case GVE_L3_TYPE_IPV4: + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + *is_tcp = true; + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + break; + case GVE_L4_TYPE_UDP: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + } + break; + case GVE_L3_TYPE_IPV6: + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + *is_tcp = true; + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + break; + case GVE_L4_TYPE_UDP: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + } + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); + } +} + +static void +gve_rx_set_csum_flags_dqo(struct mbuf *mbuf, + struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype *ptype) +{ + /* HW did not identify and process L3 and L4 headers. 
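gve_rx_set_csum_flags_dqo, whose opening check appears just above, only advertises a verified checksum when the NIC parsed both L3 and L4 and none of the per-layer error bits are set. The small model below captures that decision; the struct fields and enums are illustrative stand-ins for the completion descriptor bits and ptype codes, and the IPv4 external-IP error bit is folded into csum_ip_err here.

```c
/*
 * Model of the RX checksum-acceptance decision: report the checksum as
 * verified only if the NIC parsed L3/L4 and flagged no errors. Field and
 * enum names are illustrative stand-ins for the completion descriptor bits.
 */
#include <stdbool.h>
#include <stdio.h>

enum l3 { L3_OTHER, L3_IPV4, L3_IPV6 };
enum l4 { L4_OTHER, L4_TCP, L4_UDP, L4_ICMP, L4_SCTP };

struct compl_bits {
	bool l3_l4_processed;
	bool csum_ip_err;
	bool ipv6_ex_add;	/* IPv6 extension headers present */
	bool csum_l4_err;
};

static bool
rx_csum_ok(const struct compl_bits *d, enum l3 l3t, enum l4 l4t)
{
	if (!d->l3_l4_processed)
		return (false);
	if (l3t == L3_IPV4 && d->csum_ip_err)
		return (false);
	if (l3t == L3_IPV6 && d->ipv6_ex_add)
		return (false);
	if (d->csum_l4_err)
		return (false);
	/* Only known L4 protocols get the "checksum verified" treatment. */
	return (l4t == L4_TCP || l4t == L4_UDP || l4t == L4_ICMP || l4t == L4_SCTP);
}

int
main(void)
{
	struct compl_bits d = { .l3_l4_processed = true };

	printf("tcp/ipv4 ok: %d\n", rx_csum_ok(&d, L3_IPV4, L4_TCP));
	d.csum_l4_err = true;
	printf("with l4 err: %d\n", rx_csum_ok(&d, L3_IPV4, L4_TCP));
	return (0);
}
```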
*/ + if (__predict_false(!desc->l3_l4_processed)) + return; + + if (ptype->l3_type == GVE_L3_TYPE_IPV4) { + if (__predict_false(desc->csum_ip_err || + desc->csum_external_ip_err)) + return; + } else if (ptype->l3_type == GVE_L3_TYPE_IPV6) { + /* Checksum should be skipped if this flag is set. */ + if (__predict_false(desc->ipv6_ex_add)) + return; + } + + if (__predict_false(desc->csum_l4_err)) + return; + + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + case GVE_L4_TYPE_UDP: + case GVE_L4_TYPE_ICMP: + case GVE_L4_TYPE_SCTP: + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | + CSUM_IP_VALID | + CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + mbuf->m_pkthdr.csum_data = 0xffff; + break; + default: + break; + } +} + +static void +gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc) +{ + struct mbuf *mbuf = rx->ctx.mbuf_head; + if_t ifp = rx->com.priv->ifp; + struct gve_ptype *ptype; + bool do_if_input = true; + bool is_tcp = false; + + ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type]; + gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp); + mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash); + gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype); + + mbuf->m_pkthdr.rcvif = ifp; + mbuf->m_pkthdr.len = rx->ctx.total_size; + + if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) && + is_tcp && + (rx->lro.lro_cnt != 0) && + (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) + do_if_input = false; + + if (do_if_input) + if_input(ifp, mbuf); + + counter_enter(); + counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size); + counter_u64_add_protected(rx->stats.rpackets, 1); + counter_exit(); + + rx->ctx = (struct gve_rx_ctx){}; +} + +static int +gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va, + struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len) +{ + struct mbuf *mbuf; + + mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (__predict_false(mbuf == NULL)) + return (ENOMEM); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); + counter_exit(); + + m_copyback(mbuf, 0, frag_len, va); + mbuf->m_len = frag_len; + + rx->ctx.mbuf_head = mbuf; + rx->ctx.mbuf_tail = mbuf; + rx->ctx.total_size += frag_len; + + gve_rx_input_mbuf_dqo(rx, compl_desc); + return (0); +} + +static void +gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc, + int *work_done) +{ + bool is_last_frag = compl_desc->end_of_packet != 0; + struct gve_rx_ctx *ctx = &rx->ctx; + struct gve_rx_buf_dqo *buf; + uint32_t num_pending_bufs; + uint16_t frag_len; + uint16_t buf_id; + int err; + + buf_id = le16toh(compl_desc->buf_id); + if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { + device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + buf = &rx->dqo.bufs[buf_id]; + if (__predict_false(buf->mbuf == NULL)) { + device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + + if (__predict_false(ctx->drop_pkt)) + goto drop_frag; + + if (__predict_false(compl_desc->rx_error)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_exit(); + goto drop_frag; + } + + bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, + BUS_DMASYNC_POSTREAD); + + frag_len = compl_desc->packet_len; + if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { + err = 
gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*), + compl_desc, frag_len); + if (__predict_false(err != 0)) + goto drop_frag; + (*work_done)++; + gve_rx_post_buf_dqo(rx, buf); + return; + } + + /* + * Although buffer completions may arrive out of order, buffer + * descriptors are consumed by the NIC in order. That is, the + * buffer at desc_ring[tail] might not be the buffer we got the + * completion compl_ring[tail] for: but we know that desc_ring[tail] + * has already been read by the NIC. + */ + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + + /* + * For every fragment received, try to post a new buffer. + * + * Failures are okay but only so long as the number of outstanding + * buffers is above a threshold. + * + * Beyond that we drop new packets to reuse their buffers. + * Without ensuring a minimum number of buffers for the NIC to + * put packets in, we run the risk of getting the queue stuck + * for good. + */ + err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT); + if (__predict_false(err != 0 && + num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_exit(); + goto drop_frag; + } + + buf->mbuf->m_len = frag_len; + ctx->total_size += frag_len; + if (ctx->mbuf_tail == NULL) { + ctx->mbuf_head = buf->mbuf; + ctx->mbuf_tail = buf->mbuf; + } else { + buf->mbuf->m_flags &= ~M_PKTHDR; + ctx->mbuf_tail->m_next = buf->mbuf; + ctx->mbuf_tail = buf->mbuf; + } + + /* + * Disassociate the mbuf from buf and surrender buf to the free list to + * be used by a future mbuf. + */ + bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); + buf->mbuf = NULL; + buf->addr = 0; + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); + + if (is_last_frag) { + gve_rx_input_mbuf_dqo(rx, compl_desc); + (*work_done)++; + } + return; + +drop_frag: + /* Clear the earlier frags if there were any */ + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + /* Drop the rest of the pkt if there are more frags */ + ctx->drop_pkt = true; + /* Reuse the dropped frag's buffer */ + gve_rx_post_buf_dqo(rx, buf); + + if (is_last_frag) + goto drop_frag_clear_ctx; + return; + +drop_frag_clear_ctx: + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; +} + +static void * +gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx, + struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num) +{ + int page_idx = buf - rx->dqo.bufs; + void *va = rx->com.qpl->dmas[page_idx].cpu_addr; + + va = (char *)va + (buf_frag_num * rx->com.priv->rx_buf_size_dqo); + return (va); +} + +static int +gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx, + struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, + uint8_t buf_frag_num, uint16_t frag_len) +{ + void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); + struct mbuf *mbuf; + bus_size_t segment_size = gve_rx_dqo_mbuf_segment_size(rx->com.priv); + + if (ctx->mbuf_tail == NULL) { + mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, segment_size); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + mbuf = m_getjcl(M_NOWAIT, MT_DATA, 0, segment_size); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + mbuf->m_len = frag_len; + ctx->total_size += frag_len; + + m_copyback(mbuf, 0, frag_len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); + counter_exit(); + 
return (0); +} + +static int +gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx, + struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, + uint8_t buf_frag_num, uint16_t frag_len) +{ + struct mbuf *mbuf; + void *page_addr; + vm_page_t page; + int page_idx; + void *va; + + if (ctx->mbuf_tail == NULL) { + mbuf = m_gethdr(M_NOWAIT, MT_DATA); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + mbuf = m_get(M_NOWAIT, MT_DATA); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + mbuf->m_len = frag_len; + ctx->total_size += frag_len; + + page_idx = buf - rx->dqo.bufs; + page = rx->com.qpl->pages[page_idx]; + page_addr = rx->com.qpl->dmas[page_idx].cpu_addr; + va = (char *)page_addr + (buf_frag_num * rx->com.priv->rx_buf_size_dqo); + + /* + * Grab an extra ref to the page so that gve_mextadd_free + * does not end up freeing the page while the interface exists. + */ + vm_page_wire(page); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); + counter_exit(); + + MEXTADD(mbuf, va, frag_len, + gve_mextadd_free, page, page_addr, + 0, EXT_NET_DRV); + return (0); +} + +static void +gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc, + int *work_done) +{ + bool is_last_frag = compl_desc->end_of_packet != 0; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + struct gve_rx_ctx *ctx = &rx->ctx; + struct gve_rx_buf_dqo *buf; + uint32_t num_pending_bufs; + uint8_t buf_frag_num; + uint16_t frag_len; + uint16_t buf_id; + int err; + + composed_id.all = le16toh(compl_desc->buf_id); + buf_id = composed_id.buf_id; + buf_frag_num = composed_id.frag_num; + + if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { + device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + buf = &rx->dqo.bufs[buf_id]; + if (__predict_false(buf->num_nic_frags == 0 || + buf_frag_num > gve_get_dq_num_frags_in_page(priv) - 1)) { + device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d " + "with buf_frag_num %d and num_nic_frags %d, issuing reset\n", + buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + + buf->num_nic_frags--; + + if (__predict_false(ctx->drop_pkt)) + goto drop_frag; + + if (__predict_false(compl_desc->rx_error)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_exit(); + goto drop_frag; + } + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_POSTREAD); + + frag_len = compl_desc->packet_len; + if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { + void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); + + err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len); + if (__predict_false(err != 0)) + goto drop_frag; + (*work_done)++; + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + return; + } + + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + err = gve_rx_post_new_dqo_qpl_buf(rx); + if (__predict_false(err != 0 && + num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { + /* + * Resort to copying this fragment into a cluster mbuf + * when the above threshold is breached and repost the + * incoming buffer. 
If we cannot find cluster mbufs, + * just drop the packet (to repost its buffer). + */ + err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (err != 0) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_buf_post_fail, 1); + counter_exit(); + goto drop_frag; + } + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + } else { + err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (__predict_false(err != 0)) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_exit(); + goto drop_frag; + } + } + + /* + * Both the counts need to be checked. + * + * num_nic_frags == 0 implies no pending completions + * but not all frags may have yet been posted. + * + * next_idx == 0 implies all frags have been posted + * but there might be pending completions. + */ + if (buf->num_nic_frags == 0 && buf->next_idx == 0) + STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); + + if (is_last_frag) { + gve_rx_input_mbuf_dqo(rx, compl_desc); + (*work_done)++; + } + return; + +drop_frag: + /* Clear the earlier frags if there were any */ + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + /* Drop the rest of the pkt if there are more frags */ + ctx->drop_pkt = true; + /* Reuse the dropped frag's buffer */ + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + + if (is_last_frag) + goto drop_frag_clear_ctx; + return; + +drop_frag_clear_ctx: + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; +} + +static uint8_t +gve_rx_get_gen_bit(uint8_t *desc) +{ + uint8_t byte; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. 
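The completion ring above signals ownership with a generation bit that flips every time the ring wraps, and gve_rx_get_gen_bit reads that bit with acquire semantics so the rest of the descriptor is not read ahead of it. The standalone C11 sketch below models the consumer side of that protocol; the ring layout and names are illustrative, not the device's descriptor format.

```c
/*
 * Consumer-side sketch of generation-bit ownership on a completion ring:
 * an entry belongs to the consumer when its gen bit differs from the
 * consumer's current gen bit, and the bit is read with acquire ordering so
 * the payload is not read ahead of it. The producer side is simulated.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8	/* power of two */

struct compl_entry {
	uint32_t payload;
	atomic_uchar flags;	/* bit 0: generation */
};

struct ring {
	struct compl_entry entries[RING_SIZE];
	uint32_t tail;
	uint8_t cur_gen;
};

/* Returns true and fills *payload if a new completion was consumed. */
static bool
ring_poll(struct ring *r, uint32_t *payload)
{
	struct compl_entry *e = &r->entries[r->tail];
	uint8_t gen = atomic_load_explicit(&e->flags, memory_order_acquire) & 1;

	if (gen == r->cur_gen)
		return (false);	/* still owned by the producer */

	*payload = e->payload;	/* safe: ordered after the gen-bit load */
	r->tail = (r->tail + 1) & (RING_SIZE - 1);
	if (r->tail == 0)
		r->cur_gen ^= 1;	/* generation flips on wraparound */
	return (true);
}

int
main(void)
{
	struct ring r = { .cur_gen = 0 };
	uint32_t v = 0;

	/* Simulate the producer publishing one entry with the opposite gen bit. */
	r.entries[0].payload = 42;
	atomic_store_explicit(&r.entries[0].flags, 1, memory_order_release);

	printf("got completion: %s (v=%u)\n",
	    ring_poll(&r, &v) ? "yes" : "no", (unsigned)v);
	return (0);
}
```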
+ */ + byte = atomic_load_acq_8(desc + GVE_RX_DESC_DQO_GEN_BYTE_OFFSET); + return ((byte & GVE_RX_DESC_DQO_GEN_BIT_MASK) != 0); +} + +static bool +gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) +{ + struct gve_rx_compl_desc_dqo *compl_desc; + uint32_t work_done = 0; + + NET_EPOCH_ASSERT(); + + while (work_done < budget) { + bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, + rx->dqo.compl_ring_mem.map, + BUS_DMASYNC_POSTREAD); + + compl_desc = &rx->dqo.compl_ring[rx->dqo.tail]; + if (gve_rx_get_gen_bit((uint8_t *)compl_desc) == + rx->dqo.cur_gen_bit) + break; + + rx->cnt++; + rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask; + rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0); + + if (gve_is_qpl(priv)) + gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done); + else + gve_rx_dqo(priv, rx, compl_desc, &work_done); + } + + if (work_done != 0) + tcp_lro_flush_all(&rx->lro); + + gve_rx_post_buffers_dqo(rx, M_NOWAIT); + if (gve_is_qpl(priv)) + gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false); + return (work_done == budget); +} + +void +gve_rx_cleanup_tq_dqo(void *arg, int pending) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) { + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + return; + } + + gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +} diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c index 924654f62adc..a3874cc921ee 100644 --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,6 +30,25 @@ */ #include "gve.h" +static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "GVE driver parameters"); + +bool gve_disable_hw_lro = false; +SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN, + &gve_disable_hw_lro, 0, "Controls if hardware LRO is used"); + +bool gve_allow_4k_rx_buffers = false; +SYSCTL_BOOL(_hw_gve, OID_AUTO, allow_4k_rx_buffers, CTLFLAG_RDTUN, + &gve_allow_4k_rx_buffers, 0, "Controls if 4K RX Buffers are allowed"); + +char gve_queue_format[8]; +SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD, + &gve_queue_format, 0, "Queue format being used by the iface"); + +char gve_version[8]; +SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD, + &gve_version, 0, "Driver version"); + static void gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_rx_ring *rxq) @@ -69,9 +88,21 @@ gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, &stats->rx_dropped_pkt_desc_err, "Packets dropped due to descriptor error"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD, + &stats->rx_dropped_pkt_buf_post_fail, + "Packets dropped due to failure to post enough buffers"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, "Packets dropped due to failed mbuf allocation"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_mbuf_dmamap_err", CTLFLAG_RD, + &stats->rx_mbuf_dmamap_err, + "Number of rx mbufs which could not be dma mapped"); + 
SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_mbuf_mclget_null", CTLFLAG_RD, + &stats->rx_mbuf_mclget_null, + "Number of times when there were no cluster mbufs"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "rx_completed_desc", CTLFLAG_RD, &rxq->cnt, 0, "Number of descriptors completed"); @@ -113,9 +144,9 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, "tx_bytes", CTLFLAG_RD, &stats->tbytes, "Bytes transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, - "tx_dropped_pkt_nospace_device", CTLFLAG_RD, - &stats->tx_dropped_pkt_nospace_device, - "Packets dropped due to no space in device"); + "tx_delayed_pkt_nospace_device", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_device, + "Packets delayed due to no space in device"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, &stats->tx_dropped_pkt_nospace_bufring, @@ -124,6 +155,46 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, "tx_dropped_pkt_vlan", CTLFLAG_RD, &stats->tx_dropped_pkt_vlan, "Dropped VLAN packets"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_delayed_pkt_nospace_descring", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_descring, + "Packets delayed due to no space in desc ring"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_delayed_pkt_nospace_compring", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_compring, + "Packets delayed due to no space in comp ring"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_qpl_bufs, + "Packets delayed due to not enough qpl bufs"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_delayed_pkt_tsoerr", CTLFLAG_RD, + &stats->tx_delayed_pkt_tsoerr, + "TSO packets delayed due to err in prep errors"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_mbuf_collapse", CTLFLAG_RD, + &stats->tx_mbuf_collapse, + "tx mbufs that had to be collapsed"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_mbuf_defrag", CTLFLAG_RD, + &stats->tx_mbuf_defrag, + "tx mbufs that had to be defragged"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_mbuf_defrag_err", CTLFLAG_RD, + &stats->tx_mbuf_defrag_err, + "tx mbufs that failed defrag"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD, + &stats->tx_mbuf_dmamap_enomem_err, + "tx mbufs that could not be dma-mapped due to low mem"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_mbuf_dmamap_err", CTLFLAG_RD, + &stats->tx_mbuf_dmamap_err, + "tx mbufs that could not be dma-mapped"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_timeout", CTLFLAG_RD, + &stats->tx_timeout, + "detections of timed out packets on tx queues"); } static void @@ -185,6 +256,9 @@ gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, "adminq_destroy_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt", + CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0, + "adminq_get_ptype_map_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_dcfg_device_resources_cnt, 0, @@ -219,6 +293,175 @@ gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, &priv->reset_cnt, 0, "Times reset"); } +static int +gve_check_num_queues(struct gve_priv *priv, int val, bool is_rx) +{ + if (val < 1) { + device_printf(priv->dev, + "Requested num queues (%u) must be a positive 
integer\n", val); + return (EINVAL); + } + + if (val > (is_rx ? priv->rx_cfg.max_queues : priv->tx_cfg.max_queues)) { + device_printf(priv->dev, + "Requested num queues (%u) is too large\n", val); + return (EINVAL); + } + + return (0); +} + +static int +gve_sysctl_num_tx_queues(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->tx_cfg.num_queues; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_num_queues(priv, val, /*is_rx=*/false); + if (err != 0) + return (err); + + if (val != priv->tx_cfg.num_queues) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + err = gve_adjust_tx_queues(priv, val); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + + return (err); +} + +static int +gve_sysctl_num_rx_queues(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->rx_cfg.num_queues; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_num_queues(priv, val, /*is_rx=*/true); + + if (err != 0) + return (err); + + if (val != priv->rx_cfg.num_queues) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + err = gve_adjust_rx_queues(priv, val); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + + return (err); +} + +static int +gve_check_ring_size(struct gve_priv *priv, int val, bool is_rx) +{ + if (!powerof2(val) || val == 0) { + device_printf(priv->dev, + "Requested ring size (%u) must be a power of 2\n", val); + return (EINVAL); + } + + if (val < (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt)) { + device_printf(priv->dev, + "Requested ring size (%u) cannot be less than %d\n", val, + (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt)); + return (EINVAL); + } + + + if (val > (is_rx ? priv->max_rx_desc_cnt : priv->max_tx_desc_cnt)) { + device_printf(priv->dev, + "Requested ring size (%u) cannot be greater than %d\n", val, + (is_rx ? 
priv->max_rx_desc_cnt : priv->max_tx_desc_cnt)); + return (EINVAL); + } + + return (0); +} + +static int +gve_sysctl_tx_ring_size(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->tx_desc_cnt; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_ring_size(priv, val, /*is_rx=*/false); + if (err != 0) + return (err); + + if (val != priv->tx_desc_cnt) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/false); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + + return (err); +} + +static int +gve_sysctl_rx_ring_size(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->rx_desc_cnt; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_ring_size(priv, val, /*is_rx=*/true); + if (err != 0) + return (err); + + if (val != priv->rx_desc_cnt) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/true); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + + return (err); +} + +static void +gve_setup_sysctl_writables(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_tx_queues", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_num_tx_queues, "I", "Number of TX queues"); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_rx_queues", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_num_rx_queues, "I", "Number of RX queues"); + + if (priv->modify_ringsize_enabled) { + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_ring_size", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_tx_ring_size, "I", "TX ring size"); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ring_size", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_rx_ring_size, "I", "RX ring size"); + } +} + void gve_setup_sysctl(struct gve_priv *priv) { device_t dev; @@ -234,6 +477,7 @@ void gve_setup_sysctl(struct gve_priv *priv) gve_setup_queue_stat_sysctl(ctx, child, priv); gve_setup_adminq_stat_sysctl(ctx, child, priv); gve_setup_main_stat_sysctl(ctx, child, priv); + gve_setup_sysctl_writables(ctx, child, priv); } void diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index 1e62e1226be1..84e3a4c4eb9f 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 @@ -48,61 +49,112 @@ gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) } static void -gve_tx_free_ring(struct gve_priv *priv, int i) +gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; - /* Safe to call even if never alloced */ - gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); - - if (tx->br != NULL) { - buf_ring_free(tx->br, M_DEVBUF); - tx->br = NULL; + if (tx->desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->desc_ring = NULL; } - if (mtx_initialized(&tx->ring_mtx)) - mtx_destroy(&tx->ring_mtx); - if 
(tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } - if (tx->desc_ring != NULL) { - gve_dma_free_coherent(&tx->desc_ring_mem); - tx->desc_ring = NULL; + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; } +} + +static void +gve_tx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + /* Safe to call even if never alloced */ + gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + if (mtx_initialized(&tx->ring_mtx)) + mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } + + if (tx->br != NULL) { + buf_ring_free(tx->br, M_DEVBUF); + tx->br = NULL; + } + + if (gve_is_gqi(priv)) + gve_tx_free_ring_gqi(priv, i); + else + gve_tx_free_ring_dqo(priv, i); } static int -gve_tx_alloc_ring(struct gve_priv *priv, int i) +gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; - char mtx_name[16]; int err; - com->priv = priv; - com->id = i; + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->desc_ring = tx->desc_ring_mem.cpu_addr; - com->qpl = &priv->qpls[i]; + com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + /*single_kva=*/true); if (com->qpl == NULL) { - device_printf(priv->dev, "No QPL left for tx ring %d\n", i); - return (ENOMEM); + device_printf(priv->dev, + "Failed to alloc QPL for tx ring %d\n", i); + err = ENOMEM; + goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; - tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, + tx->info = malloc( + sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); + return (0); + +abort: + gve_tx_free_ring_gqi(priv, i); + return (err); +} + +static int +gve_tx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + char mtx_name[16]; + int err; + + com->priv = priv; + com->id = i; + + if (gve_is_gqi(priv)) + err = gve_tx_alloc_ring_gqi(priv, i); + else + err = gve_tx_alloc_ring_dqo(priv, i); + if (err != 0) + goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); @@ -115,19 +167,13 @@ gve_tx_alloc_ring(struct gve_priv *priv, int i) err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { - device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); + device_printf(priv->dev, + "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; - err = gve_dma_alloc_coherent(priv, - sizeof(union gve_tx_desc) * priv->tx_desc_cnt, - CACHE_LINE_SIZE, &tx->desc_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); - goto abort; - } - tx->desc_ring = tx->desc_ring_mem.cpu_addr; + tx->last_kicked = 0; return (0); @@ -137,39 +183,32 @@ abort: } int -gve_alloc_tx_rings(struct gve_priv *priv) +gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { - int err = 0; int i; + int err; - priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, - M_GVE, M_WAITOK | M_ZERO); + KASSERT(priv->tx != 
NULL, ("priv->tx is NULL!")); - for (i = 0; i < priv->tx_cfg.num_queues; i++) { + for (i = start_idx; i < stop_idx; i++) { err = gve_tx_alloc_ring(priv, i); if (err != 0) goto free_rings; - } return (0); - free_rings: - while (i--) - gve_tx_free_ring(priv, i); - free(priv->tx, M_GVE); + gve_free_tx_rings(priv, start_idx, i); return (err); } void -gve_free_tx_rings(struct gve_priv *priv) +gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; - for (i = 0; i < priv->tx_cfg.num_queues; i++) + for (i = start_idx; i < stop_idx; i++) gve_tx_free_ring(priv, i); - - free(priv->tx, M_GVE); } static void @@ -181,6 +220,7 @@ gve_tx_clear_desc_ring(struct gve_tx_ring *tx) for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; + gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec); } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, @@ -209,7 +249,11 @@ gve_start_tx_ring(struct gve_priv *priv, int i) struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; - NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); + atomic_store_bool(&tx->stopped, false); + if (gve_is_gqi(priv)) + NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); + else + NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", @@ -233,8 +277,12 @@ gve_create_tx_rings(struct gve_priv *priv) if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); - for (i = 0; i < priv->tx_cfg.num_queues; i++) - gve_clear_tx_ring(priv, i); + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + if (gve_is_gqi(priv)) + gve_clear_tx_ring(priv, i); + else + gve_clear_tx_ring_dqo(priv, i); + } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) @@ -300,6 +348,30 @@ gve_destroy_tx_rings(struct gve_priv *priv) } int +gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_buffer_state *info; + uint32_t pkt_idx; + int num_timeouts; + + num_timeouts = 0; + + for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) { + info = &tx->info[pkt_idx]; + + if (!gve_timestamp_valid(&info->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&info->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + +int gve_tx_intr(void *arg) { struct gve_tx_ring *tx = arg; @@ -351,7 +423,10 @@ gve_tx_cleanup_tq(void *arg, int pending) if (mbuf == NULL) continue; + gve_invalidate_timestamp(&info->enqueue_time_sec); + info->mbuf = NULL; + counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); @@ -375,7 +450,7 @@ gve_tx_cleanup_tq(void *arg, int pending) * interrupt but they will still be handled by the enqueue below. * Completions born after the barrier WILL trigger an interrupt. 
*/ - mb(); + atomic_thread_fence_seq_cst(); nic_done = gve_tx_load_event_counter(priv, tx); todo = nic_done - tx->done; @@ -383,6 +458,11 @@ gve_tx_cleanup_tq(void *arg, int pending) gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); } + + if (atomic_load_bool(&tx->stopped) && space_freed) { + atomic_store_bool(&tx->stopped, false); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } } static void @@ -627,8 +707,7 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); if (__predict_false(!gve_can_tx(tx, bytes_required))) { counter_enter(); - counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1); - counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); counter_exit(); return (ENOBUFS); } @@ -636,6 +715,8 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; + gve_set_timestamp(&info->enqueue_time_sec); + /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. @@ -689,19 +770,86 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) return (0); } +static int +gve_xmit_mbuf(struct gve_tx_ring *tx, + struct mbuf **mbuf) +{ + if (gve_is_gqi(tx->com.priv)) + return (gve_xmit(tx, *mbuf)); + + if (gve_is_qpl(tx->com.priv)) + return (gve_xmit_dqo_qpl(tx, *mbuf)); + + /* + * gve_xmit_dqo might attempt to defrag the mbuf chain. + * The reference is passed in so that in the case of + * errors, the new mbuf chain is what's put back on the br. + */ + return (gve_xmit_dqo(tx, mbuf)); +} + +/* + * Has the side-effect of stopping the xmit queue by setting tx->stopped + */ +static int +gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, + struct mbuf **mbuf) +{ + int err; + + atomic_store_bool(&tx->stopped, true); + + /* + * Room made in the queue BEFORE the barrier will be seen by the + * gve_xmit_mbuf retry below. + * + * If room is made in the queue AFTER the barrier, the cleanup tq + * iteration creating the room will either see a tx->stopped value + * of 0 or the 1 we just wrote: + * + * If it sees a 1, then it would enqueue the xmit tq. Enqueue + * implies a retry on the waiting pkt. + * + * If it sees a 0, then that implies a previous iteration overwrote + * our 1, and that iteration would enqueue the xmit tq. Enqueue + * implies a retry on the waiting pkt. + */ + atomic_thread_fence_seq_cst(); + + err = gve_xmit_mbuf(tx, mbuf); + if (err == 0) + atomic_store_bool(&tx->stopped, false); + + return (err); +} + static void gve_xmit_br(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; struct ifnet *ifp = priv->ifp; struct mbuf *mbuf; + int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { + err = gve_xmit_mbuf(tx, &mbuf); - if (__predict_false(gve_xmit(tx, mbuf) != 0)) { - drbr_putback(ifp, tx->br, mbuf); - taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + /* + * We need to stop this taskqueue when we can't xmit the pkt due + * to lack of space in the NIC ring (ENOBUFS). The retry exists + * to guard against a TOCTTOU bug that could end up freezing the + * queue forever. 
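 * One such interleaving, for illustration: gve_xmit_mbuf() sees a full
 * ring and returns ENOBUFS while tx->stopped is still false; the cleanup
 * taskqueue then frees space, reads stopped == false and so does not
 * wake the xmit taskqueue; only afterwards would this path have set
 * stopped. The retry, issued after setting stopped and a full fence,
 * is what closes that window.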
+ */ + if (__predict_false(mbuf != NULL && err == ENOBUFS)) + err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); + + if (__predict_false(err != 0 && mbuf != NULL)) { + if (err == EINVAL) { + drbr_advance(ifp, tx->br); + m_freem(mbuf); + } else + drbr_putback(ifp, tx->br, mbuf); break; } @@ -710,7 +858,12 @@ gve_xmit_br(struct gve_tx_ring *tx) bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); - gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + else + gve_db_bar_dqo_write_4(priv, tx->com.db_offset, + tx->dqo.desc_tail); } } @@ -763,7 +916,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) is_br_empty = drbr_empty(ifp, tx->br); err = drbr_enqueue(ifp, tx->br, mbuf); if (__predict_false(err != 0)) { - taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + if (!atomic_load_bool(&tx->stopped)) + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); @@ -778,9 +932,8 @@ gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { gve_xmit_br(tx); GVE_RING_UNLOCK(tx); - } else { + } else if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); - } return (0); } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c new file mode 100644 index 000000000000..551a7e308d19 --- /dev/null +++ b/sys/dev/gve/gve_tx_dqo.c @@ -0,0 +1,1149 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "opt_inet6.h" + +#include "gve.h" +#include "gve_dqo.h" + +static void +gve_unmap_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); +} + +static void +gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + pending_pkt->qpl_buf_head = -1; + pending_pkt->num_qpl_bufs = 0; +} + +static void +gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int i; + + for (i = 0; i < tx->dqo.num_pending_pkts; i++) { + pending_pkt = &tx->dqo.pending_pkts[i]; + if (!pending_pkt->mbuf) + continue; + + if (gve_is_qpl(tx->com.priv)) + gve_clear_qpl_pending_pkt(pending_pkt); + else + gve_unmap_packet(tx, pending_pkt); + + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + } +} + +void +gve_tx_free_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + int j; + + if (tx->dqo.desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->dqo.desc_ring = NULL; + } + + if (tx->dqo.compl_ring != NULL) { + gve_dma_free_coherent(&tx->dqo.compl_ring_mem); + tx->dqo.compl_ring = NULL; + } + + if (tx->dqo.pending_pkts != NULL) { + gve_free_tx_mbufs_dqo(tx); + + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { + for (j = 0; j < tx->dqo.num_pending_pkts; j++) + if (tx->dqo.pending_pkts[j].state != + GVE_PACKET_STATE_UNALLOCATED) + bus_dmamap_destroy(tx->dqo.buf_dmatag, + tx->dqo.pending_pkts[j].dmamap); + } + + free(tx->dqo.pending_pkts, M_GVE); + tx->dqo.pending_pkts = NULL; + } + + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) + bus_dma_tag_destroy(tx->dqo.buf_dmatag); + + if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { + free(tx->dqo.qpl_bufs, M_GVE); + tx->dqo.qpl_bufs = NULL; + } + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } +} + +static int +gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) +{ + struct gve_priv *priv = tx->com.priv; + int err; + int j; + + /* + * DMA tag for mapping Tx mbufs + * The maxsize, nsegments, and maxsegsize params should match + * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
+ */ + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + GVE_TSO_MAXSIZE_DQO, /* maxsize */ + GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ + GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &tx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + return (err); + } + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, + &tx->dqo.pending_pkts[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating pending pkt dmamap %d: %d", + j, err); + return (err); + } + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + + return (0); +} + +int +gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + uint16_t num_pending_pkts; + int err; + + /* Descriptor ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; + + /* Completion ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for tx ring %d", i); + goto abort; + } + tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; + + /* + * pending_pkts array + * + * The max number of pending packets determines the maximum number of + * descriptors which maybe written to the completion queue. + * + * We must set the number small enough to make sure we never overrun the + * completion queue. + */ + num_pending_pkts = priv->tx_desc_cnt; + /* + * Reserve space for descriptor completions, which will be reported at + * most every GVE_TX_MIN_RE_INTERVAL packets. 
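 * As a worked example with assumed values (tx_desc_cnt = 1024,
 * GVE_TX_MIN_RE_INTERVAL = 32): 1024 - 1024 / 32 = 992 pending-packet
 * slots, which leaves 32 completion-queue entries spare for descriptor
 * completions.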
+ */ + num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; + + tx->dqo.num_pending_pkts = num_pending_pkts; + tx->dqo.pending_pkts = malloc( + sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, + M_GVE, M_WAITOK | M_ZERO); + + if (gve_is_qpl(priv)) { + int qpl_buf_cnt; + + tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO, + /*single_kva*/false); + if (tx->com.qpl == NULL) { + device_printf(priv->dev, + "Failed to alloc QPL for tx ring %d", i); + err = ENOMEM; + goto abort; + } + + qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + + tx->dqo.qpl_bufs = malloc( + sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, + M_GVE, M_WAITOK | M_ZERO); + } else + gve_tx_alloc_rda_fields_dqo(tx); + return (0); + +abort: + gve_tx_free_ring_dqo(priv, i); + return (err); +} + +static void +gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, + struct gve_tx_metadata_dqo *metadata) +{ + uint32_t hash = mbuf->m_pkthdr.flowid; + uint16_t path_hash; + + metadata->version = GVE_TX_METADATA_VERSION_DQO; + if (hash) { + path_hash = hash ^ (hash >> 16); + + path_hash &= (1 << 15) - 1; + if (__predict_false(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void +gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, + uint32_t *desc_idx, uint32_t len, uint64_t addr, + int16_t compl_tag, bool eop, bool csum_enabled) +{ + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.desc_ring[*desc_idx].pkt; + uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = htole64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = csum_enabled, + .compl_tag = htole16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len -= cur_len; + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + } +} + +static void +gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = mbuf->m_pkthdr.len - header_len; + desc->mss = mbuf->m_pkthdr.tso_segsz; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +#define PULLUP_HDR(m, len) \ +do { \ + if (__predict_false((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (EINVAL); \ + } \ +} while (0) + +static int +gve_prep_tso(struct mbuf *mbuf, int *header_len) +{ + uint8_t l3_off, l4_off = 0; + struct ether_header *eh; + 
struct tcphdr *th; + u_short csum; + + PULLUP_HDR(mbuf, sizeof(*eh)); + eh = mtod(mbuf, struct ether_header *); + KASSERT(eh->ether_type != ETHERTYPE_VLAN, + ("VLAN-tagged packets not supported")); + l3_off = ETHER_HDR_LEN; + +#ifdef INET6 + if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) { + struct ip6_hdr *ip6; + + PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); + ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + sizeof(struct ip6_hdr); + csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, + /*csum=*/0); + } else +#endif + if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + struct ip *ip; + + PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); + ip = (struct ip *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + (ip->ip_hl << 2); + csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_TCP)); + } + + PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr *)); + th = (struct tcphdr *)(mtodo(mbuf, l4_off)); + *header_len = l4_off + (th->th_off << 2); + + /* + * Hardware requires the th->th_sum to not include the TCP payload, + * hence we recompute the csum with it excluded. + */ + th->th_sum = csum; + + return (0); +} + +static int +gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, + bool is_tso, uint32_t *desc_idx) +{ + struct gve_tx_general_context_desc_dqo *gen_desc; + struct gve_tx_tso_context_desc_dqo *tso_desc; + struct gve_tx_metadata_dqo metadata; + int header_len; + int err; + + metadata = (struct gve_tx_metadata_dqo){0}; + gve_extract_tx_metadata_dqo(mbuf, &metadata); + + if (is_tso) { + err = gve_prep_tso(mbuf, &header_len); + if (__predict_false(err)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_tsoerr, 1); + counter_exit(); + return (err); + } + + tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; + gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); + + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + + gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; + gve_tx_fill_general_ctx_desc(gen_desc, &metadata); + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + return (0); +} + +static int +gve_map_mbuf_dqo(struct gve_tx_ring *tx, + struct mbuf **mbuf, bus_dmamap_t dmamap, + bus_dma_segment_t *segs, int *nsegs, int attempt) +{ + struct mbuf *m_new = NULL; + int err; + + err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, + *mbuf, segs, nsegs, BUS_DMA_NOWAIT); + + switch (err) { + case __predict_true(0): + break; + case EFBIG: + if (__predict_false(attempt > 0)) + goto abort; + + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_collapse, 1); + counter_exit(); + + /* Try m_collapse before m_defrag */ + m_new = m_collapse(*mbuf, M_NOWAIT, + GVE_TX_MAX_DATA_DESCS_DQO); + if (m_new == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag, 1); + counter_exit(); + m_new = m_defrag(*mbuf, M_NOWAIT); + } + + if (__predict_false(m_new == NULL)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag_err, 1); + counter_exit(); + + m_freem(*mbuf); + *mbuf = NULL; + err = ENOMEM; + goto abort; + } else { + *mbuf = m_new; + return (gve_map_mbuf_dqo(tx, mbuf, dmamap, + segs, nsegs, ++attempt)); + } + case ENOMEM: + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_dmamap_enomem_err, 1); + counter_exit(); + goto abort; + default: + goto abort; + } + + return (0); + +abort: + counter_enter(); + counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); + 
counter_exit(); + return (err); +} + +static uint32_t +num_avail_desc_ring_slots(const struct gve_tx_ring *tx) +{ + uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & + tx->dqo.desc_mask; + + return (tx->dqo.desc_mask - num_used); +} + +static struct gve_tx_pending_pkt_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + int32_t index = tx->dqo.free_pending_pkts_csm; + struct gve_tx_pending_pkt_dqo *pending_pkt; + + /* + * No pending packets available in the consumer list, + * try to steal the producer list. + */ + if (__predict_false(index == -1)) { + tx->dqo.free_pending_pkts_csm = atomic_swap_32( + &tx->dqo.free_pending_pkts_prd, -1); + + index = tx->dqo.free_pending_pkts_csm; + if (__predict_false(index == -1)) + return (NULL); + } + + pending_pkt = &tx->dqo.pending_pkts[index]; + + /* Remove pending_pkt from the consumer list */ + tx->dqo.free_pending_pkts_csm = pending_pkt->next; + pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + gve_set_timestamp(&pending_pkt->enqueue_time_sec); + + return (pending_pkt); +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + int index = pending_pkt - tx->dqo.pending_pkts; + int32_t old_head; + + pending_pkt->state = GVE_PACKET_STATE_FREE; + + gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec); + + /* Add pending_pkt to the producer list */ + while (true) { + old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); + + pending_pkt->next = old_head; + if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, + old_head, index)) + break; + } +} + +/* + * Has the side-effect of retrieving the value of the last desc index + * processed by the NIC. hw_tx_head is written to by the completions-processing + * taskqueue upon receiving descriptor-completions. 
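 * An illustrative state, assuming desc_mask == 1023: with the cached
 * desc_head at 1000 and desc_tail at 20, (20 - 1000) & 1023 == 44 slots
 * count as used and 1023 - 44 == 979 as free; refreshing desc_head from
 * hw_tx_head can only grow that number, which is why the reload is done
 * lazily, only after the cheap cached check fails.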
+ */ +static bool +gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) +{ + if (needed_descs <= num_avail_desc_ring_slots(tx)) + return (true); + + tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); + if (needed_descs > num_avail_desc_ring_slots(tx)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_descring, 1); + counter_exit(); + return (false); + } + + return (true); +} + +static void +gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) +{ + uint32_t last_report_event_interval; + uint32_t last_desc_idx; + + last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; + last_report_event_interval = + (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; + + if (__predict_false(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; + tx->dqo.last_re_idx = last_desc_idx; + } +} + +static bool +gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) +{ + uint32_t available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + + tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( + &tx->dqo.qpl_bufs_produced); + available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + return (false); +} + +static int32_t +gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) +{ + int32_t buf = tx->dqo.free_qpl_bufs_csm; + + if (__predict_false(buf == -1)) { + tx->dqo.free_qpl_bufs_csm = atomic_swap_32( + &tx->dqo.free_qpl_bufs_prd, -1); + buf = tx->dqo.free_qpl_bufs_csm; + if (__predict_false(buf == -1)) + return (-1); + } + + tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; + tx->dqo.qpl_bufs_consumed++; + return (buf); +} + +/* + * Tx buffer i corresponds to + * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO + * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO + */ +static void +gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, + int32_t index, void **va, bus_addr_t *dma_addr) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << + GVE_TX_BUF_SHIFT_DQO; + + *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; + *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; +} + +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + + return (&tx->com.qpl->dmas[page_id]); +} + +static void +gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, + struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, + bool csum_enabled, int16_t completion_tag, + uint32_t *desc_idx) +{ + int32_t pkt_len = mbuf->m_pkthdr.len; + struct gve_dma_handle *dma; + uint32_t copy_offset = 0; + int32_t prev_buf = -1; + uint32_t copy_len; + bus_addr_t addr; + int32_t buf; + void *va; + + MPASS(pkt->num_qpl_bufs == 0); + MPASS(pkt->qpl_buf_head == -1); + + while (copy_offset < pkt_len) { + buf = gve_tx_alloc_qpl_buf(tx); + /* We already checked for availability */ + MPASS(buf != -1); + + gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); + copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); + m_copydata(mbuf, copy_offset, copy_len, va); + copy_offset += copy_len; + + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); + + gve_tx_fill_pkt_desc_dqo(tx, desc_idx, + copy_len, addr, 
completion_tag, + /*eop=*/copy_offset == pkt_len, + csum_enabled); + + /* Link all the qpl bufs for a packet */ + if (prev_buf == -1) + pkt->qpl_buf_head = buf; + else + tx->dqo.qpl_bufs[prev_buf] = buf; + + prev_buf = buf; + pkt->num_qpl_bufs++; + } + + tx->dqo.qpl_bufs[buf] = -1; +} + +int +gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) +{ + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_pending_pkt_dqo *pkt; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int csum_flags; + bool is_tso; + int nsegs; + int err; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); + /* Check if we have enough room in the desc ring */ + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); + counter_exit(); + return (ENOBUFS); + } + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + pkt->mbuf = mbuf; + + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort; + + gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, + has_csum_flag, completion_tag, &desc_idx); + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + +int +gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) +{ + bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_pending_pkt_dqo *pkt; + struct mbuf *mbuf = *mbuf_ptr; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int csum_flags; + bool is_tso; + int nsegs; + int err; + int i; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + /* + * This mbuf might end up needing more than 1 pkt desc. + * The actual number, `nsegs` is known only after the + * expensive gve_map_mbuf_dqo call. This check beneath + * exists to fail early when the desc ring is really full. + */ + total_descs_needed = 1 + /* general_ctx_desc */ + 1 + /* pkt_desc */ + (is_tso ? 
1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + + err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, + segs, &nsegs, /*attempt=*/0); + if (err) + goto abort; + mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ + pkt->mbuf = mbuf; + + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false( + !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { + err = ENOBUFS; + goto abort_with_dma; + } + + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort_with_dma; + + bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); + for (i = 0; i < nsegs; i++) { + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, + segs[i].ds_len, segs[i].ds_addr, + completion_tag, /*eop=*/i == (nsegs - 1), + has_csum_flag); + } + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort_with_dma: + gve_unmap_packet(tx, pkt); +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + +static void +gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pkt) +{ + int32_t buf = pkt->qpl_buf_head; + struct gve_dma_handle *dma; + int32_t qpl_buf_tail; + int32_t old_head; + int i; + + for (i = 0; i < pkt->num_qpl_bufs; i++) { + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); + qpl_buf_tail = buf; + buf = tx->dqo.qpl_bufs[buf]; + } + MPASS(buf == -1); + buf = qpl_buf_tail; + + while (true) { + old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); + tx->dqo.qpl_bufs[buf] = old_head; + + /* + * The "rel" ensures that the update to dqo.free_qpl_bufs_prd + * is visible only after the linked list from this pkt is + * attached above to old_head. + */ + if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, + old_head, pkt->qpl_buf_head)) + break; + } + /* + * The "rel" ensures that the update to dqo.qpl_bufs_produced is + * visible only adter the update to dqo.free_qpl_bufs_prd above. + */ + atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); + + gve_clear_qpl_pending_pkt(pkt); +} + +static uint64_t +gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, uint16_t compl_tag) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int32_t pkt_len; + + if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { + device_printf(priv->dev, "Invalid TX completion tag: %d\n", + compl_tag); + return (0); + } + + pending_pkt = &tx->dqo.pending_pkts[compl_tag]; + + /* Packet is allocated but not pending data completion. 
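 * The expected life cycle is GVE_PACKET_STATE_FREE ->
 * GVE_PACKET_STATE_PENDING_DATA_COMPL in gve_alloc_pending_packet() ->
 * back to GVE_PACKET_STATE_FREE in gve_free_pending_packet(); any other
 * state observed here most likely means a stale or duplicate
 * completion tag.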
*/ + if (__predict_false(pending_pkt->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + device_printf(priv->dev, + "No pending data completion: %d\n", compl_tag); + return (0); + } + + pkt_len = pending_pkt->mbuf->m_pkthdr.len; + + if (gve_is_qpl(priv)) + gve_reap_qpl_bufs_dqo(tx, pending_pkt); + else + gve_unmap_packet(tx, pending_pkt); + + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + gve_free_pending_packet(tx, pending_pkt); + return (pkt_len); +} + +int +gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int num_timeouts; + uint16_t pkt_idx; + + num_timeouts = 0; + for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) { + pending_pkt = &tx->dqo.pending_pkts[pkt_idx]; + + if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&pending_pkt->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + +int +gve_tx_intr_dqo(void *arg) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + struct gve_ring_com *com = &tx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + /* Interrupts are automatically masked */ + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static void +gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int i; + + for (i = 0; i < com->priv->tx_desc_cnt; i++) + tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int entries; + int i; + + entries = com->priv->tx_desc_cnt; + for (i = 0; i < entries; i++) + tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; + + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +void +gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + int j; + + tx->dqo.desc_head = 0; + tx->dqo.desc_tail = 0; + tx->dqo.desc_mask = priv->tx_desc_cnt - 1; + tx->dqo.last_re_idx = 0; + + tx->dqo.compl_head = 0; + tx->dqo.compl_mask = priv->tx_desc_cnt - 1; + atomic_store_32(&tx->dqo.hw_tx_head, 0); + tx->dqo.cur_gen_bit = 0; + + gve_free_tx_mbufs_dqo(tx); + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + if (gve_is_qpl(tx->com.priv)) + gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]); + gve_invalidate_timestamp( + &tx->dqo.pending_pkts[j].enqueue_time_sec); + tx->dqo.pending_pkts[j].next = + (j == tx->dqo.num_pending_pkts - 1) ? 
-1 : j + 1; + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + tx->dqo.free_pending_pkts_csm = 0; + atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); + + if (gve_is_qpl(priv)) { + int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + + for (j = 0; j < qpl_buf_cnt - 1; j++) + tx->dqo.qpl_bufs[j] = j + 1; + tx->dqo.qpl_bufs[j] = -1; + + tx->dqo.free_qpl_bufs_csm = 0; + atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); + atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); + tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; + tx->dqo.qpl_bufs_consumed = 0; + } + + gve_tx_clear_desc_ring_dqo(tx); + gve_tx_clear_compl_ring_dqo(tx); +} + +static uint8_t +gve_tx_get_gen_bit(uint8_t *desc) +{ + uint8_t byte; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. + */ + byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET); + return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0); +} + +static bool +gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) +{ + struct gve_tx_compl_desc_dqo *compl_desc; + uint64_t bytes_done = 0; + uint64_t pkts_done = 0; + uint16_t compl_tag; + int work_done = 0; + uint16_t tx_head; + uint16_t type; + + while (work_done < budget) { + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, + tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_POSTREAD); + + compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; + if (gve_tx_get_gen_bit((uint8_t *)compl_desc) == + tx->dqo.cur_gen_bit) + break; + + type = compl_desc->type; + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + tx_head = le16toh(compl_desc->tx_head); + atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + compl_tag = le16toh(compl_desc->completion_tag); + bytes_done += gve_handle_packet_completion(priv, + tx, compl_tag); + pkts_done++; + } + + tx->dqo.compl_head = (tx->dqo.compl_head + 1) & + tx->dqo.compl_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; + work_done++; + } + + /* + * Waking the xmit taskqueue has to occur after room has been made in + * the queue. 
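 * This fence pairs with the one in gve_xmit_retry_enobuf_mbuf(): either
 * that path's retry sees the room just freed here, or this path sees
 * tx->stopped == true and issues the wakeup below, so the waiting packet
 * makes progress either way.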
+ */ + atomic_thread_fence_seq_cst(); + if (atomic_load_bool(&tx->stopped) && work_done) { + atomic_store_bool(&tx->stopped, false); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } + + tx->done += work_done; /* tx->done is just a sysctl counter */ + counter_enter(); + counter_u64_add_protected(tx->stats.tbytes, bytes_done); + counter_u64_add_protected(tx->stats.tpackets, pkts_done); + counter_exit(); + + return (work_done == budget); +} + +void +gve_tx_cleanup_tq_dqo(void *arg, int pending) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + return; + } + + gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +} diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c index c05488770dbd..707b8f039d88 100644 --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -29,6 +29,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" +#include "gve_dqo.h" uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) @@ -49,6 +50,12 @@ gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) } void +gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->db_bar, offset, val); +} + +void gve_alloc_counters(counter_u64_t *stat, int num_stats) { int i; @@ -227,7 +234,7 @@ gve_free_irqs(struct gve_priv *priv) return; } - num_irqs = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues + 1; + num_irqs = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues + 1; for (i = 0; i < num_irqs; i++) { irq = &priv->irq_tbl[i]; @@ -261,8 +268,8 @@ gve_free_irqs(struct gve_priv *priv) int gve_alloc_irqs(struct gve_priv *priv) { - int num_tx = priv->tx_cfg.num_queues; - int num_rx = priv->rx_cfg.num_queues; + int num_tx = priv->tx_cfg.max_queues; + int num_rx = priv->rx_cfg.max_queues; int req_nvecs = num_tx + num_rx + 1; int got_nvecs = req_nvecs; struct gve_irq *irq; @@ -307,7 +314,8 @@ gve_alloc_irqs(struct gve_priv *priv) } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, - gve_tx_intr, NULL, &priv->tx[i], &irq->cookie); + gve_is_gqi(priv) ? gve_tx_intr : gve_tx_intr_dqo, NULL, + &priv->tx[i], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, " "err: %d\n", rid, i, err); @@ -334,7 +342,8 @@ gve_alloc_irqs(struct gve_priv *priv) } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, - gve_rx_intr, NULL, &priv->rx[j], &irq->cookie); + gve_is_gqi(priv) ? gve_rx_intr : gve_rx_intr_dqo, NULL, + &priv->rx[j], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " "err: %d\n", rid, j, err); @@ -374,6 +383,24 @@ abort: return (err); } +/* + * Builds register value to write to DQO IRQ doorbell to enable with specified + * ITR interval. + */ +static uint32_t +gve_setup_itr_interval_dqo(uint32_t interval_us) +{ + uint32_t result = GVE_ITR_ENABLE_BIT_DQO; + + /* Interval has 2us granularity. 
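 * For example, assuming a ratelimit of 50us purely for illustration:
 * 50 >> 1 == 25 hardware units, so the value written would be
 * GVE_ITR_ENABLE_BIT_DQO | (25 << GVE_ITR_INTERVAL_DQO_SHIFT).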
*/ + interval_us >>= 1; + + interval_us &= GVE_ITR_INTERVAL_DQO_MASK; + result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); + + return (result); +} + void gve_unmask_all_queue_irqs(struct gve_priv *priv) { @@ -383,11 +410,20 @@ gve_unmask_all_queue_irqs(struct gve_priv *priv) for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { tx = &priv->tx[idx]; - gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + else + gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, + gve_setup_itr_interval_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO)); } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { rx = &priv->rx[idx]; - gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + else + gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, + gve_setup_itr_interval_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO)); } } @@ -403,3 +439,46 @@ gve_mask_all_queue_irqs(struct gve_priv *priv) gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); } } + +/* + * In some cases, such as tracking timeout events, we must mark a timestamp as + * invalid when we do not want to consider its value. Such timestamps must be + * checked for validity before reading them. + */ +void +gve_invalidate_timestamp(int64_t *timestamp_sec) +{ + atomic_store_64(timestamp_sec, GVE_TIMESTAMP_INVALID); +} + +/* + * Returns 0 if the timestamp is invalid, otherwise returns the elapsed seconds + * since the timestamp was set. + */ +int64_t +gve_seconds_since(int64_t *timestamp_sec) +{ + struct bintime curr_time; + int64_t enqueued_time; + + getbintime(&curr_time); + enqueued_time = atomic_load_64(timestamp_sec); + if (enqueued_time == GVE_TIMESTAMP_INVALID) + return (0); + return ((int64_t)(curr_time.sec - enqueued_time)); +} + +void +gve_set_timestamp(int64_t *timestamp_sec) +{ + struct bintime curr_time; + + getbintime(&curr_time); + atomic_store_64(timestamp_sec, curr_time.sec); +} + +bool +gve_timestamp_valid(int64_t *timestamp_sec) +{ + return (atomic_load_64(timestamp_sec) != GVE_TIMESTAMP_INVALID); +} |
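The knobs registered by gve_setup_sysctl_writables() are plain read-write sysctl nodes, so the queue-count and (when modify_ringsize_enabled is set) ring-size adjustment paths can be exercised at runtime without reloading the driver. The userland sketch below is illustrative only: the leaf names ("num_tx_queues", "num_rx_queues", "tx_ring_size", "rx_ring_size") come from this patch, while the "dev.gve.0" prefix is an assumption based on the usual dev.<driver>.<unit> device sysctl layout, and any value written is still validated by gve_check_num_queues() or gve_check_ring_size().

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	/* Rejected by gve_check_num_queues() if 0 or above tx_cfg.max_queues. */
	unsigned int queues = 8;

	/* Node path is an assumption: dev.<driver>.<unit>.<leaf>. */
	if (sysctlbyname("dev.gve.0.num_tx_queues", NULL, NULL,
	    &queues, sizeof(queues)) != 0) {
		perror("sysctlbyname");
		return (1);
	}
	printf("requested %u TX queues\n", queues);
	return (0);
}

The same write can be made with sysctl(8), for example "sysctl dev.gve.0.tx_ring_size=1024", which gve_check_ring_size() accepts only if the value is a power of 2 within the advertised [min_tx_desc_cnt, max_tx_desc_cnt] range.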