Diffstat (limited to 'sys/dev/gve/gve_tx_dqo.c')
-rw-r--r--  sys/dev/gve/gve_tx_dqo.c  1149
1 file changed, 1149 insertions, 0 deletions
diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c
new file mode 100644
index 000000000000..551a7e308d19
--- /dev/null
+++ b/sys/dev/gve/gve_tx_dqo.c
@@ -0,0 +1,1149 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2024 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_inet6.h"
+
+#include "gve.h"
+#include "gve_dqo.h"
+
+static void
+gve_unmap_packet(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap,
+ BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap);
+}
+
+static void
+gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ pending_pkt->qpl_buf_head = -1;
+ pending_pkt->num_qpl_bufs = 0;
+}
+
+static void
+gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int i;
+
+ for (i = 0; i < tx->dqo.num_pending_pkts; i++) {
+ pending_pkt = &tx->dqo.pending_pkts[i];
+ if (!pending_pkt->mbuf)
+ continue;
+
+ if (gve_is_qpl(tx->com.priv))
+ gve_clear_qpl_pending_pkt(pending_pkt);
+ else
+ gve_unmap_packet(tx, pending_pkt);
+
+ m_freem(pending_pkt->mbuf);
+ pending_pkt->mbuf = NULL;
+ }
+}
+
+void
+gve_tx_free_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ struct gve_ring_com *com = &tx->com;
+ int j;
+
+ if (tx->dqo.desc_ring != NULL) {
+ gve_dma_free_coherent(&tx->desc_ring_mem);
+ tx->dqo.desc_ring = NULL;
+ }
+
+ if (tx->dqo.compl_ring != NULL) {
+ gve_dma_free_coherent(&tx->dqo.compl_ring_mem);
+ tx->dqo.compl_ring = NULL;
+ }
+
+ if (tx->dqo.pending_pkts != NULL) {
+ gve_free_tx_mbufs_dqo(tx);
+
+ if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) {
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++)
+ if (tx->dqo.pending_pkts[j].state !=
+ GVE_PACKET_STATE_UNALLOCATED)
+ bus_dmamap_destroy(tx->dqo.buf_dmatag,
+ tx->dqo.pending_pkts[j].dmamap);
+ }
+
+ free(tx->dqo.pending_pkts, M_GVE);
+ tx->dqo.pending_pkts = NULL;
+ }
+
+ if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag)
+ bus_dma_tag_destroy(tx->dqo.buf_dmatag);
+
+ if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) {
+ free(tx->dqo.qpl_bufs, M_GVE);
+ tx->dqo.qpl_bufs = NULL;
+ }
+
+ if (com->qpl != NULL) {
+ gve_free_qpl(priv, com->qpl);
+ com->qpl = NULL;
+ }
+}
+
+static int
+gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_priv *priv = tx->com.priv;
+ int err;
+ int j;
+
+ /*
+ * DMA tag for mapping Tx mbufs
+ * The maxsize, nsegments, and maxsegsize params should match
+ * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c.
+ */
+ err = bus_dma_tag_create(
+ bus_get_dma_tag(priv->dev), /* parent */
+ 1, 0, /* alignment, bounds */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ GVE_TSO_MAXSIZE_DQO, /* maxsize */
+ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */
+ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */
+ BUS_DMA_ALLOCNOW, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockarg */
+ &tx->dqo.buf_dmatag);
+ if (err != 0) {
+ device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n",
+ __func__, err);
+ return (err);
+ }
+
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
+ err = bus_dmamap_create(tx->dqo.buf_dmatag, 0,
+ &tx->dqo.pending_pkts[j].dmamap);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "err in creating pending pkt dmamap %d: %d",
+ j, err);
+ return (err);
+ }
+ tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
+ }
+
+ return (0);
+}
+
+int
+gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ uint16_t num_pending_pkts;
+ int err;
+
+ /* Descriptor ring */
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->desc_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc desc ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr;
+
+ /* Completion ring */
+ err = gve_dma_alloc_coherent(priv,
+ sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt,
+ CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem);
+ if (err != 0) {
+ device_printf(priv->dev,
+ "Failed to alloc compl ring for tx ring %d", i);
+ goto abort;
+ }
+ tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr;
+
+ /*
+ * pending_pkts array
+ *
+ * The max number of pending packets determines the maximum number of
+ * descriptors which may be written to the completion queue.
+ *
+ * We must set the number small enough to make sure we never overrun the
+ * completion queue.
+ */
+ num_pending_pkts = priv->tx_desc_cnt;
+ /*
+ * Reserve space for descriptor completions, which will be reported at
+ * most every GVE_TX_MIN_RE_INTERVAL packets.
+ */
+ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL;
+
+ tx->dqo.num_pending_pkts = num_pending_pkts;
+ tx->dqo.pending_pkts = malloc(
+ sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts,
+ M_GVE, M_WAITOK | M_ZERO);
+
+ if (gve_is_qpl(priv)) {
+ int qpl_buf_cnt;
+
+ tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO,
+ /*single_kva*/false);
+ if (tx->com.qpl == NULL) {
+ device_printf(priv->dev,
+ "Failed to alloc QPL for tx ring %d", i);
+ err = ENOMEM;
+ goto abort;
+ }
+
+ qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+ tx->com.qpl->num_pages;
+
+ tx->dqo.qpl_bufs = malloc(
+ sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt,
+ M_GVE, M_WAITOK | M_ZERO);
+ } else
+ gve_tx_alloc_rda_fields_dqo(tx);
+ return (0);
+
+abort:
+ gve_tx_free_ring_dqo(priv, i);
+ return (err);
+}
+
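+/*
+ * Folds the mbuf's 32-bit flowid (typically the RSS hash) into a 15-bit
+ * path hash for the Tx metadata. A result of zero is inverted, presumably
+ * because zero would read as "no hash supplied".
+ */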
+static void
+gve_extract_tx_metadata_dqo(const struct mbuf *mbuf,
+ struct gve_tx_metadata_dqo *metadata)
+{
+ uint32_t hash = mbuf->m_pkthdr.flowid;
+ uint16_t path_hash;
+
+ metadata->version = GVE_TX_METADATA_VERSION_DQO;
+ if (hash) {
+ path_hash = hash ^ (hash >> 16);
+
+ path_hash &= (1 << 15) - 1;
+ if (__predict_false(path_hash == 0))
+ path_hash = ~path_hash;
+
+ metadata->path_hash = path_hash;
+ }
+}
+
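+/*
+ * Writes packet descriptors for a single buffer, splitting it into chunks
+ * of at most GVE_TX_MAX_BUF_SIZE_DQO bytes. The end_of_packet bit is set
+ * only on the chunk that completes the packet.
+ */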
+static void
+gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx,
+ uint32_t *desc_idx, uint32_t len, uint64_t addr,
+ int16_t compl_tag, bool eop, bool csum_enabled)
+{
+ while (len > 0) {
+ struct gve_tx_pkt_desc_dqo *desc =
+ &tx->dqo.desc_ring[*desc_idx].pkt;
+ uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO);
+ bool cur_eop = eop && cur_len == len;
+
+ *desc = (struct gve_tx_pkt_desc_dqo){
+ .buf_addr = htole64(addr),
+ .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
+ .end_of_packet = cur_eop,
+ .checksum_offload_enable = csum_enabled,
+ .compl_tag = htole16(compl_tag),
+ .buf_size = cur_len,
+ };
+
+ addr += cur_len;
+ len -= cur_len;
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ }
+}
+
+static void
+gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
+ const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata,
+ int header_len)
+{
+ *desc = (struct gve_tx_tso_context_desc_dqo){
+ .header_len = header_len,
+ .cmd_dtype = {
+ .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
+ .tso = 1,
+ },
+ .flex0 = metadata->bytes[0],
+ .flex5 = metadata->bytes[5],
+ .flex6 = metadata->bytes[6],
+ .flex7 = metadata->bytes[7],
+ .flex8 = metadata->bytes[8],
+ .flex9 = metadata->bytes[9],
+ .flex10 = metadata->bytes[10],
+ .flex11 = metadata->bytes[11],
+ };
+ desc->tso_total_len = mbuf->m_pkthdr.len - header_len;
+ desc->mss = mbuf->m_pkthdr.tso_segsz;
+}
+
+static void
+gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
+ const struct gve_tx_metadata_dqo *metadata)
+{
+ *desc = (struct gve_tx_general_context_desc_dqo){
+ .flex0 = metadata->bytes[0],
+ .flex1 = metadata->bytes[1],
+ .flex2 = metadata->bytes[2],
+ .flex3 = metadata->bytes[3],
+ .flex4 = metadata->bytes[4],
+ .flex5 = metadata->bytes[5],
+ .flex6 = metadata->bytes[6],
+ .flex7 = metadata->bytes[7],
+ .flex8 = metadata->bytes[8],
+ .flex9 = metadata->bytes[9],
+ .flex10 = metadata->bytes[10],
+ .flex11 = metadata->bytes[11],
+ .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
+ };
+}
+
+#define PULLUP_HDR(m, len) \
+do { \
+ if (__predict_false((m)->m_len < (len))) { \
+ (m) = m_pullup((m), (len)); \
+ if ((m) == NULL) \
+ return (EINVAL); \
+ } \
+} while (0)
+
+static int
+gve_prep_tso(struct mbuf *mbuf, int *header_len)
+{
+ uint8_t l3_off, l4_off = 0;
+ struct ether_header *eh;
+ struct tcphdr *th;
+ u_short csum;
+
+ PULLUP_HDR(mbuf, sizeof(*eh));
+ eh = mtod(mbuf, struct ether_header *);
+ KASSERT(eh->ether_type != ETHERTYPE_VLAN,
+ ("VLAN-tagged packets not supported"));
+ l3_off = ETHER_HDR_LEN;
+
+#ifdef INET6
+ if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) {
+ struct ip6_hdr *ip6;
+
+ PULLUP_HDR(mbuf, l3_off + sizeof(*ip6));
+ ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off));
+ l4_off = l3_off + sizeof(struct ip6_hdr);
+ csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP,
+ /*csum=*/0);
+ } else
+#endif
+ if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
+ struct ip *ip;
+
+ PULLUP_HDR(mbuf, l3_off + sizeof(*ip));
+ ip = (struct ip *)(mtodo(mbuf, l3_off));
+ l4_off = l3_off + (ip->ip_hl << 2);
+ csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(IPPROTO_TCP));
+ }
+
+ PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr));
+ th = (struct tcphdr *)(mtodo(mbuf, l4_off));
+ *header_len = l4_off + (th->th_off << 2);
+
+ /*
+ * Hardware requires the th->th_sum to not include the TCP payload,
+ * hence we recompute the csum with it excluded.
+ */
+ th->th_sum = csum;
+
+ return (0);
+}
+
+static int
+gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf,
+ bool is_tso, uint32_t *desc_idx)
+{
+ struct gve_tx_general_context_desc_dqo *gen_desc;
+ struct gve_tx_tso_context_desc_dqo *tso_desc;
+ struct gve_tx_metadata_dqo metadata;
+ int header_len;
+ int err;
+
+ metadata = (struct gve_tx_metadata_dqo){0};
+ gve_extract_tx_metadata_dqo(mbuf, &metadata);
+
+ if (is_tso) {
+ err = gve_prep_tso(mbuf, &header_len);
+ if (__predict_false(err)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_tsoerr, 1);
+ counter_exit();
+ return (err);
+ }
+
+ tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx;
+ gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len);
+
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
+ counter_exit();
+ }
+
+ gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx;
+ gve_tx_fill_general_ctx_desc(gen_desc, &metadata);
+ *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+ return (0);
+}
+
+static int
+gve_map_mbuf_dqo(struct gve_tx_ring *tx,
+ struct mbuf **mbuf, bus_dmamap_t dmamap,
+ bus_dma_segment_t *segs, int *nsegs, int attempt)
+{
+ struct mbuf *m_new = NULL;
+ int err;
+
+ err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap,
+ *mbuf, segs, nsegs, BUS_DMA_NOWAIT);
+
+ switch (err) {
+ case __predict_true(0):
+ break;
+ case EFBIG:
+ if (__predict_false(attempt > 0))
+ goto abort;
+
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_collapse, 1);
+ counter_exit();
+
+ /* Try m_collapse before m_defrag */
+ m_new = m_collapse(*mbuf, M_NOWAIT,
+ GVE_TX_MAX_DATA_DESCS_DQO);
+ if (m_new == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_defrag, 1);
+ counter_exit();
+ m_new = m_defrag(*mbuf, M_NOWAIT);
+ }
+
+ if (__predict_false(m_new == NULL)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_defrag_err, 1);
+ counter_exit();
+
+ m_freem(*mbuf);
+ *mbuf = NULL;
+ err = ENOMEM;
+ goto abort;
+ } else {
+ *mbuf = m_new;
+ return (gve_map_mbuf_dqo(tx, mbuf, dmamap,
+ segs, nsegs, ++attempt));
+ }
+ case ENOMEM:
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_mbuf_dmamap_enomem_err, 1);
+ counter_exit();
+ goto abort;
+ default:
+ goto abort;
+ }
+
+ return (0);
+
+abort:
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1);
+ counter_exit();
+ return (err);
+}
+
+static uint32_t
+num_avail_desc_ring_slots(const struct gve_tx_ring *tx)
+{
+ uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) &
+ tx->dqo.desc_mask;
+
+ return (tx->dqo.desc_mask - num_used);
+}
+
+static struct gve_tx_pending_pkt_dqo *
+gve_alloc_pending_packet(struct gve_tx_ring *tx)
+{
+ int32_t index = tx->dqo.free_pending_pkts_csm;
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+
+ /*
+ * No pending packets available in the consumer list,
+ * try to steal the producer list.
+ */
+ if (__predict_false(index == -1)) {
+ tx->dqo.free_pending_pkts_csm = atomic_swap_32(
+ &tx->dqo.free_pending_pkts_prd, -1);
+
+ index = tx->dqo.free_pending_pkts_csm;
+ if (__predict_false(index == -1))
+ return (NULL);
+ }
+
+ pending_pkt = &tx->dqo.pending_pkts[index];
+
+ /* Remove pending_pkt from the consumer list */
+ tx->dqo.free_pending_pkts_csm = pending_pkt->next;
+ pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
+
+ gve_set_timestamp(&pending_pkt->enqueue_time_sec);
+
+ return (pending_pkt);
+}
+
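+/*
+ * Returns a pending packet to the free list. Frees happen on the
+ * completions-processing taskqueue while allocations happen on the xmit
+ * path, so freed entries are pushed onto a separate lock-free producer
+ * list with a CAS loop; gve_alloc_pending_packet steals that list
+ * wholesale once its consumer list runs dry.
+ */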
+static void
+gve_free_pending_packet(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pending_pkt)
+{
+ int index = pending_pkt - tx->dqo.pending_pkts;
+ int32_t old_head;
+
+ pending_pkt->state = GVE_PACKET_STATE_FREE;
+
+ gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec);
+
+ /* Add pending_pkt to the producer list */
+ while (true) {
+ old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd);
+
+ pending_pkt->next = old_head;
+ if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd,
+ old_head, index))
+ break;
+ }
+}
+
+/*
+ * Has the side-effect of retrieving the value of the last desc index
+ * processed by the NIC. hw_tx_head is written to by the completions-processing
+ * taskqueue upon receiving descriptor-completions.
+ */
+static bool
+gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs)
+{
+ if (needed_descs <= num_avail_desc_ring_slots(tx))
+ return (true);
+
+ tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head);
+ if (needed_descs > num_avail_desc_ring_slots(tx)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_descring, 1);
+ counter_exit();
+ return (false);
+ }
+
+ return (true);
+}
+
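+/*
+ * Sets the report_event bit on the last descriptor of the packet if at
+ * least GVE_TX_MIN_RE_INTERVAL descriptors have been posted since the
+ * previous report-event descriptor. This caps descriptor completions at
+ * one per GVE_TX_MIN_RE_INTERVAL descriptors, which the pending_pkts
+ * sizing in gve_tx_alloc_ring_dqo relies on.
+ */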
+static void
+gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx)
+{
+ uint32_t last_report_event_interval;
+ uint32_t last_desc_idx;
+
+ last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask;
+ last_report_event_interval =
+ (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask;
+
+ if (__predict_false(last_report_event_interval >=
+ GVE_TX_MIN_RE_INTERVAL)) {
+ tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true;
+ tx->dqo.last_re_idx = last_desc_idx;
+ }
+}
+
+static bool
+gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs)
+{
+ uint32_t available = tx->dqo.qpl_bufs_produced_cached -
+ tx->dqo.qpl_bufs_consumed;
+
+ if (__predict_true(available >= num_bufs))
+ return (true);
+
+ tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32(
+ &tx->dqo.qpl_bufs_produced);
+ available = tx->dqo.qpl_bufs_produced_cached -
+ tx->dqo.qpl_bufs_consumed;
+
+ if (__predict_true(available >= num_bufs))
+ return (true);
+ return (false);
+}
+
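+/*
+ * Pops one buffer index off the QPL-buffer consumer free list. If that
+ * list is empty, the producer list (refilled by gve_reap_qpl_bufs_dqo on
+ * the completion path) is stolen atomically. Returns -1 if no buffer is
+ * available.
+ */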
+static int32_t
+gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx)
+{
+ int32_t buf = tx->dqo.free_qpl_bufs_csm;
+
+ if (__predict_false(buf == -1)) {
+ tx->dqo.free_qpl_bufs_csm = atomic_swap_32(
+ &tx->dqo.free_qpl_bufs_prd, -1);
+ buf = tx->dqo.free_qpl_bufs_csm;
+ if (__predict_false(buf == -1))
+ return (-1);
+ }
+
+ tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf];
+ tx->dqo.qpl_bufs_consumed++;
+ return (buf);
+}
+
+/*
+ * Tx buffer i corresponds to
+ * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
+ * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
+ */
+static void
+gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx,
+ int32_t index, void **va, bus_addr_t *dma_addr)
+{
+ int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+ int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) <<
+ GVE_TX_BUF_SHIFT_DQO;
+
+ *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset;
+ *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset;
+}
+
+static struct gve_dma_handle *
+gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index)
+{
+ int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+
+ return (&tx->com.qpl->dmas[page_id]);
+}
+
+static void
+gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx,
+ struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt,
+ bool csum_enabled, int16_t completion_tag,
+ uint32_t *desc_idx)
+{
+ int32_t pkt_len = mbuf->m_pkthdr.len;
+ struct gve_dma_handle *dma;
+ uint32_t copy_offset = 0;
+ int32_t prev_buf = -1;
+ uint32_t copy_len;
+ bus_addr_t addr;
+ int32_t buf;
+ void *va;
+
+ MPASS(pkt->num_qpl_bufs == 0);
+ MPASS(pkt->qpl_buf_head == -1);
+
+ while (copy_offset < pkt_len) {
+ buf = gve_tx_alloc_qpl_buf(tx);
+ /* We already checked for availability */
+ MPASS(buf != -1);
+
+ gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr);
+ copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset);
+ m_copydata(mbuf, copy_offset, copy_len, va);
+ copy_offset += copy_len;
+
+ dma = gve_get_page_dma_handle(tx, buf);
+ bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
+
+ gve_tx_fill_pkt_desc_dqo(tx, desc_idx,
+ copy_len, addr, completion_tag,
+ /*eop=*/copy_offset == pkt_len,
+ csum_enabled);
+
+ /* Link all the qpl bufs for a packet */
+ if (prev_buf == -1)
+ pkt->qpl_buf_head = buf;
+ else
+ tx->dqo.qpl_bufs[prev_buf] = buf;
+
+ prev_buf = buf;
+ pkt->num_qpl_bufs++;
+ }
+
+ tx->dqo.qpl_bufs[buf] = -1;
+}
+
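+/*
+ * QPL transmit path: the mbuf chain is copied into pre-registered QPL
+ * buffers rather than DMA-mapped directly, so no per-packet dmamap load
+ * is needed. gve_xmit_dqo below handles the RDA case.
+ */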
+int
+gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf)
+{
+ uint32_t desc_idx = tx->dqo.desc_tail;
+ struct gve_tx_pending_pkt_dqo *pkt;
+ int total_descs_needed;
+ int16_t completion_tag;
+ bool has_csum_flag;
+ int csum_flags;
+ bool is_tso;
+ int nsegs;
+ int err;
+
+ csum_flags = mbuf->m_pkthdr.csum_flags;
+ has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
+ is_tso = csum_flags & CSUM_TSO;
+
+ nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO);
+ /* Check if we have enough room in the desc ring */
+ total_descs_needed = 1 + /* general_ctx_desc */
+ nsegs + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
+ return (ENOBUFS);
+
+ if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+
+ pkt = gve_alloc_pending_packet(tx);
+ if (pkt == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_compring, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+ completion_tag = pkt - tx->dqo.pending_pkts;
+ pkt->mbuf = mbuf;
+
+ err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+ if (err)
+ goto abort;
+
+ gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt,
+ has_csum_flag, completion_tag, &desc_idx);
+
+ /* Remember the index of the last desc written */
+ tx->dqo.desc_tail = desc_idx;
+
+ /*
+ * Request a descriptor completion on the last descriptor of the
+ * packet if we are allowed to by the HW enforced interval.
+ */
+ gve_tx_request_desc_compl(tx, desc_idx);
+
+ tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
+ return (0);
+
+abort:
+ pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pkt);
+ return (err);
+}
+
+int
+gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr)
+{
+ bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO];
+ uint32_t desc_idx = tx->dqo.desc_tail;
+ struct gve_tx_pending_pkt_dqo *pkt;
+ struct mbuf *mbuf = *mbuf_ptr;
+ int total_descs_needed;
+ int16_t completion_tag;
+ bool has_csum_flag;
+ int csum_flags;
+ bool is_tso;
+ int nsegs;
+ int err;
+ int i;
+
+ csum_flags = mbuf->m_pkthdr.csum_flags;
+ has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
+ is_tso = csum_flags & CSUM_TSO;
+
+ /*
+ * This mbuf might end up needing more than 1 pkt desc.
+ * The actual number, `nsegs`, is known only after the
+ * expensive gve_map_mbuf_dqo call. The check below
+ * exists to fail early when the desc ring is really full.
+ */
+ total_descs_needed = 1 + /* general_ctx_desc */
+ 1 + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
+ return (ENOBUFS);
+
+ pkt = gve_alloc_pending_packet(tx);
+ if (pkt == NULL) {
+ counter_enter();
+ counter_u64_add_protected(
+ tx->stats.tx_delayed_pkt_nospace_compring, 1);
+ counter_exit();
+ return (ENOBUFS);
+ }
+ completion_tag = pkt - tx->dqo.pending_pkts;
+
+ err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap,
+ segs, &nsegs, /*attempt=*/0);
+ if (err)
+ goto abort;
+ mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */
+ pkt->mbuf = mbuf;
+
+ total_descs_needed = 1 + /* general_ctx_desc */
+ nsegs + /* pkt_desc */
+ (is_tso ? 1 : 0); /* tso_ctx_desc */
+ if (__predict_false(
+ !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) {
+ err = ENOBUFS;
+ goto abort_with_dma;
+ }
+
+ err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+ if (err)
+ goto abort_with_dma;
+
+ bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE);
+ for (i = 0; i < nsegs; i++) {
+ gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
+ segs[i].ds_len, segs[i].ds_addr,
+ completion_tag, /*eop=*/i == (nsegs - 1),
+ has_csum_flag);
+ }
+
+ /* Remember the index of the last desc written */
+ tx->dqo.desc_tail = desc_idx;
+
+ /*
+ * Request a descriptor completion on the last descriptor of the
+ * packet if we are allowed to by the HW enforced interval.
+ */
+ gve_tx_request_desc_compl(tx, desc_idx);
+
+ tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
+ return (0);
+
+abort_with_dma:
+ gve_unmap_packet(tx, pkt);
+abort:
+ pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pkt);
+ return (err);
+}
+
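+/*
+ * Walks the packet's linked list of QPL buffers, syncs their pages, and
+ * splices the whole list onto the free-buffer producer list with a CAS
+ * loop so that the xmit path can reuse the buffers.
+ */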
+static void
+gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx,
+ struct gve_tx_pending_pkt_dqo *pkt)
+{
+ int32_t buf = pkt->qpl_buf_head;
+ struct gve_dma_handle *dma;
+ int32_t qpl_buf_tail;
+ int32_t old_head;
+ int i;
+
+ for (i = 0; i < pkt->num_qpl_bufs; i++) {
+ dma = gve_get_page_dma_handle(tx, buf);
+ bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE);
+ qpl_buf_tail = buf;
+ buf = tx->dqo.qpl_bufs[buf];
+ }
+ MPASS(buf == -1);
+ buf = qpl_buf_tail;
+
+ while (true) {
+ old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd);
+ tx->dqo.qpl_bufs[buf] = old_head;
+
+ /*
+ * The "rel" ensures that the update to dqo.free_qpl_bufs_prd
+ * is visible only after the linked list from this pkt is
+ * attached above to old_head.
+ */
+ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd,
+ old_head, pkt->qpl_buf_head))
+ break;
+ }
+ /*
+ * The "rel" ensures that the update to dqo.qpl_bufs_produced is
+ * visible only after the update to dqo.free_qpl_bufs_prd above.
+ */
+ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs);
+
+ gve_clear_qpl_pending_pkt(pkt);
+}
+
+static uint64_t
+gve_handle_packet_completion(struct gve_priv *priv,
+ struct gve_tx_ring *tx, uint16_t compl_tag)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int32_t pkt_len;
+
+ if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) {
+ device_printf(priv->dev, "Invalid TX completion tag: %d\n",
+ compl_tag);
+ return (0);
+ }
+
+ pending_pkt = &tx->dqo.pending_pkts[compl_tag];
+
+ /* Packet is allocated but not pending data completion. */
+ if (__predict_false(pending_pkt->state !=
+ GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+ device_printf(priv->dev,
+ "No pending data completion: %d\n", compl_tag);
+ return (0);
+ }
+
+ pkt_len = pending_pkt->mbuf->m_pkthdr.len;
+
+ if (gve_is_qpl(priv))
+ gve_reap_qpl_bufs_dqo(tx, pending_pkt);
+ else
+ gve_unmap_packet(tx, pending_pkt);
+
+ m_freem(pending_pkt->mbuf);
+ pending_pkt->mbuf = NULL;
+ gve_free_pending_packet(tx, pending_pkt);
+ return (pkt_len);
+}
+
+int
+gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx)
+{
+ struct gve_tx_pending_pkt_dqo *pending_pkt;
+ int num_timeouts;
+ uint16_t pkt_idx;
+
+ num_timeouts = 0;
+ for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) {
+ pending_pkt = &tx->dqo.pending_pkts[pkt_idx];
+
+ if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec))
+ continue;
+
+ if (__predict_false(
+ gve_seconds_since(&pending_pkt->enqueue_time_sec) >
+ GVE_TX_TIMEOUT_PKT_SEC))
+ num_timeouts += 1;
+ }
+
+ return (num_timeouts);
+}
+
+int
+gve_tx_intr_dqo(void *arg)
+{
+ struct gve_tx_ring *tx = arg;
+ struct gve_priv *priv = tx->com.priv;
+ struct gve_ring_com *com = &tx->com;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return (FILTER_STRAY);
+
+ /* Interrupts are automatically masked */
+ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
+ return (FILTER_HANDLED);
+}
+
+static void
+gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_ring_com *com = &tx->com;
+ int i;
+
+ for (i = 0; i < com->priv->tx_desc_cnt; i++)
+ tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){};
+
+ bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+static void
+gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx)
+{
+ struct gve_ring_com *com = &tx->com;
+ int entries;
+ int i;
+
+ entries = com->priv->tx_desc_cnt;
+ for (i = 0; i < entries; i++)
+ tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){};
+
+ bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_PREWRITE);
+}
+
+void
+gve_clear_tx_ring_dqo(struct gve_priv *priv, int i)
+{
+ struct gve_tx_ring *tx = &priv->tx[i];
+ int j;
+
+ tx->dqo.desc_head = 0;
+ tx->dqo.desc_tail = 0;
+ tx->dqo.desc_mask = priv->tx_desc_cnt - 1;
+ tx->dqo.last_re_idx = 0;
+
+ tx->dqo.compl_head = 0;
+ tx->dqo.compl_mask = priv->tx_desc_cnt - 1;
+ atomic_store_32(&tx->dqo.hw_tx_head, 0);
+ tx->dqo.cur_gen_bit = 0;
+
+ gve_free_tx_mbufs_dqo(tx);
+
+ for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
+ if (gve_is_qpl(tx->com.priv))
+ gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]);
+ gve_invalidate_timestamp(
+ &tx->dqo.pending_pkts[j].enqueue_time_sec);
+ tx->dqo.pending_pkts[j].next =
+ (j == tx->dqo.num_pending_pkts - 1) ? -1 : j + 1;
+ tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
+ }
+ tx->dqo.free_pending_pkts_csm = 0;
+ atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1);
+
+ if (gve_is_qpl(priv)) {
+ int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+ tx->com.qpl->num_pages;
+
+ for (j = 0; j < qpl_buf_cnt - 1; j++)
+ tx->dqo.qpl_bufs[j] = j + 1;
+ tx->dqo.qpl_bufs[j] = -1;
+
+ tx->dqo.free_qpl_bufs_csm = 0;
+ atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1);
+ atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt);
+ tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt;
+ tx->dqo.qpl_bufs_consumed = 0;
+ }
+
+ gve_tx_clear_desc_ring_dqo(tx);
+ gve_tx_clear_compl_ring_dqo(tx);
+}
+
+static uint8_t
+gve_tx_get_gen_bit(uint8_t *desc)
+{
+ uint8_t byte;
+
+ /*
+ * Prevent generation bit from being read after the rest of the
+ * descriptor.
+ */
+ byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET);
+ return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0);
+}
+
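+/*
+ * Processes up to "budget" completion-ring entries. An entry whose
+ * generation bit still equals dqo.cur_gen_bit has not been written by the
+ * NIC yet; the NIC alternates the bit it writes on each pass over the
+ * ring, and cur_gen_bit is flipped whenever compl_head wraps. Descriptor
+ * completions advance hw_tx_head; packet completions free the associated
+ * pending packet. Returns true if the budget was exhausted, in which case
+ * the caller reschedules itself.
+ */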
+static bool
+gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget)
+{
+ struct gve_tx_compl_desc_dqo *compl_desc;
+ uint64_t bytes_done = 0;
+ uint64_t pkts_done = 0;
+ uint16_t compl_tag;
+ int work_done = 0;
+ uint16_t tx_head;
+ uint16_t type;
+
+ while (work_done < budget) {
+ bus_dmamap_sync(tx->dqo.compl_ring_mem.tag,
+ tx->dqo.compl_ring_mem.map,
+ BUS_DMASYNC_POSTREAD);
+
+ compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head];
+ if (gve_tx_get_gen_bit((uint8_t *)compl_desc) ==
+ tx->dqo.cur_gen_bit)
+ break;
+
+ type = compl_desc->type;
+ if (type == GVE_COMPL_TYPE_DQO_DESC) {
+ /* This is the last descriptor fetched by HW plus one */
+ tx_head = le16toh(compl_desc->tx_head);
+ atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head);
+ } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
+ compl_tag = le16toh(compl_desc->completion_tag);
+ bytes_done += gve_handle_packet_completion(priv,
+ tx, compl_tag);
+ pkts_done++;
+ }
+
+ tx->dqo.compl_head = (tx->dqo.compl_head + 1) &
+ tx->dqo.compl_mask;
+ /* Flip the generation bit when we wrap around */
+ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0;
+ work_done++;
+ }
+
+ /*
+ * Waking the xmit taskqueue has to occur after room has been made in
+ * the queue.
+ */
+ atomic_thread_fence_seq_cst();
+ if (atomic_load_bool(&tx->stopped) && work_done) {
+ atomic_store_bool(&tx->stopped, false);
+ taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
+ }
+
+ tx->done += work_done; /* tx->done is just a sysctl counter */
+ counter_enter();
+ counter_u64_add_protected(tx->stats.tbytes, bytes_done);
+ counter_u64_add_protected(tx->stats.tpackets, pkts_done);
+ counter_exit();
+
+ return (work_done == budget);
+}
+
+void
+gve_tx_cleanup_tq_dqo(void *arg, int pending)
+{
+ struct gve_tx_ring *tx = arg;
+ struct gve_priv *priv = tx->com.priv;
+
+ if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
+ return;
+
+ if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) {
+ taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
+ return;
+ }
+
+ gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset,
+ GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
+}