Diffstat (limited to 'sys/dev/gve/gve_tx_dqo.c')
-rw-r--r-- | sys/dev/gve/gve_tx_dqo.c | 1149
1 file changed, 1149 insertions, 0 deletions
diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c new file mode 100644 index 000000000000..551a7e308d19 --- /dev/null +++ b/sys/dev/gve/gve_tx_dqo.c @@ -0,0 +1,1149 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "opt_inet6.h" + +#include "gve.h" +#include "gve_dqo.h" + +static void +gve_unmap_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); +} + +static void +gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + pending_pkt->qpl_buf_head = -1; + pending_pkt->num_qpl_bufs = 0; +} + +static void +gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int i; + + for (i = 0; i < tx->dqo.num_pending_pkts; i++) { + pending_pkt = &tx->dqo.pending_pkts[i]; + if (!pending_pkt->mbuf) + continue; + + if (gve_is_qpl(tx->com.priv)) + gve_clear_qpl_pending_pkt(pending_pkt); + else + gve_unmap_packet(tx, pending_pkt); + + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + } +} + +void +gve_tx_free_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + int j; + + if (tx->dqo.desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->dqo.desc_ring = NULL; + } + + if (tx->dqo.compl_ring != NULL) { + gve_dma_free_coherent(&tx->dqo.compl_ring_mem); + tx->dqo.compl_ring = NULL; + } + + if (tx->dqo.pending_pkts != NULL) { + gve_free_tx_mbufs_dqo(tx); + + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { + for (j = 0; j < tx->dqo.num_pending_pkts; j++) + if (tx->dqo.pending_pkts[j].state != + GVE_PACKET_STATE_UNALLOCATED) + bus_dmamap_destroy(tx->dqo.buf_dmatag, + tx->dqo.pending_pkts[j].dmamap); + } + + free(tx->dqo.pending_pkts, M_GVE); + tx->dqo.pending_pkts = NULL; + } + + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) + bus_dma_tag_destroy(tx->dqo.buf_dmatag); + + if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { + free(tx->dqo.qpl_bufs, M_GVE); + tx->dqo.qpl_bufs = NULL; + } + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } +} + +static int +gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) +{ + struct gve_priv *priv = tx->com.priv; + int err; + int j; + + /* + * DMA tag for mapping Tx mbufs + * The maxsize, nsegments, and maxsegsize params should match + * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
+ */ + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + GVE_TSO_MAXSIZE_DQO, /* maxsize */ + GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ + GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &tx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + return (err); + } + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, + &tx->dqo.pending_pkts[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating pending pkt dmamap %d: %d", + j, err); + return (err); + } + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + + return (0); +} + +int +gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + uint16_t num_pending_pkts; + int err; + + /* Descriptor ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; + + /* Completion ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for tx ring %d", i); + goto abort; + } + tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; + + /* + * pending_pkts array + * + * The max number of pending packets determines the maximum number of + * descriptors which may be written to the completion queue. + * + * We must set the number small enough to make sure we never overrun the + * completion queue. + */ + num_pending_pkts = priv->tx_desc_cnt; + /* + * Reserve space for descriptor completions, which will be reported at + * most every GVE_TX_MIN_RE_INTERVAL packets.
+ */ + num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; + + tx->dqo.num_pending_pkts = num_pending_pkts; + tx->dqo.pending_pkts = malloc( + sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, + M_GVE, M_WAITOK | M_ZERO); + + if (gve_is_qpl(priv)) { + int qpl_buf_cnt; + + tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO, + /*single_kva*/false); + if (tx->com.qpl == NULL) { + device_printf(priv->dev, + "Failed to alloc QPL for tx ring %d", i); + err = ENOMEM; + goto abort; + } + + qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + + tx->dqo.qpl_bufs = malloc( + sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, + M_GVE, M_WAITOK | M_ZERO); + } else + gve_tx_alloc_rda_fields_dqo(tx); + return (0); + +abort: + gve_tx_free_ring_dqo(priv, i); + return (err); +} + +static void +gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, + struct gve_tx_metadata_dqo *metadata) +{ + uint32_t hash = mbuf->m_pkthdr.flowid; + uint16_t path_hash; + + metadata->version = GVE_TX_METADATA_VERSION_DQO; + if (hash) { + path_hash = hash ^ (hash >> 16); + + path_hash &= (1 << 15) - 1; + if (__predict_false(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void +gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, + uint32_t *desc_idx, uint32_t len, uint64_t addr, + int16_t compl_tag, bool eop, bool csum_enabled) +{ + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.desc_ring[*desc_idx].pkt; + uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = htole64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = csum_enabled, + .compl_tag = htole16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len -= cur_len; + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + } +} + +static void +gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = mbuf->m_pkthdr.len - header_len; + desc->mss = mbuf->m_pkthdr.tso_segsz; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +#define PULLUP_HDR(m, len) \ +do { \ + if (__predict_false((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (EINVAL); \ + } \ +} while (0) + +static int +gve_prep_tso(struct mbuf *mbuf, int *header_len) +{ + uint8_t l3_off, l4_off = 0; + struct ether_header *eh; + 
struct tcphdr *th; + u_short csum; + + PULLUP_HDR(mbuf, sizeof(*eh)); + eh = mtod(mbuf, struct ether_header *); + KASSERT(eh->ether_type != ETHERTYPE_VLAN, + ("VLAN-tagged packets not supported")); + l3_off = ETHER_HDR_LEN; + +#ifdef INET6 + if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) { + struct ip6_hdr *ip6; + + PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); + ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + sizeof(struct ip6_hdr); + csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, + /*csum=*/0); + } else +#endif + if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + struct ip *ip; + + PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); + ip = (struct ip *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + (ip->ip_hl << 2); + csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_TCP)); + } + + PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr *)); + th = (struct tcphdr *)(mtodo(mbuf, l4_off)); + *header_len = l4_off + (th->th_off << 2); + + /* + * Hardware requires the th->th_sum to not include the TCP payload, + * hence we recompute the csum with it excluded. + */ + th->th_sum = csum; + + return (0); +} + +static int +gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, + bool is_tso, uint32_t *desc_idx) +{ + struct gve_tx_general_context_desc_dqo *gen_desc; + struct gve_tx_tso_context_desc_dqo *tso_desc; + struct gve_tx_metadata_dqo metadata; + int header_len; + int err; + + metadata = (struct gve_tx_metadata_dqo){0}; + gve_extract_tx_metadata_dqo(mbuf, &metadata); + + if (is_tso) { + err = gve_prep_tso(mbuf, &header_len); + if (__predict_false(err)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_tsoerr, 1); + counter_exit(); + return (err); + } + + tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; + gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); + + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + + gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; + gve_tx_fill_general_ctx_desc(gen_desc, &metadata); + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + return (0); +} + +static int +gve_map_mbuf_dqo(struct gve_tx_ring *tx, + struct mbuf **mbuf, bus_dmamap_t dmamap, + bus_dma_segment_t *segs, int *nsegs, int attempt) +{ + struct mbuf *m_new = NULL; + int err; + + err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, + *mbuf, segs, nsegs, BUS_DMA_NOWAIT); + + switch (err) { + case __predict_true(0): + break; + case EFBIG: + if (__predict_false(attempt > 0)) + goto abort; + + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_collapse, 1); + counter_exit(); + + /* Try m_collapse before m_defrag */ + m_new = m_collapse(*mbuf, M_NOWAIT, + GVE_TX_MAX_DATA_DESCS_DQO); + if (m_new == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag, 1); + counter_exit(); + m_new = m_defrag(*mbuf, M_NOWAIT); + } + + if (__predict_false(m_new == NULL)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag_err, 1); + counter_exit(); + + m_freem(*mbuf); + *mbuf = NULL; + err = ENOMEM; + goto abort; + } else { + *mbuf = m_new; + return (gve_map_mbuf_dqo(tx, mbuf, dmamap, + segs, nsegs, ++attempt)); + } + case ENOMEM: + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_dmamap_enomem_err, 1); + counter_exit(); + goto abort; + default: + goto abort; + } + + return (0); + +abort: + counter_enter(); + counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); + 
counter_exit(); + return (err); +} + +static uint32_t +num_avail_desc_ring_slots(const struct gve_tx_ring *tx) +{ + uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & + tx->dqo.desc_mask; + + return (tx->dqo.desc_mask - num_used); +} + +static struct gve_tx_pending_pkt_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + int32_t index = tx->dqo.free_pending_pkts_csm; + struct gve_tx_pending_pkt_dqo *pending_pkt; + + /* + * No pending packets available in the consumer list, + * try to steal the producer list. + */ + if (__predict_false(index == -1)) { + tx->dqo.free_pending_pkts_csm = atomic_swap_32( + &tx->dqo.free_pending_pkts_prd, -1); + + index = tx->dqo.free_pending_pkts_csm; + if (__predict_false(index == -1)) + return (NULL); + } + + pending_pkt = &tx->dqo.pending_pkts[index]; + + /* Remove pending_pkt from the consumer list */ + tx->dqo.free_pending_pkts_csm = pending_pkt->next; + pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + gve_set_timestamp(&pending_pkt->enqueue_time_sec); + + return (pending_pkt); +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + int index = pending_pkt - tx->dqo.pending_pkts; + int32_t old_head; + + pending_pkt->state = GVE_PACKET_STATE_FREE; + + gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec); + + /* Add pending_pkt to the producer list */ + while (true) { + old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); + + pending_pkt->next = old_head; + if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, + old_head, index)) + break; + } +} + +/* + * Has the side-effect of retrieving the value of the last desc index + * processed by the NIC. hw_tx_head is written to by the completions-processing + * taskqueue upon receiving descriptor-completions. 
+ */ +static bool +gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) +{ + if (needed_descs <= num_avail_desc_ring_slots(tx)) + return (true); + + tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); + if (needed_descs > num_avail_desc_ring_slots(tx)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_descring, 1); + counter_exit(); + return (false); + } + + return (true); +} + +static void +gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) +{ + uint32_t last_report_event_interval; + uint32_t last_desc_idx; + + last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; + last_report_event_interval = + (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; + + if (__predict_false(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; + tx->dqo.last_re_idx = last_desc_idx; + } +} + +static bool +gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) +{ + uint32_t available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + + tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( + &tx->dqo.qpl_bufs_produced); + available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + return (false); +} + +static int32_t +gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) +{ + int32_t buf = tx->dqo.free_qpl_bufs_csm; + + if (__predict_false(buf == -1)) { + tx->dqo.free_qpl_bufs_csm = atomic_swap_32( + &tx->dqo.free_qpl_bufs_prd, -1); + buf = tx->dqo.free_qpl_bufs_csm; + if (__predict_false(buf == -1)) + return (-1); + } + + tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; + tx->dqo.qpl_bufs_consumed++; + return (buf); +} + +/* + * Tx buffer i corresponds to + * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO + * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO + */ +static void +gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, + int32_t index, void **va, bus_addr_t *dma_addr) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << + GVE_TX_BUF_SHIFT_DQO; + + *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; + *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; +} + +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + + return (&tx->com.qpl->dmas[page_id]); +} + +static void +gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, + struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, + bool csum_enabled, int16_t completion_tag, + uint32_t *desc_idx) +{ + int32_t pkt_len = mbuf->m_pkthdr.len; + struct gve_dma_handle *dma; + uint32_t copy_offset = 0; + int32_t prev_buf = -1; + uint32_t copy_len; + bus_addr_t addr; + int32_t buf; + void *va; + + MPASS(pkt->num_qpl_bufs == 0); + MPASS(pkt->qpl_buf_head == -1); + + while (copy_offset < pkt_len) { + buf = gve_tx_alloc_qpl_buf(tx); + /* We already checked for availability */ + MPASS(buf != -1); + + gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); + copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); + m_copydata(mbuf, copy_offset, copy_len, va); + copy_offset += copy_len; + + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); + + gve_tx_fill_pkt_desc_dqo(tx, desc_idx, + copy_len, addr,
completion_tag, + /*eop=*/copy_offset == pkt_len, + csum_enabled); + + /* Link all the qpl bufs for a packet */ + if (prev_buf == -1) + pkt->qpl_buf_head = buf; + else + tx->dqo.qpl_bufs[prev_buf] = buf; + + prev_buf = buf; + pkt->num_qpl_bufs++; + } + + tx->dqo.qpl_bufs[buf] = -1; +} + +int +gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) +{ + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_pending_pkt_dqo *pkt; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int csum_flags; + bool is_tso; + int nsegs; + int err; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); + /* Check if we have enough room in the desc ring */ + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); + counter_exit(); + return (ENOBUFS); + } + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + pkt->mbuf = mbuf; + + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort; + + gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, + has_csum_flag, completion_tag, &desc_idx); + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + +int +gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) +{ + bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_pending_pkt_dqo *pkt; + struct mbuf *mbuf = *mbuf_ptr; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int csum_flags; + bool is_tso; + int nsegs; + int err; + int i; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + /* + * This mbuf might end up needing more than 1 pkt desc. + * The actual number, `nsegs` is known only after the + * expensive gve_map_mbuf_dqo call. This check beneath + * exists to fail early when the desc ring is really full. + */ + total_descs_needed = 1 + /* general_ctx_desc */ + 1 + /* pkt_desc */ + (is_tso ? 
1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + + err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, + segs, &nsegs, /*attempt=*/0); + if (err) + goto abort; + mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ + pkt->mbuf = mbuf; + + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false( + !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { + err = ENOBUFS; + goto abort_with_dma; + } + + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort_with_dma; + + bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); + for (i = 0; i < nsegs; i++) { + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, + segs[i].ds_len, segs[i].ds_addr, + completion_tag, /*eop=*/i == (nsegs - 1), + has_csum_flag); + } + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort_with_dma: + gve_unmap_packet(tx, pkt); +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + +static void +gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pkt) +{ + int32_t buf = pkt->qpl_buf_head; + struct gve_dma_handle *dma; + int32_t qpl_buf_tail; + int32_t old_head; + int i; + + for (i = 0; i < pkt->num_qpl_bufs; i++) { + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); + qpl_buf_tail = buf; + buf = tx->dqo.qpl_bufs[buf]; + } + MPASS(buf == -1); + buf = qpl_buf_tail; + + while (true) { + old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); + tx->dqo.qpl_bufs[buf] = old_head; + + /* + * The "rel" ensures that the update to dqo.free_qpl_bufs_prd + * is visible only after the linked list from this pkt is + * attached above to old_head. + */ + if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, + old_head, pkt->qpl_buf_head)) + break; + } + /* + * The "rel" ensures that the update to dqo.qpl_bufs_produced is + * visible only after the update to dqo.free_qpl_bufs_prd above. + */ + atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); + + gve_clear_qpl_pending_pkt(pkt); +} + +static uint64_t +gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, uint16_t compl_tag) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int32_t pkt_len; + + if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { + device_printf(priv->dev, "Invalid TX completion tag: %d\n", + compl_tag); + return (0); + } + + pending_pkt = &tx->dqo.pending_pkts[compl_tag]; + + /* Packet is allocated but not pending data completion.
*/ + if (__predict_false(pending_pkt->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + device_printf(priv->dev, + "No pending data completion: %d\n", compl_tag); + return (0); + } + + pkt_len = pending_pkt->mbuf->m_pkthdr.len; + + if (gve_is_qpl(priv)) + gve_reap_qpl_bufs_dqo(tx, pending_pkt); + else + gve_unmap_packet(tx, pending_pkt); + + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + gve_free_pending_packet(tx, pending_pkt); + return (pkt_len); +} + +int +gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int num_timeouts; + uint16_t pkt_idx; + + num_timeouts = 0; + for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) { + pending_pkt = &tx->dqo.pending_pkts[pkt_idx]; + + if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&pending_pkt->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + +int +gve_tx_intr_dqo(void *arg) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + struct gve_ring_com *com = &tx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + /* Interrupts are automatically masked */ + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static void +gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int i; + + for (i = 0; i < com->priv->tx_desc_cnt; i++) + tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int entries; + int i; + + entries = com->priv->tx_desc_cnt; + for (i = 0; i < entries; i++) + tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; + + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +void +gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + int j; + + tx->dqo.desc_head = 0; + tx->dqo.desc_tail = 0; + tx->dqo.desc_mask = priv->tx_desc_cnt - 1; + tx->dqo.last_re_idx = 0; + + tx->dqo.compl_head = 0; + tx->dqo.compl_mask = priv->tx_desc_cnt - 1; + atomic_store_32(&tx->dqo.hw_tx_head, 0); + tx->dqo.cur_gen_bit = 0; + + gve_free_tx_mbufs_dqo(tx); + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + if (gve_is_qpl(tx->com.priv)) + gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]); + gve_invalidate_timestamp( + &tx->dqo.pending_pkts[j].enqueue_time_sec); + tx->dqo.pending_pkts[j].next = + (j == tx->dqo.num_pending_pkts - 1) ? 
-1 : j + 1; + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + tx->dqo.free_pending_pkts_csm = 0; + atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); + + if (gve_is_qpl(priv)) { + int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + + for (j = 0; j < qpl_buf_cnt - 1; j++) + tx->dqo.qpl_bufs[j] = j + 1; + tx->dqo.qpl_bufs[j] = -1; + + tx->dqo.free_qpl_bufs_csm = 0; + atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); + atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); + tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; + tx->dqo.qpl_bufs_consumed = 0; + } + + gve_tx_clear_desc_ring_dqo(tx); + gve_tx_clear_compl_ring_dqo(tx); +} + +static uint8_t +gve_tx_get_gen_bit(uint8_t *desc) +{ + uint8_t byte; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. + */ + byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET); + return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0); +} + +static bool +gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) +{ + struct gve_tx_compl_desc_dqo *compl_desc; + uint64_t bytes_done = 0; + uint64_t pkts_done = 0; + uint16_t compl_tag; + int work_done = 0; + uint16_t tx_head; + uint16_t type; + + while (work_done < budget) { + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, + tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_POSTREAD); + + compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; + if (gve_tx_get_gen_bit((uint8_t *)compl_desc) == + tx->dqo.cur_gen_bit) + break; + + type = compl_desc->type; + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + tx_head = le16toh(compl_desc->tx_head); + atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + compl_tag = le16toh(compl_desc->completion_tag); + bytes_done += gve_handle_packet_completion(priv, + tx, compl_tag); + pkts_done++; + } + + tx->dqo.compl_head = (tx->dqo.compl_head + 1) & + tx->dqo.compl_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; + work_done++; + } + + /* + * Waking the xmit taskqueue has to occur after room has been made in + * the queue. + */ + atomic_thread_fence_seq_cst(); + if (atomic_load_bool(&tx->stopped) && work_done) { + atomic_store_bool(&tx->stopped, false); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } + + tx->done += work_done; /* tx->done is just a sysctl counter */ + counter_enter(); + counter_u64_add_protected(tx->stats.tbytes, bytes_done); + counter_u64_add_protected(tx->stats.tpackets, pkts_done); + counter_exit(); + + return (work_done == budget); +} + +void +gve_tx_cleanup_tq_dqo(void *arg, int pending) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + return; + } + + gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +} |
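
Editor's note on the pending-packet pool: gve_alloc_pending_packet() and gve_free_pending_packet() in the diff above split the free list in two, a consumer list owned by the transmit path and a producer list pushed onto by the completion path with a CAS loop; when the consumer list runs dry, the transmit path steals the entire producer list with one atomic swap. The following is a minimal, standalone C sketch of that handoff under assumed simplified types, it is not driver code: the names next, free_csm, free_prd, NPKTS, put_free and get_free are hypothetical, C11 atomics stand in for FreeBSD's atomic(9), and the demo runs single-threaded.

/*
 * Sketch of a two-list lock-free free list: one private consumer list,
 * one shared producer list that the consumer steals wholesale.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NPKTS	8

static int next[NPKTS];			/* next[i] links free entries; -1 terminates */
static int free_csm = -1;		/* consumer list, touched only by the xmit path */
static _Atomic int free_prd = -1;	/* producer list, pushed by completions */

/* Completion path: push entry i onto the producer list (CAS loop). */
static void
put_free(int i)
{
	int old_head;

	do {
		old_head = atomic_load(&free_prd);
		next[i] = old_head;
	} while (!atomic_compare_exchange_weak(&free_prd, &old_head, i));
}

/* Transmit path: pop one entry, stealing the producer list if empty. */
static int
get_free(void)
{
	int i;

	if (free_csm == -1)
		free_csm = atomic_exchange(&free_prd, -1);
	i = free_csm;
	if (i == -1)
		return (-1);
	free_csm = next[i];
	return (i);
}

int
main(void)
{
	int i;

	for (i = 0; i < NPKTS; i++)	/* seed the producer list */
		put_free(i);
	while ((i = get_free()) != -1)
		printf("allocated pending packet %d\n", i);
	return (0);
}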
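Editor's note on the QPL buffer math: the comment ahead of gve_tx_buf_get_addr_dqo() maps buffer i to qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO and qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO, which the function computes with shifts and masks. The standalone sketch below checks that the two forms agree; the EX_* constants are assumed example values (4 KiB pages, 2 KiB buffers), not the definitions from gve_dqo.h.

/* Illustration only: shift/mask form vs. divide/modulo form of the QPL index math. */
#include <assert.h>
#include <stdio.h>

#define EX_PAGE_SHIFT		12				/* assumed 4 KiB page */
#define EX_TX_BUF_SHIFT		11				/* assumed 2 KiB buffer */
#define EX_TX_BUF_SIZE		(1 << EX_TX_BUF_SHIFT)
#define EX_BUFS_PER_PAGE	(1 << (EX_PAGE_SHIFT - EX_TX_BUF_SHIFT))

int
main(void)
{
	int index;

	for (index = 0; index < 8; index++) {
		/* Shift/mask form, as in gve_tx_buf_get_addr_dqo(). */
		int page_id = index >> (EX_PAGE_SHIFT - EX_TX_BUF_SHIFT);
		int offset = (index & (EX_BUFS_PER_PAGE - 1)) << EX_TX_BUF_SHIFT;

		/* Equivalent divide/modulo form from the comment. */
		assert(page_id == index / EX_BUFS_PER_PAGE);
		assert(offset == (index % EX_BUFS_PER_PAGE) * EX_TX_BUF_SIZE);

		printf("buf %d -> page %d, offset %d\n", index, page_id, offset);
	}
	return (0);
}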