aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/amd64/apic_vector.S6
-rw-r--r--sys/amd64/amd64/pmap.c11
-rw-r--r--sys/amd64/include/vmparam.h2
-rw-r--r--sys/amd64/pt/pt.c978
-rw-r--r--sys/amd64/pt/pt.h49
-rw-r--r--sys/amd64/vmm/intel/vmx_support.S6
-rw-r--r--sys/arm64/arm64/pmap.c12
-rw-r--r--sys/cddl/boot/zfs/zfsimpl.h2
-rw-r--r--sys/conf/files13
-rw-r--r--sys/conf/files.amd644
-rw-r--r--sys/dev/gpio/gpiobus.c24
-rw-r--r--sys/dev/ice/ice_features.h2
-rw-r--r--sys/dev/ice/ice_iflib.h16
-rw-r--r--sys/dev/ice/ice_iov.c1856
-rw-r--r--sys/dev/ice/ice_iov.h125
-rw-r--r--sys/dev/ice/ice_lib.c28
-rw-r--r--sys/dev/ice/ice_lib.h4
-rw-r--r--sys/dev/ice/ice_vf_mbx.c471
-rw-r--r--sys/dev/ice/ice_vf_mbx.h67
-rw-r--r--sys/dev/ice/if_ice_iflib.c132
-rw-r--r--sys/dev/md/md.c6
-rw-r--r--sys/dev/mgb/if_mgb.c2
-rw-r--r--sys/dev/mlx5/mlx5_accel/ipsec.h8
-rw-r--r--sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c16
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c3
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_rx.c2
-rw-r--r--sys/dev/qlnx/qlnxe/qlnx_os.c6
-rw-r--r--sys/dev/random/fortuna.c7
-rw-r--r--sys/dev/random/random_harvestq.c232
-rw-r--r--sys/dev/random/random_harvestq.h2
-rw-r--r--sys/dev/random/randomdev.c2
-rw-r--r--sys/dev/ufshci/ufshci_private.h4
-rw-r--r--sys/dev/ufshci/ufshci_req_sdb.c45
-rw-r--r--sys/fs/fdescfs/fdesc_vnops.c9
-rw-r--r--sys/fs/msdosfs/msdosfs_conv.c11
-rw-r--r--sys/fs/nfsserver/nfs_nfsdserv.c5
-rw-r--r--sys/fs/p9fs/p9fs_vnops.c8
-rw-r--r--sys/fs/udf/ecma167-udf.h4
-rw-r--r--sys/fs/udf/udf_vfsops.c7
-rw-r--r--sys/fs/udf/udf_vnops.c48
-rw-r--r--sys/i386/i386/pmap.c10
-rw-r--r--sys/kern/subr_asan.c3
-rw-r--r--sys/kern/sys_generic.c6
-rw-r--r--sys/kern/vfs_cache.c13
-rw-r--r--sys/kern/vfs_inotify.c5
-rw-r--r--sys/modules/Makefile2
-rw-r--r--sys/modules/ice/Makefile1
-rw-r--r--sys/modules/pt/Makefile8
-rw-r--r--sys/modules/qlnx/qlnxe/Makefile1
-rw-r--r--sys/net/ethernet.h6
-rw-r--r--sys/net/if_ethersubr.c5
-rw-r--r--sys/net80211/ieee80211_hostap.c7
-rw-r--r--sys/net80211/ieee80211_ht.c13
-rw-r--r--sys/net80211/ieee80211_vht.c4
-rw-r--r--sys/net80211/ieee80211_vht.h3
-rw-r--r--sys/netipsec/ipsec.c6
-rw-r--r--sys/netipsec/ipsec_offload.c25
-rw-r--r--sys/netipsec/ipsec_offload.h16
-rw-r--r--sys/netipsec/key.c2
-rw-r--r--sys/netpfil/pf/pf.c20
-rw-r--r--sys/netpfil/pf/pf_ioctl.c202
-rw-r--r--sys/netpfil/pf/pf_lb.c9
-rw-r--r--sys/netpfil/pf/pf_table.c23
-rw-r--r--sys/rpc/clnt_rc.c7
-rw-r--r--sys/rpc/rpcsec_tls/rpctls_impl.c8
-rw-r--r--sys/sys/exterrvar.h1
-rw-r--r--sys/sys/inotify.h14
-rw-r--r--sys/sys/param.h2
-rw-r--r--sys/sys/random.h3
-rw-r--r--sys/vm/swap_pager.c23
-rw-r--r--sys/vm/vm_pagequeue.h6
71 files changed, 4457 insertions, 232 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..5bb877a174f7 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
.text
SUPERALIGN_TEXT
/* End Of Interrupt to APIC */
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2c7777e608b9..14f57ca94ba7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -7561,6 +7561,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
PG_RW = pmap_rw_bit(pmap);
KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
("pmap_enter_pde: newpde is missing PG_M"));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7689,6 +7692,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
if (pdpg != NULL)
pmap_abort_ptp(pmap, va, pdpg);
+ else {
+ KASSERT(va >= VM_MAXUSER_ADDRESS &&
+ (*pde & (PG_PS | PG_V)) == PG_V,
+ ("pmap_enter_pde: invalid kernel PDE"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_pde: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 59053665dc40..e4cc05cbb889 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -249,7 +249,7 @@
*/
#define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit)
#define VIRT_IN_DMAP(va) \
- ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high)
+ ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
#define PMAP_HAS_DMAP 1
#define PHYS_TO_DMAP(x) __extension__ ({ \
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ * 'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF 0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+ uint64_t rtit_ctl;
+ uint64_t rtit_output_base;
+ uint64_t rtit_output_mask_ptrs;
+ uint64_t rtit_status;
+ uint64_t rtit_cr3_match;
+ uint64_t rtit_addr0_a;
+ uint64_t rtit_addr0_b;
+ uint64_t rtit_addr1_a;
+ uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ int id;
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+ size_t xsave_area_size;
+ size_t xstate_hdr_offset;
+ size_t pt_xsave_offset;
+} pt_info __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+ return ((struct xstate_hdr *)(ctx->save_area +
+ pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+ return ((struct pt_ext_area *)(ctx->save_area +
+ pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves(save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors(save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu;
+
+ cpu = &pt_pcpu[curcpu];
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ int nranges_supp, n, error = 0;
+
+ pt_ext = pt_ctx_get_ext_area(ctx);
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+ switch (n) {
+ case 2:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+ pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+ pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+ case 1:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+ pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+ pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+ break;
+ default:
+ error = (EINVAL);
+ break;
+ };
+ } else
+ error = (ENXIO);
+
+ return (error);
+}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+ M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx->save_area == NULL)
+ return (ENOMEM);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ free(pt_ctx->save_area, M_PT);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ if (pt_ctx->save_area != NULL)
+ free(pt_ctx->save_area, M_PT);
+ memset(pt_ctx, 0, sizeof(*pt_ctx));
+ pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = pt_ctx_get_ext_area(pt_ctx);
+ hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ XSTATE_XCOMP_BV_COMPACT;
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+ return (-1);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+ return (-1);
+
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return (-1);
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
+ * and releases all previously allocated ToPA metadata.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+ /* Omit M_WAITOK since this might get invoked a non-sleepable context */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+ reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+ return (0);
+ }
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+ pt_cpu_toggle_local(ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ printf("pt: failed to setup interrupt line\n");
+ return (error);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU support Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+ if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+ printf("pt: CPU does not support managing PT state using XSAVE\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+ printf("pt: CPU does not support XSAVES/XRSTORS\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+ pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+ pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+ XFEATURE_ENABLED_PT, true, true);
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+ if (!pt_supported() || pt_init() != 0) {
+ return (ENXIO);
+ }
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
#include "vmx_assym.h"
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index a09da794e77d..459cc8ebe505 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -5709,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(ADDR_IS_CANONICAL(va),
("%s: Address not in canonical form: %lx", __func__, va));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
@@ -5828,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
if (l2pg != NULL)
pmap_abort_ptp(pmap, va, l2pg);
+ else {
+ KASSERT(ADDR_IS_KERNEL(va) &&
+ (pmap_load(l2) & ATTR_DESCR_MASK) ==
+ L2_TABLE,
+ ("pmap_enter_l2: invalid kernel L2E"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_l2: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 0ce38384abbf..83d964360343 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2019,6 +2019,7 @@ typedef struct vdev {
vdev_list_t v_children; /* children of this vdev */
const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
+ uint64_t v_txg; /* most recent transaction */
uint64_t v_id; /* index in parent */
uint64_t v_psize; /* physical device capacity */
int v_ashift; /* offset to block shift */
@@ -2048,7 +2049,6 @@ typedef struct spa {
STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
char *spa_name; /* pool name */
uint64_t spa_guid; /* pool guid */
- uint64_t spa_txg; /* most recent transaction */
struct uberblock *spa_uberblock; /* best uberblock so far */
vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t *spa_mos; /* MOS for this pool */
diff --git a/sys/conf/files b/sys/conf/files
index 74d251c2b608..dd0d390962f2 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3227,6 +3227,19 @@ dev/uart/uart_if.m optional uart
dev/uart/uart_subr.c optional uart
dev/uart/uart_tty.c optional uart
#
+# Universal Flash Storage Host Controller Interface drivers
+#
+dev/ufshci/ufshci.c optional ufshci
+dev/ufshci/ufshci_ctrlr.c optional ufshci
+dev/ufshci/ufshci_ctrlr_cmd.c optional ufshci
+dev/ufshci/ufshci_dev.c optional ufshci
+dev/ufshci/ufshci_pci.c optional ufshci
+dev/ufshci/ufshci_req_queue.c optional ufshci
+dev/ufshci/ufshci_req_sdb.c optional ufshci
+dev/ufshci/ufshci_sim.c optional ufshci
+dev/ufshci/ufshci_sysctl.c optional ufshci
+dev/ufshci/ufshci_uic_cmd.c optional ufshci
+#
# USB controller drivers
#
dev/usb/controller/musb_otg.c optional musb
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 0584fc29d039..678d288c2d86 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m optional ice pci \
compile-with "${NORMAL_M} -I$S/dev/ice"
dev/ice/ice_ddp_common.c optional ice pci \
compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
ice_ddp.c optional ice_ddp \
compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \
no-ctfconvert no-implicit-rule before-depend local \
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index ab7f13177969..764bcb7e6ee8 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -110,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags,
res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1,
alloc_flags);
if (res == NULL) {
- intr_free_intr_map_data((struct intr_map_data *)gpio_data);
+ intr_unmap_irq(irq);
return (NULL);
}
- rman_set_virtual(res, gpio_data);
return (res);
}
#else
@@ -866,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid,
end, count, flags));
}
+static int
+gpiobus_release_resource(device_t dev, device_t child, struct resource *r)
+{
+ int err;
+#ifdef INTRNG
+ u_int irq;
+
+ irq = rman_get_start(r);
+ MPASS(irq == rman_get_end(r));
+#endif
+ err = bus_generic_rman_release_resource(dev, child, r);
+ if (err != 0)
+ return (err);
+#ifdef INTRNG
+ intr_unmap_irq(irq);
+#endif
+ return (0);
+}
+
static struct resource_list *
gpiobus_get_resource_list(device_t bus __unused, device_t child)
{
@@ -1060,7 +1078,7 @@ static device_method_t gpiobus_methods[] = {
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource),
DEVMETHOD(bus_alloc_resource, gpiobus_alloc_resource),
- DEVMETHOD(bus_release_resource, bus_generic_rman_release_resource),
+ DEVMETHOD(bus_release_resource, gpiobus_release_resource),
DEVMETHOD(bus_activate_resource, bus_generic_rman_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_rman_deactivate_resource),
DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list),
diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h
index 821abe4806ca..5b23757b1c98 100644
--- a/sys/dev/ice/ice_features.h
+++ b/sys/dev/ice/ice_features.h
@@ -91,7 +91,9 @@ enum feat_list {
static inline void
ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap)
{
+#ifndef PCI_IOV
ice_clear_bit(ICE_FEATURE_SRIOV, bitmap);
+#endif
#ifndef DEV_NETMAP
ice_clear_bit(ICE_FEATURE_NETMAP, bitmap);
#endif
diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h
index 3a5dc201189a..e1d5307a9516 100644
--- a/sys/dev/ice/ice_iflib.h
+++ b/sys/dev/ice/ice_iflib.h
@@ -139,6 +139,9 @@ struct ice_irq_vector {
* @tc: traffic class queue belongs to
* @q_handle: qidx in tc; used in TXQ enable functions
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_tx_queue {
@@ -153,6 +156,9 @@ struct ice_tx_queue {
u32 me;
u16 q_handle;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
/* descriptor writeback status */
qidx_t *tx_rsq;
@@ -175,6 +181,9 @@ struct ice_tx_queue {
* @stats: queue statistics
* @tc: traffic class queue belongs to
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_rx_queue {
@@ -187,6 +196,9 @@ struct ice_rx_queue {
struct ice_irq_vector *irqv;
u32 me;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
struct if_irq que_irq;
};
@@ -332,6 +344,10 @@ struct ice_softc {
ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT);
ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT);
+#ifdef PCI_IOV
+ struct ice_vf *vfs;
+ u16 num_vfs;
+#endif
struct ice_resmgr os_imgr;
/* For mirror interface */
struct ice_mirr_if *mirr_if;
diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c
new file mode 100644
index 000000000000..c5a3e1060e44
--- /dev/null
+++ b/sys/dev/ice/ice_iov.c
@@ -0,0 +1,1856 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.c
+ * @brief Virtualization support functions
+ *
+ * Contains functions for enabling and managing PCIe virtual function devices,
+ * including enabling new VFs, and managing VFs over the virtchnl interface.
+ */
+
+#include "ice_iov.h"
+
+static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num);
+static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf,
+ bool trigger_vflr);
+static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf);
+
+static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static bool ice_vc_isvalid_ring_len(u16 ring_len);
+static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats);
+static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err);
+static int ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr);
+
+/**
+ * ice_iov_attach - Initialize SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Initialize SR-IOV PF host support at the end of the driver attach process.
+ *
+ * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK)
+ *
+ * @returns 0 if successful, or
+ * - ENOMEM if there is no memory for the PF/VF schemas or iov device
+ * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV
+ * version as the kernel
+ * - ENOENT if the device doesn't have the SR-IOV capability
+ */
+int
+ice_iov_attach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ nvlist_t *pf_schema, *vf_schema;
+ int error;
+
+ pf_schema = pci_iov_schema_alloc_node();
+ vf_schema = pci_iov_schema_alloc_node();
+
+ pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
+ pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof",
+ IOV_SCHEMA_HASDEFAULT, TRUE);
+ pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_bool(vf_schema, "allow-promisc",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_uint16(vf_schema, "num-queues",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES);
+ pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi",
+ IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI);
+ pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT);
+ pci_iov_schema_add_uint16(vf_schema, "max-mac-filters",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT);
+
+ error = pci_iov_attach(dev, pf_schema, vf_schema);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_attach failed (error=%s)\n",
+ ice_err_str(error));
+ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+ } else
+ ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+
+ return (error);
+}
+
+/**
+ * ice_iov_detach - Teardown SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Teardown SR-IOV PF host support at the start of the driver detach process.
+ *
+ * @returns 0 if successful or IOV support hasn't been setup, or
+ * - EBUSY if VFs still exist
+ */
+int
+ice_iov_detach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ int error;
+
+ error = pci_iov_detach(dev);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_detach failed (error=%s)\n",
+ ice_err_str(error));
+ }
+
+ return (error);
+}
+
+/**
+ * ice_iov_init - Called by the OS before the first VF is created.
+ * @sc: device softc structure
+ * @num_vfs: number of VFs to setup resources for
+ * @params: configuration parameters for the PF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params __unused)
+{
+ /* Allocate array of VFs, for tracking */
+ sc->vfs = (struct ice_vf *)malloc(sizeof(struct ice_vf) * num_vfs, M_ICE, M_NOWAIT |
+ M_ZERO);
+ if (sc->vfs == NULL)
+ return (ENOMEM);
+
+ /* Initialize each VF with basic information */
+ for (int i = 0; i < num_vfs; i++)
+ sc->vfs[i].vf_num = i;
+
+ /* Save off number of configured VFs */
+ sc->num_vfs = num_vfs;
+
+ return (0);
+}
+
+/**
+ * ice_iov_get_vf - Get pointer to VF at given index
+ * @sc: device softc structure
+ * @vf_num: Index of VF to retrieve
+ *
+ * @remark will throw an assertion if vf_num is not in the
+ * range of allocated VFs
+ *
+ * @returns a pointer to the VF structure at the given index
+ */
+static struct ice_vf *
+ice_iov_get_vf(struct ice_softc *sc, int vf_num)
+{
+ MPASS(vf_num < sc->num_vfs);
+
+ return &sc->vfs[vf_num];
+}
+
+/**
+ * ice_iov_add_vf - Called by the OS for each VF to create
+ * @sc: device softc structure
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ device_t dev = sc->dev;
+ struct ice_vsi *vsi;
+ struct ice_vf *vf;
+ int vf_num_queues;
+ const void *mac;
+ size_t size;
+ int error;
+ int i;
+
+ vf = ice_iov_get_vf(sc, vfnum);
+ vf->vf_flags = VF_FLAG_ENABLED;
+
+ /* This VF needs at least one VSI */
+ vsi = ice_alloc_vsi(sc, ICE_VSI_VF);
+ if (vsi == NULL)
+ return (ENOMEM);
+ vf->vsi = vsi;
+ vsi->vf_num = vfnum;
+
+ vf_num_queues = nvlist_get_number(params, "num-queues");
+ /* Validate and clamp value if invalid */
+ if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES)
+ device_printf(dev, "Invalid num-queues (%d) for VF %d\n",
+ vf_num_queues, vf->vf_num);
+ if (vf_num_queues < 1) {
+ device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num);
+ vf_num_queues = 1;
+ } else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) {
+ device_printf(dev, "Setting VF %d num-queues to %d\n",
+ vf->vf_num, ICE_MAX_SCATTERED_QUEUES);
+ vf_num_queues = ICE_MAX_SCATTERED_QUEUES;
+ }
+ vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED;
+
+ /* Reserve VF queue allocation from PF queues */
+ ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues);
+ vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues;
+
+ /* Assign Tx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap,
+ vsi->num_tx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ /* Assign Rx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap,
+ vsi->num_rx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ vsi->max_frame_size = ICE_MAX_FRAME_SIZE;
+
+ /* Allocate queue structure memory */
+ vsi->tx_queues = (struct ice_tx_queue *)
+ malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->tx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto release_vsi;
+ }
+ for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) {
+ txq->me = i;
+ txq->vsi = vsi;
+ }
+
+ /* Allocate queue structure memory */
+ vsi->rx_queues = (struct ice_rx_queue *)
+ malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->rx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto free_txqs;
+ }
+ for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) {
+ rxq->me = i;
+ rxq->vsi = vsi;
+ }
+
+ /* Allocate space to store the IRQ vector data */
+ vf->num_irq_vectors = vf_num_queues + 1;
+ vf->tx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->tx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate TX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_rxqs;
+ }
+ vf->rx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->rx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate RX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_txirqvs;
+ }
+
+ /* Assign VF interrupts from PF space */
+ if (!(vf->vf_imap =
+ (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors,
+ M_ICE, M_NOWAIT))) {
+ device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum);
+ error = ENOMEM;
+ goto free_rxirqvs;
+ }
+ error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors);
+ if (error) {
+ device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n",
+ vfnum, ice_err_str(error));
+ goto free_imap;
+ }
+
+ if (nvlist_exists_binary(params, "mac-addr")) {
+ mac = nvlist_get_binary(params, "mac-addr", &size);
+ memcpy(vf->mac, mac, ETHER_ADDR_LEN);
+
+ if (nvlist_get_bool(params, "allow-set-mac"))
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+ } else
+ /*
+ * If the administrator has not specified a MAC address then
+ * we must allow the VF to choose one.
+ */
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+
+ if (nvlist_get_bool(params, "mac-anti-spoof"))
+ vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF;
+
+ if (nvlist_get_bool(params, "allow-promisc"))
+ vf->vf_flags |= VF_FLAG_PROMISC_CAP;
+
+ vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi");
+
+ vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed");
+ vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters");
+
+ vf->vf_flags |= VF_FLAG_VLAN_CAP;
+
+ /* Create and setup VSI in HW */
+ error = ice_initialize_vsi(vsi);
+ if (error) {
+ device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ /* Add the broadcast address */
+ error = ice_add_vsi_mac_filter(vsi, broadcastaddr);
+ if (error) {
+ device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ ice_iov_ready_vf(sc, vf);
+
+ return (0);
+
+release_imap:
+ ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap,
+ vf->num_irq_vectors);
+free_imap:
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+free_rxirqvs:
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+free_txirqvs:
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+free_rxqs:
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+free_txqs:
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+release_vsi:
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ return (error);
+}
+
+/**
+ * ice_iov_uninit - Called by the OS when VFs are destroyed
+ * @sc: device softc structure
+ */
+void
+ice_iov_uninit(struct ice_softc *sc)
+{
+ struct ice_vf *vf;
+ struct ice_vsi *vsi;
+
+ /* Release per-VF resources */
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+ vsi = vf->vsi;
+
+ /* Free VF interrupt reservation */
+ if (vf->vf_imap) {
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+ }
+
+ /* Free queue interrupt mapping trackers */
+ if (vf->tx_irqvs) {
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+ }
+ if (vf->rx_irqvs) {
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+ }
+
+ if (!vsi)
+ continue;
+
+ /* Free VSI queues */
+ if (vsi->tx_queues) {
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+ }
+ if (vsi->rx_queues) {
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+ }
+
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ }
+
+ /* Release memory used for VF tracking */
+ if (sc->vfs) {
+ free(sc->vfs, M_ICE);
+ sc->vfs = NULL;
+ }
+ sc->num_vfs = 0;
+}
+
+/**
+ * ice_iov_handle_vflr - Process VFLR event
+ * @sc: device softc structure
+ *
+ * Identifys which VFs have been reset and re-configure
+ * them.
+ */
+void
+ice_iov_handle_vflr(struct ice_softc *sc)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vf *vf;
+ u32 reg, reg_idx, bit_idx;
+
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+
+ reg_idx = (hw->func_caps.vf_base_id + vf->vf_num) / 32;
+ bit_idx = (hw->func_caps.vf_base_id + vf->vf_num) % 32;
+ reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
+ if (reg & BIT(bit_idx))
+ ice_reset_vf(sc, vf, false);
+ }
+}
+
+/**
+ * ice_iov_ready_vf - Setup VF interrupts and mark it as ready
+ * @sc: device softc structure
+ * @vf: driver's VF structure for the VF to update
+ *
+ * Clears VF reset triggering bit, sets up the PF<->VF interrupt
+ * mapping and marks the VF as active in the HW so that the VF
+ * driver can use it.
+ */
+static void
+ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ u32 reg;
+
+ /* Clear the triggering bit */
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg &= ~VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+
+ /* Setup VF interrupt allocation and mapping */
+ ice_iov_setup_intr_mapping(sc, vf);
+
+ /* Indicate to the VF that reset is done */
+ wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE);
+
+ ice_flush(hw);
+}
+
+/**
+ * ice_reset_vf - Perform a hardware reset (VFR) on a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be reset
+ * @trigger_vflr: trigger a reset or only handle already executed reset
+ *
+ * Performs a VFR for the given VF. This function busy waits until the
+ * reset completes in the HW, notifies the VF that the reset is done
+ * by setting a bit in a HW register, then returns.
+ *
+ * @remark This also sets up the PF<->VF interrupt mapping and allocations in
+ * the hardware after the hardware reset is finished, via
+ * ice_iov_setup_intr_mapping()
+ */
+static void
+ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr)
+{
+ u16 global_vf_num, reg_idx, bit_idx;
+ struct ice_hw *hw = &sc->hw;
+ int status;
+ u32 reg;
+ int i;
+
+ global_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ if (trigger_vflr) {
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg |= VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+ }
+
+ /* clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */
+ reg_idx = (global_vf_num) / 32;
+ bit_idx = (global_vf_num) % 32;
+ wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx));
+ ice_flush(hw);
+
+ /* Wait until there are no pending PCI transactions */
+ wr32(hw, PF_PCI_CIAA,
+ ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S));
+
+ for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) {
+ reg = rd32(hw, PF_PCI_CIAD);
+ if (!(reg & PCIEM_STA_TRANSACTION_PND))
+ break;
+
+ DELAY(ICE_PCI_CIAD_WAIT_DELAY_US);
+ }
+ if (i == ICE_PCI_CIAD_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d PCI transactions stuck\n", vf->vf_num);
+
+ /* Disable TX queues, which is required during VF reset */
+ status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL,
+ NULL, ICE_VF_RESET, vf->vf_num, NULL);
+ if (status)
+ device_printf(sc->dev,
+ "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n",
+ __func__, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+
+ /* Then check for the VF reset to finish in HW */
+ for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) {
+ reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num));
+ if ((reg & VPGEN_VFRSTAT_VFRD_M))
+ break;
+
+ DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US);
+ }
+ if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d Reset is stuck\n", vf->vf_num);
+
+ ice_iov_ready_vf(sc, vf);
+}
+
+/**
+ * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a message from the VF listing its supported capabilities, and
+ * replies to the VF with information about what resources the PF has
+ * allocated for the VF.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail. It's up to the VF driver to reject or complain about the PF's response.
+ */
+static void
+ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vf_resource *vf_res;
+ struct virtchnl_vsi_resource *vsi_res;
+ u16 vf_res_len;
+ u32 vf_caps;
+
+ /* XXX: Only support one VSI per VF, so this size doesn't need adjusting */
+ vf_res_len = sizeof(struct virtchnl_vf_resource);
+ vf_res = (struct virtchnl_vf_resource *)malloc(vf_res_len, M_ICE,
+ M_WAITOK | M_ZERO);
+
+ vf_res->num_vsis = 1;
+ vf_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vf_res->max_vectors = vf_res->num_queue_pairs + 1;
+
+ vf_res->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE;
+ vf_res->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE;
+ vf_res->max_mtu = 0;
+
+ vf_res->vf_cap_flags = VF_BASE_MODE_OFFLOADS;
+ if (msg_buf != NULL) {
+ vf_caps = *((u32 *)(msg_buf));
+
+ if (vf_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
+
+ if (vf_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
+ }
+
+ vsi_res = &vf_res->vsi_res[0];
+ vsi_res->vsi_id = vf->vsi->idx;
+ vsi_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vsi_res->vsi_type = VIRTCHNL_VSI_SRIOV;
+ vsi_res->qset_handle = 0;
+ if (!ETHER_IS_ZERO(vf->mac))
+ memcpy(vsi_res->default_mac_addr, vf->mac, ETHER_ADDR_LEN);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)vf_res, vf_res_len, NULL);
+
+ free(vf_res, M_ICE);
+}
+
+/**
+ * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a version message from the VF, and responds to the VF with
+ * the version number that the PF will use.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail.
+ */
+static void
+ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_version_info *recv_vf_version;
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+
+ recv_vf_version = (struct virtchnl_version_info *)msg_buf;
+
+ /* VFs running the 1.0 API expect to get 1.0 back */
+ if (VF_IS_V10(recv_vf_version)) {
+ vf->version.major = 1;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS;
+ } else {
+ vf->version.major = VIRTCHNL_VERSION_MAJOR;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR;
+
+ if ((recv_vf_version->major != VIRTCHNL_VERSION_MAJOR) ||
+ (recv_vf_version->minor != VIRTCHNL_VERSION_MINOR))
+ device_printf(dev,
+ "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n",
+ __func__, vf->vf_num,
+ recv_vf_version->major, recv_vf_version->minor,
+ VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR);
+ }
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_VERSION,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version),
+ NULL);
+}
+
+/**
+ * ice_vf_validate_mac - Validate MAC address before adding it
+ * @vf: VF tracking structure
+ * @addr: MAC address to validate
+ *
+ * Validate a MAC address before adding it to a VF during the handling
+ * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if
+ * the VF is allowed to set its own arbitrary MAC addresses.
+ *
+ * Returns 0 if MAC address is valid for the given vf
+ */
+static int
+ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr)
+{
+
+ if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr))
+ return (EINVAL);
+
+ /*
+ * If the VF is not allowed to change its MAC address, don't let it
+ * set a MAC filter for an address that is not a multicast address and
+ * is not its assigned MAC.
+ */
+ if (!(vf->vf_flags & VF_FLAG_SET_MAC_CAP) &&
+ !(ETHER_IS_MULTICAST(addr) || !bcmp(addr, vf->mac, ETHER_ADDR_LEN)))
+ return (EPERM);
+
+ return (0);
+}
+
+/**
+ * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and adds those addresses
+ * to the VSI's filter list.
+ */
+static void
+ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 added_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ if (addr_list->num_elements >
+ (vf->mac_filter_limit - vf->mac_filter_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ u8 *addr = addr_list->list[i].addr;
+
+ /* The type flag is currently ignored; every MAC address is
+ * treated as the LEGACY type
+ */
+
+ error = ice_vf_validate_mac(vf, addr);
+ if (error == EPERM) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ } else if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+
+ error = ice_add_vsi_mac_filter(vf->vsi, addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error adding MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr, vf->mac, ETHER_ADDR_LEN))
+ added_addr_cnt++;
+ }
+
+ vf->mac_filter_cnt += added_addr_cnt;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and removes those addresses
+ * from the VSI's filter list.
+ */
+static void
+ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 deleted_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ error = ice_remove_vsi_mac_filter(vf->vsi, addr_list->list[i].addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error removing MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr_list->list[i].addr, vf->mac, ETHER_ADDR_LEN))
+ deleted_addr_cnt++;
+ }
+
+ if (deleted_addr_cnt >= vf->mac_filter_cnt)
+ vf->mac_filter_cnt = 0;
+ else
+ vf->mac_filter_cnt -= deleted_addr_cnt;
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VLANs in msg_buf to the VF's VLAN filter list.
+ */
+static void
+ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vlan_list->num_elements > (vf->vlan_limit - vf->vlan_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ status = ice_add_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ vf->vlan_cnt += vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Removes the VLANs in msg_buf from the VF's VLAN filter list.
+ */
+static void
+ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ status = ice_remove_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ if (vlan_list->num_elements >= vf->vlan_cnt)
+ vf->vlan_cnt = 0;
+ else
+ vf->vlan_cnt -= vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_validate_ring_len - Check to see if a descriptor ring length is valid
+ * @ring_len: length of ring
+ *
+ * Check whether a ring size value is valid.
+ *
+ * @returns true if given ring size is valid
+ */
+static bool
+ice_vc_isvalid_ring_len(u16 ring_len)
+{
+ return (ring_len >= ICE_MIN_DESC_COUNT &&
+ ring_len <= ICE_MAX_DESC_COUNT &&
+ !(ring_len % ICE_DESC_COUNT_INCR));
+}
+
+/**
+ * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ */
+static void
+ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ device_t dev = sc->dev;
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vsi_queue_config_info *vqci;
+ struct virtchnl_queue_pair_info *vqpi;
+ enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int i, error = 0;
+
+ vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf;
+
+ if (vqci->num_queue_pairs > vf->vsi->num_tx_queues &&
+ vqci->num_queue_pairs > vf->vsi->num_rx_queues) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ ice_vsi_disable_tx(vf->vsi);
+ ice_control_all_rx_queues(vf->vsi, false);
+
+ /*
+ * Clear TX and RX queues config in case VF
+ * requests different number of queues.
+ */
+ for (i = 0; i < vsi->num_tx_queues; i++) {
+ txq = &vsi->tx_queues[i];
+
+ txq->desc_count = 0;
+ txq->tx_paddr = 0;
+ txq->tc = 0;
+ }
+
+ for (i = 0; i < vsi->num_rx_queues; i++) {
+ rxq = &vsi->rx_queues[i];
+
+ rxq->desc_count = 0;
+ rxq->rx_paddr = 0;
+ }
+
+ vqpi = vqci->qpair;
+ for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) {
+ /* Initial parameter validation */
+ if (vqpi->txq.vsi_id != vf->vsi->idx ||
+ vqpi->rxq.vsi_id != vf->vsi->idx ||
+ vqpi->txq.queue_id != vqpi->rxq.queue_id ||
+ vqpi->txq.headwb_enabled ||
+ vqpi->rxq.splithdr_enabled ||
+ vqpi->rxq.crc_disable ||
+ !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) ||
+ !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Copy parameters into VF's queue/VSI structs */
+ txq = &vsi->tx_queues[vqpi->txq.queue_id];
+
+ txq->desc_count = vqpi->txq.ring_len;
+ txq->tx_paddr = vqpi->txq.dma_ring_addr;
+ txq->q_handle = vqpi->txq.queue_id;
+ txq->tc = 0;
+
+ rxq = &vsi->rx_queues[vqpi->rxq.queue_id];
+
+ rxq->desc_count = vqpi->rxq.ring_len;
+ rxq->rx_paddr = vqpi->rxq.dma_ring_addr;
+ vsi->mbuf_sz = vqpi->rxq.databuffer_size;
+ }
+
+ /* Configure TX queues in HW */
+ error = ice_cfg_vsi_for_tx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Tx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ goto done;
+ }
+
+ /* Configure RX queues in HW */
+ error = ice_cfg_vsi_for_rx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Rx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ ice_vsi_disable_tx(vsi);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES,
+ status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Sets the RSS key for the given VF, using the contents of msg_buf.
+ */
+static void
+ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_aqc_get_set_rss_keys keydata =
+ { .standard_rss_key = {0}, .extended_hash_key = {0} };
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_key *vrk;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrk = (struct virtchnl_rss_key *)msg_buf;
+
+ if (vrk->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrk->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if ((vrk->key_len >
+ (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE +
+ ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) ||
+ vrk->key_len == 0) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ memcpy(&keydata, vrk->key, vrk->key_len);
+
+ status = ice_aq_set_rss_key(hw, vsi->idx, &keydata);
+ if (status) {
+ device_printf(sc->dev,
+ "ice_aq_set_rss_key status %s, error %s\n",
+ ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the LUT from the VF in msg_buf to the PF via an admin queue call.
+ */
+static void
+ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_lut *vrl;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_aq_get_set_rss_lut_params lut_params = {};
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrl = (struct virtchnl_rss_lut *)msg_buf;
+
+ if (vrl->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrl->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vrl->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ lut_params.vsi_handle = vsi->idx;
+ lut_params.lut_size = vsi->rss_table_size;
+ lut_params.lut_type = vsi->rss_lut_type;
+ lut_params.lut = vrl->lut;
+ lut_params.global_lut_id = 0;
+
+ status = ice_aq_set_rss_lut(hw, &lut_params);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Cannot set RSS lut, err %s aq_err %s\n",
+ vf->vf_num, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow
+ * type list.
+ */
+static void
+ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_hena *vrh;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ MPASS(vsi != NULL);
+
+ vrh = (struct virtchnl_rss_hena *)msg_buf;
+
+ /*
+ * Remove existing configuration to make sure only requested
+ * config is applied and allow VFs to disable RSS completly.
+ */
+ status = ice_rem_vsi_rss_cfg(hw, vsi->idx);
+ if (vrh->hena) {
+ /*
+ * Problem with removing config is not fatal, when new one
+ * is requested. Warn about it but try to apply new config
+ * anyway.
+ */
+ if (status)
+ device_printf(sc->dev,
+ "ice_rem_vsi_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ status = ice_add_avf_rss_cfg(hw, vsi->idx, vrh->hena);
+ if (status)
+ device_printf(sc->dev,
+ "ice_add_avf_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ }
+ v_status = ice_iov_err_to_virt_err(status);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Enables VF queues selected in msg_buf for Tx/Rx traffic.
+ *
+ * @remark Only actually operates on Rx queues; Tx queues are enabled in
+ * CONFIG_VSI_QUEUES message handler.
+ */
+static void
+ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_queue_select *vqs;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int bit, error = 0;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ __func__, vf->vf_num, vsi->idx, vqs->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (!vqs->rx_queues && !vqs->tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message queue masks are empty\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Validate rx_queue mask */
+ bit = fls(vqs->rx_queues);
+ if (bit > vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n",
+ __func__, vf->vf_num, vqs->rx_queues, bit);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Tx ring enable is handled in an earlier message. */
+ for_each_set_bit(bit, &vqs->rx_queues, 32) {
+ error = ice_control_rx_queue(vsi, bit, true);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to enable Rx ring %d for receive: %s\n",
+ bit, ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Disables all VF queues for the VF's VSI.
+ *
+ * @remark Unlike the ENABLE_QUEUES handler, this operates on both
+ * Tx and Rx queues
+ */
+static void
+ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf __unused)
+{
+ struct ice_hw *hw = &sc->hw;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int error = 0;
+
+ error = ice_control_all_rx_queues(vsi, false);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to disable Rx rings for transmit: %s\n",
+ ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ error = ice_vsi_disable_tx(vsi);
+ if (error) {
+ /* Already prints an error message */
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CFG_IRQ_MAP msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the interrupt vectors described in the message in msg_buf. The
+ * VF needs to send this message during init, so that queues can be allowed
+ * to generate interrupts.
+ */
+static void
+ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+#define ICE_VIRTCHNL_QUEUE_MAP_SIZE 16
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_irq_map_info *vimi;
+ struct virtchnl_vector_map *vvm;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 vector;
+
+ vimi = (struct virtchnl_irq_map_info *)msg_buf;
+
+ if (vimi->num_vectors > vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n",
+ __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vvm = vimi->vecmap;
+ /* Save off information from message */
+ for (int i = 0; i < vimi->num_vectors; i++, vvm++) {
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int bit;
+
+ if (vvm->vsi_id != vf->vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n",
+ __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* vvm->vector_id is relative to VF space */
+ vector = vvm->vector_id;
+
+ if (vector >= vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n",
+ __func__, vf->vf_num, vector, vf->num_irq_vectors - 1);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* The Misc/Admin Queue vector doesn't need mapping */
+ if (vector == 0)
+ continue;
+
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: txq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vf->tx_irqvs[vector].me = vector;
+
+ txq = &vsi->tx_queues[bit];
+ txq->irqv = &vf->tx_irqvs[vector];
+ txq->itr_idx = vvm->txitr_idx;
+ }
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: rxq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ vf->rx_irqvs[vector].me = vector;
+
+ rxq = &vsi->rx_queues[bit];
+ rxq->irqv = &vf->rx_irqvs[vector];
+ rxq->itr_idx = vvm->rxitr_idx;
+ }
+ }
+
+ /* Write to T/RQCTL registers to actually map vectors to queues */
+ for (int i = 0; i < vf->vsi->num_rx_queues; i++)
+ if (vsi->rx_queues[i].irqv != NULL)
+ ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i],
+ vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx);
+
+ for (int i = 0; i < vf->vsi->num_tx_queues; i++)
+ if (vsi->tx_queues[i].irqv != NULL)
+ ice_configure_txq_interrupt(hw, vsi->tx_qmap[i],
+ vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx);
+
+ ice_flush(hw);
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl
+ * @istats: VSI stats from HW to convert
+ * @vstats: stats struct to copy to
+ *
+ * This function copies all known stats in struct virtchnl_eth_stats from the
+ * input struct ice_eth_stats to an output struct virtchnl_eth_stats.
+ *
+ * @remark These two structure types currently have the same definition up to
+ * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change
+ * in the future.
+ */
+static void
+ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats)
+{
+ vstats->rx_bytes = istats->rx_bytes;
+ vstats->rx_unicast = istats->rx_unicast;
+ vstats->rx_multicast = istats->rx_multicast;
+ vstats->rx_broadcast = istats->rx_broadcast;
+ vstats->rx_discards = istats->rx_discards;
+ vstats->rx_unknown_protocol = istats->rx_unknown_protocol;
+ vstats->tx_bytes = istats->tx_bytes;
+ vstats->tx_unicast = istats->tx_unicast;
+ vstats->tx_multicast = istats->tx_multicast;
+ vstats->tx_broadcast = istats->tx_broadcast;
+ vstats->tx_discards = istats->tx_discards;
+ vstats->tx_errors = istats->tx_errors;
+}
+
+/**
+ * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Updates the VF's VSI stats and sends those stats back to the VF.
+ */
+static void
+ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_queue_select *vqs;
+ struct virtchnl_eth_stats stats;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_hw *hw = &sc->hw;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n",
+ __func__, vf->vf_num, vqs->vsi_id, vsi->idx);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ }
+
+ ice_update_vsi_hw_stats(vf->vsi);
+ ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats,
+ sizeof(struct virtchnl_eth_stats), NULL);
+}
+
+/**
+ * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the promiscuous modes for the given VSI in msg_buf.
+ */
+static void
+ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_promisc_info *vpi;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ int status = 0;
+ struct ice_vsi *vsi = vf->vsi;
+ ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ u16 vid;
+
+ vpi = (struct virtchnl_promisc_info *)msg_buf;
+
+ /* Check to see if VF has permission to configure promiscuous mode */
+ if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) {
+ device_printf(sc->dev,
+ "VF-%d: attempted to configure promiscuous mode\n",
+ vf->vf_num);
+ /* Don't reply to VF with an error */
+ goto done;
+ }
+
+ if (vpi->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vpi->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n",
+ vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS,
+ vpi->flags);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+
+ }
+
+ ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ /* Convert virtchnl flags to ice AQ promiscuous mode flags */
+ if (vpi->flags & FLAG_VF_UNICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask);
+ }
+ if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask);
+ }
+
+ status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ /* Figure out what got added and what got removed */
+ ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX);
+
+ if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_clear_vsi_promisc(hw, vsi->idx,
+ clear_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+ if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state
+ * @sc: device private structure
+ *
+ * Sends a message to all VFs about the status of the PF's link
+ * state. For more details, @see ice_vc_notify_vf_link_state.
+ */
+void
+ice_vc_notify_all_vfs_link_state(struct ice_softc *sc)
+{
+ for (int i = 0; i < sc->num_vfs; i++)
+ ice_vc_notify_vf_link_state(sc, &sc->vfs[i]);
+}
+
+/**
+ * ice_vc_notify_vf_link_state - Notify VF of PF link state
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ *
+ * Sends an event message to the specified VF with information about
+ * the current link state from the PF's port. This includes whether
+ * link is up or down, and the link speed in 100Mbps units.
+ */
+static void
+ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct virtchnl_pf_event event = {};
+ struct ice_hw *hw = &sc->hw;
+
+ event.event = VIRTCHNL_EVENT_LINK_CHANGE;
+ event.severity = PF_EVENT_SEVERITY_INFO;
+ event.event_data.link_event_adv.link_status = sc->link_up;
+ event.event_data.link_event_adv.link_speed =
+ (u32)ice_conv_link_speed_to_virtchnl(true,
+ hw->port_info->phy.link_info.link_speed);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&event, sizeof(event), NULL);
+}
+
+/**
+ * ice_vc_handle_vf_msg - Handle a message from a VF
+ * @sc: device private structure
+ * @event: event received from the HW MBX queue
+ *
+ * Called whenever an event is received from a VF on the HW mailbox queue.
+ * Responsible for handling these messages as well as responding to the
+ * VF afterwards, depending on the received message type.
+ */
+void
+ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event)
+{
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+ struct ice_vf *vf;
+ int err = 0;
+
+ u32 v_opcode = event->desc.cookie_high;
+ u16 v_id = event->desc.retval;
+ u8 *msg = event->msg_buf;
+ u16 msglen = event->msg_len;
+
+ if (v_id >= sc->num_vfs) {
+ device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n",
+ __func__, v_id, v_opcode, msglen);
+ return;
+ }
+
+ vf = &sc->vfs[v_id];
+
+ /* Perform basic checks on the msg */
+ err = virtchnl_vc_validate_vf_msg(&vf->version, v_opcode, msg, msglen);
+ if (err) {
+ device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n",
+ __func__, vf->vf_num, v_opcode, msglen, err);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ return;
+ }
+
+ switch (v_opcode) {
+ case VIRTCHNL_OP_VERSION:
+ ice_vc_version_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_RESET_VF:
+ ice_reset_vf(sc, vf, true);
+ break;
+ case VIRTCHNL_OP_GET_VF_RESOURCES:
+ ice_vc_get_vf_res_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_ETH_ADDR:
+ ice_vc_add_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_ETH_ADDR:
+ ice_vc_del_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_VLAN:
+ ice_vc_add_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_VLAN:
+ ice_vc_del_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_VSI_QUEUES:
+ ice_vc_cfg_vsi_qs_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_KEY:
+ ice_vc_cfg_rss_key_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_LUT:
+ ice_vc_cfg_rss_lut_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_SET_RSS_HENA:
+ ice_vc_set_rss_hena_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ENABLE_QUEUES:
+ ice_vc_enable_queues_msg(sc, vf, msg);
+ ice_vc_notify_vf_link_state(sc, vf);
+ break;
+ case VIRTCHNL_OP_DISABLE_QUEUES:
+ ice_vc_disable_queues_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_IRQ_MAP:
+ ice_vc_cfg_irq_map_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_GET_STATS:
+ ice_vc_get_stats_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE:
+ ice_vc_cfg_promisc_mode_msg(sc, vf, msg);
+ break;
+ default:
+ device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n",
+ __func__, vf->vf_num, v_opcode, msglen);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode,
+ VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL);
+ break;
+ }
+}
+
+/**
+ * ice_iov_setup_intr_mapping - Setup interrupt config for a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be configured
+ *
+ * Before a VF can be used, and after a VF reset, the PF must configure
+ * the VF's interrupt allocation registers. This includes allocating
+ * interrupts from the PF's interrupt pool to the VF using the
+ * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors
+ * to VF vectors in GLINT_VECT2FUNC.
+ *
+ * As well, this sets up queue allocation registers and maps the mailbox
+ * interrupt for the VF.
+ */
+static void
+ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 v;
+
+ /* Calculate indices for register ops below */
+ u16 vf_first_irq_idx = vf->vf_imap[0];
+ u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id +
+ vf_first_irq_idx;
+ u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ /* Map out VF interrupt allocation in global device space. Both
+ * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values.
+ */
+ wr32(hw, VPINT_ALLOC(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) |
+ VPINT_ALLOC_VALID_M));
+ wr32(hw, VPINT_ALLOC_PCI(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) |
+ VPINT_ALLOC_PCI_VALID_M));
+
+ /* Create inverse mapping of vectors to PF/VF combinations */
+ for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++)
+ {
+ wr32(hw, GLINT_VECT2FUNC(v),
+ (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) |
+ ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)));
+ }
+
+ /* Map mailbox interrupt to MSI-X index 0. Disable ITR for it, too. */
+ wr32(hw, VPINT_MBX_CTL(abs_vf_num),
+ ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) |
+ ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) |
+ VPINT_MBX_CTL_CAUSE_ENA_M);
+
+ /* Mark the TX queue mapping registers as valid */
+ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M);
+
+ /* Indicate to HW that VF has scattered queue allocation */
+ wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_tx_queues; i++) {
+ wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num),
+ (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M);
+ }
+
+ /* Mark the RX queue mapping registers as valid */
+ wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M);
+ wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_rx_queues; i++) {
+ wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num),
+ (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M);
+ }
+}
+
+/**
+ * ice_err_to_virt err - translate ice errors into virtchnl errors
+ * @ice_err: status returned from ice function
+ */
+static enum virtchnl_status_code
+ice_iov_err_to_virt_err(int ice_err)
+{
+ switch (ice_err) {
+ case 0:
+ return VIRTCHNL_STATUS_SUCCESS;
+ case ICE_ERR_BAD_PTR:
+ case ICE_ERR_INVAL_SIZE:
+ case ICE_ERR_DEVICE_NOT_SUPPORTED:
+ case ICE_ERR_PARAM:
+ case ICE_ERR_CFG:
+ return VIRTCHNL_STATUS_ERR_PARAM;
+ case ICE_ERR_NO_MEMORY:
+ return VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ case ICE_ERR_NOT_READY:
+ case ICE_ERR_RESET_FAILED:
+ case ICE_ERR_FW_API_VER:
+ case ICE_ERR_AQ_ERROR:
+ case ICE_ERR_AQ_TIMEOUT:
+ case ICE_ERR_AQ_FULL:
+ case ICE_ERR_AQ_NO_WORK:
+ case ICE_ERR_AQ_EMPTY:
+ return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ default:
+ return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+ }
+}
diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h
new file mode 100644
index 000000000000..c4fb3e932e3f
--- /dev/null
+++ b/sys/dev/ice/ice_iov.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.h
+ * @brief header for IOV functionality
+ *
+ * This header includes definitions used to implement device Virtual Functions
+ * for the ice driver.
+ */
+
+#ifndef _ICE_IOV_H_
+#define _ICE_IOV_H_
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/nv.h>
+#include <sys/iov_schema.h>
+#include <sys/param.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/pci/pci_iov.h>
+
+#include "ice_iflib.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * @enum ice_vf_flags
+ * @brief VF state flags
+ *
+ * Used to indicate the status of a PF's VF, as well as indicating what each VF
+ * is capabile of. Intended to be modified only using atomic operations, so
+ * they can be read and modified in places that aren't locked.
+ *
+ * Used in struct ice_vf's vf_flags field.
+ */
+enum ice_vf_flags {
+ VF_FLAG_ENABLED = BIT(0),
+ VF_FLAG_SET_MAC_CAP = BIT(1),
+ VF_FLAG_VLAN_CAP = BIT(2),
+ VF_FLAG_PROMISC_CAP = BIT(3),
+ VF_FLAG_MAC_ANTI_SPOOF = BIT(4),
+};
+
+/**
+ * @struct ice_vf
+ * @brief PF's VF software context
+ *
+ * Represents the state and options for a VF spawned from a PF.
+ */
+struct ice_vf {
+ struct ice_vsi *vsi;
+ u32 vf_flags;
+
+ u8 mac[ETHER_ADDR_LEN];
+ u16 vf_num;
+ struct virtchnl_version_info version;
+
+ u16 mac_filter_limit;
+ u16 mac_filter_cnt;
+ u16 vlan_limit;
+ u16 vlan_cnt;
+
+ u16 num_irq_vectors;
+ u16 *vf_imap;
+ struct ice_irq_vector *tx_irqvs;
+ struct ice_irq_vector *rx_irqvs;
+};
+
+#define ICE_PCIE_DEV_STATUS 0xAA
+
+#define ICE_PCI_CIAD_WAIT_COUNT 100
+#define ICE_PCI_CIAD_WAIT_DELAY_US 1
+#define ICE_VPGEN_VFRSTAT_WAIT_COUNT 100
+#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US 20
+
+#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS (FLAG_VF_UNICAST_PROMISC | \
+ FLAG_VF_MULTICAST_PROMISC)
+
+#define ICE_DEFAULT_VF_VLAN_LIMIT 64
+#define ICE_DEFAULT_VF_FILTER_LIMIT 16
+
+int ice_iov_attach(struct ice_softc *sc);
+int ice_iov_detach(struct ice_softc *sc);
+
+int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params);
+int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params);
+void ice_iov_uninit(struct ice_softc *sc);
+
+void ice_iov_handle_vflr(struct ice_softc *sc);
+
+void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event);
+void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc);
+
+#endif /* _ICE_IOV_H_ */
+
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index d44ae5f37750..442111e5ffaf 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -42,6 +42,9 @@
#include "ice_lib.h"
#include "ice_iflib.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
@@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi)
case ICE_VSI_VMDQ2:
ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ ctx.flags = ICE_AQ_VSI_TYPE_VF;
+ ctx.vf_num = vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1607,6 +1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf
case ICE_VSI_VMDQ2:
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF;
+ tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi)
struct ice_tlan_ctx tlan_ctx = { 0 };
struct ice_tx_queue *txq = &vsi->tx_queues[i];
+ /* Last configured queue */
+ if (txq->desc_count == 0)
+ break;
+
pf_q = vsi->tx_qmap[txq->me];
qg->txqs[0].txq_id = htole16(pf_q);
@@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi)
for (i = 0; i < vsi->num_rx_queues; i++) {
MPASS(vsi->mbuf_sz > 0);
+ /* Last configured queue */
+ if (vsi->rx_queues[i].desc_count == 0)
+ break;
+
err = ice_setup_rx_ctx(&vsi->rx_queues[i]);
if (err)
return err;
@@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname,
case ice_aqc_opc_get_link_status:
ice_process_link_event(sc, event);
break;
+#ifdef PCI_IOV
+ case ice_mbx_opc_send_msg_to_pf:
+ ice_vc_handle_vf_msg(sc, event);
+ break;
+#endif
case ice_aqc_opc_fw_logs_event:
ice_handle_fw_log_event(sc, &event->desc, event->msg_buf);
break;
diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h
index b6b23ec82161..308b2bda2790 100644
--- a/sys/dev/ice/ice_lib.h
+++ b/sys/dev/ice/ice_lib.h
@@ -611,6 +611,10 @@ struct ice_vsi {
u16 mirror_src_vsi;
u16 rule_mir_ingress;
u16 rule_mir_egress;
+
+#ifdef PCI_IOV
+ u8 vf_num; /* Index of owning VF, if applicable */
+#endif
};
/**
diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c
new file mode 100644
index 000000000000..387a1c6739a6
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ice_common.h"
+#include "ice_hw_autogen.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * ice_aq_send_msg_to_vf
+ * @hw: pointer to the hardware structure
+ * @vfid: VF ID to send msg
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send message to VF driver (0x0802) using mailbox
+ * queue and asynchronously sending message via
+ * ice_sq_send_cmd() function
+ */
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd)
+{
+ struct ice_aqc_pf_vf_msg *cmd;
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf);
+
+ cmd = &desc.params.virt;
+ cmd->id = CPU_TO_LE32(vfid);
+
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+/**
+ * ice_aq_send_msg_to_pf
+ * @hw: pointer to the hardware structure
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send message to PF driver using mailbox queue. By default, this
+ * message is sent asynchronously, i.e. ice_sq_send_cmd()
+ * does not wait for completion before returning.
+ */
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd)
+{
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_pf);
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+static const u32 ice_legacy_aq_to_vc_speed[] = {
+ VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */
+ VIRTCHNL_LINK_SPEED_100MB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_10GB,
+ VIRTCHNL_LINK_SPEED_20GB,
+ VIRTCHNL_LINK_SPEED_25GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+};
+
+/**
+ * ice_conv_link_speed_to_virtchnl
+ * @adv_link_support: determines the format of the returned link speed
+ * @link_speed: variable containing the link_speed to be converted
+ *
+ * Convert link speed supported by HW to link speed supported by virtchnl.
+ * If adv_link_support is true, then return link speed in Mbps. Else return
+ * link speed as a VIRTCHNL_LINK_SPEED_* casted to a u32. Note that the caller
+ * needs to cast back to an enum virtchnl_link_speed in the case where
+ * adv_link_support is false, but when adv_link_support is true the caller can
+ * expect the speed in Mbps.
+ */
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed)
+{
+ /* convert a BIT() value into an array index */
+ u16 index = (u16)(ice_fls(link_speed) - 1);
+
+ if (adv_link_support)
+ return ice_get_link_speed(index);
+ else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed))
+ /* Virtchnl speeds are not defined for every speed supported in
+ * the hardware. To maintain compatibility with older AVF
+ * drivers, while reporting the speed the new speed values are
+ * resolved to the closest known virtchnl speeds
+ */
+ return ice_legacy_aq_to_vc_speed[index];
+
+ return VIRTCHNL_LINK_SPEED_UNKNOWN;
+}
+
+/* The mailbox overflow detection algorithm helps to check if there
+ * is a possibility of a malicious VF transmitting too many MBX messages to the
+ * PF.
+ * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during
+ * driver initialization in ice_init_hw() using ice_mbx_init_snapshot().
+ * The struct ice_mbx_snapshot helps to track and traverse a static window of
+ * messages within the mailbox queue while looking for a malicious VF.
+ *
+ * 2. When the caller starts processing its mailbox queue in response to an
+ * interrupt, the structure ice_mbx_snapshot is expected to be cleared before
+ * the algorithm can be run for the first time for that interrupt. This
+ * requires calling ice_mbx_reset_snapshot() as well as calling
+ * ice_mbx_reset_vf_info() for each VF tracking structure.
+ *
+ * 3. For every message read by the caller from the MBX Queue, the caller must
+ * call the detection algorithm's entry function ice_mbx_vf_state_handler().
+ * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is
+ * filled as it is required to be passed to the algorithm.
+ *
+ * 4. Every time a message is read from the MBX queue, a tracking structure
+ * for the VF must be passed to the state handler. The boolean output
+ * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the
+ * caller whether it must report this VF as malicious or not.
+ *
+ * 5. When a VF is identified to be malicious, the caller can send a message
+ * to the system administrator.
+ *
+ * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info
+ * structure for each VF. The PF should clear the VF tracking structure if the
+ * VF is reset. When a VF is shut down and brought back up, we will then
+ * assume that the new VF is not malicious and may report it again if we
+ * detect it again.
+ *
+ * 7. The function ice_mbx_reset_snapshot() is called to reset the information
+ * in ice_mbx_snapshot for every new mailbox interrupt handled.
+ */
+#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M)
+/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that
+ * the max messages check must be ignored in the algorithm
+ */
+#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF
+
+/**
+ * ice_mbx_reset_snapshot - Initialize mailbox snapshot structure
+ * @snap: pointer to the mailbox snapshot
+ */
+static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap)
+{
+ struct ice_mbx_vf_info *vf_info;
+
+ /* Clear mbx_buf in the mailbox snaphot structure and setting the
+ * mailbox snapshot state to a new capture.
+ */
+ ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM);
+ snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+
+ /* Reset message counts for all VFs to zero */
+ LIST_FOR_EACH_ENTRY(vf_info, &snap->mbx_vf, ice_mbx_vf_info, list_entry)
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_traverse - Pass through mailbox snapshot
+ * @hw: pointer to the HW struct
+ * @new_state: new algorithm state
+ *
+ * Traversing the mailbox static snapshot without checking
+ * for malicious VFs.
+ */
+static void
+ice_mbx_traverse(struct ice_hw *hw,
+ enum ice_mbx_snapshot_state *new_state)
+{
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ u32 num_iterations;
+
+ snap_buf = &hw->mbx_snapshot.mbx_buf;
+
+ /* As mailbox buffer is circular, applying a mask
+ * on the incremented iteration count.
+ */
+ num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations);
+
+ /* Checking either of the below conditions to exit snapshot traversal:
+ * Condition-1: If the number of iterations in the mailbox is equal to
+ * the mailbox head which would indicate that we have reached the end
+ * of the static snapshot.
+ * Condition-2: If the maximum messages serviced in the mailbox for a
+ * given interrupt is the highest possible value then there is no need
+ * to check if the number of messages processed is equal to it. If not
+ * check if the number of messages processed is greater than or equal
+ * to the maximum number of mailbox entries serviced in current work item.
+ */
+ if (num_iterations == snap_buf->head ||
+ (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT &&
+ ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx))
+ *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_detect_malvf - Detect malicious VF in snapshot
+ * @hw: pointer to the HW struct
+ * @vf_info: mailbox tracking structure for a VF
+ * @new_state: new algorithm state
+ * @is_malvf: boolean output to indicate if VF is malicious
+ *
+ * This function tracks the number of asynchronous messages
+ * sent per VF and marks the VF as malicious if it exceeds
+ * the permissible number of messages to send.
+ */
+static int
+ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
+ enum ice_mbx_snapshot_state *new_state,
+ bool *is_malvf)
+{
+ /* increment the message count for this VF */
+ vf_info->msg_count++;
+
+ if (vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD)
+ *is_malvf = true;
+
+ /* continue to iterate through the mailbox snapshot */
+ ice_mbx_traverse(hw, new_state);
+
+ return 0;
+}
+
+/**
+ * ice_e830_mbx_vf_dec_trig - Decrements the VF mailbox queue counter
+ * @hw: pointer to the HW struct
+ * @event: pointer to the control queue receive event
+ *
+ * This function triggers to decrement the counter
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes
+ * the buffers at the PF mailbox queue.
+ */
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event)
+{
+ u16 vfid = LE16_TO_CPU(event->desc.retval);
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vfid), 1);
+}
+
+/**
+ * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count
+ * @hw: pointer to the HW struct
+ * @vf_id: VF ID in the PF space
+ *
+ * This function clears the counter MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT, and should
+ * be called when a VF is created and on VF reset.
+ */
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id)
+{
+ u32 reg = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id));
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), reg);
+}
+
+/**
+ * ice_mbx_vf_state_handler - Handle states of the overflow algorithm
+ * @hw: pointer to the HW struct
+ * @mbx_data: pointer to structure containing mailbox data
+ * @vf_info: mailbox tracking structure for the VF in question
+ * @report_malvf: boolean output to indicate whether VF should be reported
+ *
+ * The function serves as an entry point for the malicious VF
+ * detection algorithm by handling the different states and state
+ * transitions of the algorithm:
+ * New snapshot: This state is entered when creating a new static
+ * snapshot. The data from any previous mailbox snapshot is
+ * cleared and a new capture of the mailbox head and tail is
+ * logged. This will be the new static snapshot to detect
+ * asynchronous messages sent by VFs. On capturing the snapshot
+ * and depending on whether the number of pending messages in that
+ * snapshot exceed the watermark value, the state machine enters
+ * traverse or detect states.
+ * Traverse: If pending message count is below watermark then iterate
+ * through the snapshot without any action on VF.
+ * Detect: If pending message count exceeds watermark traverse
+ * the static snapshot and look for a malicious VF.
+ */
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ struct ice_ctl_q_info *cq = &hw->mailboxq;
+ enum ice_mbx_snapshot_state new_state;
+ int status = 0;
+ bool is_malvf = false;
+
+ if (!report_malvf || !mbx_data || !vf_info)
+ return ICE_ERR_BAD_PTR;
+
+ *report_malvf = false;
+
+ /* When entering the mailbox state machine assume that the VF
+ * is not malicious until detected.
+ */
+ /* Checking if max messages allowed to be processed while servicing current
+ * interrupt is not less than the defined AVF message threshold.
+ */
+ if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD)
+ return ICE_ERR_INVAL_SIZE;
+
+ /* The watermark value should not be lesser than the threshold limit
+ * set for the number of asynchronous messages a VF can send to mailbox
+ * nor should it be greater than the maximum number of messages in the
+ * mailbox serviced in current interrupt.
+ */
+ if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD ||
+ mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx)
+ return ICE_ERR_PARAM;
+
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ snap_buf = &snap->mbx_buf;
+
+ switch (snap_buf->state) {
+ case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
+ /* Clear any previously held data in mailbox snapshot structure. */
+ ice_mbx_reset_snapshot(snap);
+
+ /* Collect the pending ARQ count, number of messages processed and
+ * the maximum number of messages allowed to be processed from the
+ * Mailbox for current interrupt.
+ */
+ snap_buf->num_pending_arq = mbx_data->num_pending_arq;
+ snap_buf->num_msg_proc = mbx_data->num_msg_proc;
+ snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx;
+
+ /* Capture a new static snapshot of the mailbox by logging the
+ * head and tail of snapshot and set num_iterations to the tail
+ * value to mark the start of the iteration through the snapshot.
+ */
+ snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean +
+ mbx_data->num_pending_arq);
+ snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1);
+ snap_buf->num_iterations = snap_buf->tail;
+
+ /* Pending ARQ messages returned by ice_clean_rq_elem
+ * is the difference between the head and tail of the
+ * mailbox queue. Comparing this value against the watermark
+ * helps to check if we potentially have malicious VFs.
+ */
+ if (snap_buf->num_pending_arq >=
+ mbx_data->async_watermark_val) {
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ } else {
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ }
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_TRAVERSE:
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_DETECT:
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ break;
+
+ default:
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ status = ICE_ERR_CFG;
+ }
+
+ snap_buf->state = new_state;
+
+ /* Only report VFs as malicious the first time we detect it */
+ if (is_malvf && !vf_info->malicious) {
+ vf_info->malicious = 1;
+ *report_malvf = true;
+ }
+
+ return status;
+}
+
+/**
+ * ice_mbx_clear_malvf - Clear VF mailbox info
+ * @vf_info: the mailbox tracking structure for a VF
+ *
+ * In case of a VF reset, this function shall be called to clear the VF's
+ * current mailbox tracking state.
+ */
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info)
+{
+ vf_info->malicious = 0;
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info
+ * @hw: pointer to the hardware structure
+ * @vf_info: the mailbox tracking info structure for a VF
+ *
+ * Initialize a VF mailbox tracking info structure and insert it into the
+ * snapshot list.
+ *
+ * If you remove the VF, you must also delete the associated VF info structure
+ * from the linked list.
+ */
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ ice_mbx_clear_malvf(vf_info);
+ LIST_ADD(&vf_info->list_entry, &snap->mbx_vf);
+}
+
+/**
+ * ice_mbx_init_snapshot - Initialize mailbox snapshot data
+ * @hw: pointer to the hardware structure
+ *
+ * Clear the mailbox snapshot structure and initialize the VF mailbox list.
+ */
+void ice_mbx_init_snapshot(struct ice_hw *hw)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ INIT_LIST_HEAD(&snap->mbx_vf);
+ ice_mbx_reset_snapshot(snap);
+}
diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h
new file mode 100644
index 000000000000..3b185ac89c11
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ICE_VF_MBX_H_
+#define _ICE_VF_MBX_H_
+
+#include "ice_type.h"
+#include "ice_controlq.h"
+
+/* Defining the mailbox message threshold as 63 asynchronous
+ * pending messages. Normal VF functionality does not require
+ * sending more than 63 asynchronous pending message.
+ */
+
+ /* Threshold value should be used to initialize
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register.
+ */
+#define ICE_ASYNC_VF_MSG_THRESHOLD 63
+
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd);
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd);
+
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed);
+
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event);
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id);
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf);
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_snapshot(struct ice_hw *hw);
+#endif /* _ICE_VF_MBX_H_ */
diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c
index e60ee0f1c5c3..1469d2916465 100644
--- a/sys/dev/ice/if_ice_iflib.c
+++ b/sys/dev/ice/if_ice_iflib.c
@@ -42,6 +42,9 @@
#include "ice_drv_info.h"
#include "ice_switch.h"
#include "ice_sched.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <sys/module.h>
#include <sys/sockio.h>
@@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx);
static int ice_if_resume(if_ctx_t ctx);
static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event);
static void ice_init_link(struct ice_softc *sc);
+#ifdef PCI_IOV
+static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params);
+static void ice_if_iov_uninit(if_ctx_t ctx);
+static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params);
+static void ice_if_vflr_handle(if_ctx_t ctx);
+#endif
static int ice_setup_mirror_vsi(struct ice_mirr_if *mif);
static int ice_wire_mirror_intrs(struct ice_mirr_if *mif);
static void ice_free_irqvs_subif(struct ice_mirr_if *mif);
@@ -158,6 +167,11 @@ static device_method_t ice_methods[] = {
DEVMETHOD(device_shutdown, iflib_device_shutdown),
DEVMETHOD(device_suspend, iflib_device_suspend),
DEVMETHOD(device_resume, iflib_device_resume),
+#ifdef PCI_IOV
+ DEVMETHOD(pci_iov_init, iflib_device_iov_init),
+ DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit),
+ DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf),
+#endif
DEVMETHOD_END
};
@@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = {
DEVMETHOD(ifdi_suspend, ice_if_suspend),
DEVMETHOD(ifdi_resume, ice_if_resume),
DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart),
+#ifdef PCI_IOV
+ DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add),
+ DEVMETHOD(ifdi_iov_init, ice_if_iov_init),
+ DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit),
+ DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle),
+#endif
DEVMETHOD_END
};
@@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media)
iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0);
ice_rdma_link_change(sc, LINK_STATE_DOWN, 0);
}
+#ifdef PCI_IOV
+ ice_vc_notify_all_vfs_link_state(sc);
+#endif
update_media = true;
}
@@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx)
ice_add_device_sysctls(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) {
+ err = ice_iov_attach(sc);
+ if (err == ENOMEM)
+ return (err);
+ }
+#endif /* PCI_IOV */
+
/* Get DCBX/LLDP state and start DCBX agent */
ice_init_dcb_setup(sc);
@@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx)
ice_destroy_mirror_interface(sc);
ice_rdma_pf_detach(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV))
+ ice_iov_detach(sc);
+#endif /* PCI_IOV */
+
/* Free allocated media types */
ifmedia_removeall(sc->media);
@@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix)
/* For future interrupt assignments */
sc->last_rid = rid + sc->irdma_vectors;
+#ifdef PCI_IOV
+ /* Create soft IRQ for handling VF resets */
+ iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov");
+#endif
+
return (0);
fail:
for (; i >= 0; i--, vector--)
@@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_vsi_del_txqs_ctx(vsi);
@@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap);
@@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx)
/* Check and update link status */
ice_update_link_status(sc, false);
+#ifdef PCI_IOV
+ /*
+ * Schedule VFs' reset handler after global resets
+ * and other events were processed.
+ */
+ if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING))
+ iflib_iov_intr_deferred(ctx);
+#endif
+
/*
* If there are still messages to process, we need to reschedule
* ourselves. Otherwise, we can just re-enable the interrupt. We'll be
@@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc)
}
+#ifdef PCI_IOV
+/**
+ * ice_if_iov_init - iov init handler for iflib
+ * @ctx: iflib context pointer
+ * @num_vfs: number of VFs to create
+ * @params: configuration parameters for the PF
+ *
+ * Configure the driver for SR-IOV mode. Used to setup things like memory
+ * before any VFs are created.
+ *
+ * @remark This is a wrapper for ice_iov_init
+ */
+static int
+ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_init(sc, num_vfs, params);
+}
+
+/**
+ * ice_if_iov_uninit - iov uninit handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Destroys VFs and frees their memory and resources.
+ *
+ * @remark This is a wrapper for ice_iov_uninit
+ */
+static void
+ice_if_iov_uninit(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ ice_iov_uninit(sc);
+}
+
+/**
+ * ice_if_iov_vf_add - iov add vf handler for iflib
+ * @ctx: iflib context pointer
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * Sets up the VF given by the vfnum index. This is called by the OS
+ * for each VF created by the PF driver after it is spawned.
+ *
+ * @remark This is a wrapper for ice_iov_vf_add
+ */
+static int
+ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_add_vf(sc, vfnum, params);
+}
+
+/**
+ * ice_if_vflr_handle - iov VFLR handler
+ * @ctx: iflib context pointer
+ *
+ * Performs the necessar teardown or setup required for a VF after
+ * a VFLR is initiated.
+ *
+ * @remark This is a wrapper for ice_iov_handle_vflr
+ */
+static void
+ice_if_vflr_handle(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+ ice_iov_handle_vflr(sc);
+}
+#endif /* PCI_IOV */
+
extern struct if_txrx ice_subif_txrx;
/**
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 741a7c013f7d..29dc0c880e3a 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -11,9 +11,9 @@
*/
/*-
- * The following functions are based on the vn(4) driver: mdstart_swap(),
- * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
- * and as such under the following copyright:
+ * The following functions are based on the historical vn(4) driver:
+ * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode()
+ * and mddestroy(), and as such under the following copyright:
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c
index 1240d0f84415..409f34167df0 100644
--- a/sys/dev/mgb/if_mgb.c
+++ b/sys/dev/mgb/if_mgb.c
@@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc)
/* Stop MAC */
CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL);
- CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
+ CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0)))
return (err);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0)))
diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h
index 361b9f72d873..c3f3a2372482 100644
--- a/sys/dev/mlx5/mlx5_accel/ipsec.h
+++ b/sys/dev/mlx5/mlx5_accel/ipsec.h
@@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv);
void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv);
int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv);
int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr);
-void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr);
+void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr);
static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
{
@@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
}
static inline void
-mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe,
+mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe,
struct mlx5e_rq_mbuf *mr)
{
u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata);
if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data))
- mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr);
}
#endif /* __MLX5_ACCEL_IPSEC_H__ */
diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
index 0883cfb2d510..5dccb8bc2b87 100644
--- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
+++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
@@ -24,11 +24,14 @@
*
*/
+#include "opt_ipsec.h"
+
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netipsec/keydb.h>
#include <netipsec/ipsec_offload.h>
+#include <netipsec/xform.h>
#include <dev/mlx5/qp.h>
#include <dev/mlx5/mlx5_en/en.h>
#include <dev/mlx5/mlx5_accel/ipsec.h>
@@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
return (0);
mtag = (struct ipsec_accel_in_tag *)m_tag_get(
- PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT);
+ PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) -
+ __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT);
if (mtag == NULL)
return (-ENOMEM);
mr->ipsec_mtag = mtag;
@@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
}
void
-mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr)
+mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr)
{
struct ipsec_accel_in_tag *mtag;
u32 drv_spi;
@@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata));
mtag = mr->ipsec_mtag;
WARN_ON(mtag == NULL);
- mr->ipsec_mtag = NULL;
if (mtag != NULL) {
mtag->drv_spi = drv_spi;
- m_tag_prepend(mb, &mtag->tag);
+ if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) {
+ m_tag_prepend(mb, &mtag->tag);
+ mr->ipsec_mtag = NULL;
+ }
}
}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
index 4de451f1b039..89d2010656c5 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
@@ -659,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p
return (EINVAL);
MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he);
- MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he);
+ MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0);
+ MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he);
return (0);
}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index 6b53db6fea23..eb569488631a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
break;
}
- mlx5e_accel_ipsec_handle_rx(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr);
}
static inline void
diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c
index 05ec69a70dfe..9d23d5df1d2b 100644
--- a/sys/dev/qlnx/qlnxe/qlnx_os.c
+++ b/sys/dev/qlnx/qlnxe/qlnx_os.c
@@ -30,6 +30,8 @@
* Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131.
*/
+#include "opt_inet.h"
+
#include <sys/cdefs.h>
#include "qlnx_os.h"
#include "bcm_osal.h"
@@ -2778,7 +2780,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
if (!p_ptt) {
QL_DPRINT1(ha, "ecore_ptt_acquire failed\n");
- ret = -1;
+ ret = ERESTART;
break;
}
@@ -2789,7 +2791,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
ecore_ptt_release(p_hwfn, p_ptt);
if (ret) {
- ret = -1;
+ ret = ENODEV;
break;
}
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index c4282c723a44..8363de99a60a 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event)
u_int pl;
RANDOM_RESEED_LOCK();
+ /*
+ * Run SP 800-90B health tests on the source if so configured.
+ */
+ if (!random_harvest_healthtest(event)) {
+ RANDOM_RESEED_UNLOCK();
+ return;
+ }
/*-
* FS&K - P_i = P_i|<harvested stuff>
* Accumulate the event into the appropriate pool
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 395310b115fb..c7762967c4fb 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -88,6 +88,8 @@ static void random_sources_feed(void);
static __read_mostly bool epoch_inited;
static __read_mostly epoch_t rs_epoch;
+static const char *random_source_descr[ENTROPYSOURCE];
+
/*
* How many events to queue up. We create this many items in
* an 'empty' queue, then transfer them to the 'harvest' queue with
@@ -299,6 +301,230 @@ random_sources_feed(void)
explicit_bzero(entropy, sizeof(entropy));
}
+/*
+ * State used for conducting NIST SP 800-90B health tests on entropy sources.
+ */
+static struct health_test_softc {
+ uint32_t ht_rct_value[HARVESTSIZE + 1];
+ u_int ht_rct_count; /* number of samples with the same value */
+ u_int ht_rct_limit; /* constant after init */
+
+ uint32_t ht_apt_value[HARVESTSIZE + 1];
+ u_int ht_apt_count; /* number of samples with the same value */
+ u_int ht_apt_seq; /* sequence number of the last sample */
+ u_int ht_apt_cutoff; /* constant after init */
+
+ uint64_t ht_total_samples;
+ bool ondemand; /* Set to true to restart the state machine */
+ enum {
+ INIT = 0, /* initial state */
+ DISABLED, /* health checking is disabled */
+ STARTUP, /* doing startup tests, samples are discarded */
+ STEADY, /* steady-state operation */
+ FAILED, /* health check failed, discard samples */
+ } ht_state;
+} healthtest[ENTROPYSOURCE];
+
+#define RANDOM_SELFTEST_STARTUP_SAMPLES 1024 /* 4.3, requirement 4 */
+#define RANDOM_SELFTEST_APT_WINDOW 512 /* 4.4.2 */
+
+static void
+copy_event(uint32_t dst[static HARVESTSIZE + 1],
+ const struct harvest_event *event)
+{
+ memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1));
+ memcpy(dst, event->he_entropy, event->he_size);
+ dst[HARVESTSIZE] = event->he_somecounter;
+}
+
+static void
+random_healthtest_rct_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_rct_count = 1;
+ copy_event(ht->ht_rct_value, event);
+}
+
+/*
+ * Apply the repitition count test to a sample.
+ *
+ * Return false if the test failed, i.e., we observed >= C consecutive samples
+ * with the same value, and true otherwise.
+ */
+static bool
+random_healthtest_rct_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) {
+ ht->ht_rct_count = 1;
+ memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value));
+ return (true);
+ } else {
+ ht->ht_rct_count++;
+ return (ht->ht_rct_count < ht->ht_rct_limit);
+ }
+}
+
+static void
+random_healthtest_apt_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_apt_count = 1;
+ ht->ht_apt_seq = 1;
+ copy_event(ht->ht_apt_value, event);
+}
+
+static bool
+random_healthtest_apt_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ if (ht->ht_apt_seq == 0) {
+ random_healthtest_apt_init(ht, event);
+ return (true);
+ }
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) {
+ ht->ht_apt_count++;
+ if (ht->ht_apt_count >= ht->ht_apt_cutoff)
+ return (false);
+ }
+
+ ht->ht_apt_seq++;
+ if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW)
+ ht->ht_apt_seq = 0;
+
+ return (true);
+}
+
+/*
+ * Run the health tests for the given event. This is assumed to be called from
+ * a serialized context.
+ */
+bool
+random_harvest_healthtest(const struct harvest_event *event)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[event->he_source];
+
+ /*
+ * Was on-demand testing requested? Restart the state machine if so,
+ * restarting the startup tests.
+ */
+ if (atomic_load_bool(&ht->ondemand)) {
+ atomic_store_bool(&ht->ondemand, false);
+ ht->ht_state = INIT;
+ }
+
+ switch (ht->ht_state) {
+ case __predict_false(INIT):
+ /* Store the first sample and initialize test state. */
+ random_healthtest_rct_init(ht, event);
+ random_healthtest_apt_init(ht, event);
+ ht->ht_total_samples = 0;
+ ht->ht_state = STARTUP;
+ return (false);
+ case DISABLED:
+ /* No health testing for this source. */
+ return (true);
+ case STEADY:
+ case STARTUP:
+ ht->ht_total_samples++;
+ if (random_healthtest_rct_next(ht, event) &&
+ random_healthtest_apt_next(ht, event)) {
+ if (ht->ht_state == STARTUP &&
+ ht->ht_total_samples >=
+ RANDOM_SELFTEST_STARTUP_SAMPLES) {
+ printf(
+ "random: health test passed for source %s\n",
+ random_source_descr[event->he_source]);
+ ht->ht_state = STEADY;
+ }
+ return (ht->ht_state == STEADY);
+ }
+ ht->ht_state = FAILED;
+ printf(
+ "random: health test failed for source %s, discarding samples\n",
+ random_source_descr[event->he_source]);
+ /* FALLTHROUGH */
+ case FAILED:
+ return (false);
+ }
+}
+
+static bool nist_healthtest_enabled = false;
+SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled,
+ CTLFLAG_RDTUN, &nist_healthtest_enabled, 0,
+ "Enable NIST SP 800-90B health tests for noise sources");
+
+static void
+random_healthtest_init(enum random_entropy_source source)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[source];
+ KASSERT(ht->ht_state == INIT,
+ ("%s: health test state is %d for source %d",
+ __func__, ht->ht_state, source));
+
+ /*
+ * If health-testing is enabled, validate all sources except CACHED and
+ * VMGENID: they are deterministic sources used only a small, fixed
+ * number of times, so statistical testing is not applicable.
+ */
+ if (!nist_healthtest_enabled ||
+ source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) {
+ ht->ht_state = DISABLED;
+ return;
+ }
+
+ /*
+ * Set cutoff values for the two tests, assuming that each sample has
+ * min-entropy of 1 bit and allowing for an error rate of 1 in 2^{34}.
+ * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see an false
+ * positive once in ~54.5 years.
+ *
+ * The RCT limit comes from the formula in section 4.4.1.
+ *
+ * The APT cutoff is calculated using the formula in section 4.4.2
+ * footnote 10 with the window size changed from 512 to 511, since the
+ * test as written counts the number of samples equal to the first
+ * sample in the window, and thus tests W-1 samples.
+ */
+ ht->ht_rct_limit = 35;
+ ht->ht_apt_cutoff = 330;
+}
+
+static int
+random_healthtest_ondemand(SYSCTL_HANDLER_ARGS)
+{
+ u_int mask, source;
+ int error;
+
+ mask = 0;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ while (mask != 0) {
+ source = ffs(mask) - 1;
+ if (source < nitems(healthtest))
+ atomic_store_bool(&healthtest[source].ondemand, true);
+ mask &= ~(1u << source);
+ }
+ return (0);
+}
+SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ random_healthtest_ondemand, "I",
+ "Re-run NIST SP 800-90B startup health tests for a noise source");
+
static int
random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS)
{
@@ -362,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = {
[RANDOM_SWI] = "SWI",
[RANDOM_FS_ATIME] = "FS_ATIME",
[RANDOM_UMA] = "UMA",
- [RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */
+ [RANDOM_CALLOUT] = "CALLOUT",
+ [RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */
[RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */
[RANDOM_PURE_SAFE] = "PURE_SAFE",
[RANDOM_PURE_GLXSB] = "PURE_GLXSB",
@@ -424,6 +651,9 @@ random_harvestq_init(void *unused __unused)
hc_source_mask = almost_everything_mask;
RANDOM_HARVEST_INIT_LOCK();
harvest_context.hc_active_buf = 0;
+
+ for (int i = 0; i < ENTROPYSOURCE; i++)
+ random_healthtest_init(i);
}
SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL);
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index 7804bf52aa4f..1d462500df85 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -49,4 +49,6 @@ random_get_cyclecount(void)
return ((uint32_t)get_cyclecount());
}
+bool random_harvest_healthtest(const struct harvest_event *event);
+
#endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 9d1c7b1167c8..ced4dd8067d9 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -312,7 +312,7 @@ randomdev_accumulate(uint8_t *buf, u_int count)
for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
event.he_somecounter = random_get_cyclecount();
event.he_size = sizeof(event.he_entropy);
- event.he_source = RANDOM_CACHED;
+ event.he_source = RANDOM_RANDOMDEV;
event.he_destination = destination++; /* Harmless cheating */
memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
p_random_alg_context->ra_event_processor(&event);
diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h
index cac743884ee6..ac58d44102a0 100644
--- a/sys/dev/ufshci/ufshci_private.h
+++ b/sys/dev/ufshci/ufshci_private.h
@@ -149,6 +149,8 @@ struct ufshci_hw_queue {
bus_dmamap_t queuemem_map;
bus_addr_t req_queue_addr;
+ bus_addr_t *ucd_bus_addr;
+
uint32_t num_entries;
uint32_t num_trackers;
@@ -198,8 +200,6 @@ struct ufshci_req_queue {
bus_dma_tag_t dma_tag_payload;
bus_dmamap_t ucdmem_map;
-
- bus_addr_t ucd_addr;
};
struct ufshci_device {
diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c
index 4670281d367a..b1f303afaef5 100644
--- a/sys/dev/ufshci/ufshci_req_sdb.c
+++ b/sys/dev/ufshci/ufshci_req_sdb.c
@@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue)
}
}
+static void
+ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+ struct ufshci_hw_queue *hwq = arg;
+ int i;
+
+ if (error != 0) {
+ printf("ufshci: Failed to map UCD, error = %d\n", error);
+ return;
+ }
+
+ if (hwq->num_trackers != nseg) {
+ printf(
+ "ufshci: Failed to map UCD, num_trackers = %d, nseg = %d\n",
+ hwq->num_trackers, nseg);
+ return;
+ }
+
+ for (i = 0; i < nseg; i++) {
+ hwq->ucd_bus_addr[i] = seg[i].ds_addr;
+ }
+}
+
static int
ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
uint32_t num_entries, struct ufshci_controller *ctrlr)
@@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q];
struct ufshci_tracker *tr;
size_t ucd_allocsz, payload_allocsz;
- uint64_t ucdmem_phys;
uint8_t *ucdmem;
int i, error;
@@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
* Allocate physical memory for UTP Command Descriptor (UCD)
* Note: UFSHCI UCD format is restricted to 128-byte alignment.
*/
- error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128,
- ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
- ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size),
- ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd);
+ error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0,
+ BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz,
+ howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)),
+ sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL,
+ &req_queue->dma_tag_ucd);
if (error != 0) {
ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n",
error);
@@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map,
- ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) {
+ ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) {
ufshci_printf(ctrlr, "failed to load cmd desc memory\n");
bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd,
req_queue->ucdmem_map);
@@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- req_queue->ucd_addr = ucdmem_phys;
/*
* Allocate physical memory for PRDT
@@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
tr->slot_state = UFSHCI_SLOT_STATE_FREE;
tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- tr->ucd_bus_addr = ucdmem_phys;
+ tr->ucd_bus_addr = hwq->ucd_bus_addr[i];
ucdmem += sizeof(struct ufshci_utp_cmd_desc);
- ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc);
hwq->act_tr[i] = tr;
}
@@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr,
req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI,
M_ZERO | M_NOWAIT);
hwq = &req_queue->hwq[UFSHCI_SDB_Q];
+ hwq->num_entries = req_queue->num_entries;
+ hwq->num_trackers = req_queue->num_trackers;
+ req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) *
+ req_queue->num_trackers,
+ M_UFSHCI, M_ZERO | M_NOWAIT);
mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF);
@@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr,
if (mtx_initialized(&hwq->qlock))
mtx_destroy(&hwq->qlock);
+ free(req_queue->hwq->ucd_bus_addr, M_UFSHCI);
free(req_queue->hwq, M_UFSHCI);
}
diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c
index 676ea5de12b8..58a22b8bdc50 100644
--- a/sys/fs/fdescfs/fdesc_vnops.c
+++ b/sys/fs/fdescfs/fdesc_vnops.c
@@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap)
fmp = VFSTOFDESC(ap->a_vp->v_mount);
if (ap->a_ncookies != NULL)
*ap->a_ncookies = 0;
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
off = (int)uio->uio_offset;
if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
@@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap)
fcnt = i - 2; /* The first two nodes are `.' and `..' */
FILEDESC_SLOCK(fdp);
- while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
+ while (uio->uio_resid >= UIO_MX) {
+ if (i >= fdp->fd_nfiles + 2) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
+ break;
+ }
bzero((caddr_t)dp, UIO_MX);
switch (i) {
case 0: /* `.' */
diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c
index da4848169173..208b64930e61 100644
--- a/sys/fs/msdosfs/msdosfs_conv.c
+++ b/sys/fs/msdosfs/msdosfs_conv.c
@@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag,
static u_char *
dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp)
{
- u_char c, *outp;
- size_t len, olen;
+ u_char c, *outp, *outp1;
+ size_t i, len, olen;
outp = outbuf;
if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
olen = len = 4;
+ outp1 = outp;
if (lower & (LCASE_BASE | LCASE_EXT))
msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen, KICONV_LOWER);
else
msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen);
+ for (i = 0; i < outp - outp1; i++) {
+ if (outp1[i] == '/')
+ outp1[i] = '?';
+ }
len -= olen;
/*
@@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc
c = dos2unix[c];
if (lower & (LCASE_BASE | LCASE_EXT))
c = u2l[c];
+ if (c == '/')
+ c = '?';
*outp++ = c;
outbuf[1] = '\0';
}
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c
index f7564ade401b..3acf07d5253b 100644
--- a/sys/fs/nfsserver/nfs_nfsdserv.c
+++ b/sys/fs/nfsserver/nfs_nfsdserv.c
@@ -4353,9 +4353,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
int error = 0;
NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN |
- NOFOLLOW);
+ NOFOLLOW | LOCKLEAF);
cn.cn_nameptr = ".";
cn.cn_namelen = 1;
+ cn.cn_lkflags = LK_SHARED;
NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true)
cn.cn_flags |= CREATENAMED;
@@ -4374,6 +4375,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
if (nd->nd_repstat == ENOATTR)
nd->nd_repstat = NFSERR_NOENT;
}
+ if (nd->nd_repstat == 0)
+ NFSVOPUNLOCK(*vpp);
vput(dp);
NFSEXITCODE2(0, nd);
diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c
index 56bf766ef801..227e2b93883e 100644
--- a/sys/fs/p9fs/p9fs_vnops.c
+++ b/sys/fs/p9fs/p9fs_vnops.c
@@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap)
return (EBADF);
}
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
+
io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK);
/* We haven't reached the end yet. read more. */
@@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap)
count = p9_client_readdir(vofid, (char *)io_buffer,
diroffset, count);
- if (count == 0)
+ if (count == 0) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
break;
+ }
if (count < 0) {
error = EIO;
diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h
index 839bbec08254..19e114763cac 100644
--- a/sys/fs/udf/ecma167-udf.h
+++ b/sys/fs/udf/ecma167-udf.h
@@ -243,7 +243,7 @@ struct part_map_spare {
uint8_t n_st; /* Number of Sparing Tables */
uint8_t reserved1;
uint32_t st_size;
- uint32_t st_loc[1];
+ uint32_t st_loc[];
} __packed;
union udf_pmap {
@@ -266,7 +266,7 @@ struct udf_sparing_table {
uint16_t rt_l; /* Relocation Table len */
uint8_t reserved[2];
uint32_t seq_num;
- struct spare_map_entry entries[1];
+ struct spare_map_entry entries[];
} __packed;
/* Partition Descriptor [3/10.5] */
diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c
index c7438147c0a0..c5ef1f686093 100644
--- a/sys/fs/udf/udf_vfsops.c
+++ b/sys/fs/udf/udf_vfsops.c
@@ -81,6 +81,7 @@
#include <sys/fcntl.h>
#include <sys/iconv.h>
#include <sys/kernel.h>
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
@@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
struct ifid *ifhp;
struct vnode *nvp;
struct udf_node *np;
- off_t fsize;
+ uint64_t fsize;
int error;
ifhp = (struct ifid *)fhp;
@@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
np = VTON(nvp);
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX) {
+ *vpp = NULLVP;
+ return (EIO);
+ }
*vpp = nvp;
vnode_create_vobject(*vpp, fsize, curthread);
diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c
index 88bf4917a851..37889241e8c3 100644
--- a/sys/fs/udf/udf_vnops.c
+++ b/sys/fs/udf/udf_vnops.c
@@ -39,6 +39,7 @@
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/iconv.h>
+#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
@@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a)
}
static int
-udf_open(struct vop_open_args *ap) {
+udf_open(struct vop_open_args *ap)
+{
struct udf_node *np = VTON(ap->a_vp);
- off_t fsize;
+ uint64_t fsize;
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX)
+ return (EIO);
vnode_create_vobject(ap->a_vp, fsize, ap->a_td);
return 0;
}
@@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a)
* that directories consume at least one logical block,
* make it appear so.
*/
- if (fentry->logblks_rec != 0) {
- vap->va_size =
- le64toh(fentry->logblks_rec) * node->udfmp->bsize;
- } else {
+ vap->va_size = le64toh(fentry->logblks_rec);
+ if (vap->va_size == 0)
vap->va_size = node->udfmp->bsize;
- }
+ else if (vap->va_size > UINT64_MAX / node->udfmp->bsize)
+ vap->va_size = UINT64_MAX;
+ else
+ vap->va_size *= node->udfmp->bsize;
} else {
vap->va_size = le64toh(fentry->inf_len);
}
@@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap)
struct buf *bp;
uint8_t *data;
daddr_t lbn, rablock;
+ uint64_t len;
off_t diff, fsize;
ssize_t n;
int error = 0;
@@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap)
return (error);
}
- fsize = le64toh(node->fentry->inf_len);
+ len = le64toh(node->fentry->inf_len);
+ if (len > OFF_MAX) {
+ /* too big, just cap to the requested length */
+ len = uio->uio_resid;
+ }
+ fsize = len;
udfmp = node->udfmp;
do {
lbn = lblkno(udfmp, uio->uio_offset);
@@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a)
struct udf_uiodir uiodir;
struct udf_dirstream *ds;
uint64_t *cookies = NULL;
+ uint64_t len;
int ncookies;
int error = 0;
@@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a)
* Iterate through the file id descriptors. Give the parent dir
* entry special attention.
*/
- ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
- node->udfmp);
+ len = le64toh(node->fentry->inf_len);
+ if (len > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ len = INT_MAX;
+ }
+ ds = udf_opendir(node, uio->uio_offset, len, node->udfmp);
while ((fid = udf_getfid(ds)) != NULL) {
/* XXX Should we return an error on a bad fid? */
@@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap)
struct udf_node *node;
void *buf;
char *cp;
- int error, len, root;
+ uint64_t len;
+ int error, root;
/*
* A symbolic link in UDF is a list of variable-length path
@@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap)
vp = ap->a_vp;
node = VTON(vp);
len = le64toh(node->fentry->inf_len);
+ if (len > MAXPATHLEN)
+ return (EIO);
buf = malloc(len, M_DEVBUF, M_WAITOK);
iov[0].iov_len = len;
iov[0].iov_base = buf;
@@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a)
struct udf_mnt *udfmp;
struct fileid_desc *fid = NULL;
struct udf_dirstream *ds;
+ uint64_t fsize;
u_long nameiop;
u_long flags;
char *nameptr;
long namelen;
ino_t id = 0;
int offset, error = 0;
- int fsize, lkflags, ltype, numdirpasses;
+ int lkflags, ltype, numdirpasses;
dvp = a->a_dvp;
node = VTON(dvp);
@@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a)
nameptr = a->a_cnp->cn_nameptr;
namelen = a->a_cnp->cn_namelen;
fsize = le64toh(node->fentry->inf_len);
+ if (fsize > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ fsize = INT_MAX;
+ }
/*
* If this is a LOOKUP and we've already partially searched through
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 5065b7e61ee8..b44f5e08bbcf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void)
#ifdef PMAP_PAE_COMP
static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
- int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp,
+ int flags)
{
/* Inform UMA that this allocator uses kernel_map/object. */
- *flags = UMA_SLAB_KERNEL;
+ *sflagsp = UMA_SLAB_KERNEL;
+ /* contig allocations cannot be NEVERFREED */
+ flags &= ~M_NEVERFREED;
return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
- bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
+ bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
if (__predict_false(!kasan_enabled))
return;
- if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
- (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+ if (kasan_md_unsupported((vm_offset_t)addr))
return;
KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 94e44d888181..b472aaea89e6 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -2309,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
return (EINVAL);
td->td_pflags2 &= ~TDP2_UEXTERR;
return (0);
+ case EXTERRCTL_UD:
+ /*
+ * Important: this code must always return EINVAL and never any
+ * extended error, for testing purposes.
+ */
+ /* FALLTHROUGH */
default:
return (EINVAL);
}
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 3d455b3874cc..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -332,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+ "enum cache_fpl_status");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
@@ -6420,15 +6421,11 @@ out:
cache_fpl_smr_assert_not_entered(&fpl);
cache_fpl_assert_status(&fpl);
*status = fpl.status;
- if (SDT_PROBES_ENABLED()) {
- SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
- if (fpl.status == CACHE_FPL_STATUS_HANDLED)
- SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
- ndp);
- }
-
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
MPASS(error != CACHE_FPL_FAILED);
+ SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+ ndp);
if (error != 0) {
cache_fpl_cleanup_cnp(fpl.cnp);
MPASS(fpl.dvp == NULL);
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index 2b42228465a4..d3cd0d1f9832 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -371,7 +371,7 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc
TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
- vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+ vn_irflag_unset(vp, VIRF_INOTIFY);
}
/*
@@ -675,7 +675,8 @@ vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
struct vattr va;
int error;
- error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+ error = VOP_GETATTR(vp, &va,
+ cnp->cn_cred);
if (error == 0 && va.va_nlink != 0)
selfevent = 0;
}
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 9922796f8a1d..7cb6e2124326 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -326,6 +326,7 @@ SUBDIR= \
proto \
pseudofs \
${_pst} \
+ ${_pt} \
pty \
puc \
pwm \
@@ -842,6 +843,7 @@ _iwx= iwx
_ixl= ixl
_nvdimm= nvdimm
_pms= pms
+_pt= pt
_qat= qat
.if ${MK_SOURCELESS_UCODE} != "no"
_qatfw= qatfw
diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile
index 91f20193d878..9f9c9f602cda 100644
--- a/sys/modules/ice/Makefile
+++ b/sys/modules/ice/Makefile
@@ -13,6 +13,7 @@ SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h
SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c
SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c
SRCS += ice_fw_logging.c ice_ddp_common.c
+SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c
# RDMA Client interface
# TODO: Is this the right way to compile this?
diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile
new file mode 100644
index 000000000000..416b072face9
--- /dev/null
+++ b/sys/modules/pt/Makefile
@@ -0,0 +1,8 @@
+
+.PATH: ${SRCTOP}/sys/amd64/pt
+
+KMOD= pt
+SRCS= pt.c pt.h device_if.h bus_if.h
+SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile
index 3d8415cf0e57..2a44ae6ddde5 100644
--- a/sys/modules/qlnx/qlnxe/Makefile
+++ b/sys/modules/qlnx/qlnxe/Makefile
@@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c
SRCS+=qlnx_ioctl.c
SRCS+=qlnx_os.c
+SRCS+=opt_inet.h
SRCS+= ${LINUXKPI_GENSRCS}
diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h
index cf4f75bd0b6c..01485cf26e06 100644
--- a/sys/net/ethernet.h
+++ b/sys/net/ethernet.h
@@ -62,6 +62,8 @@ struct ether_header {
u_char ether_shost[ETHER_ADDR_LEN];
u_short ether_type;
} __packed;
+_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN,
+ "size of struct ether_header is wrong");
/*
* Structure of a 48-bit Ethernet address.
@@ -69,6 +71,8 @@ struct ether_header {
struct ether_addr {
u_char octet[ETHER_ADDR_LEN];
} __packed;
+_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN,
+ "size of struct ether_addr is wrong");
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_IPV6_MULTICAST(addr) \
@@ -112,6 +116,8 @@ struct ether_vlan_header {
uint16_t evl_tag;
uint16_t evl_proto;
} __packed;
+_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN,
+ "size of struct ether_vlan_header is wrong");
#define EVL_VLID_MASK 0x0FFF
#define EVL_PRI_MASK 0xE000
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 7be4dfac23e7..3ae0c01c0efc 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -92,11 +92,6 @@
#include <crypto/sha1.h>
-#ifdef CTASSERT
-CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
-CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
-#endif
-
VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */
/* netgraph node hooks for ng_ether(4) */
diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c
index c5a478533313..9074878e17e4 100644
--- a/sys/net80211/ieee80211_hostap.c
+++ b/sys/net80211/ieee80211_hostap.c
@@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
/* VHT */
if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
- vhtcap != NULL &&
- vhtinfo != NULL) {
- /* XXX TODO; see below */
- net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__);
+ vhtcap != NULL) {
ieee80211_vht_node_init(ni);
- ieee80211_vht_update_cap(ni, vhtcap, vhtinfo);
+ ieee80211_vht_update_cap(ni, vhtcap);
} else if (ni->ni_flags & IEEE80211_NODE_VHT)
ieee80211_vht_node_cleanup(ni);
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 5ec80e3646b8..c28f124648a1 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -1952,6 +1952,11 @@ do { \
_RETURN_CHAN_BITS(0);
/*
+ * TODO: should we bail out if there's no htinfo?
+ * Or just treat it as if we can't do the HT20/HT40 check?
+ */
+
+ /*
* The original code was based on
* 802.11ac-2013, Table 8-183x-VHT Operation Information subfields.
* 802.11-2020, Table 9-274-VHT Operation Information subfields
@@ -1962,8 +1967,12 @@ do { \
*/
htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie;
- ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
- IEEE80211_HTINFO_TXWIDTH_2040);
+ if (htinfo != NULL)
+ ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
+ IEEE80211_HTINFO_TXWIDTH_2040);
+ else
+ ht40 = false;
+
can_vht160 = can_vht80p80 = can_vht80 = false;
/* 20 Mhz */
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index e91977f1ef98..de0b691d4d2a 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni)
}
void
-ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie,
- const uint8_t *vhtop_ie)
+ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie)
{
ieee80211_parse_vhtcap(ni, vhtcap_ie);
- ieee80211_parse_vhtopmode(ni, vhtop_ie);
}
static struct ieee80211_channel *
diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h
index 2964de63c343..a1529df4a85b 100644
--- a/sys/net80211/ieee80211_vht.h
+++ b/sys/net80211/ieee80211_vht.h
@@ -52,8 +52,7 @@ uint8_t * ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *);
uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *,
struct ieee80211_channel *);
-void ieee80211_vht_update_cap(struct ieee80211_node *,
- const uint8_t *, const uint8_t *);
+void ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *);
struct ieee80211_channel *
ieee80211_vht_adjust_channel(struct ieee80211com *,
diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c
index 6bacc68b7441..92d0201b398a 100644
--- a/sys/netipsec/ipsec.c
+++ b/sys/netipsec/ipsec.c
@@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp)
#ifdef IPSEC_OFFLOAD
tag = ipsec_accel_input_tag_lookup(m);
- if (tag != NULL)
- return (0);
+ if (tag != NULL) {
+ tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
+ __DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED;
+ }
#endif
if (ip1 == NULL) {
diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c
index 467d5ded1d7a..8a09d5f37b4a 100644
--- a/sys/netipsec/ipsec_offload.c
+++ b/sys/netipsec/ipsec_offload.c
@@ -94,6 +94,7 @@ struct ifp_handle_sav {
size_t hdr_ext_size;
uint64_t cnt_octets;
uint64_t cnt_allocs;
+ struct xform_history xfh;
};
#define IFP_HS_HANDLED 0x00000001
@@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav,
static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp);
+static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
static void
ipsec_accel_init(void *arg)
@@ -185,6 +188,7 @@ ipsec_accel_init(void *arg)
ipsec_accel_drv_sa_lifetime_update_impl;
ipsec_accel_drv_sa_lifetime_fetch_p =
ipsec_accel_drv_sa_lifetime_fetch_impl;
+ ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl;
pctrie_init(&drv_spi_pctrie);
ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER(
ifnet_departure_event, ipsec_accel_ifdetach_event, NULL,
@@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg)
ipsec_accel_on_ifdown_p = NULL;
ipsec_accel_drv_sa_lifetime_update_p = NULL;
ipsec_accel_drv_sa_lifetime_fetch_p = NULL;
+ ipsec_accel_fill_xh_p = NULL;
ipsec_accel_sync_imp();
clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */
clear_unrhdr(drv_spi_unr);
@@ -412,6 +417,10 @@ ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
ihs->ifdata = priv;
ihs->flags = flags;
ihs->hdr_ext_size = esp_hdrsiz(sav);
+ memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst));
+ ihs->xfh.spi = sav->spi;
+ ihs->xfh.proto = sav->sah->saidx.proto;
+ ihs->xfh.mode = sav->sah->saidx.mode;
mtx_lock(&ipsec_accel_sav_tmp);
CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) {
if (i->ifp == ifp) {
@@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav)
return (m);
}
+static bool
+ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ struct ifp_handle_sav *i;
+
+ if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN ||
+ drv_spi > IPSEC_ACCEL_DRV_SPI_MAX)
+ return (false);
+
+ i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi);
+ if (i == NULL)
+ return (false);
+ memcpy(xh, &i->xfh, sizeof(*xh));
+ return (true);
+}
+
#endif /* IPSEC_OFFLOAD */
diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h
index 904fe6252396..ae60eaa8ae78 100644
--- a/sys/netipsec/ipsec_offload.h
+++ b/sys/netipsec/ipsec_offload.h
@@ -30,6 +30,7 @@
#include <sys/errno.h>
#include <net/if.h>
#include <net/if_var.h>
+#include <netipsec/xform.h>
struct secpolicy;
struct secasvar;
@@ -42,6 +43,7 @@ struct ipsec_accel_out_tag {
struct ipsec_accel_in_tag {
struct m_tag tag;
+ struct xform_history xh; /* Must be first to mimic IPSEC_IN_DONE */
uint16_t drv_spi;
};
@@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs);
extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#ifdef IPSEC_OFFLOAD
/*
@@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
return (NULL);
}
+static inline bool
+ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh);
+
+ p = atomic_load_ptr(&ipsec_accel_fill_xh_p);
+ if (p != NULL)
+ return (p(ifp, drv_spi, xh));
+ return (false);
+}
#else
#define ipsec_accel_sa_newkey(a)
@@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
#define ipsec_accel_sync()
#define ipsec_accel_is_accel_sav(a)
#define ipsec_accel_key_setaccelif(a)
+#define ipsec_accel_fill_xh(a, b, c) (false)
#endif
void ipsec_accel_forget_sav_impl(struct secasvar *sav);
@@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m,
struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af,
int mtu, int *hwassist);
void ipsec_accel_forget_sav(struct secasvar *sav);
+struct xform_history;
#else
#define ipsec_accel_input(a, b, c) (ENXIO)
#define ipsec_accel_output(a, b, c, d, e, f, g, h) ({ \
diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c
index ae67d83c6d13..4ba1b49c24f0 100644
--- a/sys/netipsec/key.c
+++ b/sys/netipsec/key.c
@@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t octets, uint64_t allocs);
int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#endif
#define FULLMASK 0xff
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 63d513fb1956..009f7e4d78b1 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -10064,6 +10064,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
pd->didx = (dir == PF_IN) ? 1 : 0;
pd->af = pd->naf = af;
+ PF_RULES_ASSERT();
+
TAILQ_INIT(&pd->sctp_multihome_jobs);
if (default_actions != NULL)
memcpy(&pd->act, default_actions, sizeof(pd->act));
@@ -10139,6 +10141,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
}
h = mtod(pd->m, struct ip6_hdr *);
+ if (pd->m->m_pkthdr.len <
+ sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
+ *action = PF_DROP;
+ REASON_SET(reason, PFRES_SHORT);
+ return (-1);
+ }
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
@@ -10477,35 +10485,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
PF_RULES_RLOCK_TRACKER;
KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
M_ASSERTPKTHDR(*m0);
+ NET_EPOCH_ASSERT();
if (!V_pf_status.running)
return (PF_PASS);
- PF_RULES_RLOCK();
-
kif = (struct pfi_kkif *)ifp->if_pf_kif;
if (__predict_false(kif == NULL)) {
DPFPRINTF(PF_DEBUG_URGENT,
("%s: kif == NULL, if_xname %s\n",
__func__, ifp->if_xname));
- PF_RULES_RUNLOCK();
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if ((*m0)->m_flags & M_SKIP_FIREWALL) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if (__predict_false(! M_WRITABLE(*m0))) {
*m0 = m_unshare(*m0, M_NOWAIT);
if (*m0 == NULL) {
- PF_RULES_RUNLOCK();
return (PF_DROP);
}
}
@@ -10518,12 +10521,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
pd.pf_mtag->if_idxgen);
if (ifp == NULL || ifp->if_flags & IFF_DYING) {
- PF_RULES_RUNLOCK();
m_freem(*m0);
*m0 = NULL;
return (PF_PASS);
}
- PF_RULES_RUNLOCK();
(ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL);
*m0 = NULL;
return (PF_PASS);
@@ -10538,11 +10539,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
/* But only once. We may see the packet multiple times (e.g.
* PFIL_IN/PFIL_OUT). */
pf_dummynet_flag_remove(pd.m, pd.pf_mtag);
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
+ PF_RULES_RLOCK();
+
if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
kif, default_actions) == -1) {
if (action != PF_PASS)
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index 3caa0d2e3b11..5c69c395c5fc 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -2042,6 +2042,34 @@ pf_ioctl_getrules(struct pfioc_rule *pr)
}
static int
+pf_rule_checkaf(struct pf_krule *r)
+{
+ switch (r->af) {
+ case 0:
+ if (r->rule_flag & PFRULE_AFTO)
+ return (EPFNOSUPPORT);
+ break;
+ case AF_INET:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6)
+ return (EPFNOSUPPORT);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET)
+ return (EPFNOSUPPORT);
+ break;
+#endif /* INET6 */
+ default:
+ return (EPFNOSUPPORT);
+ }
+
+ if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0)
+ return (EPFNOSUPPORT);
+
+ return (0);
+}
+
+static int
pf_validate_range(uint8_t op, uint16_t port[2])
{
uint16_t a = ntohs(port[0]);
@@ -2073,6 +2101,8 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
#define ERROUT(x) ERROUT_FUNCTION(errout, x)
+ if ((error = pf_rule_checkaf(rule)))
+ ERROUT(error);
if (pf_validate_range(rule->src.port_op, rule->src.port))
ERROUT(EINVAL);
if (pf_validate_range(rule->dst.port_op, rule->dst.port))
@@ -3567,7 +3597,7 @@ DIOCADDRULENV_error:
error = pf_rule_to_krule(&pr->rule, rule);
if (error != 0) {
pf_krule_free(rule);
- break;
+ goto fail;
}
pr->anchor[sizeof(pr->anchor) - 1] = '\0';
@@ -3726,11 +3756,11 @@ DIOCGETRULENV_error:
if (pcr->action < PF_CHANGE_ADD_HEAD ||
pcr->action > PF_CHANGE_GET_TICKET) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->action != PF_CHANGE_REMOVE) {
@@ -3738,9 +3768,13 @@ DIOCGETRULENV_error:
error = pf_rule_to_krule(&pcr->rule, newrule);
if (error != 0) {
pf_krule_free(newrule);
- break;
+ goto fail;
}
+ if ((error = pf_rule_checkaf(newrule))) {
+ pf_krule_free(newrule);
+ goto fail;
+ }
if (newrule->ifname[0])
kif = pf_kkif_create(M_WAITOK);
pf_counter_u64_init(&newrule->evaluations, M_WAITOK);
@@ -3888,7 +3922,7 @@ DIOCGETRULENV_error:
pf_free_rule(newrule);
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
- break;
+ goto fail;
}
newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list);
@@ -3915,7 +3949,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
}
@@ -3933,7 +3967,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EEXIST;
- break;
+ goto fail;
}
if (oldrule == NULL)
@@ -3989,7 +4023,7 @@ DIOCCHANGERULE_error:
if (sp->timeout >= PFTM_MAX) {
error = EINVAL;
- break;
+ goto fail;
}
if (V_pfsync_state_import_ptr != NULL) {
PF_RULES_RLOCK();
@@ -4009,7 +4043,7 @@ DIOCCHANGERULE_error:
s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
if (s == NULL) {
error = ENOENT;
- break;
+ goto fail;
}
pfsync_state_export((union pfsync_state_union*)&ps->state,
@@ -4088,7 +4122,7 @@ DIOCGETSTATES_retry:
error = copyout(pstore, out,
sizeof(struct pfsync_state_1301) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATES_full:
@@ -4108,7 +4142,7 @@ DIOCGETSTATES_full:
if (ps->ps_req_version > PF_STATE_VERSION) {
error = ENOTSUP;
- break;
+ goto fail;
}
if (ps->ps_len <= 0) {
@@ -4166,7 +4200,7 @@ DIOCGETSTATESV2_retry:
error = copyout(pstore, out,
sizeof(struct pf_state_export) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATESV2_full:
@@ -4272,12 +4306,12 @@ DIOCGETSTATESV2_full:
if (psp->ifname[0] == '\0') {
error = EINVAL;
- break;
+ goto fail;
}
error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ);
if (error != 0)
- break;
+ goto fail;
ifp = ifunit(ps.ifname);
if (ifp != NULL) {
psp->baudrate32 =
@@ -4338,7 +4372,7 @@ DIOCGETSTATESV2_full:
altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO);
error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd));
if (error)
- break;
+ goto fail;
altq->local_flags = 0;
PF_RULES_WLOCK();
@@ -4346,7 +4380,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
error = EBUSY;
- break;
+ goto fail;
}
/*
@@ -4358,7 +4392,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
error = EBUSY;
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
altq->altq_disc = NULL;
TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) {
@@ -4378,7 +4412,7 @@ DIOCGETSTATESV2_full:
if (error) {
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
if (altq->qname[0] != 0)
@@ -4416,13 +4450,13 @@ DIOCGETSTATESV2_full:
if (pa->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
altq = pf_altq_get_nth_active(pa->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd));
PF_RULES_RUNLOCK();
@@ -4446,20 +4480,20 @@ DIOCGETSTATESV2_full:
if (pq->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
nbytes = pq->nbytes;
altq = pf_altq_get_nth_active(pq->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
PF_RULES_RUNLOCK();
error = ENXIO;
- break;
+ goto fail;
}
PF_RULES_RUNLOCK();
if (cmd == DIOCGETQSTATSV0)
@@ -4528,30 +4562,30 @@ DIOCGETSTATESV2_full:
if (pca->action < PF_CHANGE_ADD_HEAD ||
pca->action > PF_CHANGE_REMOVE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
pca->addr.addr.type != PF_ADDR_DYNIFTL &&
pca->addr.addr.type != PF_ADDR_TABLE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.p.dyn != NULL) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->action != PF_CHANGE_REMOVE) {
#ifndef INET
if (pca->af == AF_INET) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET */
#ifndef INET6
if (pca->af == AF_INET6) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET6 */
newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
@@ -4674,7 +4708,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
@@ -4690,13 +4724,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4705,7 +4739,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_tables(pfrts, io->pfrio_size,
@@ -4722,13 +4756,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4737,7 +4771,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_tables(pfrts, io->pfrio_size,
@@ -4755,14 +4789,14 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4773,7 +4807,7 @@ DIOCCHANGEADDR_error:
if (pfrts == NULL) {
error = ENOMEM;
PF_RULES_RUNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tables(&io->pfrio_table, pfrts,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4792,7 +4826,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
PF_RULES_RLOCK();
@@ -4801,7 +4835,7 @@ DIOCCHANGEADDR_error:
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4812,7 +4846,7 @@ DIOCCHANGEADDR_error:
error = ENOMEM;
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4831,7 +4865,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
@@ -4840,7 +4874,7 @@ DIOCCHANGEADDR_error:
* size, so we didn't fail on overly large requests.
* Keep doing so. */
io->pfrio_size = pf_ioctl_maxcount;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4849,7 +4883,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
@@ -4870,7 +4904,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
@@ -4878,7 +4912,7 @@ DIOCCHANGEADDR_error:
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4890,7 +4924,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_tflags(pfrts, io->pfrio_size,
@@ -4906,7 +4940,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
@@ -4922,13 +4956,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4936,7 +4970,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_addrs(&io->pfrio_table, pfras,
@@ -4956,13 +4990,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4970,7 +5004,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_addrs(&io->pfrio_table, pfras,
@@ -4990,17 +5024,17 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size2 < 0) {
error = EINVAL;
- break;
+ goto fail;
}
count = max(io->pfrio_size, io->pfrio_size2);
if (count > pf_ioctl_maxcount ||
WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = count * sizeof(struct pfr_addr);
pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP,
@@ -5008,7 +5042,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_addrs(&io->pfrio_table, pfras,
@@ -5029,13 +5063,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5057,13 +5091,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_astats)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_astats);
pfrastats = mallocarray(io->pfrio_size,
@@ -5085,13 +5119,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5099,7 +5133,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_astats(&io->pfrio_table, pfras,
@@ -5119,13 +5153,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5133,7 +5167,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_RLOCK();
error = pfr_tst_addrs(&io->pfrio_table, pfras,
@@ -5153,13 +5187,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5167,7 +5201,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_ina_define(&io->pfrio_table, pfras,
@@ -5202,13 +5236,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5216,7 +5250,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5283,13 +5317,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5297,7 +5331,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5366,14 +5400,14 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
@@ -5382,7 +5416,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
/* First makes sure everything will succeed. */
@@ -5523,7 +5557,7 @@ DIOCCHANGEADDR_error:
if (psn->psn_len == 0) {
psn->psn_len = sizeof(struct pf_src_node) * nr;
- break;
+ goto fail;
}
nr = 0;
@@ -5548,7 +5582,7 @@ DIOCCHANGEADDR_error:
sizeof(struct pf_src_node) * nr);
if (error) {
free(pstore, M_TEMP);
- break;
+ goto fail;
}
psn->psn_len = sizeof(struct pf_src_node) * nr;
free(pstore, M_TEMP);
@@ -5604,14 +5638,14 @@ DIOCCHANGEADDR_error:
if (io->pfiio_esize != sizeof(struct pfi_kif)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfiio_size < 0 ||
io->pfiio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) {
error = EINVAL;
- break;
+ goto fail;
}
io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 26f7ab41eef4..9c7863bb301e 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -1012,10 +1012,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
if (rpool->proxy_port[1]) {
uint32_t tmp_nport;
+ uint16_t div;
- tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) %
- (rpool->proxy_port[1] - rpool->proxy_port[0] +
- 1)) + rpool->proxy_port[0];
+ div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
+ div = (div == 0) ? 1 : div;
+
+ tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
+ rpool->proxy_port[0];
/* Wrap around if necessary. */
if (tmp_nport > 65535)
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
index e3f3ab9025f7..9c0151b7da2b 100644
--- a/sys/netpfil/pf/pf_table.c
+++ b/sys/netpfil/pf/pf_table.c
@@ -819,10 +819,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters)
static void
pfr_destroy_kentries(struct pfr_kentryworkq *workq)
{
- struct pfr_kentry *p, *q;
+ struct pfr_kentry *p;
- for (p = SLIST_FIRST(workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrke_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrke_workq);
pfr_destroy_kentry(p);
}
}
@@ -1680,8 +1680,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
}
if (!(flags & PFR_FLAG_DUMMY)) {
- for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) {
pfr_commit_ktable(p, tzero);
}
rs->topen = 0;
@@ -1710,7 +1709,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
} else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
/* kt might contain addresses */
struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq;
- struct pfr_kentry *p, *q, *next;
+ struct pfr_kentry *p, *q;
struct pfr_addr ad;
pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
@@ -1720,7 +1719,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
SLIST_INIT(&delq);
SLIST_INIT(&garbageq);
pfr_clean_node_mask(shadow, &addrq);
- SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) {
+ while ((p = SLIST_FIRST(&addrq)) != NULL) {
+ SLIST_REMOVE_HEAD(&addrq, pfrke_workq);
pfr_copyout_addr(&ad, p);
q = pfr_lookup_addr(kt, &ad, 1);
if (q != NULL) {
@@ -1864,8 +1864,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq)
{
struct pfr_ktable *p, *q;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) {
pfr_setflags_ktable(p, p->pfrkt_nflags);
}
}
@@ -2015,10 +2014,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset)
static void
pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
{
- struct pfr_ktable *p, *q;
+ struct pfr_ktable *p;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrkt_workq);
pfr_destroy_ktable(p, flushaddr);
}
}
diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c
index 9e87af578885..44b63e38a8e6 100644
--- a/sys/rpc/clnt_rc.c
+++ b/sys/rpc/clnt_rc.c
@@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl)
newclient = clnt_vc_create(so,
(struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers,
rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr);
+ /*
+ * CLSET_FD_CLOSE must be done now, in case rpctls_connect()
+ * fails just below.
+ */
+ if (newclient != NULL)
+ CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
if (rc->rc_tls && newclient != NULL) {
CURVNET_SET(so->so_vnet);
stat = rpctls_connect(newclient, rc->rc_tlscertname, so,
@@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl)
goto out;
}
- CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
CLNT_CONTROL(newclient, CLSET_CONNECT, &one);
CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout);
CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry);
diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c
index 93fe283e65fd..51fe270b13d9 100644
--- a/sys/rpc/rpcsec_tls/rpctls_impl.c
+++ b/sys/rpc/rpcsec_tls/rpctls_impl.c
@@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so)
* failed to do the handshake.
*/
mtx_unlock(&rpctls_lock);
+ /*
+ * Do a shutdown on the socket, since the daemon is
+ * probably stuck in SSL_accept() or SSL_connect() trying to
+ * read the socket. Do not soclose() the socket, since the
+ * daemon will close() the socket after SSL_accept()
+ * returns an error.
+ */
+ soshutdown(so, SHUT_RD);
}
}
diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h
index 15557c614f88..7bf1d264ff5e 100644
--- a/sys/sys/exterrvar.h
+++ b/sys/sys/exterrvar.h
@@ -21,6 +21,7 @@
#define EXTERRCTL_ENABLE 1
#define EXTERRCTL_DISABLE 2
+#define EXTERRCTL_UD 3
#define EXTERRCTLF_FORCE 0x00000001
diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h
index 65dc5dba43f3..d1f23d5898bb 100644
--- a/sys/sys/inotify.h
+++ b/sys/sys/inotify.h
@@ -107,11 +107,18 @@ void vn_inotify_revoke(struct vnode *);
} while (0)
/* Log an inotify event using a specific name for the vnode. */
-#define INOTIFY_NAME(vp, dvp, cnp, ev) do { \
+#define INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do { \
if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \
- (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) \
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) { \
+ if (lock) \
+ vn_lock((vp), LK_SHARED | LK_RETRY); \
VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \
+ if (lock) \
+ VOP_UNLOCK(vp); \
+ } \
} while (0)
+#define INOTIFY_NAME(vp, dvp, cnp, ev) \
+ INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false)
extern __uint32_t inotify_rename_cookie;
@@ -126,7 +133,8 @@ extern __uint32_t inotify_rename_cookie;
VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \
} \
if ((tvp) != NULL) \
- INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE); \
+ INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp), \
+ _IN_MOVE_DELETE, true); \
} while (0)
#define INOTIFY_REVOKE(vp) do { \
diff --git a/sys/sys/param.h b/sys/sys/param.h
index af116d6e3f7a..a8e9635242dd 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -74,7 +74,7 @@
* cannot include sys/param.h and should only be updated here.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1500051
+#define __FreeBSD_version 1500052
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 254ba9451d0a..5abf762cd200 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -85,7 +85,8 @@ enum random_entropy_source {
RANDOM_FS_ATIME,
RANDOM_UMA, /* Special!! UMA/SLAB Allocator */
RANDOM_CALLOUT,
- RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT,
+ RANDOM_RANDOMDEV,
+ RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV,
/* Fast hardware random-number sources from here on. */
RANDOM_PURE_START,
RANDOM_PURE_OCTEON = RANDOM_PURE_START,
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 86b75a2d7989..d6bd06226d04 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
#endif
}
-static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
-static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
+static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
+static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */
static int nsw_wcount_async; /* limit async write buffers */
static int nsw_wcount_async_max;/* assigned maximum */
@@ -642,14 +642,14 @@ swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
- if (swap_pager_almost_full == 0) {
+ if (!swap_pager_almost_full) {
printf("swap_pager: out of swap space\n");
- swap_pager_almost_full = 1;
+ swap_pager_almost_full = true;
}
} else {
- swap_pager_full = 0;
+ swap_pager_full = false;
if (swap_pager_avail > nswap_hiwat)
- swap_pager_almost_full = 0;
+ swap_pager_almost_full = false;
}
}
@@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages)
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
} else {
- if (swap_pager_full != 2) {
+ if (!swap_pager_full) {
printf("swp_pager_getswapspace(%d): failed\n",
*io_npages);
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
+ swap_pager_full = swap_pager_almost_full = true;
}
swdevhd = NULL;
}
@@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
sp->sw_id = NULL;
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
- if (nswapdev == 0) {
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
- }
+ if (nswapdev == 0)
+ swap_pager_full = swap_pager_almost_full = true;
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h
index cbbd27389662..9bd3b389fb60 100644
--- a/sys/vm/vm_pagequeue.h
+++ b/sys/vm/vm_pagequeue.h
@@ -260,9 +260,9 @@ struct vm_domain {
u_int vmd_inactive_shortage; /* Per-thread shortage. */
blockcount_t vmd_inactive_running; /* Number of inactive threads. */
blockcount_t vmd_inactive_starting; /* Number of threads started. */
- volatile u_int vmd_addl_shortage; /* Shortage accumulator. */
- volatile u_int vmd_inactive_freed; /* Successful inactive frees. */
- volatile u_int vmd_inactive_us; /* Microseconds for above. */
+ u_int vmd_addl_shortage; /* (a) Shortage accumulator. */
+ u_int vmd_inactive_freed; /* (a) Successful inactive frees. */
+ u_int vmd_inactive_us; /* (a) Microseconds for above. */
u_int vmd_inactive_pps; /* Exponential decay frees/second. */
int vmd_oom_seq;
int vmd_last_active_scan;