123 files changed, 5270 insertions, 915 deletions
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
 
 #include <x86/apicreg.h>
 #include <x86/apicvar.h>
-#ifdef SMP
 #include <machine/smp.h>
 #include <machine/vmparam.h>
-#endif
 
 #include <contrib/dev/acpica/include/acpi.h>
 
@@ -73,19 +71,13 @@ extern int		acpi_resume_beep;
 extern int		acpi_reset_video;
 extern int		acpi_susp_bounce;
 
-#ifdef SMP
 extern struct susppcb	**susppcbs;
 static cpuset_t		suspcpus;
-#else
-static struct susppcb	**susppcbs;
-#endif
 
 static void		acpi_stop_beep(void *);
 
-#ifdef SMP
 static int		acpi_wakeup_ap(struct acpi_softc *, int);
 static void		acpi_wakeup_cpus(struct acpi_softc *);
-#endif
 
 #define	ACPI_WAKEPT_PAGES	7
 
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
 		timer_spkr_release();
 }
 
-#ifdef SMP
 static int
 acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
 {
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
 		outb(CMOS_DATA, mpbiosreason);
 	}
 }
-#endif
 
 int
 acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
 	if (sc->acpi_wakeaddr == 0ul)
 		return (-1);	/* couldn't alloc wake memory */
 
-#ifdef SMP
 	suspcpus = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
 
 	if (acpi_resume_beep != 0)
 		timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
 	pcb = &susppcbs[0]->sp_pcb;
 	if (savectx(pcb)) {
 		fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
 		if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
 			device_printf(sc->acpi_dev, "Failed to suspend APs\n");
 			return (0);	/* couldn't sleep */
 		}
-#endif
 		hw_ibrs_ibpb_active = 0;
 		hw_ssb_active = 0;
 		cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
 			PCPU_SET(switchtime, 0);
 			PCPU_SET(switchticks, ticks);
 			lapic_xapic_mode();
-#ifdef SMP
 			if (!CPU_EMPTY(&suspcpus))
 				acpi_wakeup_cpus(sc);
-#endif
 		}
 
-#ifdef SMP
 		if (!CPU_EMPTY(&suspcpus))
 			resume_cpus(suspcpus);
-#endif
 
 		/*
 		 * Re-read cpu_stdext_feature3, which was zeroed-out
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#ifdef SMP
-#define LK	lock ;
-#else
-#define LK
-#endif
-
 	.text
 	SUPERALIGN_TEXT
 	/* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
 	jmp	doreti
 #endif
 
-#ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
 	INTR_HANDLER	justreturn1
 	call	as_lapic_eoi
 	jmp	doreti
-
-#endif /* SMP */
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
 	movq	%r15,TD_LOCK(%r13)		/* Release the old thread */
 sw1:
 	leaq	TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
 	movq	$blocked_lock, %rdx
 	movq	TD_LOCK(%r12),%rcx
 	cmpq	%rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
 END(resumectx)
 
 /* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
 sw1wait:
 1:
 	pause
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
 #include <sys/reg.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
-#ifdef SMP
 #include <sys/smp.h>
-#endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index f46462b39fa3..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
@@ -82,9 +81,7 @@
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
-#ifdef SMP
 #include <sys/smp.h>
-#endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -132,9 +129,7 @@
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
@@ -149,6 +144,10 @@
 #include <isa/rtc.h>
 #include <x86/init.h>
 
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2c7777e608b9..d1d80afccdc7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
 #include <machine/msan.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #include <machine/sysarch.h>
 #include <machine/tss.h>
 
@@ -483,6 +481,8 @@ vm_paddr_t		KERNend;	/* and the end */
 
 struct kva_layout_s	kva_layout = {
 	.kva_min =	KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kva_max =	KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+			    NPDEPG - 1, NPTEPG - 1),
 	.dmap_low =	KV4ADDR(DMPML4I, 0, 0, 0),
 	.dmap_high =	KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
 	.lm_low =	KV4ADDR(LMSPML4I, 0, 0, 0),
@@ -491,18 +491,36 @@ struct kva_layout_s	kva_layout = {
 	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
 			    NPDEPG - 1, NPTEPG - 1),
 	.rec_pt =	KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+			    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+			    0, 0, 0),
 };
 
 struct kva_layout_s	kva_layout_la57 = {
 	.kva_min =	KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0),	/* == rec_pt */
+	.kva_max =	KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+			    NPDEPG - 1, NPTEPG - 1),
 	.dmap_low =	KV5ADDR(DMPML5I, 0, 0, 0, 0),
 	.dmap_high =	KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
-	.lm_low =	KV4ADDR(LMSPML4I, 0, 0, 0),
-	.lm_high =	KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+	.lm_low =	KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+	.lm_high =	KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
 	.km_low =	KV4ADDR(KPML4BASE, 0, 0, 0),
 	.km_high =	KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
 			    NPDEPG - 1, NPTEPG - 1),
 	.rec_pt =	KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+			    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+			    0, 0, 0),
 };
 
 /*
@@ -2005,7 +2023,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 				 */
 				p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
 				    X86_PG_M | X86_PG_V | pg_nx;
-			} else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) {
+			} else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
 				/* Connect DMAP pml4 pages to PML5. */
 				p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
 				    X86_PG_RW | X86_PG_V | pg_nx;
@@ -2475,6 +2493,7 @@ pmap_init(void)
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
 	pml4_entry_t *pml4e;
+	unsigned long lm_max;
 	int error, i, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
@@ -2600,10 +2619,15 @@ pmap_init(void)
 
 	lm_ents = 8;
 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
-	if (lm_ents > LMEPML4I - LMSPML4I + 1)
-		lm_ents = LMEPML4I - LMSPML4I + 1;
+	lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+	if (lm_ents > lm_max) {
+		printf(
+	    "pmap: shrinking large map from requested %d slots to %ld slots\n",
+		    lm_ents, lm_max);
+		lm_ents = lm_max;
+	}
 #ifdef KMSAN
-	if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+	if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
 		printf(
 	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
 		    lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2615,12 +2639,20 @@ pmap_init(void)
 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 	if (lm_ents != 0) {
 		large_vmem = vmem_create("large", kva_layout.lm_low,
-		    (vmem_size_t)kva_layout.lm_high - kva_layout.lm_low,
-		    PAGE_SIZE, 0, M_WAITOK);
+		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 		if (large_vmem == NULL) {
 			printf("pmap: cannot create large map\n");
 			lm_ents = 0;
 		}
+		if (la57) {
+			for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+			    lm_ents, NBPML5); i++) {
+				m = pmap_large_map_getptp_unlocked();
+				kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+				    X86_PG_RW | X86_PG_A | X86_PG_M |
+				    pg_nx | VM_PAGE_TO_PHYS(m);
+			}
+		}
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
 			pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
@@ -3031,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
  * XXX TODO
  */
 
-#ifdef SMP
 /*
  * Interrupt the cpus that are executing in the guest context.
  * This will force the vcpu to exit and the cached EPT mappings
@@ -3489,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 	}
 	sched_unpin();
 }
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
-		invlpg(va);
-		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
-		    pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = va;
-				invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
-			}
-			critical_exit();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	vm_offset_t addr;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
-		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
-		    pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = sva;
-				for (; d.addr < eva; d.addr += PAGE_SIZE)
-					invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
-			}
-			critical_exit();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap) {
-		if (pmap_pcid_enabled && invpcid_works) {
-			bzero(&d, sizeof(d));
-			invpcid(&d, INVPCID_CTXGLOB);
-		} else {
-			invltlb_glob();
-		}
-	} else if (pmap == PCPU_GET(curpmap)) {
-		if (pmap_pcid_enabled) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid;
-				d.pad = 0;
-				d.addr = 0;
-				invpcid(&d, INVPCID_CTX);
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					d.pcid |= PMAP_PCID_USER_PT;
-					invpcid(&d, INVPCID_CTX);
-				}
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid;
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					ucr3 = pmap->pm_ucr3 | pcid |
-					    PMAP_PCID_USER_PT;
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				} else
-					load_cr3(kcr3);
-			}
-			critical_exit();
-		} else {
-			invltlb();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
-	wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
-	struct pmap_pcid *pcidp;
-
-	pmap_update_pde_store(pmap, pde, newpde);
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
-		pmap_update_pde_invalidate(pmap, va, newpde);
-	else {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-#endif /* !SMP */
 
 static void
 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -7561,6 +7430,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 	PG_RW = pmap_rw_bit(pmap);
 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
+	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+	    PMAP_ENTER_NORECLAIM,
+	    ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
@@ -7689,6 +7561,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 			if (pdpg != NULL)
 				pmap_abort_ptp(pmap, va, pdpg);
+			else {
+				KASSERT(va >= VM_MAXUSER_ADDRESS &&
+				    (*pde & (PG_PS | PG_V)) == PG_V,
+				    ("pmap_enter_pde: invalid kernel PDE"));
+				mt = pmap_remove_pt_page(pmap, va);
+				KASSERT(mt != NULL,
+				    ("pmap_enter_pde: missing kernel PTP"));
+			}
 			if (uwptpg != NULL) {
 				mt = pmap_remove_pt_page(pmap, va);
 				KASSERT(mt == uwptpg,
@@ -10333,17 +10213,9 @@ pmap_activate_sw(struct thread *td)
 		return;
 	}
 	cpuid = PCPU_GET(cpuid);
-#ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
-	CPU_SET(cpuid, &pmap->pm_active);
-#endif
 	pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
-	CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
 }
 
 void
@@ -10384,11 +10256,7 @@ pmap_activate_boot(pmap_t pmap)
 	MPASS(pmap != kernel_pmap);
 
 	cpuid = PCPU_GET(cpuid);
-#ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
-	CPU_SET(cpuid, &pmap->pm_active);
-#endif
 	PCPU_SET(curpmap, pmap);
 	if (pti) {
 		kcr3 = pmap->pm_cr3;
@@ -10752,19 +10620,28 @@ pmap_large_map_getptp(void)
 static pdp_entry_t *
 pmap_large_map_pdpe(vm_offset_t va)
 {
+	pml4_entry_t *pml4;
 	vm_pindex_t pml4_idx;
 	vm_paddr_t mphys;
 
-	pml4_idx = pmap_pml4e_index(va);
-	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
-	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
-	    "%#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
-	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
-	    "LMSPML4I %#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+	    (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+	if (la57) {
+		pml4 = pmap_pml4e(kernel_pmap, va);
+		mphys = *pml4 & PG_FRAME;
+	} else {
+		pml4_idx = pmap_pml4e_index(va);
+
+		KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+		    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+		    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	}
 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
 }
 
@@ -12023,9 +11900,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
 	    mode, range->pdpes, range->pdes, range->ptes);
 
 	/* Reset to sentinel value. */
-	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range->sva = kva_layout.kva_max;
 }
 
 /*
@@ -12066,12 +11941,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
  */
 static void
 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
-    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
-    pt_entry_t pte)
+    vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+    pd_entry_t pde, pt_entry_t pte)
 {
 	pt_entry_t attrs;
 
-	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	if (la57) {
+		attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+		attrs |= pml4e & pg_nx;
+		attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+	} else {
+		attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	}
 
 	attrs |= pdpe & pg_nx;
 	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -12104,13 +11985,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 {
 	struct pmap_kernel_map_range range;
 	struct sbuf sbuf, *sb;
+	pml5_entry_t pml5e;
 	pml4_entry_t pml4e;
 	pdp_entry_t *pdp, pdpe;
 	pd_entry_t *pd, pde;
 	pt_entry_t *pt, pte;
 	vm_offset_t sva;
 	vm_paddr_t pa;
-	int error, i, j, k, l;
+	int error, j, k, l;
+	bool first;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
@@ -12119,9 +12002,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
 
 	/* Sentinel value. */
-	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range.sva = kva_layout.kva_max;
+	pml5e = 0;	/* no UB for la48 */
 
 	/*
 	 * Iterate over the kernel page tables without holding the kernel pmap
@@ -12130,41 +12012,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	 * Within the large map, ensure that PDP and PD page addresses are
 	 * valid before descending.
 	 */
-	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
-		switch (i) {
-		case PML4PML4I:
+	for (first = true, sva = 0; sva != 0 || first; first = false) {
+		if (sva == kva_layout.rec_pt)
 			sbuf_printf(sb, "\nRecursive map:\n");
-			break;
-		case DMPML4I:
+		else if (sva == kva_layout.dmap_low)
 			sbuf_printf(sb, "\nDirect map:\n");
-			break;
 #ifdef KASAN
-		case KASANPML4I:
+		else if (sva == kva_layout.kasan_shadow_low)
 			sbuf_printf(sb, "\nKASAN shadow map:\n");
-			break;
 #endif
 #ifdef KMSAN
-		case KMSANSHADPML4I:
+		else if (sva == kva_layout.kmsan_shadow_low)
 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
-			break;
-		case KMSANORIGPML4I:
+		else if (sva == kva_layout.kmsan_origin_low)
 			sbuf_printf(sb, "\nKMSAN origin map:\n");
-			break;
 #endif
-		case KPML4BASE:
+		else if (sva == kva_layout.km_low)
 			sbuf_printf(sb, "\nKernel map:\n");
-			break;
-		case LMSPML4I:
+		else if (sva == kva_layout.lm_low)
 			sbuf_printf(sb, "\nLarge map:\n");
-			break;
-		}
 
 		/* Convert to canonical form. */
-		if (sva == 1ul << 47)
-			sva |= -1ul << 48;
+		if (la57) {
+			if (sva == 1ul << 56) {
+				sva |= -1ul << 57;
+				continue;
+			}
+		} else {
+			if (sva == 1ul << 47) {
+				sva |= -1ul << 48;
+				continue;
+			}
+		}
 
 restart:
-		pml4e = kernel_pml4[i];
+		if (la57) {
+			pml5e = *pmap_pml5e(kernel_pmap, sva);
+			if ((pml5e & X86_PG_V) == 0) {
+				sva = rounddown2(sva, NBPML5);
+				sysctl_kmaps_dump(sb, &range, sva);
+				sva += NBPML5;
+				continue;
+			}
+		}
+		pml4e = *pmap_pml4e(kernel_pmap, sva);
 		if ((pml4e & X86_PG_V) == 0) {
 			sva = rounddown2(sva, NBPML4);
 			sysctl_kmaps_dump(sb, &range, sva);
@@ -12185,8 +12076,8 @@ restart:
 			pa = pdpe & PG_FRAME;
 			if ((pdpe & PG_PS) != 0) {
 				sva = rounddown2(sva, NBPDP);
-				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
-				    0, 0);
+				sysctl_kmaps_check(sb, &range, sva, pml5e,
+				    pml4e, pdpe, 0, 0);
 				range.pdpes++;
 				sva += NBPDP;
 				continue;
@@ -12198,6 +12089,7 @@ restart:
 				 * freed.  Validate the next-level address
 				 * before descending.
 				 */
+				sva += NBPDP;
 				goto restart;
 			}
 			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12214,7 +12106,7 @@ restart:
 				if ((pde & PG_PS) != 0) {
 					sva = rounddown2(sva, NBPDR);
 					sysctl_kmaps_check(sb, &range, sva,
-					    pml4e, pdpe, pde, 0);
+					    pml5e, pml4e, pdpe, pde, 0);
 					range.pdes++;
 					sva += NBPDR;
 					continue;
@@ -12226,6 +12118,7 @@ restart:
 					 * may be freed.  Validate the
 					 * next-level address before descending.
 					 */
+					sva += NBPDR;
 					goto restart;
 				}
 				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12239,7 +12132,7 @@ restart:
 						continue;
 					}
 					sysctl_kmaps_check(sb, &range, sva,
-					    pml4e, pdpe, pde, pte);
+					    pml5e, pml4e, pdpe, pde, pte);
 					range.ptes++;
 				}
 			}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
-#ifdef SMP
-	lock
-#endif
-	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
+	lock cmpxchgl %ecx,(%rdi)		/* new = %ecx */
 	setne	%cl
 
 	/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
 
 	movl	%esi,%eax			/* old */
 	stac
-#ifdef SMP
-	lock
-#endif
-	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
+	lock cmpxchgl %ecx,(%rdi)		/* new = %ecx */
 	clac
 	setne	%cl
 
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
-#ifdef SMP
-	lock
-#endif
-	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
+	lock cmpxchgq %rcx,(%rdi)		/* new = %rcx */
 	setne	%cl
 
 	/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
 
 	movq	%rsi,%rax			/* old */
 	stac
-#ifdef SMP
-	lock
-#endif
-	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
+	lock cmpxchgq %rcx,(%rdi)		/* new = %rcx */
 	clac
 	setne	%cl
 
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index eefddad2f142..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
 /*
  * AMD64 Trap and System call handling
  */
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #include <machine/stack.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type,
 	    type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
 	    TRAPF_USERMODE(frame) ? "user" : "kernel");
-#ifdef SMP
-	/* two separate prints in case of a trap on an unmapped page */
-	printf("cpuid = %d; ", PCPU_GET(cpuid));
-	printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+	/* Print these separately in case pcpu accesses trap. */
+	printf("cpuid = %d; ", PCPU_GET(cpuid));
+	printf("apic id = %02x\n", PCPU_GET(apic_id));
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%lx\n", eva);
 		printf("fault code		= %s %s %s%s%s, %s\n",
@@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame)
 	    frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
 	    frame->tf_fs, frame->tf_gs,
 	    rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
-#ifdef SMP
-	/* two separate prints in case of a trap on an unmapped page */
-	printf("cpuid = %d; ", PCPU_GET(cpuid));
-	printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+	/* Print these separately in case pcpu accesses trap. */
+	printf("cpuid = %d; ", PCPU_GET(cpuid));
+	printf("apic id = %02x\n", PCPU_GET(apic_id));
 	panic("double fault");
 }
 
diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP
deleted file mode 100644
index 0dbddbe5b341..000000000000
--- a/sys/amd64/conf/MINIMALUP
+++ /dev/null
@@ -1,4 +0,0 @@
-include 	MINIMAL
-ident 		MINIMALUP
-nooptions 	SMP
-nooptions 	NUMA
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 1bbb302259d6..5a9c3162e14c 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -150,8 +150,6 @@
     (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
     ((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
 
-#ifdef SMP
 #define SC_TABLESIZE    1024                     /* Must be power of 2. */
-#endif
 
 #endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 08e96027a5ed..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -202,9 +202,14 @@
 #define	KMSANSHADPML4I	(KPML4BASE - NKMSANSHADPML4E)
 #define	KMSANORIGPML4I	(DMPML4I - NKMSANORIGPML4E)
 
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
 #define	LMSPML4I	(PML4PML4I + 1)
 #define	LMEPML4I	(KASANPML4I - 1)
+#define	LMSPML5I	(DMPML5I + NDMPML5E)
+#define	LMEPML5I	(LMSPML5I + 32 - 1)	/* 32 slots for large map */
 
 /*
  * XXX doesn't really belong here I guess...
@@ -552,6 +557,7 @@ pmap_pml5e_index(vm_offset_t va)
 
 struct kva_layout_s {
 	vm_offset_t kva_min;
+	vm_offset_t kva_max;
 	vm_offset_t dmap_low;	/* DMAP_MIN_ADDRESS */
 	vm_offset_t dmap_high;	/* DMAP_MAX_ADDRESS */
 	vm_offset_t lm_low;	/* LARGEMAP_MIN_ADDRESS */
@@ -559,6 +565,12 @@ struct kva_layout_s {
 	vm_offset_t km_low;	/* VM_MIN_KERNEL_ADDRESS */
 	vm_offset_t km_high;	/* VM_MAX_KERNEL_ADDRESS */
 	vm_offset_t rec_pt;
+	vm_offset_t kasan_shadow_low;	/* KASAN_MIN_ADDRESS */
+	vm_offset_t kasan_shadow_high;	/* KASAN_MAX_ADDRESS */
+	vm_offset_t kmsan_shadow_low;	/* KMSAN_SHAD_MIN_ADDRESS */
+	vm_offset_t kmsan_shadow_high;	/* KMSAN_SHAD_MAX_ADDRESS */
+	vm_offset_t kmsan_origin_low;	/* KMSAN_ORIG_MIN_ADDRESS */
+	vm_offset_t kmsan_origin_high;	/* KMSAN_ORIG_MAX_ADDRESS */
 };
 extern struct kva_layout_s kva_layout;
 
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 26eb227211da..bff92570ff82 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -13,8 +13,6 @@
 
 #ifdef _KERNEL
 
-#ifdef SMP
-
 #ifndef LOCORE
 
 #include <x86/x86_smp.h>
@@ -39,7 +37,6 @@ void	invlop_handler(void);
 int	start_all_aps(void);
 
 #endif /* !LOCORE */
-#endif /* SMP */
 
 #endif /* _KERNEL */
 #endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 59053665dc40..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -181,9 +181,9 @@
  * 0x0100000000000000 - 0xf0ffffffffffffff   does not exist (hole)
  * 0xff00000000000000 - 0xff00ffffffffffff   recursive page table (2048TB slot)
  * 0xff01000000000000 - 0xff20ffffffffffff   direct map (32 x 2048TB slots)
- * 0xff21000000000000 - 0xffff807fffffffff   unused
- * 0xffff808000000000 - 0xffff847fffffffff   large map (can be tuned up)
- * 0xffff848000000000 - 0xfffff77fffffffff   unused (large map extends there)
+ * 0xff21000000000000 - 0xff40ffffffffffff   large map
+ * 0xff41000000000000 - 0xffff7fffffffffff   unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff   unused (start of kernel pml4 entry)
  * 0xfffff60000000000 - 0xfffff7ffffffffff   2TB KMSAN origin map, optional
  * 0xfffff78000000000 - 0xfffff7bfffffffff   512GB KASAN shadow map, optional
  * 0xfffff80000000000 - 0xfffffbffffffffff   4TB unused
@@ -200,16 +200,14 @@
 #define	VM_MIN_KERNEL_ADDRESS		kva_layout.km_low
 #define	VM_MAX_KERNEL_ADDRESS		kva_layout.km_high
 
-#define	KASAN_MIN_ADDRESS	KV4ADDR(KASANPML4I, 0, 0, 0)
-#define	KASAN_MAX_ADDRESS	KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define	KASAN_MIN_ADDRESS		(kva_layout.kasan_shadow_low)
+#define	KASAN_MAX_ADDRESS		(kva_layout.kasan_shadow_high)
 
-#define	KMSAN_SHAD_MIN_ADDRESS	KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define	KMSAN_SHAD_MAX_ADDRESS	KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
-					0, 0, 0)
+#define	KMSAN_SHAD_MIN_ADDRESS		(kva_layout.kmsan_shadow_low)
+#define	KMSAN_SHAD_MAX_ADDRESS		(kva_layout.kmsan_shadow_high)
 
-#define	KMSAN_ORIG_MIN_ADDRESS	KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define	KMSAN_ORIG_MAX_ADDRESS	KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
-					0, 0, 0)
+#define	KMSAN_ORIG_MIN_ADDRESS		(kva_layout.kmsan_origin_low)
+#define	KMSAN_ORIG_MAX_ADDRESS		(kva_layout.kmsan_origin_high)
 
 /*
  * Formally kernel mapping starts at KERNBASE, but kernel linker
@@ -249,7 +247,7 @@
  */
 #define	PHYS_IN_DMAP(pa)	(dmaplimit == 0 || (pa) < dmaplimit)
 #define	VIRT_IN_DMAP(va)	\
-    ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high)
+    ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
 
 #define	PMAP_HAS_DMAP	1
 #define	PHYS_TO_DMAP(x)	__extension__ ({				\
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ *   'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ *   each traced CPU core or thread. Upon initialization, a ToPA configuration
+ *   is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ *   The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ *   4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ *   relevant PT registers. Every time a traced thread is switched
+ *   out or in, its state will be saved to or loaded from its corresponding
+ *   'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ *   interrupt before continuing. The interrupt handler will then fetch the
+ *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ *   The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ *   and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ *   a 4K ToPA entry is filled. While this is fine when tracing smaller
+ *   functions or infrequent code paths, this will generate too much interrupt
+ *   traffic when tracing hotter functions. A proper solution for this issue
+ *   should estimate the amount of data generated by the current configuration
+ *   and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS						\
+	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
+	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF	0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+	uint64_t rtit_ctl;
+	uint64_t rtit_output_base;
+	uint64_t rtit_output_mask_ptrs;
+	uint64_t rtit_status;
+	uint64_t rtit_cr3_match;
+	uint64_t rtit_addr0_a;
+	uint64_t rtit_addr0_b;
+	uint64_t rtit_addr1_a;
+	uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+	uint64_t *topa_hw; /* ToPA table entries. */
+	size_t size; /* Trace buffer size in bytes (npages * PAGE_SIZE). */
+	struct mtx lock; /* Lock for fields below. */
+	vm_offset_t offset; /* Upper 32 bits of RTIT_OUTPUT_MASK_PTRS. */
+	uint64_t wrap_count; /* Times the ToPA page index moved backwards. */
+	int curpage; /* ToPA entry index last read from the hardware. */
+};
+
+struct pt_ctx {
+	int id; /* CPU id (CPU mode) or thread id (thread mode) */
+	struct pt_buffer buf; /* ToPA buffer metadata */
+	struct task task;     /* ToPA buffer notification task */
+	struct hwt_context *hwt_ctx; /* Set by pt_backend_configure() */
+	uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+	PT_DISABLED = 0,	/* value of a zeroed pt_pcpu entry */
+	PT_STOPPED,		/* set by configure and by pt_cpu_stop() */
+	PT_ACTIVE		/* set by pt_cpu_start() */
+};
+
+static struct pt_cpu {
+	struct pt_ctx *ctx;	 /* active PT tracing context */
+	enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+	uint32_t l0_eax;	/* CPUID leaf 0x14, sub-leaf 0 */
+	uint32_t l0_ebx;
+	uint32_t l0_ecx;
+	uint32_t l1_eax;	/* CPUID leaf 0x14, sub-leaf 1 */
+	uint32_t l1_ebx;
+	size_t xsave_area_size;	/* the three below are set in pt_supported() */
+	size_t xstate_hdr_offset;
+	size_t pt_xsave_offset;
+} pt_info  __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{	/* Atomic read of the tracing state recorded for 'cpu_id'. */
+	return (atomic_load_int(&(pt_pcpu + cpu_id)->state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{	/* Atomic write of the tracing state recorded for 'cpu_id'. */
+	atomic_store_int(&(pt_pcpu + cpu_id)->state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+	/* The header lives pt_info.xstate_hdr_offset bytes into the area. */
+	return ((void *)&ctx->save_area[pt_info.xstate_hdr_offset]);
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+	/* PT state lives pt_info.pt_xsave_offset bytes into the area. */
+	return ((void *)&ctx->save_area[pt_info.pt_xsave_offset]);
+}
+
+/*
+ * Refreshes the cached trace buffer position from the ToPA output
+ * MSR and bumps the wrap counter when the page index moved backwards.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+	uint64_t mask_ptrs;
+	int page;
+
+	/* Bits 31:7 hold the ToPA entry index, bits 63:32 the offset. */
+	mask_ptrs = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+	page = (mask_ptrs & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+	mtx_lock_spin(&buf->lock);
+	/* A smaller page index than last time means the output wrapped. */
+	buf->wrap_count += (buf->curpage > page) ? 1 : 0;
+	buf->curpage = page;
+	buf->offset = mask_ptrs >> 32;
+	mtx_unlock_spin(&buf->lock);
+
+	/* Debug-only report of the refreshed position. */
+	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+	    buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+    struct hwt_record_entry *rec)
+{	/* Describe 'buf' as a HWT_RECORD_BUFFER record in 'rec'. */
+	rec->offset = (buf->wrap_count * buf->size) + buf->offset;
+	rec->curpage = buf->curpage;
+	rec->buf_id = id;
+	rec->record_type = HWT_RECORD_BUFFER;
+}
+
+/*
+ * Enables (xrstors from 'save_area') or disables (xsaves into it)
+ * tracing on curcpu; CR0, XCR0 and XSS are put back before returning.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+	u_long xcr0, cr0;
+	u_long xss;
+
+	cr0 = rcr0();
+	if (cr0 & CR0_TS)
+		clts();		/* temporarily clear CR0.TS; restored below */
+	xcr0 = rxcr(XCR0);
+	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+	xss = rdmsr(MSR_IA32_XSS);
+	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+	/* PT is now set in XSS: save (disable) or restore (enable) it. */
+	if (!enable) {
+		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+		    ("%s: PT is disabled", __func__));
+		xsaves(save_area, XFEATURE_ENABLED_PT);
+	} else {
+		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+		    ("%s: PT is enabled", __func__));
+		xrstors(save_area, XFEATURE_ENABLED_PT);
+	}
+	wrmsr(MSR_IA32_XSS, xss);	/* undo the changes in reverse order */
+	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+		load_xcr(XCR0, xcr0);
+	if (cr0 & CR0_TS)
+		load_cr0(cr0);
+}
+
+/*
+ * Begins PT tracing on 'curcpu' by loading the context previously
+ * attached to it. 'dummy' is not used.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+	struct pt_ctx *ctx;
+
+	ctx = pt_pcpu[curcpu].ctx;
+	MPASS(ctx != NULL);
+	dprintf("%s: curcpu %d\n", __func__, curcpu);
+	load_cr4(rcr4() | CR4_XSAVE);
+	wrmsr(MSR_IA32_RTIT_STATUS, 0);
+	pt_cpu_set_state(curcpu, PT_ACTIVE);
+	pt_cpu_toggle_local(ctx->save_area, true);
+}
+
+/*
+ * Halts PT tracing on 'curcpu' and saves its state into the
+ * attached context. The buffer position is refreshed afterwards
+ * so that data written since the last ToPA interrupt is still
+ * reported to userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+	struct pt_ctx *ctx;
+
+	/* Nothing to stop if PT was never configured on this CPU. */
+	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+		return;
+
+	ctx = pt_pcpu[curcpu].ctx;
+	MPASS(ctx != NULL);
+	dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+	/* pt_topa_intr() bails out on CPUs that are not PT_ACTIVE. */
+	pt_cpu_set_state(curcpu, PT_STOPPED);
+	/* Save PT state into the context, then refresh the buffer offset. */
+	pt_cpu_toggle_local(ctx->save_area, false);
+	pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Builds the Table of Physical Addresses (ToPA) for 'ctx' out of the
+ * pages backing the HWT trace buffer. Every page becomes one 4K ToPA
+ * entry that raises an interrupt once filled, and a final TOPA_END
+ * entry points back at the table itself, which makes the buffer
+ * circular. Always returns 0.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+	struct pt_buffer *buf = &ctx->buf;
+	uint64_t *topa;
+	int idx;
+
+	KASSERT(buf->topa_hw == NULL,
+	    ("%s: ToPA info already exists", __func__));
+	/* One slot per trace page plus the terminating TOPA_END entry. */
+	topa = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+	    M_ZERO | M_WAITOK);
+	dprintf("%s: ToPA virt addr %p\n", __func__, topa);
+	for (idx = 0; idx < vm->npages; idx++) {
+		/*
+		 * XXX: TOPA_INT should ideally be set according to
+		 * expected amount of incoming trace data. Too few TOPA_INT
+		 * entries will not trigger interrupts often enough when tracing
+		 * smaller functions.
+		 */
+		topa[idx] = VM_PAGE_TO_PHYS(vm->pages[idx]) | TOPA_SIZE_4K |
+		    TOPA_INT;
+	}
+	/* Terminator: holds the physical address of the table itself. */
+	topa[vm->npages] = (uint64_t)vtophys(topa) | TOPA_END;
+	buf->topa_hw = topa;
+	buf->size = vm->npages * PAGE_SIZE;
+
+	return (0);
+}
+
+/*
+ * Programs IP filtering ranges from 'cfg' into the PT state of 'ctx'.
+ * At most PT_IP_FILTER_MAX_RANGES (2) ranges are used, which is what
+ * the XSAVE/XRSTOR PT extensions can carry; extra ranges are dropped
+ * with a console message.
+ *
+ * Returns ENXIO if the CPU lacks IP filtering, EINVAL if no usable
+ * range remains, and 0 otherwise.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+	struct pt_ext_area *pt_ext;
+	int nranges_supp, n;
+
+	pt_ext = pt_ctx_get_ext_area(ctx);
+	if ((pt_info.l0_ebx & CPUPT_IPF) == 0)
+		return (ENXIO);
+
+	nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >> CPUPT_NADDR_S;
+	if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+		nranges_supp = PT_IP_FILTER_MAX_RANGES;
+	n = cfg->nranges;
+	if (n > nranges_supp) {
+		printf("%s: %d IP filtering ranges requested, CPU "
+		       "supports %d, truncating\n",
+		    __func__, n, nranges_supp);
+		n = nranges_supp;
+	}
+	/* After clamping, only 1 or 2 ranges are acceptable. */
+	if (n != 1 && n != 2)
+		return (EINVAL);
+
+	if (n == 2) {
+		pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+		pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+		pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+	}
+	/* Range 0 is programmed for both n == 1 and n == 2. */
+	pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+	pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+	pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+
+	return (0);
+}
+
+/* Zeroes 'pt_ctx' and allocates its XSAVE area and ToPA table. */
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+	dprintf("%s: ctx id %d\n", __func__, ctx_id);
+	KASSERT(pt_ctx->buf.topa_hw == NULL,
+	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+	memset(pt_ctx, 0, sizeof(struct pt_ctx));
+	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+	    M_PT, M_NOWAIT | M_ZERO);
+	if (pt_ctx->save_area == NULL)
+		return (ENOMEM);
+	dprintf("%s: preparing ToPA buffer\n", __func__);
+	if (pt_topa_prepare(pt_ctx, vm) != 0) {
+		dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+		free(pt_ctx->save_area, M_PT);
+		/* Keep pt_deinit_ctx() from freeing the area a second time. */
+		pt_ctx->save_area = NULL;
+		return (ENOMEM);
+	}
+	pt_ctx->id = ctx_id;
+	TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+	return (0);
+}
+
+/* Frees the ToPA table and XSAVE area of 'pt_ctx', then zeroes it. */
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+	/* free(9) accepts NULL, so no guards are needed. */
+	free(pt_ctx->buf.topa_hw, M_PT);
+	free(pt_ctx->save_area, M_PT);
+
+	memset(pt_ctx, 0, sizeof(*pt_ctx));
+	pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+	struct hwt_cpu *hwt_cpu;
+	struct hwt_thread *thr;
+	struct pt_ctx *pt_ctx;
+	struct pt_cpu_config *cfg;
+	struct pt_ext_area *pt_ext;
+	struct xstate_hdr *hdr;
+	int error;
+
+	dprintf("%s\n", __func__);
+	cfg = (struct pt_cpu_config *)ctx->config;
+	pt_ctx = NULL;
+
+	/* Clear any flags we don't support yet. */
+	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+			printf("%s: CPU does not support generating MTC "
+			    "packets\n", __func__);
+			return (ENXIO);
+		}
+	}
+
+	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+			printf("%s: CPU does not support CR3 filtering\n",
+			    __func__);
+			return (ENXIO);
+		}
+	}
+
+	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+			printf("%s: CPU does not support disabling TNT "
+			    "packets\n", __func__);
+			return (ENXIO);
+		}
+	}
+	/* TODO: support for more config bits. */
+	/* NOTE(review): cfg->cr3_filter is not copied to rtit_cr3_match. */
+
+	if (ctx->mode == HWT_MODE_CPU) {
+		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+			if (hwt_cpu->cpu_id != cpu_id)
+				continue;
+			pt_ctx = &pt_pcpu_ctx[cpu_id];
+			break;
+		}
+	} else {
+		TAILQ_FOREACH(thr, &ctx->threads, next) {
+			if (thr->thread_id != thread_id)
+				continue;
+			KASSERT(thr->private != NULL,
+			    ("%s: hwt thread private"
+			     " not set, thr %p",
+				__func__, thr));
+			pt_ctx = (struct pt_ctx *)thr->private;
+			break;
+		}
+	}
+	if (pt_ctx == NULL)
+		return (ENOENT);
+
+	dprintf("%s: preparing MSRs\n", __func__);
+	pt_ext = pt_ctx_get_ext_area(pt_ctx);
+	hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+	pt_ext->rtit_ctl |= cfg->rtit_ctl;
+	if (cfg->nranges != 0) {
+		dprintf("%s: preparing IPF ranges\n", __func__);
+		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+			return (error);
+	}
+	pt_ctx->hwt_ctx = ctx;
+	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+	hdr->xstate_bv = XFEATURE_ENABLED_PT;
+	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+	    XSTATE_XCOMP_BV_COMPACT;
+	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+	pt_pcpu[cpu_id].ctx = pt_ctx;
+	pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+	return (0);
+}
+
+/*
+ * hwt backend trace start hook for thread mode; must run on 'cpu_id'.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+	/* CPU mode is handled by pt_backend_enable_smp(). */
+	if (ctx->mode != HWT_MODE_CPU) {
+		KASSERT(curcpu == cpu_id,
+		    ("%s: attempting to start PT on another cpu", __func__));
+		pt_cpu_start(NULL);
+		CPU_SET(cpu_id, &ctx->cpu_map);
+	}
+}
+
+/*
+ * hwt backend trace stop hook for thread mode; must run on 'cpu_id'.
+ * Stops tracing locally, removes the CPU from the context's cpu_map
+ * and detaches the PT context from the CPU.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+	/* CPU mode is handled by pt_backend_disable_smp(). */
+	if (ctx->mode != HWT_MODE_CPU) {
+		KASSERT(curcpu == cpu_id,
+		    ("%s: attempting to disable PT on another cpu",
+		    __func__));
+		pt_cpu_stop(NULL);
+		CPU_CLR(cpu_id, &ctx->cpu_map);
+		pt_pcpu[cpu_id].ctx = NULL;
+	}
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ * Returns -1 if the CPU-mode flag was already set, 0 otherwise.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+	dprintf("%s\n", __func__);
+	if (ctx->mode == HWT_MODE_CPU) {
+		/* Claim the single CPU-mode slot; fail if already taken. */
+		if (atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+			return (-1);
+	}
+	KASSERT(ctx->mode == HWT_MODE_CPU,
+	    ("%s: should only be used for CPU mode", __func__));
+	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+	return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ * Returns -1 if, in CPU mode, the flag was not set, or if cpu_map is empty.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+	dprintf("%s\n", __func__);
+	if (ctx->mode == HWT_MODE_CPU) {
+		/* Release the CPU-mode slot; bail if it was not held. */
+		if (atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+			return (-1);
+	}
+	if (CPU_EMPTY(&ctx->cpu_map)) {
+		dprintf("%s: empty cpu map\n", __func__);
+		return (-1);
+	}
+	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+	return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Initializes the per-CPU tracing contexts used for HWT_MODE_CPU.
+ * Does nothing in any other mode.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+	struct hwt_cpu *cpu;
+	int cpu_id, error;
+
+	dprintf("%s\n", __func__);
+	if (ctx->mode != HWT_MODE_CPU)
+		return (0);
+	TAILQ_FOREACH(cpu, &ctx->cpus, next) {
+		cpu_id = cpu->cpu_id;
+		error = pt_init_ctx(&pt_pcpu_ctx[cpu_id], cpu->vm, cpu_id);
+		if (error != 0)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Stops tracing on all CPUs in the context's cpu_map and releases
+ * the per-thread or per-CPU PT contexts along with their ToPA tables.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+	struct pt_ctx *pt_ctx;
+	struct hwt_thread *thr;
+	int cpu_id;
+
+	dprintf("%s\n", __func__);
+
+	pt_backend_disable_smp(ctx);
+	if (ctx->mode == HWT_MODE_THREAD) {
+		TAILQ_FOREACH(thr, &ctx->threads, next) {
+			KASSERT(thr->private != NULL,
+			    ("%s: thr->private not set", __func__));
+			pt_ctx = (struct pt_ctx *)thr->private;
+			pt_deinit_ctx(pt_ctx);
+		}
+	} else {
+		CPU_FOREACH(cpu_id) {
+			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+				continue;
+			if (pt_pcpu[cpu_id].ctx != NULL) {
+				KASSERT(pt_pcpu[cpu_id].ctx ==
+					&pt_pcpu_ctx[cpu_id],
+				    ("%s: CPU mode tracing with non-cpu mode PT"
+				     " context active",
+					__func__));
+				pt_pcpu[cpu_id].ctx = NULL;
+			}
+			pt_ctx = &pt_pcpu_ctx[cpu_id];
+			pt_deinit_ctx(pt_ctx);
+			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Reports the current write position inside the tracing buffer.
+ * 'data' is not used by this backend. Always returns 0.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+    uint64_t *data)
+{
+	struct pt_ctx *pt_ctx;
+
+	pt_ctx = (vm->ctx->mode == HWT_MODE_THREAD) ?
+	    (struct pt_ctx *)vm->thr->private : pt_pcpu[vm->cpu->cpu_id].ctx;
+	mtx_lock_spin(&pt_ctx->buf.lock);
+	*curpage = pt_ctx->buf.curpage;
+	*curpage_offset = pt_ctx->buf.offset +
+	    (pt_ctx->buf.wrap_count * vm->ctx->bufsize);
+	mtx_unlock_spin(&pt_ctx->buf.lock);
+
+	return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+	struct pt_ctx *pt_ctx;
+	int error;
+
+	/* No M_WAITOK: this may be invoked from a non-sleepable context. */
+	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+	if (pt_ctx == NULL)
+		return (ENOMEM);
+	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+	if (error != 0) {
+		free(pt_ctx, M_PT);	/* don't leak the context on failure */
+		return (error);
+	}
+	thr->private = pt_ctx;
+	return (0);
+}
+/*
+ * HWT thread teardown hook.
+ * Tears down and frees the 'struct pt_ctx' stored in thr->private.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+	struct pt_ctx *pt_ctx = thr->private;
+
+	/* Release what the context owns before the context itself. */
+	pt_deinit_ctx(pt_ctx);
+	free(pt_ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{	/* No-op: the PT backend's dump hook does nothing. */
+}
+
+static struct hwt_backend_ops pt_ops = {
+	.hwt_backend_init = pt_backend_init,
+	.hwt_backend_deinit = pt_backend_deinit,
+	/* Per-CPU / per-thread tracing configuration. */
+	.hwt_backend_configure = pt_backend_configure,
+	/* CPU-affine start/stop; both return early in CPU mode. */
+	.hwt_backend_enable = pt_backend_enable,
+	.hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP	/* start/stop via smp_rendezvous_cpus() */
+	.hwt_backend_enable_smp = pt_backend_enable_smp,
+	.hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+	.hwt_backend_read = pt_backend_read,
+	.hwt_backend_dump = pt_backend_dump,
+	/* Allocation and release of the per-thread 'struct pt_ctx'. */
+	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
+	.hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+	.ops = &pt_ops,
+	.name = "pt",	/* passed to hwt_backend_register() in pt_init() */
+	.kva_req = 1,	/* NOTE(review): meaning defined by hwt(4); confirm */
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+	struct hwt_record_entry record;
+	struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+	memset(&record, 0, sizeof(record));	/* no stale stack data */
+	mtx_lock_spin(&ctx->buf.lock);
+	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+	mtx_unlock_spin(&ctx->buf.lock);
+	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+/* Acknowledges the ToPA PMI by setting its bit in the status-reset MSR. */
+static void
+pt_topa_status_clear(void)
+{
+	uint64_t reset;
+
+	reset = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+	wrmsr(MSR_IA_GLOBAL_STATUS_RESET,
+	    reset | GLOBAL_STATUS_FLAG_TRACETOPAPMI);
+}
+
+/*
+ * ToPA PMI handler, registered as an NMI handler.
+ *
+ * Runs whenever a ToPA entry marked with TOPA_INT has been filled.
+ * Pauses tracing, records the new buffer position, schedules a task
+ * that posts a buffer record for userspace, and resumes tracing (and
+ * re-arms the PC interrupt line) if the CPU is still PT_ACTIVE.
+ * Returns 0 when this CPU is not tracing, 1 otherwise.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+	struct pt_ctx *ctx;
+	uint64_t status;
+
+	SDT_PROBE0(pt, , , topa__intr);
+
+	if (pt_cpu_get_state(curcpu) != PT_ACTIVE)
+		return (0);
+	status = rdmsr(MSR_IA_GLOBAL_STATUS);
+	if ((status & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+		/* ACK spurious or leftover interrupt. */
+		pt_topa_status_clear();
+		return (1);
+	}
+
+	ctx = pt_pcpu[curcpu].ctx;
+	KASSERT(ctx->buf.topa_hw != NULL,
+	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+	/* Disable tracing, then sample where the hardware stopped writing. */
+	pt_cpu_toggle_local(ctx->save_area, false);
+	pt_update_buffer(&ctx->buf);
+	pt_topa_status_clear();
+	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+	    TASKQUEUE_FAIL_IF_PENDING);
+
+	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+		pt_cpu_toggle_local(ctx->save_area, true);
+		lapic_reenable_pcint();
+	}
+	return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+	u_int cp[4];
+	int error;
+
+	dprintf("pt: Enumerating part 1\n");
+	cpuid_count(CPUID_PT_LEAF, 0, cp);
+	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+	dprintf("pt: ebx %x\n", cp[1]);
+	dprintf("pt: ecx %x\n", cp[2]);
+
+	pt_info.l0_eax = cp[0];
+	pt_info.l0_ebx = cp[1];
+	pt_info.l0_ecx = cp[2];
+
+	dprintf("pt: Enumerating part 2\n");
+	cpuid_count(CPUID_PT_LEAF, 1, cp);
+	dprintf("pt: eax %x\n", cp[0]);
+	dprintf("pt: ebx %x\n", cp[1]);
+
+	pt_info.l1_eax = cp[0];
+	pt_info.l1_ebx = cp[1];
+
+	error = hwt_backend_register(&backend);
+	if (error != 0) {
+		printf("pt: unable to register hwt backend, error %d\n", error);
+		return (error);
+	}
+	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+	    M_ZERO | M_WAITOK);
+	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+	    M_ZERO | M_WAITOK);
+
+	nmi_register_handler(pt_topa_intr);
+	if (!lapic_enable_pcint()) {
+		nmi_remove_handler(pt_topa_intr);
+		hwt_backend_unregister(&backend);
+		free(pt_pcpu, M_PT);
+		free(pt_pcpu_ctx, M_PT);
+		pt_pcpu = NULL;
+		pt_pcpu_ctx = NULL;
+		printf("pt: failed to setup interrupt line\n");
+		return (ENXIO);	/* 'error' is 0 here and would claim success */
+	}
+	initialized = true;
+
+	return (0);
+}
+
+/*
+ * Checks whether the CPU supports Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+	u_int cp[4];
+
+	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+		printf("pt: CPU does not support Intel Processor Trace\n");
+		return (false);
+	}
+	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+		printf("pt: XSAVE is not supported\n");
+		return (false);
+	}
+	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+		printf("pt: CPU does not support managing PT state using XSAVE\n");
+		return (false);
+	}
+	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+		printf("pt: XSAVE compaction is not supported\n");
+		return (false);
+	}
+	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+		printf("pt: CPU does not support XSAVES/XRSTORS\n");
+		return (false);
+	}
+
+	/* Require ToPA support. */
+	cpuid_count(CPUID_PT_LEAF, 0, cp);	/* cp[2] is ECX of sub-leaf 0 */
+	if ((cp[2] & CPUPT_TOPA) == 0) {
+		printf("pt: ToPA is not supported\n");
+		return (false);
+	}
+	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+		printf("pt: multiple ToPA outputs are not supported\n");
+		return (false);
+	}
+	/* Cache the save-area layout used by the pt_ctx_get_*() accessors. */
+	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+	    XFEATURE_ENABLED_PT, true, true);
+
+	return (true);
+}
+
+static void
+pt_deinit(void)
+{
+	if (!initialized)	/* pt_init() never completed; nothing to undo */
+		return;
+	nmi_remove_handler(pt_topa_intr);
+	lapic_disable_pcint();
+	hwt_backend_unregister(&backend);
+	free(pt_pcpu, M_PT);
+	free(pt_pcpu_ctx, M_PT);	/* NOTE(review): not reset to NULL below */
+	pt_pcpu = NULL;
+	initialized = false;
+}
+
+/*
+ * Module event handler: sets the driver up on MOD_LOAD, tears it
+ * down on MOD_UNLOAD and accepts every other event unchanged.
+ */
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+	if (type == MOD_LOAD) {
+		/* Any load-time failure is reported as ENXIO. */
+		if (!pt_supported() || pt_init() != 0)
+			return (ENXIO);
+	} else if (type == MOD_UNLOAD) {
+		/* pt_deinit() is a no-op if pt_init() never completed. */
+		pt_deinit();
+	}
+
+	return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+	uint64_t rtit_ctl;	/* requested RTIT_CTL_* flags */
+	register_t cr3_filter;
+	int nranges;		/* number of valid entries in ip_ranges[] */
+	struct ipf_range {
+		vm_offset_t start;
+		vm_offset_t end;
+	} ip_ranges[PT_IP_FILTER_MAX_RANGES];
+	uint32_t mtc_freq;
+	uint32_t cyc_thresh;
+	uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
 
 #include "vmx_assym.h"
 
-#ifdef SMP
-#define	LK	lock ;
-#else
-#define	LK
-#endif
-
 /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
 #define VENTER  push %rbp ; mov %rsp,%rbp
 #define VLEAVE  pop %rbp
diff --git a/sys/arm/allwinner/aw_mmc.c b/sys/arm/allwinner/aw_mmc.c
index 6bebf5e5fb5e..a8add957dc74 100644
--- a/sys/arm/allwinner/aw_mmc.c
+++ b/sys/arm/allwinner/aw_mmc.c
@@ -84,21 +84,26 @@
 
 struct aw_mmc_conf {
 	uint32_t	dma_xferlen;
+	uint32_t	dma_desc_shift;
 	bool		mask_data0;
 	bool		can_calibrate;
 	bool		new_timing;
+	bool		zero_is_skip;
 };
 
 static const struct aw_mmc_conf a10_mmc_conf = {
 	.dma_xferlen = 0x2000,
+	.dma_desc_shift = 0,
 };
 
 static const struct aw_mmc_conf a13_mmc_conf = {
 	.dma_xferlen = 0x10000,
+	.dma_desc_shift = 0,
 };
 
 static const struct aw_mmc_conf a64_mmc_conf = {
 	.dma_xferlen = 0x10000,
+	.dma_desc_shift = 0,
 	.mask_data0 = true,
 	.can_calibrate = true,
 	.new_timing = true,
@@ -106,13 +111,24 @@ static const struct aw_mmc_conf a64_mmc_conf = {
 
 static const struct aw_mmc_conf a64_emmc_conf = {
 	.dma_xferlen = 0x2000,
+	.dma_desc_shift = 0,
 	.can_calibrate = true,
 };
 
+static const struct aw_mmc_conf d1_mmc_conf = {
+	.dma_xferlen = 0x1000,
+	.dma_desc_shift = 2,	/* DMA addresses are stored shifted right by 2 */
+	.mask_data0 = true,
+	.can_calibrate = true,
+	.new_timing = true,
+	.zero_is_skip = true,	/* never encode a max-length segment as 0 */
+};
+
 static struct ofw_compat_data compat_data[] = {
 	{"allwinner,sun4i-a10-mmc", (uintptr_t)&a10_mmc_conf},
 	{"allwinner,sun5i-a13-mmc", (uintptr_t)&a13_mmc_conf},
 	{"allwinner,sun7i-a20-mmc", (uintptr_t)&a13_mmc_conf},
+	{"allwinner,sun20i-d1-mmc", (uintptr_t)&d1_mmc_conf},
 	{"allwinner,sun50i-a64-mmc", (uintptr_t)&a64_mmc_conf},
 	{"allwinner,sun50i-a64-emmc", (uintptr_t)&a64_emmc_conf},
 	{NULL,             0}
@@ -607,16 +623,18 @@ aw_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
 
 	dma_desc = sc->aw_dma_desc;
 	for (i = 0; i < nsegs; i++) {
-		if (segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen)
+		if ((segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) &&
+		    !sc->aw_mmc_conf->zero_is_skip)
 			dma_desc[i].buf_size = 0;		/* Size of 0 indicate max len */
 		else
 			dma_desc[i].buf_size = segs[i].ds_len;
-		dma_desc[i].buf_addr = segs[i].ds_addr;
+		dma_desc[i].buf_addr = segs[i].ds_addr >>
+		    sc->aw_mmc_conf->dma_desc_shift;
 		dma_desc[i].config = AW_MMC_DMA_CONFIG_CH |
-			AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
-
-		dma_desc[i].next = sc->aw_dma_desc_phys +
-			((i + 1) * sizeof(struct aw_mmc_dma_desc));
+		    AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
+		dma_desc[i].next = (sc->aw_dma_desc_phys +
+		    (i + 1) * sizeof(struct aw_mmc_dma_desc)) >>
+		    sc->aw_mmc_conf->dma_desc_shift;
 	}
 
 	dma_desc[0].config |= AW_MMC_DMA_CONFIG_FD;
@@ -678,7 +696,8 @@ aw_mmc_prepare_dma(struct aw_mmc_softc *sc)
 	AW_MMC_WRITE_4(sc, AW_MMC_IDIE, val);
 
 	/* Set DMA descritptor list address */
-	AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys);
+	AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys >>
+	    sc->aw_mmc_conf->dma_desc_shift);
 
 	/* FIFO trigger level */
 	AW_MMC_WRITE_4(sc, AW_MMC_FWLR, AW_MMC_DMA_FTRGLEVEL);
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index a09da794e77d..459cc8ebe505 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -5709,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(ADDR_IS_CANONICAL(va),
 	    ("%s: Address not in canonical form: %lx", __func__, va));
+	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+	    PMAP_ENTER_NORECLAIM,
+	    ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
 
 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
@@ -5828,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
 			if (l2pg != NULL)
 				pmap_abort_ptp(pmap, va, l2pg);
+			else {
+				KASSERT(ADDR_IS_KERNEL(va) &&
+				    (pmap_load(l2) & ATTR_DESCR_MASK) ==
+				    L2_TABLE,
+				    ("pmap_enter_l2: invalid kernel L2E"));
+				mt = pmap_remove_pt_page(pmap, va);
+				KASSERT(mt != NULL,
+				    ("pmap_enter_l2: missing kernel PTP"));
+			}
 			if (uwptpg != NULL) {
 				mt = pmap_remove_pt_page(pmap, va);
 				KASSERT(mt == uwptpg,
diff --git a/sys/arm64/broadcom/genet/if_genet.c b/sys/arm64/broadcom/genet/if_genet.c
index 0602f076b257..182b5582fb7c 100644
--- a/sys/arm64/broadcom/genet/if_genet.c
+++ b/sys/arm64/broadcom/genet/if_genet.c
@@ -349,7 +349,7 @@ gen_attach(device_t dev)
 	}
 
 	/* If address was not found, create one based on the hostid and name. */
-	if (eaddr_found == 0)
+	if (!eaddr_found)
 		ether_gen_addr(sc->ifp, &eaddr);
 	/* Attach ethernet interface */
 	ether_ifattach(sc->ifp, eaddr.octet);
@@ -653,7 +653,7 @@ gen_bus_dma_teardown(struct gen_softc *sc)
 			    error);
 	}
 
-	if (sc->tx_buf_tag != NULL) {
+	if (sc->rx_buf_tag != NULL) {
 		for (i = 0; i < RX_DESC_COUNT; i++) {
 			error = bus_dmamap_destroy(sc->rx_buf_tag,
 			    sc->rx_ring_ent[i].map);
diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c
index 2ec736e7f4ac..cae29226d13c 100644
--- a/sys/cam/cam_xpt.c
+++ b/sys/cam/cam_xpt.c
@@ -2515,6 +2515,15 @@ xpt_action(union ccb *start_ccb)
 	    ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));
 
+	/*
+	 * Either it isn't queued, or it has a real priority. There are still
+	 * too many places that reuse CCBs with a real priority to do immediate
+	 * queries to do the other side of this assert.
+	 */
+	KASSERT((start_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
+	    start_ccb->ccb_h.pinfo.priority != CAM_PRIORITY_NONE,
+	    ("%s: queued ccb and CAM_PRIORITY_NONE illegal.", __func__));
+
 	start_ccb->ccb_h.status = CAM_REQ_INPROG;
 	(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
 }
diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c
index 7f8bf3516804..322141a72707 100644
--- a/sys/cam/mmc/mmc_da.c
+++ b/sys/cam/mmc/mmc_da.c
@@ -1081,7 +1081,7 @@ sdda_start_init_task(void *context, int pending)
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n"));
 	new_ccb = xpt_alloc_ccb();
 	xpt_setup_ccb(&new_ccb->ccb_h, periph->path,
-		      CAM_PRIORITY_NONE);
+		      CAM_PRIORITY_NORMAL);
 
 	cam_periph_lock(periph);
 	cam_periph_hold(periph, PRIBIO|PCATCH);
diff --git a/sys/cam/mmc/mmc_xpt.c b/sys/cam/mmc/mmc_xpt.c
index 4fce03004994..f5f66f5214a8 100644
--- a/sys/cam/mmc/mmc_xpt.c
+++ b/sys/cam/mmc/mmc_xpt.c
@@ -610,7 +610,6 @@ mmcprobe_start(struct cam_periph *periph, union ccb *start_ccb)
 		CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_RESET\n"));
 		/* FALLTHROUGH */
 	case PROBE_IDENTIFY:
-		xpt_path_inq(&start_ccb->cpi, periph->path);
 		CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_IDENTIFY\n"));
 		init_standard_ccb(start_ccb, XPT_MMC_GET_TRAN_SETTINGS);
 		break;
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 0ce38384abbf..83d964360343 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2019,6 +2019,7 @@ typedef struct vdev {
 	vdev_list_t	v_children;	/* children of this vdev */
 	const char	*v_name;	/* vdev name */
 	uint64_t	v_guid;		/* vdev guid */
+	uint64_t	v_txg;		/* most recent transaction */
 	uint64_t	v_id;		/* index in parent */
 	uint64_t	v_psize;	/* physical device capacity */
 	int		v_ashift;	/* offset to block shift */
@@ -2048,7 +2049,6 @@ typedef struct spa {
 	STAILQ_ENTRY(spa) spa_link;	/* link in global pool list */
 	char		*spa_name;	/* pool name */
 	uint64_t	spa_guid;	/* pool guid */
-	uint64_t	spa_txg;	/* most recent transaction */
 	struct uberblock *spa_uberblock;	/* best uberblock so far */
 	vdev_t		*spa_root_vdev;	/* toplevel vdev container */
 	objset_phys_t	*spa_mos;	/* MOS for this pool */
diff --git a/sys/conf/files b/sys/conf/files
index 74d251c2b608..dd0d390962f2 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3227,6 +3227,19 @@ dev/uart/uart_if.m		optional uart
 dev/uart/uart_subr.c		optional uart
 dev/uart/uart_tty.c		optional uart
 #
+# Universal Flash Storage Host Controller Interface drivers
+#
+dev/ufshci/ufshci.c		optional ufshci
+dev/ufshci/ufshci_ctrlr.c	optional ufshci
+dev/ufshci/ufshci_ctrlr_cmd.c	optional ufshci
+dev/ufshci/ufshci_dev.c		optional ufshci
+dev/ufshci/ufshci_pci.c		optional ufshci
+dev/ufshci/ufshci_req_queue.c	optional ufshci
+dev/ufshci/ufshci_req_sdb.c	optional ufshci
+dev/ufshci/ufshci_sim.c		optional ufshci
+dev/ufshci/ufshci_sysctl.c	optional ufshci
+dev/ufshci/ufshci_uic_cmd.c	optional ufshci
+#
 # USB controller drivers
 #
 dev/usb/controller/musb_otg.c		optional musb
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 0584fc29d039..80548320c3fc 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -84,8 +84,8 @@ amd64/amd64/xen-locore.S	optional	xenhvm \
 amd64/amd64/machdep.c		standard
 amd64/amd64/mem.c		optional	mem
 amd64/amd64/minidump_machdep.c	standard
-amd64/amd64/mp_machdep.c	optional	smp
-amd64/amd64/mpboot.S		optional	smp
+amd64/amd64/mp_machdep.c	standard
+amd64/amd64/mpboot.S		standard
 amd64/amd64/pmap.c		standard
 amd64/amd64/ptrace_machdep.c	standard
 amd64/amd64/support.S		standard
@@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m		optional	ice pci \
 	compile-with "${NORMAL_M} -I$S/dev/ice"
 dev/ice/ice_ddp_common.c	optional	ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c	optional	ice pci pci_iov \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c	optional	ice pci pci_iov \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
 ice_ddp.c			optional ice_ddp	\
 	compile-with	"${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}"	\
 	no-ctfconvert no-implicit-rule before-depend local	\
diff --git a/sys/dev/drm2/drm_fb_helper.c b/sys/dev/drm2/drm_fb_helper.c
index f67cc9f60d02..1f4abd255690 100644
--- a/sys/dev/drm2/drm_fb_helper.c
+++ b/sys/dev/drm2/drm_fb_helper.c
@@ -51,7 +51,7 @@ struct vt_kms_softc {
 	struct task		 fb_mode_task;
 };
 
-/* Call restore out of vt(9) locks. */
+/* Call restore out of vt(4) locks. */
 static void
 vt_restore_fbdev_mode(void *arg, int pending)
 {
diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c
index b0fa33daeca7..b55c1c191077 100644
--- a/sys/dev/efidev/efirt.c
+++ b/sys/dev/efidev/efirt.c
@@ -107,7 +107,8 @@ static int efi_status2err[25] = {
 
 enum efi_table_type {
 	TYPE_ESRT = 0,
-	TYPE_PROP
+	TYPE_PROP,
+	TYPE_MEMORY_ATTR
 };
 
 static int efi_enter(void);
@@ -445,6 +446,42 @@ get_table_length(enum efi_table_type type, size_t *table_len, void **taddr)
 		free(buf, M_TEMP);
 		return (0);
 	}
+	case TYPE_MEMORY_ATTR:
+	{
+		efi_guid_t guid = EFI_MEMORY_ATTRIBUTES_TABLE;
+		struct efi_memory_attribute_table *tbl_addr, *mem_addr;
+		int error;
+		void *buf;
+		size_t len = sizeof(struct efi_memory_attribute_table);
+
+		error = efi_get_table(&guid, (void **)&tbl_addr);
+		if (error)
+			return (error);
+
+		buf = malloc(len, M_TEMP, M_WAITOK);
+		error = physcopyout((vm_paddr_t)tbl_addr, buf, len);
+		if (error) {
+			free(buf, M_TEMP);
+			return (error);
+		}
+
+		mem_addr = (struct efi_memory_attribute_table *)buf;
+		if (mem_addr->version != 2) {
+			free(buf, M_TEMP);
+			return (EINVAL);
+		}
+		len += mem_addr->descriptor_size * mem_addr->num_ents;
+		if (len > EFI_TABLE_ALLOC_MAX) {
+			free(buf, M_TEMP);
+			return (ENOMEM);
+		}
+
+		*table_len = len;
+		if (taddr != NULL)
+			*taddr = tbl_addr;
+		free(buf, M_TEMP);
+		return (0);
+	}
 	}
 	return (ENOENT);
 }
@@ -457,7 +494,8 @@ copy_table(efi_guid_t *guid, void **buf, size_t buf_len, size_t *table_len)
 		enum efi_table_type type;
 	} tables[] = {
 		{ EFI_TABLE_ESRT,       TYPE_ESRT },
-		{ EFI_PROPERTIES_TABLE, TYPE_PROP }
+		{ EFI_PROPERTIES_TABLE, TYPE_PROP },
+		{ EFI_MEMORY_ATTRIBUTES_TABLE, TYPE_MEMORY_ATTR }
 	};
 	size_t table_idx;
 	void *taddr;
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index ab7f13177969..764bcb7e6ee8 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -110,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags,
 	res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1,
 	    alloc_flags);
 	if (res == NULL) {
-		intr_free_intr_map_data((struct intr_map_data *)gpio_data);
+		intr_unmap_irq(irq);
 		return (NULL);
 	}
-	rman_set_virtual(res, gpio_data);
 	return (res);
 }
 #else
@@ -866,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid,
 	    end, count, flags));
 }
 
+/* Release r for child; under INTRNG also unmap the IRQ it covered. */
+static int
+gpiobus_release_resource(device_t dev, device_t child, struct resource *r)
+{
+	int error;
+#ifdef INTRNG
+	u_int irq = rman_get_start(r);	/* read before r is released */
+
+	MPASS(irq == rman_get_end(r));
+#endif
+	error = bus_generic_rman_release_resource(dev, child, r);
+#ifdef INTRNG
+	/* Unmap the interrupt only once the release has succeeded. */
+	if (error == 0)
+		intr_unmap_irq(irq);
+#endif
+	return (error);
+}
+
 static struct resource_list *
 gpiobus_get_resource_list(device_t bus __unused, device_t child)
 {
@@ -1060,7 +1078,7 @@ static device_method_t gpiobus_methods[] = {
 	DEVMETHOD(bus_get_resource,	bus_generic_rl_get_resource),
 	DEVMETHOD(bus_set_resource,	bus_generic_rl_set_resource),
 	DEVMETHOD(bus_alloc_resource,	gpiobus_alloc_resource),
-	DEVMETHOD(bus_release_resource,	bus_generic_rman_release_resource),
+	DEVMETHOD(bus_release_resource,	gpiobus_release_resource),
 	DEVMETHOD(bus_activate_resource,	bus_generic_rman_activate_resource),
 	DEVMETHOD(bus_deactivate_resource,	bus_generic_rman_deactivate_resource),
 	DEVMETHOD(bus_get_resource_list,	gpiobus_get_resource_list),
diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h
index 821abe4806ca..5b23757b1c98 100644
--- a/sys/dev/ice/ice_features.h
+++ b/sys/dev/ice/ice_features.h
@@ -91,7 +91,9 @@ enum feat_list {
 static inline void
 ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap)
 {
+#ifndef PCI_IOV
 	ice_clear_bit(ICE_FEATURE_SRIOV, bitmap);
+#endif
 #ifndef DEV_NETMAP
 	ice_clear_bit(ICE_FEATURE_NETMAP, bitmap);
 #endif
diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h
index 3a5dc201189a..e1d5307a9516 100644
--- a/sys/dev/ice/ice_iflib.h
+++ b/sys/dev/ice/ice_iflib.h
@@ -139,6 +139,9 @@ struct ice_irq_vector {
  * @tc: traffic class queue belongs to
  * @q_handle: qidx in tc; used in TXQ enable functions
  *
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
  * Other parameters may be iflib driver specific
  */
 struct ice_tx_queue {
@@ -153,6 +156,9 @@ struct ice_tx_queue {
 	u32			me;
 	u16			q_handle;
 	u8			tc;
+#ifdef PCI_IOV
+	u8			itr_idx;
+#endif
 
 	/* descriptor writeback status */
 	qidx_t			*tx_rsq;
@@ -175,6 +181,9 @@ struct ice_tx_queue {
  * @stats: queue statistics
  * @tc: traffic class queue belongs to
  *
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
  * Other parameters may be iflib driver specific
  */
 struct ice_rx_queue {
@@ -187,6 +196,9 @@ struct ice_rx_queue {
 	struct ice_irq_vector		*irqv;
 	u32				me;
 	u8				tc;
+#ifdef PCI_IOV
+	u8				itr_idx;
+#endif
 
 	struct if_irq			que_irq;
 };
@@ -332,6 +344,10 @@ struct ice_softc {
 	ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT);
 	ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT);
 
+#ifdef PCI_IOV
+	struct ice_vf *vfs;
+	u16 num_vfs;
+#endif
 	struct ice_resmgr os_imgr;
 	/* For mirror interface */
 	struct ice_mirr_if *mirr_if;
diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c
new file mode 100644
index 000000000000..c5a3e1060e44
--- /dev/null
+++ b/sys/dev/ice/ice_iov.c
@@ -0,0 +1,1856 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*  Copyright (c) 2025, Intel Corporation
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.c
+ * @brief Virtualization support functions
+ *
+ * Contains functions for enabling and managing PCIe virtual function devices,
+ * including enabling new VFs, and managing VFs over the virtchnl interface.
+ */
+
+#include "ice_iov.h"
+
+static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num);
+static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf,
+			 bool trigger_vflr);
+static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf);
+
+static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf,
+			       u8 *msg_buf);
+static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf,
+				  u8 *msg_buf);
+static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+				    u8 *msg_buf);
+static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+				    u8 *msg_buf);
+static bool ice_vc_isvalid_ring_len(u16 ring_len);
+static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf,
+				  u8 *msg_buf);
+static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf,
+				   u8 *msg_buf);
+static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf,
+				    u8 *msg_buf);
+static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+				     u8 *msg_buf);
+static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+				      u8 *msg_buf);
+static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf,
+				   u8 *msg_buf);
+static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf,
+				 u8 *msg_buf);
+static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+     struct virtchnl_eth_stats *vstats);
+static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf,
+				   u8 *msg_buf);
+static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf,
+				        u8 *msg_buf);
+static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+				u8 *msg_buf);
+static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+				u8 *msg_buf);
+static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err);
+static int ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr);
+
+/**
+ * ice_iov_attach - Initialize SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Registers the PF/VF schemas with pci_iov_attach() and updates sc->feat_en.
+ *
+ * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK)
+ *
+ * @returns 0 if successful, or the error from pci_iov_attach():
+ * - ENOMEM if there is no memory for the PF/VF schemas or iov device
+ * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV
+ *   version as the kernel
+ * - ENOENT if the device doesn't have the SR-IOV capability
+ */
+int
+ice_iov_attach(struct ice_softc *sc)
+{
+	nvlist_t *pf_schema, *vf_schema;
+	int error;
+
+	pf_schema = pci_iov_schema_alloc_node();
+	vf_schema = pci_iov_schema_alloc_node();
+
+	/* Per-VF knobs; ice_iov_add_vf() reads these keys back from params */
+	pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
+	pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof",
+	    IOV_SCHEMA_HASDEFAULT, TRUE);
+	pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
+	    IOV_SCHEMA_HASDEFAULT, FALSE);
+	pci_iov_schema_add_bool(vf_schema, "allow-promisc",
+	    IOV_SCHEMA_HASDEFAULT, FALSE);
+	pci_iov_schema_add_uint16(vf_schema, "num-queues",
+	    IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES);
+	pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi",
+	    IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI);
+	pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed",
+	    IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT);
+	pci_iov_schema_add_uint16(vf_schema, "max-mac-filters",
+	    IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT);
+
+	error = pci_iov_attach(sc->dev, pf_schema, vf_schema);
+	if (error == 0) {
+		ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+		return (0);
+	}
+
+	device_printf(sc->dev, "pci_iov_attach failed (error=%s)\n",
+	    ice_err_str(error));
+	ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+	return (error);
+}
+
+/**
+ * ice_iov_detach - Teardown SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Calls pci_iov_detach() for the device and logs a message if it fails.
+ * Meant to run at the start of the driver detach process.
+ *
+ * @returns 0 if successful or IOV support hasn't been setup, or
+ * - EBUSY if VFs still exist
+ */
+int
+ice_iov_detach(struct ice_softc *sc)
+{
+	int error;
+
+	error = pci_iov_detach(sc->dev);
+	if (error == 0)
+		return (0);
+
+	/* Detach did not go through: log why and pass the error up. */
+	device_printf(sc->dev, "pci_iov_detach failed (error=%s)\n",
+	    ice_err_str(error));
+	return (error);
+}
+
+/**
+ * ice_iov_init - Called by the OS before the first VF is created.
+ * @sc: device softc structure
+ * @num_vfs: number of VFs to setup resources for
+ * @params: configuration parameters for the PF (unused)
+ *
+ * @returns 0 if successful, or ENOMEM if the VF array cannot be allocated
+ */
+int
+ice_iov_init(struct ice_softc *sc, uint16_t num_vfs,
+    const nvlist_t *params __unused)
+{
+	/* Zeroed tracking array, one entry per requested VF */
+	sc->vfs = malloc(sizeof(*sc->vfs) * num_vfs, M_ICE,
+	    M_NOWAIT | M_ZERO);
+	if (sc->vfs == NULL)
+		return (ENOMEM);
+
+	/* Each entry records its own index as the VF number */
+	for (int vf_idx = 0; vf_idx < num_vfs; vf_idx++)
+		sc->vfs[vf_idx].vf_num = vf_idx;
+
+	/* Remember how many VFs were configured */
+	sc->num_vfs = num_vfs;
+	return (0);
+}
+
+/**
+ * ice_iov_get_vf - Look up the tracking structure of one VF
+ * @sc: device softc structure
+ * @vf_num: index of the VF in sc->vfs
+ *
+ * @remark asserts (MPASS) that vf_num is below sc->num_vfs; the index is
+ * not checked in any other way
+ *
+ * @returns a pointer to the VF structure at the given index
+ */
+static struct ice_vf *
+ice_iov_get_vf(struct ice_softc *sc, int vf_num)
+{
+	MPASS(vf_num < sc->num_vfs);
+
+	return (sc->vfs + vf_num);
+}
+
+/**
+ * ice_iov_add_vf - Called by the OS for each VF to create
+ * @sc: device softc structure
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * @returns 0 if successful, or an error code after unwinding what was set up
+ */
+int
+ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params)
+{
+	struct ice_tx_queue *txq;
+	struct ice_rx_queue *rxq;
+	device_t dev = sc->dev;
+	struct ice_vsi *vsi;
+	struct ice_vf *vf;
+	int vf_num_queues;
+	const void *mac;
+	size_t size;
+	int error;
+	int i;
+
+	vf = ice_iov_get_vf(sc, vfnum);
+	vf->vf_flags = VF_FLAG_ENABLED;	/* NOTE(review): left set on failure */
+
+	/* This VF needs at least one VSI */
+	vsi = ice_alloc_vsi(sc, ICE_VSI_VF);
+	if (vsi == NULL)
+		return (ENOMEM);
+	vf->vsi = vsi;
+	vsi->vf_num = vfnum;
+
+	vf_num_queues = nvlist_get_number(params, "num-queues");
+	/* Warn about an out-of-range value, then clamp it to 1..MAX */
+	if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES)
+		device_printf(dev, "Invalid num-queues (%d) for VF %d\n",
+		    vf_num_queues, vf->vf_num);
+	if (vf_num_queues < 1) {
+		device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num);
+		vf_num_queues = 1;
+	} else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) {
+		device_printf(dev, "Setting VF %d num-queues to %d\n",
+		    vf->vf_num, ICE_MAX_SCATTERED_QUEUES);
+		vf_num_queues = ICE_MAX_SCATTERED_QUEUES;
+	}
+	vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED;
+
+	/* Reserve VF queue allocation from PF queues */
+	ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues);
+	vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues;
+
+	/* Assign Tx queues from PF space */
+	error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap,
+					     vsi->num_tx_queues);
+	if (error) {
+		device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n",
+			      ice_err_str(error));
+		goto release_vsi;
+	}
+
+	/* Assign Rx queues from PF space */
+	error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap,
+					     vsi->num_rx_queues);
+	if (error) {
+		device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n",
+			      ice_err_str(error));
+		goto release_vsi;
+	}
+
+	vsi->max_frame_size = ICE_MAX_FRAME_SIZE;
+
+	/* Allocate Tx queue structure memory */
+	vsi->tx_queues = (struct ice_tx_queue *)
+	    malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE,
+		   M_NOWAIT | M_ZERO);
+	if (!vsi->tx_queues) {
+		device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n",
+			      vfnum);
+		error = ENOMEM;
+		goto release_vsi;
+	}
+	for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) {
+		txq->me = i;
+		txq->vsi = vsi;
+	}
+
+	/* Allocate Rx queue structure memory */
+	vsi->rx_queues = (struct ice_rx_queue *)
+	    malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE,
+		   M_NOWAIT | M_ZERO);
+	if (!vsi->rx_queues) {
+		device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n",
+			      vfnum);
+		error = ENOMEM;
+		goto free_txqs;
+	}
+	for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) {
+		rxq->me = i;
+		rxq->vsi = vsi;
+	}
+
+	/* Allocate IRQ vector trackers: one per queue plus one extra */
+	vf->num_irq_vectors = vf_num_queues + 1;
+	vf->tx_irqvs = (struct ice_irq_vector *)
+	    malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+		   M_ICE, M_NOWAIT);
+	if (!vf->tx_irqvs) {
+		device_printf(sc->dev,
+			      "Unable to allocate TX irqv memory for VF-%d's %d vectors\n",
+			      vfnum, vf->num_irq_vectors);
+		error = ENOMEM;
+		goto free_rxqs;
+	}
+	vf->rx_irqvs = (struct ice_irq_vector *)
+	    malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+		   M_ICE, M_NOWAIT);
+	if (!vf->rx_irqvs) {
+		device_printf(sc->dev,
+			      "Unable to allocate RX irqv memory for VF-%d's %d vectors\n",
+			      vfnum, vf->num_irq_vectors);
+		error = ENOMEM;
+		goto free_txirqvs;
+	}
+
+	/* Assign VF interrupts from PF space */
+	if (!(vf->vf_imap =
+	      (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors,
+	      M_ICE, M_NOWAIT))) {
+		device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum);
+		error = ENOMEM;
+		goto free_rxirqvs;
+	}
+	error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors);
+	if (error) {
+		device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n",
+			      vfnum, ice_err_str(error));
+		goto free_imap;
+	}
+
+	if (nvlist_exists_binary(params, "mac-addr")) {
+		mac = nvlist_get_binary(params, "mac-addr", &size);
+		memcpy(vf->mac, mac, ETHER_ADDR_LEN);
+
+		if (nvlist_get_bool(params, "allow-set-mac"))
+			vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+	} else
+		/*
+		 * If the administrator has not specified a MAC address then
+		 * we must allow the VF to choose one.
+		 */
+		vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+
+	if (nvlist_get_bool(params, "mac-anti-spoof"))
+		vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF;
+
+	if (nvlist_get_bool(params, "allow-promisc"))
+		vf->vf_flags |= VF_FLAG_PROMISC_CAP;
+
+	vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi");
+
+	vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed");
+	vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters");
+
+	vf->vf_flags |= VF_FLAG_VLAN_CAP;
+
+	/* Create and setup VSI in HW */
+	error = ice_initialize_vsi(vsi);
+	if (error) {
+		device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n",
+			      vfnum, ice_err_str(error));
+		goto release_imap;
+	}
+
+	/* Add the broadcast address */
+	error = ice_add_vsi_mac_filter(vsi, broadcastaddr);
+	if (error) {
+		device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n",
+			      vfnum, ice_err_str(error));
+		goto release_imap;
+	}
+
+	ice_iov_ready_vf(sc, vf);
+
+	return (0);
+
+release_imap:
+	ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap,
+			       vf->num_irq_vectors);
+free_imap:
+	free(vf->vf_imap, M_ICE);
+	vf->vf_imap = NULL;
+free_rxirqvs:
+	free(vf->rx_irqvs, M_ICE);
+	vf->rx_irqvs = NULL;
+free_txirqvs:
+	free(vf->tx_irqvs, M_ICE);
+	vf->tx_irqvs = NULL;
+free_rxqs:
+	free(vsi->rx_queues, M_ICE);
+	vsi->rx_queues = NULL;
+free_txqs:
+	free(vsi->tx_queues, M_ICE);
+	vsi->tx_queues = NULL;
+release_vsi:
+	ice_release_vsi(vsi);
+	vf->vsi = NULL;
+	return (error);
+}
+
+/**
+ * ice_iov_uninit - Called by the OS when VFs are destroyed
+ * @sc: device softc structure
+ *
+ * For every VF: returns its interrupt reservation to sc->dev_imgr, frees
+ * its interrupt trackers and VSI queue arrays and releases its VSI. The
+ * VF tracking array is freed last and sc->num_vfs is reset to 0.
+ */
+void
+ice_iov_uninit(struct ice_softc *sc)
+{
+	struct ice_vf *vf;
+	struct ice_vsi *vsi;
+
+	/* Release per-VF resources */
+	for (int i = 0; i < sc->num_vfs; i++) {
+		vf = &sc->vfs[i];
+		vsi = vf->vsi;
+
+		/*
+		 * Return the VF's interrupts to the device interrupt manager
+		 * before freeing the map that names them; otherwise they stay
+		 * reserved and later VF creation can run out of vectors.
+		 */
+		if (vf->vf_imap != NULL) {
+			ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap,
+			    vf->num_irq_vectors);
+			free(vf->vf_imap, M_ICE);
+			vf->vf_imap = NULL;
+		}
+
+		/* Free queue interrupt trackers; free(NULL) is a no-op */
+		free(vf->tx_irqvs, M_ICE);
+		vf->tx_irqvs = NULL;
+		free(vf->rx_irqvs, M_ICE);
+		vf->rx_irqvs = NULL;
+
+		if (vsi == NULL)
+			continue;
+
+		/* Free VSI queues */
+		free(vsi->tx_queues, M_ICE);
+		vsi->tx_queues = NULL;
+		free(vsi->rx_queues, M_ICE);
+		vsi->rx_queues = NULL;
+
+		ice_release_vsi(vsi);
+		vf->vsi = NULL;
+	}
+
+	/* Release memory used for VF tracking */
+	free(sc->vfs, M_ICE);
+	sc->vfs = NULL;
+	sc->num_vfs = 0;
+}
+
+/**
+ * ice_iov_handle_vflr - Process VFLR event
+ * @sc: device softc structure
+ *
+ * Identifies which VFs have their bit set in GLGEN_VFLRSTAT and runs
+ * ice_reset_vf() on each of them without triggering a new reset.
+ */
+void
+ice_iov_handle_vflr(struct ice_softc *sc)
+{
+	struct ice_hw *hw = &sc->hw;
+	u32 global_vf, vflrstat;
+	int idx;
+
+	for (idx = 0; idx < sc->num_vfs; idx++) {
+		struct ice_vf *vf = &sc->vfs[idx];
+
+		/* VFLR status is a bitmap, 32 VFs per register */
+		global_vf = hw->func_caps.vf_base_id + vf->vf_num;
+		vflrstat = rd32(hw, GLGEN_VFLRSTAT(global_vf / 32));
+		if ((vflrstat & BIT(global_vf % 32)) != 0)
+			ice_reset_vf(sc, vf, false);
+	}
+}
+
+/**
+ * ice_iov_ready_vf - Setup VF interrupts and mark it as ready
+ * @sc: device softc structure
+ * @vf: driver's VF structure for the VF to update
+ *
+ * Clears the VFSWR bit in the VF's VPGEN_VFRTRIG register, sets up the
+ * PF<->VF interrupt mapping via ice_iov_setup_intr_mapping(), and then
+ * writes VIRTCHNL_VFR_VFACTIVE to VFGEN_RSTAT so that the VF driver can
+ * use the VF.
+ */
+static void
+ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf)
+{
+	struct ice_hw *hw = &sc->hw;
+	u32 trig;
+
+	/* Clear the reset triggering bit, leaving the other bits as read */
+	trig = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+	wr32(hw, VPGEN_VFRTRIG(vf->vf_num), trig & ~VPGEN_VFRTRIG_VFSWR_M);
+
+	/* Setup VF interrupt allocation and mapping */
+	ice_iov_setup_intr_mapping(sc, vf);
+
+	/* Tell the VF that the reset is done, then flush */
+	wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE);
+
+	ice_flush(hw);
+}
+
+/**
+ * ice_reset_vf - Perform a hardware reset (VFR) on a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be reset
+ * @trigger_vflr: true to set the VFSWR trigger bit first; false to only
+ *                handle a reset that has already been executed
+ *
+ * Busy waits (bounded polling with DELAY()) for pending PCI transactions
+ * and for the reset to complete in the HW, logging a message if either
+ * wait times out, then continues regardless.
+ *
+ * @remark The VF is finally marked active, and its PF<->VF interrupt
+ * mapping set up again, through ice_iov_ready_vf()
+ */
+static void
+ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr)
+{
+	u16 global_vf_num, reg_idx, bit_idx;
+	struct ice_hw *hw = &sc->hw;
+	int status;
+	u32 reg;
+	int i;
+
+	global_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+	if (trigger_vflr) {
+		reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+		reg |= VPGEN_VFRTRIG_VFSWR_M;
+		wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+	}
+
+	/* Clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */
+	reg_idx = (global_vf_num) / 32;
+	bit_idx = (global_vf_num) % 32;
+	wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx));
+	ice_flush(hw);
+
+	/* Wait until there are no pending PCI transactions */
+	wr32(hw, PF_PCI_CIAA,
+	     ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S));
+
+	for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) {
+		reg = rd32(hw, PF_PCI_CIAD);
+		if (!(reg & PCIEM_STA_TRANSACTION_PND))
+			break;
+
+		DELAY(ICE_PCI_CIAD_WAIT_DELAY_US);
+	}
+	if (i == ICE_PCI_CIAD_WAIT_COUNT)
+		device_printf(sc->dev,
+			"VF-%d PCI transactions stuck\n", vf->vf_num);
+
+	/* Disable the VSI's TX queues, which is required during VF reset */
+	status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL,
+			NULL, ICE_VF_RESET, vf->vf_num, NULL);
+	if (status)
+		device_printf(sc->dev,
+			      "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n",
+			      __func__, ice_status_str(status),
+			      ice_aq_str(hw->adminq.sq_last_status));
+
+	/* Then poll until the HW reports the VF reset as done */
+	for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) {
+		reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num));
+		if ((reg & VPGEN_VFRSTAT_VFRD_M))
+			break;
+
+		DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US);
+	}
+	if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT)
+		device_printf(sc->dev,
+			"VF-%d Reset is stuck\n", vf->vf_num);
+
+	ice_iov_ready_vf(sc, vf);
+}
+
+/**
+ * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF; may be NULL, otherwise its
+ *           first 32 bits are read as the VF's capability flags
+ *
+ * Builds a virtchnl_vf_resource reply describing the single VSI, queue
+ * pairs, vectors and RSS sizes given to the VF, and sends it to the VF.
+ *
+ * @remark The reply always carries VIRTCHNL_STATUS_SUCCESS.
+ */
+static void
+ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_vsi_resource *vsi_info;
+	struct virtchnl_vf_resource *reply;
+	u16 reply_len;
+	u32 requested;
+
+	/* XXX: Only support one VSI per VF, so this size doesn't need adjusting */
+	reply_len = sizeof(*reply);
+	reply = malloc(reply_len, M_ICE, M_WAITOK | M_ZERO);
+
+	/* Queue pairs mirror the VSI's Tx queues; one extra vector on top */
+	reply->num_vsis = 1;
+	reply->num_queue_pairs = vf->vsi->num_tx_queues;
+	reply->max_vectors = reply->num_queue_pairs + 1;
+
+	reply->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE;
+	reply->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE;
+	reply->max_mtu = 0;
+
+	/* Base offloads are always set; optional ones only if requested */
+	reply->vf_cap_flags = VF_BASE_MODE_OFFLOADS;
+	if (msg_buf != NULL) {
+		requested = *((u32 *)(msg_buf));
+
+		if ((requested & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) != 0)
+			reply->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
+		if ((requested & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) != 0)
+			reply->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
+	}
+
+	/* Describe the VF's single VSI */
+	vsi_info = &reply->vsi_res[0];
+	vsi_info->vsi_id = vf->vsi->idx;
+	vsi_info->num_queue_pairs = vf->vsi->num_tx_queues;
+	vsi_info->vsi_type = VIRTCHNL_VSI_SRIOV;
+	vsi_info->qset_handle = 0;
+	if (!ETHER_IS_ZERO(vf->mac))
+		memcpy(vsi_info->default_mac_addr, vf->mac, ETHER_ADDR_LEN);
+
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES,
+	    VIRTCHNL_STATUS_SUCCESS, (u8 *)reply, reply_len, NULL);
+
+	free(reply, M_ICE);
+}
+
+/**
+ * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a version message from the VF, and responds to the VF with
+ * the version number that the PF will use.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail.
+ */
+static void
+ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_version_info *req =
+	    (struct virtchnl_version_info *)msg_buf;
+
+	if (!VF_IS_V10(req)) {
+		/* Every other requested version is answered with the PF's own */
+		vf->version.major = VIRTCHNL_VERSION_MAJOR;
+		vf->version.minor = VIRTCHNL_VERSION_MINOR;
+
+		/* A mismatch is only logged; the reply is still a success */
+		if (req->major != VIRTCHNL_VERSION_MAJOR ||
+		    req->minor != VIRTCHNL_VERSION_MINOR)
+			device_printf(sc->dev,
+			    "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n",
+			    __func__, vf->vf_num,
+			    req->major, req->minor,
+			    VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR);
+	} else {
+		/* VFs running the 1.0 API expect to get 1.0 back */
+		vf->version.major = 1;
+		vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS;
+	}
+
+	/* Send back the version that was just recorded for this VF */
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_VERSION,
+	    VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version),
+	    NULL);
+}
+
+/**
+ * ice_vf_validate_mac - Validate MAC address before adding it
+ * @vf: VF tracking structure
+ * @addr: MAC address to validate
+ *
+ * Validate a MAC address before adding it to a VF during the handling
+ * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if
+ * the VF is allowed to set its own arbitrary MAC addresses.
+ *
+ * Returns 0 if MAC address is valid for the given vf
+ */
+static int
+ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr)
+{
+	/* The all-zero and broadcast addresses are never acceptable */
+	if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr))
+		return (EINVAL);
+
+	/* A VF that may set arbitrary MACs needs no further checks */
+	if (vf->vf_flags & VF_FLAG_SET_MAC_CAP)
+		return (0);
+
+	/* Otherwise only multicast addresses and the assigned MAC are allowed */
+	if (ETHER_IS_MULTICAST(addr))
+		return (0);
+	if (bcmp(addr, vf->mac, ETHER_ADDR_LEN) == 0)
+		return (0);
+	return (EPERM);
+}
+
+/**
+ * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and adds those addresses
+ * to the VSI's filter list.
+ */
+static void
+ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_ether_addr_list *macs =
+	    (struct virtchnl_ether_addr_list *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	u16 new_filters = 0;
+	int err;
+
+	/* Reject the whole list up front if it cannot fit in the VF's quota */
+	if (macs->num_elements >
+	    (vf->mac_filter_limit - vf->mac_filter_cnt)) {
+		reply = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		goto send_reply;
+	}
+
+	for (int idx = 0; idx < macs->num_elements; idx++) {
+		u8 *mac = macs->list[idx].addr;
+
+		/*
+		 * The type flag is currently ignored; every MAC address is
+		 * treated as the LEGACY type. A rejected entry is skipped and
+		 * turns the final reply into a parameter error.
+		 */
+		err = ice_vf_validate_mac(vf, mac);
+		if (err != 0) {
+			if (err == EPERM)
+				device_printf(sc->dev,
+				    "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n",
+				    __func__, vf->vf_num, vf->vsi->idx);
+			else
+				device_printf(sc->dev,
+				    "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n",
+				    __func__, vf->vf_num, vf->vsi->idx);
+			reply = VIRTCHNL_STATUS_ERR_PARAM;
+			continue;
+		}
+
+		err = ice_add_vsi_mac_filter(vf->vsi, mac);
+		if (err != 0) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: Error adding MAC addr for VSI %d\n",
+			    __func__, vf->vf_num, vf->vsi->idx);
+			reply = VIRTCHNL_STATUS_ERR_PARAM;
+			continue;
+		}
+
+		/* Don't count VF's MAC against its MAC filter limit */
+		if (memcmp(mac, vf->mac, ETHER_ADDR_LEN) != 0)
+			new_filters++;
+	}
+
+	vf->mac_filter_cnt += new_filters;
+
+send_reply:
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and removes those addresses
+ * from the VSI's filter list.
+ */
+static void
+ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_ether_addr_list *macs =
+	    (struct virtchnl_ether_addr_list *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	u16 removed = 0;
+
+	for (int idx = 0; idx < macs->num_elements; idx++) {
+		u8 *mac = macs->list[idx].addr;
+
+		/* A failed removal is reported but the rest are still tried */
+		if (ice_remove_vsi_mac_filter(vf->vsi, mac) != 0) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: Error removing MAC addr for VSI %d\n",
+			    __func__, vf->vf_num, vf->vsi->idx);
+			reply = VIRTCHNL_STATUS_ERR_PARAM;
+			continue;
+		}
+		/* Don't count VF's MAC against its MAC filter limit */
+		if (memcmp(mac, vf->mac, ETHER_ADDR_LEN) != 0)
+			removed++;
+	}
+
+	/* Clamp at zero instead of letting the counter wrap */
+	if (removed < vf->mac_filter_cnt)
+		vf->mac_filter_cnt -= removed;
+	else
+		vf->mac_filter_cnt = 0;
+
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VLANs in msg_buf to the VF's VLAN filter list.
+ */
+static void
+ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_vlan_filter_list *req =
+	    (struct virtchnl_vlan_filter_list *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	int err;
+
+	/*
+	 * The checks below run in order (VSI ownership, remaining quota,
+	 * hardware programming); the first failure decides the reply status
+	 * and exactly one reply is sent at the end.
+	 */
+	if (req->vsi_id != vsi->idx) {
+		/* The request must name the VSI that belongs to this VF */
+		device_printf(sc->dev,
+			      "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+			      vf->vf_num, vsi->idx, req->vsi_id);
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+	} else if (req->num_elements > (vf->vlan_limit - vf->vlan_cnt)) {
+		/* Not enough room left under the VF's VLAN filter limit */
+		reply = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+	} else {
+		err = ice_add_vlan_hw_filters(vsi, req->vlan_id,
+		    req->num_elements);
+		if (err == 0) {
+			/* Only a fully successful add is charged to the VF */
+			vf->vlan_cnt += req->num_elements;
+		} else {
+			device_printf(sc->dev,
+			    "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n",
+			    vf->vf_num, vsi->idx, ice_status_str(err),
+			    ice_aq_str(sc->hw.adminq.sq_last_status));
+			reply = ice_iov_err_to_virt_err(err);
+		}
+	}
+
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Removes the VLANs in msg_buf from the VF's VLAN filter list.
+ */
+static void
+ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_vlan_filter_list *req =
+	    (struct virtchnl_vlan_filter_list *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	int err;
+
+	/*
+	 * Validate VSI ownership, then remove the filters in hardware; the
+	 * VF's VLAN count is only lowered when the removal succeeded.
+	 */
+	if (req->vsi_id != vsi->idx) {
+		/* The request must name the VSI that belongs to this VF */
+		device_printf(sc->dev,
+			      "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+			      vf->vf_num, vsi->idx, req->vsi_id);
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+	} else {
+		err = ice_remove_vlan_hw_filters(vsi, req->vlan_id,
+		    req->num_elements);
+		if (err != 0) {
+			device_printf(sc->dev,
+			    "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n",
+			    vf->vf_num, vsi->idx, ice_status_str(err),
+			    ice_aq_str(sc->hw.adminq.sq_last_status));
+			reply = ice_iov_err_to_virt_err(err);
+		} else if (req->num_elements < vf->vlan_cnt) {
+			vf->vlan_cnt -= req->num_elements;
+		} else {
+			/* Clamp at zero instead of letting the counter wrap */
+			vf->vlan_cnt = 0;
+		}
+	}
+
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_isvalid_ring_len - Check to see if a descriptor ring length is valid
+ * @ring_len: length of ring
+ *
+ * Check whether a ring size value is valid.
+ *
+ * @returns true if given ring size is valid
+ */
+static bool
+ice_vc_isvalid_ring_len(u16 ring_len)
+{
+	if (ring_len < ICE_MIN_DESC_COUNT || ring_len > ICE_MAX_DESC_COUNT)
+		return (false);
+	return ((ring_len % ICE_DESC_COUNT_INCR) == 0);
+}
+
+/**
+ * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ */
+static void
+ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_vsi_queue_config_info *vqci;
+	struct virtchnl_queue_pair_info *vqpi;
+	enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_tx_queue *txq;
+	struct ice_rx_queue *rxq;
+	int i, error = 0;
+
+	vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf;
+
+	/* The request must fit in both the Tx and the Rx queue arrays */
+	if (vqci->num_queue_pairs > vsi->num_tx_queues ||
+	    vqci->num_queue_pairs > vsi->num_rx_queues) {
+		status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	ice_vsi_disable_tx(vsi);
+	ice_control_all_rx_queues(vsi, false);
+
+	/*
+	 * Clear TX and RX queues config in case VF
+	 * requests different number of queues.
+	 */
+	for (i = 0; i < vsi->num_tx_queues; i++) {
+		txq = &vsi->tx_queues[i];
+		txq->desc_count = 0;
+		txq->tx_paddr = 0;
+		txq->tc = 0;
+	}
+	for (i = 0; i < vsi->num_rx_queues; i++) {
+		rxq = &vsi->rx_queues[i];
+		rxq->desc_count = 0;
+		rxq->rx_paddr = 0;
+	}
+
+	vqpi = vqci->qpair;
+	for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) {
+		/*
+		 * Initial parameter validation. The queue IDs come from the
+		 * VF and index the queue arrays below, so range check them.
+		 */
+		if (vqpi->txq.vsi_id != vsi->idx ||
+		    vqpi->rxq.vsi_id != vsi->idx ||
+		    vqpi->txq.queue_id != vqpi->rxq.queue_id ||
+		    vqpi->txq.queue_id >= vsi->num_tx_queues ||
+		    vqpi->rxq.queue_id >= vsi->num_rx_queues ||
+		    vqpi->txq.headwb_enabled ||
+		    vqpi->rxq.splithdr_enabled ||
+		    vqpi->rxq.crc_disable ||
+		    !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) ||
+		    !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) {
+			status = VIRTCHNL_STATUS_ERR_PARAM;
+			goto done;
+		}
+
+		/* Copy parameters into VF's queue/VSI structs */
+		txq = &vsi->tx_queues[vqpi->txq.queue_id];
+		txq->desc_count = vqpi->txq.ring_len;
+		txq->tx_paddr = vqpi->txq.dma_ring_addr;
+		txq->q_handle = vqpi->txq.queue_id;
+		txq->tc = 0;
+
+		rxq = &vsi->rx_queues[vqpi->rxq.queue_id];
+		rxq->desc_count = vqpi->rxq.ring_len;
+		rxq->rx_paddr = vqpi->rxq.dma_ring_addr;
+		vsi->mbuf_sz = vqpi->rxq.databuffer_size;
+	}
+
+	/* Configure TX queues in HW */
+	error = ice_cfg_vsi_for_tx(vsi);
+	if (error) {
+		device_printf(sc->dev,
+			      "VF-%d: Unable to configure VSI for Tx: %s\n",
+			      vf->vf_num, ice_err_str(error));
+		status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+		goto done;
+	}
+
+	/* Configure RX queues in HW; undo the Tx enable if that fails */
+	error = ice_cfg_vsi_for_rx(vsi);
+	if (error) {
+		device_printf(sc->dev,
+			      "VF-%d: Unable to configure VSI for Rx: %s\n",
+			      vf->vf_num, ice_err_str(error));
+		status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+		ice_vsi_disable_tx(vsi);
+		goto done;
+	}
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES,
+	    status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Sets the RSS key for the given VF, using the contents of msg_buf.
+ */
+static void
+ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct ice_aqc_get_set_rss_keys keydata =
+	    { .standard_rss_key = {0}, .extended_hash_key = {0} };
+	struct virtchnl_rss_key *req = (struct virtchnl_rss_key *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_hw *hw = &sc->hw;
+	int err;
+
+	if (req->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+		    vf->vf_num, vsi->idx, req->vsi_id);
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+		goto send_reply;
+	}
+
+	/*
+	 * The key is copied over the start of keydata, so it must be
+	 * non-empty and no longer than the two key sizes combined.
+	 */
+	if (req->key_len == 0 ||
+	    req->key_len > (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE +
+	    ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) {
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+		goto send_reply;
+	}
+
+	memcpy(&keydata, req->key, req->key_len);
+
+	err = ice_aq_set_rss_key(hw, vsi->idx, &keydata);
+	if (err != 0) {
+		device_printf(sc->dev,
+			      "ice_aq_set_rss_key status %s, error %s\n",
+			      ice_status_str(err), ice_aq_str(hw->adminq.sq_last_status));
+		reply = ice_iov_err_to_virt_err(err);
+	}
+
+send_reply:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the LUT from the VF in msg_buf to the PF via an admin queue call.
+ */
+static void
+ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_rss_lut *req = (struct virtchnl_rss_lut *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_aq_get_set_rss_lut_params lut_params = {};
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_hw *hw = &sc->hw;
+	int err;
+
+	/* The message must target this VF's own VSI */
+	if (req->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+		    vf->vf_num, vsi->idx, req->vsi_id);
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+	} else if (req->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) {
+		/* More entries than the LUT array size is never valid */
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+	} else {
+		/*
+		 * The table size and type come from the PF's view of the VSI;
+		 * only the table contents are taken from the VF's message.
+		 */
+		lut_params.lut = req->lut;
+		lut_params.lut_type = vsi->rss_lut_type;
+		lut_params.lut_size = vsi->rss_table_size;
+		lut_params.vsi_handle = vsi->idx;
+		lut_params.global_lut_id = 0;
+
+		err = ice_aq_set_rss_lut(hw, &lut_params);
+		if (err != 0) {
+			device_printf(sc->dev,
+				      "VF-%d: Cannot set RSS lut, err %s aq_err %s\n",
+				      vf->vf_num, ice_status_str(err),
+				      ice_aq_str(hw->adminq.sq_last_status));
+			reply = ice_iov_err_to_virt_err(err);
+		}
+	}
+
+	/* A reply is always sent, whatever the outcome */
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow
+ * type list.
+ */
+static void
+ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_rss_hena *req;
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_hw *hw = &sc->hw;
+	int err;
+
+	MPASS(vsi != NULL);
+
+	req = (struct virtchnl_rss_hena *)msg_buf;
+
+	/*
+	 * Start from a clean slate so that only the requested hash types
+	 * end up enabled; a zero hena therefore disables RSS completely.
+	 */
+	err = ice_rem_vsi_rss_cfg(hw, vsi->idx);
+	/* Nothing to add: the removal status alone decides the reply */
+	if (req->hena == 0)
+		goto send_reply;
+
+	/* Failing to remove the old config is only worth a warning here */
+	if (err != 0)
+		device_printf(sc->dev,
+		    "ice_rem_vsi_rss_cfg status %s, error %s\n",
+		    ice_status_str(err),
+		    ice_aq_str(hw->adminq.sq_last_status));
+
+	/* Apply the requested hash-enable bits */
+	err = ice_add_avf_rss_cfg(hw, vsi->idx, req->hena);
+	if (err != 0)
+		device_printf(sc->dev,
+		    "ice_add_avf_rss_cfg status %s, error %s\n",
+		    ice_status_str(err),
+		    ice_aq_str(hw->adminq.sq_last_status));
+
+send_reply:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA,
+	    ice_iov_err_to_virt_err(err), NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Enables VF queues selected in msg_buf for Tx/Rx traffic.
+ *
+ * @remark Only actually operates on Rx queues; Tx queues are enabled in
+ * CONFIG_VSI_QUEUES message handler.
+ */
+static void
+ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_queue_select *req =
+	    (struct virtchnl_queue_select *)msg_buf;
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_ERR_PARAM;
+	struct ice_vsi *vsi = vf->vsi;
+	int bit, err;
+
+	/*
+	 * Every failure below is reported as a parameter error, so the
+	 * reply starts out as that and is flipped to success at the end.
+	 */
+	if (req->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+		    __func__, vf->vf_num, vsi->idx, req->vsi_id);
+		goto send_reply;
+	}
+
+	if (req->rx_queues == 0 && req->tx_queues == 0) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message queue masks are empty\n",
+		    __func__, vf->vf_num);
+		goto send_reply;
+	}
+
+	/* fls() gives the 1-based position of the highest set bit */
+	bit = fls(req->rx_queues);
+	if (bit > vsi->num_rx_queues) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n",
+		    __func__, vf->vf_num, req->rx_queues, bit);
+		goto send_reply;
+	}
+
+	/* Tx ring enable is handled in an earlier message. */
+	for_each_set_bit(bit, &req->rx_queues, 32) {
+		err = ice_control_rx_queue(vsi, bit, true);
+		if (err != 0) {
+			device_printf(sc->dev,
+				      "Unable to enable Rx ring %d for receive: %s\n",
+				      bit, ice_err_str(err));
+			goto send_reply;
+		}
+	}
+
+	reply = VIRTCHNL_STATUS_SUCCESS;
+
+send_reply:
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Disables all VF queues for the VF's VSI.
+ *
+ * @remark Unlike the ENABLE_QUEUES handler, this operates on both
+ * Tx and Rx queues
+ */
+static void
+ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+			  u8 *msg_buf __unused)
+{
+	enum virtchnl_status_code reply = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	int err;
+
+	/*
+	 * Rx is stopped first; the Tx disable is only attempted when
+	 * that worked. Either failure is reported as a parameter error.
+	 */
+	err = ice_control_all_rx_queues(vsi, false);
+	if (err != 0) {
+		device_printf(sc->dev,
+			      "Unable to disable Rx rings for transmit: %s\n",
+			      ice_err_str(err));
+		reply = VIRTCHNL_STATUS_ERR_PARAM;
+	} else {
+		err = ice_vsi_disable_tx(vsi);
+		/* Already prints an error message */
+		if (err != 0)
+			reply = VIRTCHNL_STATUS_ERR_PARAM;
+	}
+
+	ice_aq_send_msg_to_vf(&sc->hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES,
+	    reply, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CONFIG_IRQ_MAP msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the interrupt vectors described in the message in msg_buf. The
+ * VF needs to send this message during init, so that queues can be allowed
+ * to generate interrupts.
+ */
+static void
+ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+#define ICE_VIRTCHNL_QUEUE_MAP_SIZE	16
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_irq_map_info *vimi;
+	struct virtchnl_vector_map *vvm;
+	enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	u16 vector;
+
+	vimi = (struct virtchnl_irq_map_info *)msg_buf;
+	/* A VF may not describe more vectors than it was configured with */
+	if (vimi->num_vectors > vf->num_irq_vectors) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n",
+		    __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	vvm = vimi->vecmap;
+	/* Validate each vector map entry and record it in the queue structs */
+	for (int i = 0; i < vimi->num_vectors; i++, vvm++) {
+		struct ice_tx_queue *txq;
+		struct ice_rx_queue *rxq;
+		int bit;
+		/* Every map entry must target this VF's own VSI */
+		if (vvm->vsi_id != vf->vsi->idx) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n",
+			    __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i);
+			v_status = VIRTCHNL_STATUS_ERR_PARAM;
+			goto done;
+		}
+
+		/* vvm->vector_id is relative to VF space */
+		vector = vvm->vector_id;
+		/* Reject vector IDs at or beyond the VF's vector count */
+		if (vector >= vf->num_irq_vectors) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n",
+			    __func__, vf->vf_num, vector, vf->num_irq_vectors - 1);
+			v_status = VIRTCHNL_STATUS_ERR_PARAM;
+			goto done;
+		}
+
+		/* The Misc/Admin Queue vector doesn't need mapping */
+		if (vector == 0)
+			continue;
+		/* Point every Tx queue named in txq_map at this vector */
+		/* coverity[address_of] */
+		for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+			if (bit >= vsi->num_tx_queues) {
+				device_printf(sc->dev,
+				    "%s: VF-%d: txq map has invalid bit set\n",
+				    __func__, vf->vf_num);
+				v_status = VIRTCHNL_STATUS_ERR_PARAM;
+				goto done;
+			}
+
+			vf->tx_irqvs[vector].me = vector;
+
+			txq = &vsi->tx_queues[bit];
+			txq->irqv = &vf->tx_irqvs[vector];
+			txq->itr_idx = vvm->txitr_idx;
+		}
+		/* coverity[address_of] */
+		for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+			if (bit >= vsi->num_rx_queues) {
+				device_printf(sc->dev,
+				    "%s: VF-%d: rxq map has invalid bit set\n",
+				    __func__, vf->vf_num);
+				v_status = VIRTCHNL_STATUS_ERR_PARAM;
+				goto done;
+			}
+			vf->rx_irqvs[vector].me = vector;
+
+			rxq = &vsi->rx_queues[bit];
+			rxq->irqv = &vf->rx_irqvs[vector];
+			rxq->itr_idx = vvm->rxitr_idx;
+		}
+	}
+	/* Reached only when every entry validated; errors jump past this */
+	/* Write to T/RQCTL registers to actually map vectors to queues */
+	for (int i = 0; i < vf->vsi->num_rx_queues; i++)
+		if (vsi->rx_queues[i].irqv != NULL)
+			ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i],
+			    vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx);
+
+	for (int i = 0; i < vf->vsi->num_tx_queues; i++)
+		if (vsi->tx_queues[i].irqv != NULL)
+			ice_configure_txq_interrupt(hw, vsi->tx_qmap[i],
+			    vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx);
+
+	ice_flush(hw);
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+	    v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl
+ * @istats: VSI stats from HW to convert
+ * @vstats: stats struct to copy to
+ *
+ * This function copies all known stats in struct virtchnl_eth_stats from the
+ * input struct ice_eth_stats to an output struct virtchnl_eth_stats.
+ *
+ * @remark These two structure types currently have the same definition up to
+ * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change
+ * in the future.
+ */
+static void
+ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+				    struct virtchnl_eth_stats *vstats)
+{
+	vstats->rx_bytes = istats->rx_bytes;		/* byte counters */
+	vstats->tx_bytes = istats->tx_bytes;
+	vstats->rx_unicast = istats->rx_unicast;	/* packets by cast type */
+	vstats->tx_unicast = istats->tx_unicast;
+	vstats->rx_multicast = istats->rx_multicast;
+	vstats->tx_multicast = istats->tx_multicast;
+	vstats->rx_broadcast = istats->rx_broadcast;
+	vstats->tx_broadcast = istats->tx_broadcast;
+	vstats->rx_discards = istats->rx_discards;	/* drops and errors */
+	vstats->tx_discards = istats->tx_discards;
+	vstats->rx_unknown_protocol = istats->rx_unknown_protocol;
+	vstats->tx_errors = istats->tx_errors;
+}
+
+/**
+ * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Updates the VF's VSI stats and sends those stats back to the VF.
+ */
+static void
+ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_queue_select *vqs;
+	struct virtchnl_eth_stats stats = {};
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_hw *hw = &sc->hw;
+
+	vqs = (struct virtchnl_queue_select *)msg_buf;
+	if (vqs->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n",
+		    __func__, vf->vf_num, vqs->vsi_id, vsi->idx);
+		ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+		    VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+		/* One reply per request: do not fall through to the stats */
+		return;
+	}
+
+	ice_update_vsi_hw_stats(vsi);
+	ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats);
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+	    VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats,
+	    sizeof(struct virtchnl_eth_stats), NULL);
+}
+
+/**
+ * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the promiscuous modes for the given VSI in msg_buf.
+ */
+static void
+ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_promisc_info *vpi;
+	enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+	int status = 0;
+	struct ice_vsi *vsi = vf->vsi;
+	ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+	u16 vid;
+
+	vpi = (struct virtchnl_promisc_info *)msg_buf;
+
+	/* Check to see if VF has permission to configure promiscuous mode */
+	if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) {
+		device_printf(sc->dev,
+			      "VF-%d: attempted to configure promiscuous mode\n",
+			      vf->vf_num);
+		/* Don't reply to VF with an error: v_status is still SUCCESS */
+		goto done;
+	}
+	/* The message must target this VF's own VSI */
+	if (vpi->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+			      "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+			      vf->vf_num, vsi->idx, vpi->vsi_id);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+	/* Only flags within ICE_VIRTCHNL_VALID_PROMISC_FLAGS may be set */
+	if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) {
+		device_printf(sc->dev,
+			      "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n",
+			      vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS,
+			      vpi->flags);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+
+	}
+
+	ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+	/* Convert virtchnl flags to ice AQ promiscuous mode flags */
+	if (vpi->flags & FLAG_VF_UNICAST_PROMISC) {
+		ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask);
+		ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask);
+	}
+	if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) {
+		ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask);
+		ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask);
+	}
+	/* Fetch the current mask; vid is filled in but not used afterwards */
+	status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid);
+	if (status) {
+		device_printf(sc->dev,
+			      "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n",
+			      vf->vf_num, vsi->idx,
+			      ice_status_str(status),
+			      ice_aq_str(hw->adminq.sq_last_status));
+		v_status = ice_iov_err_to_virt_err(status);
+		goto done;
+	}
+	/* xor = bits that differ; & old = bits to clear; & req = bits to set */
+	/* Figure out what got added and what got removed */
+	ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+	ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX);
+	ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX);
+	ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX);
+	/* Clear before set; either step is skipped when its mask is empty */
+	if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) {
+		status = ice_clear_vsi_promisc(hw, vsi->idx,
+					       clear_promisc_mask, 0);
+		if (status) {
+			device_printf(sc->dev,
+				      "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n",
+				      vf->vf_num, vsi->idx,
+				      ice_status_str(status),
+				      ice_aq_str(hw->adminq.sq_last_status));
+			v_status = ice_iov_err_to_virt_err(status);
+			goto done;
+		}
+	}
+
+	if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) {
+		status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0);
+		if (status) {
+			device_printf(sc->dev,
+				      "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n",
+				      vf->vf_num, vsi->idx,
+				      ice_status_str(status),
+				      ice_aq_str(hw->adminq.sq_last_status));
+			v_status = ice_iov_err_to_virt_err(status);
+			goto done;
+		}
+	}
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+	    v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state
+ * @sc: device private structure
+ *
+ * Sends a message to all VFs about the status of the PF's link
+ * state. For more details, @see ice_vc_notify_vf_link_state.
+ */
+void
+ice_vc_notify_all_vfs_link_state(struct ice_softc *sc)
+{
+	for (int vf_idx = 0; vf_idx < sc->num_vfs; vf_idx++)
+		ice_vc_notify_vf_link_state(sc, sc->vfs + vf_idx);	/* one event per VF */
+}
+
+/**
+ * ice_vc_notify_vf_link_state - Notify VF of PF link state
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ *
+ * Sends an event message to the specified VF with information about
+ * the current link state from the PF's port. This includes whether
+ * link is up or down, and the link speed in 100Mbps units.
+ */
+static void
+ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf)
+{
+	struct virtchnl_pf_event link_evt = {};
+	struct ice_hw *hw = &sc->hw;
+
+	/* Fill in the advanced link event: current speed plus up/down state */
+	link_evt.event_data.link_event_adv.link_speed =
+	    (u32)ice_conv_link_speed_to_virtchnl(true,
+	    hw->port_info->phy.link_info.link_speed);
+	link_evt.event_data.link_event_adv.link_status = sc->link_up;
+	link_evt.severity = PF_EVENT_SEVERITY_INFO;
+	link_evt.event = VIRTCHNL_EVENT_LINK_CHANGE;
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT,
+	    VIRTCHNL_STATUS_SUCCESS, (u8 *)&link_evt, sizeof(link_evt), NULL);
+}
+
+/**
+ * ice_vc_handle_vf_msg - Handle a message from a VF
+ * @sc: device private structure
+ * @event: event received from the HW MBX queue
+ *
+ * Called whenever an event is received from a VF on the HW mailbox queue.
+ * Responsible for handling these messages as well as responding to the
+ * VF afterwards, depending on the received message type.
+ */
+void
+ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event)
+{
+	u32 opcode = event->desc.cookie_high;
+	u16 vf_id = event->desc.retval;
+	u16 len = event->msg_len;
+	u8 *payload = event->msg_buf;
+	struct ice_hw *hw = &sc->hw;
+	device_t dev = sc->dev;
+	struct ice_vf *vf;
+	int err;
+
+	/* Messages claiming an out-of-range VF are logged and dropped */
+	if (vf_id >= sc->num_vfs) {
+		device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n",
+		    __func__, vf_id, opcode, len);
+		return;
+	}
+	vf = &sc->vfs[vf_id];
+
+	/* Perform basic checks on the msg */
+	err = virtchnl_vc_validate_vf_msg(&vf->version, opcode, payload, len);
+	if (err != 0) {
+		device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n",
+		    __func__, vf->vf_num, opcode, len, err);
+		ice_aq_send_msg_to_vf(hw, vf_id, opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+		return;
+	}
+
+	/* The ice_vc_*_msg handlers send their own reply to the VF */
+	switch (opcode) {
+	case VIRTCHNL_OP_VERSION:
+		ice_vc_version_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_RESET_VF:
+		ice_reset_vf(sc, vf, true);
+		break;
+	case VIRTCHNL_OP_GET_VF_RESOURCES:
+		ice_vc_get_vf_res_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_ADD_ETH_ADDR:
+		ice_vc_add_eth_addr_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_DEL_ETH_ADDR:
+		ice_vc_del_eth_addr_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_ADD_VLAN:
+		ice_vc_add_vlan_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_DEL_VLAN:
+		ice_vc_del_vlan_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_CONFIG_VSI_QUEUES:
+		ice_vc_cfg_vsi_qs_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_CONFIG_RSS_KEY:
+		ice_vc_cfg_rss_key_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_CONFIG_RSS_LUT:
+		ice_vc_cfg_rss_lut_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_SET_RSS_HENA:
+		ice_vc_set_rss_hena_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_ENABLE_QUEUES:
+		ice_vc_enable_queues_msg(sc, vf, payload);
+		ice_vc_notify_vf_link_state(sc, vf);
+		break;
+	case VIRTCHNL_OP_DISABLE_QUEUES:
+		ice_vc_disable_queues_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_CONFIG_IRQ_MAP:
+		ice_vc_cfg_irq_map_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_GET_STATS:
+		ice_vc_get_stats_msg(sc, vf, payload);
+		break;
+	case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE:
+		ice_vc_cfg_promisc_mode_msg(sc, vf, payload);
+		break;
+	default:
+		device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n",
+		    __func__, vf->vf_num, opcode, len);
+		ice_aq_send_msg_to_vf(hw, vf_id, opcode,
+		    VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL);
+		break;
+	}
+}
+
+/**
+ * ice_iov_setup_intr_mapping - Setup interrupt config for a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be configured
+ *
+ * Before a VF can be used, and after a VF reset, the PF must configure
+ * the VF's interrupt allocation registers. This includes allocating
+ * interrupts from the PF's interrupt pool to the VF using the
+ * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors
+ * to VF vectors in GLINT_VECT2FUNC.
+ *
+ * As well, this sets up queue allocation registers and maps the mailbox
+ * interrupt for the VF.
+ */
+static void
+ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf)
+{
+	struct ice_hw *hw = &sc->hw;
+	struct ice_vsi *vsi = vf->vsi;
+	u16 v;	/* vector index, not offset by msix_vector_first_id */
+
+	/* vf_*: as stored in vf_imap; abs_*: offset by the capability base */
+	u16 vf_first_irq_idx = vf->vf_imap[0];
+	u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1;
+	u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id +
+	    vf_first_irq_idx;
+	u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1;
+	u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+	/* Map out VF interrupt allocation in global device space. Both
+	 * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values.
+	 */
+	wr32(hw, VPINT_ALLOC(vf->vf_num),
+	    (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) |
+	    ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) |
+	    VPINT_ALLOC_VALID_M));
+	wr32(hw, VPINT_ALLOC_PCI(vf->vf_num),
+	    (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) |
+	    ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) |
+	    VPINT_ALLOC_PCI_VALID_M));
+
+	/* Create inverse mapping of vectors to PF/VF combinations */
+	for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++)
+	{
+		wr32(hw, GLINT_VECT2FUNC(v),
+		    (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) |
+		     ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)));
+	}
+
+	/* Mailbox interrupt: MSI-X index 0, ITR disabled (ITR index 0x3) */
+	wr32(hw, VPINT_MBX_CTL(abs_vf_num),
+	    ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) |
+	    ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) |
+	    VPINT_MBX_CTL_CAUSE_ENA_M);
+
+	/* Mark the TX queue mapping registers as valid */
+	wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M);
+
+	/* Scattered queue allocation: enable the table, then fill each slot */
+	wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M);
+	for (int i = 0; i < vsi->num_tx_queues; i++) {
+		wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num),
+		    (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M);
+	}
+
+	/* Same sequence for RX: mark valid, enable the table, fill each slot */
+	wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M);
+	wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M);
+	for (int i = 0; i < vsi->num_rx_queues; i++) {
+		wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num),
+		    (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M);
+	}
+}
+
+/**
+ * ice_iov_err_to_virt_err - map an ice status code to a virtchnl status
+ * @ice_err: status returned from an ice function (0 means success)
+ */
+static enum virtchnl_status_code
+ice_iov_err_to_virt_err(int ice_err)
+{
+	switch (ice_err) {
+	case 0:
+		return (VIRTCHNL_STATUS_SUCCESS);
+	case ICE_ERR_NO_MEMORY:
+		return (VIRTCHNL_STATUS_ERR_NO_MEMORY);
+	case ICE_ERR_PARAM:
+	case ICE_ERR_CFG:
+	case ICE_ERR_BAD_PTR:
+	case ICE_ERR_INVAL_SIZE:
+	case ICE_ERR_DEVICE_NOT_SUPPORTED:
+		return (VIRTCHNL_STATUS_ERR_PARAM);
+	case ICE_ERR_AQ_ERROR:
+	case ICE_ERR_AQ_EMPTY:
+	case ICE_ERR_AQ_FULL:
+	case ICE_ERR_AQ_NO_WORK:
+	case ICE_ERR_AQ_TIMEOUT:
+	case ICE_ERR_FW_API_VER:
+	case ICE_ERR_NOT_READY:
+	case ICE_ERR_RESET_FAILED:
+		return (VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR);
+	default:
+		return (VIRTCHNL_STATUS_ERR_NOT_SUPPORTED);
+	}
+}
diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h
new file mode 100644
index 000000000000..c4fb3e932e3f
--- /dev/null
+++ b/sys/dev/ice/ice_iov.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*  Copyright (c) 2025, Intel Corporation
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.h
+ * @brief header for IOV functionality
+ *
+ * This header includes definitions used to implement device Virtual Functions
+ * for the ice driver.
+ */
+
+#ifndef _ICE_IOV_H_
+#define _ICE_IOV_H_
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/nv.h>
+#include <sys/iov_schema.h>
+#include <sys/param.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/pci/pci_iov.h>
+
+#include "ice_iflib.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * @enum ice_vf_flags
+ * @brief VF state flags
+ *
+ * Used to indicate the status of a PF's VF, as well as indicating what each VF
+ * is capable of. Intended to be modified only using atomic operations, so
+ * they can be read and modified in places that aren't locked.
+ *
+ * Used in struct ice_vf's vf_flags field.
+ */
+enum ice_vf_flags {
+	VF_FLAG_ENABLED			= BIT(0),
+	VF_FLAG_SET_MAC_CAP		= BIT(1),
+	VF_FLAG_VLAN_CAP		= BIT(2),
+	VF_FLAG_PROMISC_CAP		= BIT(3),
+	VF_FLAG_MAC_ANTI_SPOOF		= BIT(4),
+};
+
+/**
+ * @struct ice_vf
+ * @brief PF's VF software context
+ *
+ * Represents the state and options for a VF spawned from a PF.
+ */
+struct ice_vf {
+	struct ice_vsi *vsi;		/* supplies the VF's queue maps */
+	u32 vf_flags;			/* enum ice_vf_flags bits */
+
+	u8 mac[ETHER_ADDR_LEN];
+	u16 vf_num;			/* VF id before vf_base_id is added */
+	struct virtchnl_version_info version; /* used to validate VF msgs */
+
+	u16 mac_filter_limit;
+	u16 mac_filter_cnt;
+	u16 vlan_limit;
+	u16 vlan_cnt;
+
+	u16 num_irq_vectors;		/* length of the VF's vector range */
+	u16 *vf_imap;			/* [0] is the VF's first vector index */
+	struct ice_irq_vector *tx_irqvs;
+	struct ice_irq_vector *rx_irqvs;
+};
+
+#define ICE_PCIE_DEV_STATUS			0xAA
+
+#define ICE_PCI_CIAD_WAIT_COUNT			100
+#define ICE_PCI_CIAD_WAIT_DELAY_US		1
+#define ICE_VPGEN_VFRSTAT_WAIT_COUNT		100
+#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US		20
+
+#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS	(FLAG_VF_UNICAST_PROMISC | \
+						 FLAG_VF_MULTICAST_PROMISC)
+
+#define ICE_DEFAULT_VF_VLAN_LIMIT			64
+#define ICE_DEFAULT_VF_FILTER_LIMIT			16
+
+int ice_iov_attach(struct ice_softc *sc);
+int ice_iov_detach(struct ice_softc *sc);
+
+int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params);
+int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params);
+void ice_iov_uninit(struct ice_softc *sc);
+
+void ice_iov_handle_vflr(struct ice_softc *sc);
+
+void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event);
+void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc);
+
+#endif /* _ICE_IOV_H_ */
+
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index d44ae5f37750..442111e5ffaf 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -42,6 +42,9 @@
 
 #include "ice_lib.h"
 #include "ice_iflib.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <machine/resource.h>
@@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi)
 	case ICE_VSI_VMDQ2:
 		ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2;
 		break;
+#ifdef PCI_IOV
+	case ICE_VSI_VF:
+		ctx.flags = ICE_AQ_VSI_TYPE_VF;
+		ctx.vf_num = vsi->vf_num;
+		break;
+#endif
 	default:
 		return (ENODEV);
 	}
@@ -1607,6 +1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf
 	case ICE_VSI_VMDQ2:
 		tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ;
 		break;
+#ifdef PCI_IOV
+	case ICE_VSI_VF:
+		tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF;
+		tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num;
+		break;
+#endif
 	default:
 		return (ENODEV);
 	}
@@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi)
 		struct ice_tlan_ctx tlan_ctx = { 0 };
 		struct ice_tx_queue *txq = &vsi->tx_queues[i];
 
+		/* Last configured queue */
+		if (txq->desc_count == 0)
+			break;
+
 		pf_q = vsi->tx_qmap[txq->me];
 		qg->txqs[0].txq_id = htole16(pf_q);
 
@@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi)
 
 	for (i = 0; i < vsi->num_rx_queues; i++) {
 		MPASS(vsi->mbuf_sz > 0);
+		/* Last configured queue */
+		if (vsi->rx_queues[i].desc_count == 0)
+			break;
+
 		err = ice_setup_rx_ctx(&vsi->rx_queues[i]);
 		if (err)
 			return err;
@@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname,
 	case ice_aqc_opc_get_link_status:
 		ice_process_link_event(sc, event);
 		break;
+#ifdef PCI_IOV
+	case ice_mbx_opc_send_msg_to_pf:
+		ice_vc_handle_vf_msg(sc, event);
+		break;
+#endif
 	case ice_aqc_opc_fw_logs_event:
 		ice_handle_fw_log_event(sc, &event->desc, event->msg_buf);
 		break;
diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h
index b6b23ec82161..308b2bda2790 100644
--- a/sys/dev/ice/ice_lib.h
+++ b/sys/dev/ice/ice_lib.h
@@ -611,6 +611,10 @@ struct ice_vsi {
 	u16 mirror_src_vsi;
 	u16 rule_mir_ingress;
 	u16 rule_mir_egress;
+
+#ifdef PCI_IOV
+	u8 vf_num;		/* Index of owning VF, if applicable */
+#endif
 };
 
 /**
diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c
new file mode 100644
index 000000000000..387a1c6739a6
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*  Copyright (c) 2025, Intel Corporation
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ice_common.h"
+#include "ice_hw_autogen.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * ice_aq_send_msg_to_vf - post a virtchnl message to a VF
+ * @hw: pointer to the hardware structure
+ * @vfid: ID of the VF that receives the message
+ * @v_opcode: virtchnl opcode placed in the descriptor's cookie_high
+ * @v_retval: virtchnl status placed in the descriptor's cookie_low
+ * @msg: pointer to the msg buffer (may be NULL when msglen is 0)
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Builds a send-to-VF mailbox command (0x0802) and queues it on the
+ * mailbox send queue with ice_sq_send_cmd(), whose result is returned.
+ * The message is sent asynchronously.
+ */
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+		      u8 *msg, u16 msglen, struct ice_sq_cd *cd)
+{
+	struct ice_aq_desc mbx_desc;
+	struct ice_aqc_pf_vf_msg *vf_cmd = &mbx_desc.params.virt;
+
+	ice_fill_dflt_direct_cmd_desc(&mbx_desc, ice_mbx_opc_send_msg_to_vf);
+
+	/* Destination VF, then the virtchnl opcode/status in the cookies */
+	vf_cmd->id = CPU_TO_LE32(vfid);
+	mbx_desc.cookie_low = CPU_TO_LE32(v_retval);
+	mbx_desc.cookie_high = CPU_TO_LE32(v_opcode);
+
+	/* Set the RD flag only when a payload accompanies the message */
+	if (msglen != 0)
+		mbx_desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+	return ice_sq_send_cmd(hw, &hw->mailboxq, &mbx_desc, msg, msglen, cd);
+}
+
+/**
+ * ice_aq_send_msg_to_pf - post a virtchnl message to the PF
+ * @hw: pointer to the hardware structure
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Builds a send-to-PF mailbox command and queues it on the mailbox send
+ * queue; the result of ice_sq_send_cmd() is returned. By default the
+ * message is sent asynchronously, i.e. without waiting for completion.
+ */
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+		      int v_retval, u8 *msg, u16 msglen,
+		      struct ice_sq_cd *cd)
+{
+	struct ice_aq_desc mbx_desc;
+
+	ice_fill_dflt_direct_cmd_desc(&mbx_desc, ice_mbx_opc_send_msg_to_pf);
+	mbx_desc.cookie_low = CPU_TO_LE32(v_retval);
+	mbx_desc.cookie_high = CPU_TO_LE32(v_opcode);
+
+	/* Set the RD flag only when a payload accompanies the message */
+	if (msglen != 0)
+		mbx_desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+	return ice_sq_send_cmd(hw, &hw->mailboxq, &mbx_desc, msg, msglen, cd);
+}
+
+static const u32 ice_legacy_aq_to_vc_speed[] = {
+	VIRTCHNL_LINK_SPEED_100MB,	/* BIT(0) */
+	VIRTCHNL_LINK_SPEED_100MB,	/* BIT(1) */
+	VIRTCHNL_LINK_SPEED_1GB,	/* BIT(2) */
+	VIRTCHNL_LINK_SPEED_1GB,	/* BIT(3) */
+	VIRTCHNL_LINK_SPEED_1GB,	/* BIT(4) */
+	VIRTCHNL_LINK_SPEED_10GB,	/* BIT(5) */
+	VIRTCHNL_LINK_SPEED_20GB,	/* BIT(6) */
+	VIRTCHNL_LINK_SPEED_25GB,	/* BIT(7) */
+	VIRTCHNL_LINK_SPEED_40GB,	/* BIT(8) */
+	VIRTCHNL_LINK_SPEED_40GB,	/* BIT(9) */
+	VIRTCHNL_LINK_SPEED_40GB,	/* BIT(10) */
+};
+
+/**
+ * ice_conv_link_speed_to_virtchnl
+ * @adv_link_support: determines the format of the returned link speed
+ * @link_speed: variable containing the link_speed to be converted
+ *
+ * Convert link speed supported by HW to link speed supported by virtchnl.
+ * When adv_link_support is true the result is the link speed in Mbps as
+ * returned by ice_get_link_speed(). Otherwise the result is a
+ * VIRTCHNL_LINK_SPEED_* value cast to a u32, which the caller needs to
+ * cast back to an enum virtchnl_link_speed; link speeds beyond the legacy
+ * table yield VIRTCHNL_LINK_SPEED_UNKNOWN.
+ */
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed)
+{
+	/* link_speed is a BIT() value; turn it into an array index */
+	u16 speed_idx = (u16)(ice_fls(link_speed) - 1);
+
+	if (adv_link_support)
+		return ice_get_link_speed(speed_idx);
+
+	/* Virtchnl speeds are not defined for every speed supported in the
+	 * hardware. For older AVF drivers, newer speeds are resolved to the
+	 * closest known virtchnl speed; anything past the table is unknown.
+	 */
+	if (speed_idx >= ARRAY_SIZE(ice_legacy_aq_to_vc_speed))
+		return VIRTCHNL_LINK_SPEED_UNKNOWN;
+
+	return ice_legacy_aq_to_vc_speed[speed_idx];
+}
+
+/* The mailbox overflow detection algorithm helps to check if there
+ * is a possibility of a malicious VF transmitting too many MBX messages to the
+ * PF.
+ * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during
+ * driver initialization in ice_init_hw() using ice_mbx_init_snapshot().
+ * The struct ice_mbx_snapshot helps to track and traverse a static window of
+ * messages within the mailbox queue while looking for a malicious VF.
+ *
+ * 2. When the caller starts processing its mailbox queue in response to an
+ * interrupt, the structure ice_mbx_snapshot is expected to be cleared before
+ * the algorithm can be run for the first time for that interrupt. In the
+ * new-snapshot state ice_mbx_vf_state_handler() does this itself by calling
+ * ice_mbx_reset_snapshot(), which also zeroes every VF's message count.
+ *
+ * 3. For every message read by the caller from the MBX Queue, the caller must
+ * call the detection algorithm's entry function ice_mbx_vf_state_handler().
+ * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is
+ * filled as it is required to be passed to the algorithm.
+ *
+ * 4. Every time a message is read from the MBX queue, a tracking structure
+ * for the VF must be passed to the state handler. The boolean output
+ * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the
+ * caller whether it must report this VF as malicious or not.
+ *
+ * 5. When a VF is identified to be malicious, the caller can send a message
+ * to the system administrator.
+ *
+ * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info
+ * structure for each VF. The PF should clear the VF tracking structure if the
+ * VF is reset. When a VF is shut down and brought back up, we will then
+ * assume that the new VF is not malicious and may report it again if we
+ * detect it again.
+ *
+ * 7. The function ice_mbx_reset_snapshot() is called to reset the information
+ * in ice_mbx_snapshot for every new mailbox interrupt handled.
+ */
+#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M)
+/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that
+ * the max messages check must be ignored in the algorithm
+ */
+#define ICE_IGNORE_MAX_MSG_CNT	0xFFFF
+
+/**
+ * ice_mbx_reset_snapshot - Reset mailbox snapshot for a new capture
+ * @snap: pointer to the mailbox snapshot
+ */
+static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap)
+{
+	struct ice_mbx_vf_info *entry;
+
+	/* Every tracked VF starts the new capture with a zero count */
+	LIST_FOR_EACH_ENTRY(entry, &snap->mbx_vf, ice_mbx_vf_info, list_entry)
+		entry->msg_count = 0;
+
+	/* Wipe the mbx_buf member of the mailbox snapshot structure, then
+	 * put the snapshot state machine back into the new-capture state.
+	 */
+	ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM);
+	snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_traverse - Pass through mailbox snapshot
+ * @hw: pointer to the HW struct
+ * @new_state: in/out algorithm state; only written on snapshot exit
+ *
+ * Advances one step through the mailbox static snapshot without checking
+ * for malicious VFs, and requests a new snapshot once the end is reached.
+ */
+static void
+ice_mbx_traverse(struct ice_hw *hw,
+		 enum ice_mbx_snapshot_state *new_state)
+{
+	struct ice_mbx_snap_buffer_data *snap_buf;
+	u32 num_iterations;
+
+	snap_buf = &hw->mbx_snapshot.mbx_buf;
+
+	/* As mailbox buffer is circular, applying a mask
+	 * on the incremented iteration count.
+	 */
+	num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations);
+
+	/* Checking either of the below conditions to exit snapshot traversal:
+	 * Condition-1: If the number of iterations in the mailbox is equal to
+	 * the mailbox head which would indicate that we have reached the end
+	 * of the static snapshot.
+	 * Condition-2: Skipped when max_num_msgs_mbx is ICE_IGNORE_MAX_MSG_CNT
+	 * or larger. Otherwise num_msg_proc is incremented and compared
+	 * against max_num_msgs_mbx, the maximum number of mailbox entries
+	 * serviced in the current work item. Because of short-circuiting,
+	 * the increment only happens when Condition-1 is false.
+	 */
+	if (num_iterations == snap_buf->head ||
+	    (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT &&
+	     ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx))
+		*new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_detect_malvf - Detect malicious VF in snapshot
+ * @hw: pointer to the HW struct
+ * @vf_info: mailbox tracking structure for a VF
+ * @new_state: new algorithm state
+ * @is_malvf: boolean output to indicate if VF is malicious
+ *
+ * Counts one more asynchronous message for the VF, flags the VF as
+ * malicious once its count reaches ICE_ASYNC_VF_MSG_THRESHOLD, and then
+ * advances the snapshot traversal. Always returns 0.
+ */
+static int
+ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
+		     enum ice_mbx_snapshot_state *new_state,
+		     bool *is_malvf)
+{
+	/* Count this message, then test the running total against the
+	 * per-VF threshold. *is_malvf is only ever set, never cleared.
+	 */
+	if (++vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD)
+		*is_malvf = true;
+
+	/* Step the snapshot walk forward by one message */
+	ice_mbx_traverse(hw, new_state);
+
+	return 0;
+}
+
+/**
+ * ice_e830_mbx_vf_dec_trig - Decrements the VF mailbox queue counter
+ * @hw: pointer to the HW struct
+ * @event: pointer to the control queue receive event
+ *
+ * Writes 1 to the E830_MBX_VF_DEC_TRIG register of the VF named in the
+ * event descriptor. This triggers a decrement of the counter
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes its buffers.
+ */
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+			      struct ice_rq_event_info *event)
+{
+	/* The VF id is taken from the descriptor's retval field */
+	u16 src_vf = LE16_TO_CPU(event->desc.retval);
+	wr32(hw, E830_MBX_VF_DEC_TRIG(src_vf), 1);
+}
+
+/**
+ * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count
+ * @hw: pointer to the HW struct
+ * @vf_id: VF ID in the PF space
+ *
+ * Reads MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT for the VF and writes that value to
+ * the VF's DEC_TRIG register. Call when a VF is created and on VF reset.
+ */
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id)
+{
+	u32 in_flight;
+	in_flight = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id));
+	wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), in_flight);
+}
+
+/**
+ * ice_mbx_vf_state_handler - Handle states of the overflow algorithm
+ * @hw: pointer to the HW struct
+ * @mbx_data: pointer to structure containing mailbox data
+ * @vf_info: mailbox tracking structure for the VF in question
+ * @report_malvf: boolean output to indicate whether VF should be reported
+ *
+ * Entry point for the malicious VF detection algorithm; handles the
+ * states and state transitions of the algorithm:
+ * New snapshot: The data from any previous mailbox snapshot is cleared
+ * and a new capture of the mailbox head and tail is logged. Depending
+ * on whether the number of pending messages in that snapshot reaches
+ * the watermark value, the state machine enters traverse or detect.
+ * Traverse: If pending message count is below watermark then iterate
+ * through the snapshot without any action on VF.
+ * Detect: If pending message count reaches the watermark, traverse
+ * the static snapshot and look for a malicious VF.
+ *
+ * Returns ICE_ERR_BAD_PTR if mbx_data, vf_info or report_malvf is NULL,
+ * ICE_ERR_INVAL_SIZE or ICE_ERR_PARAM if the limits in mbx_data fail the
+ * sanity checks, ICE_ERR_CFG if the stored state is not recognized, and
+ * otherwise the status of the state that was run (0 on success).
+ */
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+			 struct ice_mbx_vf_info *vf_info, bool *report_malvf)
+{
+	struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+	struct ice_mbx_snap_buffer_data *snap_buf;
+	struct ice_ctl_q_info *cq = &hw->mailboxq;
+	enum ice_mbx_snapshot_state new_state;
+	int status = 0;
+	bool is_malvf = false;
+
+	if (!report_malvf || !mbx_data || !vf_info)
+		return ICE_ERR_BAD_PTR;
+
+	/* When entering the mailbox state machine assume that the VF
+	 * is not malicious until detected.
+	 */
+	*report_malvf = false;
+
+	/* The max number of messages processed while servicing the current
+	 * interrupt must be greater than the defined AVF message threshold.
+	 */
+	if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD)
+		return ICE_ERR_INVAL_SIZE;
+
+	/* The watermark value should not be lesser than the threshold limit
+	 * set for the number of asynchronous messages a VF can send to mailbox
+	 * nor should it be greater than the maximum number of messages in the
+	 * mailbox serviced in current interrupt.
+	 */
+	if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD ||
+	    mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx)
+		return ICE_ERR_PARAM;
+
+	new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+	snap_buf = &snap->mbx_buf;
+
+	switch (snap_buf->state) {
+	case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
+		/* Clear any previously held data in mailbox snapshot structure. */
+		ice_mbx_reset_snapshot(snap);
+
+		/* Collect the pending ARQ count, number of messages processed and
+		 * the maximum number of messages allowed to be processed from the
+		 * Mailbox for current interrupt.
+		 */
+		snap_buf->num_pending_arq = mbx_data->num_pending_arq;
+		snap_buf->num_msg_proc = mbx_data->num_msg_proc;
+		snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx;
+
+		/* Capture a new static snapshot of the mailbox by logging the
+		 * head and tail of snapshot and set num_iterations to the tail
+		 * value to mark the start of the iteration through the snapshot.
+		 */
+		snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean +
+						  mbx_data->num_pending_arq);
+		snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1);
+		snap_buf->num_iterations = snap_buf->tail;
+
+		/* Pending ARQ messages returned by ice_clean_rq_elem
+		 * is the difference between the head and tail of the
+		 * mailbox queue. Comparing this value against the watermark
+		 * helps to check if we potentially have malicious VFs.
+		 */
+		if (snap_buf->num_pending_arq >=
+		    mbx_data->async_watermark_val) {
+			new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+			status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+		} else {
+			new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+			ice_mbx_traverse(hw, &new_state);
+		}
+		break;
+
+	case ICE_MAL_VF_DETECT_STATE_TRAVERSE:
+		new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+		ice_mbx_traverse(hw, &new_state);
+		break;
+
+	case ICE_MAL_VF_DETECT_STATE_DETECT:
+		new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+		status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+		break;
+
+	default:
+		new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+		status = ICE_ERR_CFG;
+	}
+
+	snap_buf->state = new_state;
+
+	/* Only report VFs as malicious the first time we detect it */
+	if (is_malvf && !vf_info->malicious) {
+		vf_info->malicious = 1;
+		*report_malvf = true;
+	}
+
+	return status;
+}
+
+/**
+ * ice_mbx_clear_malvf - Clear VF mailbox info
+ * @vf_info: the mailbox tracking structure for a VF
+ *
+ * Zeroes the VF's message count and its malicious marker. Intended to be
+ * called on VF reset so that the VF is tracked from a clean state again.
+ */
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info)
+{
+	vf_info->msg_count = 0;
+	vf_info->malicious = 0;
+}
+
+/**
+ * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info
+ * @hw: pointer to the hardware structure
+ * @vf_info: the mailbox tracking info structure for a VF
+ *
+ * Clears the VF's tracking state and links the structure into the mailbox
+ * snapshot's VF list.
+ *
+ * The structure stays on that list afterwards: when the VF is removed, the
+ * caller must also delete its VF info structure from the linked list.
+ */
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info)
+{
+	/* Start from a clean, non-malicious record ... */
+	ice_mbx_clear_malvf(vf_info);
+
+	/* ... and make it visible to the snapshot's VF list */
+	LIST_ADD(&vf_info->list_entry, &hw->mbx_snapshot.mbx_vf);
+}
+
+/**
+ * ice_mbx_init_snapshot - Initialize mailbox snapshot data
+ * @hw: pointer to the hardware structure
+ *
+ * Initialize the (empty) VF mailbox list, then reset the snapshot structure.
+ */
+void ice_mbx_init_snapshot(struct ice_hw *hw)
+{
+	/* The list head must be valid before the reset walks the list */
+	INIT_LIST_HEAD(&hw->mbx_snapshot.mbx_vf);
+
+	/* Clear the buffer and arm the state machine for a new capture */
+	ice_mbx_reset_snapshot(&hw->mbx_snapshot);
+}
diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h
new file mode 100644
index 000000000000..3b185ac89c11
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*  Copyright (c) 2025, Intel Corporation
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ICE_VF_MBX_H_
+#define _ICE_VF_MBX_H_
+
+#include "ice_type.h"
+#include "ice_controlq.h"
+
+/* Defining the mailbox message threshold as 63 asynchronous
+ * pending messages. Normal VF functionality does not require
+ * sending more than 63 asynchronous pending messages.
+ */
+
+ /* Threshold value should be used to initialize
+  * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register.
+  */
+#define ICE_ASYNC_VF_MSG_THRESHOLD	63
+
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+		      int v_retval, u8 *msg, u16 msglen,
+		      struct ice_sq_cd *cd);
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+		      u8 *msg, u16 msglen, struct ice_sq_cd *cd);
+
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed);
+
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+			      struct ice_rq_event_info *event);
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id);
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+			 struct ice_mbx_vf_info *vf_info, bool *report_malvf);
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_snapshot(struct ice_hw *hw);
+#endif /* _ICE_VF_MBX_H_ */
diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c
index e60ee0f1c5c3..1469d2916465 100644
--- a/sys/dev/ice/if_ice_iflib.c
+++ b/sys/dev/ice/if_ice_iflib.c
@@ -42,6 +42,9 @@
 #include "ice_drv_info.h"
 #include "ice_switch.h"
 #include "ice_sched.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
 
 #include <sys/module.h>
 #include <sys/sockio.h>
@@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx);
 static int ice_if_resume(if_ctx_t ctx);
 static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event);
 static void ice_init_link(struct ice_softc *sc);
+#ifdef PCI_IOV
+static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params);
+static void ice_if_iov_uninit(if_ctx_t ctx);
+static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params);
+static void ice_if_vflr_handle(if_ctx_t ctx);
+#endif
 static int ice_setup_mirror_vsi(struct ice_mirr_if *mif);
 static int ice_wire_mirror_intrs(struct ice_mirr_if *mif);
 static void ice_free_irqvs_subif(struct ice_mirr_if *mif);
@@ -158,6 +167,11 @@ static device_method_t ice_methods[] = {
 	DEVMETHOD(device_shutdown, iflib_device_shutdown),
 	DEVMETHOD(device_suspend,  iflib_device_suspend),
 	DEVMETHOD(device_resume,   iflib_device_resume),
+#ifdef PCI_IOV
+	DEVMETHOD(pci_iov_init, iflib_device_iov_init),
+	DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit),
+	DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf),
+#endif
 	DEVMETHOD_END
 };
 
@@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = {
 	DEVMETHOD(ifdi_suspend, ice_if_suspend),
 	DEVMETHOD(ifdi_resume, ice_if_resume),
 	DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart),
+#ifdef PCI_IOV
+	DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add),
+	DEVMETHOD(ifdi_iov_init, ice_if_iov_init),
+	DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit),
+	DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle),
+#endif
 	DEVMETHOD_END
 };
 
@@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media)
 			iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0);
 			ice_rdma_link_change(sc, LINK_STATE_DOWN, 0);
 		}
+#ifdef PCI_IOV
+		ice_vc_notify_all_vfs_link_state(sc);
+#endif
 		update_media = true;
 	}
 
@@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx)
 
 	ice_add_device_sysctls(sc);
 
+#ifdef PCI_IOV
+	if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) {
+		err = ice_iov_attach(sc);
+		if (err == ENOMEM)
+			return (err);
+	}
+#endif /* PCI_IOV */
+
 	/* Get DCBX/LLDP state and start DCBX agent */
 	ice_init_dcb_setup(sc);
 
@@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx)
 		ice_destroy_mirror_interface(sc);
 	ice_rdma_pf_detach(sc);
 
+#ifdef PCI_IOV
+	if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV))
+		ice_iov_detach(sc);
+#endif /* PCI_IOV */
+
 	/* Free allocated media types */
 	ifmedia_removeall(sc->media);
 
@@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix)
 	/* For future interrupt assignments */
 	sc->last_rid = rid + sc->irdma_vectors;
 
+#ifdef PCI_IOV
+	/* Create soft IRQ for handling VF resets */
+	iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov");
+#endif
+
 	return (0);
 fail:
 	for (; i >= 0; i--, vector--)
@@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc)
 	ice_rdma_pf_detach(sc);
 	ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
 
+#ifdef PCI_IOV
+	if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+		 ice_iov_detach(sc);
+#else
 	ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
 	ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
 
 	ice_vsi_del_txqs_ctx(vsi);
@@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc)
 	ice_rdma_pf_detach(sc);
 	ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
 
+#ifdef PCI_IOV
+	if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+		 ice_iov_detach(sc);
+#else
 	ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
 	ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
 
 	ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap);
@@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx)
 	/* Check and update link status */
 	ice_update_link_status(sc, false);
 
+#ifdef PCI_IOV
+	/*
+	 * Schedule VFs' reset handler after global resets
+	 * and other events were processed.
+	 */
+	if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING))
+		iflib_iov_intr_deferred(ctx);
+#endif
+
 	/*
 	 * If there are still messages to process, we need to reschedule
 	 * ourselves. Otherwise, we can just re-enable the interrupt. We'll be
@@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc)
 
 }
 
+#ifdef PCI_IOV
+/**
+ * ice_if_iov_init - iov init handler for iflib
+ * @ctx: iflib context pointer
+ * @num_vfs: number of VFs to create
+ * @params: configuration parameters for the PF
+ *
+ * Invoked before any VFs are created. Looks up the softc for @ctx and
+ * forwards to ice_iov_init(), returning its result unchanged.
+ *
+ * @remark This is a wrapper for ice_iov_init
+ */
+static int
+ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params)
+{
+	struct ice_softc *softc = iflib_get_softc(ctx);
+
+	return (ice_iov_init(softc, num_vfs, params));
+}
+
+/**
+ * ice_if_iov_uninit - iov uninit handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Tears down SR-IOV by passing the softc for @ctx to ice_iov_uninit().
+ *
+ * @remark This is a wrapper for ice_iov_uninit
+ */
+static void
+ice_if_iov_uninit(if_ctx_t ctx)
+{
+	struct ice_softc *softc = iflib_get_softc(ctx);
+
+	ice_iov_uninit(softc);
+}
+
+/**
+ * ice_if_iov_vf_add - iov add vf handler for iflib
+ * @ctx: iflib context pointer
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * Invoked for each VF created by the PF driver. Looks up the softc for @ctx
+ * and forwards to ice_iov_add_vf(), returning its result unchanged.
+ *
+ * @remark This is a wrapper for ice_iov_add_vf
+ */
+static int
+ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params)
+{
+	struct ice_softc *softc = iflib_get_softc(ctx);
+
+	return (ice_iov_add_vf(softc, vfnum, params));
+}
+
+/**
+ * ice_if_vflr_handle - iov VFLR handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Performs the necessary teardown or setup required for a VF after
+ * a VFLR is initiated, by handing the softc to ice_iov_handle_vflr().
+ *
+ * @remark This is a wrapper for ice_iov_handle_vflr
+ */
+static void
+ice_if_vflr_handle(if_ctx_t ctx)
+{
+
+	ice_iov_handle_vflr((struct ice_softc *)iflib_get_softc(ctx));
+}
+#endif /* PCI_IOV */
+
 extern struct if_txrx ice_subif_txrx;
 
 /**
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 741a7c013f7d..29dc0c880e3a 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -11,9 +11,9 @@
  */
 
 /*-
- * The following functions are based on the vn(4) driver: mdstart_swap(),
- * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
- * and as such under the following copyright:
+ * The following functions are based on the historical vn(4) driver:
+ * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode()
+ * and mddestroy(), and as such under the following copyright:
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c
index 1240d0f84415..409f34167df0 100644
--- a/sys/dev/mgb/if_mgb.c
+++ b/sys/dev/mgb/if_mgb.c
@@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc)
 
 	/* Stop MAC */
 	CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL);
-	CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
+	CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
 	if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0)))
 		return (err);
 	if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0)))
diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h
index 361b9f72d873..c3f3a2372482 100644
--- a/sys/dev/mlx5/mlx5_accel/ipsec.h
+++ b/sys/dev/mlx5/mlx5_accel/ipsec.h
@@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv);
 void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv);
 int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv);
 int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr);
-void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
-    struct mlx5e_rq_mbuf *mr);
+void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+    struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr);
 
 static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
 {
@@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
 }
 
 static inline void
-mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe,
+mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe,
     struct mlx5e_rq_mbuf *mr)
 {
 	u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata);
 
 	if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data))
-		mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr);
+		mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr);
 }
 #endif	/* __MLX5_ACCEL_IPSEC_H__ */
diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
index 0883cfb2d510..5dccb8bc2b87 100644
--- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
+++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
@@ -24,11 +24,14 @@
  *
  */
 
+#include "opt_ipsec.h"
+
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <netipsec/keydb.h>
 #include <netipsec/ipsec_offload.h>
+#include <netipsec/xform.h>
 #include <dev/mlx5/qp.h>
 #include <dev/mlx5/mlx5_en/en.h>
 #include <dev/mlx5/mlx5_accel/ipsec.h>
@@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
 		return (0);
 
 	mtag = (struct ipsec_accel_in_tag *)m_tag_get(
-	    PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT);
+	    PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) -
+	    __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT);
 	if (mtag == NULL)
 		return (-ENOMEM);
 	mr->ipsec_mtag = mtag;
@@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
 }
 
 void
-mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
-    struct mlx5e_rq_mbuf *mr)
+mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+    struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr)
 {
 	struct ipsec_accel_in_tag *mtag;
 	u32 drv_spi;
@@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
 	drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata));
 	mtag = mr->ipsec_mtag;
 	WARN_ON(mtag == NULL);
-	mr->ipsec_mtag = NULL;
 	if (mtag != NULL) {
 		mtag->drv_spi = drv_spi;
-		m_tag_prepend(mb, &mtag->tag);
+		if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) {
+			m_tag_prepend(mb, &mtag->tag);
+			mr->ipsec_mtag = NULL;
+		}
 	}
 }
 
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
index 4de451f1b039..89d2010656c5 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
@@ -659,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p
 		return (EINVAL);
 
 	MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he);
-	MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he);
+	MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0);
+	MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he);
 
 	return (0);
 }
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index 6b53db6fea23..eb569488631a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
 		break;
 	}
 
-	mlx5e_accel_ipsec_handle_rx(mb, cqe, mr);
+	mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr);
 }
 
 static inline void
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 73a7cee4aad0..fd7f00ced14b 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -48,7 +48,7 @@
 #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */
 
 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
-						struct nvme_async_event_request *aer);
+    struct nvme_async_event_request *aer);
 
 static void
 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -680,96 +680,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
 }
 
 static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
-{
-	struct nvme_async_event_request		*aer = arg;
-	struct nvme_health_information_page	*health_info;
-	struct nvme_ns_list			*nsl;
-	struct nvme_error_information_entry	*err;
-	int i;
-
-	/*
-	 * If the log page fetch for some reason completed with an error,
-	 *  don't pass log page data to the consumers.  In practice, this case
-	 *  should never happen.
-	 */
-	if (nvme_completion_is_error(cpl))
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, NULL, 0);
-	else {
-		/* Convert data to host endian */
-		switch (aer->log_page_id) {
-		case NVME_LOG_ERROR:
-			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
-			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
-				nvme_error_information_entry_swapbytes(err++);
-			break;
-		case NVME_LOG_HEALTH_INFORMATION:
-			nvme_health_information_page_swapbytes(
-			    (struct nvme_health_information_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_CHANGED_NAMESPACE:
-			nvme_ns_list_swapbytes(
-			    (struct nvme_ns_list *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_COMMAND_EFFECT:
-			nvme_command_effects_page_swapbytes(
-			    (struct nvme_command_effects_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_RES_NOTIFICATION:
-			nvme_res_notification_page_swapbytes(
-			    (struct nvme_res_notification_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_SANITIZE_STATUS:
-			nvme_sanitize_status_page_swapbytes(
-			    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
-			break;
-		default:
-			break;
-		}
-
-		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
-			health_info = (struct nvme_health_information_page *)
-			    aer->log_page_buffer;
-			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
-			    health_info->critical_warning);
-			/*
-			 * Critical warnings reported through the
-			 *  SMART/health log page are persistent, so
-			 *  clear the associated bits in the async event
-			 *  config so that we do not receive repeated
-			 *  notifications for the same event.
-			 */
-			aer->ctrlr->async_event_config &=
-			    ~health_info->critical_warning;
-			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
-			    aer->ctrlr->async_event_config, NULL, NULL);
-		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
-		    !nvme_use_nvd) {
-			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
-			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
-				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
-					break;
-				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
-			}
-		}
-
-		/*
-		 * Pass the cpl data from the original async event completion,
-		 *  not the log page fetch.
-		 */
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
-	}
-
-	/*
-	 * Repost another asynchronous event request to replace the one
-	 *  that just completed.
-	 */
-	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
-static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request	*aer = arg;
@@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 		return;
 	}
 
-	/* Associated log page is in bits 23:16 of completion entry dw0. */
+	/*
+	 * Save the completion status and the associated log page ID, which is
+	 * in bits 23:16 of completion entry dw0. Print a message and queue the
+	 * event for further processing.
+	 */
+	memcpy(&aer->cpl, cpl, sizeof(*cpl));
 	aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
 	    " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
 	    NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
 	    aer->log_page_id);
-
-	if (is_log_page_id_valid(aer->log_page_id)) {
-		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
-		    aer->log_page_id);
-		memcpy(&aer->cpl, cpl, sizeof(*cpl));
-		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
-		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
-		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
-		    aer);
-		/* Wait to notify consumers until after log page is fetched. */
-	} else {
-		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
-		    NULL, 0);
-
-		/*
-		 * Repost another asynchronous event request to replace the one
-		 *  that just completed.
-		 */
-		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-	}
+	taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
 }
 
 static void
@@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 {
 	struct nvme_request *req;
 
-	aer->ctrlr = ctrlr;
 	/*
-	 * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
-	 * callback context.  AER completions should be handled on a dedicated
-	 * thread.
+	 * We're racing the reset thread, so let that process submit this again.
+	 * XXX does this really solve that race? And is that race even possible
+	 * since we only reset when we've not heard from the card in a long
+	 * time. Why would we get an AER in the middle of that just before we
+	 * kick off the reset?
 	 */
-	req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+	if (ctrlr->is_resetting)
+		return;
+
+	aer->ctrlr = ctrlr;
+	req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
 	    aer);
 	aer->req = req;
+	aer->log_page_id = 0;		/* Not a valid page */
 
 	/*
 	 * Disable timeout here, since asynchronous event requests should by
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
 
+/*
+ * Completion callback for the log page fetch issued by nvme_ctrlr_aer_task().
+ */
+static void
+nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl)
+{
+	struct nvme_async_event_request	*req = arg;
+
+	mtx_lock(&req->mtx);
+	req->log_page_size = nvme_completion_is_error(cpl) ? (uint32_t)-1 :
+	    nvme_ctrlr_get_log_page_size(req->ctrlr, req->log_page_id);
+	wakeup(req);		/* nvme_ctrlr_aer_task() sleeps on the AER */
+	mtx_unlock(&req->mtx);
+}
+
+/*
+ * Taskqueue handler for a completed asynchronous event request (AER): fetch
+ * the log page named by the event, if any, convert and act on it, notify the
+ * async consumers, and repost the AER exactly once through the "out" label.
+ * It sleeps on the AER until nvme_ctrlr_aer_done() reports the fetch result.
+ */
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+	struct nvme_async_event_request	*aer = arg;
+	struct nvme_controller	*ctrlr = aer->ctrlr;
+	uint32_t len;
+
+	/* We're resetting, so just punt; nothing is reposted from here. */
+	if (ctrlr->is_resetting)
+		return;
+
+	if (!is_log_page_id_valid(aer->log_page_id)) {
+		/* No log page to fetch; "out" below reposts the AER. */
+		nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id,
+		    NULL, 0);
+		goto out;
+	}
+
+	aer->log_page_size = 0;		/* 0 means the fetch is still pending */
+	len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id);
+	nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
+	    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len,
+	    nvme_ctrlr_aer_done, aer);
+	mtx_lock(&aer->mtx);
+	while (aer->log_page_size == 0)
+		mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0);
+	mtx_unlock(&aer->mtx);
+
+	if (aer->log_page_size == (uint32_t)-1) {
+		/*
+		 * If the log page fetch for some reason completed with an
+		 * error, don't pass log page data to the consumers.  In
+		 * practice, this case should never happen.
+		 */
+		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+		    aer->log_page_id, NULL, 0);
+		goto out;
+	}
+
+	/* Convert data to host endian */
+	switch (aer->log_page_id) {
+	case NVME_LOG_ERROR: {
+		struct nvme_error_information_entry *err =
+		    (struct nvme_error_information_entry *)aer->log_page_buffer;
+		for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
+			nvme_error_information_entry_swapbytes(err++);
+		break;
+	}
+	case NVME_LOG_HEALTH_INFORMATION:
+		nvme_health_information_page_swapbytes(
+			(struct nvme_health_information_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_CHANGED_NAMESPACE:
+		nvme_ns_list_swapbytes(
+			(struct nvme_ns_list *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_COMMAND_EFFECT:
+		nvme_command_effects_page_swapbytes(
+			(struct nvme_command_effects_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_RES_NOTIFICATION:
+		nvme_res_notification_page_swapbytes(
+			(struct nvme_res_notification_page *)aer->log_page_buffer);
+		break;
+	case NVME_LOG_SANITIZE_STATUS:
+		nvme_sanitize_status_page_swapbytes(
+			(struct nvme_sanitize_status_page *)aer->log_page_buffer);
+		break;
+	default:
+		break;
+	}
+
+	if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
+		struct nvme_health_information_page *health_info =
+		    (struct nvme_health_information_page *)aer->log_page_buffer;
+
+		/*
+		 * Critical warnings reported through the SMART/health log page
+		 * are persistent, so clear the associated bits in the async
+		 * event config so that we do not receive repeated notifications
+		 * for the same event.
+		 */
+		nvme_ctrlr_log_critical_warnings(aer->ctrlr,
+		    health_info->critical_warning);
+		aer->ctrlr->async_event_config &=
+		    ~health_info->critical_warning;
+		nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
+		    aer->ctrlr->async_event_config, NULL, NULL);
+	} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+		struct nvme_ns_list *nsl =
+		    (struct nvme_ns_list *)aer->log_page_buffer;
+		for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
+			if (nsl->ns[i] > NVME_MAX_NAMESPACES)
+				break;
+			nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
+		}
+	}
+
+	/*
+	 * Pass the cpl data from the original async event completion, not the
+	 * log page fetch.
+	 */
+	nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+	    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
+
+	/*
+	 * Repost another asynchronous event request to replace the one
+	 * that just completed.
+	 */
+out:
+	nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+}
+
 /*
  * Poll all the queues enabled on the device for completion.
  */
@@ -1574,13 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 	/*
 	 * Create 2 threads for the taskqueue. The reset thread will block when
 	 * it detects that the controller has failed until all I/O has been
-	 * failed up the stack. The fail_req task needs to be able to run in
-	 * this case to finish the request failure for some cases.
-	 *
-	 * We could partially solve this race by draining the failed requeust
-	 * queue before proceding to free the sim, though nothing would stop
-	 * new I/O from coming in after we do that drain, but before we reach
-	 * cam_sim_free, so this big hammer is used instead.
+	 * failed up the stack. The second thread is used for AER events, which
+	 * can block, but only briefly for memory and log page fetching.
 	 */
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
@@ -1590,7 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 	ctrlr->is_initialized = false;
 	ctrlr->notification_sent = 0;
 	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
-	STAILQ_INIT(&ctrlr->fail_req);
+	for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+		struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+		TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer);
+		mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF);
+	}
 	ctrlr->is_failed = false;
 
 	make_dev_args_init(&md_args);
@@ -1678,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 	}
 
 noadminq:
-	if (ctrlr->taskqueue)
+	if (ctrlr->taskqueue) {
 		taskqueue_free(ctrlr->taskqueue);
+		for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+			struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+			mtx_destroy(&aer->mtx);
+		}
+	}
 
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 949e69ec9290..36f00fedc48e 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -123,6 +123,8 @@ struct nvme_request {
 struct nvme_async_event_request {
 	struct nvme_controller		*ctrlr;
 	struct nvme_request		*req;
+	struct task			task;
+	struct mtx			mtx;
 	struct nvme_completion		cpl;
 	uint32_t			log_page_id;
 	uint32_t			log_page_size;
@@ -307,8 +309,6 @@ struct nvme_controller {
 	bool				isr_warned;
 	bool				is_initialized;
 
-	STAILQ_HEAD(, nvme_request)	fail_req;
-
 	/* Host Memory Buffer */
 	int				hmb_nchunks;
 	size_t				hmb_chunk;
diff --git a/sys/dev/ofw/ofw_bus_subr.c b/sys/dev/ofw/ofw_bus_subr.c
index 4d0479dfb957..b99d784929bc 100644
--- a/sys/dev/ofw/ofw_bus_subr.c
+++ b/sys/dev/ofw/ofw_bus_subr.c
@@ -634,11 +634,89 @@ ofw_bus_find_iparent(phandle_t node)
 	return (iparent);
 }
 
+/* Return the xref of the nearest node with "#interrupt-cells", or 0. */
+static phandle_t
+ofw_bus_search_iparent(phandle_t node)
+{
+	phandle_t xref;
+
+	for (;;) {
+		if (OF_getencprop(node, "interrupt-parent", &xref,
+		    sizeof(xref)) > 0)
+			node = OF_node_from_xref(xref);
+		else
+			node = OF_parent(node);
+		if (node == 0)
+			return (0);
+		if (OF_hasprop(node, "#interrupt-cells"))
+			return (OF_xref_from_node(node));
+	}
+}
+
+static int
+ofw_bus_traverse_imap(phandle_t inode, phandle_t node, uint32_t *intr,
+    int intrsz, pcell_t *res, int ressz, phandle_t *iparentp)
+{
+	struct ofw_bus_iinfo ii;
+	void *reg;
+	uint32_t *intrp;
+	phandle_t iparent;
+	int rv = 0;
+
+	/* node is itself an interrupt controller: nothing to translate */
+	if (OF_hasprop(node, "interrupt-controller"))
+		return (0);
+
+	intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);	/* freed on every pass */
+	memcpy(intrp, intr, intrsz);
+
+	while (true) {
+		/* There is no interrupt-map to follow: report no translation */
+		if (!OF_hasprop(inode, "interrupt-map")) {
+			free(intrp, M_OFWPROP);
+			return (0);
+		}
+
+		memset(&ii, 0, sizeof(ii));
+		ofw_bus_setup_iinfo(inode, &ii, sizeof(cell_t));
+
+		reg = NULL;
+		if (ii.opi_addrc > 0)
+			reg = malloc(ii.opi_addrc, M_OFWPROP, M_WAITOK);
+
+		rv = ofw_bus_lookup_imap(node, &ii, reg, ii.opi_addrc, intrp,
+		    intrsz, res, ressz, &iparent);
+
+		free(reg, M_OFWPROP);
+		free(ii.opi_imap, M_OFWPROP);
+		free(ii.opi_imapmsk, M_OFWPROP);
+		free(intrp, M_OFWPROP);		/* this pass's input copy */
+
+		if (rv == 0)
+			return (0);
+
+		node = inode;			/* climb one level */
+		inode = OF_node_from_xref(iparent);
+
+		/* Stop when we have an interrupt controller */
+		if (OF_hasprop(inode, "interrupt-controller")) {
+			*iparentp = iparent;
+			return (rv);
+		}
+
+		intrsz = rv * sizeof(pcell_t);	/* rv is used as a cell count */
+		intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);
+		memcpy(intrp, res, intrsz);	/* res doubles as next output */
+	}
+}
+
 int
 ofw_bus_intr_to_rl(device_t dev, phandle_t node,
     struct resource_list *rl, int *rlen)
 {
-	phandle_t iparent;
+	phandle_t iparent, iparent_node;
+	uint32_t result[16];
+	uint32_t intrpcells, *intrp;
 	uint32_t icells, *intr;
 	int err, i, irqnum, nintr, rid;
 	bool extended;
@@ -646,15 +724,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
 	nintr = OF_getencprop_alloc_multi(node, "interrupts",  sizeof(*intr),
 	    (void **)&intr);
 	if (nintr > 0) {
-		iparent = ofw_bus_find_iparent(node);
+		iparent = ofw_bus_search_iparent(node);
 		if (iparent == 0) {
 			device_printf(dev, "No interrupt-parent found, "
 			    "assuming direct parent\n");
 			iparent = OF_parent(node);
 			iparent = OF_xref_from_node(iparent);
 		}
-		if (OF_searchencprop(OF_node_from_xref(iparent), 
-		    "#interrupt-cells", &icells, sizeof(icells)) == -1) {
+		iparent_node = OF_node_from_xref(iparent);
+		if (OF_searchencprop(iparent_node, "#interrupt-cells", &icells,
+		    sizeof(icells)) == -1) {
 			device_printf(dev, "Missing #interrupt-cells "
 			    "property, assuming <1>\n");
 			icells = 1;
@@ -677,7 +756,8 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
 	for (i = 0; i < nintr; i += icells) {
 		if (extended) {
 			iparent = intr[i++];
-			if (OF_searchencprop(OF_node_from_xref(iparent), 
+			iparent_node = OF_node_from_xref(iparent);
+			if (OF_searchencprop(iparent_node,
 			    "#interrupt-cells", &icells, sizeof(icells)) == -1) {
 				device_printf(dev, "Missing #interrupt-cells "
 				    "property\n");
@@ -691,7 +771,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
 				break;
 			}
 		}
-		irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]);
+
+		intrp = &intr[i];
+		intrpcells = ofw_bus_traverse_imap(iparent_node, node, intrp,
+		    icells * sizeof(intr[0]), result, sizeof(result), &iparent);
+		if (intrpcells > 0)
+			intrp = result;
+		else
+			intrpcells = icells;
+
+		irqnum = ofw_bus_map_intr(dev, iparent, intrpcells, intrp);
 		resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1);
 	}
 	if (rlen != NULL)
diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c
index 05ec69a70dfe..4ad190374f87 100644
--- a/sys/dev/qlnx/qlnxe/qlnx_os.c
+++ b/sys/dev/qlnx/qlnxe/qlnx_os.c
@@ -30,6 +30,8 @@
  * Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131.
  */
 
+#include "opt_inet.h"
+
 #include <sys/cdefs.h>
 #include "qlnx_os.h"
 #include "bcm_osal.h"
@@ -2306,8 +2308,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
         else if (device_id == QLOGIC_PCI_DEVICE_ID_1644)
 		if_setbaudrate(ifp, IF_Gbps(100));
 
-        if_setcapabilities(ifp, IFCAP_LINKSTATE);
-
         if_setinitfn(ifp, qlnx_init);
         if_setsoftc(ifp, ha);
         if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
@@ -2341,7 +2341,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
 
 	if_setcapabilities(ifp, IFCAP_HWCSUM);
 	if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
-
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0);
@@ -2350,6 +2349,8 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
 	if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
+	if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0);
+	if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0);
 
 	if_sethwtsomax(ifp,  QLNX_MAX_TSO_FRAME_SIZE -
 				(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
@@ -2778,7 +2779,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
 
 		if (!p_ptt) {
 			QL_DPRINT1(ha, "ecore_ptt_acquire failed\n");
-			ret = -1;
+			ret = ERESTART;
 			break;
 		}
 
@@ -2789,7 +2790,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
 		ecore_ptt_release(p_hwfn, p_ptt);
 
 		if (ret) {
-			ret = -1;
+			ret = ENODEV;
 			break;
 		}
 
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index c4282c723a44..8363de99a60a 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event)
 	u_int pl;
 
 	RANDOM_RESEED_LOCK();
+	/*
+	 * Run SP 800-90B health tests on the source if so configured.
+	 */
+	if (!random_harvest_healthtest(event)) {
+		RANDOM_RESEED_UNLOCK();
+		return;
+	}
 	/*-
 	 * FS&K - P_i = P_i|<harvested stuff>
 	 * Accumulate the event into the appropriate pool
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 395310b115fb..c7762967c4fb 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -88,6 +88,8 @@ static void random_sources_feed(void);
 static __read_mostly bool epoch_inited;
 static __read_mostly epoch_t rs_epoch;
 
+static const char *random_source_descr[ENTROPYSOURCE];
+
 /*
  * How many events to queue up. We create this many items in
  * an 'empty' queue, then transfer them to the 'harvest' queue with
@@ -299,6 +301,230 @@ random_sources_feed(void)
 	explicit_bzero(entropy, sizeof(entropy));
 }
 
+/*
+ * State used for conducting NIST SP 800-90B health tests on entropy sources.
+ */
+static struct health_test_softc {
+	uint32_t ht_rct_value[HARVESTSIZE + 1];
+	u_int ht_rct_count;	/* consecutive samples equal to ht_rct_value */
+	u_int ht_rct_limit;	/* constant after init */
+
+	uint32_t ht_apt_value[HARVESTSIZE + 1];
+	u_int ht_apt_count;	/* samples in window equal to ht_apt_value */
+	u_int ht_apt_seq;	/* sequence number of the last sample */
+	u_int ht_apt_cutoff;	/* constant after init */
+
+	uint64_t ht_total_samples;
+	bool ondemand;		/* Set to true to restart the state machine */
+	enum {
+		INIT = 0,	/* initial state */
+		DISABLED,	/* health checking is disabled */
+		STARTUP,	/* doing startup tests, samples are discarded */
+		STEADY,		/* steady-state operation */
+		FAILED,		/* health check failed, discard samples */
+	} ht_state;
+} healthtest[ENTROPYSOURCE];
+
+#define	RANDOM_SELFTEST_STARTUP_SAMPLES	1024	/* 4.3, requirement 4 */
+#define	RANDOM_SELFTEST_APT_WINDOW	512	/* 4.4.2 */
+
+static void
+copy_event(uint32_t dst[static HARVESTSIZE + 1],
+    const struct harvest_event *event)
+{
+	memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1));
+	memcpy(dst, event->he_entropy, event->he_size);
+	dst[HARVESTSIZE] = event->he_somecounter;
+}
+
+static void
+random_healthtest_rct_init(struct health_test_softc *ht,
+    const struct harvest_event *event)
+{
+	ht->ht_rct_count = 1;
+	copy_event(ht->ht_rct_value, event);
+}
+
+/*
+ * Apply the repetition count test to a sample.
+ *
+ * Return false if the test failed, i.e., we observed >= C consecutive samples
+ * with the same value, and true otherwise.
+ */
+static bool
+random_healthtest_rct_next(struct health_test_softc *ht,
+    const struct harvest_event *event)
+{
+	uint32_t val[HARVESTSIZE + 1];
+
+	copy_event(val, event);
+	if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) {
+		ht->ht_rct_count = 1;
+		memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value));
+		return (true);
+	} else {
+		ht->ht_rct_count++;
+		return (ht->ht_rct_count < ht->ht_rct_limit);
+	}
+}
+
+static void
+random_healthtest_apt_init(struct health_test_softc *ht,
+    const struct harvest_event *event)
+{
+	ht->ht_apt_count = 1;
+	ht->ht_apt_seq = 1;
+	copy_event(ht->ht_apt_value, event);
+}
+
+static bool
+random_healthtest_apt_next(struct health_test_softc *ht,
+    const struct harvest_event *event)
+{
+	uint32_t val[HARVESTSIZE + 1];
+
+	if (ht->ht_apt_seq == 0) {	/* start of a new window */
+		random_healthtest_apt_init(ht, event);
+		return (true);
+	}
+
+	copy_event(val, event);
+	if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) {
+		ht->ht_apt_count++;
+		if (ht->ht_apt_count >= ht->ht_apt_cutoff)
+			return (false);
+	}
+
+	ht->ht_apt_seq++;
+	if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW)
+		ht->ht_apt_seq = 0;	/* window complete; restart next */
+
+	return (true);
+}
+
+/*
+ * Run the health tests for the given event.  This is assumed to be called from
+ * a serialized context.
+ */
+bool
+random_harvest_healthtest(const struct harvest_event *event)
+{
+	struct health_test_softc *ht;
+
+	ht = &healthtest[event->he_source];
+
+	/*
+	 * Was on-demand testing requested?  Restart the state machine if so,
+	 * restarting the startup tests.
+	 */
+	if (atomic_load_bool(&ht->ondemand)) {
+		atomic_store_bool(&ht->ondemand, false);
+		ht->ht_state = INIT;
+	}
+
+	switch (ht->ht_state) {
+	case __predict_false(INIT):
+		/* Store the first sample and initialize test state. */
+		random_healthtest_rct_init(ht, event);
+		random_healthtest_apt_init(ht, event);
+		ht->ht_total_samples = 0;
+		ht->ht_state = STARTUP;
+		return (false);
+	case DISABLED:
+		/* No health testing for this source. */
+		return (true);
+	case STEADY:
+	case STARTUP:
+		ht->ht_total_samples++;
+		if (random_healthtest_rct_next(ht, event) &&
+		    random_healthtest_apt_next(ht, event)) {
+			if (ht->ht_state == STARTUP &&
+			    ht->ht_total_samples >=
+			    RANDOM_SELFTEST_STARTUP_SAMPLES) {
+				printf(
+			    "random: health test passed for source %s\n",
+				    random_source_descr[event->he_source]);
+				ht->ht_state = STEADY;
+			}
+			return (ht->ht_state == STEADY);
+		}
+		ht->ht_state = FAILED;
+		printf(
+	    "random: health test failed for source %s, discarding samples\n",
+		    random_source_descr[event->he_source]);
+		/* FALLTHROUGH */
+	case FAILED:
+		return (false);
+	}
+}
+
+static bool nist_healthtest_enabled = false;
+SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled,
+    CTLFLAG_RDTUN, &nist_healthtest_enabled, 0,
+    "Enable NIST SP 800-90B health tests for noise sources");
+
+static void
+random_healthtest_init(enum random_entropy_source source)
+{
+	struct health_test_softc *ht;
+
+	ht = &healthtest[source];
+	KASSERT(ht->ht_state == INIT,
+	    ("%s: health test state is %d for source %d",
+	    __func__, ht->ht_state, source));
+
+	/*
+	 * If health-testing is enabled, validate all sources except CACHED and
+	 * VMGENID: they are deterministic sources used only a small, fixed
+	 * number of times, so statistical testing is not applicable.
+	 */
+	if (!nist_healthtest_enabled ||
+	    source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) {
+		ht->ht_state = DISABLED;
+		return;
+	}
+
+	/*
+	 * Set cutoff values for the two tests, assuming that each sample has
+	 * min-entropy of 1 bit and allowing for an error rate of 1 in 2^{34}.
+	 * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see a false
+	 * positive once in ~54.5 years.
+	 *
+	 * The RCT limit comes from the formula in section 4.4.1.
+	 *
+	 * The APT cutoff is calculated using the formula in section 4.4.2
+	 * footnote 10 with the window size changed from 512 to 511, since the
+	 * test as written counts the number of samples equal to the first
+	 * sample in the window, and thus tests W-1 samples.
+	 */
+	ht->ht_rct_limit = 35;
+	ht->ht_apt_cutoff = 330;
+}
+
+static int
+random_healthtest_ondemand(SYSCTL_HANDLER_ARGS)
+{
+	u_int mask, source;
+	int error;
+
+	mask = 0;
+	error = sysctl_handle_int(oidp, &mask, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	while (mask != 0) {
+		source = ffs(mask) - 1;
+		if (source < nitems(healthtest))
+			atomic_store_bool(&healthtest[source].ondemand, true);
+		mask &= ~(1u << source);
+	}
+	return (0);
+}
+SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+    random_healthtest_ondemand, "I",
+    "Re-run NIST SP 800-90B startup health tests for a noise source");
+
 static int
 random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS)
 {
@@ -362,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = {
 	[RANDOM_SWI] = "SWI",
 	[RANDOM_FS_ATIME] = "FS_ATIME",
 	[RANDOM_UMA] = "UMA",
-	[RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */
+	[RANDOM_CALLOUT] = "CALLOUT",
+	[RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */
 	[RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */
 	[RANDOM_PURE_SAFE] = "PURE_SAFE",
 	[RANDOM_PURE_GLXSB] = "PURE_GLXSB",
@@ -424,6 +651,9 @@ random_harvestq_init(void *unused __unused)
 	hc_source_mask = almost_everything_mask;
 	RANDOM_HARVEST_INIT_LOCK();
 	harvest_context.hc_active_buf = 0;
+
+	for (int i = 0; i < ENTROPYSOURCE; i++)
+		random_healthtest_init(i);
 }
 SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL);
 
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index 7804bf52aa4f..1d462500df85 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -49,4 +49,6 @@ random_get_cyclecount(void)
 	return ((uint32_t)get_cyclecount());
 }
 
+bool random_harvest_healthtest(const struct harvest_event *event);
+
 #endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 9d1c7b1167c8..ced4dd8067d9 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -312,7 +312,7 @@ randomdev_accumulate(uint8_t *buf, u_int count)
 	for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
 		event.he_somecounter = random_get_cyclecount();
 		event.he_size = sizeof(event.he_entropy);
-		event.he_source = RANDOM_CACHED;
+		event.he_source = RANDOM_RANDOMDEV;
 		event.he_destination = destination++; /* Harmless cheating */
 		memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
 		p_random_alg_context->ra_event_processor(&event);
diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h
index cac743884ee6..ac58d44102a0 100644
--- a/sys/dev/ufshci/ufshci_private.h
+++ b/sys/dev/ufshci/ufshci_private.h
@@ -149,6 +149,8 @@ struct ufshci_hw_queue {
 	bus_dmamap_t queuemem_map;
 	bus_addr_t req_queue_addr;
 
+	bus_addr_t *ucd_bus_addr;
+
 	uint32_t num_entries;
 	uint32_t num_trackers;
 
@@ -198,8 +200,6 @@ struct ufshci_req_queue {
 	bus_dma_tag_t dma_tag_payload;
 
 	bus_dmamap_t ucdmem_map;
-
-	bus_addr_t ucd_addr;
 };
 
 struct ufshci_device {
diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c
index 4670281d367a..b1f303afaef5 100644
--- a/sys/dev/ufshci/ufshci_req_sdb.c
+++ b/sys/dev/ufshci/ufshci_req_sdb.c
@@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue)
 	}
 }
 
+/* bus_dmamap_load() callback: record the bus address of each UCD segment. */
+static void
+ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+	struct ufshci_hw_queue *hwq = arg;
+	int i;
+
+	if (error != 0) {
+		printf("ufshci: Failed to map UCD, error = %d\n", error);
+		return;
+	}
+
+	if (hwq->num_trackers != nseg) {	/* need one segment per tracker */
+		printf(
+		    "ufshci: Failed to map UCD, num_trackers = %u, nseg = %d\n",
+		    hwq->num_trackers, nseg);
+		return;
+	}
+
+	for (i = 0; i < nseg; i++)
+		hwq->ucd_bus_addr[i] = seg[i].ds_addr;
+}
+
 static int
 ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
     uint32_t num_entries, struct ufshci_controller *ctrlr)
@@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
 	struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q];
 	struct ufshci_tracker *tr;
 	size_t ucd_allocsz, payload_allocsz;
-	uint64_t ucdmem_phys;
 	uint8_t *ucdmem;
 	int i, error;
 
@@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
 	 * Allocate physical memory for UTP Command Descriptor (UCD)
 	 * Note: UFSHCI UCD format is restricted to 128-byte alignment.
 	 */
-	error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128,
-	    ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
-	    ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size),
-	    ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd);
+	error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0,
+	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz,
+	    howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)),
+	    sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL,
+	    &req_queue->dma_tag_ucd);
 	if (error != 0) {
 		ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n",
 		    error);
@@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
 	}
 
 	if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map,
-		ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) {
+		ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) {
 		ufshci_printf(ctrlr, "failed to load cmd desc memory\n");
 		bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd,
 		    req_queue->ucdmem_map);
@@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
 	}
 
 	req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
-	req_queue->ucd_addr = ucdmem_phys;
 
 	/*
 	 * Allocate physical memory for PRDT
@@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
 		tr->slot_state = UFSHCI_SLOT_STATE_FREE;
 
 		tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
-		tr->ucd_bus_addr = ucdmem_phys;
+		tr->ucd_bus_addr = hwq->ucd_bus_addr[i];
 
 		ucdmem += sizeof(struct ufshci_utp_cmd_desc);
-		ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc);
 
 		hwq->act_tr[i] = tr;
 	}
@@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr,
 	req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI,
 	    M_ZERO | M_NOWAIT);
 	hwq = &req_queue->hwq[UFSHCI_SDB_Q];
+	hwq->num_entries = req_queue->num_entries;
+	hwq->num_trackers = req_queue->num_trackers;
+	req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) *
+		req_queue->num_trackers,
+	    M_UFSHCI, M_ZERO | M_NOWAIT);
 
 	mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF);
 
@@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr,
 	if (mtx_initialized(&hwq->qlock))
 		mtx_destroy(&hwq->qlock);
 
+	free(req_queue->hwq->ucd_bus_addr, M_UFSHCI);
 	free(req_queue->hwq, M_UFSHCI);
 }
 
diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c
index 64039575c0ad..675c0573bd7e 100644
--- a/sys/dev/vt/hw/vga/vt_vga.c
+++ b/sys/dev/vt/hw/vga/vt_vga.c
@@ -1347,7 +1347,7 @@ vga_postswitch(struct vt_device *vd)
 
 	/* Reinit VGA mode, to restore view after app which change mode. */
 	vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE));
-	/* Ask vt(9) to update chars on visible area. */
+	/* Ask vt(4) to update chars on visible area. */
 	vd->vd_flags |= VDF_INVALID;
 }
 
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index b0f58b38a6f1..b51ef6766de4 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -125,10 +125,10 @@ static const struct terminal_class vt_termclass = {
 			(vw)->vw_number)
 
 static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
-    "vt(9) parameters");
+    "vt(4) parameters");
 static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)");
 static VT_SYSCTL_INT(enable_bell, 0, "Enable bell");
-static VT_SYSCTL_INT(debug, 0, "vt(9) debug level");
+static VT_SYSCTL_INT(debug, 0, "vt(4) debug level");
 static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode");
 static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend");
 
diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c
index 676ea5de12b8..58a22b8bdc50 100644
--- a/sys/fs/fdescfs/fdesc_vnops.c
+++ b/sys/fs/fdescfs/fdesc_vnops.c
@@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap)
 	fmp = VFSTOFDESC(ap->a_vp->v_mount);
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
+	if (ap->a_eofflag != NULL)
+		*ap->a_eofflag = 0;
 
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
@@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap)
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
-	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
+	while (uio->uio_resid >= UIO_MX) {
+		if (i >= fdp->fd_nfiles + 2) {
+			if (ap->a_eofflag != NULL)
+				*ap->a_eofflag = 1;
+			break;
+		}
 		bzero((caddr_t)dp, UIO_MX);
 		switch (i) {
 		case 0:	/* `.' */
diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c
index da4848169173..208b64930e61 100644
--- a/sys/fs/msdosfs/msdosfs_conv.c
+++ b/sys/fs/msdosfs/msdosfs_conv.c
@@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag,
 static u_char *
 dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp)
 {
-	u_char c, *outp;
-	size_t len, olen;
+	u_char c, *outp, *outp1;
+	size_t i, len, olen;
 
 	outp = outbuf;
 	if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
 		olen = len = 4;
 
+		outp1 = outp;
 		if (lower & (LCASE_BASE | LCASE_EXT))
 			msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr,
 						  ilen, (char **)&outp, &olen, KICONV_LOWER);
 		else
 			msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr,
 					     ilen, (char **)&outp, &olen);
+		for (i = 0; i < outp - outp1; i++) {
+			if (outp1[i] == '/')
+				outp1[i] = '?';
+		}
 		len -= olen;
 
 		/*
@@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc
 		c = dos2unix[c];
 		if (lower & (LCASE_BASE | LCASE_EXT))
 			c = u2l[c];
+		if (c == '/')
+			c = '?';
 		*outp++ = c;
 		outbuf[1] = '\0';
 	}
diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c
index 4c498e96a3c0..a957315aaa12 100644
--- a/sys/fs/nfs/nfs_commonsubs.c
+++ b/sys/fs/nfs/nfs_commonsubs.c
@@ -647,7 +647,8 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap,
 		    NFSATTRBIT_TIMECREATE))
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE);
 		(void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0,
-		    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+		    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL,
+		    false, false, false);
 		break;
 	}
 }
@@ -2646,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
     NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror,
     nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram,
     int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
-    struct statfs *pnfssf)
+    struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem,
+    bool has_namedattr)
 {
 	int bitpos, retnum = 0;
 	u_int32_t *tl;
@@ -2660,10 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
 	struct nfsfsinfo fsinf;
 	struct timespec temptime;
 	NFSACL_T *aclp, *naclp = NULL;
-	size_t atsiz;
-	bool xattrsupp;
 	short irflag;
-	long has_pathconf;
 #ifdef QUOTA
 	struct dqblk dqb;
 	uid_t savuid;
@@ -2747,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
 		}
 	}
 
-	/* Check to see if Extended Attributes are supported. */
-	xattrsupp = false;
-	if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) {
-		if (NFSVOPLOCK(vp, LK_SHARED) == 0) {
-			error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER,
-			    "xxx", NULL, &atsiz, cred, p);
-			NFSVOPUNLOCK(vp);
-			if (error != EOPNOTSUPP)
-				xattrsupp = true;
-		}
-	}
-
 	/*
 	 * Put out the attribute bitmap for the ones being filled in
 	 * and get the field for the number of attributes returned.
@@ -2780,11 +2767,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
 			    NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT);
 			    NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL);
 			}
-			if (cred == NULL || p == NULL || vp == NULL ||
-			    VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM,
-			    &has_pathconf) != 0)
-			    has_pathconf = 0;
-			if (has_pathconf == 0) {
+			if (!has_hiddensystem) {
 			    NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN);
 			    NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM);
 			}
@@ -2828,10 +2811,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
 			break;
 		case NFSATTRBIT_NAMEDATTR:
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
-			if (VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR,
-			    &has_pathconf) != 0)
-				has_pathconf = 0;
-			if (has_pathconf != 0)
+			if (has_namedattr)
 				*tl = newnfs_true;
 			else
 				*tl = newnfs_false;
diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h
index 3b6c1ec90c06..54f60a753c50 100644
--- a/sys/fs/nfs/nfs_var.h
+++ b/sys/fs/nfs/nfs_var.h
@@ -395,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *);
 void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int,
     struct nfsvattr *);
 int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *,
-    struct vattr *, fhandle_t *, int, nfsattrbit_t *,
-    struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *);
+    struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *,
+    NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool,
+    bool);
 void nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *);
 struct mbuf *nfsrv_adj(struct mbuf *, int, int);
 void nfsrv_postopattr(struct nfsrv_descript *, int, struct nfsvattr *);
@@ -735,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *,
     NFSPROC_T *);
 int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t,
     struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *,
-    struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t);
+    struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool,
+    bool);
 int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
     NFSACL_T *, NFSPROC_T *);
 int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c
index e0e66baca44d..2f3c59b68518 100644
--- a/sys/fs/nfsclient/nfs_clrpcops.c
+++ b/sys/fs/nfsclient/nfs_clrpcops.c
@@ -5436,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
 	(void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0,
-	    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+	    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false,
+	    false);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error)
 		return (error);
diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c
index 1ae5ed1a75ca..99a781640c53 100644
--- a/sys/fs/nfsclient/nfs_clstate.c
+++ b/sys/fs/nfsclient/nfs_clstate.c
@@ -3701,7 +3701,7 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p)
 			if (!error)
 				(void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va,
 				    NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0,
-				    (uint64_t)0, NULL);
+				    (uint64_t)0, NULL, false, false, false);
 			break;
 		case NFSV4OP_CBRECALL:
 			NFSCL_DEBUG(4, "cbrecall\n");
diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 43ee0383669f..4f0d5946d6b9 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -2112,7 +2112,8 @@ int
 nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
     struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
     struct ucred *cred, struct thread *p, int isdgram, int reterr,
-    int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
+    int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
+    bool xattrsupp, bool has_hiddensystem, bool has_namedattr)
 {
 	struct statfs *sf;
 	int error;
@@ -2131,7 +2132,7 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
 	}
 	error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
 	    attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
-	    mounted_on_fileno, sf);
+	    mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr);
 	free(sf, M_TEMP);
 	NFSEXITCODE2(0, nd);
 	return (error);
@@ -2448,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
 	struct nfsvattr nva, at, *nvap = &nva;
 	struct mbuf *mb0, *mb1;
 	struct nfsreferral *refp;
-	int nlen, r, error = 0, getret = 1, usevget = 1;
+	int nlen, r, error = 0, getret = 1, ret, usevget = 1;
 	int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
 	caddr_t bpos0, bpos1;
 	u_int64_t off, toff, verf __unused;
@@ -2462,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
 	uint64_t mounted_on_fileno;
 	struct thread *p = curthread;
 	int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1;
+	size_t atsiz;
+	long pathval;
+	bool has_hiddensystem, has_namedattr, xattrsupp;
 
 	if (nd->nd_repstat) {
 		nfsrv_postopattr(nd, getret, &at);
@@ -2936,9 +2940,32 @@ again:
 				*tl++ = newnfs_true;
 				txdr_hyper(*cookiep, tl);
 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
+				xattrsupp = false;
+				has_hiddensystem = false;
+				has_namedattr = false;
 				if (nvp != NULL) {
 					supports_nfsv4acls =
 					    nfs_supportsnfsv4acls(nvp);
+					if (NFSISSET_ATTRBIT(&attrbits,
+					    NFSATTRBIT_XATTRSUPPORT)) {
+						ret = VOP_GETEXTATTR(nvp,
+						    EXTATTR_NAMESPACE_USER,
+						    "xxx", NULL, &atsiz,
+						    nd->nd_cred, p);
+						xattrsupp = ret != EOPNOTSUPP;
+					}
+					if (VOP_PATHCONF(nvp,
+					    _PC_HAS_HIDDENSYSTEM, &pathval) !=
+					    0)
+						pathval = 0;
+					has_hiddensystem = pathval > 0;
+					pathval = 0;
+					if (NFSISSET_ATTRBIT(&attrbits,
+					    NFSATTRBIT_NAMEDATTR) &&
+					    VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR,
+					    &pathval) != 0)
+						pathval = 0;
+					has_namedattr = pathval > 0;
 					NFSVOPUNLOCK(nvp);
 				} else
 					supports_nfsv4acls = 0;
@@ -2958,13 +2985,15 @@ again:
 					    nvp, nvap, &nfh, r, &rderrbits,
 					    nd->nd_cred, p, isdgram, 0,
 					    supports_nfsv4acls, at_root,
-					    mounted_on_fileno);
+					    mounted_on_fileno, xattrsupp,
+					    has_hiddensystem, has_namedattr);
 				} else {
 					dirlen += nfsvno_fillattr(nd, new_mp,
 					    nvp, nvap, &nfh, r, &attrbits,
 					    nd->nd_cred, p, isdgram, 0,
 					    supports_nfsv4acls, at_root,
-					    mounted_on_fileno);
+					    mounted_on_fileno, xattrsupp,
+					    has_hiddensystem, has_namedattr);
 				}
 				if (nvp != NULL)
 					vrele(nvp);
@@ -6356,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
 	 * the same type (VREG).
 	 */
 	nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL,
-	    NULL, 0, 0, 0, 0, 0, NULL);
+	    NULL, 0, 0, 0, 0, 0, NULL, false, false, false);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c
index f7564ade401b..9eebcda548c6 100644
--- a/sys/fs/nfsserver/nfs_nfsdserv.c
+++ b/sys/fs/nfsserver/nfs_nfsdserv.c
@@ -241,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
 {
 	struct nfsvattr nva;
 	fhandle_t fh;
-	int at_root = 0, error = 0, supports_nfsv4acls;
+	int at_root = 0, error = 0, ret, supports_nfsv4acls;
 	struct nfsreferral *refp;
 	nfsattrbit_t attrbits, tmpbits;
 	struct mount *mp;
@@ -250,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
 	uint64_t mounted_on_fileno = 0;
 	accmode_t accmode;
 	struct thread *p = curthread;
+	size_t atsiz;
+	long pathval;
+	bool has_hiddensystem, has_namedattr, xattrsupp;
 
 	if (nd->nd_repstat)
 		goto out;
@@ -307,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
 				    &nva, &attrbits, p);
 			if (nd->nd_repstat == 0) {
 				supports_nfsv4acls = nfs_supportsnfsv4acls(vp);
+				xattrsupp = false;
+				if (NFSISSET_ATTRBIT(&attrbits,
+				    NFSATTRBIT_XATTRSUPPORT)) {
+					ret = VOP_GETEXTATTR(vp,
+					    EXTATTR_NAMESPACE_USER,
+					    "xxx", NULL, &atsiz, nd->nd_cred,
+					    p);
+					xattrsupp = ret != EOPNOTSUPP;
+				}
+				if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM,
+				    &pathval) != 0)
+					pathval = 0;
+				has_hiddensystem = pathval > 0;
+				pathval = 0;
+				if (NFSISSET_ATTRBIT(&attrbits,
+				    NFSATTRBIT_NAMEDATTR) &&
+				    VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR,
+				    &pathval) != 0)
+					pathval = 0;
+				has_namedattr = pathval > 0;
 				mp = vp->v_mount;
 				if (nfsrv_enable_crossmntpt != 0 &&
 				    vp->v_type == VDIR &&
@@ -340,7 +363,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
 					(void)nfsvno_fillattr(nd, mp, vp, &nva,
 					    &fh, 0, &attrbits, nd->nd_cred, p,
 					    isdgram, 1, supports_nfsv4acls,
-					    at_root, mounted_on_fileno);
+					    at_root, mounted_on_fileno,
+					    xattrsupp, has_hiddensystem,
+					    has_namedattr);
 					vfs_unbusy(mp);
 				}
 				vrele(vp);
@@ -4353,9 +4378,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
 	int error = 0;
 
 	NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN |
-	    NOFOLLOW);
+	    NOFOLLOW | LOCKLEAF);
 	cn.cn_nameptr = ".";
 	cn.cn_namelen = 1;
+	cn.cn_lkflags = LK_SHARED;
 	NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 	if (*tl == newnfs_true)
 		cn.cn_flags |= CREATENAMED;
@@ -4374,6 +4400,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
 		if (nd->nd_repstat == ENOATTR)
 			nd->nd_repstat = NFSERR_NOENT;
 	}
+	if (nd->nd_repstat == 0)
+		NFSVOPUNLOCK(*vpp);
 
 	vput(dp);
 	NFSEXITCODE2(0, nd);
diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c
index 56bf766ef801..227e2b93883e 100644
--- a/sys/fs/p9fs/p9fs_vnops.c
+++ b/sys/fs/p9fs/p9fs_vnops.c
@@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap)
 		return (EBADF);
 	}
 
+	if (ap->a_eofflag != NULL)
+		*ap->a_eofflag = 0;
+
 	io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK);
 
 	/* We haven't reached the end yet. read more. */
@@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap)
 		count = p9_client_readdir(vofid, (char *)io_buffer,
 		    diroffset, count);
 
-		if (count == 0)
+		if (count == 0) {
+			if (ap->a_eofflag != NULL)
+				*ap->a_eofflag = 1;
 			break;
+		}
 
 		if (count < 0) {
 			error = EIO;
diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h
index 839bbec08254..19e114763cac 100644
--- a/sys/fs/udf/ecma167-udf.h
+++ b/sys/fs/udf/ecma167-udf.h
@@ -243,7 +243,7 @@ struct part_map_spare {
 	uint8_t			n_st;	/* Number of Sparing Tables */
 	uint8_t			reserved1;
 	uint32_t		st_size;
-	uint32_t		st_loc[1];
+	uint32_t		st_loc[];
 } __packed;
 
 union udf_pmap {
@@ -266,7 +266,7 @@ struct udf_sparing_table {
 	uint16_t		rt_l;	/* Relocation Table len */
 	uint8_t			reserved[2];
 	uint32_t		seq_num;
-	struct spare_map_entry	entries[1];
+	struct spare_map_entry	entries[];
 } __packed;
 
 /* Partition Descriptor [3/10.5] */
diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c
index c7438147c0a0..c5ef1f686093 100644
--- a/sys/fs/udf/udf_vfsops.c
+++ b/sys/fs/udf/udf_vfsops.c
@@ -81,6 +81,7 @@
 #include <sys/fcntl.h>
 #include <sys/iconv.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
@@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
 	struct ifid *ifhp;
 	struct vnode *nvp;
 	struct udf_node *np;
-	off_t fsize;
+	uint64_t fsize;
 	int error;
 
 	ifhp = (struct ifid *)fhp;
@@ -741,6 +742,12 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
 
 	np = VTON(nvp);
 	fsize = le64toh(np->fentry->inf_len);
+	if (fsize > OFF_MAX) {
+		/* Do not leak nvp: it is not handed to the caller on failure. */
+		vput(nvp);
+		*vpp = NULLVP;
+		return (EIO);
+	}
 
 	*vpp = nvp;
 	vnode_create_vobject(*vpp, fsize, curthread);
diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c
index 88bf4917a851..37889241e8c3 100644
--- a/sys/fs/udf/udf_vnops.c
+++ b/sys/fs/udf/udf_vnops.c
@@ -39,6 +39,7 @@
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/iconv.h>
+#include <sys/limits.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
@@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a)
 }
 
 static int
-udf_open(struct vop_open_args *ap) {
+udf_open(struct vop_open_args *ap)
+{
 	struct udf_node *np = VTON(ap->a_vp);
-	off_t fsize;
+	uint64_t fsize;
 
 	fsize = le64toh(np->fentry->inf_len);
+	if (fsize > OFF_MAX)
+		return (EIO);
 	vnode_create_vobject(ap->a_vp, fsize, ap->a_td);
 	return 0;
 }
@@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a)
 		 * that directories consume at least one logical block,
 		 * make it appear so.
 		 */
-		if (fentry->logblks_rec != 0) {
-			vap->va_size =
-			    le64toh(fentry->logblks_rec) * node->udfmp->bsize;
-		} else {
+		vap->va_size = le64toh(fentry->logblks_rec);
+		if (vap->va_size == 0)
 			vap->va_size = node->udfmp->bsize;
-		}
+		else if (vap->va_size > UINT64_MAX / node->udfmp->bsize)
+			vap->va_size = UINT64_MAX;
+		else
+			vap->va_size *= node->udfmp->bsize;
 	} else {
 		vap->va_size = le64toh(fentry->inf_len);
 	}
@@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap)
 	struct buf *bp;
 	uint8_t *data;
 	daddr_t lbn, rablock;
+	uint64_t len;
 	off_t diff, fsize;
 	ssize_t n;
 	int error = 0;
@@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap)
 		return (error);
 	}
 
-	fsize = le64toh(node->fentry->inf_len);
+	len = le64toh(node->fentry->inf_len);
+	if (len > OFF_MAX) {
+		/* too big, just cap to the requested length */
+		len = uio->uio_resid;
+	}
+	fsize = len;
 	udfmp = node->udfmp;
 	do {
 		lbn = lblkno(udfmp, uio->uio_offset);
@@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a)
 	struct udf_uiodir uiodir;
 	struct udf_dirstream *ds;
 	uint64_t *cookies = NULL;
+	uint64_t len;
 	int ncookies;
 	int error = 0;
 
@@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a)
 	 * Iterate through the file id descriptors.  Give the parent dir
 	 * entry special attention.
 	 */
-	ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
-	    node->udfmp);
+	len = le64toh(node->fentry->inf_len);
+	if (len > INT_MAX) {
+		/* too big, just cap to INT_MAX */
+		len = INT_MAX;
+	}
+	ds = udf_opendir(node, uio->uio_offset, len, node->udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 		/* XXX Should we return an error on a bad fid? */
@@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap)
 	struct udf_node *node;
 	void *buf;
 	char *cp;
-	int error, len, root;
+	uint64_t len;
+	int error, root;
 
 	/*
 	 * A symbolic link in UDF is a list of variable-length path
@@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap)
 	vp = ap->a_vp;
 	node = VTON(vp);
 	len = le64toh(node->fentry->inf_len);
+	if (len > MAXPATHLEN)
+		return (EIO);
 	buf = malloc(len, M_DEVBUF, M_WAITOK);
 	iov[0].iov_len = len;
 	iov[0].iov_base = buf;
@@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a)
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid = NULL;
 	struct udf_dirstream *ds;
+	uint64_t fsize;
 	u_long nameiop;
 	u_long flags;
 	char *nameptr;
 	long namelen;
 	ino_t id = 0;
 	int offset, error = 0;
-	int fsize, lkflags, ltype, numdirpasses;
+	int lkflags, ltype, numdirpasses;
 
 	dvp = a->a_dvp;
 	node = VTON(dvp);
@@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a)
 	nameptr = a->a_cnp->cn_nameptr;
 	namelen = a->a_cnp->cn_namelen;
 	fsize = le64toh(node->fentry->inf_len);
+	if (fsize > INT_MAX) {
+		/* too big, just cap to INT_MAX */
+		fsize = INT_MAX;
+	}
 
 	/*
 	 * If this is a LOOKUP and we've already partially searched through
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 5065b7e61ee8..b44f5e08bbcf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void)
 
 #ifdef PMAP_PAE_COMP
 static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
-    int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp,
+    int flags)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
-	*flags = UMA_SLAB_KERNEL;
+	*sflagsp = UMA_SLAB_KERNEL;
+	/* contig allocations cannot be NEVERFREED */
+	flags &= ~M_NEVERFREED;
 	return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
-	    bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
+	    bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
 }
 #endif
 
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
 	if (__predict_false(!kasan_enabled))
 		return;
 
-	if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
-	    (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+	if (kasan_md_unsupported((vm_offset_t)addr))
 		return;
 
 	KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 94e44d888181..b472aaea89e6 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -2309,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
 			return (EINVAL);
 		td->td_pflags2 &= ~TDP2_UEXTERR;
 		return (0);
+	case EXTERRCTL_UD:
+		/*
+		 * Important: this code must always return EINVAL and never any
+		 * extended error, for testing purposes.
+		 */
+		/* FALLTHROUGH */
 	default:
 		return (EINVAL);
 	}
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 3d455b3874cc..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -332,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
 
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+    "enum cache_fpl_status");
 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 
@@ -6420,15 +6421,11 @@ out:
 	cache_fpl_smr_assert_not_entered(&fpl);
 	cache_fpl_assert_status(&fpl);
 	*status = fpl.status;
-	if (SDT_PROBES_ENABLED()) {
-		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
-		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
-			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
-			    ndp);
-	}
-
+	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
 		MPASS(error != CACHE_FPL_FAILED);
+		SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+		    ndp);
 		if (error != 0) {
 			cache_fpl_cleanup_cnp(fpl.cnp);
 			MPASS(fpl.dvp == NULL);
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index 2b42228465a4..d3cd0d1f9832 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -371,7 +371,7 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc
 
 	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
 	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
-		vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+		vn_irflag_unset(vp, VIRF_INOTIFY);
 }
 
 /*
@@ -675,7 +675,8 @@ vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
 					struct vattr va;
 					int error;
 
-					error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+					error = VOP_GETATTR(vp, &va,
+					    cnp->cn_cred);
 					if (error == 0 && va.va_nlink != 0)
 						selfevent = 0;
 				}
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 9922796f8a1d..7cb6e2124326 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -326,6 +326,7 @@ SUBDIR=	\
 	proto \
 	pseudofs \
 	${_pst} \
+	${_pt} \
 	pty  \
 	puc \
 	pwm \
@@ -842,6 +843,7 @@ _iwx=		iwx
 _ixl=		ixl
 _nvdimm=	nvdimm
 _pms=		pms
+_pt=		pt
 _qat=		qat
 .if ${MK_SOURCELESS_UCODE} != "no"
 _qatfw=		qatfw
diff --git a/sys/modules/efirt/Makefile b/sys/modules/efirt/Makefile
index 4738996fd4e6..c46484465b68 100644
--- a/sys/modules/efirt/Makefile
+++ b/sys/modules/efirt/Makefile
@@ -9,7 +9,7 @@ SRCS+=  device_if.h bus_if.h clock_if.h
 DPSRCS+= assym.inc
 
 .if ${MACHINE_CPUARCH} == "amd64"
-SRCS+=	opt_hwpmc_hooks.h opt_kstack_pages.h
+SRCS+=	opt_acpi.h opt_hwpmc_hooks.h opt_kstack_pages.h
 .endif
 
 efirt_support.o:	efirt_support.S assym.inc
diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile
index 91f20193d878..9f9c9f602cda 100644
--- a/sys/modules/ice/Makefile
+++ b/sys/modules/ice/Makefile
@@ -13,6 +13,7 @@ SRCS    += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h
 SRCS    += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c
 SRCS    += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c
 SRCS	+= ice_fw_logging.c ice_ddp_common.c
+SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c
 
 # RDMA Client interface
 # TODO: Is this the right way to compile this?
diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile
new file mode 100644
index 000000000000..416b072face9
--- /dev/null
+++ b/sys/modules/pt/Makefile
@@ -0,0 +1,8 @@
+
+.PATH: ${SRCTOP}/sys/amd64/pt
+
+KMOD=	pt
+SRCS=	pt.c pt.h device_if.h bus_if.h
+SRCS+=	opt_hwpmc_hooks.h opt_kstack_pages.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile
index 3d8415cf0e57..2a44ae6ddde5 100644
--- a/sys/modules/qlnx/qlnxe/Makefile
+++ b/sys/modules/qlnx/qlnxe/Makefile
@@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c
 
 SRCS+=qlnx_ioctl.c
 SRCS+=qlnx_os.c
+SRCS+=opt_inet.h
 
 SRCS+=	${LINUXKPI_GENSRCS}
 
diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h
index cf4f75bd0b6c..01485cf26e06 100644
--- a/sys/net/ethernet.h
+++ b/sys/net/ethernet.h
@@ -62,6 +62,8 @@ struct ether_header {
 	u_char	ether_shost[ETHER_ADDR_LEN];
 	u_short	ether_type;
 } __packed;
+_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN,
+    "size of struct ether_header is wrong");
 
 /*
  * Structure of a 48-bit Ethernet address.
@@ -69,6 +71,8 @@ struct ether_header {
 struct ether_addr {
 	u_char octet[ETHER_ADDR_LEN];
 } __packed;
+_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN,
+    "size of struct ether_addr is wrong");
 
 #define	ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
 #define	ETHER_IS_IPV6_MULTICAST(addr) \
@@ -112,6 +116,8 @@ struct ether_vlan_header {
 	uint16_t evl_tag;
 	uint16_t evl_proto;
 } __packed;
+_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN,
+    "size of struct ether_vlan_header is wrong");
 
 #define	EVL_VLID_MASK		0x0FFF
 #define	EVL_PRI_MASK		0xE000
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 7be4dfac23e7..3ae0c01c0efc 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -92,11 +92,6 @@
 
 #include <crypto/sha1.h>
 
-#ifdef CTASSERT
-CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
-CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
-#endif
-
 VNET_DEFINE(pfil_head_t, link_pfil_head);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h
index 3c1846b8f82a..c6692d3dd6bc 100644
--- a/sys/net/if_gif.h
+++ b/sys/net/if_gif.h
@@ -120,7 +120,8 @@ int in6_gif_setopts(struct gif_softc *, u_int);
 #define GIFGOPTS	_IOWR('i', 150, struct ifreq)
 #define GIFSOPTS	_IOW('i', 151, struct ifreq)
 
+#define	GIF_NOCLAMP		0x0001
 #define	GIF_IGNORE_SOURCE	0x0002
-#define	GIF_OPTMASK		(GIF_IGNORE_SOURCE)
+#define	GIF_OPTMASK		(GIF_NOCLAMP|GIF_IGNORE_SOURCE)
 
 #endif /* _NET_IF_GIF_H_ */
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 1f2011634695..452a8eb4024b 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -1370,7 +1370,6 @@ struct pf_kruleset {
 		struct pf_krulequeue	 queues[2];
 		struct {
 			struct pf_krulequeue	*ptr;
-			struct pf_krule		**ptr_array;
 			u_int32_t		 rcount;
 			u_int32_t		 ticket;
 			int			 open;
diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c
index c5a478533313..9074878e17e4 100644
--- a/sys/net80211/ieee80211_hostap.c
+++ b/sys/net80211/ieee80211_hostap.c
@@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
 
 		/* VHT */
 		if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
-		    vhtcap != NULL &&
-		    vhtinfo != NULL) {
-			/* XXX TODO; see below */
-			net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__);
+		    vhtcap != NULL) {
 			ieee80211_vht_node_init(ni);
-			ieee80211_vht_update_cap(ni, vhtcap, vhtinfo);
+			ieee80211_vht_update_cap(ni, vhtcap);
 		} else if (ni->ni_flags & IEEE80211_NODE_VHT)
 			ieee80211_vht_node_cleanup(ni);
 
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 5ec80e3646b8..c28f124648a1 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -1952,6 +1952,11 @@ do {									\
 		_RETURN_CHAN_BITS(0);
 
 	/*
+	 * TODO: should we bail out if there's no htinfo?
+	 * Or just treat it as if we can't do the HT20/HT40 check?
+	 */
+
+	/*
 	 * The original code was based on
 	 * 802.11ac-2013, Table 8-183x-VHT Operation Information subfields.
 	 * 802.11-2020, Table 9-274-VHT Operation Information subfields
@@ -1962,8 +1967,12 @@ do {									\
 	 */
 
 	htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie;
-	ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
-	    IEEE80211_HTINFO_TXWIDTH_2040);
+	if (htinfo != NULL)
+		ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
+		    IEEE80211_HTINFO_TXWIDTH_2040);
+	else
+		ht40 = false;
+
 	can_vht160 = can_vht80p80 = can_vht80 = false;
 
 	/* 20 Mhz */
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index e91977f1ef98..de0b691d4d2a 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni)
 }
 
 void
-ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie,
-    const uint8_t *vhtop_ie)
+ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie)
 {
 
 	ieee80211_parse_vhtcap(ni, vhtcap_ie);
-	ieee80211_parse_vhtopmode(ni, vhtop_ie);
 }
 
 static struct ieee80211_channel *
diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h
index 2964de63c343..a1529df4a85b 100644
--- a/sys/net80211/ieee80211_vht.h
+++ b/sys/net80211/ieee80211_vht.h
@@ -52,8 +52,7 @@ uint8_t *	ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *);
 uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *,
     struct ieee80211_channel *);
 
-void	ieee80211_vht_update_cap(struct ieee80211_node *,
-	    const uint8_t *, const uint8_t *);
+void	ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *);
 
 struct ieee80211_channel *
 	ieee80211_vht_adjust_channel(struct ieee80211com *,
diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h
index b1f2b0ebf911..d6b75e482e35 100644
--- a/sys/netinet/icmp_var.h
+++ b/sys/netinet/icmp_var.h
@@ -104,11 +104,10 @@ extern int badport_bandlim(int);
 #define BANDLIM_ICMP_UNREACH 0
 #define BANDLIM_ICMP_ECHO 1
 #define BANDLIM_ICMP_TSTAMP 2
-#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
-#define BANDLIM_RST_OPENPORT 4   /* No connection, listener */
-#define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_SCTP_OOTB 6
-#define BANDLIM_MAX 7
+#define BANDLIM_TCP_RST 3
+#define BANDLIM_ICMP6_UNREACH 4
+#define BANDLIM_SCTP_OOTB 5
+#define BANDLIM_MAX 6
 #endif
 
 #endif
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index cb4b6df57c57..71b75d18efd0 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -1097,8 +1097,7 @@ static const char *icmp_rate_descrs[BANDLIM_MAX] = {
 	[BANDLIM_ICMP_UNREACH] = "icmp unreach",
 	[BANDLIM_ICMP_ECHO] = "icmp ping",
 	[BANDLIM_ICMP_TSTAMP] = "icmp tstamp",
-	[BANDLIM_RST_CLOSEDPORT] = "closed port RST",
-	[BANDLIM_RST_OPENPORT] = "open port RST",
+	[BANDLIM_TCP_RST] = "tcp reset",
 	[BANDLIM_ICMP6_UNREACH] = "icmp6 unreach",
 	[BANDLIM_SCTP_OOTB] = "sctp ootb",
 };
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 91f8251589e4..b60cdf45af52 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -433,38 +433,40 @@ static void
 tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
     int slots_to_run, int idx, bool from_callout)
 {
-	union tcp_log_stackspecific log;
-	/*
-	 * Unused logs are
-	 * 64 bit - delRate, rttProp, bw_inuse
-	 * 16 bit - cwnd_gain
-	 *  8 bit - bbr_state, bbr_substate, inhpts;
-	 */
-	memset(&log, 0, sizeof(log));
-	log.u_bbr.flex1 = hpts->p_nxt_slot;
-	log.u_bbr.flex2 = hpts->p_cur_slot;
-	log.u_bbr.flex3 = hpts->p_prev_slot;
-	log.u_bbr.flex4 = idx;
-	log.u_bbr.flex5 = hpts->p_curtick;
-	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
-	log.u_bbr.flex7 = hpts->p_cpu;
-	log.u_bbr.flex8 = (uint8_t)from_callout;
-	log.u_bbr.inflight = slots_to_run;
-	log.u_bbr.applimited = hpts->overidden_sleep;
-	log.u_bbr.delivered = hpts->saved_curtick;
-	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
-	log.u_bbr.epoch = hpts->saved_curslot;
-	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
-	log.u_bbr.pkts_out = hpts->p_delayed_by;
-	log.u_bbr.lost = hpts->p_hpts_sleep_time;
-	log.u_bbr.pacing_gain = hpts->p_cpu;
-	log.u_bbr.pkt_epoch = hpts->p_runningslot;
-	log.u_bbr.use_lt_bw = 1;
-	TCP_LOG_EVENTP(tp, NULL,
-		       &tptosocket(tp)->so_rcv,
-		       &tptosocket(tp)->so_snd,
-		       BBR_LOG_HPTSDIAG, 0,
-		       0, &log, false, tv);
+	if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
+		union tcp_log_stackspecific log;
+		/*
+		 * Unused logs are
+		 * 64 bit - delRate, rttProp, bw_inuse
+		 * 16 bit - cwnd_gain
+		 *  8 bit - bbr_state, bbr_substate, inhpts;
+		 */
+		memset(&log, 0, sizeof(log));
+		log.u_bbr.flex1 = hpts->p_nxt_slot;
+		log.u_bbr.flex2 = hpts->p_cur_slot;
+		log.u_bbr.flex3 = hpts->p_prev_slot;
+		log.u_bbr.flex4 = idx;
+		log.u_bbr.flex5 = hpts->p_curtick;
+		log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+		log.u_bbr.flex7 = hpts->p_cpu;
+		log.u_bbr.flex8 = (uint8_t)from_callout;
+		log.u_bbr.inflight = slots_to_run;
+		log.u_bbr.applimited = hpts->overidden_sleep;
+		log.u_bbr.delivered = hpts->saved_curtick;
+		log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+		log.u_bbr.epoch = hpts->saved_curslot;
+		log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+		log.u_bbr.pkts_out = hpts->p_delayed_by;
+		log.u_bbr.lost = hpts->p_hpts_sleep_time;
+		log.u_bbr.pacing_gain = hpts->p_cpu;
+		log.u_bbr.pkt_epoch = hpts->p_runningslot;
+		log.u_bbr.use_lt_bw = 1;
+		TCP_LOG_EVENTP(tp, NULL,
+			&tptosocket(tp)->so_rcv,
+			&tptosocket(tp)->so_snd,
+			BBR_LOG_HPTSDIAG, 0,
+			0, &log, false, tv);
+	}
 }
 
 static void
@@ -1353,10 +1355,7 @@ again:
 			}
 			CURVNET_SET(inp->inp_vnet);
 			/* Lets do any logging that we might want to */
-			if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
-				tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
-				    from_callout);
-			}
+			tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
 
 			if (tp->t_fb_ptr != NULL) {
 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
@@ -1487,7 +1486,7 @@ no_run:
 }
 
 void
-__tcp_set_hpts(struct tcpcb *tp, int32_t line)
+tcp_set_hpts(struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 	int failed;
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index b097a2b98db9..f5856ed8e688 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
 #define	tcp_hpts_insert(inp, slot)	\
 	tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
 
-void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
-#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+void tcp_set_hpts(struct tcpcb *tp);
 
 void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
 
@@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time;
  * The following functions should also be available
  * to userspace as well.
  */
-static __inline uint32_t
+static inline uint32_t
 tcp_tv_to_hptstick(const struct timeval *sv)
 {
 	return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
 }
 
-static __inline uint32_t
+static inline uint32_t
 tcp_tv_to_usectick(const struct timeval *sv)
 {
 	return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
 }
 
-static __inline uint32_t
+static inline uint32_t
 tcp_tv_to_mssectick(const struct timeval *sv)
 {
 	return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
 }
 
-static __inline uint64_t
+static inline uint64_t
 tcp_tv_to_lusectick(const struct timeval *sv)
 {
 	return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
@@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void)
 	return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
 }
 
-static __inline uint32_t
+static inline uint32_t
 tcp_gethptstick(struct timeval *sv)
 {
 	struct timeval tv;
@@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv)
 	return (tcp_tv_to_hptstick(sv));
 }
 
-static __inline uint64_t
+static inline uint64_t
 tcp_get_u64_usecs(struct timeval *tv)
 {
 	struct timeval tvd;
@@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv)
 	return (tcp_tv_to_lusectick(tv));
 }
 
-static __inline uint32_t
+static inline uint32_t
 tcp_get_usecs(struct timeval *tv)
 {
 	struct timeval tvd;
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 7c032e13f37a..de428ae1af6f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -621,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
+	bool closed_port = false;	/* segment is hitting a closed port */
 
 	NET_EPOCH_ASSERT();
 
@@ -907,7 +908,8 @@ findpcb:
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
-		rstreason = BANDLIM_RST_CLOSEDPORT;
+		rstreason = BANDLIM_TCP_RST;
+		closed_port = true;
 		goto dropwithreset;
 	}
 	INP_LOCK_ASSERT(inp);
@@ -998,12 +1000,14 @@ findpcb:
 		 * down or it is in the CLOSED state.  Either way we drop the
 		 * segment and send an appropriate response.
 		 */
-		rstreason = BANDLIM_RST_CLOSEDPORT;
+		rstreason = BANDLIM_TCP_RST;
+		closed_port = true;
 		goto dropwithreset;
 	}
 
 	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
-		rstreason = BANDLIM_RST_CLOSEDPORT;
+		rstreason = BANDLIM_TCP_RST;
+		closed_port = true;
 		goto dropwithreset;
 	}
 
@@ -1055,6 +1059,8 @@ findpcb:
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+			int result;
+
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
@@ -1064,8 +1070,8 @@ findpcb:
 			/*
 			 * NB: syncache_expand() doesn't unlock inp.
 			 */
-			rstreason = syncache_expand(&inc, &to, th, &so, m, port);
-			if (rstreason < 0) {
+			result = syncache_expand(&inc, &to, th, &so, m, port);
+			if (result < 0) {
 				/*
 				 * A failing TCP MD5 signature comparison
 				 * must result in the segment being dropped
@@ -1073,7 +1079,7 @@ findpcb:
 				 * to the sender.
 				 */
 				goto dropunlock;
-			} else if (rstreason == 0) {
+			} else if (result == 0) {
 				/*
 				 * No syncache entry, or ACK was not for our
 				 * SYN/ACK.  Do our protection against double
@@ -1092,7 +1098,7 @@ findpcb:
 				 * of the failure cause.
 				 */
 				INP_WUNLOCK(inp);
-				rstreason = BANDLIM_RST_OPENPORT;
+				rstreason = BANDLIM_TCP_RST;
 				lookupflag &= ~INPLOOKUP_WILDCARD;
 				goto findpcb;
 			}
@@ -1183,7 +1189,7 @@ tfo_socket_result:
 				    s, __func__);
 			syncache_badack(&inc, port);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
-			rstreason = BANDLIM_RST_OPENPORT;
+			rstreason = BANDLIM_TCP_RST;
 			goto dropwithreset;
 		}
 		/*
@@ -1259,7 +1265,7 @@ tfo_socket_result:
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
-				rstreason = BANDLIM_RST_OPENPORT;
+				rstreason = BANDLIM_TCP_RST;
 				goto dropwithreset;
 			}
 		}
@@ -1380,9 +1386,10 @@ dropwithreset:
 	 * When blackholing do not respond with a RST but
 	 * completely ignore the segment and drop it.
 	 */
-	if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) ||
-	    (rstreason == BANDLIM_RST_CLOSEDPORT &&
-	    ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
+	if (rstreason == BANDLIM_TCP_RST &&
+	    ((!closed_port && V_blackhole == 3) ||
+	     (closed_port &&
+	      ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
 	    (V_blackhole_local || (
 #ifdef INET6
 	    isipv6 ? !in6_localip(&ip6->ip6_src) :
@@ -1515,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 	struct tcpopt to;
 	int tfo_syn;
 	u_int maxseg = 0;
+	bool no_data;
 
+	no_data = (tlen == 0);
 	thflags = tcp_get_flags(th);
 	tp->sackhint.last_sack_ack = 0;
 	sack_changed = SACK_NOCHANGE;
@@ -1754,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 			tp->ts_recent = to.to_tsval;
 		}
 
-		if (tlen == 0) {
+		if (no_data) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    !IN_RECOVERY(tp->t_flags) &&
@@ -1963,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
-				rstreason = BANDLIM_RST_OPENPORT;
+				rstreason = BANDLIM_TCP_RST;
 				tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 				goto dropwithreset;
 		}
@@ -1976,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 			 * FIN, or a RST.
 			 */
 			if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
-				rstreason = BANDLIM_RST_OPENPORT;
+				rstreason = BANDLIM_TCP_RST;
 				tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 				goto dropwithreset;
 			} else if (thflags & TH_SYN) {
@@ -2244,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
-		rstreason = BANDLIM_RST_OPENPORT;
+		rstreason = BANDLIM_TCP_RST;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		goto dropwithreset;
 	}
@@ -2557,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			maxseg = tcp_maxseg(tp);
-			if (tlen == 0 &&
+			if (no_data &&
 			    (tiwin == tp->snd_wnd ||
 			    (tp->t_flags & TF_SACK_PERMIT))) {
 				/*
@@ -3113,8 +3122,7 @@ step6:
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
-		if (tlen == 0 &&
-		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+		if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
@@ -3424,7 +3432,7 @@ dropafterack:
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
-		rstreason = BANDLIM_RST_OPENPORT;
+		rstreason = BANDLIM_TCP_RST;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		goto dropwithreset;
 	}
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 75d693bc019b..e24790ece43d 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -2878,7 +2878,7 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
 	/* double check log state now that we have the lock */
 	if (inp->inp_flags & INP_DROPPED)
 		goto done;
-	if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
+	if (tcp_bblogging_on(tp)) {
 		struct timeval tv;
 		tcp_log_eventspecific_t log;
 
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index fef32e16b2e4..3e7eef8a1cda 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -539,12 +539,12 @@ struct tcpcb;
 			    NULL, NULL, 0, NULL);			\
 	} while (0)
 #endif /* TCP_LOG_FORCEVERBOSE */
+/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). */
 #define	TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
 	do {								\
-		if (tcp_bblogging_on(tp))				\
-			tcp_log_event(tp, th, rxbuf, txbuf, eventid,	\
-			    errornum, len, stackinfo, th_hostorder,	\
-			    NULL, NULL, 0, tv);				\
+		KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \
+		tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len,	\
+			stackinfo, th_hostorder, NULL, NULL, 0, tv); \
 	} while (0)
 
 #ifdef TCP_BLACKBOX
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index e2cfec5c9275..d2636f01714e 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	     SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if (tp->t_flags & TF_FASTOPEN) {
@@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/* non-initial SYN is ignored */
@@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if (tiwin > bbr->r_ctl.rc_high_rwnd)
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 8e05498863b9..834e1347a152 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -40,7 +40,6 @@
 #endif
 #include <sys/lock.h>
 #include <sys/malloc.h>
-#include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
@@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0;
 static uint32_t rack_pcm_is_enabled = 1;
 static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
 
-static uint32_t rack_gp_gain_req = 1200;		/* Amount percent wise required to gain to record a round has "gaining" */
+static uint32_t rack_gp_gain_req = 1200;		/* Amount percent wise required to gain to record a round as "gaining" */
 static uint32_t rack_rnd_cnt_req = 0x10005;		/* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
 
 
@@ -938,7 +937,7 @@ rack_init_sysctls(void)
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "time_between", CTLFLAG_RW,
-	    & rack_time_between_probertt, 96000000,
+	    &rack_time_between_probertt, 96000000,
 	    "How many useconds between the lowest rtt falling must past before we enter probertt");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
@@ -3480,9 +3479,9 @@ static void
 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	if (rsm->r_flags & RACK_APP_LIMITED) {
-		if (rack->r_ctl.rc_app_limited_cnt > 0) {
-			rack->r_ctl.rc_app_limited_cnt--;
-		}
+		KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
+		    ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
+		rack->r_ctl.rc_app_limited_cnt--;
 	}
 	if (rsm->r_limit_type) {
 		/* currently there is only one limit type */
@@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
 	 * earlier.
 	 *
 	 * So lets calculate the BDP with the "known" b/w using
-	 * the SRTT has our rtt and then multiply it by the
-	 * goal.
+	 * the SRTT as our rtt and then multiply it by the goal.
 	 */
 	bw = rack_get_bw(rack);
 	srtt = (uint64_t)tp->t_srtt;
@@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
 		tp->t_badrxtwin = 0;
 		break;
 	}
-	if ((CC_ALGO(tp)->cong_signal != NULL)  &&
+	if ((CC_ALGO(tp)->cong_signal != NULL) &&
 	    (type != CC_RTO)){
 		tp->t_ccv.curack = ack;
 		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
@@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li
 	 *
 	 * If reorder-fade is configured, then we track the last time we saw
 	 * re-ordering occur. If we reach the point where enough time as
-	 * passed we no longer consider reordering has occuring.
+	 * passed we no longer consider reordering as occurring.
 	 *
 	 * Or if reorder-face is 0, then once we see reordering we consider
 	 * the connection to alway be subject to reordering and just set lro
@@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
 	/* Push bit must go to the right edge as well */
 	if (rsm->r_flags & RACK_HAD_PUSH)
 		rsm->r_flags &= ~RACK_HAD_PUSH;
+	/* Update the count if app limited */
+	if (nrsm->r_flags & RACK_APP_LIMITED)
+		rack->r_ctl.rc_app_limited_cnt++;
 	/* Clone over the state of the hw_tls flag */
 	nrsm->r_hw_tls = rsm->r_hw_tls;
 	/*
@@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack,
 		l_rsm->r_flags |= RACK_TLP;
 	if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
 		l_rsm->r_flags |= RACK_RWND_COLLAPSED;
-	if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
+	if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
 	    ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
 		/*
 		 * If both are app-limited then let the
@@ -8137,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
 		 * remove the lost desgination and reduce the
 		 * bytes considered lost.
 		 */
-		rsm->r_flags  &= ~RACK_WAS_LOST;
+		rsm->r_flags &= ~RACK_WAS_LOST;
 		KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
 			("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack));
 		if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -8832,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts
 
 				val = rack_probertt_lower_within * rack_time_between_probertt;
 				val /= 100;
-				if ((rack->in_probe_rtt == 0)  &&
+				if ((rack->in_probe_rtt == 0) &&
 				    (rack->rc_skip_timely == 0) &&
 				    ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val)))	{
 					rack_enter_probertt(rack, us_cts);
@@ -10369,7 +10370,7 @@ more:
 			 * and yet before retransmitting we get an ack
 			 * which can happen due to reordering.
 			 */
-			rsm->r_flags  &= ~RACK_WAS_LOST;
+			rsm->r_flags &= ~RACK_WAS_LOST;
 			KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
 				("rsm:%p rack:%p rc_considered_lost goes negative", rsm,  rack));
 			if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -11065,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
 		 * We need to skip anything already set
 		 * to be retransmitted.
 		 */
-		if ((rsm->r_dupack >= DUP_ACK_THRESHOLD)  ||
+		if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
 		    (rsm->r_flags & RACK_MUST_RXT)) {
 			rsm = TAILQ_NEXT(rsm, r_tnext);
 			continue;
@@ -12875,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -13089,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if (tp->t_flags & TF_FASTOPEN) {
@@ -13102,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/* non-initial SYN is ignored */
@@ -13136,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return (1);
 	}
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -13399,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -13495,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 						tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -13645,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 						tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -13746,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 						tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -13848,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 						tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -13952,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (ctf_progress_timeout_check(tp, true)) {
 			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 						tp, tick, PROGRESS_DROP, __LINE__);
-			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
 			return (1);
 		}
 	}
@@ -16655,7 +16656,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
@@ -16919,7 +16920,7 @@ do_output_now:
 		} else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
 			goto do_output_now;
 		} else if ((no_output == 1) &&
-			   (nxt_pkt == 0)  &&
+			   (nxt_pkt == 0) &&
 			   (tcp_in_hpts(rack->rc_tp) == 0)) {
 			/*
 			 * We are not in hpts and we had a pacing timer up. Use
@@ -17546,7 +17547,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
 						   rack->r_ctl.rc_last_us_rtt,
 						   88, __LINE__, NULL, gain);
 		}
-		if (((bw_est == 0) || (rate_wanted == 0)  || (rack->gp_ready == 0)) &&
+		if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
 		    (rack->use_fixed_rate == 0)) {
 			/*
 			 * No way yet to make a b/w estimate or
@@ -17986,7 +17987,7 @@ start_set:
 		tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
 		rack->r_ctl.rc_gp_cumack_ts = 0;
 		if ((rack->r_ctl.cleared_app_ack == 1) &&
-		    (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+		    (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
 			/*
 			 * We just cleared an application limited period
 			 * so the next seq out needs to skip the first
@@ -20043,7 +20044,7 @@ again:
 			rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
 		}
 	}
-	if ((rack->r_ctl.pcm_max_seg != 0)  && (rack->pcm_needed == 1)) {
+	if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
 		uint32_t rw_avail, cwa;
 
 		if (tp->snd_wnd > ctf_outstanding(tp))
@@ -21031,7 +21032,7 @@ just_return_nolock:
 					} else
 						log = 1;
 				}
-				/* Mark the last packet has app limited */
+				/* Mark the last packet as app limited */
 				rsm = tqhash_max(rack->r_ctl.tqh);
 				if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
 					if (rack->r_ctl.rc_app_limited_cnt == 0)
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index da26b8cb1f9b..d1c4ba58bf55 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -672,7 +672,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		*ret_val = 1;
-		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
 		return;
 	} else
 		*ret_val = 0;
diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c
index d476829e8e3b..2bab1c57ce2a 100644
--- a/sys/netinet6/in6_gif.c
+++ b/sys/netinet6/in6_gif.c
@@ -194,6 +194,11 @@ in6_gif_setopts(struct gif_softc *sc, u_int options)
 		sc->gif_options = options;
 		in6_gif_attach(sc);
 	}
+
+	if ((options & GIF_NOCLAMP) !=
+	    (sc->gif_options & GIF_NOCLAMP)) {
+		sc->gif_options = options;
+	}
 	return (0);
 }
 
@@ -289,6 +294,7 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 {
 	struct gif_softc *sc = ifp->if_softc;
 	struct ip6_hdr *ip6;
+	u_long mtu;
 
 	/* prepend new IP header */
 	NET_EPOCH_ASSERT();
@@ -304,11 +310,15 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 	ip6->ip6_nxt	= proto;
 	ip6->ip6_hlim	= V_ip6_gif_hlim;
 	/*
-	 * force fragmentation to minimum MTU, to avoid path MTU discovery.
-	 * it is too painful to ask for resend of inner packet, to achieve
-	 * path MTU discovery for encapsulated packets.
+	 * Unless GIF_NOCLAMP is set (it is clear by default), force
+	 * fragmentation to the minimum MTU, even if the interface MTU is
+	 * larger, to avoid path MTU discovery.  IPv6 routers do not fragment
+	 * in transit, and it is too painful to ask for the inner packet to be
+	 * resent in order to do path MTU discovery for encapsulated packets.
 	 */
-	return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL));
+	mtu = ((sc->gif_options & GIF_NOCLAMP) == 0) ? IPV6_MINMTU : 0;
+
+	return (ip6_output(m, 0, NULL, mtu, 0, NULL, NULL));
 }
 
 static int
diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c
index 06fe9e8820c9..a825658bd9ee 100644
--- a/sys/netinet6/mld6.c
+++ b/sys/netinet6/mld6.c
@@ -234,17 +234,20 @@ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
     "Per-interface MLDv2 state");
 
-static int	mld_v1enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
-    &mld_v1enable, 0, "Enable fallback to MLDv1");
+VNET_DEFINE_STATIC(bool, mld_v1enable) = true;
+#define	V_mld_v1enable	VNET(mld_v1enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+    &VNET_NAME(mld_v1enable), 0, "Enable fallback to MLDv1");
 
-static int	mld_v2enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN,
-    &mld_v2enable, 0, "Enable MLDv2");
+VNET_DEFINE_STATIC(bool, mld_v2enable) = true;
+#define	V_mld_v2enable	VNET(mld_v2enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+    &VNET_NAME(mld_v2enable), 0, "Enable MLDv2");
 
-static int	mld_use_allow = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
-    &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
+VNET_DEFINE_STATIC(bool, mld_use_allow) = true;
+#define	V_mld_use_allow	VNET(mld_use_allow)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_VNET | CTLFLAG_RWTUN,
+    &VNET_NAME(mld_use_allow), 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
 
 /*
  * Packed Router Alert option structure declaration.
@@ -481,7 +484,7 @@ mld_domifattach(struct ifnet *ifp)
 	mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
 	if ((ifp->if_flags & IFF_MULTICAST) == 0)
 		mli->mli_flags |= MLIF_SILENT;
-	if (mld_use_allow)
+	if (V_mld_use_allow)
 		mli->mli_flags |= MLIF_USEALLOW;
 
 	MLD_LOCK();
@@ -614,7 +617,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
 
 	is_general_query = 0;
 
-	if (!mld_v1enable) {
+	if (!V_mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
@@ -790,7 +793,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
 
 	NET_EPOCH_ASSERT();
 
-	if (!mld_v2enable) {
+	if (!V_mld_v2enable) {
 		CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
@@ -1076,7 +1079,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
 
 	NET_EPOCH_ASSERT();
 
-	if (!mld_v1enable) {
+	if (!V_mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c
index 0379ef7c789a..c90a1213bd66 100644
--- a/sys/netinet6/raw_ip6.c
+++ b/sys/netinet6/raw_ip6.c
@@ -765,8 +765,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 	}
 	if (ifa != NULL &&
 	    ((struct in6_ifaddr *)ifa)->ia6_flags &
-	    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
-	     IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
+	    (IN6_IFF_NOTREADY|IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c
index 6bacc68b7441..92d0201b398a 100644
--- a/sys/netipsec/ipsec.c
+++ b/sys/netipsec/ipsec.c
@@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp)
 
 #ifdef IPSEC_OFFLOAD
 	tag = ipsec_accel_input_tag_lookup(m);
-	if (tag != NULL)
-		return (0);
+	if (tag != NULL) {
+		tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
+		__DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED;
+	}
 #endif
 
 	if (ip1 == NULL) {
diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c
index 467d5ded1d7a..8a09d5f37b4a 100644
--- a/sys/netipsec/ipsec_offload.c
+++ b/sys/netipsec/ipsec_offload.c
@@ -94,6 +94,7 @@ struct ifp_handle_sav {
 	size_t hdr_ext_size;
 	uint64_t cnt_octets;
 	uint64_t cnt_allocs;
+	struct xform_history xfh;
 };
 
 #define	IFP_HS_HANDLED	0x00000001
@@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav,
 static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav,
     if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
 static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp);
+static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi,
+    struct xform_history *xh);
 
 static void
 ipsec_accel_init(void *arg)
@@ -185,6 +188,7 @@ ipsec_accel_init(void *arg)
 	    ipsec_accel_drv_sa_lifetime_update_impl;
 	ipsec_accel_drv_sa_lifetime_fetch_p =
 	    ipsec_accel_drv_sa_lifetime_fetch_impl;
+	ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl;
 	pctrie_init(&drv_spi_pctrie);
 	ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER(
 	    ifnet_departure_event, ipsec_accel_ifdetach_event, NULL,
@@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg)
 	ipsec_accel_on_ifdown_p = NULL;
 	ipsec_accel_drv_sa_lifetime_update_p = NULL;
 	ipsec_accel_drv_sa_lifetime_fetch_p = NULL;
+	ipsec_accel_fill_xh_p = NULL;
 	ipsec_accel_sync_imp();
 	clean_unrhdr(drv_spi_unr);	/* avoid panic, should go later */
 	clear_unrhdr(drv_spi_unr);
@@ -412,6 +417,10 @@ ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
 	ihs->ifdata = priv;
 	ihs->flags = flags;
 	ihs->hdr_ext_size = esp_hdrsiz(sav);
+	memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst));
+	ihs->xfh.spi = sav->spi;
+	ihs->xfh.proto = sav->sah->saidx.proto;
+	ihs->xfh.mode = sav->sah->saidx.mode;
 	mtx_lock(&ipsec_accel_sav_tmp);
 	CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) {
 		if (i->ifp == ifp) {
@@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav)
 	return (m);
 }
 
+static bool
+ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+	struct ifp_handle_sav *i;
+
+	if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN ||
+	    drv_spi > IPSEC_ACCEL_DRV_SPI_MAX)
+		return (false);		/* outside the driver SPI range */
+	/* Fill *xh from the entry stored under drv_spi; ifp is not consulted. */
+	i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi);
+	if (i == NULL)
+		return (false);		/* nothing stored for drv_spi */
+	memcpy(xh, &i->xfh, sizeof(*xh)); /* set by ipsec_accel_handle_sav() */
+	return (true);
+}
+
 #endif	/* IPSEC_OFFLOAD */
diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h
index 904fe6252396..ae60eaa8ae78 100644
--- a/sys/netipsec/ipsec_offload.h
+++ b/sys/netipsec/ipsec_offload.h
@@ -30,6 +30,7 @@
 #include <sys/errno.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <netipsec/xform.h>
 
 struct secpolicy;
 struct secasvar;
@@ -42,6 +43,7 @@ struct ipsec_accel_out_tag {
 
 struct ipsec_accel_in_tag {
 	struct m_tag tag;
+	struct xform_history xh; /* Right after tag, to mimic IPSEC_IN_DONE */
 	uint16_t drv_spi;
 };
 
@@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav,
     if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs);
 extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav,
     if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+    struct xform_history *xh);
 
 #ifdef IPSEC_OFFLOAD
 /*
@@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
 	return (NULL);
 }
 
+static inline bool
+ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+	bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh);
+
+	p = atomic_load_ptr(&ipsec_accel_fill_xh_p);	/* NULL if unset */
+	if (p != NULL)
+		return (p(ifp, drv_spi, xh));
+	return (false);		/* no implementation registered; *xh untouched */
+}
 
 #else
 #define	ipsec_accel_sa_newkey(a)
@@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
 #define	ipsec_accel_sync()
 #define	ipsec_accel_is_accel_sav(a)
 #define	ipsec_accel_key_setaccelif(a)
+#define	ipsec_accel_fill_xh(a, b, c)	(false)
 #endif
 
 void ipsec_accel_forget_sav_impl(struct secasvar *sav);
@@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m,
     struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af,
     int mtu, int *hwassist);
 void ipsec_accel_forget_sav(struct secasvar *sav);
+struct xform_history;
 #else
 #define	ipsec_accel_input(a, b, c) (ENXIO)
 #define	ipsec_accel_output(a, b, c, d, e, f, g, h) ({	\
diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c
index ae67d83c6d13..4ba1b49c24f0 100644
--- a/sys/netipsec/key.c
+++ b/sys/netipsec/key.c
@@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp,
     u_int drv_spi, uint64_t octets, uint64_t allocs);
 int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp,
     u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+    struct xform_history *xh);
 #endif
 
 #define FULLMASK	0xff
diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h
index 8492ecb3021b..720317ed74f3 100644
--- a/sys/netlink/netlink_message_parser.h
+++ b/sys/netlink/netlink_message_parser.h
@@ -209,7 +209,8 @@ int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt,
 int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt,
     const void *arg, void *target);
 
-bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...);
+bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+	__printflike(2, 3);
 
 #define	NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) {	\
 	nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index 923633d76df7..c129c8c49921 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -196,7 +196,7 @@ SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Firewall");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
-    "Only do a single pass through ipfw when using dummynet(4)");
+    "Only do a single pass through ipfw when using dummynet(4), ipfw_nat or other divert(4)-like interfaces");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
     "Rule number auto-increment step");
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
index 0a84f9d680ac..cb96d2fcc44c 100644
--- a/sys/netpfil/pf/if_pflog.c
+++ b/sys/netpfil/pf/if_pflog.c
@@ -284,9 +284,9 @@ pflog_packet(uint8_t action, u_int8_t reason,
 	 * state lock, since this leads to unsafe LOR.
 	 * These conditions are very very rare, however.
 	 */
-	if (trigger->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+	if (trigger->log & PF_LOG_USER && !pd->lookup.done && lookupsafe)
 		pd->lookup.done = pf_socket_lookup(pd);
-	if (pd->lookup.done > 0)
+	if (trigger->log & PF_LOG_USER && pd->lookup.done > 0)
 		hdr.uid = pd->lookup.uid;
 	else
 		hdr.uid = -1;
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 2391edaf1a5a..4e03584b8f85 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -532,6 +532,7 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
 	struct pf_kpooladdr	*rpool_first;
 	int			 error;
 	uint8_t			 rt = 0;
+	int			 n = 0;
 
 	PF_RULES_RASSERT();
 
@@ -557,10 +558,12 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
 	 */
 	if (sp->pfs_1301.rule != htonl(-1) && sp->pfs_1301.anchor == htonl(-1) &&
 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->pfs_1301.rule) <
-	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
-		r = pf_main_ruleset.rules[
-		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->pfs_1301.rule)];
-	else
+	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) {
+		TAILQ_FOREACH(r, pf_main_ruleset.rules[
+		    PF_RULESET_FILTER].active.ptr, entries)
+			if (ntohl(sp->pfs_1301.rule) == n++)
+				break;
+	} else
 		r = &V_pf_default_rule;
 
 	/*
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index a410fe570c39..009f7e4d78b1 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -6219,7 +6219,7 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
 	if (ctx->tag > 0)
 		s->tag = ctx->tag;
 	if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) ==
-	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
+	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) {
 		pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
 		pf_undo_nat(ctx->nr, pd, bip_sum);
 		s->src.seqhi = arc4random();
@@ -9068,6 +9068,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 		goto bad;
 	}
 
+	if (r->rt == PF_DUPTO)
+		skip_test = true;
+
 	if (pd->dir == PF_IN && !skip_test) {
 		if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp,
 		    &pd->act) != PF_PASS) {
@@ -9370,6 +9373,9 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp,
 		goto bad;
 	}
 
+	if (r->rt == PF_DUPTO)
+		skip_test = true;
+
 	if (pd->dir == PF_IN && !skip_test) {
 		if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT,
 		    ifp, &m0, inp, &pd->act) != PF_PASS) {
@@ -10058,6 +10064,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
 	pd->didx = (dir == PF_IN) ? 1 : 0;
 	pd->af = pd->naf = af;
 
+	PF_RULES_ASSERT();
+
 	TAILQ_INIT(&pd->sctp_multihome_jobs);
 	if (default_actions != NULL)
 		memcpy(&pd->act, default_actions, sizeof(pd->act));
@@ -10133,6 +10141,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
 		}
 
 		h = mtod(pd->m, struct ip6_hdr *);
+		if (pd->m->m_pkthdr.len <
+		    sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
+			*action = PF_DROP;
+			REASON_SET(reason, PFRES_SHORT);
+			return (-1);
+		}
 
 		if (pf_walk_header6(pd, h, reason) != PF_PASS) {
 			*action = PF_DROP;
@@ -10471,35 +10485,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 	PF_RULES_RLOCK_TRACKER;
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
 	M_ASSERTPKTHDR(*m0);
+	NET_EPOCH_ASSERT();
 
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
-	PF_RULES_RLOCK();
-
 	kif = (struct pfi_kkif *)ifp->if_pf_kif;
 
 	if (__predict_false(kif == NULL)) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("%s: kif == NULL, if_xname %s\n",
 		    __func__, ifp->if_xname));
-		PF_RULES_RUNLOCK();
 		return (PF_DROP);
 	}
 	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
-		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	if ((*m0)->m_flags & M_SKIP_FIREWALL) {
-		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	if (__predict_false(! M_WRITABLE(*m0))) {
 		*m0 = m_unshare(*m0, M_NOWAIT);
 		if (*m0 == NULL) {
-			PF_RULES_RUNLOCK();
 			return (PF_DROP);
 		}
 	}
@@ -10512,12 +10521,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
 		    pd.pf_mtag->if_idxgen);
 		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
-			PF_RULES_RUNLOCK();
 			m_freem(*m0);
 			*m0 = NULL;
 			return (PF_PASS);
 		}
-		PF_RULES_RUNLOCK();
 		(ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL);
 		*m0 = NULL;
 		return (PF_PASS);
@@ -10532,11 +10539,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 		/* But only once. We may see the packet multiple times (e.g.
 		 * PFIL_IN/PFIL_OUT). */
 		pf_dummynet_flag_remove(pd.m, pd.pf_mtag);
-		PF_RULES_RUNLOCK();
 
 		return (PF_PASS);
 	}
 
+	PF_RULES_RLOCK();
+
 	if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
 		kif, default_actions) == -1) {
 		if (action != PF_PASS)
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index db353d185368..cfff58064922 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -140,7 +140,7 @@ enum	{ PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
 
 #define	PF_LOG			0x01
 #define	PF_LOG_ALL		0x02
-#define	PF_LOG_SOCKET_LOOKUP	0x04
+#define	PF_LOG_USER		0x04
 #define	PF_LOG_FORCE		0x08
 #define	PF_LOG_MATCHES		0x10
 
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index c14211edf10f..5c69c395c5fc 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -1359,7 +1359,7 @@ static int
 pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
 {
 	struct pf_kruleset	*rs;
-	struct pf_krule		*rule, **old_array, *old_rule;
+	struct pf_krule		*rule, *old_rule;
 	struct pf_krulequeue	*old_rules;
 	struct pf_krule_global  *old_tree;
 	int			 error;
@@ -1384,13 +1384,10 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
 	/* Swap rules, keep the old. */
 	old_rules = rs->rules[rs_num].active.ptr;
 	old_rcount = rs->rules[rs_num].active.rcount;
-	old_array = rs->rules[rs_num].active.ptr_array;
 	old_tree = rs->rules[rs_num].active.tree;
 
 	rs->rules[rs_num].active.ptr =
 	    rs->rules[rs_num].inactive.ptr;
-	rs->rules[rs_num].active.ptr_array =
-	    rs->rules[rs_num].inactive.ptr_array;
 	rs->rules[rs_num].active.tree =
 	    rs->rules[rs_num].inactive.tree;
 	rs->rules[rs_num].active.rcount =
@@ -1420,7 +1417,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
 	}
 
 	rs->rules[rs_num].inactive.ptr = old_rules;
-	rs->rules[rs_num].inactive.ptr_array = old_array;
 	rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */
 	rs->rules[rs_num].inactive.rcount = old_rcount;
 
@@ -1433,9 +1429,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
 	while ((rule = TAILQ_FIRST(old_rules)) != NULL)
 		pf_unlink_rule_locked(old_rules, rule);
 	PF_UNLNKDRULES_UNLOCK();
-	if (rs->rules[rs_num].inactive.ptr_array)
-		free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
-	rs->rules[rs_num].inactive.ptr_array = NULL;
 	rs->rules[rs_num].inactive.rcount = 0;
 	rs->rules[rs_num].inactive.open = 0;
 	pf_remove_if_empty_kruleset(rs);
@@ -1458,24 +1451,11 @@ pf_setup_pfsync_matching(struct pf_kruleset *rs)
 		if (rs_cnt == PF_RULESET_SCRUB)
 			continue;
 
-		if (rs->rules[rs_cnt].inactive.ptr_array)
-			free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
-		rs->rules[rs_cnt].inactive.ptr_array = NULL;
-
 		if (rs->rules[rs_cnt].inactive.rcount) {
-			rs->rules[rs_cnt].inactive.ptr_array =
-			    mallocarray(rs->rules[rs_cnt].inactive.rcount,
-			    sizeof(struct pf_rule **),
-			    M_TEMP, M_NOWAIT);
-
-			if (!rs->rules[rs_cnt].inactive.ptr_array)
-				return (ENOMEM);
-		}
-
-		TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
-		    entries) {
-			pf_hash_rule_rolling(&ctx, rule);
-			(rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
+			TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
+			    entries) {
+				pf_hash_rule_rolling(&ctx, rule);
+			}
 		}
 	}
 
@@ -2061,6 +2041,47 @@ pf_ioctl_getrules(struct pfioc_rule *pr)
 	return (0);
 }
 
+static int
+pf_rule_checkaf(struct pf_krule *r)
+{
+	switch (r->af) {
+	case 0:
+		if (r->rule_flag & PFRULE_AFTO)
+			return (EPFNOSUPPORT);
+		break;
+	case AF_INET:
+		if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6)
+			return (EPFNOSUPPORT);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET)
+			return (EPFNOSUPPORT);
+		break;
+#endif /* INET6 */
+	default:
+		return (EPFNOSUPPORT);
+	}
+
+	if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0)
+		return (EPFNOSUPPORT);
+
+	return (0);
+}
+
+static int
+pf_validate_range(uint8_t op, uint16_t port[2])
+{
+	uint16_t a = ntohs(port[0]);
+	uint16_t b = ntohs(port[1]);
+
+	if ((op == PF_OP_RRG && a > b) ||  /* 34:12,  i.e. none */
+	    (op == PF_OP_IRG && a >= b) || /* 34><12, i.e. none */
+	    (op == PF_OP_XRG && a > b))	   /* 34<>22, i.e. all */
+		return 1;
+	return 0;
+}
+
 int
 pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
     uint32_t pool_ticket, const char *anchor, const char *anchor_call,
@@ -2080,6 +2101,13 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
 
 #define	ERROUT(x)	ERROUT_FUNCTION(errout, x)
 
+	if ((error = pf_rule_checkaf(rule)))
+		ERROUT(error);
+	if (pf_validate_range(rule->src.port_op, rule->src.port))
+		ERROUT(EINVAL);
+	if (pf_validate_range(rule->dst.port_op, rule->dst.port))
+		ERROUT(EINVAL);
+
 	if (rule->ifname[0])
 		kif = pf_kkif_create(M_WAITOK);
 	if (rule->rcv_ifname[0])
@@ -3569,7 +3597,7 @@ DIOCADDRULENV_error:
 		error = pf_rule_to_krule(&pr->rule, rule);
 		if (error != 0) {
 			pf_krule_free(rule);
-			break;
+			goto fail;
 		}
 
 		pr->anchor[sizeof(pr->anchor) - 1] = '\0';
@@ -3728,11 +3756,11 @@ DIOCGETRULENV_error:
 		if (pcr->action < PF_CHANGE_ADD_HEAD ||
 		    pcr->action > PF_CHANGE_GET_TICKET) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		if (pcr->action != PF_CHANGE_REMOVE) {
@@ -3740,9 +3768,13 @@ DIOCGETRULENV_error:
 			error = pf_rule_to_krule(&pcr->rule, newrule);
 			if (error != 0) {
 				pf_krule_free(newrule);
-				break;
+				goto fail;
 			}
 
+			if ((error = pf_rule_checkaf(newrule))) {
+				pf_krule_free(newrule);
+				goto fail;
+			}
 			if (newrule->ifname[0])
 				kif = pf_kkif_create(M_WAITOK);
 			pf_counter_u64_init(&newrule->evaluations, M_WAITOK);
@@ -3890,7 +3922,7 @@ DIOCGETRULENV_error:
 				pf_free_rule(newrule);
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
-				break;
+				goto fail;
 			}
 
 			newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list);
@@ -3917,7 +3949,7 @@ DIOCGETRULENV_error:
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
 				error = EINVAL;
-				break;
+				goto fail;
 			}
 		}
 
@@ -3935,7 +3967,7 @@ DIOCGETRULENV_error:
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
 				error = EEXIST;
-				break;
+				goto fail;
 			}
 
 			if (oldrule == NULL)
@@ -3991,7 +4023,7 @@ DIOCCHANGERULE_error:
 
 		if (sp->timeout >= PFTM_MAX) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		if (V_pfsync_state_import_ptr != NULL) {
 			PF_RULES_RLOCK();
@@ -4011,7 +4043,7 @@ DIOCCHANGERULE_error:
 		s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
 		if (s == NULL) {
 			error = ENOENT;
-			break;
+			goto fail;
 		}
 
 		pfsync_state_export((union pfsync_state_union*)&ps->state,
@@ -4090,7 +4122,7 @@ DIOCGETSTATES_retry:
 			error = copyout(pstore, out,
 			    sizeof(struct pfsync_state_1301) * count);
 			if (error)
-				break;
+				goto fail;
 			out = ps->ps_states + nr;
 		}
 DIOCGETSTATES_full:
@@ -4110,7 +4142,7 @@ DIOCGETSTATES_full:
 
 		if (ps->ps_req_version > PF_STATE_VERSION) {
 			error = ENOTSUP;
-			break;
+			goto fail;
 		}
 
 		if (ps->ps_len <= 0) {
@@ -4168,7 +4200,7 @@ DIOCGETSTATESV2_retry:
 			error = copyout(pstore, out,
 			    sizeof(struct pf_state_export) * count);
 			if (error)
-				break;
+				goto fail;
 			out = ps->ps_states + nr;
 		}
 DIOCGETSTATESV2_full:
@@ -4274,12 +4306,12 @@ DIOCGETSTATESV2_full:
 
 		if (psp->ifname[0] == '\0') {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ);
 		if (error != 0)
-			break;
+			goto fail;
 		ifp = ifunit(ps.ifname);
 		if (ifp != NULL) {
 			psp->baudrate32 =
@@ -4340,7 +4372,7 @@ DIOCGETSTATESV2_full:
 		altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO);
 		error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd));
 		if (error)
-			break;
+			goto fail;
 		altq->local_flags = 0;
 
 		PF_RULES_WLOCK();
@@ -4348,7 +4380,7 @@ DIOCGETSTATESV2_full:
 			PF_RULES_WUNLOCK();
 			free(altq, M_PFALTQ);
 			error = EBUSY;
-			break;
+			goto fail;
 		}
 
 		/*
@@ -4360,7 +4392,7 @@ DIOCGETSTATESV2_full:
 				PF_RULES_WUNLOCK();
 				error = EBUSY;
 				free(altq, M_PFALTQ);
-				break;
+				goto fail;
 			}
 			altq->altq_disc = NULL;
 			TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) {
@@ -4380,7 +4412,7 @@ DIOCGETSTATESV2_full:
 		if (error) {
 			PF_RULES_WUNLOCK();
 			free(altq, M_PFALTQ);
-			break;
+			goto fail;
 		}
 
 		if (altq->qname[0] != 0)
@@ -4418,13 +4450,13 @@ DIOCGETSTATESV2_full:
 		if (pa->ticket != V_ticket_altqs_active) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
-			break;
+			goto fail;
 		}
 		altq = pf_altq_get_nth_active(pa->nr);
 		if (altq == NULL) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
-			break;
+			goto fail;
 		}
 		pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd));
 		PF_RULES_RUNLOCK();
@@ -4448,20 +4480,20 @@ DIOCGETSTATESV2_full:
 		if (pq->ticket != V_ticket_altqs_active) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
-			break;
+			goto fail;
 		}
 		nbytes = pq->nbytes;
 		altq = pf_altq_get_nth_active(pq->nr);
 		if (altq == NULL) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
-			break;
+			goto fail;
 		}
 
 		if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
 			PF_RULES_RUNLOCK();
 			error = ENXIO;
-			break;
+			goto fail;
 		}
 		PF_RULES_RUNLOCK();
 		if (cmd == DIOCGETQSTATSV0)
@@ -4530,30 +4562,30 @@ DIOCGETSTATESV2_full:
 		if (pca->action < PF_CHANGE_ADD_HEAD ||
 		    pca->action > PF_CHANGE_REMOVE) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
 		    pca->addr.addr.type != PF_ADDR_DYNIFTL &&
 		    pca->addr.addr.type != PF_ADDR_TABLE) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		if (pca->addr.addr.p.dyn != NULL) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		if (pca->action != PF_CHANGE_REMOVE) {
 #ifndef INET
 			if (pca->af == AF_INET) {
 				error = EAFNOSUPPORT;
-				break;
+				goto fail;
 			}
 #endif /* INET */
 #ifndef INET6
 			if (pca->af == AF_INET6) {
 				error = EAFNOSUPPORT;
-				break;
+				goto fail;
 			}
 #endif /* INET6 */
 			newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
@@ -4676,7 +4708,7 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != 0) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
@@ -4692,13 +4724,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
 			error = ENOMEM;
-			break;
+			goto fail;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4707,7 +4739,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_add_tables(pfrts, io->pfrio_size,
@@ -4724,13 +4756,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
 			error = ENOMEM;
-			break;
+			goto fail;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4739,7 +4771,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_del_tables(pfrts, io->pfrio_size,
@@ -4757,14 +4789,14 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		PF_RULES_RLOCK();
 		n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
 		if (n < 0) {
 			PF_RULES_RUNLOCK();
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		io->pfrio_size = min(io->pfrio_size, n);
 
@@ -4775,7 +4807,7 @@ DIOCCHANGEADDR_error:
 		if (pfrts == NULL) {
 			error = ENOMEM;
 			PF_RULES_RUNLOCK();
-			break;
+			goto fail;
 		}
 		error = pfr_get_tables(&io->pfrio_table, pfrts,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4794,7 +4826,7 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		PF_TABLE_STATS_LOCK();
 		PF_RULES_RLOCK();
@@ -4803,7 +4835,7 @@ DIOCCHANGEADDR_error:
 			PF_RULES_RUNLOCK();
 			PF_TABLE_STATS_UNLOCK();
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		io->pfrio_size = min(io->pfrio_size, n);
 
@@ -4814,7 +4846,7 @@ DIOCCHANGEADDR_error:
 			error = ENOMEM;
 			PF_RULES_RUNLOCK();
 			PF_TABLE_STATS_UNLOCK();
-			break;
+			goto fail;
 		}
 		error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4833,7 +4865,7 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
@@ -4842,7 +4874,7 @@ DIOCCHANGEADDR_error:
 			 * size, so we didn't fail on overly large requests.
 			 * Keep doing so. */
 			io->pfrio_size = pf_ioctl_maxcount;
-			break;
+			goto fail;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4851,7 +4883,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
-			break;
+			goto fail;
 		}
 
 		PF_TABLE_STATS_LOCK();
@@ -4872,7 +4904,7 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		PF_RULES_RLOCK();
@@ -4880,7 +4912,7 @@ DIOCCHANGEADDR_error:
 		if (n < 0) {
 			PF_RULES_RUNLOCK();
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		io->pfrio_size = min(io->pfrio_size, n);
@@ -4892,7 +4924,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_set_tflags(pfrts, io->pfrio_size,
@@ -4908,7 +4940,7 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != 0) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
@@ -4924,13 +4956,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4938,7 +4970,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_add_addrs(&io->pfrio_table, pfras,
@@ -4958,13 +4990,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4972,7 +5004,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_del_addrs(&io->pfrio_table, pfras,
@@ -4992,17 +5024,17 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 || io->pfrio_size2 < 0) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		count = max(io->pfrio_size, io->pfrio_size2);
 		if (count > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = count * sizeof(struct pfr_addr);
 		pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP,
@@ -5010,7 +5042,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_set_addrs(&io->pfrio_table, pfras,
@@ -5031,13 +5063,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5059,13 +5091,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_astats)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_astats);
 		pfrastats = mallocarray(io->pfrio_size,
@@ -5087,13 +5119,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5101,7 +5133,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_clr_astats(&io->pfrio_table, pfras,
@@ -5121,13 +5153,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5135,7 +5167,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_RLOCK();
 		error = pfr_tst_addrs(&io->pfrio_table, pfras,
@@ -5155,13 +5187,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5169,7 +5201,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_ina_define(&io->pfrio_table, pfras,
@@ -5204,13 +5236,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
 		ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5218,7 +5250,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5285,13 +5317,13 @@ DIOCCHANGEADDR_error:
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
 		ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5299,7 +5331,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5368,14 +5400,14 @@ DIOCCHANGEADDR_error:
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
@@ -5384,7 +5416,7 @@ DIOCCHANGEADDR_error:
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
-			break;
+			goto fail;
 		}
 		PF_RULES_WLOCK();
 		/* First makes sure everything will succeed. */
@@ -5525,7 +5557,7 @@ DIOCCHANGEADDR_error:
 
 		if (psn->psn_len == 0) {
 			psn->psn_len = sizeof(struct pf_src_node) * nr;
-			break;
+			goto fail;
 		}
 
 		nr = 0;
@@ -5550,7 +5582,7 @@ DIOCCHANGEADDR_error:
 		    sizeof(struct pf_src_node) * nr);
 		if (error) {
 			free(pstore, M_TEMP);
-			break;
+			goto fail;
 		}
 		psn->psn_len = sizeof(struct pf_src_node) * nr;
 		free(pstore, M_TEMP);
@@ -5606,14 +5638,14 @@ DIOCCHANGEADDR_error:
 
 		if (io->pfiio_esize != sizeof(struct pfi_kif)) {
 			error = ENODEV;
-			break;
+			goto fail;
 		}
 
 		if (io->pfiio_size < 0 ||
 		    io->pfiio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) {
 			error = EINVAL;
-			break;
+			goto fail;
 		}
 
 		io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 26f7ab41eef4..9c7863bb301e 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -1012,10 +1012,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
 
 		if (rpool->proxy_port[1]) {
 			uint32_t	tmp_nport;
+			uint16_t	div;
 
-			tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) %
-			    (rpool->proxy_port[1] - rpool->proxy_port[0] +
-			    1)) + rpool->proxy_port[0];
+			div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
+			div = (div == 0) ? 1 : div;
+
+			tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
+			    rpool->proxy_port[0];
 
 			/* Wrap around if necessary. */
 			if (tmp_nport > 65535)
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
index e3f3ab9025f7..9c0151b7da2b 100644
--- a/sys/netpfil/pf/pf_table.c
+++ b/sys/netpfil/pf/pf_table.c
@@ -819,10 +819,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters)
 static void
 pfr_destroy_kentries(struct pfr_kentryworkq *workq)
 {
-	struct pfr_kentry	*p, *q;
+	struct pfr_kentry	*p;
 
-	for (p = SLIST_FIRST(workq); p != NULL; p = q) {
-		q = SLIST_NEXT(p, pfrke_workq);
+	while ((p = SLIST_FIRST(workq)) != NULL) {
+		SLIST_REMOVE_HEAD(workq, pfrke_workq);
 		pfr_destroy_kentry(p);
 	}
 }
@@ -1680,8 +1680,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
 	}
 
 	if (!(flags & PFR_FLAG_DUMMY)) {
-		for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
-			q = SLIST_NEXT(p, pfrkt_workq);
+		SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) {
 			pfr_commit_ktable(p, tzero);
 		}
 		rs->topen = 0;
@@ -1710,7 +1709,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
 	} else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
 		/* kt might contain addresses */
 		struct pfr_kentryworkq	 addrq, addq, changeq, delq, garbageq;
-		struct pfr_kentry	*p, *q, *next;
+		struct pfr_kentry	*p, *q;
 		struct pfr_addr		 ad;
 
 		pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
@@ -1720,7 +1719,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
 		SLIST_INIT(&delq);
 		SLIST_INIT(&garbageq);
 		pfr_clean_node_mask(shadow, &addrq);
-		SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) {
+		while ((p = SLIST_FIRST(&addrq)) != NULL) {
+			SLIST_REMOVE_HEAD(&addrq, pfrke_workq);
 			pfr_copyout_addr(&ad, p);
 			q = pfr_lookup_addr(kt, &ad, 1);
 			if (q != NULL) {
@@ -1864,8 +1864,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq)
 {
 	struct pfr_ktable	*p, *q;
 
-	for (p = SLIST_FIRST(workq); p; p = q) {
-		q = SLIST_NEXT(p, pfrkt_workq);
+	SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) {
 		pfr_setflags_ktable(p, p->pfrkt_nflags);
 	}
 }
@@ -2015,10 +2014,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset)
 static void
 pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
 {
-	struct pfr_ktable	*p, *q;
+	struct pfr_ktable	*p;
 
-	for (p = SLIST_FIRST(workq); p; p = q) {
-		q = SLIST_NEXT(p, pfrkt_workq);
+	while ((p = SLIST_FIRST(workq)) != NULL) {
+		SLIST_REMOVE_HEAD(workq, pfrkt_workq);
 		pfr_destroy_ktable(p, flushaddr);
 	}
 }
diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner
index 73fa9660e2d2..7a4ff6b9c62e 100644
--- a/sys/riscv/allwinner/files.allwinner
+++ b/sys/riscv/allwinner/files.allwinner
@@ -1,5 +1,6 @@
 
 arm/allwinner/aw_gpio.c			optional gpio aw_gpio fdt
+arm/allwinner/aw_mmc.c			optional mmc aw_mmc fdt | mmccam aw_mmc fdt
 arm/allwinner/aw_rtc.c			optional aw_rtc fdt
 arm/allwinner/aw_syscon.c		optional syscon
 arm/allwinner/aw_sid.c			optional aw_sid nvmem
diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner
index 2b1e0d4e09dc..34fe195b01ba 100644
--- a/sys/riscv/conf/std.allwinner
+++ b/sys/riscv/conf/std.allwinner
@@ -7,6 +7,7 @@ options 	SOC_ALLWINNER_D1
 
 device		aw_ccu		# Allwinner clock controller
 device		aw_gpio		# Allwinner GPIO controller
+device		aw_mmc		# Allwinner SD/MMC controller
 device		aw_rtc		# Allwinner Real-time Clock
 device		aw_sid		# Allwinner Secure ID EFUSE
 device		aw_timer	# Allwinner Timer
diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c
index 9e87af578885..44b63e38a8e6 100644
--- a/sys/rpc/clnt_rc.c
+++ b/sys/rpc/clnt_rc.c
@@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl)
 		newclient = clnt_vc_create(so,
 		    (struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers,
 		    rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr);
+		/*
+		 * CLSET_FD_CLOSE must be done now, in case rpctls_connect()
+		 * fails just below.
+		 */
+		if (newclient != NULL)
+			CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
 		if (rc->rc_tls && newclient != NULL) {
 			CURVNET_SET(so->so_vnet);
 			stat = rpctls_connect(newclient, rc->rc_tlscertname, so,
@@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl)
 		goto out;
 	}
 
-	CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
 	CLNT_CONTROL(newclient, CLSET_CONNECT, &one);
 	CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout);
 	CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry);
diff --git a/sys/rpc/rpcsec_gss/rpcsec_gss.c b/sys/rpc/rpcsec_gss/rpcsec_gss.c
index 62c71937a185..983dd251f81f 100644
--- a/sys/rpc/rpcsec_gss/rpcsec_gss.c
+++ b/sys/rpc/rpcsec_gss/rpcsec_gss.c
@@ -67,6 +67,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hash.h>
+#include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/lock.h>
@@ -772,6 +773,17 @@ rpc_gss_init(AUTH *auth, rpc_gss_options_ret_t *options_ret)
 	gd->gd_cred.gc_seq = 0;
 
 	/*
+	 * XXX Threads from inside jails can get here via calls
+	 * to clnt_vc_call()->AUTH_REFRESH()->rpc_gss_refresh()
+	 * but the NFS mount is always done outside of the
+	 * jails in vnet0.  Since the thread credentials won't
+	 * necessarily have cr_prison == vnet0 and this function
+	 * has no access to the socket, using vnet0 seems the
+	 * only option.  This is broken if NFS mounts are enabled
+	 * within vnet prisons.
+	 */
+	KGSS_CURVNET_SET_QUIET(vnet0);
+	/*
 	 * For KerberosV, if there is a client principal name, that implies
 	 * that this is a host based initiator credential in the default
 	 * keytab file. For this case, it is necessary to do a
@@ -994,12 +1006,14 @@ out:
 			gss_delete_sec_context(&min_stat, &gd->gd_ctx,
 				GSS_C_NO_BUFFER);
 		}
+		KGSS_CURVNET_RESTORE();
 		mtx_lock(&gd->gd_lock);
 		gd->gd_state = RPCSEC_GSS_START;
 		wakeup(gd);
 		mtx_unlock(&gd->gd_lock);
 		return (FALSE);
 	}
+	KGSS_CURVNET_RESTORE();
 	
 	mtx_lock(&gd->gd_lock);
 	gd->gd_state = RPCSEC_GSS_ESTABLISHED;
diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c
index 93fe283e65fd..51fe270b13d9 100644
--- a/sys/rpc/rpcsec_tls/rpctls_impl.c
+++ b/sys/rpc/rpcsec_tls/rpctls_impl.c
@@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so)
 		 * failed to do the handshake.
 		 */
 		mtx_unlock(&rpctls_lock);
+		/*
+		 * Do a shutdown on the socket, since the daemon is
+		 * probably stuck in SSL_accept() or SSL_connect() trying to
+		 * read the socket.  Do not soclose() the socket, since the
+		 * daemon will close() the socket after SSL_accept()
+		 * returns an error.
+		 */
+		soshutdown(so, SHUT_RD);
 	}
 }
 
diff --git a/sys/sys/efi.h b/sys/sys/efi.h
index 95a433a950db..89c8b15519de 100644
--- a/sys/sys/efi.h
+++ b/sys/sys/efi.h
@@ -42,6 +42,8 @@
 	{0xb122a263,0x3661,0x4f68,{0x99,0x29,0x78,0xf8,0xb0,0xd6,0x21,0x80}}
 #define	EFI_PROPERTIES_TABLE			\
 	{0x880aaca3,0x4adc,0x4a04,{0x90,0x79,0xb7,0x47,0x34,0x08,0x25,0xe5}}
+#define	EFI_MEMORY_ATTRIBUTES_TABLE		\
+	{0xdcfa911d,0x26eb,0x469f,{0xa2,0x20,0x38,0xb7,0xdc,0x46,0x12,0x20}}
 #define LINUX_EFI_MEMRESERVE_TABLE			\
 	{0x888eb0c6,0x8ede,0x4ff5,{0xa8,0xf0,0x9a,0xee,0x5c,0xb9,0x77,0xc2}}
 
@@ -166,6 +168,22 @@ struct efi_prop_table {
 	uint64_t	memory_protection_attribute;
 };
 
+struct efi_memory_descriptor {
+	uint32_t	type;
+	caddr_t		phy_addr;
+	caddr_t		virt_addr;
+	uint64_t	pages;
+	uint64_t	attrs;
+};
+
+struct efi_memory_attribute_table {
+	uint32_t	version;
+	uint32_t	num_ents;
+	uint32_t	descriptor_size;
+	uint32_t	flags;
+	struct efi_memory_descriptor tables[];
+};
+
 #ifdef _KERNEL
 
 #ifdef EFIABI_ATTR
diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h
index 15557c614f88..7bf1d264ff5e 100644
--- a/sys/sys/exterrvar.h
+++ b/sys/sys/exterrvar.h
@@ -21,6 +21,7 @@
 
 #define	EXTERRCTL_ENABLE	1
 #define	EXTERRCTL_DISABLE	2
+#define	EXTERRCTL_UD		3
 
 #define	EXTERRCTLF_FORCE	0x00000001
 
diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h
index 65dc5dba43f3..d1f23d5898bb 100644
--- a/sys/sys/inotify.h
+++ b/sys/sys/inotify.h
@@ -107,11 +107,18 @@ void	vn_inotify_revoke(struct vnode *);
 } while (0)
 
 /* Log an inotify event using a specific name for the vnode. */
-#define	INOTIFY_NAME(vp, dvp, cnp, ev) do {				\
+#define	INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do {			\
 	if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 ||	\
-	    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) 		\
+	    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) {		\
+		if (lock)						\
+			vn_lock((vp), LK_SHARED | LK_RETRY);		\
 		VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0);		\
+		if (lock)						\
+			VOP_UNLOCK(vp);					\
+	}								\
 } while (0)
+#define	INOTIFY_NAME(vp, dvp, cnp, ev)	/* no vn_lock() of vp */	\
+	INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false)
 
 extern __uint32_t inotify_rename_cookie;
 
@@ -126,7 +133,8 @@ extern __uint32_t inotify_rename_cookie;
 		VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie);	\
 	}								\
 	if ((tvp) != NULL)						\
-		INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE);	\
+		INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp),		\
+		    _IN_MOVE_DELETE, true);				\
 } while (0)
 
 #define	INOTIFY_REVOKE(vp) do {						\
diff --git a/sys/sys/param.h b/sys/sys/param.h
index af116d6e3f7a..f941f021a423 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -74,7 +74,7 @@
  * cannot include sys/param.h and should only be updated here.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1500051
+#define __FreeBSD_version 1500054
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 254ba9451d0a..5abf762cd200 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -85,7 +85,8 @@ enum random_entropy_source {
 	RANDOM_FS_ATIME,
 	RANDOM_UMA,	/* Special!! UMA/SLAB Allocator */
 	RANDOM_CALLOUT,
-	RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT,
+	RANDOM_RANDOMDEV,
+	RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV,
 	/* Fast hardware random-number sources from here on. */
 	RANDOM_PURE_START,
 	RANDOM_PURE_OCTEON = RANDOM_PURE_START,
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 86b75a2d7989..d6bd06226d04 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 #endif
 }
 
-static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
-static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
+static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
+static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum			*/
@@ -642,14 +642,14 @@ swp_sizecheck(void)
 {
 
 	if (swap_pager_avail < nswap_lowat) {
-		if (swap_pager_almost_full == 0) {
+		if (!swap_pager_almost_full) {
 			printf("swap_pager: out of swap space\n");
-			swap_pager_almost_full = 1;
+			swap_pager_almost_full = true;
 		}
 	} else {
-		swap_pager_full = 0;
+		swap_pager_full = false;
 		if (swap_pager_avail > nswap_hiwat)
-			swap_pager_almost_full = 0;
+			swap_pager_almost_full = false;
 	}
 }
 
@@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages)
 		swp_sizecheck();
 		swdevhd = TAILQ_NEXT(sp, sw_list);
 	} else {
-		if (swap_pager_full != 2) {
+		if (!swap_pager_full) {
 			printf("swp_pager_getswapspace(%d): failed\n",
 			    *io_npages);
-			swap_pager_full = 2;
-			swap_pager_almost_full = 1;
+			swap_pager_full = swap_pager_almost_full = true;
 		}
 		swdevhd = NULL;
 	}
@@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
 	sp->sw_id = NULL;
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
-	if (nswapdev == 0) {
-		swap_pager_full = 2;
-		swap_pager_almost_full = 1;
-	}
+	if (nswapdev == 0)
+		swap_pager_full = swap_pager_almost_full = true;
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h
index cbbd27389662..9bd3b389fb60 100644
--- a/sys/vm/vm_pagequeue.h
+++ b/sys/vm/vm_pagequeue.h
@@ -260,9 +260,9 @@ struct vm_domain {
 	u_int vmd_inactive_shortage;		/* Per-thread shortage. */
 	blockcount_t vmd_inactive_running;	/* Number of inactive threads. */
 	blockcount_t vmd_inactive_starting;	/* Number of threads started. */
-	volatile u_int vmd_addl_shortage;	/* Shortage accumulator. */
-	volatile u_int vmd_inactive_freed;	/* Successful inactive frees. */
-	volatile u_int vmd_inactive_us;		/* Microseconds for above. */
+	u_int vmd_addl_shortage;	/* (a) Shortage accumulator. */
+	u_int vmd_inactive_freed;	/* (a) Successful inactive frees. */
+	u_int vmd_inactive_us;		/* (a) Microseconds for above. */
 	u_int vmd_inactive_pps;		/* Exponential decay frees/second. */
 	int vmd_oom_seq;
 	int vmd_last_active_scan;