diff options
Diffstat (limited to 'sys')
124 files changed, 5302 insertions, 796 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 6e51ebff298a..5bb877a174f7 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -49,12 +49,6 @@ #include <machine/specialreg.h> #include <x86/apicreg.h> -#ifdef SMP -#define LK lock ; -#else -#define LK -#endif - .text SUPERALIGN_TEXT /* End Of Interrupt to APIC */ diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c index 413b7c74890e..851f2df0e6e1 100644 --- a/sys/amd64/amd64/mem.c +++ b/sys/amd64/amd64/mem.c @@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags) * PAGE_SIZE, the uiomove() call does not * access past the end of the direct map. */ - if (v >= DMAP_MIN_ADDRESS && - v < DMAP_MIN_ADDRESS + dmaplimit) { + if (v >= kva_layout.dmap_low && + v < kva_layout.dmap_high) { error = uiomove((void *)v, c, uio); break; } diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c index 6d0917e16099..43bf81a991bf 100644 --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) * tables, so care must be taken to read each entry only once. */ pmapsize = 0; - for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) { + for (va = kva_layout.km_low; va < kva_end; ) { /* * We always write a page, even if it is zero. Each * page written corresponds to 1GB of space @@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; - mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; - mdhdr.dmapbase = DMAP_MIN_ADDRESS; - mdhdr.dmapend = DMAP_MAX_ADDRESS; + mdhdr.kernbase = kva_layout.km_low; + mdhdr.dmapbase = kva_layout.dmap_low; + mdhdr.dmapend = kva_layout.dmap_high; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, @@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) /* Dump kernel page directory pages */ bzero(fakepd, sizeof(fakepd)); - for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) { + for (va = kva_layout.km_low; va < kva_end; va += NBPDP) { ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 9c985df13ddf..b2bfe633adcc 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -415,7 +415,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, static int ndmpdp; vm_paddr_t dmaplimit; -vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; +vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, @@ -475,11 +475,36 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ +static u_int64_t DMPML4phys; /* ... level 4, for la57 */ static int ndmpdpphys; /* number of DMPDPphys pages */ vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ vm_paddr_t KERNend; /* and the end */ +struct kva_layout_s kva_layout = { + .kva_min = KV4ADDR(PML4PML4I, 0, 0, 0), + .dmap_low = KV4ADDR(DMPML4I, 0, 0, 0), + .dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0), + .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0), + .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0), + .km_low = KV4ADDR(KPML4BASE, 0, 0, 0), + .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1), + .rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0), +}; + +struct kva_layout_s kva_layout_la57 = { + .kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */ + .dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0), + .dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0), + .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0), + .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0), + .km_low = KV4ADDR(KPML4BASE, 0, 0, 0), + .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1), + .rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0), +}; + /* * pmap_mapdev support pre initialization (i.e. console) */ @@ -549,8 +574,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; -#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ - (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) +#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= kva_layout.lm_low && \ + (va) < kva_layout.lm_high) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, @@ -1336,7 +1361,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - bool remove_pt, struct spglist *free, struct rwlock **lockp); + bool demote_kpde, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); @@ -1722,7 +1747,7 @@ create_pagetables(vm_paddr_t *firstaddr) { pd_entry_t *pd_p; pdp_entry_t *pdp_p; - pml4_entry_t *p4_p; + pml4_entry_t *p4_p, *p4d_p; pml5_entry_t *p5_p; uint64_t DMPDkernphys; vm_paddr_t pax; @@ -1732,7 +1757,7 @@ create_pagetables(vm_paddr_t *firstaddr) vm_offset_t kasankernbase; int kasankpdpi, kasankpdi, nkasanpte; #endif - int i, j, ndm1g, nkpdpe, nkdmpde; + int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys; TSENTER(); /* Allocate page table pages for the direct map */ @@ -1740,15 +1765,30 @@ create_pagetables(vm_paddr_t *firstaddr) if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); - if (ndmpdpphys > NDMPML4E) { - /* - * Each NDMPML4E allows 512 GB, so limit to that, - * and then readjust ndmpdp and ndmpdpphys. - */ - printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); - Maxmem = atop(NDMPML4E * NBPML4); - ndmpdpphys = NDMPML4E; - ndmpdp = NDMPML4E * NPDEPG; + if (la57) { + ndmpml4phys = howmany(ndmpdpphys, NPML4EPG); + if (ndmpml4phys > NDMPML5E) { + printf("NDMPML5E limits system to %ld GB\n", + (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024); + Maxmem = atop(NDMPML5E * NBPML5); + ndmpml4phys = NDMPML5E; + ndmpdpphys = ndmpml4phys * NPML4EPG; + ndmpdp = ndmpdpphys * NPDEPG; + } + DMPML4phys = allocpages(firstaddr, ndmpml4phys); + } else { + if (ndmpdpphys > NDMPML4E) { + /* + * Each NDMPML4E allows 512 GB, so limit to + * that, and then readjust ndmpdp and + * ndmpdpphys. + */ + printf("NDMPML4E limits system to %d GB\n", + NDMPML4E * 512); + Maxmem = atop(NDMPML4E * NBPML4); + ndmpdpphys = NDMPML4E; + ndmpdp = NDMPML4E * NPDEPG; + } } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; @@ -1773,7 +1813,13 @@ create_pagetables(vm_paddr_t *firstaddr) dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages. */ + if (la57) { + KPML5phys = allocpages(firstaddr, 1); + p5_p = (pml5_entry_t *)KPML5phys; + } KPML4phys = allocpages(firstaddr, 1); + p4_p = (pml4_entry_t *)KPML4phys; + KPDPphys = allocpages(firstaddr, NKPML4E); #ifdef KASAN KASANPDPphys = allocpages(firstaddr, NKASANPML4E); @@ -1893,6 +1939,16 @@ create_pagetables(vm_paddr_t *firstaddr) } /* + * Connect the Direct Map slots up to the PML4. + * pml5 entries for DMAP are handled below in global pml5 loop. + */ + p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I]; + for (i = 0; i < ndmpdpphys; i++) { + p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | + pg_nx; + } + + /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) @@ -1911,11 +1967,6 @@ create_pagetables(vm_paddr_t *firstaddr) } } - /* And recursively map PML4 to itself in order to get PTmap */ - p4_p = (pml4_entry_t *)KPML4phys; - p4_p[PML4PML4I] = KPML4phys; - p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; - #ifdef KASAN /* Connect the KASAN shadow map slots up to the PML4. */ for (i = 0; i < NKASANPML4E; i++) { @@ -1938,25 +1989,15 @@ create_pagetables(vm_paddr_t *firstaddr) } #endif - /* Connect the Direct Map slots up to the PML4. */ - for (i = 0; i < ndmpdpphys; i++) { - p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); - p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; - } - /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } - kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - if (la57) { /* XXXKIB bootstrap KPML5phys page is lost */ - KPML5phys = allocpages(firstaddr, 1); - for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG; - i++) { + for (i = 0; i < NPML5EPG; i++) { if (i == PML5PML5I) { /* * Recursively map PML5 to itself in @@ -1964,6 +2005,10 @@ create_pagetables(vm_paddr_t *firstaddr) */ p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A | X86_PG_M | X86_PG_V | pg_nx; + } else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) { + /* Connect DMAP pml4 pages to PML5. */ + p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) | + X86_PG_RW | X86_PG_V | pg_nx; } else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) { p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A | X86_PG_M | X86_PG_V; @@ -1971,6 +2016,10 @@ create_pagetables(vm_paddr_t *firstaddr) p5_p[i] = 0; } } + } else { + /* Recursively map PML4 to itself in order to get PTmap */ + p4_p[PML4PML4I] = KPML4phys; + p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; } TSEXIT(); } @@ -2024,7 +2073,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) */ virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - (vm_paddr_t)kernphys); - virtual_end = VM_MAX_KERNEL_ADDRESS; + virtual_end = kva_layout.km_high; /* * Enable PG_G global pages, then switch to the kernel page @@ -2046,9 +2095,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. + * + * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after + * kva_layout is fixed. */ PMAP_LOCK_INIT(kernel_pmap); if (la57) { + kva_layout = kva_layout_la57; vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; PTmap = (vm_offset_t)P5Tmap; @@ -2059,6 +2112,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) kernel_pmap->pm_cr3 = KPML5phys; pmap_pt_page_count_adj(kernel_pmap, 1); /* top-level page */ } else { + kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; } @@ -2420,6 +2474,8 @@ pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; + pml4_entry_t *pml4e; + unsigned long lm_max; int error, i, ret, skz63; /* L1TF, reserve page @0 unconditionally */ @@ -2545,10 +2601,15 @@ pmap_init(void) lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); - if (lm_ents > LMEPML4I - LMSPML4I + 1) - lm_ents = LMEPML4I - LMSPML4I + 1; + lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4; + if (lm_ents > lm_max) { + printf( + "pmap: shrinking large map from requested %d slots to %ld slots\n", + lm_ents, lm_max); + lm_ents = lm_max; + } #ifdef KMSAN - if (lm_ents > KMSANORIGPML4I - LMSPML4I) { + if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) { printf( "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", lm_ents, KMSANORIGPML4I - LMSPML4I); @@ -2559,18 +2620,27 @@ pmap_init(void) printf("pmap: large map %u PML4 slots (%lu GB)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { - large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, + large_vmem = vmem_create("large", kva_layout.lm_low, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } + if (la57) { + for (i = 0; i < howmany((vm_offset_t)NBPML4 * + lm_ents, NBPML5); i++) { + m = pmap_large_map_getptp_unlocked(); + kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | + pg_nx | VM_PAGE_TO_PHYS(m); + } + } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - /* XXXKIB la57 */ - kernel_pml4[LMSPML4I + i] = X86_PG_V | - X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | - VM_PAGE_TO_PHYS(m); + pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low + + (u_long)i * NBPML4); + *pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | + pg_nx | VM_PAGE_TO_PHYS(m); } } } @@ -3899,7 +3969,7 @@ pmap_kextract(vm_offset_t va) pd_entry_t pde; vm_paddr_t pa; - if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); @@ -4040,7 +4110,7 @@ pmap_qremove(vm_offset_t sva, int count) * enough to one of those pmap_enter() calls for it to * be caught up in a promotion. */ - KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); + KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va)); KASSERT((*vtopde(va) & X86_PG_PS) == 0, ("pmap_qremove on promoted va %#lx", va)); @@ -4328,21 +4398,13 @@ void pmap_pinit_pml5(vm_page_t pml5pg) { pml5_entry_t *pm_pml5; + int i; pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); - - /* - * Add pml5 entry at top of KVA pointing to existing pml4 table, - * entering all existing kernel mappings into level 5 table. - */ - pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | - X86_PG_RW | X86_PG_A | X86_PG_M; - - /* - * Install self-referential address mapping entry. - */ - pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | - X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A; + for (i = 0; i < NPML5EPG / 2; i++) + pm_pml5[i] = 0; + for (; i < NPML5EPG; i++) + pm_pml5[i] = kernel_pmap->pm_pmltop[i]; } static void @@ -4899,8 +4961,8 @@ pmap_release(pmap_t pmap) m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); if (pmap_is_la57(pmap)) { - pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; - pmap->pm_pmltop[PML5PML5I] = 0; + for (i = NPML5EPG / 2; i < NPML5EPG; i++) + pmap->pm_pmltop[i] = 0; } else { for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pmltop[KPML4BASE + i] = 0; @@ -4942,7 +5004,7 @@ pmap_release(pmap_t pmap) static int kvm_size(SYSCTL_HANDLER_ARGS) { - unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; + unsigned long ksize = kva_layout.km_high - kva_layout.km_low; return sysctl_handle_long(oidp, &ksize, 0, req); } @@ -4953,7 +5015,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, static int kvm_free(SYSCTL_HANDLER_ARGS) { - unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; + unsigned long kfree = kva_layout.km_high - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } @@ -5031,7 +5093,7 @@ pmap_page_array_startup(long pages) vm_page_array_size = pages; - start = VM_MIN_KERNEL_ADDRESS; + start = kva_layout.km_low; end = start + pages * sizeof(struct vm_page); for (va = start; va < end; va += NBPDR) { pfn = first_page + (va - start) / sizeof(struct vm_page); @@ -6067,8 +6129,8 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * so the direct map region is the only part of the * kernel address space that must be handled here. */ - KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && - va < DMAP_MAX_ADDRESS), + KASSERT(!in_kernel || (va >= kva_layout.dmap_low && + va < kva_layout.dmap_high), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* @@ -6165,8 +6227,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void -pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - bool remove_pt) +pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; @@ -6174,12 +6235,8 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if (remove_pt) - mpte = pmap_remove_pt_page(pmap, va); - else - mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)); - if (mpte == NULL) - panic("pmap_remove_kernel_pde: Missing pt page."); + mpte = pmap_remove_pt_page(pmap, va); + KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page")); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; @@ -6209,7 +6266,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * pmap_remove_pde: do the things to unmap a superpage in a process */ static int -pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt, +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; @@ -6249,9 +6306,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt, pmap_delayed_invl_page(m); } } - if (pmap == kernel_pmap) { - pmap_remove_kernel_pde(pmap, pdq, sva, remove_pt); - } else { + if (pmap != kernel_pmap) { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(vm_page_any_valid(mpte), @@ -6262,6 +6317,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt, mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, false); } + } else if (demote_kpde) { + pmap_remove_kernel_pde(pmap, pdq, sva); + } else { + mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva)); + if (vm_page_any_valid(mpte)) { + mpte->valid = 0; + pmap_zero_page(mpte); + } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } @@ -7183,7 +7246,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); @@ -7512,6 +7575,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); + KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) != + PMAP_ENTER_NORECLAIM, + ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -7573,8 +7639,8 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, * the mapping is not from kernel_pmap, then * a reserved PT page could be freed. */ - (void)pmap_remove_pde(pmap, pde, va, - pmap != kernel_pmap, &free, lockp); + (void)pmap_remove_pde(pmap, pde, va, false, &free, + lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { @@ -7584,10 +7650,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, * before any changes to mappings are * made. Abort on failure. */ - mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - if (pmap_insert_pt_page(pmap, mt, false, false)) { - if (pdpg != NULL) - pdpg->ref_count--; + mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); + if (pmap_insert_pt_page(pmap, mt, false, + false)) { CTR1(KTR_PMAP, "pmap_enter_pde: cannot ins kern ptp va %#lx", va); @@ -7641,6 +7706,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); + else { + KASSERT(va >= VM_MAXUSER_ADDRESS && + (*pde & (PG_PS | PG_V)) == PG_V, + ("pmap_enter_pde: invalid kernel PDE")); + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt != NULL, + ("pmap_enter_pde: missing kernel PTP")); + } if (uwptpg != NULL) { mt = pmap_remove_pt_page(pmap, va); KASSERT(mt == uwptpg, @@ -9550,7 +9623,7 @@ pmap_unmapdev(void *p, vm_size_t size) va = (vm_offset_t)p; /* If we gave a direct map region in pmap_mapdev, do nothing */ - if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) + if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) return; offset = va & PAGE_MASK; size = round_page(offset + size); @@ -9649,6 +9722,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pat_mode == ma) + return; m->md.pat_mode = ma; @@ -9668,6 +9743,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) { int error; + if (m->md.pat_mode == ma) + return; + m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) @@ -9724,7 +9802,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) int error; /* Only supported within the kernel map. */ - if (va < VM_MIN_KERNEL_ADDRESS) + if (va < kva_layout.km_low) return (EINVAL); PMAP_LOCK(kernel_pmap); @@ -9755,7 +9833,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ - if (base < DMAP_MIN_ADDRESS) + if (base < kva_layout.dmap_low) return (EINVAL); /* @@ -9778,7 +9856,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pte_bits |= X86_PG_RW; } if ((prot & VM_PROT_EXECUTE) == 0 || - va < VM_MIN_KERNEL_ADDRESS) { + va < kva_layout.km_low) { pde_bits |= pg_nx; pte_bits |= pg_nx; } @@ -9874,7 +9952,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pdpe, pde_bits, pde_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ @@ -9904,7 +9982,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pde, pde_bits, pde_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ @@ -9932,7 +10010,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, pmap_pte_props(pte, pte_bits, pte_mask); changed = true; } - if (tmpva >= VM_MIN_KERNEL_ADDRESS && + if (tmpva >= kva_layout.km_low && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ @@ -10699,19 +10777,28 @@ pmap_large_map_getptp(void) static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { + pml4_entry_t *pml4; vm_pindex_t pml4_idx; vm_paddr_t mphys; - pml4_idx = pmap_pml4e_index(va); - KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, - ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " - "%#jx lm_ents %d", - (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, - ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " - "LMSPML4I %#jx lm_ents %d", - (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - mphys = kernel_pml4[pml4_idx] & PG_FRAME; + KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low + + (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va)); + if (la57) { + pml4 = pmap_pml4e(kernel_pmap, va); + mphys = *pml4 & PG_FRAME; + } else { + pml4_idx = pmap_pml4e_index(va); + + KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, + ("pmap_large_map_pdpe: va %#jx out of range idx %#jx " + "LMSPML4I %#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, + ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " + "LMSPML4I %#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + mphys = kernel_pml4[pml4_idx] & PG_FRAME; + } return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } @@ -10904,8 +10991,8 @@ pmap_large_unmap(void *svaa, vm_size_t len) struct spglist spgf; sva = (vm_offset_t)svaa; - if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && - sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) + if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low && + sva + len < kva_layout.dmap_high)) return; SLIST_INIT(&spgf); @@ -11151,11 +11238,10 @@ pmap_large_map_wb(void *svap, vm_size_t len) sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); - if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { + if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) { pmap_large_map_flush_range(sva, len); } else { - KASSERT(sva >= LARGEMAP_MIN_ADDRESS && - eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, + KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } @@ -11196,8 +11282,8 @@ pmap_pti_init(void) VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); - for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && - va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { + for (va = kva_layout.km_low; va <= kva_layout.km_high && + va >= kva_layout.km_low && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } @@ -12081,10 +12167,12 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: - sbuf_printf(sb, "\nRecursive map:\n"); + if (!la57) + sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: - sbuf_printf(sb, "\nDirect map:\n"); + if (!la57) + sbuf_printf(sb, "\nDirect map:\n"); break; #ifdef KASAN case KASANPML4I: @@ -12103,7 +12191,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: - sbuf_printf(sb, "\nLarge map:\n"); + if (!la57) + sbuf_printf(sb, "\nLarge map:\n"); break; } diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 09ac0a67dbef..eefddad2f142 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -769,7 +769,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode) return (-1); } } - if (eva >= VM_MIN_KERNEL_ADDRESS) { + if (eva >= kva_layout.km_low) { /* * Don't allow user-mode faults in kernel address space. */ diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index 8db314fa034d..1bbb302259d6 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -146,8 +146,9 @@ #define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT) -#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \ - || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS)) +#define INKERNEL(va) \ + (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \ + ((va) >= kva_layout.km_low && (va) < kva_layout.km_high)) #ifdef SMP #define SC_TABLESIZE 1024 /* Must be power of 2. */ diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 7d3e91bcd9b9..a0ca97f2d5a0 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -169,11 +169,12 @@ * the recursive page table map. */ #define NDMPML4E 8 +#define NDMPML5E 32 /* - * These values control the layout of virtual memory. The starting address - * of the direct map, which is controlled by DMPML4I, must be a multiple of - * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * These values control the layout of virtual memory. The starting + * address of the direct map is controlled by DMPML4I on LA48 and + * DMPML5I on LA57. * * Note: KPML4I is the index of the (single) level 4 page that maps * the KVA that holds KERNBASE, while KPML4BASE is the index of the @@ -191,6 +192,7 @@ #define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ #define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ +#define DMPML5I (NPML5EPG / 2 + 1) #define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ @@ -200,9 +202,14 @@ #define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E) #define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E) -/* Large map: index of the first and max last pml4 entry */ +/* + * Large map: index of the first and max last pml4/la48 and pml5/la57 + * entry. + */ #define LMSPML4I (PML4PML4I + 1) #define LMEPML4I (KASANPML4I - 1) +#define LMSPML5I (DMPML5I + NDMPML5E) +#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */ /* * XXX doesn't really belong here I guess... @@ -548,6 +555,18 @@ pmap_pml5e_index(vm_offset_t va) return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1)); } +struct kva_layout_s { + vm_offset_t kva_min; + vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */ + vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */ + vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */ + vm_offset_t lm_high; /* LARGEMAP_MAX_ADDRESS */ + vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */ + vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */ + vm_offset_t rec_pt; +}; +extern struct kva_layout_s kva_layout; + #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 0cd9bb4fa7a4..ef352e776af6 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -163,6 +163,7 @@ * Virtual addresses of things. Derived from the page directory and * page table indexes from pmap.h for precision. * + * LA48: * 0x0000000000000000 - 0x00007fffffffffff user map * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole) * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot) @@ -175,18 +176,29 @@ * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map * + * LA57: + * 0x0000000000000000 - 0x00ffffffffffffff user map + * 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole) + * 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot) + * 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots) + * 0xff21000000000000 - 0xff40ffffffffffff large map + * 0xff41000000000000 - 0xffff7fffffffffff unused + * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry) + * 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional + * 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional + * 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused + * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional + * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map + * * Within the kernel map: * * 0xfffffe0000000000 vm_page_array * 0xffffffff80000000 KERNBASE */ -#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0) -#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \ - NPDPEPG-1, NPDEPG-1, NPTEPG-1) - -#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0) -#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS_LA48 KV4ADDR(KPML4BASE, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low +#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high #define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0) #define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0) @@ -199,9 +211,6 @@ #define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \ 0, 0, 0) -#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) -#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) - /* * Formally kernel mapping starts at KERNBASE, but kernel linker * script leaves first PDE reserved. For legacy BIOS boot, kernel is @@ -239,21 +248,21 @@ * vt fb startup needs to be reworked. */ #define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit) -#define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \ - (va) < (DMAP_MIN_ADDRESS + dmaplimit)) +#define VIRT_IN_DMAP(va) \ + ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit) #define PMAP_HAS_DMAP 1 -#define PHYS_TO_DMAP(x) ({ \ +#define PHYS_TO_DMAP(x) __extension__ ({ \ KASSERT(PHYS_IN_DMAP(x), \ ("physical address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) | DMAP_MIN_ADDRESS; }) + (x) + kva_layout.dmap_low; }) -#define DMAP_TO_PHYS(x) ({ \ +#define DMAP_TO_PHYS(x) __extension__ ({ \ KASSERT(VIRT_IN_DMAP(x), \ ("virtual address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) & ~DMAP_MIN_ADDRESS; }) + (x) - kva_layout.dmap_low; }) /* * amd64 maps the page array into KVA so that it can be more easily @@ -274,7 +283,7 @@ */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \ - VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5) + kva_layout.km_low + 1) * 3 / 5) #endif /* initial pagein size of beginning of executable file */ diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c new file mode 100644 index 000000000000..c7b75767680a --- /dev/null +++ b/sys/amd64/pt/pt.c @@ -0,0 +1,978 @@ +/* + * Copyright (c) 2025 Bojan Novkoviฤ <bnovkov@freebsd.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * hwt(4) Intel Processor Trace (PT) backend + * + * Driver Design Overview + * + * - Since PT is configured on a per-core basis, the driver uses + * 'smp_rendezvous' to start and disable tracing on each target core. + * - PT-specific resources are stored in a 'struct pt_ctx' context structure for + * each traced CPU core or thread. Upon initialization, a ToPA configuration + * is generated for each 'pt_ctx' structure using the HWT tracing buffers. + * The HWT tracing buffer is split into 4K ToPA entries. Currently, each + * 4K ToPA entry is configured to trigger an interrupt after it is filled. + * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all + * relevant PT registers. Every time a traced thread is switched + * out or in, its state will be saved to or loaded from its corresponding + * 'pt_ctx' context. + * - When tracing starts, the PT hardware will start writing data into the + * tracing buffer. When a TOPA_INT entry is filled, it will trigger an + * interrupt before continuing. The interrupt handler will then fetch the + * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record. + * The driver is currently configured to use the NMI interrupt line. + * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records + * and uses the offsets to decode data from the tracing buffer. + * + * Future improvements and limitations + * + * - We currently configure the PT hardware to trigger an interrupt whenever + * a 4K ToPA entry is filled. While this is fine when tracing smaller + * functions or infrequent code paths, this will generate too much interrupt + * traffic when tracing hotter functions. A proper solution for this issue + * should estimate the amount of data generated by the current configuration + * and use it to determine interrupt frequency. + * + * - Support for more tracing options and PT features. + * + */ + +#include <sys/systm.h> +#include <sys/hwt.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sdt.h> +#include <sys/smp.h> +#include <sys/taskqueue.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <machine/fpu.h> +#include <machine/smp.h> +#include <machine/specialreg.h> + +#include <x86/apicvar.h> +#include <x86/x86_var.h> + +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_vm.h> +#include <dev/hwt/hwt_backend.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_cpu.h> +#include <dev/hwt/hwt_record.h> +#include <dev/hwt/hwt_thread.h> + +#include <amd64/pt/pt.h> + +#ifdef PT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif +#define PT_SUPPORTED_FLAGS \ + (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \ + RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN) +#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE) +#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT) +#define PT_MAX_IP_RANGES 2 + +#define PT_TOPA_MASK_PTRS 0x7f +#define PT_TOPA_PAGE_MASK 0xffffff80 +#define PT_TOPA_PAGE_SHIFT 7 + +#define CPUID_PT_LEAF 0x14 + +MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); + +SDT_PROVIDER_DEFINE(pt); +SDT_PROBE_DEFINE(pt, , , topa__intr); + +TASKQUEUE_FAST_DEFINE_THREAD(pt); + +static void pt_send_buffer_record(void *arg, int pending __unused); +static int pt_topa_intr(struct trapframe *tf); + +/* + * Intel Processor Trace XSAVE-managed state. + */ +struct pt_ext_area { + uint64_t rtit_ctl; + uint64_t rtit_output_base; + uint64_t rtit_output_mask_ptrs; + uint64_t rtit_status; + uint64_t rtit_cr3_match; + uint64_t rtit_addr0_a; + uint64_t rtit_addr0_b; + uint64_t rtit_addr1_a; + uint64_t rtit_addr1_b; +}; + +struct pt_buffer { + uint64_t *topa_hw; /* ToPA table entries. */ + size_t size; + struct mtx lock; /* Lock for fields below. */ + vm_offset_t offset; + uint64_t wrap_count; + int curpage; +}; + +struct pt_ctx { + int id; + struct pt_buffer buf; /* ToPA buffer metadata */ + struct task task; /* ToPA buffer notification task */ + struct hwt_context *hwt_ctx; + uint8_t *save_area; /* PT XSAVE area */ +}; +/* PT tracing contexts used for CPU mode. */ +static struct pt_ctx *pt_pcpu_ctx; + +enum pt_cpu_state { + PT_DISABLED = 0, + PT_STOPPED, + PT_ACTIVE +}; + +static struct pt_cpu { + struct pt_ctx *ctx; /* active PT tracing context */ + enum pt_cpu_state state; /* used as part of trace stop protocol */ +} *pt_pcpu; + +/* + * PT-related CPUID bits. + */ +static struct pt_cpu_info { + uint32_t l0_eax; + uint32_t l0_ebx; + uint32_t l0_ecx; + uint32_t l1_eax; + uint32_t l1_ebx; + size_t xsave_area_size; + size_t xstate_hdr_offset; + size_t pt_xsave_offset; +} pt_info __read_mostly; + +static bool initialized = false; +static int cpu_mode_ctr = 0; + +static __inline enum pt_cpu_state +pt_cpu_get_state(int cpu_id) +{ + return (atomic_load_int(&pt_pcpu[cpu_id].state)); +} + +static __inline void +pt_cpu_set_state(int cpu_id, enum pt_cpu_state state) +{ + atomic_store_int(&pt_pcpu[cpu_id].state, state); +} + +static __inline struct xstate_hdr * +pt_ctx_get_xstate_hdr(struct pt_ctx *ctx) +{ + return ((struct xstate_hdr *)(ctx->save_area + + pt_info.xstate_hdr_offset)); +} + + +static __inline struct pt_ext_area * +pt_ctx_get_ext_area(struct pt_ctx *ctx) +{ + return ((struct pt_ext_area *)(ctx->save_area + + pt_info.pt_xsave_offset)); +} + +/* + * Updates current trace buffer offset from the + * ToPA MSRs. Records if the trace buffer wrapped. + */ +static __inline void +pt_update_buffer(struct pt_buffer *buf) +{ + uint64_t reg; + int curpage; + + /* Update buffer offset. */ + reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); + curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; + mtx_lock_spin(&buf->lock); + /* Check if the output wrapped. */ + if (buf->curpage > curpage) + buf->wrap_count++; + buf->curpage = curpage; + buf->offset = reg >> 32; + mtx_unlock_spin(&buf->lock); + + dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, + buf->wrap_count, buf->curpage, buf->offset); +} + +static __inline void +pt_fill_buffer_record(int id, struct pt_buffer *buf, + struct hwt_record_entry *rec) +{ + rec->record_type = HWT_RECORD_BUFFER; + rec->buf_id = id; + rec->curpage = buf->curpage; + rec->offset = buf->offset + (buf->wrap_count * buf->size); +} + +/* + * Enables or disables tracing on curcpu + * using the XSAVE/XRSTOR PT extensions. + */ +static void +pt_cpu_toggle_local(uint8_t *save_area, bool enable) +{ + u_long xcr0, cr0; + u_long xss; + + cr0 = rcr0(); + if (cr0 & CR0_TS) + clts(); + xcr0 = rxcr(XCR0); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0 | PT_XSAVE_MASK); + xss = rdmsr(MSR_IA32_XSS); + wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT); + + if (!enable) { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0, + ("%s: PT is disabled", __func__)); + xsaves(save_area, XFEATURE_ENABLED_PT); + } else { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0, + ("%s: PT is enabled", __func__)); + xrstors(save_area, XFEATURE_ENABLED_PT); + } + wrmsr(MSR_IA32_XSS, xss); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0); + if (cr0 & CR0_TS) + load_cr0(cr0); +} + +/* + * Starts PT tracing on 'curcpu'. + */ +static void +pt_cpu_start(void *dummy) +{ + struct pt_cpu *cpu; + + cpu = &pt_pcpu[curcpu]; + MPASS(cpu->ctx != NULL); + + dprintf("%s: curcpu %d\n", __func__, curcpu); + load_cr4(rcr4() | CR4_XSAVE); + wrmsr(MSR_IA32_RTIT_STATUS, 0); + pt_cpu_set_state(curcpu, PT_ACTIVE); + pt_cpu_toggle_local(cpu->ctx->save_area, true); +} + +/* + * Stops PT tracing on 'curcpu'. + * Updates trace buffer offset to ensure + * any data generated between the last interrupt + * and the trace stop gets picked up by userspace. + */ +static void +pt_cpu_stop(void *dummy) +{ + struct pt_cpu *cpu; + struct pt_ctx *ctx; + + /* Shutdown may occur before PT gets properly configured. */ + if (pt_cpu_get_state(curcpu) == PT_DISABLED) + return; + + cpu = &pt_pcpu[curcpu]; + ctx = cpu->ctx; + MPASS(ctx != NULL); + dprintf("%s: curcpu %d\n", __func__, curcpu); + + pt_cpu_set_state(curcpu, PT_STOPPED); + pt_cpu_toggle_local(cpu->ctx->save_area, false); + pt_update_buffer(&ctx->buf); +} + +/* + * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'. + * The HWT trace buffer is split into 4K ToPA table entries and used + * as a circular buffer, meaning that the last ToPA entry points to + * the first ToPA entry. Each entry is configured to raise an + * interrupt after being filled. + */ +static int +pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm) +{ + struct pt_buffer *buf; + size_t topa_size; + int i; + + topa_size = TOPA_SIZE_4K; + buf = &ctx->buf; + + KASSERT(buf->topa_hw == NULL, + ("%s: ToPA info already exists", __func__)); + buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT, + M_ZERO | M_WAITOK); + dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw); + buf->size = vm->npages * PAGE_SIZE; + for (i = 0; i < vm->npages; i++) { + buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size; + /* + * XXX: TOPA_INT should ideally be set according to + * expected amount of incoming trace data. Too few TOPA_INT + * entries will not trigger interrupts often enough when tracing + * smaller functions. + */ + buf->topa_hw[i] |= TOPA_INT; + } + buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END; + + return (0); +} + +/* + * Configures IP filtering for trace generation. + * A maximum of 2 ranges can be specified due to + * limitations imposed by the XSAVE/XRSTOR PT extensions. + */ +static int +pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg) +{ + struct pt_ext_area *pt_ext; + int nranges_supp, n, error = 0; + + pt_ext = pt_ctx_get_ext_area(ctx); + if (pt_info.l0_ebx & CPUPT_IPF) { + nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >> + CPUPT_NADDR_S; + + if (nranges_supp > PT_IP_FILTER_MAX_RANGES) + nranges_supp = PT_IP_FILTER_MAX_RANGES; + n = cfg->nranges; + if (n > nranges_supp) { + printf("%s: %d IP filtering ranges requested, CPU " + "supports %d, truncating\n", + __func__, n, nranges_supp); + n = nranges_supp; + } + + switch (n) { + case 2: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1)); + pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start; + pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end; + case 1: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0)); + pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start; + pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end; + break; + default: + error = (EINVAL); + break; + }; + } else + error = (ENXIO); + + return (error); +} + +static int +pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) +{ + + dprintf("%s: ctx id %d\n", __func__, ctx_id); + + KASSERT(pt_ctx->buf.topa_hw == NULL, + ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx)); + + memset(pt_ctx, 0, sizeof(struct pt_ctx)); + mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN); + pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64, + M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx->save_area == NULL) + return (ENOMEM); + dprintf("%s: preparing ToPA buffer\n", __func__); + if (pt_topa_prepare(pt_ctx, vm) != 0) { + dprintf("%s: failed to prepare ToPA buffer\n", __func__); + free(pt_ctx->save_area, M_PT); + return (ENOMEM); + } + + pt_ctx->id = ctx_id; + TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); + + return (0); +} + +static void +pt_deinit_ctx(struct pt_ctx *pt_ctx) +{ + + if (pt_ctx->buf.topa_hw != NULL) + free(pt_ctx->buf.topa_hw, M_PT); + if (pt_ctx->save_area != NULL) + free(pt_ctx->save_area, M_PT); + memset(pt_ctx, 0, sizeof(*pt_ctx)); + pt_ctx->buf.topa_hw = NULL; +} + +/* + * HWT backend configuration method. + * + * Checks and translates the user-defined configuration to a + * set of PT tracing features. Uses the feature set to initialize + * the tracing context for the target CPU or thread. + */ +static int +pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) +{ + struct hwt_cpu *hwt_cpu; + struct hwt_thread *thr; + struct pt_ctx *pt_ctx; + struct pt_cpu_config *cfg; + struct pt_ext_area *pt_ext; + struct xstate_hdr *hdr; + int error; + + dprintf("%s\n", __func__); + + cfg = (struct pt_cpu_config *)ctx->config; + pt_ctx = NULL; + + /* Clear any flags we don't support yet. */ + cfg->rtit_ctl &= PT_SUPPORTED_FLAGS; + if (cfg->rtit_ctl & RTIT_CTL_MTCEN) { + if ((pt_info.l0_ebx & CPUPT_MTC) == 0) { + printf("%s: CPU does not support generating MTC " + "packets\n", __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) { + if ((pt_info.l0_ebx & CPUPT_CR3) == 0) { + printf("%s: CPU does not support CR3 filtering\n", + __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) { + if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) { + printf("%s: CPU does not support TNT\n", __func__); + return (ENXIO); + } + } + /* TODO: support for more config bits. */ + + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + if (hwt_cpu->cpu_id != cpu_id) + continue; + pt_ctx = &pt_pcpu_ctx[cpu_id]; + break; + } + } else { + TAILQ_FOREACH(thr, &ctx->threads, next) { + if (thr->thread_id != thread_id) + continue; + KASSERT(thr->private != NULL, + ("%s: hwt thread private" + " not set, thr %p", + __func__, thr)); + pt_ctx = (struct pt_ctx *)thr->private; + break; + } + } + if (pt_ctx == NULL) + return (ENOENT); + + dprintf("%s: preparing MSRs\n", __func__); + pt_ext = pt_ctx_get_ext_area(pt_ctx); + hdr = pt_ctx_get_xstate_hdr(pt_ctx); + + pt_ext->rtit_ctl |= cfg->rtit_ctl; + if (cfg->nranges != 0) { + dprintf("%s: preparing IPF ranges\n", __func__); + if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0) + return (error); + } + pt_ctx->hwt_ctx = ctx; + pt_ext->rtit_ctl |= RTIT_CTL_TOPA; + pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw); + pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS; + hdr->xstate_bv = XFEATURE_ENABLED_PT; + hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT | + XSTATE_XCOMP_BV_COMPACT; + pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; + pt_pcpu[cpu_id].ctx = pt_ctx; + pt_cpu_set_state(cpu_id, PT_STOPPED); + + return (0); +} + +/* + * hwt backend trace start operation. CPU affine. + */ +static void +pt_backend_enable(struct hwt_context *ctx, int cpu_id) +{ + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to start PT on another cpu", __func__)); + pt_cpu_start(NULL); + CPU_SET(cpu_id, &ctx->cpu_map); +} + +/* + * hwt backend trace stop operation. CPU affine. + */ +static void +pt_backend_disable(struct hwt_context *ctx, int cpu_id) +{ + struct pt_cpu *cpu; + + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to disable PT on another cpu", __func__)); + pt_cpu_stop(NULL); + CPU_CLR(cpu_id, &ctx->cpu_map); + cpu = &pt_pcpu[cpu_id]; + cpu->ctx = NULL; +} + +/* + * hwt backend trace start operation for remote CPUs. + */ +static int +pt_backend_enable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 1) != 0) + return (-1); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); + + return (0); +} + +/* + * hwt backend trace stop operation for remote CPUs. + */ +static int +pt_backend_disable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 0) == 0) + return (-1); + + if (CPU_EMPTY(&ctx->cpu_map)) { + dprintf("%s: empty cpu map\n", __func__); + return (-1); + } + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); + + return (0); +} + +/* + * HWT backend initialization method. + * + * Installs the ToPA interrupt handler and initializes + * the tracing contexts used for HWT_MODE_CPU. + */ +static int +pt_backend_init(struct hwt_context *ctx) +{ + struct hwt_cpu *hwt_cpu; + int error; + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], + hwt_cpu->vm, hwt_cpu->cpu_id); + if (error) + return (error); + } + } + + return (0); +} + +/* + * HWT backend teardown method. + * + * Removes the ToPA interrupt handler, stops tracing on all active CPUs, + * and releases all previously allocated ToPA metadata. + */ +static int +pt_backend_deinit(struct hwt_context *ctx) +{ + struct pt_ctx *pt_ctx; + struct hwt_thread *thr; + int cpu_id; + + dprintf("%s\n", __func__); + + pt_backend_disable_smp(ctx); + if (ctx->mode == HWT_MODE_THREAD) { + TAILQ_FOREACH(thr, &ctx->threads, next) { + KASSERT(thr->private != NULL, + ("%s: thr->private not set", __func__)); + pt_ctx = (struct pt_ctx *)thr->private; + pt_deinit_ctx(pt_ctx); + } + } else { + CPU_FOREACH(cpu_id) { + if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + continue; + if (pt_pcpu[cpu_id].ctx != NULL) { + KASSERT(pt_pcpu[cpu_id].ctx == + &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_pcpu[cpu_id].ctx = NULL; + } + pt_ctx = &pt_pcpu_ctx[cpu_id]; + pt_deinit_ctx(pt_ctx); + memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + } + } + + return (0); +} + +/* + * Fetches current offset into the tracing buffer. + */ +static int +pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, + uint64_t *data) +{ + struct pt_buffer *buf; + + if (vm->ctx->mode == HWT_MODE_THREAD) + buf = &((struct pt_ctx *)vm->thr->private)->buf; + else + buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; + mtx_lock_spin(&buf->lock); + *curpage = buf->curpage; + *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); + mtx_unlock_spin(&buf->lock); + + return (0); +} + +/* + * HWT thread creation hook. + * Allocates and associates a 'struct pt_ctx' for a given hwt thread. + */ +static int +pt_backend_alloc_thread(struct hwt_thread *thr) +{ + struct pt_ctx *pt_ctx; + int error; + + /* Omit M_WAITOK since this might get invoked a non-sleepable context */ + pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx == NULL) + return (ENOMEM); + + error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id); + if (error) + return (error); + + thr->private = pt_ctx; + return (0); +} +/* + * HWT thread teardown hook. + */ +static void +pt_backend_free_thread(struct hwt_thread *thr) +{ + struct pt_ctx *ctx; + + ctx = (struct pt_ctx *)thr->private; + + pt_deinit_ctx(ctx); + free(ctx, M_PT); +} + +static void +pt_backend_dump(int cpu_id) +{ +} + +static struct hwt_backend_ops pt_ops = { + .hwt_backend_init = pt_backend_init, + .hwt_backend_deinit = pt_backend_deinit, + + .hwt_backend_configure = pt_backend_configure, + + .hwt_backend_enable = pt_backend_enable, + .hwt_backend_disable = pt_backend_disable, + +#ifdef SMP + .hwt_backend_enable_smp = pt_backend_enable_smp, + .hwt_backend_disable_smp = pt_backend_disable_smp, +#endif + + .hwt_backend_read = pt_backend_read, + .hwt_backend_dump = pt_backend_dump, + + .hwt_backend_thread_alloc = pt_backend_alloc_thread, + .hwt_backend_thread_free = pt_backend_free_thread, +}; + +static struct hwt_backend backend = { + .ops = &pt_ops, + .name = "pt", + .kva_req = 1, +}; + +/* + * Reads the latest valid trace buffer offset and enqueues + * a HWT_RECORD_BUFFER record. + * Used as a taskqueue routine from the ToPA interrupt handler. + */ +static void +pt_send_buffer_record(void *arg, int pending __unused) +{ + struct hwt_record_entry record; + struct pt_ctx *ctx = (struct pt_ctx *)arg; + + /* Prepare buffer record. */ + mtx_lock_spin(&ctx->buf.lock); + pt_fill_buffer_record(ctx->id, &ctx->buf, &record); + mtx_unlock_spin(&ctx->buf.lock); + hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); +} +static void +pt_topa_status_clear(void) +{ + uint64_t reg; + + reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET); + reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI; + reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI; + wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg); +} + +/* + * ToPA PMI handler. + * + * Invoked every time a ToPA entry marked with TOPA_INT is filled. + * Uses taskqueue to enqueue a buffer record for userspace. + * Re-enables the PC interrupt line as long as tracing is active. + */ +static int +pt_topa_intr(struct trapframe *tf) +{ + struct pt_buffer *buf; + struct pt_ctx *ctx; + uint64_t reg; + + SDT_PROBE0(pt, , , topa__intr); + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { + return (0); + } + reg = rdmsr(MSR_IA_GLOBAL_STATUS); + if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { + /* ACK spurious or leftover interrupt. */ + pt_topa_status_clear(); + return (1); + } + + ctx = pt_pcpu[curcpu].ctx; + buf = &ctx->buf; + KASSERT(buf->topa_hw != NULL, + ("%s: ToPA PMI interrupt with invalid buffer", __func__)); + + pt_cpu_toggle_local(ctx->save_area, false); + pt_update_buffer(buf); + pt_topa_status_clear(); + taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, + TASKQUEUE_FAIL_IF_PENDING); + + if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + pt_cpu_toggle_local(ctx->save_area, true); + lapic_reenable_pcint(); + } + return (1); +} + +/* + * Module initialization. + * + * Saves all PT-related cpuid info, registers itself as a HWT backend, + * and allocates metadata required to keep track of tracing operations + * on each CPU. + */ +static int +pt_init(void) +{ + u_int cp[4]; + int error; + + dprintf("pt: Enumerating part 1\n"); + cpuid_count(CPUID_PT_LEAF, 0, cp); + dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + dprintf("pt: ecx %x\n", cp[2]); + + pt_info.l0_eax = cp[0]; + pt_info.l0_ebx = cp[1]; + pt_info.l0_ecx = cp[2]; + + dprintf("pt: Enumerating part 2\n"); + cpuid_count(CPUID_PT_LEAF, 1, cp); + dprintf("pt: eax %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + + pt_info.l1_eax = cp[0]; + pt_info.l1_ebx = cp[1]; + + error = hwt_backend_register(&backend); + if (error != 0) { + printf("pt: unable to register hwt backend, error %d\n", error); + return (error); + } + pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT, + M_ZERO | M_WAITOK); + pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, + M_ZERO | M_WAITOK); + + nmi_register_handler(pt_topa_intr); + if (!lapic_enable_pcint()) { + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; + printf("pt: failed to setup interrupt line\n"); + return (error); + } + initialized = true; + + return (0); +} + +/* + * Checks whether the CPU support Intel PT and + * initializes XSAVE area info. + * + * The driver relies on XSAVE/XRSTOR PT extensions, + * Table of Physical Addresses (ToPA) support, and + * support for multiple ToPA entries. + */ +static bool +pt_supported(void) +{ + u_int cp[4]; + + if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) { + printf("pt: CPU does not support Intel Processor Trace\n"); + return (false); + } + if ((cpu_feature2 & CPUID2_XSAVE) == 0) { + printf("pt: XSAVE is not supported\n"); + return (false); + } + if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) { + printf("pt: CPU does not support managing PT state using XSAVE\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) { + printf("pt: XSAVE compaction is not supported\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) { + printf("pt: CPU does not support XSAVES/XRSTORS\n"); + return (false); + } + + /* Require ToPA support. */ + cpuid_count(CPUID_PT_LEAF, 0, cp); + if ((cp[2] & CPUPT_TOPA) == 0) { + printf("pt: ToPA is not supported\n"); + return (false); + } + if ((cp[2] & CPUPT_TOPA_MULTI) == 0) { + printf("pt: multiple ToPA outputs are not supported\n"); + return (false); + } + + pt_info.xstate_hdr_offset = xsave_area_hdr_offset(); + pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true); + pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV, + XFEATURE_ENABLED_PT, true, true); + + return (true); +} + +static void +pt_deinit(void) +{ + if (!initialized) + return; + nmi_remove_handler(pt_topa_intr); + lapic_disable_pcint(); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + initialized = false; +} + +static int +pt_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + if (!pt_supported() || pt_init() != 0) { + return (ENXIO); + } + break; + case MOD_UNLOAD: + pt_deinit(); + break; + default: + break; + } + + return (0); +} + +static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL }; + +DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); +MODULE_DEPEND(intel_pt, hwt, 1, 1, 1); +MODULE_VERSION(intel_pt, 1); diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h new file mode 100644 index 000000000000..2423afdf22e9 --- /dev/null +++ b/sys/amd64/pt/pt.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Bojan Novkoviฤ <bnovkov@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_PT_PT_H_ +#define _AMD64_PT_PT_H_ + +#include <sys/types.h> + +#include <x86/include/specialreg.h> + +#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */ + +struct pt_cpu_config { + uint64_t rtit_ctl; + register_t cr3_filter; + int nranges; + struct ipf_range { + vm_offset_t start; + vm_offset_t end; + } ip_ranges[PT_IP_FILTER_MAX_RANGES]; + uint32_t mtc_freq; + uint32_t cyc_thresh; + uint32_t psb_freq; +}; +#endif /* !_AMD64_PT_PT_H_ */ diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index f393f160b101..130130b64541 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -32,12 +32,6 @@ #include "vmx_assym.h" -#ifdef SMP -#define LK lock ; -#else -#define LK -#endif - /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 92eb0589f80b..78883296c5b7 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -5767,7 +5767,7 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); - if ((m->flags & PG_FICTITIOUS) != 0) + if (ma == oma || (m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* @@ -5784,22 +5784,20 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ - if (ma != oma) { - pa = VM_PAGE_TO_PHYS(m); - sched_pin(); - pc = get_pcpu(); - cmap2_pte2p = pc->pc_cmap2_pte2p; - mtx_lock(&pc->pc_cmap_lock); - if (pte2_load(cmap2_pte2p) != 0) - panic("%s: CMAP2 busy", __func__); - pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, - vm_memattr_to_pte2(ma))); - dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); - pte2_clear(cmap2_pte2p); - tlb_flush((vm_offset_t)pc->pc_cmap2_addr); - sched_unpin(); - mtx_unlock(&pc->pc_cmap_lock); - } + pa = VM_PAGE_TO_PHYS(m); + sched_pin(); + pc = get_pcpu(); + cmap2_pte2p = pc->pc_cmap2_pte2p; + mtx_lock(&pc->pc_cmap_lock); + if (pte2_load(cmap2_pte2p) != 0) + panic("%s: CMAP2 busy", __func__); + pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, + vm_memattr_to_pte2(ma))); + dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); + pte2_clear(cmap2_pte2p); + tlb_flush((vm_offset_t)pc->pc_cmap2_addr); + sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } /* diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index d2e56a270f54..459cc8ebe505 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -497,7 +497,8 @@ static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, - pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); + pd_entry_t l1e, bool demote_kl2e, struct spglist *free, + struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, @@ -3847,8 +3848,7 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); - if (ml3 == NULL) - panic("pmap_remove_kernel_l2: Missing pt page"); + KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page")); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; @@ -3873,8 +3873,8 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int -pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, - pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) +pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, + bool demote_kl2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; @@ -3910,9 +3910,7 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, vm_page_aflag_clear(mt, PGA_WRITEABLE); } } - if (pmap == kernel_pmap) { - pmap_remove_kernel_l2(pmap, l2, sva); - } else { + if (pmap != kernel_pmap) { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(vm_page_any_valid(ml3), @@ -3923,6 +3921,14 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, free, false); } + } else if (demote_kl2e) { + pmap_remove_kernel_l2(pmap, l2, sva); + } else { + ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva)); + if (vm_page_any_valid(ml3)) { + ml3->valid = 0; + pmap_zero_page(ml3); + } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } @@ -4232,7 +4238,7 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), - &free, &lock); + true, &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) @@ -5703,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); + KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) != + PMAP_ENTER_NORECLAIM, + ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE")); if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { @@ -5747,33 +5756,51 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, } } SLIST_INIT(&free); - if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) + if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { (void)pmap_remove_l2(pmap, l2, va, - pmap_load(pmap_l1(pmap, va)), &free, lockp); - else + pmap_load(pmap_l1(pmap, va)), false, &free, lockp); + } else { + if (ADDR_IS_KERNEL(va)) { + /* + * Try to save the ptp in the trie + * before any changes to mappings are + * made. Abort on failure. + */ + mt = PTE_TO_VM_PAGE(old_l2); + if (pmap_insert_pt_page(pmap, mt, false, + false)) { + CTR1(KTR_PMAP, + "pmap_enter_l2: cannot ins kern ptp va %#lx", + va); + return (KERN_RESOURCE_SHORTAGE); + } + /* + * Both pmap_remove_l2() and + * pmap_remove_l3_range() will zero fill + * the L3 kernel page table page. + */ + } pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); + if (ADDR_IS_KERNEL(va)) { + /* + * The TLB could have an intermediate + * entry for the L3 kernel page table + * page, so request an invalidation at + * all levels after clearing the + * L2_TABLE entry. + */ + pmap_clear(l2); + pmap_s1_invalidate_page(pmap, va, false); + } + } + KASSERT(pmap_load(l2) == 0, + ("pmap_enter_l2: non-zero L2 entry %p", l2)); if (!ADDR_IS_KERNEL(va)) { vm_page_free_pages_toq(&free, true); - KASSERT(pmap_load(l2) == 0, - ("pmap_enter_l2: non-zero L2 entry %p", l2)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_l2: freed kernel page table page")); - - /* - * Both pmap_remove_l2() and pmap_remove_l3_range() - * will leave the kernel page table page zero filled. - * Nonetheless, the TLB could have an intermediate - * entry for the kernel page table page, so request - * an invalidation at all levels after clearing - * the L2_TABLE entry. - */ - mt = PTE_TO_VM_PAGE(pmap_load(l2)); - if (pmap_insert_pt_page(pmap, mt, false, false)) - panic("pmap_enter_l2: trie insert failed"); - pmap_clear(l2); - pmap_s1_invalidate_page(pmap, va, false); } } @@ -5804,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { if (l2pg != NULL) pmap_abort_ptp(pmap, va, l2pg); + else { + KASSERT(ADDR_IS_KERNEL(va) && + (pmap_load(l2) & ATTR_DESCR_MASK) == + L2_TABLE, + ("pmap_enter_l2: invalid kernel L2E")); + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt != NULL, + ("pmap_enter_l2: missing kernel PTP")); + } if (uwptpg != NULL) { mt = pmap_remove_pt_page(pmap, va); KASSERT(mt == uwptpg, @@ -8045,6 +8081,8 @@ pmap_unmapbios(void *p, vm_size_t size) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pv_memattr == ma) + return; m->md.pv_memattr = ma; @@ -8424,8 +8462,8 @@ pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct spglist free; SLIST_INIT(&free); - (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, - lockp); + (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true, + &free, lockp); vm_page_free_pages_toq(&free, true); } diff --git a/sys/arm64/broadcom/genet/if_genet.c b/sys/arm64/broadcom/genet/if_genet.c index 0602f076b257..182b5582fb7c 100644 --- a/sys/arm64/broadcom/genet/if_genet.c +++ b/sys/arm64/broadcom/genet/if_genet.c @@ -349,7 +349,7 @@ gen_attach(device_t dev) } /* If address was not found, create one based on the hostid and name. */ - if (eaddr_found == 0) + if (!eaddr_found) ether_gen_addr(sc->ifp, &eaddr); /* Attach ethernet interface */ ether_ifattach(sc->ifp, eaddr.octet); @@ -653,7 +653,7 @@ gen_bus_dma_teardown(struct gen_softc *sc) error); } - if (sc->tx_buf_tag != NULL) { + if (sc->rx_buf_tag != NULL) { for (i = 0; i < RX_DESC_COUNT; i++) { error = bus_dmamap_destroy(sc->rx_buf_tag, sc->rx_ring_ent[i].map); diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index 0ce38384abbf..83d964360343 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -2019,6 +2019,7 @@ typedef struct vdev { vdev_list_t v_children; /* children of this vdev */ const char *v_name; /* vdev name */ uint64_t v_guid; /* vdev guid */ + uint64_t v_txg; /* most recent transaction */ uint64_t v_id; /* index in parent */ uint64_t v_psize; /* physical device capacity */ int v_ashift; /* offset to block shift */ @@ -2048,7 +2049,6 @@ typedef struct spa { STAILQ_ENTRY(spa) spa_link; /* link in global pool list */ char *spa_name; /* pool name */ uint64_t spa_guid; /* pool guid */ - uint64_t spa_txg; /* most recent transaction */ struct uberblock *spa_uberblock; /* best uberblock so far */ vdev_t *spa_root_vdev; /* toplevel vdev container */ objset_phys_t *spa_mos; /* MOS for this pool */ diff --git a/sys/compat/linuxkpi/common/include/linux/slab.h b/sys/compat/linuxkpi/common/include/linux/slab.h index f3a840d9bf4b..efa5c8cb67b3 100644 --- a/sys/compat/linuxkpi/common/include/linux/slab.h +++ b/sys/compat/linuxkpi/common/include/linux/slab.h @@ -45,7 +45,7 @@ MALLOC_DECLARE(M_KMALLOC); -#define kvzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO) +#define kvzalloc(size, flags) kvmalloc(size, (flags) | __GFP_ZERO) #define kvcalloc(n, size, flags) kvmalloc_array(n, size, (flags) | __GFP_ZERO) #define kzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO) #define kzalloc_node(size, flags, node) kmalloc_node(size, (flags) | __GFP_ZERO, node) diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c index ebb92eacbf9a..628af17df853 100644 --- a/sys/compat/linuxkpi/common/src/linux_page.c +++ b/sys/compat/linuxkpi/common/src/linux_page.c @@ -106,6 +106,7 @@ linux_alloc_pages(gfp_t flags, unsigned int order) if ((flags & M_ZERO) != 0) req |= VM_ALLOC_ZERO; + if (order == 0 && (flags & GFP_DMA32) == 0) { page = vm_page_alloc_noobj(req); if (page == NULL) @@ -113,6 +114,10 @@ linux_alloc_pages(gfp_t flags, unsigned int order) } else { vm_paddr_t pmax = (flags & GFP_DMA32) ? BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR; + + if ((flags & __GFP_NORETRY) != 0) + req |= VM_ALLOC_NORECLAIM; + retry: page = vm_page_alloc_noobj_contig(req, npages, 0, pmax, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); diff --git a/sys/conf/files b/sys/conf/files index 74d251c2b608..dd0d390962f2 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3227,6 +3227,19 @@ dev/uart/uart_if.m optional uart dev/uart/uart_subr.c optional uart dev/uart/uart_tty.c optional uart # +# Universal Flash Storage Host Controller Interface drivers +# +dev/ufshci/ufshci.c optional ufshci +dev/ufshci/ufshci_ctrlr.c optional ufshci +dev/ufshci/ufshci_ctrlr_cmd.c optional ufshci +dev/ufshci/ufshci_dev.c optional ufshci +dev/ufshci/ufshci_pci.c optional ufshci +dev/ufshci/ufshci_req_queue.c optional ufshci +dev/ufshci/ufshci_req_sdb.c optional ufshci +dev/ufshci/ufshci_sim.c optional ufshci +dev/ufshci/ufshci_sysctl.c optional ufshci +dev/ufshci/ufshci_uic_cmd.c optional ufshci +# # USB controller drivers # dev/usb/controller/musb_otg.c optional musb diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 0584fc29d039..678d288c2d86 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m optional ice pci \ compile-with "${NORMAL_M} -I$S/dev/ice" dev/ice/ice_ddp_common.c optional ice pci \ compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_iov.c optional ice pci pci_iov \ + compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_vf_mbx.c optional ice pci pci_iov \ + compile-with "${NORMAL_C} -I$S/dev/ice" ice_ddp.c optional ice_ddp \ compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \ no-ctfconvert no-implicit-rule before-depend local \ diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c index ab7f13177969..764bcb7e6ee8 100644 --- a/sys/dev/gpio/gpiobus.c +++ b/sys/dev/gpio/gpiobus.c @@ -110,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags, res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1, alloc_flags); if (res == NULL) { - intr_free_intr_map_data((struct intr_map_data *)gpio_data); + intr_unmap_irq(irq); return (NULL); } - rman_set_virtual(res, gpio_data); return (res); } #else @@ -866,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid, end, count, flags)); } +static int +gpiobus_release_resource(device_t dev, device_t child, struct resource *r) +{ + int err; +#ifdef INTRNG + u_int irq; + + irq = rman_get_start(r); + MPASS(irq == rman_get_end(r)); +#endif + err = bus_generic_rman_release_resource(dev, child, r); + if (err != 0) + return (err); +#ifdef INTRNG + intr_unmap_irq(irq); +#endif + return (0); +} + static struct resource_list * gpiobus_get_resource_list(device_t bus __unused, device_t child) { @@ -1060,7 +1078,7 @@ static device_method_t gpiobus_methods[] = { DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_alloc_resource, gpiobus_alloc_resource), - DEVMETHOD(bus_release_resource, bus_generic_rman_release_resource), + DEVMETHOD(bus_release_resource, gpiobus_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_rman_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_rman_deactivate_resource), DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list), diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h index 821abe4806ca..5b23757b1c98 100644 --- a/sys/dev/ice/ice_features.h +++ b/sys/dev/ice/ice_features.h @@ -91,7 +91,9 @@ enum feat_list { static inline void ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap) { +#ifndef PCI_IOV ice_clear_bit(ICE_FEATURE_SRIOV, bitmap); +#endif #ifndef DEV_NETMAP ice_clear_bit(ICE_FEATURE_NETMAP, bitmap); #endif diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h index 3a5dc201189a..e1d5307a9516 100644 --- a/sys/dev/ice/ice_iflib.h +++ b/sys/dev/ice/ice_iflib.h @@ -139,6 +139,9 @@ struct ice_irq_vector { * @tc: traffic class queue belongs to * @q_handle: qidx in tc; used in TXQ enable functions * + * ice_iov.c requires the following parameters (when PCI_IOV is defined): + * @itr_idx: ITR index to use for this queue + * * Other parameters may be iflib driver specific */ struct ice_tx_queue { @@ -153,6 +156,9 @@ struct ice_tx_queue { u32 me; u16 q_handle; u8 tc; +#ifdef PCI_IOV + u8 itr_idx; +#endif /* descriptor writeback status */ qidx_t *tx_rsq; @@ -175,6 +181,9 @@ struct ice_tx_queue { * @stats: queue statistics * @tc: traffic class queue belongs to * + * ice_iov.c requires the following parameters (when PCI_IOV is defined): + * @itr_idx: ITR index to use for this queue + * * Other parameters may be iflib driver specific */ struct ice_rx_queue { @@ -187,6 +196,9 @@ struct ice_rx_queue { struct ice_irq_vector *irqv; u32 me; u8 tc; +#ifdef PCI_IOV + u8 itr_idx; +#endif struct if_irq que_irq; }; @@ -332,6 +344,10 @@ struct ice_softc { ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT); ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT); +#ifdef PCI_IOV + struct ice_vf *vfs; + u16 num_vfs; +#endif struct ice_resmgr os_imgr; /* For mirror interface */ struct ice_mirr_if *mirr_if; diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c new file mode 100644 index 000000000000..c5a3e1060e44 --- /dev/null +++ b/sys/dev/ice/ice_iov.c @@ -0,0 +1,1856 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file ice_iov.c + * @brief Virtualization support functions + * + * Contains functions for enabling and managing PCIe virtual function devices, + * including enabling new VFs, and managing VFs over the virtchnl interface. + */ + +#include "ice_iov.h" + +static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num); +static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf); +static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, + bool trigger_vflr); +static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf); + +static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static bool ice_vc_isvalid_ring_len(u16 ring_len); +static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf); +static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats, + struct virtchnl_eth_stats *vstats); +static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err); +static int ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr); + +/** + * ice_iov_attach - Initialize SR-IOV PF host support + * @sc: device softc structure + * + * Initialize SR-IOV PF host support at the end of the driver attach process. + * + * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK) + * + * @returns 0 if successful, or + * - ENOMEM if there is no memory for the PF/VF schemas or iov device + * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV + * version as the kernel + * - ENOENT if the device doesn't have the SR-IOV capability + */ +int +ice_iov_attach(struct ice_softc *sc) +{ + device_t dev = sc->dev; + nvlist_t *pf_schema, *vf_schema; + int error; + + pf_schema = pci_iov_schema_alloc_node(); + vf_schema = pci_iov_schema_alloc_node(); + + pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL); + pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof", + IOV_SCHEMA_HASDEFAULT, TRUE); + pci_iov_schema_add_bool(vf_schema, "allow-set-mac", + IOV_SCHEMA_HASDEFAULT, FALSE); + pci_iov_schema_add_bool(vf_schema, "allow-promisc", + IOV_SCHEMA_HASDEFAULT, FALSE); + pci_iov_schema_add_uint16(vf_schema, "num-queues", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES); + pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi", + IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI); + pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT); + pci_iov_schema_add_uint16(vf_schema, "max-mac-filters", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT); + + error = pci_iov_attach(dev, pf_schema, vf_schema); + if (error != 0) { + device_printf(dev, + "pci_iov_attach failed (error=%s)\n", + ice_err_str(error)); + ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); + } else + ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en); + + return (error); +} + +/** + * ice_iov_detach - Teardown SR-IOV PF host support + * @sc: device softc structure + * + * Teardown SR-IOV PF host support at the start of the driver detach process. + * + * @returns 0 if successful or IOV support hasn't been setup, or + * - EBUSY if VFs still exist + */ +int +ice_iov_detach(struct ice_softc *sc) +{ + device_t dev = sc->dev; + int error; + + error = pci_iov_detach(dev); + if (error != 0) { + device_printf(dev, + "pci_iov_detach failed (error=%s)\n", + ice_err_str(error)); + } + + return (error); +} + +/** + * ice_iov_init - Called by the OS before the first VF is created. + * @sc: device softc structure + * @num_vfs: number of VFs to setup resources for + * @params: configuration parameters for the PF + * + * @returns 0 if successful or an error code on failure + */ +int +ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params __unused) +{ + /* Allocate array of VFs, for tracking */ + sc->vfs = (struct ice_vf *)malloc(sizeof(struct ice_vf) * num_vfs, M_ICE, M_NOWAIT | + M_ZERO); + if (sc->vfs == NULL) + return (ENOMEM); + + /* Initialize each VF with basic information */ + for (int i = 0; i < num_vfs; i++) + sc->vfs[i].vf_num = i; + + /* Save off number of configured VFs */ + sc->num_vfs = num_vfs; + + return (0); +} + +/** + * ice_iov_get_vf - Get pointer to VF at given index + * @sc: device softc structure + * @vf_num: Index of VF to retrieve + * + * @remark will throw an assertion if vf_num is not in the + * range of allocated VFs + * + * @returns a pointer to the VF structure at the given index + */ +static struct ice_vf * +ice_iov_get_vf(struct ice_softc *sc, int vf_num) +{ + MPASS(vf_num < sc->num_vfs); + + return &sc->vfs[vf_num]; +} + +/** + * ice_iov_add_vf - Called by the OS for each VF to create + * @sc: device softc structure + * @vfnum: index of VF to configure + * @params: configuration parameters for the VF + * + * @returns 0 if successful or an error code on failure + */ +int +ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params) +{ + struct ice_tx_queue *txq; + struct ice_rx_queue *rxq; + device_t dev = sc->dev; + struct ice_vsi *vsi; + struct ice_vf *vf; + int vf_num_queues; + const void *mac; + size_t size; + int error; + int i; + + vf = ice_iov_get_vf(sc, vfnum); + vf->vf_flags = VF_FLAG_ENABLED; + + /* This VF needs at least one VSI */ + vsi = ice_alloc_vsi(sc, ICE_VSI_VF); + if (vsi == NULL) + return (ENOMEM); + vf->vsi = vsi; + vsi->vf_num = vfnum; + + vf_num_queues = nvlist_get_number(params, "num-queues"); + /* Validate and clamp value if invalid */ + if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES) + device_printf(dev, "Invalid num-queues (%d) for VF %d\n", + vf_num_queues, vf->vf_num); + if (vf_num_queues < 1) { + device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num); + vf_num_queues = 1; + } else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) { + device_printf(dev, "Setting VF %d num-queues to %d\n", + vf->vf_num, ICE_MAX_SCATTERED_QUEUES); + vf_num_queues = ICE_MAX_SCATTERED_QUEUES; + } + vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED; + + /* Reserve VF queue allocation from PF queues */ + ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues); + vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues; + + /* Assign Tx queues from PF space */ + error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap, + vsi->num_tx_queues); + if (error) { + device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n", + ice_err_str(error)); + goto release_vsi; + } + + /* Assign Rx queues from PF space */ + error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap, + vsi->num_rx_queues); + if (error) { + device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n", + ice_err_str(error)); + goto release_vsi; + } + + vsi->max_frame_size = ICE_MAX_FRAME_SIZE; + + /* Allocate queue structure memory */ + vsi->tx_queues = (struct ice_tx_queue *) + malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE, + M_NOWAIT | M_ZERO); + if (!vsi->tx_queues) { + device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n", + vfnum); + error = ENOMEM; + goto release_vsi; + } + for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) { + txq->me = i; + txq->vsi = vsi; + } + + /* Allocate queue structure memory */ + vsi->rx_queues = (struct ice_rx_queue *) + malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE, + M_NOWAIT | M_ZERO); + if (!vsi->rx_queues) { + device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n", + vfnum); + error = ENOMEM; + goto free_txqs; + } + for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) { + rxq->me = i; + rxq->vsi = vsi; + } + + /* Allocate space to store the IRQ vector data */ + vf->num_irq_vectors = vf_num_queues + 1; + vf->tx_irqvs = (struct ice_irq_vector *) + malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors), + M_ICE, M_NOWAIT); + if (!vf->tx_irqvs) { + device_printf(sc->dev, + "Unable to allocate TX irqv memory for VF-%d's %d vectors\n", + vfnum, vf->num_irq_vectors); + error = ENOMEM; + goto free_rxqs; + } + vf->rx_irqvs = (struct ice_irq_vector *) + malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors), + M_ICE, M_NOWAIT); + if (!vf->rx_irqvs) { + device_printf(sc->dev, + "Unable to allocate RX irqv memory for VF-%d's %d vectors\n", + vfnum, vf->num_irq_vectors); + error = ENOMEM; + goto free_txirqvs; + } + + /* Assign VF interrupts from PF space */ + if (!(vf->vf_imap = + (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors, + M_ICE, M_NOWAIT))) { + device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum); + error = ENOMEM; + goto free_rxirqvs; + } + error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors); + if (error) { + device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n", + vfnum, ice_err_str(error)); + goto free_imap; + } + + if (nvlist_exists_binary(params, "mac-addr")) { + mac = nvlist_get_binary(params, "mac-addr", &size); + memcpy(vf->mac, mac, ETHER_ADDR_LEN); + + if (nvlist_get_bool(params, "allow-set-mac")) + vf->vf_flags |= VF_FLAG_SET_MAC_CAP; + } else + /* + * If the administrator has not specified a MAC address then + * we must allow the VF to choose one. + */ + vf->vf_flags |= VF_FLAG_SET_MAC_CAP; + + if (nvlist_get_bool(params, "mac-anti-spoof")) + vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF; + + if (nvlist_get_bool(params, "allow-promisc")) + vf->vf_flags |= VF_FLAG_PROMISC_CAP; + + vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi"); + + vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed"); + vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters"); + + vf->vf_flags |= VF_FLAG_VLAN_CAP; + + /* Create and setup VSI in HW */ + error = ice_initialize_vsi(vsi); + if (error) { + device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n", + vfnum, ice_err_str(error)); + goto release_imap; + } + + /* Add the broadcast address */ + error = ice_add_vsi_mac_filter(vsi, broadcastaddr); + if (error) { + device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n", + vfnum, ice_err_str(error)); + goto release_imap; + } + + ice_iov_ready_vf(sc, vf); + + return (0); + +release_imap: + ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap, + vf->num_irq_vectors); +free_imap: + free(vf->vf_imap, M_ICE); + vf->vf_imap = NULL; +free_rxirqvs: + free(vf->rx_irqvs, M_ICE); + vf->rx_irqvs = NULL; +free_txirqvs: + free(vf->tx_irqvs, M_ICE); + vf->tx_irqvs = NULL; +free_rxqs: + free(vsi->rx_queues, M_ICE); + vsi->rx_queues = NULL; +free_txqs: + free(vsi->tx_queues, M_ICE); + vsi->tx_queues = NULL; +release_vsi: + ice_release_vsi(vsi); + vf->vsi = NULL; + return (error); +} + +/** + * ice_iov_uninit - Called by the OS when VFs are destroyed + * @sc: device softc structure + */ +void +ice_iov_uninit(struct ice_softc *sc) +{ + struct ice_vf *vf; + struct ice_vsi *vsi; + + /* Release per-VF resources */ + for (int i = 0; i < sc->num_vfs; i++) { + vf = &sc->vfs[i]; + vsi = vf->vsi; + + /* Free VF interrupt reservation */ + if (vf->vf_imap) { + free(vf->vf_imap, M_ICE); + vf->vf_imap = NULL; + } + + /* Free queue interrupt mapping trackers */ + if (vf->tx_irqvs) { + free(vf->tx_irqvs, M_ICE); + vf->tx_irqvs = NULL; + } + if (vf->rx_irqvs) { + free(vf->rx_irqvs, M_ICE); + vf->rx_irqvs = NULL; + } + + if (!vsi) + continue; + + /* Free VSI queues */ + if (vsi->tx_queues) { + free(vsi->tx_queues, M_ICE); + vsi->tx_queues = NULL; + } + if (vsi->rx_queues) { + free(vsi->rx_queues, M_ICE); + vsi->rx_queues = NULL; + } + + ice_release_vsi(vsi); + vf->vsi = NULL; + } + + /* Release memory used for VF tracking */ + if (sc->vfs) { + free(sc->vfs, M_ICE); + sc->vfs = NULL; + } + sc->num_vfs = 0; +} + +/** + * ice_iov_handle_vflr - Process VFLR event + * @sc: device softc structure + * + * Identifys which VFs have been reset and re-configure + * them. + */ +void +ice_iov_handle_vflr(struct ice_softc *sc) +{ + struct ice_hw *hw = &sc->hw; + struct ice_vf *vf; + u32 reg, reg_idx, bit_idx; + + for (int i = 0; i < sc->num_vfs; i++) { + vf = &sc->vfs[i]; + + reg_idx = (hw->func_caps.vf_base_id + vf->vf_num) / 32; + bit_idx = (hw->func_caps.vf_base_id + vf->vf_num) % 32; + reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx)); + if (reg & BIT(bit_idx)) + ice_reset_vf(sc, vf, false); + } +} + +/** + * ice_iov_ready_vf - Setup VF interrupts and mark it as ready + * @sc: device softc structure + * @vf: driver's VF structure for the VF to update + * + * Clears VF reset triggering bit, sets up the PF<->VF interrupt + * mapping and marks the VF as active in the HW so that the VF + * driver can use it. + */ +static void +ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf) +{ + struct ice_hw *hw = &sc->hw; + u32 reg; + + /* Clear the triggering bit */ + reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num)); + reg &= ~VPGEN_VFRTRIG_VFSWR_M; + wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg); + + /* Setup VF interrupt allocation and mapping */ + ice_iov_setup_intr_mapping(sc, vf); + + /* Indicate to the VF that reset is done */ + wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE); + + ice_flush(hw); +} + +/** + * ice_reset_vf - Perform a hardware reset (VFR) on a VF + * @sc: device softc structure + * @vf: driver's VF structure for VF to be reset + * @trigger_vflr: trigger a reset or only handle already executed reset + * + * Performs a VFR for the given VF. This function busy waits until the + * reset completes in the HW, notifies the VF that the reset is done + * by setting a bit in a HW register, then returns. + * + * @remark This also sets up the PF<->VF interrupt mapping and allocations in + * the hardware after the hardware reset is finished, via + * ice_iov_setup_intr_mapping() + */ +static void +ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr) +{ + u16 global_vf_num, reg_idx, bit_idx; + struct ice_hw *hw = &sc->hw; + int status; + u32 reg; + int i; + + global_vf_num = vf->vf_num + hw->func_caps.vf_base_id; + + if (trigger_vflr) { + reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num)); + reg |= VPGEN_VFRTRIG_VFSWR_M; + wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg); + } + + /* clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */ + reg_idx = (global_vf_num) / 32; + bit_idx = (global_vf_num) % 32; + wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); + ice_flush(hw); + + /* Wait until there are no pending PCI transactions */ + wr32(hw, PF_PCI_CIAA, + ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S)); + + for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) { + reg = rd32(hw, PF_PCI_CIAD); + if (!(reg & PCIEM_STA_TRANSACTION_PND)) + break; + + DELAY(ICE_PCI_CIAD_WAIT_DELAY_US); + } + if (i == ICE_PCI_CIAD_WAIT_COUNT) + device_printf(sc->dev, + "VF-%d PCI transactions stuck\n", vf->vf_num); + + /* Disable TX queues, which is required during VF reset */ + status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL, + NULL, ICE_VF_RESET, vf->vf_num, NULL); + if (status) + device_printf(sc->dev, + "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n", + __func__, ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + + /* Then check for the VF reset to finish in HW */ + for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) { + reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num)); + if ((reg & VPGEN_VFRSTAT_VFRD_M)) + break; + + DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US); + } + if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT) + device_printf(sc->dev, + "VF-%d Reset is stuck\n", vf->vf_num); + + ice_iov_ready_vf(sc, vf); +} + +/** + * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a message from the VF listing its supported capabilities, and + * replies to the VF with information about what resources the PF has + * allocated for the VF. + * + * @remark This always replies to the VF with a success status; it does not + * fail. It's up to the VF driver to reject or complain about the PF's response. + */ +static void +ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vf_resource *vf_res; + struct virtchnl_vsi_resource *vsi_res; + u16 vf_res_len; + u32 vf_caps; + + /* XXX: Only support one VSI per VF, so this size doesn't need adjusting */ + vf_res_len = sizeof(struct virtchnl_vf_resource); + vf_res = (struct virtchnl_vf_resource *)malloc(vf_res_len, M_ICE, + M_WAITOK | M_ZERO); + + vf_res->num_vsis = 1; + vf_res->num_queue_pairs = vf->vsi->num_tx_queues; + vf_res->max_vectors = vf_res->num_queue_pairs + 1; + + vf_res->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE; + vf_res->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vf_res->max_mtu = 0; + + vf_res->vf_cap_flags = VF_BASE_MODE_OFFLOADS; + if (msg_buf != NULL) { + vf_caps = *((u32 *)(msg_buf)); + + if (vf_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) + vf_res->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + + if (vf_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) + vf_res->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; + } + + vsi_res = &vf_res->vsi_res[0]; + vsi_res->vsi_id = vf->vsi->idx; + vsi_res->num_queue_pairs = vf->vsi->num_tx_queues; + vsi_res->vsi_type = VIRTCHNL_VSI_SRIOV; + vsi_res->qset_handle = 0; + if (!ETHER_IS_ZERO(vf->mac)) + memcpy(vsi_res->default_mac_addr, vf->mac, ETHER_ADDR_LEN); + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES, + VIRTCHNL_STATUS_SUCCESS, (u8 *)vf_res, vf_res_len, NULL); + + free(vf_res, M_ICE); +} + +/** + * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a version message from the VF, and responds to the VF with + * the version number that the PF will use. + * + * @remark This always replies to the VF with a success status; it does not + * fail. + */ +static void +ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct virtchnl_version_info *recv_vf_version; + struct ice_hw *hw = &sc->hw; + device_t dev = sc->dev; + + recv_vf_version = (struct virtchnl_version_info *)msg_buf; + + /* VFs running the 1.0 API expect to get 1.0 back */ + if (VF_IS_V10(recv_vf_version)) { + vf->version.major = 1; + vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS; + } else { + vf->version.major = VIRTCHNL_VERSION_MAJOR; + vf->version.minor = VIRTCHNL_VERSION_MINOR; + + if ((recv_vf_version->major != VIRTCHNL_VERSION_MAJOR) || + (recv_vf_version->minor != VIRTCHNL_VERSION_MINOR)) + device_printf(dev, + "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n", + __func__, vf->vf_num, + recv_vf_version->major, recv_vf_version->minor, + VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR); + } + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_VERSION, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version), + NULL); +} + +/** + * ice_vf_validate_mac - Validate MAC address before adding it + * @vf: VF tracking structure + * @addr: MAC address to validate + * + * Validate a MAC address before adding it to a VF during the handling + * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if + * the VF is allowed to set its own arbitrary MAC addresses. + * + * Returns 0 if MAC address is valid for the given vf + */ +static int +ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr) +{ + + if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr)) + return (EINVAL); + + /* + * If the VF is not allowed to change its MAC address, don't let it + * set a MAC filter for an address that is not a multicast address and + * is not its assigned MAC. + */ + if (!(vf->vf_flags & VF_FLAG_SET_MAC_CAP) && + !(ETHER_IS_MULTICAST(addr) || !bcmp(addr, vf->mac, ETHER_ADDR_LEN))) + return (EPERM); + + return (0); +} + +/** + * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a list of MAC addresses from the VF and adds those addresses + * to the VSI's filter list. + */ +static void +ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *addr_list; + struct ice_hw *hw = &sc->hw; + u16 added_addr_cnt = 0; + int error = 0; + + addr_list = (struct virtchnl_ether_addr_list *)msg_buf; + + if (addr_list->num_elements > + (vf->mac_filter_limit - vf->mac_filter_cnt)) { + v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto done; + } + + for (int i = 0; i < addr_list->num_elements; i++) { + u8 *addr = addr_list->list[i].addr; + + /* The type flag is currently ignored; every MAC address is + * treated as the LEGACY type + */ + + error = ice_vf_validate_mac(vf, addr); + if (error == EPERM) { + device_printf(sc->dev, + "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } else if (error) { + device_printf(sc->dev, + "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + + error = ice_add_vsi_mac_filter(vf->vsi, addr); + if (error) { + device_printf(sc->dev, + "%s: VF-%d: Error adding MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + /* Don't count VF's MAC against its MAC filter limit */ + if (memcmp(addr, vf->mac, ETHER_ADDR_LEN)) + added_addr_cnt++; + } + + vf->mac_filter_cnt += added_addr_cnt; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a list of MAC addresses from the VF and removes those addresses + * from the VSI's filter list. + */ +static void +ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *addr_list; + struct ice_hw *hw = &sc->hw; + u16 deleted_addr_cnt = 0; + int error = 0; + + addr_list = (struct virtchnl_ether_addr_list *)msg_buf; + + for (int i = 0; i < addr_list->num_elements; i++) { + error = ice_remove_vsi_mac_filter(vf->vsi, addr_list->list[i].addr); + if (error) { + device_printf(sc->dev, + "%s: VF-%d: Error removing MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + /* Don't count VF's MAC against its MAC filter limit */ + if (memcmp(addr_list->list[i].addr, vf->mac, ETHER_ADDR_LEN)) + deleted_addr_cnt++; + } + + if (deleted_addr_cnt >= vf->mac_filter_cnt) + vf->mac_filter_cnt = 0; + else + vf->mac_filter_cnt -= deleted_addr_cnt; + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the VLANs in msg_buf to the VF's VLAN filter list. + */ +static void +ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vlan_filter_list *vlan_list; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf; + + if (vlan_list->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vlan_list->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (vlan_list->num_elements > (vf->vlan_limit - vf->vlan_cnt)) { + v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto done; + } + + status = ice_add_vlan_hw_filters(vsi, vlan_list->vlan_id, + vlan_list->num_elements); + if (status) { + device_printf(sc->dev, + "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, ice_status_str(status), + ice_aq_str(sc->hw.adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + + vf->vlan_cnt += vlan_list->num_elements; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Removes the VLANs in msg_buf from the VF's VLAN filter list. + */ +static void +ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vlan_filter_list *vlan_list; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf; + + if (vlan_list->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vlan_list->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + status = ice_remove_vlan_hw_filters(vsi, vlan_list->vlan_id, + vlan_list->num_elements); + if (status) { + device_printf(sc->dev, + "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, ice_status_str(status), + ice_aq_str(sc->hw.adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + + if (vlan_list->num_elements >= vf->vlan_cnt) + vf->vlan_cnt = 0; + else + vf->vlan_cnt -= vlan_list->num_elements; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_validate_ring_len - Check to see if a descriptor ring length is valid + * @ring_len: length of ring + * + * Check whether a ring size value is valid. + * + * @returns true if given ring size is valid + */ +static bool +ice_vc_isvalid_ring_len(u16 ring_len) +{ + return (ring_len >= ICE_MIN_DESC_COUNT && + ring_len <= ICE_MAX_DESC_COUNT && + !(ring_len % ICE_DESC_COUNT_INCR)); +} + +/** + * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + */ +static void +ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + device_t dev = sc->dev; + struct ice_hw *hw = &sc->hw; + struct virtchnl_vsi_queue_config_info *vqci; + struct virtchnl_queue_pair_info *vqpi; + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + struct ice_tx_queue *txq; + struct ice_rx_queue *rxq; + int i, error = 0; + + vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf; + + if (vqci->num_queue_pairs > vf->vsi->num_tx_queues && + vqci->num_queue_pairs > vf->vsi->num_rx_queues) { + status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + ice_vsi_disable_tx(vf->vsi); + ice_control_all_rx_queues(vf->vsi, false); + + /* + * Clear TX and RX queues config in case VF + * requests different number of queues. + */ + for (i = 0; i < vsi->num_tx_queues; i++) { + txq = &vsi->tx_queues[i]; + + txq->desc_count = 0; + txq->tx_paddr = 0; + txq->tc = 0; + } + + for (i = 0; i < vsi->num_rx_queues; i++) { + rxq = &vsi->rx_queues[i]; + + rxq->desc_count = 0; + rxq->rx_paddr = 0; + } + + vqpi = vqci->qpair; + for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) { + /* Initial parameter validation */ + if (vqpi->txq.vsi_id != vf->vsi->idx || + vqpi->rxq.vsi_id != vf->vsi->idx || + vqpi->txq.queue_id != vqpi->rxq.queue_id || + vqpi->txq.headwb_enabled || + vqpi->rxq.splithdr_enabled || + vqpi->rxq.crc_disable || + !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) || + !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) { + status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Copy parameters into VF's queue/VSI structs */ + txq = &vsi->tx_queues[vqpi->txq.queue_id]; + + txq->desc_count = vqpi->txq.ring_len; + txq->tx_paddr = vqpi->txq.dma_ring_addr; + txq->q_handle = vqpi->txq.queue_id; + txq->tc = 0; + + rxq = &vsi->rx_queues[vqpi->rxq.queue_id]; + + rxq->desc_count = vqpi->rxq.ring_len; + rxq->rx_paddr = vqpi->rxq.dma_ring_addr; + vsi->mbuf_sz = vqpi->rxq.databuffer_size; + } + + /* Configure TX queues in HW */ + error = ice_cfg_vsi_for_tx(vsi); + if (error) { + device_printf(dev, + "VF-%d: Unable to configure VSI for Tx: %s\n", + vf->vf_num, ice_err_str(error)); + status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto done; + } + + /* Configure RX queues in HW */ + error = ice_cfg_vsi_for_rx(vsi); + if (error) { + device_printf(dev, + "VF-%d: Unable to configure VSI for Rx: %s\n", + vf->vf_num, ice_err_str(error)); + status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + ice_vsi_disable_tx(vsi); + goto done; + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + status, NULL, 0, NULL); +} + +/** + * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Sets the RSS key for the given VF, using the contents of msg_buf. + */ +static void +ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_aqc_get_set_rss_keys keydata = + { .standard_rss_key = {0}, .extended_hash_key = {0} }; + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_key *vrk; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vrk = (struct virtchnl_rss_key *)msg_buf; + + if (vrk->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vrk->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if ((vrk->key_len > + (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + + ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) || + vrk->key_len == 0) { + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + memcpy(&keydata, vrk->key, vrk->key_len); + + status = ice_aq_set_rss_key(hw, vsi->idx, &keydata); + if (status) { + device_printf(sc->dev, + "ice_aq_set_rss_key status %s, error %s\n", + ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the LUT from the VF in msg_buf to the PF via an admin queue call. + */ +static void +ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_lut *vrl; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aq_get_set_rss_lut_params lut_params = {}; + struct ice_vsi *vsi = vf->vsi; + + vrl = (struct virtchnl_rss_lut *)msg_buf; + + if (vrl->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vrl->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (vrl->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) { + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + lut_params.vsi_handle = vsi->idx; + lut_params.lut_size = vsi->rss_table_size; + lut_params.lut_type = vsi->rss_lut_type; + lut_params.lut = vrl->lut; + lut_params.global_lut_id = 0; + + status = ice_aq_set_rss_lut(hw, &lut_params); + if (status) { + device_printf(sc->dev, + "VF-%d: Cannot set RSS lut, err %s aq_err %s\n", + vf->vf_num, ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow + * type list. + */ +static void +ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_hena *vrh; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + MPASS(vsi != NULL); + + vrh = (struct virtchnl_rss_hena *)msg_buf; + + /* + * Remove existing configuration to make sure only requested + * config is applied and allow VFs to disable RSS completly. + */ + status = ice_rem_vsi_rss_cfg(hw, vsi->idx); + if (vrh->hena) { + /* + * Problem with removing config is not fatal, when new one + * is requested. Warn about it but try to apply new config + * anyway. + */ + if (status) + device_printf(sc->dev, + "ice_rem_vsi_rss_cfg status %s, error %s\n", + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + status = ice_add_avf_rss_cfg(hw, vsi->idx, vrh->hena); + if (status) + device_printf(sc->dev, + "ice_add_avf_rss_cfg status %s, error %s\n", + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } + v_status = ice_iov_err_to_virt_err(status); + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Enables VF queues selected in msg_buf for Tx/Rx traffic. + * + * @remark Only actually operates on Rx queues; Tx queues are enabled in + * CONFIG_VSI_QUEUES message handler. + */ +static void +ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_queue_select *vqs; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + int bit, error = 0; + + vqs = (struct virtchnl_queue_select *)msg_buf; + + if (vqs->vsi_id != vsi->idx) { + device_printf(sc->dev, + "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + __func__, vf->vf_num, vsi->idx, vqs->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (!vqs->rx_queues && !vqs->tx_queues) { + device_printf(sc->dev, + "%s: VF-%d: message queue masks are empty\n", + __func__, vf->vf_num); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Validate rx_queue mask */ + bit = fls(vqs->rx_queues); + if (bit > vsi->num_rx_queues) { + device_printf(sc->dev, + "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n", + __func__, vf->vf_num, vqs->rx_queues, bit); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Tx ring enable is handled in an earlier message. */ + for_each_set_bit(bit, &vqs->rx_queues, 32) { + error = ice_control_rx_queue(vsi, bit, true); + if (error) { + device_printf(sc->dev, + "Unable to enable Rx ring %d for receive: %s\n", + bit, ice_err_str(error)); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Disables all VF queues for the VF's VSI. + * + * @remark Unlike the ENABLE_QUEUES handler, this operates on both + * Tx and Rx queues + */ +static void +ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf __unused) +{ + struct ice_hw *hw = &sc->hw; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + int error = 0; + + error = ice_control_all_rx_queues(vsi, false); + if (error) { + device_printf(sc->dev, + "Unable to disable Rx rings for transmit: %s\n", + ice_err_str(error)); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + error = ice_vsi_disable_tx(vsi); + if (error) { + /* Already prints an error message */ + v_status = VIRTCHNL_STATUS_ERR_PARAM; + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CFG_IRQ_MAP msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Configures the interrupt vectors described in the message in msg_buf. The + * VF needs to send this message during init, so that queues can be allowed + * to generate interrupts. + */ +static void +ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ +#define ICE_VIRTCHNL_QUEUE_MAP_SIZE 16 + struct ice_hw *hw = &sc->hw; + struct virtchnl_irq_map_info *vimi; + struct virtchnl_vector_map *vvm; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + u16 vector; + + vimi = (struct virtchnl_irq_map_info *)msg_buf; + + if (vimi->num_vectors > vf->num_irq_vectors) { + device_printf(sc->dev, + "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n", + __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + vvm = vimi->vecmap; + /* Save off information from message */ + for (int i = 0; i < vimi->num_vectors; i++, vvm++) { + struct ice_tx_queue *txq; + struct ice_rx_queue *rxq; + int bit; + + if (vvm->vsi_id != vf->vsi->idx) { + device_printf(sc->dev, + "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n", + __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* vvm->vector_id is relative to VF space */ + vector = vvm->vector_id; + + if (vector >= vf->num_irq_vectors) { + device_printf(sc->dev, + "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n", + __func__, vf->vf_num, vector, vf->num_irq_vectors - 1); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* The Misc/Admin Queue vector doesn't need mapping */ + if (vector == 0) + continue; + + /* coverity[address_of] */ + for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) { + if (bit >= vsi->num_tx_queues) { + device_printf(sc->dev, + "%s: VF-%d: txq map has invalid bit set\n", + __func__, vf->vf_num); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + vf->tx_irqvs[vector].me = vector; + + txq = &vsi->tx_queues[bit]; + txq->irqv = &vf->tx_irqvs[vector]; + txq->itr_idx = vvm->txitr_idx; + } + /* coverity[address_of] */ + for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) { + if (bit >= vsi->num_rx_queues) { + device_printf(sc->dev, + "%s: VF-%d: rxq map has invalid bit set\n", + __func__, vf->vf_num); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + vf->rx_irqvs[vector].me = vector; + + rxq = &vsi->rx_queues[bit]; + rxq->irqv = &vf->rx_irqvs[vector]; + rxq->itr_idx = vvm->rxitr_idx; + } + } + + /* Write to T/RQCTL registers to actually map vectors to queues */ + for (int i = 0; i < vf->vsi->num_rx_queues; i++) + if (vsi->rx_queues[i].irqv != NULL) + ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i], + vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx); + + for (int i = 0; i < vf->vsi->num_tx_queues; i++) + if (vsi->tx_queues[i].irqv != NULL) + ice_configure_txq_interrupt(hw, vsi->tx_qmap[i], + vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx); + + ice_flush(hw); + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP, + v_status, NULL, 0, NULL); +} + +/** + * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl + * @istats: VSI stats from HW to convert + * @vstats: stats struct to copy to + * + * This function copies all known stats in struct virtchnl_eth_stats from the + * input struct ice_eth_stats to an output struct virtchnl_eth_stats. + * + * @remark These two structure types currently have the same definition up to + * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change + * in the future. + */ +static void +ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats, + struct virtchnl_eth_stats *vstats) +{ + vstats->rx_bytes = istats->rx_bytes; + vstats->rx_unicast = istats->rx_unicast; + vstats->rx_multicast = istats->rx_multicast; + vstats->rx_broadcast = istats->rx_broadcast; + vstats->rx_discards = istats->rx_discards; + vstats->rx_unknown_protocol = istats->rx_unknown_protocol; + vstats->tx_bytes = istats->tx_bytes; + vstats->tx_unicast = istats->tx_unicast; + vstats->tx_multicast = istats->tx_multicast; + vstats->tx_broadcast = istats->tx_broadcast; + vstats->tx_discards = istats->tx_discards; + vstats->tx_errors = istats->tx_errors; +} + +/** + * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Updates the VF's VSI stats and sends those stats back to the VF. + */ +static void +ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct virtchnl_queue_select *vqs; + struct virtchnl_eth_stats stats; + struct ice_vsi *vsi = vf->vsi; + struct ice_hw *hw = &sc->hw; + + vqs = (struct virtchnl_queue_select *)msg_buf; + + if (vqs->vsi_id != vsi->idx) { + device_printf(sc->dev, + "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n", + __func__, vf->vf_num, vqs->vsi_id, vsi->idx); + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL); + } + + ice_update_vsi_hw_stats(vf->vsi); + ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats); + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats, + sizeof(struct virtchnl_eth_stats), NULL); +} + +/** + * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Configures the promiscuous modes for the given VSI in msg_buf. + */ +static void +ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_promisc_info *vpi; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + int status = 0; + struct ice_vsi *vsi = vf->vsi; + ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX); + ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX); + ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX); + ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX); + ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX); + u16 vid; + + vpi = (struct virtchnl_promisc_info *)msg_buf; + + /* Check to see if VF has permission to configure promiscuous mode */ + if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) { + device_printf(sc->dev, + "VF-%d: attempted to configure promiscuous mode\n", + vf->vf_num); + /* Don't reply to VF with an error */ + goto done; + } + + if (vpi->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vpi->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) { + device_printf(sc->dev, + "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n", + vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS, + vpi->flags); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + + } + + ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX); + /* Convert virtchnl flags to ice AQ promiscuous mode flags */ + if (vpi->flags & FLAG_VF_UNICAST_PROMISC) { + ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask); + ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask); + } + if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) { + ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask); + ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask); + } + + status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid); + if (status) { + device_printf(sc->dev, + "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + + /* Figure out what got added and what got removed */ + ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX); + ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX); + ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX); + ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX); + + if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) { + status = ice_clear_vsi_promisc(hw, vsi->idx, + clear_promisc_mask, 0); + if (status) { + device_printf(sc->dev, + "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + } + + if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) { + status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0); + if (status) { + device_printf(sc->dev, + "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state + * @sc: device private structure + * + * Sends a message to all VFs about the status of the PF's link + * state. For more details, @see ice_vc_notify_vf_link_state. + */ +void +ice_vc_notify_all_vfs_link_state(struct ice_softc *sc) +{ + for (int i = 0; i < sc->num_vfs; i++) + ice_vc_notify_vf_link_state(sc, &sc->vfs[i]); +} + +/** + * ice_vc_notify_vf_link_state - Notify VF of PF link state + * @sc: device private structure + * @vf: VF tracking structure + * + * Sends an event message to the specified VF with information about + * the current link state from the PF's port. This includes whether + * link is up or down, and the link speed in 100Mbps units. + */ +static void +ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf) +{ + struct virtchnl_pf_event event = {}; + struct ice_hw *hw = &sc->hw; + + event.event = VIRTCHNL_EVENT_LINK_CHANGE; + event.severity = PF_EVENT_SEVERITY_INFO; + event.event_data.link_event_adv.link_status = sc->link_up; + event.event_data.link_event_adv.link_speed = + (u32)ice_conv_link_speed_to_virtchnl(true, + hw->port_info->phy.link_info.link_speed); + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&event, sizeof(event), NULL); +} + +/** + * ice_vc_handle_vf_msg - Handle a message from a VF + * @sc: device private structure + * @event: event received from the HW MBX queue + * + * Called whenever an event is received from a VF on the HW mailbox queue. + * Responsible for handling these messages as well as responding to the + * VF afterwards, depending on the received message type. + */ +void +ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event) +{ + struct ice_hw *hw = &sc->hw; + device_t dev = sc->dev; + struct ice_vf *vf; + int err = 0; + + u32 v_opcode = event->desc.cookie_high; + u16 v_id = event->desc.retval; + u8 *msg = event->msg_buf; + u16 msglen = event->msg_len; + + if (v_id >= sc->num_vfs) { + device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n", + __func__, v_id, v_opcode, msglen); + return; + } + + vf = &sc->vfs[v_id]; + + /* Perform basic checks on the msg */ + err = virtchnl_vc_validate_vf_msg(&vf->version, v_opcode, msg, msglen); + if (err) { + device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n", + __func__, vf->vf_num, v_opcode, msglen, err); + ice_aq_send_msg_to_vf(hw, v_id, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL); + return; + } + + switch (v_opcode) { + case VIRTCHNL_OP_VERSION: + ice_vc_version_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_RESET_VF: + ice_reset_vf(sc, vf, true); + break; + case VIRTCHNL_OP_GET_VF_RESOURCES: + ice_vc_get_vf_res_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ADD_ETH_ADDR: + ice_vc_add_eth_addr_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_DEL_ETH_ADDR: + ice_vc_del_eth_addr_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ADD_VLAN: + ice_vc_add_vlan_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_DEL_VLAN: + ice_vc_del_vlan_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + ice_vc_cfg_vsi_qs_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + ice_vc_cfg_rss_key_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + ice_vc_cfg_rss_lut_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_SET_RSS_HENA: + ice_vc_set_rss_hena_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ENABLE_QUEUES: + ice_vc_enable_queues_msg(sc, vf, msg); + ice_vc_notify_vf_link_state(sc, vf); + break; + case VIRTCHNL_OP_DISABLE_QUEUES: + ice_vc_disable_queues_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + ice_vc_cfg_irq_map_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_GET_STATS: + ice_vc_get_stats_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + ice_vc_cfg_promisc_mode_msg(sc, vf, msg); + break; + default: + device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n", + __func__, vf->vf_num, v_opcode, msglen); + ice_aq_send_msg_to_vf(hw, v_id, v_opcode, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL); + break; + } +} + +/** + * ice_iov_setup_intr_mapping - Setup interrupt config for a VF + * @sc: device softc structure + * @vf: driver's VF structure for VF to be configured + * + * Before a VF can be used, and after a VF reset, the PF must configure + * the VF's interrupt allocation registers. This includes allocating + * interrupts from the PF's interrupt pool to the VF using the + * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors + * to VF vectors in GLINT_VECT2FUNC. + * + * As well, this sets up queue allocation registers and maps the mailbox + * interrupt for the VF. + */ +static void +ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf) +{ + struct ice_hw *hw = &sc->hw; + struct ice_vsi *vsi = vf->vsi; + u16 v; + + /* Calculate indices for register ops below */ + u16 vf_first_irq_idx = vf->vf_imap[0]; + u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1; + u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id + + vf_first_irq_idx; + u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1; + u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id; + + /* Map out VF interrupt allocation in global device space. Both + * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values. + */ + wr32(hw, VPINT_ALLOC(vf->vf_num), + (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) | + ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) | + VPINT_ALLOC_VALID_M)); + wr32(hw, VPINT_ALLOC_PCI(vf->vf_num), + (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) | + ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) | + VPINT_ALLOC_PCI_VALID_M)); + + /* Create inverse mapping of vectors to PF/VF combinations */ + for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++) + { + wr32(hw, GLINT_VECT2FUNC(v), + (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) | + ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M))); + } + + /* Map mailbox interrupt to MSI-X index 0. Disable ITR for it, too. */ + wr32(hw, VPINT_MBX_CTL(abs_vf_num), + ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) | + ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) | + VPINT_MBX_CTL_CAUSE_ENA_M); + + /* Mark the TX queue mapping registers as valid */ + wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M); + + /* Indicate to HW that VF has scattered queue allocation */ + wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M); + for (int i = 0; i < vsi->num_tx_queues; i++) { + wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num), + (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M); + } + + /* Mark the RX queue mapping registers as valid */ + wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M); + wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M); + for (int i = 0; i < vsi->num_rx_queues; i++) { + wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num), + (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M); + } +} + +/** + * ice_err_to_virt err - translate ice errors into virtchnl errors + * @ice_err: status returned from ice function + */ +static enum virtchnl_status_code +ice_iov_err_to_virt_err(int ice_err) +{ + switch (ice_err) { + case 0: + return VIRTCHNL_STATUS_SUCCESS; + case ICE_ERR_BAD_PTR: + case ICE_ERR_INVAL_SIZE: + case ICE_ERR_DEVICE_NOT_SUPPORTED: + case ICE_ERR_PARAM: + case ICE_ERR_CFG: + return VIRTCHNL_STATUS_ERR_PARAM; + case ICE_ERR_NO_MEMORY: + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + case ICE_ERR_NOT_READY: + case ICE_ERR_RESET_FAILED: + case ICE_ERR_FW_API_VER: + case ICE_ERR_AQ_ERROR: + case ICE_ERR_AQ_TIMEOUT: + case ICE_ERR_AQ_FULL: + case ICE_ERR_AQ_NO_WORK: + case ICE_ERR_AQ_EMPTY: + return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + default: + return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + } +} diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h new file mode 100644 index 000000000000..c4fb3e932e3f --- /dev/null +++ b/sys/dev/ice/ice_iov.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file ice_iov.h + * @brief header for IOV functionality + * + * This header includes definitions used to implement device Virtual Functions + * for the ice driver. + */ + +#ifndef _ICE_IOV_H_ +#define _ICE_IOV_H_ + +#include <sys/types.h> +#include <sys/bus.h> +#include <sys/nv.h> +#include <sys/iov_schema.h> +#include <sys/param.h> +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <dev/pci/pci_iov.h> + +#include "ice_iflib.h" +#include "ice_vf_mbx.h" + +/** + * @enum ice_vf_flags + * @brief VF state flags + * + * Used to indicate the status of a PF's VF, as well as indicating what each VF + * is capabile of. Intended to be modified only using atomic operations, so + * they can be read and modified in places that aren't locked. + * + * Used in struct ice_vf's vf_flags field. + */ +enum ice_vf_flags { + VF_FLAG_ENABLED = BIT(0), + VF_FLAG_SET_MAC_CAP = BIT(1), + VF_FLAG_VLAN_CAP = BIT(2), + VF_FLAG_PROMISC_CAP = BIT(3), + VF_FLAG_MAC_ANTI_SPOOF = BIT(4), +}; + +/** + * @struct ice_vf + * @brief PF's VF software context + * + * Represents the state and options for a VF spawned from a PF. + */ +struct ice_vf { + struct ice_vsi *vsi; + u32 vf_flags; + + u8 mac[ETHER_ADDR_LEN]; + u16 vf_num; + struct virtchnl_version_info version; + + u16 mac_filter_limit; + u16 mac_filter_cnt; + u16 vlan_limit; + u16 vlan_cnt; + + u16 num_irq_vectors; + u16 *vf_imap; + struct ice_irq_vector *tx_irqvs; + struct ice_irq_vector *rx_irqvs; +}; + +#define ICE_PCIE_DEV_STATUS 0xAA + +#define ICE_PCI_CIAD_WAIT_COUNT 100 +#define ICE_PCI_CIAD_WAIT_DELAY_US 1 +#define ICE_VPGEN_VFRSTAT_WAIT_COUNT 100 +#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US 20 + +#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS (FLAG_VF_UNICAST_PROMISC | \ + FLAG_VF_MULTICAST_PROMISC) + +#define ICE_DEFAULT_VF_VLAN_LIMIT 64 +#define ICE_DEFAULT_VF_FILTER_LIMIT 16 + +int ice_iov_attach(struct ice_softc *sc); +int ice_iov_detach(struct ice_softc *sc); + +int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params); +int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params); +void ice_iov_uninit(struct ice_softc *sc); + +void ice_iov_handle_vflr(struct ice_softc *sc); + +void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event); +void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc); + +#endif /* _ICE_IOV_H_ */ + diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c index d44ae5f37750..442111e5ffaf 100644 --- a/sys/dev/ice/ice_lib.c +++ b/sys/dev/ice/ice_lib.c @@ -42,6 +42,9 @@ #include "ice_lib.h" #include "ice_iflib.h" +#ifdef PCI_IOV +#include "ice_iov.h" +#endif #include <dev/pci/pcivar.h> #include <dev/pci/pcireg.h> #include <machine/resource.h> @@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi) case ICE_VSI_VMDQ2: ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2; break; +#ifdef PCI_IOV + case ICE_VSI_VF: + ctx.flags = ICE_AQ_VSI_TYPE_VF; + ctx.vf_num = vsi->vf_num; + break; +#endif default: return (ENODEV); } @@ -1607,6 +1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf case ICE_VSI_VMDQ2: tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; break; +#ifdef PCI_IOV + case ICE_VSI_VF: + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; + tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num; + break; +#endif default: return (ENODEV); } @@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi) struct ice_tlan_ctx tlan_ctx = { 0 }; struct ice_tx_queue *txq = &vsi->tx_queues[i]; + /* Last configured queue */ + if (txq->desc_count == 0) + break; + pf_q = vsi->tx_qmap[txq->me]; qg->txqs[0].txq_id = htole16(pf_q); @@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi) for (i = 0; i < vsi->num_rx_queues; i++) { MPASS(vsi->mbuf_sz > 0); + /* Last configured queue */ + if (vsi->rx_queues[i].desc_count == 0) + break; + err = ice_setup_rx_ctx(&vsi->rx_queues[i]); if (err) return err; @@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname, case ice_aqc_opc_get_link_status: ice_process_link_event(sc, event); break; +#ifdef PCI_IOV + case ice_mbx_opc_send_msg_to_pf: + ice_vc_handle_vf_msg(sc, event); + break; +#endif case ice_aqc_opc_fw_logs_event: ice_handle_fw_log_event(sc, &event->desc, event->msg_buf); break; diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h index b6b23ec82161..308b2bda2790 100644 --- a/sys/dev/ice/ice_lib.h +++ b/sys/dev/ice/ice_lib.h @@ -611,6 +611,10 @@ struct ice_vsi { u16 mirror_src_vsi; u16 rule_mir_ingress; u16 rule_mir_egress; + +#ifdef PCI_IOV + u8 vf_num; /* Index of owning VF, if applicable */ +#endif }; /** diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c new file mode 100644 index 000000000000..387a1c6739a6 --- /dev/null +++ b/sys/dev/ice/ice_vf_mbx.c @@ -0,0 +1,471 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ice_common.h" +#include "ice_hw_autogen.h" +#include "ice_vf_mbx.h" + +/** + * ice_aq_send_msg_to_vf + * @hw: pointer to the hardware structure + * @vfid: VF ID to send msg + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cd: pointer to command details + * + * Send message to VF driver (0x0802) using mailbox + * queue and asynchronously sending message via + * ice_sq_send_cmd() function + */ +int +ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, + u8 *msg, u16 msglen, struct ice_sq_cd *cd) +{ + struct ice_aqc_pf_vf_msg *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf); + + cmd = &desc.params.virt; + cmd->id = CPU_TO_LE32(vfid); + + desc.cookie_high = CPU_TO_LE32(v_opcode); + desc.cookie_low = CPU_TO_LE32(v_retval); + + if (msglen) + desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD); + + return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); +} + +/** + * ice_aq_send_msg_to_pf + * @hw: pointer to the hardware structure + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cd: pointer to command details + * + * Send message to PF driver using mailbox queue. By default, this + * message is sent asynchronously, i.e. ice_sq_send_cmd() + * does not wait for completion before returning. + */ +int +ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode, + int v_retval, u8 *msg, u16 msglen, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_pf); + desc.cookie_high = CPU_TO_LE32(v_opcode); + desc.cookie_low = CPU_TO_LE32(v_retval); + + if (msglen) + desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD); + + return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); +} + +static const u32 ice_legacy_aq_to_vc_speed[] = { + VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */ + VIRTCHNL_LINK_SPEED_100MB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_10GB, + VIRTCHNL_LINK_SPEED_20GB, + VIRTCHNL_LINK_SPEED_25GB, + VIRTCHNL_LINK_SPEED_40GB, + VIRTCHNL_LINK_SPEED_40GB, + VIRTCHNL_LINK_SPEED_40GB, +}; + +/** + * ice_conv_link_speed_to_virtchnl + * @adv_link_support: determines the format of the returned link speed + * @link_speed: variable containing the link_speed to be converted + * + * Convert link speed supported by HW to link speed supported by virtchnl. + * If adv_link_support is true, then return link speed in Mbps. Else return + * link speed as a VIRTCHNL_LINK_SPEED_* casted to a u32. Note that the caller + * needs to cast back to an enum virtchnl_link_speed in the case where + * adv_link_support is false, but when adv_link_support is true the caller can + * expect the speed in Mbps. + */ +u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) +{ + /* convert a BIT() value into an array index */ + u16 index = (u16)(ice_fls(link_speed) - 1); + + if (adv_link_support) + return ice_get_link_speed(index); + else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed)) + /* Virtchnl speeds are not defined for every speed supported in + * the hardware. To maintain compatibility with older AVF + * drivers, while reporting the speed the new speed values are + * resolved to the closest known virtchnl speeds + */ + return ice_legacy_aq_to_vc_speed[index]; + + return VIRTCHNL_LINK_SPEED_UNKNOWN; +} + +/* The mailbox overflow detection algorithm helps to check if there + * is a possibility of a malicious VF transmitting too many MBX messages to the + * PF. + * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during + * driver initialization in ice_init_hw() using ice_mbx_init_snapshot(). + * The struct ice_mbx_snapshot helps to track and traverse a static window of + * messages within the mailbox queue while looking for a malicious VF. + * + * 2. When the caller starts processing its mailbox queue in response to an + * interrupt, the structure ice_mbx_snapshot is expected to be cleared before + * the algorithm can be run for the first time for that interrupt. This + * requires calling ice_mbx_reset_snapshot() as well as calling + * ice_mbx_reset_vf_info() for each VF tracking structure. + * + * 3. For every message read by the caller from the MBX Queue, the caller must + * call the detection algorithm's entry function ice_mbx_vf_state_handler(). + * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is + * filled as it is required to be passed to the algorithm. + * + * 4. Every time a message is read from the MBX queue, a tracking structure + * for the VF must be passed to the state handler. The boolean output + * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the + * caller whether it must report this VF as malicious or not. + * + * 5. When a VF is identified to be malicious, the caller can send a message + * to the system administrator. + * + * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info + * structure for each VF. The PF should clear the VF tracking structure if the + * VF is reset. When a VF is shut down and brought back up, we will then + * assume that the new VF is not malicious and may report it again if we + * detect it again. + * + * 7. The function ice_mbx_reset_snapshot() is called to reset the information + * in ice_mbx_snapshot for every new mailbox interrupt handled. + */ +#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M) +/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that + * the max messages check must be ignored in the algorithm + */ +#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF + +/** + * ice_mbx_reset_snapshot - Initialize mailbox snapshot structure + * @snap: pointer to the mailbox snapshot + */ +static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap) +{ + struct ice_mbx_vf_info *vf_info; + + /* Clear mbx_buf in the mailbox snaphot structure and setting the + * mailbox snapshot state to a new capture. + */ + ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM); + snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; + + /* Reset message counts for all VFs to zero */ + LIST_FOR_EACH_ENTRY(vf_info, &snap->mbx_vf, ice_mbx_vf_info, list_entry) + vf_info->msg_count = 0; +} + +/** + * ice_mbx_traverse - Pass through mailbox snapshot + * @hw: pointer to the HW struct + * @new_state: new algorithm state + * + * Traversing the mailbox static snapshot without checking + * for malicious VFs. + */ +static void +ice_mbx_traverse(struct ice_hw *hw, + enum ice_mbx_snapshot_state *new_state) +{ + struct ice_mbx_snap_buffer_data *snap_buf; + u32 num_iterations; + + snap_buf = &hw->mbx_snapshot.mbx_buf; + + /* As mailbox buffer is circular, applying a mask + * on the incremented iteration count. + */ + num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations); + + /* Checking either of the below conditions to exit snapshot traversal: + * Condition-1: If the number of iterations in the mailbox is equal to + * the mailbox head which would indicate that we have reached the end + * of the static snapshot. + * Condition-2: If the maximum messages serviced in the mailbox for a + * given interrupt is the highest possible value then there is no need + * to check if the number of messages processed is equal to it. If not + * check if the number of messages processed is greater than or equal + * to the maximum number of mailbox entries serviced in current work item. + */ + if (num_iterations == snap_buf->head || + (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT && + ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx)) + *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; +} + +/** + * ice_mbx_detect_malvf - Detect malicious VF in snapshot + * @hw: pointer to the HW struct + * @vf_info: mailbox tracking structure for a VF + * @new_state: new algorithm state + * @is_malvf: boolean output to indicate if VF is malicious + * + * This function tracks the number of asynchronous messages + * sent per VF and marks the VF as malicious if it exceeds + * the permissible number of messages to send. + */ +static int +ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info, + enum ice_mbx_snapshot_state *new_state, + bool *is_malvf) +{ + /* increment the message count for this VF */ + vf_info->msg_count++; + + if (vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD) + *is_malvf = true; + + /* continue to iterate through the mailbox snapshot */ + ice_mbx_traverse(hw, new_state); + + return 0; +} + +/** + * ice_e830_mbx_vf_dec_trig - Decrements the VF mailbox queue counter + * @hw: pointer to the HW struct + * @event: pointer to the control queue receive event + * + * This function triggers to decrement the counter + * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes + * the buffers at the PF mailbox queue. + */ +void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw, + struct ice_rq_event_info *event) +{ + u16 vfid = LE16_TO_CPU(event->desc.retval); + + wr32(hw, E830_MBX_VF_DEC_TRIG(vfid), 1); +} + +/** + * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count + * @hw: pointer to the HW struct + * @vf_id: VF ID in the PF space + * + * This function clears the counter MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT, and should + * be called when a VF is created and on VF reset. + */ +void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id) +{ + u32 reg = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id)); + + wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), reg); +} + +/** + * ice_mbx_vf_state_handler - Handle states of the overflow algorithm + * @hw: pointer to the HW struct + * @mbx_data: pointer to structure containing mailbox data + * @vf_info: mailbox tracking structure for the VF in question + * @report_malvf: boolean output to indicate whether VF should be reported + * + * The function serves as an entry point for the malicious VF + * detection algorithm by handling the different states and state + * transitions of the algorithm: + * New snapshot: This state is entered when creating a new static + * snapshot. The data from any previous mailbox snapshot is + * cleared and a new capture of the mailbox head and tail is + * logged. This will be the new static snapshot to detect + * asynchronous messages sent by VFs. On capturing the snapshot + * and depending on whether the number of pending messages in that + * snapshot exceed the watermark value, the state machine enters + * traverse or detect states. + * Traverse: If pending message count is below watermark then iterate + * through the snapshot without any action on VF. + * Detect: If pending message count exceeds watermark traverse + * the static snapshot and look for a malicious VF. + */ +int +ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data, + struct ice_mbx_vf_info *vf_info, bool *report_malvf) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + struct ice_mbx_snap_buffer_data *snap_buf; + struct ice_ctl_q_info *cq = &hw->mailboxq; + enum ice_mbx_snapshot_state new_state; + int status = 0; + bool is_malvf = false; + + if (!report_malvf || !mbx_data || !vf_info) + return ICE_ERR_BAD_PTR; + + *report_malvf = false; + + /* When entering the mailbox state machine assume that the VF + * is not malicious until detected. + */ + /* Checking if max messages allowed to be processed while servicing current + * interrupt is not less than the defined AVF message threshold. + */ + if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD) + return ICE_ERR_INVAL_SIZE; + + /* The watermark value should not be lesser than the threshold limit + * set for the number of asynchronous messages a VF can send to mailbox + * nor should it be greater than the maximum number of messages in the + * mailbox serviced in current interrupt. + */ + if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD || + mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx) + return ICE_ERR_PARAM; + + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + snap_buf = &snap->mbx_buf; + + switch (snap_buf->state) { + case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT: + /* Clear any previously held data in mailbox snapshot structure. */ + ice_mbx_reset_snapshot(snap); + + /* Collect the pending ARQ count, number of messages processed and + * the maximum number of messages allowed to be processed from the + * Mailbox for current interrupt. + */ + snap_buf->num_pending_arq = mbx_data->num_pending_arq; + snap_buf->num_msg_proc = mbx_data->num_msg_proc; + snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx; + + /* Capture a new static snapshot of the mailbox by logging the + * head and tail of snapshot and set num_iterations to the tail + * value to mark the start of the iteration through the snapshot. + */ + snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean + + mbx_data->num_pending_arq); + snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1); + snap_buf->num_iterations = snap_buf->tail; + + /* Pending ARQ messages returned by ice_clean_rq_elem + * is the difference between the head and tail of the + * mailbox queue. Comparing this value against the watermark + * helps to check if we potentially have malicious VFs. + */ + if (snap_buf->num_pending_arq >= + mbx_data->async_watermark_val) { + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf); + } else { + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + } + break; + + case ICE_MAL_VF_DETECT_STATE_TRAVERSE: + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + break; + + case ICE_MAL_VF_DETECT_STATE_DETECT: + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf); + break; + + default: + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + status = ICE_ERR_CFG; + } + + snap_buf->state = new_state; + + /* Only report VFs as malicious the first time we detect it */ + if (is_malvf && !vf_info->malicious) { + vf_info->malicious = 1; + *report_malvf = true; + } + + return status; +} + +/** + * ice_mbx_clear_malvf - Clear VF mailbox info + * @vf_info: the mailbox tracking structure for a VF + * + * In case of a VF reset, this function shall be called to clear the VF's + * current mailbox tracking state. + */ +void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info) +{ + vf_info->malicious = 0; + vf_info->msg_count = 0; +} + +/** + * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info + * @hw: pointer to the hardware structure + * @vf_info: the mailbox tracking info structure for a VF + * + * Initialize a VF mailbox tracking info structure and insert it into the + * snapshot list. + * + * If you remove the VF, you must also delete the associated VF info structure + * from the linked list. + */ +void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + ice_mbx_clear_malvf(vf_info); + LIST_ADD(&vf_info->list_entry, &snap->mbx_vf); +} + +/** + * ice_mbx_init_snapshot - Initialize mailbox snapshot data + * @hw: pointer to the hardware structure + * + * Clear the mailbox snapshot structure and initialize the VF mailbox list. + */ +void ice_mbx_init_snapshot(struct ice_hw *hw) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + INIT_LIST_HEAD(&snap->mbx_vf); + ice_mbx_reset_snapshot(snap); +} diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h new file mode 100644 index 000000000000..3b185ac89c11 --- /dev/null +++ b/sys/dev/ice/ice_vf_mbx.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ICE_VF_MBX_H_ +#define _ICE_VF_MBX_H_ + +#include "ice_type.h" +#include "ice_controlq.h" + +/* Defining the mailbox message threshold as 63 asynchronous + * pending messages. Normal VF functionality does not require + * sending more than 63 asynchronous pending message. + */ + + /* Threshold value should be used to initialize + * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register. + */ +#define ICE_ASYNC_VF_MSG_THRESHOLD 63 + +int +ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode, + int v_retval, u8 *msg, u16 msglen, + struct ice_sq_cd *cd); +int +ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, + u8 *msg, u16 msglen, struct ice_sq_cd *cd); + +u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed); + +void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw, + struct ice_rq_event_info *event); +void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id); +int +ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data, + struct ice_mbx_vf_info *vf_info, bool *report_malvf); +void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info); +void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info); +void ice_mbx_init_snapshot(struct ice_hw *hw); +#endif /* _ICE_VF_MBX_H_ */ diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c index e60ee0f1c5c3..1469d2916465 100644 --- a/sys/dev/ice/if_ice_iflib.c +++ b/sys/dev/ice/if_ice_iflib.c @@ -42,6 +42,9 @@ #include "ice_drv_info.h" #include "ice_switch.h" #include "ice_sched.h" +#ifdef PCI_IOV +#include "ice_iov.h" +#endif #include <sys/module.h> #include <sys/sockio.h> @@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx); static int ice_if_resume(if_ctx_t ctx); static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event); static void ice_init_link(struct ice_softc *sc); +#ifdef PCI_IOV +static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params); +static void ice_if_iov_uninit(if_ctx_t ctx); +static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params); +static void ice_if_vflr_handle(if_ctx_t ctx); +#endif static int ice_setup_mirror_vsi(struct ice_mirr_if *mif); static int ice_wire_mirror_intrs(struct ice_mirr_if *mif); static void ice_free_irqvs_subif(struct ice_mirr_if *mif); @@ -158,6 +167,11 @@ static device_method_t ice_methods[] = { DEVMETHOD(device_shutdown, iflib_device_shutdown), DEVMETHOD(device_suspend, iflib_device_suspend), DEVMETHOD(device_resume, iflib_device_resume), +#ifdef PCI_IOV + DEVMETHOD(pci_iov_init, iflib_device_iov_init), + DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit), + DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf), +#endif DEVMETHOD_END }; @@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = { DEVMETHOD(ifdi_suspend, ice_if_suspend), DEVMETHOD(ifdi_resume, ice_if_resume), DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart), +#ifdef PCI_IOV + DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add), + DEVMETHOD(ifdi_iov_init, ice_if_iov_init), + DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit), + DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle), +#endif DEVMETHOD_END }; @@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media) iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0); ice_rdma_link_change(sc, LINK_STATE_DOWN, 0); } +#ifdef PCI_IOV + ice_vc_notify_all_vfs_link_state(sc); +#endif update_media = true; } @@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx) ice_add_device_sysctls(sc); +#ifdef PCI_IOV + if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) { + err = ice_iov_attach(sc); + if (err == ENOMEM) + return (err); + } +#endif /* PCI_IOV */ + /* Get DCBX/LLDP state and start DCBX agent */ ice_init_dcb_setup(sc); @@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx) ice_destroy_mirror_interface(sc); ice_rdma_pf_detach(sc); +#ifdef PCI_IOV + if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) + ice_iov_detach(sc); +#endif /* PCI_IOV */ + /* Free allocated media types */ ifmedia_removeall(sc->media); @@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix) /* For future interrupt assignments */ sc->last_rid = rid + sc->irdma_vectors; +#ifdef PCI_IOV + /* Create soft IRQ for handling VF resets */ + iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov"); +#endif + return (0); fail: for (; i >= 0; i--, vector--) @@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc) ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); +#ifdef PCI_IOV + if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en)) + ice_iov_detach(sc); +#else ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); +#endif /* PCI_IOV */ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_vsi_del_txqs_ctx(vsi); @@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc) ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); +#ifdef PCI_IOV + if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en)) + ice_iov_detach(sc); +#else ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); +#endif /* PCI_IOV */ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap); @@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx) /* Check and update link status */ ice_update_link_status(sc, false); +#ifdef PCI_IOV + /* + * Schedule VFs' reset handler after global resets + * and other events were processed. + */ + if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING)) + iflib_iov_intr_deferred(ctx); +#endif + /* * If there are still messages to process, we need to reschedule * ourselves. Otherwise, we can just re-enable the interrupt. We'll be @@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc) } +#ifdef PCI_IOV +/** + * ice_if_iov_init - iov init handler for iflib + * @ctx: iflib context pointer + * @num_vfs: number of VFs to create + * @params: configuration parameters for the PF + * + * Configure the driver for SR-IOV mode. Used to setup things like memory + * before any VFs are created. + * + * @remark This is a wrapper for ice_iov_init + */ +static int +ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params) +{ + struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); + + return ice_iov_init(sc, num_vfs, params); +} + +/** + * ice_if_iov_uninit - iov uninit handler for iflib + * @ctx: iflib context pointer + * + * Destroys VFs and frees their memory and resources. + * + * @remark This is a wrapper for ice_iov_uninit + */ +static void +ice_if_iov_uninit(if_ctx_t ctx) +{ + struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); + + ice_iov_uninit(sc); +} + +/** + * ice_if_iov_vf_add - iov add vf handler for iflib + * @ctx: iflib context pointer + * @vfnum: index of VF to configure + * @params: configuration parameters for the VF + * + * Sets up the VF given by the vfnum index. This is called by the OS + * for each VF created by the PF driver after it is spawned. + * + * @remark This is a wrapper for ice_iov_vf_add + */ +static int +ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params) +{ + struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); + + return ice_iov_add_vf(sc, vfnum, params); +} + +/** + * ice_if_vflr_handle - iov VFLR handler + * @ctx: iflib context pointer + * + * Performs the necessar teardown or setup required for a VF after + * a VFLR is initiated. + * + * @remark This is a wrapper for ice_iov_handle_vflr + */ +static void +ice_if_vflr_handle(if_ctx_t ctx) +{ + struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); + ice_iov_handle_vflr(sc); +} +#endif /* PCI_IOV */ + extern struct if_txrx ice_subif_txrx; /** diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index 741a7c013f7d..29dc0c880e3a 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -11,9 +11,9 @@ */ /*- - * The following functions are based on the vn(4) driver: mdstart_swap(), - * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), - * and as such under the following copyright: + * The following functions are based on the historical vn(4) driver: + * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() + * and mddestroy(), and as such under the following copyright: * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c index 1240d0f84415..409f34167df0 100644 --- a/sys/dev/mgb/if_mgb.c +++ b/sys/dev/mgb/if_mgb.c @@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc) /* Stop MAC */ CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL); - CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL); + CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL); if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0))) return (err); if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0))) diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h index 361b9f72d873..c3f3a2372482 100644 --- a/sys/dev/mlx5/mlx5_accel/ipsec.h +++ b/sys/dev/mlx5/mlx5_accel/ipsec.h @@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv); void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv); int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv); int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr); -void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, - struct mlx5e_rq_mbuf *mr); +void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb, + struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr); static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe) { @@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe) } static inline void -mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe, +mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr) { u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata); if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data)) - mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr); + mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr); } #endif /* __MLX5_ACCEL_IPSEC_H__ */ diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c index 0883cfb2d510..5dccb8bc2b87 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c @@ -24,11 +24,14 @@ * */ +#include "opt_ipsec.h" + #include <sys/mbuf.h> #include <sys/socket.h> #include <netinet/in.h> #include <netipsec/keydb.h> #include <netipsec/ipsec_offload.h> +#include <netipsec/xform.h> #include <dev/mlx5/qp.h> #include <dev/mlx5/mlx5_en/en.h> #include <dev/mlx5/mlx5_accel/ipsec.h> @@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr) return (0); mtag = (struct ipsec_accel_in_tag *)m_tag_get( - PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT); + PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) - + __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT); if (mtag == NULL) return (-ENOMEM); mr->ipsec_mtag = mtag; @@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr) } void -mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, - struct mlx5e_rq_mbuf *mr) +mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb, + struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr) { struct ipsec_accel_in_tag *mtag; u32 drv_spi; @@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata)); mtag = mr->ipsec_mtag; WARN_ON(mtag == NULL); - mr->ipsec_mtag = NULL; if (mtag != NULL) { mtag->drv_spi = drv_spi; - m_tag_prepend(mb, &mtag->tag); + if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) { + m_tag_prepend(mb, &mtag->tag); + mr->ipsec_mtag = NULL; + } } } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c index 4de451f1b039..89d2010656c5 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c @@ -659,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p return (EINVAL); MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he); - MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he); + MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0); + MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he); return (0); } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c index 6b53db6fea23..eb569488631a 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, break; } - mlx5e_accel_ipsec_handle_rx(mb, cqe, mr); + mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr); } static inline void diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c index 05ec69a70dfe..9d23d5df1d2b 100644 --- a/sys/dev/qlnx/qlnxe/qlnx_os.c +++ b/sys/dev/qlnx/qlnxe/qlnx_os.c @@ -30,6 +30,8 @@ * Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131. */ +#include "opt_inet.h" + #include <sys/cdefs.h> #include "qlnx_os.h" #include "bcm_osal.h" @@ -2778,7 +2780,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data) if (!p_ptt) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); - ret = -1; + ret = ERESTART; break; } @@ -2789,7 +2791,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data) ecore_ptt_release(p_hwfn, p_ptt); if (ret) { - ret = -1; + ret = ENODEV; break; } diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c index c4282c723a44..8363de99a60a 100644 --- a/sys/dev/random/fortuna.c +++ b/sys/dev/random/fortuna.c @@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event) u_int pl; RANDOM_RESEED_LOCK(); + /* + * Run SP 800-90B health tests on the source if so configured. + */ + if (!random_harvest_healthtest(event)) { + RANDOM_RESEED_UNLOCK(); + return; + } /*- * FS&K - P_i = P_i|<harvested stuff> * Accumulate the event into the appropriate pool diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c index 395310b115fb..c7762967c4fb 100644 --- a/sys/dev/random/random_harvestq.c +++ b/sys/dev/random/random_harvestq.c @@ -88,6 +88,8 @@ static void random_sources_feed(void); static __read_mostly bool epoch_inited; static __read_mostly epoch_t rs_epoch; +static const char *random_source_descr[ENTROPYSOURCE]; + /* * How many events to queue up. We create this many items in * an 'empty' queue, then transfer them to the 'harvest' queue with @@ -299,6 +301,230 @@ random_sources_feed(void) explicit_bzero(entropy, sizeof(entropy)); } +/* + * State used for conducting NIST SP 800-90B health tests on entropy sources. + */ +static struct health_test_softc { + uint32_t ht_rct_value[HARVESTSIZE + 1]; + u_int ht_rct_count; /* number of samples with the same value */ + u_int ht_rct_limit; /* constant after init */ + + uint32_t ht_apt_value[HARVESTSIZE + 1]; + u_int ht_apt_count; /* number of samples with the same value */ + u_int ht_apt_seq; /* sequence number of the last sample */ + u_int ht_apt_cutoff; /* constant after init */ + + uint64_t ht_total_samples; + bool ondemand; /* Set to true to restart the state machine */ + enum { + INIT = 0, /* initial state */ + DISABLED, /* health checking is disabled */ + STARTUP, /* doing startup tests, samples are discarded */ + STEADY, /* steady-state operation */ + FAILED, /* health check failed, discard samples */ + } ht_state; +} healthtest[ENTROPYSOURCE]; + +#define RANDOM_SELFTEST_STARTUP_SAMPLES 1024 /* 4.3, requirement 4 */ +#define RANDOM_SELFTEST_APT_WINDOW 512 /* 4.4.2 */ + +static void +copy_event(uint32_t dst[static HARVESTSIZE + 1], + const struct harvest_event *event) +{ + memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1)); + memcpy(dst, event->he_entropy, event->he_size); + dst[HARVESTSIZE] = event->he_somecounter; +} + +static void +random_healthtest_rct_init(struct health_test_softc *ht, + const struct harvest_event *event) +{ + ht->ht_rct_count = 1; + copy_event(ht->ht_rct_value, event); +} + +/* + * Apply the repitition count test to a sample. + * + * Return false if the test failed, i.e., we observed >= C consecutive samples + * with the same value, and true otherwise. + */ +static bool +random_healthtest_rct_next(struct health_test_softc *ht, + const struct harvest_event *event) +{ + uint32_t val[HARVESTSIZE + 1]; + + copy_event(val, event); + if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) { + ht->ht_rct_count = 1; + memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value)); + return (true); + } else { + ht->ht_rct_count++; + return (ht->ht_rct_count < ht->ht_rct_limit); + } +} + +static void +random_healthtest_apt_init(struct health_test_softc *ht, + const struct harvest_event *event) +{ + ht->ht_apt_count = 1; + ht->ht_apt_seq = 1; + copy_event(ht->ht_apt_value, event); +} + +static bool +random_healthtest_apt_next(struct health_test_softc *ht, + const struct harvest_event *event) +{ + uint32_t val[HARVESTSIZE + 1]; + + if (ht->ht_apt_seq == 0) { + random_healthtest_apt_init(ht, event); + return (true); + } + + copy_event(val, event); + if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) { + ht->ht_apt_count++; + if (ht->ht_apt_count >= ht->ht_apt_cutoff) + return (false); + } + + ht->ht_apt_seq++; + if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW) + ht->ht_apt_seq = 0; + + return (true); +} + +/* + * Run the health tests for the given event. This is assumed to be called from + * a serialized context. + */ +bool +random_harvest_healthtest(const struct harvest_event *event) +{ + struct health_test_softc *ht; + + ht = &healthtest[event->he_source]; + + /* + * Was on-demand testing requested? Restart the state machine if so, + * restarting the startup tests. + */ + if (atomic_load_bool(&ht->ondemand)) { + atomic_store_bool(&ht->ondemand, false); + ht->ht_state = INIT; + } + + switch (ht->ht_state) { + case __predict_false(INIT): + /* Store the first sample and initialize test state. */ + random_healthtest_rct_init(ht, event); + random_healthtest_apt_init(ht, event); + ht->ht_total_samples = 0; + ht->ht_state = STARTUP; + return (false); + case DISABLED: + /* No health testing for this source. */ + return (true); + case STEADY: + case STARTUP: + ht->ht_total_samples++; + if (random_healthtest_rct_next(ht, event) && + random_healthtest_apt_next(ht, event)) { + if (ht->ht_state == STARTUP && + ht->ht_total_samples >= + RANDOM_SELFTEST_STARTUP_SAMPLES) { + printf( + "random: health test passed for source %s\n", + random_source_descr[event->he_source]); + ht->ht_state = STEADY; + } + return (ht->ht_state == STEADY); + } + ht->ht_state = FAILED; + printf( + "random: health test failed for source %s, discarding samples\n", + random_source_descr[event->he_source]); + /* FALLTHROUGH */ + case FAILED: + return (false); + } +} + +static bool nist_healthtest_enabled = false; +SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled, + CTLFLAG_RDTUN, &nist_healthtest_enabled, 0, + "Enable NIST SP 800-90B health tests for noise sources"); + +static void +random_healthtest_init(enum random_entropy_source source) +{ + struct health_test_softc *ht; + + ht = &healthtest[source]; + KASSERT(ht->ht_state == INIT, + ("%s: health test state is %d for source %d", + __func__, ht->ht_state, source)); + + /* + * If health-testing is enabled, validate all sources except CACHED and + * VMGENID: they are deterministic sources used only a small, fixed + * number of times, so statistical testing is not applicable. + */ + if (!nist_healthtest_enabled || + source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) { + ht->ht_state = DISABLED; + return; + } + + /* + * Set cutoff values for the two tests, assuming that each sample has + * min-entropy of 1 bit and allowing for an error rate of 1 in 2^{34}. + * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see an false + * positive once in ~54.5 years. + * + * The RCT limit comes from the formula in section 4.4.1. + * + * The APT cutoff is calculated using the formula in section 4.4.2 + * footnote 10 with the window size changed from 512 to 511, since the + * test as written counts the number of samples equal to the first + * sample in the window, and thus tests W-1 samples. + */ + ht->ht_rct_limit = 35; + ht->ht_apt_cutoff = 330; +} + +static int +random_healthtest_ondemand(SYSCTL_HANDLER_ARGS) +{ + u_int mask, source; + int error; + + mask = 0; + error = sysctl_handle_int(oidp, &mask, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + + while (mask != 0) { + source = ffs(mask) - 1; + if (source < nitems(healthtest)) + atomic_store_bool(&healthtest[source].ondemand, true); + mask &= ~(1u << source); + } + return (0); +} +SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + random_healthtest_ondemand, "I", + "Re-run NIST SP 800-90B startup health tests for a noise source"); + static int random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS) { @@ -362,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = { [RANDOM_SWI] = "SWI", [RANDOM_FS_ATIME] = "FS_ATIME", [RANDOM_UMA] = "UMA", - [RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */ + [RANDOM_CALLOUT] = "CALLOUT", + [RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */ [RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */ [RANDOM_PURE_SAFE] = "PURE_SAFE", [RANDOM_PURE_GLXSB] = "PURE_GLXSB", @@ -424,6 +651,9 @@ random_harvestq_init(void *unused __unused) hc_source_mask = almost_everything_mask; RANDOM_HARVEST_INIT_LOCK(); harvest_context.hc_active_buf = 0; + + for (int i = 0; i < ENTROPYSOURCE; i++) + random_healthtest_init(i); } SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL); diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h index 7804bf52aa4f..1d462500df85 100644 --- a/sys/dev/random/random_harvestq.h +++ b/sys/dev/random/random_harvestq.h @@ -49,4 +49,6 @@ random_get_cyclecount(void) return ((uint32_t)get_cyclecount()); } +bool random_harvest_healthtest(const struct harvest_event *event); + #endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */ diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c index 9d1c7b1167c8..ced4dd8067d9 100644 --- a/sys/dev/random/randomdev.c +++ b/sys/dev/random/randomdev.c @@ -312,7 +312,7 @@ randomdev_accumulate(uint8_t *buf, u_int count) for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) { event.he_somecounter = random_get_cyclecount(); event.he_size = sizeof(event.he_entropy); - event.he_source = RANDOM_CACHED; + event.he_source = RANDOM_RANDOMDEV; event.he_destination = destination++; /* Harmless cheating */ memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy)); p_random_alg_context->ra_event_processor(&event); diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h index cac743884ee6..ac58d44102a0 100644 --- a/sys/dev/ufshci/ufshci_private.h +++ b/sys/dev/ufshci/ufshci_private.h @@ -149,6 +149,8 @@ struct ufshci_hw_queue { bus_dmamap_t queuemem_map; bus_addr_t req_queue_addr; + bus_addr_t *ucd_bus_addr; + uint32_t num_entries; uint32_t num_trackers; @@ -198,8 +200,6 @@ struct ufshci_req_queue { bus_dma_tag_t dma_tag_payload; bus_dmamap_t ucdmem_map; - - bus_addr_t ucd_addr; }; struct ufshci_device { diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c index 4670281d367a..b1f303afaef5 100644 --- a/sys/dev/ufshci/ufshci_req_sdb.c +++ b/sys/dev/ufshci/ufshci_req_sdb.c @@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue) } } +static void +ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) +{ + struct ufshci_hw_queue *hwq = arg; + int i; + + if (error != 0) { + printf("ufshci: Failed to map UCD, error = %d\n", error); + return; + } + + if (hwq->num_trackers != nseg) { + printf( + "ufshci: Failed to map UCD, num_trackers = %d, nseg = %d\n", + hwq->num_trackers, nseg); + return; + } + + for (i = 0; i < nseg; i++) { + hwq->ucd_bus_addr[i] = seg[i].ds_addr; + } +} + static int ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, uint32_t num_entries, struct ufshci_controller *ctrlr) @@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q]; struct ufshci_tracker *tr; size_t ucd_allocsz, payload_allocsz; - uint64_t ucdmem_phys; uint8_t *ucdmem; int i, error; @@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, * Allocate physical memory for UTP Command Descriptor (UCD) * Note: UFSHCI UCD format is restricted to 128-byte alignment. */ - error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, - ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, - ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size), - ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd); + error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0, + BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz, + howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)), + sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL, + &req_queue->dma_tag_ucd); if (error != 0) { ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n", error); @@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, } if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map, - ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) { + ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) { ufshci_printf(ctrlr, "failed to load cmd desc memory\n"); bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd, req_queue->ucdmem_map); @@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, } req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem; - req_queue->ucd_addr = ucdmem_phys; /* * Allocate physical memory for PRDT @@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, tr->slot_state = UFSHCI_SLOT_STATE_FREE; tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem; - tr->ucd_bus_addr = ucdmem_phys; + tr->ucd_bus_addr = hwq->ucd_bus_addr[i]; ucdmem += sizeof(struct ufshci_utp_cmd_desc); - ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc); hwq->act_tr[i] = tr; } @@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr, req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI, M_ZERO | M_NOWAIT); hwq = &req_queue->hwq[UFSHCI_SDB_Q]; + hwq->num_entries = req_queue->num_entries; + hwq->num_trackers = req_queue->num_trackers; + req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) * + req_queue->num_trackers, + M_UFSHCI, M_ZERO | M_NOWAIT); mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF); @@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr, if (mtx_initialized(&hwq->qlock)) mtx_destroy(&hwq->qlock); + free(req_queue->hwq->ucd_bus_addr, M_UFSHCI); free(req_queue->hwq, M_UFSHCI); } diff --git a/sys/dev/usb/controller/xhci_pci.c b/sys/dev/usb/controller/xhci_pci.c index b50e33ea36ce..d5cfd228a429 100644 --- a/sys/dev/usb/controller/xhci_pci.c +++ b/sys/dev/usb/controller/xhci_pci.c @@ -99,6 +99,11 @@ xhci_pci_match(device_t self) return ("AMD Starship USB 3.0 controller"); case 0x149c1022: return ("AMD Matisse USB 3.0 controller"); + case 0x15b61022: + case 0x15b71022: + return ("AMD Raphael/Granite Ridge USB 3.1 controller"); + case 0x15b81022: + return ("AMD Raphael/Granite Ridge USB 2.0 controller"); case 0x15e01022: case 0x15e11022: return ("AMD Raven USB 3.1 controller"); @@ -109,6 +114,8 @@ xhci_pci_match(device_t self) return ("AMD 300 Series USB 3.1 controller"); case 0x43d51022: return ("AMD 400 Series USB 3.1 controller"); + case 0x43f71022: + return ("AMD 600 Series USB 3.2 controller"); case 0x78121022: case 0x78141022: case 0x79141022: diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c index 676ea5de12b8..58a22b8bdc50 100644 --- a/sys/fs/fdescfs/fdesc_vnops.c +++ b/sys/fs/fdescfs/fdesc_vnops.c @@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap) fmp = VFSTOFDESC(ap->a_vp->v_mount); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || @@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap) fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); - while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { + while (uio->uio_resid >= UIO_MX) { + if (i >= fdp->fd_nfiles + 2) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + break; + } bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' */ diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c index da4848169173..208b64930e61 100644 --- a/sys/fs/msdosfs/msdosfs_conv.c +++ b/sys/fs/msdosfs/msdosfs_conv.c @@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag, static u_char * dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp) { - u_char c, *outp; - size_t len, olen; + u_char c, *outp, *outp1; + size_t i, len, olen; outp = outbuf; if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { olen = len = 4; + outp1 = outp; if (lower & (LCASE_BASE | LCASE_EXT)) msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen, KICONV_LOWER); else msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen); + for (i = 0; i < outp - outp1; i++) { + if (outp1[i] == '/') + outp1[i] = '?'; + } len -= olen; /* @@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc c = dos2unix[c]; if (lower & (LCASE_BASE | LCASE_EXT)) c = u2l[c]; + if (c == '/') + c = '?'; *outp++ = c; outbuf[1] = '\0'; } diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index 5db61c8951f6..33e0d94954d7 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -1521,6 +1521,9 @@ msdosfs_readdir(struct vop_readdir_args *ap) ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can @@ -1614,8 +1617,11 @@ msdosfs_readdir(struct vop_readdir_args *ap) on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); - if (diff <= 0) - break; + if (diff <= 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + goto out; + } n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) @@ -1646,6 +1652,8 @@ msdosfs_readdir(struct vop_readdir_args *ap) */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; goto out; } /* @@ -1743,15 +1751,6 @@ out: uio->uio_offset = off; - /* - * Set the eofflag (NFS uses it) - */ - if (ap->a_eofflag) { - if (dep->de_FileSize - (offset - bias) <= 0) - *ap->a_eofflag = 1; - else - *ap->a_eofflag = 0; - } return (error); } diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index 4c498e96a3c0..a957315aaa12 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -647,7 +647,8 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, NFSATTRBIT_TIMECREATE)) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); (void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, + false, false, false); break; } } @@ -2646,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, - struct statfs *pnfssf) + struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem, + bool has_namedattr) { int bitpos, retnum = 0; u_int32_t *tl; @@ -2660,10 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, struct nfsfsinfo fsinf; struct timespec temptime; NFSACL_T *aclp, *naclp = NULL; - size_t atsiz; - bool xattrsupp; short irflag; - long has_pathconf; #ifdef QUOTA struct dqblk dqb; uid_t savuid; @@ -2747,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, } } - /* Check to see if Extended Attributes are supported. */ - xattrsupp = false; - if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) { - if (NFSVOPLOCK(vp, LK_SHARED) == 0) { - error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, - "xxx", NULL, &atsiz, cred, p); - NFSVOPUNLOCK(vp); - if (error != EOPNOTSUPP) - xattrsupp = true; - } - } - /* * Put out the attribute bitmap for the ones being filled in * and get the field for the number of attributes returned. @@ -2780,11 +2767,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT); NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL); } - if (cred == NULL || p == NULL || vp == NULL || - VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM, - &has_pathconf) != 0) - has_pathconf = 0; - if (has_pathconf == 0) { + if (!has_hiddensystem) { NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); } @@ -2828,10 +2811,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, break; case NFSATTRBIT_NAMEDATTR: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); - if (VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, - &has_pathconf) != 0) - has_pathconf = 0; - if (has_pathconf != 0) + if (has_namedattr) *tl = newnfs_true; else *tl = newnfs_false; diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 3b6c1ec90c06..54f60a753c50 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -395,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *); void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int, struct nfsvattr *); int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *, - struct vattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *); + struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *, + NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool, + bool); void nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *); struct mbuf *nfsrv_adj(struct mbuf *, int, int); void nfsrv_postopattr(struct nfsrv_descript *, int, struct nfsvattr *); @@ -735,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *, NFSPROC_T *); int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t); + struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool, + bool); int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, NFSACL_T *, NFSPROC_T *); int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index e0e66baca44d..2f3c59b68518 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -5436,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p, NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false, + false); error = nfscl_request(nd, vp, p, cred); if (error) return (error); diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index 1ae5ed1a75ca..99a781640c53 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -3701,7 +3701,7 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p) if (!error) (void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va, NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0, - (uint64_t)0, NULL); + (uint64_t)0, NULL, false, false, false); break; case NFSV4OP_CBRECALL: NFSCL_DEBUG(4, "cbrecall\n"); diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 43ee0383669f..4f0d5946d6b9 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -2112,7 +2112,8 @@ int nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, int isdgram, int reterr, - int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno) + int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, + bool xattrsupp, bool has_hiddensystem, bool has_namedattr) { struct statfs *sf; int error; @@ -2131,7 +2132,7 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, } error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror, attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root, - mounted_on_fileno, sf); + mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr); free(sf, M_TEMP); NFSEXITCODE2(0, nd); return (error); @@ -2448,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, struct nfsvattr nva, at, *nvap = &nva; struct mbuf *mb0, *mb1; struct nfsreferral *refp; - int nlen, r, error = 0, getret = 1, usevget = 1; + int nlen, r, error = 0, getret = 1, ret, usevget = 1; int siz, cnt, fullsiz, eofflag, ncookies, entrycnt; caddr_t bpos0, bpos1; u_int64_t off, toff, verf __unused; @@ -2462,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno; struct thread *p = curthread; int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); @@ -2936,9 +2940,32 @@ again: *tl++ = newnfs_true; txdr_hyper(*cookiep, tl); dirlen += nfsm_strtom(nd, dp->d_name, nlen); + xattrsupp = false; + has_hiddensystem = false; + has_namedattr = false; if (nvp != NULL) { supports_nfsv4acls = nfs_supportsnfsv4acls(nvp); + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(nvp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, + nd->nd_cred, p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(nvp, + _PC_HAS_HIDDENSYSTEM, &pathval) != + 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; NFSVOPUNLOCK(nvp); } else supports_nfsv4acls = 0; @@ -2958,13 +2985,15 @@ again: nvp, nvap, &nfh, r, &rderrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } else { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &attrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } if (nvp != NULL) vrele(nvp); @@ -6356,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, * the same type (VREG). */ nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL, - NULL, 0, 0, 0, 0, 0, NULL); + NULL, 0, 0, 0, 0, 0, NULL, false, false, false); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index f7564ade401b..9eebcda548c6 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -241,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, { struct nfsvattr nva; fhandle_t fh; - int at_root = 0, error = 0, supports_nfsv4acls; + int at_root = 0, error = 0, ret, supports_nfsv4acls; struct nfsreferral *refp; nfsattrbit_t attrbits, tmpbits; struct mount *mp; @@ -250,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno = 0; accmode_t accmode; struct thread *p = curthread; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) goto out; @@ -307,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, &nva, &attrbits, p); if (nd->nd_repstat == 0) { supports_nfsv4acls = nfs_supportsnfsv4acls(vp); + xattrsupp = false; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(vp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, nd->nd_cred, + p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM, + &pathval) != 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; mp = vp->v_mount; if (nfsrv_enable_crossmntpt != 0 && vp->v_type == VDIR && @@ -340,7 +363,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, (void)nfsvno_fillattr(nd, mp, vp, &nva, &fh, 0, &attrbits, nd->nd_cred, p, isdgram, 1, supports_nfsv4acls, - at_root, mounted_on_fileno); + at_root, mounted_on_fileno, + xattrsupp, has_hiddensystem, + has_namedattr); vfs_unbusy(mp); } vrele(vp); @@ -4353,9 +4378,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, int error = 0; NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN | - NOFOLLOW); + NOFOLLOW | LOCKLEAF); cn.cn_nameptr = "."; cn.cn_namelen = 1; + cn.cn_lkflags = LK_SHARED; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) cn.cn_flags |= CREATENAMED; @@ -4374,6 +4400,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, if (nd->nd_repstat == ENOATTR) nd->nd_repstat = NFSERR_NOENT; } + if (nd->nd_repstat == 0) + NFSVOPUNLOCK(*vpp); vput(dp); NFSEXITCODE2(0, nd); diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c index 56bf766ef801..227e2b93883e 100644 --- a/sys/fs/p9fs/p9fs_vnops.c +++ b/sys/fs/p9fs/p9fs_vnops.c @@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap) return (EBADF); } + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK); /* We haven't reached the end yet. read more. */ @@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap) count = p9_client_readdir(vofid, (char *)io_buffer, diroffset, count); - if (count == 0) + if (count == 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; break; + } if (count < 0) { error = EIO; diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h index 839bbec08254..19e114763cac 100644 --- a/sys/fs/udf/ecma167-udf.h +++ b/sys/fs/udf/ecma167-udf.h @@ -243,7 +243,7 @@ struct part_map_spare { uint8_t n_st; /* Number of Sparing Tables */ uint8_t reserved1; uint32_t st_size; - uint32_t st_loc[1]; + uint32_t st_loc[]; } __packed; union udf_pmap { @@ -266,7 +266,7 @@ struct udf_sparing_table { uint16_t rt_l; /* Relocation Table len */ uint8_t reserved[2]; uint32_t seq_num; - struct spare_map_entry entries[1]; + struct spare_map_entry entries[]; } __packed; /* Partition Descriptor [3/10.5] */ diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c index c7438147c0a0..c5ef1f686093 100644 --- a/sys/fs/udf/udf_vfsops.c +++ b/sys/fs/udf/udf_vfsops.c @@ -81,6 +81,7 @@ #include <sys/fcntl.h> #include <sys/iconv.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/namei.h> @@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) struct ifid *ifhp; struct vnode *nvp; struct udf_node *np; - off_t fsize; + uint64_t fsize; int error; ifhp = (struct ifid *)fhp; @@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) np = VTON(nvp); fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) { + *vpp = NULLVP; + return (EIO); + } *vpp = nvp; vnode_create_vobject(*vpp, fsize, curthread); diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index 88bf4917a851..37889241e8c3 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -39,6 +39,7 @@ #include <sys/conf.h> #include <sys/buf.h> #include <sys/iconv.h> +#include <sys/limits.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/dirent.h> @@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a) } static int -udf_open(struct vop_open_args *ap) { +udf_open(struct vop_open_args *ap) +{ struct udf_node *np = VTON(ap->a_vp); - off_t fsize; + uint64_t fsize; fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) + return (EIO); vnode_create_vobject(ap->a_vp, fsize, ap->a_td); return 0; } @@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a) * that directories consume at least one logical block, * make it appear so. */ - if (fentry->logblks_rec != 0) { - vap->va_size = - le64toh(fentry->logblks_rec) * node->udfmp->bsize; - } else { + vap->va_size = le64toh(fentry->logblks_rec); + if (vap->va_size == 0) vap->va_size = node->udfmp->bsize; - } + else if (vap->va_size > UINT64_MAX / node->udfmp->bsize) + vap->va_size = UINT64_MAX; + else + vap->va_size *= node->udfmp->bsize; } else { vap->va_size = le64toh(fentry->inf_len); } @@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap) struct buf *bp; uint8_t *data; daddr_t lbn, rablock; + uint64_t len; off_t diff, fsize; ssize_t n; int error = 0; @@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap) return (error); } - fsize = le64toh(node->fentry->inf_len); + len = le64toh(node->fentry->inf_len); + if (len > OFF_MAX) { + /* too big, just cap to the requested length */ + len = uio->uio_resid; + } + fsize = len; udfmp = node->udfmp; do { lbn = lblkno(udfmp, uio->uio_offset); @@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a) struct udf_uiodir uiodir; struct udf_dirstream *ds; uint64_t *cookies = NULL; + uint64_t len; int ncookies; int error = 0; @@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a) * Iterate through the file id descriptors. Give the parent dir * entry special attention. */ - ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len), - node->udfmp); + len = le64toh(node->fentry->inf_len); + if (len > INT_MAX) { + /* too big, just cap to INT_MAX */ + len = INT_MAX; + } + ds = udf_opendir(node, uio->uio_offset, len, node->udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ @@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap) struct udf_node *node; void *buf; char *cp; - int error, len, root; + uint64_t len; + int error, root; /* * A symbolic link in UDF is a list of variable-length path @@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap) vp = ap->a_vp; node = VTON(vp); len = le64toh(node->fentry->inf_len); + if (len > MAXPATHLEN) + return (EIO); buf = malloc(len, M_DEVBUF, M_WAITOK); iov[0].iov_len = len; iov[0].iov_base = buf; @@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a) struct udf_mnt *udfmp; struct fileid_desc *fid = NULL; struct udf_dirstream *ds; + uint64_t fsize; u_long nameiop; u_long flags; char *nameptr; long namelen; ino_t id = 0; int offset, error = 0; - int fsize, lkflags, ltype, numdirpasses; + int lkflags, ltype, numdirpasses; dvp = a->a_dvp; node = VTON(dvp); @@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a) nameptr = a->a_cnp->cn_nameptr; namelen = a->a_cnp->cn_namelen; fsize = le64toh(node->fentry->inf_len); + if (fsize > INT_MAX) { + /* too big, just cap to INT_MAX */ + fsize = INT_MAX; + } /* * If this is a LOOKUP and we've already partially searched through diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index e7d460af21d4..f577cd07ac7c 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -17,6 +17,8 @@ # in NOTES. # +#NO_UNIVERSE + cpu I486_CPU cpu I586_CPU cpu I686_CPU diff --git a/sys/i386/conf/GENERIC-NODEBUG b/sys/i386/conf/GENERIC-NODEBUG index ea07613a796f..a93304481b5f 100644 --- a/sys/i386/conf/GENERIC-NODEBUG +++ b/sys/i386/conf/GENERIC-NODEBUG @@ -25,6 +25,8 @@ # in NOTES. # +#NO_UNIVERSE + include GENERIC include "std.nodebug" diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT index 41207eb63cb9..2e947202f723 100644 --- a/sys/i386/conf/LINT +++ b/sys/i386/conf/LINT @@ -1,3 +1,4 @@ +#NO_UNIVERSE include "../../conf/NOTES" include "../../x86/conf/NOTES" diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL index 2a06eb84bff8..8019617ca4d4 100644 --- a/sys/i386/conf/MINIMAL +++ b/sys/i386/conf/MINIMAL @@ -31,6 +31,8 @@ # in NOTES. # +#NO_UNIVERSE + cpu I486_CPU cpu I586_CPU cpu I686_CPU diff --git a/sys/i386/conf/PAE b/sys/i386/conf/PAE index a39d32d77106..72af9e9a9eec 100644 --- a/sys/i386/conf/PAE +++ b/sys/i386/conf/PAE @@ -2,6 +2,8 @@ # PAE -- Generic kernel configuration file for FreeBSD/i386 PAE # +#NO_UNIVERSE + include GENERIC ident PAE-GENERIC diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 465b4d0f365b..b44f5e08bbcf 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void) #ifdef PMAP_PAE_COMP static void * -pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, - int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp, + int flags) { /* Inform UMA that this allocator uses kernel_map/object. */ - *flags = UMA_SLAB_KERNEL; + *sflagsp = UMA_SLAB_KERNEL; + /* contig allocations cannot be NEVERFREED */ + flags &= ~M_NEVERFREED; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), - bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); + bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif @@ -5617,6 +5619,8 @@ __CONCAT(PMTYPE, unmapdev)(void *p, vm_size_t size) static void __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) { + if (m->md.pat_mode == ma) + return; m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 93bdd41d1515..a27ab33b34da 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -557,8 +557,10 @@ open_to_fde_flags(int open_flags, bool sticky_orb) { .f = O_CLOFORK, .t = UF_FOCLOSE }, { .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH }, }; +#if defined(__clang__) && __clang_major__ >= 19 _Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f == O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb"); +#endif return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) - (sticky_orb ? 0 : 1), open_flags)); diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c index 0edb631d1475..464efda1e91a 100644 --- a/sys/kern/subr_asan.c +++ b/sys/kern/subr_asan.c @@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code) if (__predict_false(!kasan_enabled)) return; - if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS && - (vm_offset_t)addr < DMAP_MAX_ADDRESS) + if (kasan_md_unsupported((vm_offset_t)addr)) return; KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS && diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 18388ae5f232..bac7d0080c71 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor) td->td_ast = 0; } - CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid, - td->td_proc->p_comm); + CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, + td->td_proc == NULL ? -1 : td->td_proc->p_pid, + td->td_proc == NULL ? "" : td->td_proc->p_comm); KASSERT(framep == NULL || TRAPF_USERMODE(framep), ("ast in kernel mode")); diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 94e44d888181..b472aaea89e6 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -2309,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap) return (EINVAL); td->td_pflags2 &= ~TDP2_UEXTERR; return (0); + case EXTERRCTL_UD: + /* + * Important: this code must always return EINVAL and never any + * extended error, for testing purposes. + */ + /* FALLTHROUGH */ default: return (EINVAL); } diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 3d455b3874cc..89c1d779f04c 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -332,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); -SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); +SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int", + "enum cache_fpl_status"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); @@ -6420,15 +6421,11 @@ out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; - if (SDT_PROBES_ENABLED()) { - SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); - if (fpl.status == CACHE_FPL_STATUS_HANDLED) - SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, - ndp); - } - + SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); + SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, + ndp); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c index 2b42228465a4..d3cd0d1f9832 100644 --- a/sys/kern/vfs_inotify.c +++ b/sys/kern/vfs_inotify.c @@ -371,7 +371,7 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink); if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify)) - vn_irflag_unset_locked(vp, VIRF_INOTIFY); + vn_irflag_unset(vp, VIRF_INOTIFY); } /* @@ -675,7 +675,8 @@ vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp, struct vattr va; int error; - error = VOP_GETATTR(vp, &va, cnp->cn_cred); + error = VOP_GETATTR(vp, &va, + cnp->cn_cred); if (error == 0 && va.va_nlink != 0) selfevent = 0; } diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 9922796f8a1d..7cb6e2124326 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -326,6 +326,7 @@ SUBDIR= \ proto \ pseudofs \ ${_pst} \ + ${_pt} \ pty \ puc \ pwm \ @@ -842,6 +843,7 @@ _iwx= iwx _ixl= ixl _nvdimm= nvdimm _pms= pms +_pt= pt _qat= qat .if ${MK_SOURCELESS_UCODE} != "no" _qatfw= qatfw diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile index 91f20193d878..9f9c9f602cda 100644 --- a/sys/modules/ice/Makefile +++ b/sys/modules/ice/Makefile @@ -13,6 +13,7 @@ SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c SRCS += ice_fw_logging.c ice_ddp_common.c +SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c # RDMA Client interface # TODO: Is this the right way to compile this? diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile new file mode 100644 index 000000000000..416b072face9 --- /dev/null +++ b/sys/modules/pt/Makefile @@ -0,0 +1,8 @@ + +.PATH: ${SRCTOP}/sys/amd64/pt + +KMOD= pt +SRCS= pt.c pt.h device_if.h bus_if.h +SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h + +.include <bsd.kmod.mk> diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile index 3d8415cf0e57..2a44ae6ddde5 100644 --- a/sys/modules/qlnx/qlnxe/Makefile +++ b/sys/modules/qlnx/qlnxe/Makefile @@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c SRCS+=qlnx_ioctl.c SRCS+=qlnx_os.c +SRCS+=opt_inet.h SRCS+= ${LINUXKPI_GENSRCS} diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h index cf4f75bd0b6c..01485cf26e06 100644 --- a/sys/net/ethernet.h +++ b/sys/net/ethernet.h @@ -62,6 +62,8 @@ struct ether_header { u_char ether_shost[ETHER_ADDR_LEN]; u_short ether_type; } __packed; +_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN, + "size of struct ether_header is wrong"); /* * Structure of a 48-bit Ethernet address. @@ -69,6 +71,8 @@ struct ether_header { struct ether_addr { u_char octet[ETHER_ADDR_LEN]; } __packed; +_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN, + "size of struct ether_addr is wrong"); #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ #define ETHER_IS_IPV6_MULTICAST(addr) \ @@ -112,6 +116,8 @@ struct ether_vlan_header { uint16_t evl_tag; uint16_t evl_proto; } __packed; +_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN, + "size of struct ether_vlan_header is wrong"); #define EVL_VLID_MASK 0x0FFF #define EVL_PRI_MASK 0xE000 diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 7be4dfac23e7..3ae0c01c0efc 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -92,11 +92,6 @@ #include <crypto/sha1.h> -#ifdef CTASSERT -CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); -CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); -#endif - VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 9867a718e148..5b52bfa80e3b 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -718,6 +718,7 @@ lagg_capabilities(struct lagg_softc *sc) sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_capenable2 = ena2; sc->sc_ifp->if_hwassist = hwa; + (void)if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax); getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 36fab1a03ee6..452a8eb4024b 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -1370,7 +1370,6 @@ struct pf_kruleset { struct pf_krulequeue queues[2]; struct { struct pf_krulequeue *ptr; - struct pf_krule **ptr_array; u_int32_t rcount; u_int32_t ticket; int open; @@ -2500,7 +2499,7 @@ int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, u_int64_t, int, int, int); int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t, - pf_addr_filter_func_t); + pf_addr_filter_func_t, bool); void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); struct pfr_ktable * pfr_attach_table(struct pf_kruleset *, char *); @@ -2534,6 +2533,8 @@ int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, int *, u_int32_t, int); +struct pfr_ktable + *pfr_ktable_select_active(struct pfr_ktable *); MALLOC_DECLARE(PFI_MTYPE); VNET_DECLARE(struct pfi_kkif *, pfi_all); @@ -2712,7 +2713,6 @@ u_short pf_map_addr(u_int8_t, struct pf_krule *, u_short pf_map_addr_sn(u_int8_t, struct pf_krule *, struct pf_addr *, struct pf_addr *, struct pfi_kkif **nkif, struct pf_addr *, - struct pf_ksrc_node **, struct pf_srchash **, struct pf_kpool *, pf_sn_types_t); int pf_get_transaddr_af(struct pf_krule *, struct pf_pdesc *); diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c index c5a478533313..9074878e17e4 100644 --- a/sys/net80211/ieee80211_hostap.c +++ b/sys/net80211/ieee80211_hostap.c @@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, /* VHT */ if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) && - vhtcap != NULL && - vhtinfo != NULL) { - /* XXX TODO; see below */ - net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__); + vhtcap != NULL) { ieee80211_vht_node_init(ni); - ieee80211_vht_update_cap(ni, vhtcap, vhtinfo); + ieee80211_vht_update_cap(ni, vhtcap); } else if (ni->ni_flags & IEEE80211_NODE_VHT) ieee80211_vht_node_cleanup(ni); diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index 5ec80e3646b8..c28f124648a1 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -1952,6 +1952,11 @@ do { \ _RETURN_CHAN_BITS(0); /* + * TODO: should we bail out if there's no htinfo? + * Or just treat it as if we can't do the HT20/HT40 check? + */ + + /* * The original code was based on * 802.11ac-2013, Table 8-183x-VHT Operation Information subfields. * 802.11-2020, Table 9-274-VHT Operation Information subfields @@ -1962,8 +1967,12 @@ do { \ */ htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie; - ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) == - IEEE80211_HTINFO_TXWIDTH_2040); + if (htinfo != NULL) + ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) == + IEEE80211_HTINFO_TXWIDTH_2040); + else + ht40 = false; + can_vht160 = can_vht80p80 = can_vht80 = false; /* 20 Mhz */ diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c index e91977f1ef98..de0b691d4d2a 100644 --- a/sys/net80211/ieee80211_vht.c +++ b/sys/net80211/ieee80211_vht.c @@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni) } void -ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie, - const uint8_t *vhtop_ie) +ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie) { ieee80211_parse_vhtcap(ni, vhtcap_ie); - ieee80211_parse_vhtopmode(ni, vhtop_ie); } static struct ieee80211_channel * diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h index 2964de63c343..a1529df4a85b 100644 --- a/sys/net80211/ieee80211_vht.h +++ b/sys/net80211/ieee80211_vht.h @@ -52,8 +52,7 @@ uint8_t * ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *); uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *, struct ieee80211_channel *); -void ieee80211_vht_update_cap(struct ieee80211_node *, - const uint8_t *, const uint8_t *); +void ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *); struct ieee80211_channel * ieee80211_vht_adjust_channel(struct ieee80211com *, diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index b1f2b0ebf911..d6b75e482e35 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -104,11 +104,10 @@ extern int badport_bandlim(int); #define BANDLIM_ICMP_UNREACH 0 #define BANDLIM_ICMP_ECHO 1 #define BANDLIM_ICMP_TSTAMP 2 -#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ -#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ -#define BANDLIM_ICMP6_UNREACH 5 -#define BANDLIM_SCTP_OOTB 6 -#define BANDLIM_MAX 7 +#define BANDLIM_TCP_RST 3 +#define BANDLIM_ICMP6_UNREACH 4 +#define BANDLIM_SCTP_OOTB 5 +#define BANDLIM_MAX 6 #endif #endif diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index cb4b6df57c57..71b75d18efd0 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -1097,8 +1097,7 @@ static const char *icmp_rate_descrs[BANDLIM_MAX] = { [BANDLIM_ICMP_UNREACH] = "icmp unreach", [BANDLIM_ICMP_ECHO] = "icmp ping", [BANDLIM_ICMP_TSTAMP] = "icmp tstamp", - [BANDLIM_RST_CLOSEDPORT] = "closed port RST", - [BANDLIM_RST_OPENPORT] = "open port RST", + [BANDLIM_TCP_RST] = "tcp reset", [BANDLIM_ICMP6_UNREACH] = "icmp6 unreach", [BANDLIM_SCTP_OOTB] = "sctp ootb", }; diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 91f8251589e4..b60cdf45af52 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -433,38 +433,40 @@ static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, bool from_callout) { - union tcp_log_stackspecific log; - /* - * Unused logs are - * 64 bit - delRate, rttProp, bw_inuse - * 16 bit - cwnd_gain - * 8 bit - bbr_state, bbr_substate, inhpts; - */ - memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = hpts->p_nxt_slot; - log.u_bbr.flex2 = hpts->p_cur_slot; - log.u_bbr.flex3 = hpts->p_prev_slot; - log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; - log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.flex7 = hpts->p_cpu; - log.u_bbr.flex8 = (uint8_t)from_callout; - log.u_bbr.inflight = slots_to_run; - log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; - log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); - log.u_bbr.epoch = hpts->saved_curslot; - log.u_bbr.lt_epoch = hpts->saved_prev_slot; - log.u_bbr.pkts_out = hpts->p_delayed_by; - log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.pacing_gain = hpts->p_cpu; - log.u_bbr.pkt_epoch = hpts->p_runningslot; - log.u_bbr.use_lt_bw = 1; - TCP_LOG_EVENTP(tp, NULL, - &tptosocket(tp)->so_rcv, - &tptosocket(tp)->so_snd, - BBR_LOG_HPTSDIAG, 0, - 0, &log, false, tv); + if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + * 8 bit - bbr_state, bbr_substate, inhpts; + */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); + } } static void @@ -1353,10 +1355,7 @@ again: } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ - if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { - tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, - from_callout); - } + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); @@ -1487,7 +1486,7 @@ no_run: } void -__tcp_set_hpts(struct tcpcb *tp, int32_t line) +tcp_set_hpts(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index b097a2b98db9..f5856ed8e688 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, #define tcp_hpts_insert(inp, slot) \ tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) -void __tcp_set_hpts(struct tcpcb *tp, int32_t line); -#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) +void tcp_set_hpts(struct tcpcb *tp); void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); @@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time; * The following functions should also be available * to userspace as well. */ -static __inline uint32_t +static inline uint32_t tcp_tv_to_hptstick(const struct timeval *sv) { return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_usectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_mssectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } -static __inline uint64_t +static inline uint64_t tcp_tv_to_lusectick(const struct timeval *sv) { return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT); } -static __inline uint32_t +static inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; @@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv) return (tcp_tv_to_hptstick(sv)); } -static __inline uint64_t +static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { struct timeval tvd; @@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv) return (tcp_tv_to_lusectick(tv)); } -static __inline uint32_t +static inline uint32_t tcp_get_usecs(struct timeval *tv) { struct timeval tvd; diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 7c032e13f37a..de428ae1af6f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -621,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ + bool closed_port = false; /* segment is hitting a closed port */ NET_EPOCH_ASSERT(); @@ -907,7 +908,8 @@ findpcb: log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } INP_LOCK_ASSERT(inp); @@ -998,12 +1000,14 @@ findpcb: * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } @@ -1055,6 +1059,8 @@ findpcb: * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + int result; + /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1064,8 +1070,8 @@ findpcb: /* * NB: syncache_expand() doesn't unlock inp. */ - rstreason = syncache_expand(&inc, &to, th, &so, m, port); - if (rstreason < 0) { + result = syncache_expand(&inc, &to, th, &so, m, port); + if (result < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped @@ -1073,7 +1079,7 @@ findpcb: * to the sender. */ goto dropunlock; - } else if (rstreason == 0) { + } else if (result == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double @@ -1092,7 +1098,7 @@ findpcb: * of the failure cause. */ INP_WUNLOCK(inp); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } @@ -1183,7 +1189,7 @@ tfo_socket_result: s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } /* @@ -1259,7 +1265,7 @@ tfo_socket_result: "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } } @@ -1380,9 +1386,10 @@ dropwithreset: * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ - if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) || - (rstreason == BANDLIM_RST_CLOSEDPORT && - ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && + if (rstreason == BANDLIM_TCP_RST && + ((!closed_port && V_blackhole == 3) || + (closed_port && + ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && (V_blackhole_local || ( #ifdef INET6 isipv6 ? !in6_localip(&ip6->ip6_src) : @@ -1515,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct tcpopt to; int tfo_syn; u_int maxseg = 0; + bool no_data; + no_data = (tlen == 0); thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; @@ -1754,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tp->ts_recent = to.to_tsval; } - if (tlen == 0) { + if (no_data) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && @@ -1963,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -1976,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { @@ -2244,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -2557,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); - if (tlen == 0 && + if (no_data && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* @@ -3113,8 +3122,7 @@ step6: (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (tlen == 0 && - tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; @@ -3424,7 +3432,7 @@ dropafterack: if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index 75d693bc019b..e24790ece43d 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -2878,7 +2878,7 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags) /* double check log state now that we have the lock */ if (inp->inp_flags & INP_DROPPED) goto done; - if (tp->_t_logstate != TCP_LOG_STATE_OFF) { + if (tcp_bblogging_on(tp)) { struct timeval tv; tcp_log_eventspecific_t log; diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index fef32e16b2e4..3e7eef8a1cda 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -539,12 +539,12 @@ struct tcpcb; NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ +/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). */ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ - if (tcp_bblogging_on(tp)) \ - tcp_log_event(tp, th, rxbuf, txbuf, eventid, \ - errornum, len, stackinfo, th_hostorder, \ - NULL, NULL, 0, tv); \ + KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \ + tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \ + stackinfo, th_hostorder, NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index e2cfec5c9275..d2636f01714e 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tiwin > bbr->r_ctl.rc_high_rwnd) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 8e05498863b9..5280f18dc983 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -40,7 +40,6 @@ #endif #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/lock.h> #include <sys/mutex.h> #include <sys/mbuf.h> #include <sys/proc.h> /* for proc0 declaration */ @@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0; static uint32_t rack_pcm_is_enabled = 1; static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ -static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */ +static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */ static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */ @@ -938,7 +937,7 @@ rack_init_sysctls(void) SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "time_between", CTLFLAG_RW, - & rack_time_between_probertt, 96000000, + &rack_time_between_probertt, 96000000, "How many useconds between the lowest rtt falling must past before we enter probertt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), @@ -3480,9 +3479,9 @@ static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { - if (rack->r_ctl.rc_app_limited_cnt > 0) { - rack->r_ctl.rc_app_limited_cnt--; - } + KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), + ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); + rack->r_ctl.rc_app_limited_cnt--; } if (rsm->r_limit_type) { /* currently there is only one limit type */ @@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) * earlier. * * So lets calculate the BDP with the "known" b/w using - * the SRTT has our rtt and then multiply it by the - * goal. + * the SRTT as our rtt and then multiply it by the goal. */ bw = rack_get_bw(rack); srtt = (uint64_t)tp->t_srtt; @@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) tp->t_badrxtwin = 0; break; } - if ((CC_ALGO(tp)->cong_signal != NULL) && + if ((CC_ALGO(tp)->cong_signal != NULL) && (type != CC_RTO)){ tp->t_ccv.curack = ack; CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); @@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as - * passed we no longer consider reordering has occuring. + * passed we no longer consider reordering as occurring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro @@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, /* Push bit must go to the right edge as well */ if (rsm->r_flags & RACK_HAD_PUSH) rsm->r_flags &= ~RACK_HAD_PUSH; + /* Update the count if app limited */ + if (nrsm->r_flags & RACK_APP_LIMITED) + rack->r_ctl.rc_app_limited_cnt++; /* Clone over the state of the hw_tls flag */ nrsm->r_hw_tls = rsm->r_hw_tls; /* @@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack, l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; - if ((r_rsm->r_flags & RACK_APP_LIMITED) && + if ((r_rsm->r_flags & RACK_APP_LIMITED) && ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { /* * If both are app-limited then let the @@ -8137,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, * remove the lost desgination and reduce the * bytes considered lost. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -8832,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; - if ((rack->in_probe_rtt == 0) && + if ((rack->in_probe_rtt == 0) && (rack->rc_skip_timely == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); @@ -10369,7 +10370,7 @@ more: * and yet before retransmitting we get an ack * which can happen due to reordering. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -11065,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) * We need to skip anything already set * to be retransmitted. */ - if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & RACK_MUST_RXT)) { rsm = TAILQ_NEXT(rsm, r_tnext); continue; @@ -12875,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -13089,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -13102,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -13136,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -13399,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13495,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13645,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13746,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13848,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13952,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -16655,7 +16656,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); #ifdef TCP_ACCOUNTING sched_unpin(); #endif @@ -16919,7 +16920,7 @@ do_output_now: } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { goto do_output_now; } else if ((no_output == 1) && - (nxt_pkt == 0) && + (nxt_pkt == 0) && (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use @@ -17546,7 +17547,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rack->r_ctl.rc_last_us_rtt, 88, __LINE__, NULL, gain); } - if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && + if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && (rack->use_fixed_rate == 0)) { /* * No way yet to make a b/w estimate or @@ -20043,7 +20044,7 @@ again: rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; } } - if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { + if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { uint32_t rw_avail, cwa; if (tp->snd_wnd > ctf_outstanding(tp)) @@ -21031,7 +21032,7 @@ just_return_nolock: } else log = 1; } - /* Mark the last packet has app limited */ + /* Mark the last packet as app limited */ rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index da26b8cb1f9b..d1c4ba58bf55 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -672,7 +672,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return; } else *ret_val = 0; diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 06fe9e8820c9..a825658bd9ee 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -234,17 +234,20 @@ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); -static int mld_v1enable = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, - &mld_v1enable, 0, "Enable fallback to MLDv1"); +VNET_DEFINE_STATIC(bool, mld_v1enable) = true; +#define V_mld_v1enable VNET(mld_v1enable) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_v1enable), 0, "Enable fallback to MLDv1"); -static int mld_v2enable = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN, - &mld_v2enable, 0, "Enable MLDv2"); +VNET_DEFINE_STATIC(bool, mld_v2enable) = true; +#define V_mld_v2enable VNET(mld_v2enable) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_v2enable), 0, "Enable MLDv2"); -static int mld_use_allow = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, - &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); +VNET_DEFINE_STATIC(bool, mld_use_allow) = true; +#define V_mld_use_allow VNET(mld_use_allow) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_use_allow), 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. @@ -481,7 +484,7 @@ mld_domifattach(struct ifnet *ifp) mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); if ((ifp->if_flags & IFF_MULTICAST) == 0) mli->mli_flags |= MLIF_SILENT; - if (mld_use_allow) + if (V_mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_LOCK(); @@ -614,7 +617,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, is_general_query = 0; - if (!mld_v1enable) { + if (!V_mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); @@ -790,7 +793,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, NET_EPOCH_ASSERT(); - if (!mld_v2enable) { + if (!V_mld_v2enable) { CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); @@ -1076,7 +1079,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, NET_EPOCH_ASSERT(); - if (!mld_v1enable) { + if (!V_mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 0379ef7c789a..c90a1213bd66 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -765,8 +765,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + (IN6_IFF_NOTREADY|IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c index 6bacc68b7441..92d0201b398a 100644 --- a/sys/netipsec/ipsec.c +++ b/sys/netipsec/ipsec.c @@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp) #ifdef IPSEC_OFFLOAD tag = ipsec_accel_input_tag_lookup(m); - if (tag != NULL) - return (0); + if (tag != NULL) { + tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE; + __DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED; + } #endif if (ip1 == NULL) { diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c index 467d5ded1d7a..8a09d5f37b4a 100644 --- a/sys/netipsec/ipsec_offload.c +++ b/sys/netipsec/ipsec_offload.c @@ -94,6 +94,7 @@ struct ifp_handle_sav { size_t hdr_ext_size; uint64_t cnt_octets; uint64_t cnt_allocs; + struct xform_history xfh; }; #define IFP_HS_HANDLED 0x00000001 @@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav, static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp); +static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); static void ipsec_accel_init(void *arg) @@ -185,6 +188,7 @@ ipsec_accel_init(void *arg) ipsec_accel_drv_sa_lifetime_update_impl; ipsec_accel_drv_sa_lifetime_fetch_p = ipsec_accel_drv_sa_lifetime_fetch_impl; + ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl; pctrie_init(&drv_spi_pctrie); ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER( ifnet_departure_event, ipsec_accel_ifdetach_event, NULL, @@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg) ipsec_accel_on_ifdown_p = NULL; ipsec_accel_drv_sa_lifetime_update_p = NULL; ipsec_accel_drv_sa_lifetime_fetch_p = NULL; + ipsec_accel_fill_xh_p = NULL; ipsec_accel_sync_imp(); clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */ clear_unrhdr(drv_spi_unr); @@ -412,6 +417,10 @@ ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp, ihs->ifdata = priv; ihs->flags = flags; ihs->hdr_ext_size = esp_hdrsiz(sav); + memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst)); + ihs->xfh.spi = sav->spi; + ihs->xfh.proto = sav->sah->saidx.proto; + ihs->xfh.mode = sav->sah->saidx.mode; mtx_lock(&ipsec_accel_sav_tmp); CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) { if (i->ifp == ifp) { @@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav) return (m); } +static bool +ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh) +{ + struct ifp_handle_sav *i; + + if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN || + drv_spi > IPSEC_ACCEL_DRV_SPI_MAX) + return (false); + + i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi); + if (i == NULL) + return (false); + memcpy(xh, &i->xfh, sizeof(*xh)); + return (true); +} + #endif /* IPSEC_OFFLOAD */ diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h index 904fe6252396..ae60eaa8ae78 100644 --- a/sys/netipsec/ipsec_offload.h +++ b/sys/netipsec/ipsec_offload.h @@ -30,6 +30,7 @@ #include <sys/errno.h> #include <net/if.h> #include <net/if_var.h> +#include <netipsec/xform.h> struct secpolicy; struct secasvar; @@ -42,6 +43,7 @@ struct ipsec_accel_out_tag { struct ipsec_accel_in_tag { struct m_tag tag; + struct xform_history xh; /* Must be first to mimic IPSEC_IN_DONE */ uint16_t drv_spi; }; @@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); +extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); #ifdef IPSEC_OFFLOAD /* @@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav) return (NULL); } +static inline bool +ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh) +{ + bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh); + + p = atomic_load_ptr(&ipsec_accel_fill_xh_p); + if (p != NULL) + return (p(ifp, drv_spi, xh)); + return (false); +} #else #define ipsec_accel_sa_newkey(a) @@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav) #define ipsec_accel_sync() #define ipsec_accel_is_accel_sav(a) #define ipsec_accel_key_setaccelif(a) +#define ipsec_accel_fill_xh(a, b, c) (false) #endif void ipsec_accel_forget_sav_impl(struct secasvar *sav); @@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m, struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af, int mtu, int *hwassist); void ipsec_accel_forget_sav(struct secasvar *sav); +struct xform_history; #else #define ipsec_accel_input(a, b, c) (ENXIO) #define ipsec_accel_output(a, b, c, d, e, f, g, h) ({ \ diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index ae67d83c6d13..4ba1b49c24f0 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); +bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); #endif #define FULLMASK 0xff diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h index 8492ecb3021b..720317ed74f3 100644 --- a/sys/netlink/netlink_message_parser.h +++ b/sys/netlink/netlink_message_parser.h @@ -209,7 +209,8 @@ int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target); -bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...); +bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) + __printflike(2, 3); #define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \ nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \ diff --git a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c index 04850549db98..6eb6cf2a7a47 100644 --- a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c @@ -463,13 +463,14 @@ ipf_send_ip(fr_info_t *fin, mb_t *m) int ipf_send_icmp_err(int type, fr_info_t *fin, int dst) { - int err, hlen, xtra, iclen, ohlen, avail, code; + int err, hlen, xtra, iclen, ohlen, avail; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 + int code; ip6_t *ip6; #endif ip_t *ip, *ip2; @@ -477,8 +478,8 @@ ipf_send_icmp_err(int type, fr_info_t *fin, int dst) if ((type < 0) || (type >= ICMP_MAXTYPE)) return (-1); - code = fin->fin_icode; #ifdef USE_INET6 + code = fin->fin_icode; /* See NetBSD ip_fil_netbsd.c r1.4: */ if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return (-1); diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c index 0a84f9d680ac..cb96d2fcc44c 100644 --- a/sys/netpfil/pf/if_pflog.c +++ b/sys/netpfil/pf/if_pflog.c @@ -284,9 +284,9 @@ pflog_packet(uint8_t action, u_int8_t reason, * state lock, since this leads to unsafe LOR. * These conditions are very very rare, however. */ - if (trigger->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe) + if (trigger->log & PF_LOG_USER && !pd->lookup.done && lookupsafe) pd->lookup.done = pf_socket_lookup(pd); - if (pd->lookup.done > 0) + if (trigger->log & PF_LOG_USER && pd->lookup.done > 0) hdr.uid = pd->lookup.uid; else hdr.uid = -1; diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c index 2391edaf1a5a..4e03584b8f85 100644 --- a/sys/netpfil/pf/if_pfsync.c +++ b/sys/netpfil/pf/if_pfsync.c @@ -532,6 +532,7 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) struct pf_kpooladdr *rpool_first; int error; uint8_t rt = 0; + int n = 0; PF_RULES_RASSERT(); @@ -557,10 +558,12 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) */ if (sp->pfs_1301.rule != htonl(-1) && sp->pfs_1301.anchor == htonl(-1) && (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->pfs_1301.rule) < - pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) - r = pf_main_ruleset.rules[ - PF_RULESET_FILTER].active.ptr_array[ntohl(sp->pfs_1301.rule)]; - else + pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) { + TAILQ_FOREACH(r, pf_main_ruleset.rules[ + PF_RULESET_FILTER].active.ptr, entries) + if (ntohl(sp->pfs_1301.rule) == n++) + break; + } else r = &V_pf_default_rule; /* diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index d5f01e5c4956..009f7e4d78b1 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -5901,18 +5901,17 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, M_SETFIB(pd->m, pd->act.rtableid); if (r->rt) { - struct pf_ksrc_node *sn = NULL; - struct pf_srchash *snh = NULL; /* * Set act.rt here instead of in pf_rule_to_actions() because * it is applied only from the last pass rule. */ pd->act.rt = r->rt; - /* Don't use REASON_SET, pf_map_addr increases the reason counters */ - ctx.reason = pf_map_addr_sn(pd->af, r, pd->src, &pd->act.rt_addr, - &pd->act.rt_kif, NULL, &sn, &snh, &(r->route), PF_SN_ROUTE); - if (ctx.reason != 0) + if ((transerror = pf_map_addr_sn(pd->af, r, pd->src, + &pd->act.rt_addr, &pd->act.rt_kif, NULL, &(r->route), + PF_SN_ROUTE)) != PFRES_MATCH) { + REASON_SET(&ctx.reason, transerror); goto cleanup; + } } if (pd->virtual_proto != PF_VPROTO_FRAGMENT && @@ -6056,9 +6055,16 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx, /* src node for translation rule */ if (ctx->nr != NULL) { KASSERT(ctx->nat_pool != NULL, ("%s: nat_pool is NULL", __func__)); + /* + * The NAT addresses are chosen during ruleset parsing. + * The new afto code stores post-nat addresses in nsaddr. + * The old nat code (also used for new nat-to rules) creates + * state keys and stores addresses in them. + */ if ((ctx->nat_pool->opts & PF_POOL_STICKYADDR) && (sn_reason = pf_insert_src_node(sns, snhs, ctx->nr, - &ctx->sk->addr[pd->sidx], pd->af, &ctx->nk->addr[1], NULL, + ctx->sk ? &(ctx->sk->addr[pd->sidx]) : pd->src, pd->af, + ctx->nk ? &(ctx->nk->addr[1]) : &(pd->nsaddr), NULL, PF_SN_NAT)) != 0 ) { REASON_SET(&ctx->reason, sn_reason); goto csfailed; @@ -6213,7 +6219,7 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx, if (ctx->tag > 0) s->tag = ctx->tag; if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) == - TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { + TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) { pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC); pf_undo_nat(ctx->nr, pd, bip_sum); s->src.seqhi = arc4random(); @@ -9062,6 +9068,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp, goto bad; } + if (r->rt == PF_DUPTO) + skip_test = true; + if (pd->dir == PF_IN && !skip_test) { if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp, &pd->act) != PF_PASS) { @@ -9364,6 +9373,9 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp, goto bad; } + if (r->rt == PF_DUPTO) + skip_test = true; + if (pd->dir == PF_IN && !skip_test) { if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT, ifp, &m0, inp, &pd->act) != PF_PASS) { @@ -10052,6 +10064,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, pd->didx = (dir == PF_IN) ? 1 : 0; pd->af = pd->naf = af; + PF_RULES_ASSERT(); + TAILQ_INIT(&pd->sctp_multihome_jobs); if (default_actions != NULL) memcpy(&pd->act, default_actions, sizeof(pd->act)); @@ -10127,6 +10141,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, } h = mtod(pd->m, struct ip6_hdr *); + if (pd->m->m_pkthdr.len < + sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) { + *action = PF_DROP; + REASON_SET(reason, PFRES_SHORT); + return (-1); + } if (pf_walk_header6(pd, h, reason) != PF_PASS) { *action = PF_DROP; @@ -10465,35 +10485,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 PF_RULES_RLOCK_TRACKER; KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir)); M_ASSERTPKTHDR(*m0); + NET_EPOCH_ASSERT(); if (!V_pf_status.running) return (PF_PASS); - PF_RULES_RLOCK(); - kif = (struct pfi_kkif *)ifp->if_pf_kif; if (__predict_false(kif == NULL)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname)); - PF_RULES_RUNLOCK(); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { - PF_RULES_RUNLOCK(); return (PF_PASS); } if ((*m0)->m_flags & M_SKIP_FIREWALL) { - PF_RULES_RUNLOCK(); return (PF_PASS); } if (__predict_false(! M_WRITABLE(*m0))) { *m0 = m_unshare(*m0, M_NOWAIT); if (*m0 == NULL) { - PF_RULES_RUNLOCK(); return (PF_DROP); } } @@ -10506,12 +10521,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 ifp = ifnet_byindexgen(pd.pf_mtag->if_index, pd.pf_mtag->if_idxgen); if (ifp == NULL || ifp->if_flags & IFF_DYING) { - PF_RULES_RUNLOCK(); m_freem(*m0); *m0 = NULL; return (PF_PASS); } - PF_RULES_RUNLOCK(); (ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL); *m0 = NULL; return (PF_PASS); @@ -10526,11 +10539,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 /* But only once. We may see the packet multiple times (e.g. * PFIL_IN/PFIL_OUT). */ pf_dummynet_flag_remove(pd.m, pd.pf_mtag); - PF_RULES_RUNLOCK(); return (PF_PASS); } + PF_RULES_RLOCK(); + if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason, kif, default_actions) == -1) { if (action != PF_PASS) diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h index 2009d2907985..cfff58064922 100644 --- a/sys/netpfil/pf/pf.h +++ b/sys/netpfil/pf/pf.h @@ -140,7 +140,7 @@ enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, #define PF_LOG 0x01 #define PF_LOG_ALL 0x02 -#define PF_LOG_SOCKET_LOOKUP 0x04 +#define PF_LOG_USER 0x04 #define PF_LOG_FORCE 0x08 #define PF_LOG_MATCHES 0x10 @@ -490,6 +490,7 @@ struct pf_osfp_ioctl { #define PF_ANCHOR_NAME_SIZE 64 #define PF_ANCHOR_MAXPATH (MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1) +#define PF_OPTIMIZER_TABLE_PFX "__automatic_" struct pf_rule { struct pf_rule_addr src; diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c index c96741023db9..5c69c395c5fc 100644 --- a/sys/netpfil/pf/pf_ioctl.c +++ b/sys/netpfil/pf/pf_ioctl.c @@ -1274,7 +1274,9 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) PF_MD5_UPD(pfr, addr.iflags); break; case PF_ADDR_TABLE: - PF_MD5_UPD(pfr, addr.v.tblname); + if (strncmp(pfr->addr.v.tblname, PF_OPTIMIZER_TABLE_PFX, + strlen(PF_OPTIMIZER_TABLE_PFX))) + PF_MD5_UPD(pfr, addr.v.tblname); break; case PF_ADDR_ADDRMASK: /* XXX ignore af? */ @@ -1357,7 +1359,7 @@ static int pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) { struct pf_kruleset *rs; - struct pf_krule *rule, **old_array, *old_rule; + struct pf_krule *rule, *old_rule; struct pf_krulequeue *old_rules; struct pf_krule_global *old_tree; int error; @@ -1382,13 +1384,10 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) /* Swap rules, keep the old. */ old_rules = rs->rules[rs_num].active.ptr; old_rcount = rs->rules[rs_num].active.rcount; - old_array = rs->rules[rs_num].active.ptr_array; old_tree = rs->rules[rs_num].active.tree; rs->rules[rs_num].active.ptr = rs->rules[rs_num].inactive.ptr; - rs->rules[rs_num].active.ptr_array = - rs->rules[rs_num].inactive.ptr_array; rs->rules[rs_num].active.tree = rs->rules[rs_num].inactive.tree; rs->rules[rs_num].active.rcount = @@ -1418,7 +1417,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) } rs->rules[rs_num].inactive.ptr = old_rules; - rs->rules[rs_num].inactive.ptr_array = old_array; rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */ rs->rules[rs_num].inactive.rcount = old_rcount; @@ -1431,9 +1429,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) while ((rule = TAILQ_FIRST(old_rules)) != NULL) pf_unlink_rule_locked(old_rules, rule); PF_UNLNKDRULES_UNLOCK(); - if (rs->rules[rs_num].inactive.ptr_array) - free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); - rs->rules[rs_num].inactive.ptr_array = NULL; rs->rules[rs_num].inactive.rcount = 0; rs->rules[rs_num].inactive.open = 0; pf_remove_if_empty_kruleset(rs); @@ -1456,24 +1451,11 @@ pf_setup_pfsync_matching(struct pf_kruleset *rs) if (rs_cnt == PF_RULESET_SCRUB) continue; - if (rs->rules[rs_cnt].inactive.ptr_array) - free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); - rs->rules[rs_cnt].inactive.ptr_array = NULL; - if (rs->rules[rs_cnt].inactive.rcount) { - rs->rules[rs_cnt].inactive.ptr_array = - mallocarray(rs->rules[rs_cnt].inactive.rcount, - sizeof(struct pf_rule **), - M_TEMP, M_NOWAIT); - - if (!rs->rules[rs_cnt].inactive.ptr_array) - return (ENOMEM); - } - - TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, - entries) { - pf_hash_rule_rolling(&ctx, rule); - (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; + TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, + entries) { + pf_hash_rule_rolling(&ctx, rule); + } } } @@ -2059,6 +2041,47 @@ pf_ioctl_getrules(struct pfioc_rule *pr) return (0); } +static int +pf_rule_checkaf(struct pf_krule *r) +{ + switch (r->af) { + case 0: + if (r->rule_flag & PFRULE_AFTO) + return (EPFNOSUPPORT); + break; + case AF_INET: + if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6) + return (EPFNOSUPPORT); + break; +#ifdef INET6 + case AF_INET6: + if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET) + return (EPFNOSUPPORT); + break; +#endif /* INET6 */ + default: + return (EPFNOSUPPORT); + } + + if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0) + return (EPFNOSUPPORT); + + return (0); +} + +static int +pf_validate_range(uint8_t op, uint16_t port[2]) +{ + uint16_t a = ntohs(port[0]); + uint16_t b = ntohs(port[1]); + + if ((op == PF_OP_RRG && a > b) || /* 34:12, i.e. none */ + (op == PF_OP_IRG && a >= b) || /* 34><12, i.e. none */ + (op == PF_OP_XRG && a > b)) /* 34<>22, i.e. all */ + return 1; + return 0; +} + int pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, uint32_t pool_ticket, const char *anchor, const char *anchor_call, @@ -2078,6 +2101,13 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, #define ERROUT(x) ERROUT_FUNCTION(errout, x) + if ((error = pf_rule_checkaf(rule))) + ERROUT(error); + if (pf_validate_range(rule->src.port_op, rule->src.port)) + ERROUT(EINVAL); + if (pf_validate_range(rule->dst.port_op, rule->dst.port)) + ERROUT(EINVAL); + if (rule->ifname[0]) kif = pf_kkif_create(M_WAITOK); if (rule->rcv_ifname[0]) @@ -3567,7 +3597,7 @@ DIOCADDRULENV_error: error = pf_rule_to_krule(&pr->rule, rule); if (error != 0) { pf_krule_free(rule); - break; + goto fail; } pr->anchor[sizeof(pr->anchor) - 1] = '\0'; @@ -3726,11 +3756,11 @@ DIOCGETRULENV_error: if (pcr->action < PF_CHANGE_ADD_HEAD || pcr->action > PF_CHANGE_GET_TICKET) { error = EINVAL; - break; + goto fail; } if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { error = EINVAL; - break; + goto fail; } if (pcr->action != PF_CHANGE_REMOVE) { @@ -3738,9 +3768,13 @@ DIOCGETRULENV_error: error = pf_rule_to_krule(&pcr->rule, newrule); if (error != 0) { pf_krule_free(newrule); - break; + goto fail; } + if ((error = pf_rule_checkaf(newrule))) { + pf_krule_free(newrule); + goto fail; + } if (newrule->ifname[0]) kif = pf_kkif_create(M_WAITOK); pf_counter_u64_init(&newrule->evaluations, M_WAITOK); @@ -3888,7 +3922,7 @@ DIOCGETRULENV_error: pf_free_rule(newrule); PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); - break; + goto fail; } newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list); @@ -3915,7 +3949,7 @@ DIOCGETRULENV_error: PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); error = EINVAL; - break; + goto fail; } } @@ -3933,7 +3967,7 @@ DIOCGETRULENV_error: PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); error = EEXIST; - break; + goto fail; } if (oldrule == NULL) @@ -3989,7 +4023,7 @@ DIOCCHANGERULE_error: if (sp->timeout >= PFTM_MAX) { error = EINVAL; - break; + goto fail; } if (V_pfsync_state_import_ptr != NULL) { PF_RULES_RLOCK(); @@ -4009,7 +4043,7 @@ DIOCCHANGERULE_error: s = pf_find_state_byid(ps->state.id, ps->state.creatorid); if (s == NULL) { error = ENOENT; - break; + goto fail; } pfsync_state_export((union pfsync_state_union*)&ps->state, @@ -4088,7 +4122,7 @@ DIOCGETSTATES_retry: error = copyout(pstore, out, sizeof(struct pfsync_state_1301) * count); if (error) - break; + goto fail; out = ps->ps_states + nr; } DIOCGETSTATES_full: @@ -4108,7 +4142,7 @@ DIOCGETSTATES_full: if (ps->ps_req_version > PF_STATE_VERSION) { error = ENOTSUP; - break; + goto fail; } if (ps->ps_len <= 0) { @@ -4166,7 +4200,7 @@ DIOCGETSTATESV2_retry: error = copyout(pstore, out, sizeof(struct pf_state_export) * count); if (error) - break; + goto fail; out = ps->ps_states + nr; } DIOCGETSTATESV2_full: @@ -4272,12 +4306,12 @@ DIOCGETSTATESV2_full: if (psp->ifname[0] == '\0') { error = EINVAL; - break; + goto fail; } error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ); if (error != 0) - break; + goto fail; ifp = ifunit(ps.ifname); if (ifp != NULL) { psp->baudrate32 = @@ -4338,7 +4372,7 @@ DIOCGETSTATESV2_full: altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO); error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd)); if (error) - break; + goto fail; altq->local_flags = 0; PF_RULES_WLOCK(); @@ -4346,7 +4380,7 @@ DIOCGETSTATESV2_full: PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); error = EBUSY; - break; + goto fail; } /* @@ -4358,7 +4392,7 @@ DIOCGETSTATESV2_full: PF_RULES_WUNLOCK(); error = EBUSY; free(altq, M_PFALTQ); - break; + goto fail; } altq->altq_disc = NULL; TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) { @@ -4378,7 +4412,7 @@ DIOCGETSTATESV2_full: if (error) { PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); - break; + goto fail; } if (altq->qname[0] != 0) @@ -4416,13 +4450,13 @@ DIOCGETSTATESV2_full: if (pa->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } altq = pf_altq_get_nth_active(pa->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd)); PF_RULES_RUNLOCK(); @@ -4446,20 +4480,20 @@ DIOCGETSTATESV2_full: if (pq->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } nbytes = pq->nbytes; altq = pf_altq_get_nth_active(pq->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { PF_RULES_RUNLOCK(); error = ENXIO; - break; + goto fail; } PF_RULES_RUNLOCK(); if (cmd == DIOCGETQSTATSV0) @@ -4528,30 +4562,30 @@ DIOCGETSTATESV2_full: if (pca->action < PF_CHANGE_ADD_HEAD || pca->action > PF_CHANGE_REMOVE) { error = EINVAL; - break; + goto fail; } if (pca->addr.addr.type != PF_ADDR_ADDRMASK && pca->addr.addr.type != PF_ADDR_DYNIFTL && pca->addr.addr.type != PF_ADDR_TABLE) { error = EINVAL; - break; + goto fail; } if (pca->addr.addr.p.dyn != NULL) { error = EINVAL; - break; + goto fail; } if (pca->action != PF_CHANGE_REMOVE) { #ifndef INET if (pca->af == AF_INET) { error = EAFNOSUPPORT; - break; + goto fail; } #endif /* INET */ #ifndef INET6 if (pca->af == AF_INET6) { error = EAFNOSUPPORT; - break; + goto fail; } #endif /* INET6 */ newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK); @@ -4674,7 +4708,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != 0) { error = ENODEV; - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, @@ -4690,13 +4724,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) { error = ENOMEM; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4705,7 +4739,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_add_tables(pfrts, io->pfrio_size, @@ -4722,13 +4756,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) { error = ENOMEM; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4737,7 +4771,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_del_tables(pfrts, io->pfrio_size, @@ -4755,14 +4789,14 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } PF_RULES_RLOCK(); n = pfr_table_count(&io->pfrio_table, io->pfrio_flags); if (n < 0) { PF_RULES_RUNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4773,7 +4807,7 @@ DIOCCHANGEADDR_error: if (pfrts == NULL) { error = ENOMEM; PF_RULES_RUNLOCK(); - break; + goto fail; } error = pfr_get_tables(&io->pfrio_table, pfrts, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -4792,7 +4826,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_tstats)) { error = ENODEV; - break; + goto fail; } PF_TABLE_STATS_LOCK(); PF_RULES_RLOCK(); @@ -4801,7 +4835,7 @@ DIOCCHANGEADDR_error: PF_RULES_RUNLOCK(); PF_TABLE_STATS_UNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4812,7 +4846,7 @@ DIOCCHANGEADDR_error: error = ENOMEM; PF_RULES_RUNLOCK(); PF_TABLE_STATS_UNLOCK(); - break; + goto fail; } error = pfr_get_tstats(&io->pfrio_table, pfrtstats, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -4831,7 +4865,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || @@ -4840,7 +4874,7 @@ DIOCCHANGEADDR_error: * size, so we didn't fail on overly large requests. * Keep doing so. */ io->pfrio_size = pf_ioctl_maxcount; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4849,7 +4883,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_TABLE_STATS_LOCK(); @@ -4870,7 +4904,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } PF_RULES_RLOCK(); @@ -4878,7 +4912,7 @@ DIOCCHANGEADDR_error: if (n < 0) { PF_RULES_RUNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4890,7 +4924,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_set_tflags(pfrts, io->pfrio_size, @@ -4906,7 +4940,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != 0) { error = ENODEV; - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, @@ -4922,13 +4956,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -4936,7 +4970,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_add_addrs(&io->pfrio_table, pfras, @@ -4956,13 +4990,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -4970,7 +5004,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_del_addrs(&io->pfrio_table, pfras, @@ -4990,17 +5024,17 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size2 < 0) { error = EINVAL; - break; + goto fail; } count = max(io->pfrio_size, io->pfrio_size2); if (count > pf_ioctl_maxcount || WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = count * sizeof(struct pfr_addr); pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP, @@ -5008,7 +5042,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_set_addrs(&io->pfrio_table, pfras, @@ -5029,13 +5063,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5057,13 +5091,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_astats)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_astats); pfrastats = mallocarray(io->pfrio_size, @@ -5085,13 +5119,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5099,7 +5133,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_astats(&io->pfrio_table, pfras, @@ -5119,13 +5153,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5133,7 +5167,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_RLOCK(); error = pfr_tst_addrs(&io->pfrio_table, pfras, @@ -5153,13 +5187,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5167,7 +5201,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_ina_define(&io->pfrio_table, pfras, @@ -5202,13 +5236,13 @@ DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e), @@ -5216,7 +5250,7 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { @@ -5283,13 +5317,13 @@ DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e), @@ -5297,7 +5331,7 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { @@ -5366,14 +5400,14 @@ DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; @@ -5382,7 +5416,7 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); /* First makes sure everything will succeed. */ @@ -5523,7 +5557,7 @@ DIOCCHANGEADDR_error: if (psn->psn_len == 0) { psn->psn_len = sizeof(struct pf_src_node) * nr; - break; + goto fail; } nr = 0; @@ -5548,7 +5582,7 @@ DIOCCHANGEADDR_error: sizeof(struct pf_src_node) * nr); if (error) { free(pstore, M_TEMP); - break; + goto fail; } psn->psn_len = sizeof(struct pf_src_node) * nr; free(pstore, M_TEMP); @@ -5604,14 +5638,14 @@ DIOCCHANGEADDR_error: if (io->pfiio_esize != sizeof(struct pfi_kif)) { error = ENODEV; - break; + goto fail; } if (io->pfiio_size < 0 || io->pfiio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) { error = EINVAL; - break; + goto fail; } io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0'; diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index 308d76c46e5b..9c7863bb301e 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -80,7 +80,6 @@ static enum pf_test_status pf_step_into_translation_anchor(int, struct pf_test_c struct pf_krule *); static int pf_get_sport(struct pf_pdesc *, struct pf_krule *, struct pf_addr *, uint16_t *, uint16_t, uint16_t, - struct pf_ksrc_node **, struct pf_srchash **, struct pf_kpool *, struct pf_udp_mapping **, pf_sn_types_t); static bool pf_islinklocal(const sa_family_t, const struct pf_addr *); @@ -291,10 +290,8 @@ pf_match_translation(int rs_num, struct pf_test_ctx *ctx) } static int -pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, - struct pf_addr *naddr, uint16_t *nport, uint16_t low, - uint16_t high, struct pf_ksrc_node **sn, - struct pf_srchash **sh, struct pf_kpool *rpool, +pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr, + uint16_t *nport, uint16_t low, uint16_t high, struct pf_kpool *rpool, struct pf_udp_mapping **udp_mapping, pf_sn_types_t sn_type) { struct pf_state_key_cmp key; @@ -322,19 +319,24 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, pf_addrcpy(&udp_source.addr, &pd->nsaddr, pd->af); udp_source.port = pd->nsport; if (udp_mapping) { + struct pf_ksrc_node *sn = NULL; + struct pf_srchash *sh = NULL; *udp_mapping = pf_udp_mapping_find(&udp_source); if (*udp_mapping) { pf_addrcpy(naddr, &(*udp_mapping)->endpoints[1].addr, pd->af); *nport = (*udp_mapping)->endpoints[1].port; - /* Try to find a src_node as per pf_map_addr(). */ - if (*sn == NULL && rpool->opts & PF_POOL_STICKYADDR && + /* + * Try to find a src_node as per pf_map_addr(). + * XXX: Why? This code seems to do nothing. + */ + if (rpool->opts & PF_POOL_STICKYADDR && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(&pd->nsaddr, r, - pd->af, sh, sn_type, false); - if (*sn != NULL) - PF_SRC_NODE_UNLOCK(*sn); + sn = pf_find_src_node(&pd->nsaddr, r, + pd->af, &sh, sn_type, false); + if (sn != NULL) + PF_SRC_NODE_UNLOCK(sn); return (0); } else { *udp_mapping = pf_udp_mapping_create(pd->af, &pd->nsaddr, @@ -346,7 +348,7 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, } if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, &init_addr, - sn, sh, rpool, sn_type)) + rpool, sn_type)) goto failed; if (pd->proto == IPPROTO_ICMP) { @@ -470,9 +472,8 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, * pick a different source address since we're out * of free port choices for the current one. */ - (*sn) = NULL; if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, - &init_addr, sn, sh, rpool, sn_type)) + &init_addr, rpool, sn_type)) return (1); break; case PF_POOL_NONE: @@ -503,7 +504,6 @@ pf_islinklocal(const sa_family_t af, const struct pf_addr *addr) static int pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr, uint16_t *nport, - struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_udp_mapping **udp_mapping, struct pf_kpool *rpool) { uint16_t psmask, low, highmask; @@ -523,16 +523,14 @@ pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r, for (i = cut; i <= ahigh; i++) { low = (i << ashift) | psmask; - if (!pf_get_sport(pd, r, - naddr, nport, low, low | highmask, sn, sh, rpool, - udp_mapping, PF_SN_NAT)) + if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask, + rpool, udp_mapping, PF_SN_NAT)) return (0); } for (i = cut - 1; i > 0; i--) { low = (i << ashift) | psmask; - if (!pf_get_sport(pd, r, - naddr, nport, low, low | highmask, sn, sh, rpool, - udp_mapping, PF_SN_NAT)) + if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask, + rpool, udp_mapping, PF_SN_NAT)) return (0); } return (1); @@ -545,6 +543,7 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, { u_short reason = PFRES_MATCH; struct pf_addr *raddr = NULL, *rmask = NULL; + struct pfr_ktable *kt; uint64_t hashidx; int cnt; @@ -600,29 +599,25 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, pf_poolmask(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: - if (rpool->cur->addr.type == PF_ADDR_TABLE) { - cnt = rpool->cur->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->cur->addr.type == PF_ADDR_TABLE || + rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->cur->addr.type == PF_ADDR_TABLE) + kt = rpool->cur->addr.p.tbl; else - rpool->tblidx = (int)arc4random_uniform(cnt); - memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + kt = rpool->cur->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - pf_addrcpy(naddr, &rpool->counter, af); - } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt; + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else rpool->tblidx = (int)arc4random_uniform(cnt); memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, - pf_islinklocal)) { + if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter, + af, pf_islinklocal, false)) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } @@ -671,29 +666,25 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, hashidx = pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); - if (rpool->cur->addr.type == PF_ADDR_TABLE) { - cnt = rpool->cur->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->cur->addr.type == PF_ADDR_TABLE || + rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->cur->addr.type == PF_ADDR_TABLE) + kt = rpool->cur->addr.p.tbl; else - rpool->tblidx = (int)(hashidx % cnt); - memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + kt = rpool->cur->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - pf_addrcpy(naddr, &rpool->counter, af); - } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt; + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else rpool->tblidx = (int)(hashidx % cnt); memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, - pf_islinklocal)) { + if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter, + af, pf_islinklocal, false)) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } @@ -710,11 +701,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (!pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) + &rpool->tblidx, &rpool->counter, af, NULL, true)) goto get_addr; } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) + &rpool->tblidx, &rpool->counter, af, pf_islinklocal, + true)) goto get_addr; } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) goto get_addr; @@ -724,9 +716,10 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, rpool->cur = TAILQ_FIRST(&rpool->list); else rpool->cur = TAILQ_NEXT(rpool->cur, entries); + rpool->tblidx = -1; if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + &rpool->tblidx, &rpool->counter, af, NULL, true)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -734,9 +727,9 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, goto done_pool_mtx; } } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) { + &rpool->tblidx, &rpool->counter, af, pf_islinklocal, + true)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -764,48 +757,41 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, done_pool_mtx: mtx_unlock(&rpool->mtx); - if (reason) { - counter_u64_add(V_pf_status.counters[reason], 1); - } - return (reason); } u_short pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr, - struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_kpool *rpool, - pf_sn_types_t sn_type) + struct pf_kpool *rpool, pf_sn_types_t sn_type) { + struct pf_ksrc_node *sn = NULL; + struct pf_srchash *sh = NULL; u_short reason = 0; - KASSERT(*sn == NULL, ("*sn not NULL")); - /* * If this is a sticky-address rule, try to find an existing src_node. - * Request the sh to be unlocked if sn was not found, as we never - * insert a new sn when parsing the ruleset. */ if (rpool->opts & PF_POOL_STICKYADDR && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(saddr, r, af, sh, sn_type, false); + sn = pf_find_src_node(saddr, r, af, &sh, sn_type, false); - if (*sn != NULL) { - PF_SRC_NODE_LOCK_ASSERT(*sn); + if (sn != NULL) { + PF_SRC_NODE_LOCK_ASSERT(sn); /* If the supplied address is the same as the current one we've * been asked before, so tell the caller that there's no other * address to be had. */ - if (PF_AEQ(naddr, &(*sn)->raddr, af)) { + if (PF_AEQ(naddr, &(sn->raddr), af)) { reason = PFRES_MAPFAILED; goto done; } - pf_addrcpy(naddr, &(*sn)->raddr, af); + pf_addrcpy(naddr, &(sn->raddr), af); if (nkif) - *nkif = (*sn)->rkif; + *nkif = sn->rkif; if (V_pf_status.debug >= PF_DEBUG_NOISY) { - printf("pf_map_addr: src tracking maps "); + printf("%s: src tracking maps ", __func__); pf_print_host(saddr, 0, af); printf(" to "); pf_print_host(naddr, 0, af); @@ -820,14 +806,16 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, * Source node has not been found. Find a new address and store it * in variables given by the caller. */ - if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr, rpool) != 0) { - /* pf_map_addr() sets reason counters on its own */ + if ((reason = pf_map_addr(af, r, saddr, naddr, nkif, init_addr, + rpool)) != 0) { + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: pf_map_addr has failed\n", __func__); goto done; } if (V_pf_status.debug >= PF_DEBUG_NOISY && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { - printf("pf_map_addr: selected address "); + printf("%s: selected address ", __func__); pf_print_host(naddr, 0, af); if (nkif) printf("@%s", (*nkif)->pfik_name); @@ -835,12 +823,8 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, } done: - if ((*sn) != NULL) - PF_SRC_NODE_UNLOCK(*sn); - - if (reason) { - counter_u64_add(V_pf_status.counters[reason], 1); - } + if (sn != NULL) + PF_SRC_NODE_UNLOCK(sn); return (reason); } @@ -890,8 +874,6 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, { struct pf_pdesc *pd = ctx->pd; struct pf_addr *naddr; - struct pf_ksrc_node *sn = NULL; - struct pf_srchash *sh = NULL; uint16_t *nportp; uint16_t low, high; u_short reason; @@ -919,8 +901,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, high = rpool->proxy_port[1]; } if (rpool->mape.offset > 0) { - if (pf_get_mape_sport(pd, r, naddr, nportp, &sn, - &sh, &ctx->udp_mapping, rpool)) { + if (pf_get_mape_sport(pd, r, naddr, nportp, + &ctx->udp_mapping, rpool)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: MAP-E port allocation (%u/%u/%u)" " failed\n", @@ -930,8 +912,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - } else if (pf_get_sport(pd, r, naddr, nportp, low, high, &sn, - &sh, rpool, &ctx->udp_mapping, PF_SN_NAT)) { + } else if (pf_get_sport(pd, r, naddr, nportp, low, high, + rpool, &ctx->udp_mapping, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", rpool->proxy_port[0], rpool->proxy_port[1])); @@ -1017,7 +999,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, uint16_t cut, low, high, nport; reason = pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL, - NULL, &sn, &sh, rpool, PF_SN_NAT); + NULL, rpool, PF_SN_NAT); if (reason != 0) goto notrans; if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) @@ -1030,10 +1012,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, if (rpool->proxy_port[1]) { uint32_t tmp_nport; + uint16_t div; + + div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1; + div = (div == 0) ? 1 : div; - tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % - (rpool->proxy_port[1] - rpool->proxy_port[0] + - 1)) + rpool->proxy_port[0]; + tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) + + rpool->proxy_port[0]; /* Wrap around if necessary. */ if (tmp_nport > 65535) @@ -1134,8 +1119,6 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) struct pf_addr ndaddr, nsaddr, naddr; u_int16_t nport = 0; int prefixlen = 96; - struct pf_srchash *sh = NULL; - struct pf_ksrc_node *sns = NULL; bzero(&nsaddr, sizeof(nsaddr)); bzero(&ndaddr, sizeof(ndaddr)); @@ -1154,9 +1137,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) panic("pf_get_transaddr_af: no nat pool for source address"); /* get source address and port */ - if (pf_get_sport(pd, r, &nsaddr, &nport, - r->nat.proxy_port[0], r->nat.proxy_port[1], &sns, &sh, &r->nat, - NULL, PF_SN_NAT)) { + if (pf_get_sport(pd, r, &nsaddr, &nport, r->nat.proxy_port[0], + r->nat.proxy_port[1], &r->nat, NULL, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: af-to NAT proxy port allocation (%u-%u) failed", r->nat.proxy_port[0], r->nat.proxy_port[1])); @@ -1182,7 +1164,7 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) /* get the destination address and port */ if (! TAILQ_EMPTY(&r->rdr.list)) { if (pf_map_addr_sn(pd->naf, r, &nsaddr, &naddr, NULL, NULL, - &sns, NULL, &r->rdr, PF_SN_NAT)) + &r->rdr, PF_SN_NAT)) return (-1); if (r->rdr.proxy_port[0]) pd->ndport = htons(r->rdr.proxy_port[0]); diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c index 43e4366845a2..9c0151b7da2b 100644 --- a/sys/netpfil/pf/pf_table.c +++ b/sys/netpfil/pf/pf_table.c @@ -819,10 +819,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters) static void pfr_destroy_kentries(struct pfr_kentryworkq *workq) { - struct pfr_kentry *p, *q; + struct pfr_kentry *p; - for (p = SLIST_FIRST(workq); p != NULL; p = q) { - q = SLIST_NEXT(p, pfrke_workq); + while ((p = SLIST_FIRST(workq)) != NULL) { + SLIST_REMOVE_HEAD(workq, pfrke_workq); pfr_destroy_kentry(p); } } @@ -1680,8 +1680,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd, } if (!(flags & PFR_FLAG_DUMMY)) { - for (p = SLIST_FIRST(&workq); p != NULL; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) { pfr_commit_ktable(p, tzero); } rs->topen = 0; @@ -1710,7 +1709,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero) } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) { /* kt might contain addresses */ struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq; - struct pfr_kentry *p, *q, *next; + struct pfr_kentry *p, *q; struct pfr_addr ad; pfr_enqueue_addrs(shadow, &addrq, NULL, 0); @@ -1720,7 +1719,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero) SLIST_INIT(&delq); SLIST_INIT(&garbageq); pfr_clean_node_mask(shadow, &addrq); - SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) { + while ((p = SLIST_FIRST(&addrq)) != NULL) { + SLIST_REMOVE_HEAD(&addrq, pfrke_workq); pfr_copyout_addr(&ad, p); q = pfr_lookup_addr(kt, &ad, 1); if (q != NULL) { @@ -1864,8 +1864,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq) { struct pfr_ktable *p, *q; - for (p = SLIST_FIRST(workq); p; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) { pfr_setflags_ktable(p, p->pfrkt_nflags); } } @@ -2015,10 +2014,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset) static void pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr) { - struct pfr_ktable *p, *q; + struct pfr_ktable *p; - for (p = SLIST_FIRST(workq); p; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + while ((p = SLIST_FIRST(workq)) != NULL) { + SLIST_REMOVE_HEAD(workq, pfrkt_workq); pfr_destroy_ktable(p, flushaddr); } } @@ -2074,17 +2073,16 @@ pfr_lookup_table(struct pfr_table *tbl) (struct pfr_ktable *)tbl)); } -int -pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +static struct pfr_kentry * +pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, + int exact) { struct pfr_kentry *ke = NULL; - int match; PF_RULES_RASSERT(); - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (0); switch (af) { @@ -2121,11 +2119,26 @@ pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) default: unhandled_af(af); } + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + + return (ke); +} + +int +pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +{ + struct pfr_kentry *ke = NULL; + int match; + + ke = pfr_kentry_byaddr(kt, a, af, 0); + match = (ke && !ke->pfrke_not); if (match) pfr_kstate_counter_add(&kt->pfrkt_match, 1); else pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1); + return (match); } @@ -2135,9 +2148,8 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, { struct pfr_kentry *ke = NULL; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return; switch (af) { @@ -2281,7 +2293,7 @@ pfr_detach_table(struct pfr_ktable *kt) int pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, - sa_family_t af, pf_addr_filter_func_t filter) + sa_family_t af, pf_addr_filter_func_t filter, bool loop_once) { struct pf_addr *addr, cur, mask, umask_addr; union sockaddr_union uaddr, umask; @@ -2306,9 +2318,8 @@ pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, unhandled_af(af); } - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (-1); idx = *pidx; @@ -2327,7 +2338,7 @@ _next_block: ke = pfr_kentry_byidx(kt, idx, af); if (ke == NULL) { /* we don't have this idx, try looping */ - if (loop || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) { + if ((loop || loop_once) || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) { pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1); return (1); } @@ -2455,3 +2466,14 @@ pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn) unhandled_af(dyn->pfid_af); } } + +struct pfr_ktable * +pfr_ktable_select_active(struct pfr_ktable *kt) +{ + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (NULL); + + return (kt); +} diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index 7746b668265d..ae17b3289593 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -1469,6 +1469,9 @@ moea_page_set_memattr(vm_page_t m, vm_memattr_t ma) pmap_t pmap; u_int lo; + if (m->md.mdpg_cache_attrs == ma) + return; + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 79cea408bb5f..796b1719b8ba 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -2134,6 +2134,9 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma) CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + if (m->md.mdpg_cache_attrs == ma) + return; + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c index 45f7bef8bcc9..a12142fc2d7b 100644 --- a/sys/powerpc/aim/mmu_radix.c +++ b/sys/powerpc/aim/mmu_radix.c @@ -5937,6 +5937,10 @@ mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); + + if (m->md.mdpg_cache_attrs == ma) + return; + m->md.mdpg_cache_attrs = ma; /* diff --git a/sys/powerpc/include/pcb.h b/sys/powerpc/include/pcb.h index 050ada6b0f64..0230cf78aba7 100644 --- a/sys/powerpc/include/pcb.h +++ b/sys/powerpc/include/pcb.h @@ -66,16 +66,8 @@ struct pcb { #define PCB_VECREGS 0x200 /* Process had Altivec registers initialized */ struct fpu { union { -#if _BYTE_ORDER == _BIG_ENDIAN - double fpr; - uint32_t vsr[4]; -#else uint32_t vsr[4]; - struct { - double padding; - double fpr; - }; -#endif + double fpr; } fpr[32]; double fpscr; /* FPSCR stored as double for easier access */ } pcb_fpu; /* Floating point processor */ diff --git a/sys/powerpc/include/ucontext.h b/sys/powerpc/include/ucontext.h index d35c6c773fe0..dc87edd578bc 100644 --- a/sys/powerpc/include/ucontext.h +++ b/sys/powerpc/include/ucontext.h @@ -41,6 +41,7 @@ typedef struct __mcontext { int mc_flags; #define _MC_FP_VALID 0x01 #define _MC_AV_VALID 0x02 +#define _MC_VS_VALID 0x04 int mc_onstack; /* saved onstack flag */ int mc_len; /* sizeof(__mcontext) */ __uint64_t mc_avec[32*2]; /* vector register file */ @@ -56,6 +57,7 @@ typedef struct __mcontext32 { int mc_flags; #define _MC_FP_VALID 0x01 #define _MC_AV_VALID 0x02 +#define _MC_VS_VALID 0x04 int mc_onstack; /* saved onstack flag */ int mc_len; /* sizeof(__mcontext) */ uint64_t mc_avec[32*2]; /* vector register file */ diff --git a/sys/powerpc/powerpc/exec_machdep.c b/sys/powerpc/powerpc/exec_machdep.c index 1893d79f29a8..8a33d0f589a7 100644 --- a/sys/powerpc/powerpc/exec_machdep.c +++ b/sys/powerpc/powerpc/exec_machdep.c @@ -214,10 +214,10 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sfpsize = sizeof(sf); #ifdef __powerpc64__ /* - * 64-bit PPC defines a 288 byte scratch region - * below the stack. + * 64-bit PPC defines a 512 byte red zone below + * the existing stack (ELF ABI v2 ยง2.2.2.4) */ - rndfsize = 288 + roundup(sizeof(sf), 48); + rndfsize = 512 + roundup(sizeof(sf), 48); #else rndfsize = roundup(sizeof(sf), 16); #endif @@ -349,13 +349,6 @@ sys_sigreturn(struct thread *td, struct sigreturn_args *uap) if (error != 0) return (error); - /* - * Save FPU state if needed. User may have changed it on - * signal handler - */ - if (uc.uc_mcontext.mc_srr1 & PSL_FP) - save_fpu(td); - kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", @@ -432,6 +425,7 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) } if (pcb->pcb_flags & PCB_VSX) { + mcp->mc_flags |= _MC_VS_VALID; for (i = 0; i < 32; i++) memcpy(&mcp->mc_vsxfpreg[i], &pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double)); @@ -481,6 +475,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp) struct pcb *pcb; struct trapframe *tf; register_t tls; + register_t msr; int i; pcb = td->td_pcb; @@ -531,6 +526,22 @@ set_mcontext(struct thread *td, mcontext_t *mcp) tf->srr1 &= ~(PSL_FP | PSL_VSX | PSL_VEC); pcb->pcb_flags &= ~(PCB_FPU | PCB_VSX | PCB_VEC); + /* + * Ensure the FPU is also disabled in hardware. + * + * Without this, it's possible for the register reload to fail if we + * don't switch to a FPU disabled context before resuming the original + * thread. Specifically, if the FPU/VSX unavailable exception is never + * hit, then whatever data is still in the FP/VSX registers when + * sigresume is callled will used by the resumed thread, instead of the + * previously saved data from the mcontext. + */ + critical_enter(); + msr = mfmsr() & ~(PSL_FP | PSL_VSX | PSL_VEC); + isync(); + mtmsr(msr); + critical_exit(); + if (mcp->mc_flags & _MC_FP_VALID) { /* enable_fpu() will happen lazily on a fault */ pcb->pcb_flags |= PCB_FPREGS; @@ -539,8 +550,12 @@ set_mcontext(struct thread *td, mcontext_t *mcp) for (i = 0; i < 32; i++) { memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i], sizeof(double)); - memcpy(&pcb->pcb_fpu.fpr[i].vsr[2], - &mcp->mc_vsxfpreg[i], sizeof(double)); + } + if (mcp->mc_flags & _MC_VS_VALID) { + for (i = 0; i < 32; i++) { + memcpy(&pcb->pcb_fpu.fpr[i].vsr[2], + &mcp->mc_vsxfpreg[i], sizeof(double)); + } } } diff --git a/sys/powerpc/powerpc/fpu.c b/sys/powerpc/powerpc/fpu.c index 0eaff2ea4932..cc8f22f7dda3 100644 --- a/sys/powerpc/powerpc/fpu.c +++ b/sys/powerpc/powerpc/fpu.c @@ -64,8 +64,19 @@ save_fpu_int(struct thread *td) * Save the floating-point registers and FPSCR to the PCB */ if (pcb->pcb_flags & PCB_VSX) { - #define SFP(n) __asm ("stxvw4x " #n ", 0,%0" \ +#if _BYTE_ORDER == _BIG_ENDIAN + #define SFP(n) __asm("stxvw4x " #n ", 0,%0" \ :: "b"(&pcb->pcb_fpu.fpr[n])); +#else + /* + * stxvw2x will swap words within the FP double word on LE systems, + * leading to corruption if VSX is used to store state and FP is + * subsequently used to restore state. + * Use stxvd2x instead. + */ + #define SFP(n) __asm("stxvd2x " #n ", 0,%0" \ + :: "b"(&pcb->pcb_fpu.fpr[n])); +#endif SFP(0); SFP(1); SFP(2); SFP(3); SFP(4); SFP(5); SFP(6); SFP(7); SFP(8); SFP(9); SFP(10); SFP(11); @@ -76,7 +87,7 @@ save_fpu_int(struct thread *td) SFP(28); SFP(29); SFP(30); SFP(31); #undef SFP } else { - #define SFP(n) __asm ("stfd " #n ", 0(%0)" \ + #define SFP(n) __asm("stfd " #n ", 0(%0)" \ :: "b"(&pcb->pcb_fpu.fpr[n].fpr)); SFP(0); SFP(1); SFP(2); SFP(3); SFP(4); SFP(5); SFP(6); SFP(7); @@ -149,8 +160,19 @@ enable_fpu(struct thread *td) :: "b"(&pcb->pcb_fpu.fpscr)); if (pcb->pcb_flags & PCB_VSX) { - #define LFP(n) __asm ("lxvw4x " #n ", 0,%0" \ +#if _BYTE_ORDER == _BIG_ENDIAN + #define LFP(n) __asm("lxvw4x " #n ", 0,%0" \ + :: "b"(&pcb->pcb_fpu.fpr[n])); +#else + /* + * lxvw4x will swap words within the FP double word on LE systems, + * leading to corruption if FP is used to store state and VSX is + * subsequently used to restore state. + * Use lxvd2x instead. + */ + #define LFP(n) __asm("lxvd2x " #n ", 0,%0" \ :: "b"(&pcb->pcb_fpu.fpr[n])); +#endif LFP(0); LFP(1); LFP(2); LFP(3); LFP(4); LFP(5); LFP(6); LFP(7); LFP(8); LFP(9); LFP(10); LFP(11); @@ -161,7 +183,7 @@ enable_fpu(struct thread *td) LFP(28); LFP(29); LFP(30); LFP(31); #undef LFP } else { - #define LFP(n) __asm ("lfd " #n ", 0(%0)" \ + #define LFP(n) __asm("lfd " #n ", 0(%0)" \ :: "b"(&pcb->pcb_fpu.fpr[n].fpr)); LFP(0); LFP(1); LFP(2); LFP(3); LFP(4); LFP(5); LFP(6); LFP(7); diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 5d15bd671285..26efaecc64d1 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -4838,6 +4838,8 @@ pmap_unmapbios(void *p, vm_size_t size) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pv_memattr == ma) + return; m->md.pv_memattr = ma; diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c index 9e87af578885..44b63e38a8e6 100644 --- a/sys/rpc/clnt_rc.c +++ b/sys/rpc/clnt_rc.c @@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl) newclient = clnt_vc_create(so, (struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers, rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr); + /* + * CLSET_FD_CLOSE must be done now, in case rpctls_connect() + * fails just below. + */ + if (newclient != NULL) + CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0); if (rc->rc_tls && newclient != NULL) { CURVNET_SET(so->so_vnet); stat = rpctls_connect(newclient, rc->rc_tlscertname, so, @@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl) goto out; } - CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0); CLNT_CONTROL(newclient, CLSET_CONNECT, &one); CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout); CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry); diff --git a/sys/rpc/rpcsec_gss/rpcsec_gss.c b/sys/rpc/rpcsec_gss/rpcsec_gss.c index 62c71937a185..983dd251f81f 100644 --- a/sys/rpc/rpcsec_gss/rpcsec_gss.c +++ b/sys/rpc/rpcsec_gss/rpcsec_gss.c @@ -67,6 +67,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/hash.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/kobj.h> #include <sys/lock.h> @@ -772,6 +773,17 @@ rpc_gss_init(AUTH *auth, rpc_gss_options_ret_t *options_ret) gd->gd_cred.gc_seq = 0; /* + * XXX Threads from inside jails can get here via calls + * to clnt_vc_call()->AUTH_REFRESH()->rpc_gss_refresh() + * but the NFS mount is always done outside of the + * jails in vnet0. Since the thread credentials won't + * necessarily have cr_prison == vnet0 and this function + * has no access to the socket, using vnet0 seems the + * only option. This is broken if NFS mounts are enabled + * within vnet prisons. + */ + KGSS_CURVNET_SET_QUIET(vnet0); + /* * For KerberosV, if there is a client principal name, that implies * that this is a host based initiator credential in the default * keytab file. For this case, it is necessary to do a @@ -994,12 +1006,14 @@ out: gss_delete_sec_context(&min_stat, &gd->gd_ctx, GSS_C_NO_BUFFER); } + KGSS_CURVNET_RESTORE(); mtx_lock(&gd->gd_lock); gd->gd_state = RPCSEC_GSS_START; wakeup(gd); mtx_unlock(&gd->gd_lock); return (FALSE); } + KGSS_CURVNET_RESTORE(); mtx_lock(&gd->gd_lock); gd->gd_state = RPCSEC_GSS_ESTABLISHED; diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c index 93fe283e65fd..51fe270b13d9 100644 --- a/sys/rpc/rpcsec_tls/rpctls_impl.c +++ b/sys/rpc/rpcsec_tls/rpctls_impl.c @@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so) * failed to do the handshake. */ mtx_unlock(&rpctls_lock); + /* + * Do a shutdown on the socket, since the daemon is + * probably stuck in SSL_accept() or SSL_connect() trying to + * read the socket. Do not soclose() the socket, since the + * daemon will close() the socket after SSL_accept() + * returns an error. + */ + soshutdown(so, SHUT_RD); } } diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h index 15557c614f88..7bf1d264ff5e 100644 --- a/sys/sys/exterrvar.h +++ b/sys/sys/exterrvar.h @@ -21,6 +21,7 @@ #define EXTERRCTL_ENABLE 1 #define EXTERRCTL_DISABLE 2 +#define EXTERRCTL_UD 3 #define EXTERRCTLF_FORCE 0x00000001 diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h index 65dc5dba43f3..d1f23d5898bb 100644 --- a/sys/sys/inotify.h +++ b/sys/sys/inotify.h @@ -107,11 +107,18 @@ void vn_inotify_revoke(struct vnode *); } while (0) /* Log an inotify event using a specific name for the vnode. */ -#define INOTIFY_NAME(vp, dvp, cnp, ev) do { \ +#define INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do { \ if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \ - (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) \ + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) { \ + if (lock) \ + vn_lock((vp), LK_SHARED | LK_RETRY); \ VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \ + if (lock) \ + VOP_UNLOCK(vp); \ + } \ } while (0) +#define INOTIFY_NAME(vp, dvp, cnp, ev) \ + INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false) extern __uint32_t inotify_rename_cookie; @@ -126,7 +133,8 @@ extern __uint32_t inotify_rename_cookie; VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \ } \ if ((tvp) != NULL) \ - INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE); \ + INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp), \ + _IN_MOVE_DELETE, true); \ } while (0) #define INOTIFY_REVOKE(vp) do { \ diff --git a/sys/sys/param.h b/sys/sys/param.h index af116d6e3f7a..bd739eacae6f 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -74,7 +74,7 @@ * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1500051 +#define __FreeBSD_version 1500053 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/random.h b/sys/sys/random.h index 254ba9451d0a..5abf762cd200 100644 --- a/sys/sys/random.h +++ b/sys/sys/random.h @@ -85,7 +85,8 @@ enum random_entropy_source { RANDOM_FS_ATIME, RANDOM_UMA, /* Special!! UMA/SLAB Allocator */ RANDOM_CALLOUT, - RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT, + RANDOM_RANDOMDEV, + RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV, /* Fast hardware random-number sources from here on. */ RANDOM_PURE_START, RANDOM_PURE_OCTEON = RANDOM_PURE_START, diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 86b75a2d7989..d6bd06226d04 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) #endif } -static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ -static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ +static bool swap_pager_full = true; /* swap space exhaustion (task killing) */ +static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ @@ -642,14 +642,14 @@ swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { - if (swap_pager_almost_full == 0) { + if (!swap_pager_almost_full) { printf("swap_pager: out of swap space\n"); - swap_pager_almost_full = 1; + swap_pager_almost_full = true; } } else { - swap_pager_full = 0; + swap_pager_full = false; if (swap_pager_avail > nswap_hiwat) - swap_pager_almost_full = 0; + swap_pager_almost_full = false; } } @@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages) swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { - if (swap_pager_full != 2) { + if (!swap_pager_full) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); - swap_pager_full = 2; - swap_pager_almost_full = 1; + swap_pager_full = swap_pager_almost_full = true; } swdevhd = NULL; } @@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags) sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; - if (nswapdev == 0) { - swap_pager_full = 2; - swap_pager_almost_full = 1; - } + if (nswapdev == 0) + swap_pager_full = swap_pager_almost_full = true; if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 7b8bf4c77663..b44bdb96b0d4 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -131,8 +131,7 @@ static void vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) { - KASSERT(di->di_n > 0, - ("vm_domainset_iter_first: Invalid n %d", di->di_n)); + KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); switch (di->di_policy) { case DOMAINSET_POLICY_FIRSTTOUCH: /* @@ -149,11 +148,10 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) vm_domainset_iter_prefer(di, domain); break; default: - panic("vm_domainset_iter_first: Unknown policy %d", - di->di_policy); + panic("%s: Unknown policy %d", __func__, di->di_policy); } KASSERT(*domain < vm_ndomains, - ("vm_domainset_iter_next: Invalid domain %d", *domain)); + ("%s: Invalid domain %d", __func__, *domain)); } static void @@ -189,13 +187,11 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain) di->di_n = di->di_domain->ds_cnt; break; default: - panic("vm_domainset_iter_first: Unknown policy %d", - di->di_policy); + panic("%s: Unknown policy %d", __func__, di->di_policy); } - KASSERT(di->di_n > 0, - ("vm_domainset_iter_first: Invalid n %d", di->di_n)); + KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); KASSERT(*domain < vm_ndomains, - ("vm_domainset_iter_first: Invalid domain %d", *domain)); + ("%s: Invalid domain %d", __func__, *domain)); } void diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 875c22d27628..e7d7b6726d2c 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -110,11 +110,18 @@ u_int exec_map_entry_size; u_int exec_map_entries; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, - SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); +#if defined(__amd64__) + &kva_layout.km_low, 0, +#else + SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, +#endif + "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) &vm_max_kernel_address, 0, +#elif defined(__amd64__) + &kva_layout.km_high, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index cbbd27389662..9bd3b389fb60 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -260,9 +260,9 @@ struct vm_domain { u_int vmd_inactive_shortage; /* Per-thread shortage. */ blockcount_t vmd_inactive_running; /* Number of inactive threads. */ blockcount_t vmd_inactive_starting; /* Number of threads started. */ - volatile u_int vmd_addl_shortage; /* Shortage accumulator. */ - volatile u_int vmd_inactive_freed; /* Successful inactive frees. */ - volatile u_int vmd_inactive_us; /* Microseconds for above. */ + u_int vmd_addl_shortage; /* (a) Shortage accumulator. */ + u_int vmd_inactive_freed; /* (a) Successful inactive frees. */ + u_int vmd_inactive_us; /* (a) Microseconds for above. */ u_int vmd_inactive_pps; /* Exponential decay frees/second. */ int vmd_oom_seq; int vmd_last_active_scan; |