diff options
| author | Glen Barber <gjb@FreeBSD.org> | 2018-10-05 17:53:47 +0000 |
|---|---|---|
| committer | Glen Barber <gjb@FreeBSD.org> | 2018-10-05 17:53:47 +0000 |
| commit | 01d4e2149e5566e5d9394913dc9fb032da259e0b (patch) | |
| tree | 4bc35787f1ac2632cbdbd5f1627bf552fb11501b /sys | |
| parent | e4456411a8c2d4a9bfbccd60f2cf914fd402f817 (diff) | |
| parent | c84dbc532904f2342f06fed592c384fd0c6436f5 (diff) | |
Notes
Diffstat (limited to 'sys')
230 files changed, 4195 insertions, 2675 deletions
diff --git a/sys/amd64/amd64/copyout.c b/sys/amd64/amd64/copyout.c index dce20fb27769..59b7ad5c1307 100644 --- a/sys/amd64/amd64/copyout.c +++ b/sys/amd64/amd64/copyout.c @@ -159,20 +159,41 @@ DEFINE_IFUNC(, int, copyinstr, (const void *, void *, size_t, size_t *), copyinstr_smap : copyinstr_nosmap); } -int copyin_nosmap(const void *udaddr, void *kaddr, size_t len); -int copyin_smap(const void *udaddr, void *kaddr, size_t len); +int copyin_nosmap_std(const void *udaddr, void *kaddr, size_t len); +int copyin_smap_std(const void *udaddr, void *kaddr, size_t len); +int copyin_nosmap_erms(const void *udaddr, void *kaddr, size_t len); +int copyin_smap_erms(const void *udaddr, void *kaddr, size_t len); DEFINE_IFUNC(, int, copyin, (const void *, void *, size_t), static) { - return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? - copyin_smap : copyin_nosmap); + switch (cpu_stdext_feature & (CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS)) { + case CPUID_STDEXT_SMAP: + return (copyin_smap_std); + case CPUID_STDEXT_ERMS: + return (copyin_nosmap_erms); + case CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS: + return (copyin_smap_erms); + default: + return (copyin_nosmap_std); + + } } -int copyout_nosmap(const void *kaddr, void *udaddr, size_t len); -int copyout_smap(const void *kaddr, void *udaddr, size_t len); +int copyout_nosmap_std(const void *kaddr, void *udaddr, size_t len); +int copyout_smap_std(const void *kaddr, void *udaddr, size_t len); +int copyout_nosmap_erms(const void *kaddr, void *udaddr, size_t len); +int copyout_smap_erms(const void *kaddr, void *udaddr, size_t len); DEFINE_IFUNC(, int, copyout, (const void *, void *, size_t), static) { - return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? - copyout_smap : copyout_nosmap); + switch (cpu_stdext_feature & (CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS)) { + case CPUID_STDEXT_SMAP: + return (copyout_smap_std); + case CPUID_STDEXT_ERMS: + return (copyout_nosmap_erms); + case CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS: + return (copyout_smap_erms); + default: + return (copyout_nosmap_std); + } } diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 092dc095bf65..dc174e85a61f 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1581,6 +1581,21 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) */ identify_cpu2(); + /* + * Check for pti, pcid, and invpcid before ifuncs are + * resolved, to correctly select the implementation for + * pmap_activate_sw_mode(). + */ + pti = pti_get_default(); + TUNABLE_INT_FETCH("vm.pmap.pti", &pti); + TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); + if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { + invpcid_works = (cpu_stdext_feature & + CPUID_STDEXT_INVPCID) != 0; + } else { + pmap_pcid_enabled = 0; + } + link_elf_ireloc(kmdp); /* @@ -1645,9 +1660,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ - pti = pti_get_default(); - TUNABLE_INT_FETCH("vm.pmap.pti", &pti); - for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); @@ -2693,3 +2705,12 @@ DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t), return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } + +void pagezero_std(void *addr); +void pagezero_erms(void *addr); +DEFINE_IFUNC(, void , pagezero, (void *), static) +{ + + return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? + pagezero_erms : pagezero_std); +} diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 8cb09df626be..bab8072d9ec1 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -146,6 +146,7 @@ __FBSDID("$FreeBSD$"); #include <machine/intr_machdep.h> #include <x86/apicvar.h> +#include <x86/ifunc.h> #include <machine/cpu.h> #include <machine/cputypes.h> #include <machine/md_var.h> @@ -647,6 +648,10 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, + vm_offset_t eva); +static void pmap_invalidate_cache_range_all(vm_offset_t sva, + vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); @@ -1093,9 +1098,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr) vm_offset_t va; pt_entry_t *pte; uint64_t cr4; + u_long res; int i; KERNend = *firstaddr; + res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; @@ -1115,10 +1122,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr) vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; - virtual_end = VM_MAX_KERNEL_ADDRESS; - /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it @@ -1137,6 +1142,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* * Initialize the kernel pmap (which is statically allocated). + * Count bootstrap data as being resident in case any of this data is + * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); @@ -1144,6 +1151,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); + kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* @@ -1179,11 +1187,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) pmap_init_pat(); /* Initialize TLB Context Id. */ - TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); - if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { - /* Check for INVPCID support */ - invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) - != 0; + if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; @@ -1204,8 +1208,6 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); - } else { - pmap_pcid_enabled = 0; } } @@ -1423,7 +1425,7 @@ pmap_init(void) if (ppim->va == 0) continue; /* Make the direct map consistent */ - if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { + if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } @@ -1705,15 +1707,100 @@ pmap_invalidate_ept(pmap_t pmap) sched_unpin(); } -void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +static cpuset_t +pmap_invalidate_cpu_mask(pmap_t pmap) +{ + + return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); +} + +static inline void +pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, + const bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; + cpuid = PCPU_GET(cpuid); + if (pmap == PCPU_GET(curpmap)) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + /* + * Because pm_pcid is recalculated on a + * context switch, we must disable switching. + * Otherwise, we might use a stale value + * below. + */ + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, va); + } + critical_exit(); + } + } else + pmap->pm_pcids[cpuid].pm_gen = 0; + + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } + + /* + * The fence is between stores to pm_gen and the read of the + * pm_active mask. We need to ensure that it is impossible + * for us to miss the bit update in pm_active and + * simultaneously observe a non-zero pm_gen in + * pmap_activate_sw(), otherwise TLB update is missed. + * Without the fence, IA32 allows such an outcome. Note that + * pm_active is updated by a locked operation, which provides + * the reciprocal fence. + */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) +{ + + pmap_invalidate_page_pcid(pmap, va, true); +} + +static void +pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) +{ + + pmap_invalidate_page_pcid(pmap, va, false); +} + +static void +pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) +{ +} + +DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t), + static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : + pmap_invalidate_page_pcid_noinvpcid); + return (pmap_invalidate_page_nopcid); +} + +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; @@ -1725,73 +1812,93 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) sched_pin(); if (pmap == kernel_pmap) { invlpg(va); - mask = &all_cpus; } else { - cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) { + if (pmap == PCPU_GET(curpmap)) invlpg(va); - if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { - /* - * Disable context switching. pm_pcid - * is recalculated on switch, which - * might make us use wrong pcid below. - */ - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - - if (invpcid_works) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = va; - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | - CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlpg(ucr3, kcr3, va); - } - critical_exit(); - } - } else if (pmap_pcid_enabled) - pmap->pm_pcids[cpuid].pm_gen = 0; - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - - /* - * The fence is between stores to pm_gen and the read of - * the pm_active mask. We need to ensure that it is - * impossible for us to miss the bit update in pm_active - * and simultaneously observe a non-zero pm_gen in - * pmap_activate_sw(), otherwise TLB update is missed. - * Without the fence, IA32 allows such an outcome. - * Note that pm_active is updated by a locked operation, - * which provides the reciprocal fence. - */ - atomic_thread_fence_seq_cst(); - } - mask = &pmap->pm_active; + pmap_invalidate_page_mode(pmap, va); } - smp_masked_invlpg(*mask, va, pmap); + smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) -void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +static void +pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + const bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; - vm_offset_t addr; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; + cpuid = PCPU_GET(cpuid); + if (pmap == PCPU_GET(curpmap)) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = sva; + for (; d.addr < eva; d.addr += PAGE_SIZE) + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); + } + critical_exit(); + } + } else + pmap->pm_pcids[cpuid].pm_gen = 0; + + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } + /* See the comment in pmap_invalidate_page_pcid(). */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, + vm_offset_t eva) +{ + + pmap_invalidate_range_pcid(pmap, sva, eva, true); +} + +static void +pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, + vm_offset_t eva) +{ + + pmap_invalidate_range_pcid(pmap, sva, eva, false); +} + +static void +pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ +} + +DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, + vm_offset_t), static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : + pmap_invalidate_range_pcid_noinvpcid); + return (pmap_invalidate_range_nopcid); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; @@ -1806,122 +1913,119 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); - cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = sva; - for (; d.addr < eva; d.addr += - PAGE_SIZE) - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | - CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlrng(ucr3, kcr3, sva, - eva); - } - critical_exit(); - } - } else if (pmap_pcid_enabled) { - pmap->pm_pcids[cpuid].pm_gen = 0; - } - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - /* See the comment in pmap_invalidate_page(). */ - atomic_thread_fence_seq_cst(); } - mask = &pmap->pm_active; + pmap_invalidate_range_mode(pmap, sva, eva); } - smp_masked_invlpg_range(*mask, sva, eva, pmap); + smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } -void -pmap_invalidate_all(pmap_t pmap) +static inline void +pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; - if (pmap_type_guest(pmap)) { - pmap_invalidate_ept(pmap); - return; - } - - KASSERT(pmap->pm_type == PT_X86, - ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); - - sched_pin(); if (pmap == kernel_pmap) { - if (pmap_pcid_enabled && invpcid_works) { + if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } - mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { - if (pmap_pcid_enabled) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works) { - d.pcid = pcid; - d.pad = 0; - d.addr = 0; + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); - if (pmap->pm_ucr3 != PMAP_NO_CR3) { - d.pcid |= PMAP_PCID_USER_PT; - invpcid(&d, INVPCID_CTX); - } - } else { - kcr3 = pmap->pm_cr3 | pcid; - ucr3 = pmap->pm_ucr3; - if (ucr3 != PMAP_NO_CR3) { - ucr3 |= pcid | PMAP_PCID_USER_PT; - pmap_pti_pcid_invalidate(ucr3, - kcr3); - } else { - load_cr3(kcr3); - } } - critical_exit(); } else { - invltlb(); + kcr3 = pmap->pm_cr3 | pcid; + ucr3 = pmap->pm_ucr3; + if (ucr3 != PMAP_NO_CR3) { + ucr3 |= pcid | PMAP_PCID_USER_PT; + pmap_pti_pcid_invalidate(ucr3, kcr3); + } else { + load_cr3(kcr3); + } } - } else if (pmap_pcid_enabled) { + critical_exit(); + } else pmap->pm_pcids[cpuid].pm_gen = 0; + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; } - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - /* See the comment in pmap_invalidate_page(). */ - atomic_thread_fence_seq_cst(); - } - mask = &pmap->pm_active; } - smp_masked_invltlb(*mask, pmap); + /* See the comment in pmap_invalidate_page_pcid(). */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_all_pcid_invpcid(pmap_t pmap) +{ + + pmap_invalidate_all_pcid(pmap, true); +} + +static void +pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) +{ + + pmap_invalidate_all_pcid(pmap, false); +} + +static void +pmap_invalidate_all_nopcid(pmap_t pmap) +{ + + if (pmap == kernel_pmap) + invltlb_glob(); + else if (pmap == PCPU_GET(curpmap)) + invltlb(); +} + +DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : + pmap_invalidate_all_pcid_noinvpcid); + return (pmap_invalidate_all_nopcid); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap_type_guest(pmap)) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); + + sched_pin(); + pmap_invalidate_all_mode(pmap); + smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } @@ -2176,36 +2280,62 @@ pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) pmap_invalidate_page(pmap, va); } +DEFINE_IFUNC(, void, pmap_invalidate_cache_range, + (vm_offset_t sva, vm_offset_t eva), static) +{ + + if ((cpu_feature & CPUID_SS) != 0) + return (pmap_invalidate_cache_range_selfsnoop); + if ((cpu_feature & CPUID_CLFSH) != 0) + return (pmap_force_invalidate_cache_range); + return (pmap_invalidate_cache_range_all); +} + #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) -void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) +static void +pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { - if (force) { - sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); - } + KASSERT((sva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: sva not page-aligned")); + KASSERT((eva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: eva not page-aligned")); +} - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported and allowed, do nothing. */ - else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { +static void +pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); +} + +void +pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + + sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); + if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) { /* - * XXX: Some CPUs fault, hang, or trash the local APIC - * registers if we use CLFLUSH on the local APIC - * range. The local APIC is always uncached, so we - * don't need to flush for that range anyway. + * The supplied range is bigger than 2MB. + * Globally invalidate cache. */ - if (pmap_kextract(sva) == lapic_paddr) - return; + pmap_invalidate_cache(); + return; + } + /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC range. The + * local APIC is always uncached, so we don't need to flush + * for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; + + if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* - * Otherwise, do per-cache line flush. Use the sfence + * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache @@ -2215,10 +2345,7 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); - } else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { - if (pmap_kextract(sva) == lapic_paddr) - return; + } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ @@ -2228,17 +2355,17 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); } } +static void +pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); + pmap_invalidate_cache(); +} + /* * Remove the specified set of pages from the data and instruction caches. * @@ -6931,7 +7058,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ - if (pa < dmaplimit && pa + size < dmaplimit) { + if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if (!pmap_change_attr(va, size, mode)) return ((void *)(va + offset)); @@ -6943,7 +7070,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + tmpsize, FALSE); + pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } @@ -7302,7 +7429,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); + pmap_invalidate_cache_range(base, tmpva); } return (error); } @@ -7441,17 +7568,176 @@ pmap_pcid_alloc(pmap_t pmap, u_int cpuid) return (0); } +static uint64_t +pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) +{ + uint64_t cached; + + cached = pmap_pcid_alloc(pmap, cpuid); + KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, + ("pmap %p cpu %d pcid %#x", pmap, cpuid, + pmap->pm_pcids[cpuid].pm_pcid)); + KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || + pmap == kernel_pmap, + ("non-kernel pmap pmap %p cpu %d pcid %#x", + pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); + return (cached); +} + +static void +pmap_activate_sw_pti_post(pmap_t pmap) +{ + + if (pmap->pm_ucr3 != PMAP_NO_CR3) + PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful; +} + +static void inline +pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) +{ + struct invpcid_descr d; + uint64_t cached, cr3, kcr3, ucr3; + + cached = pmap_pcid_alloc_checked(pmap, cpuid); + cr3 = rcr3(); + if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) + load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); + PCPU_SET(curpmap, pmap); + kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; + ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | + PMAP_PCID_USER_PT; + + if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { + /* + * Explicitly invalidate translations cached from the + * user page table. They are not automatically + * flushed by reload of cr3 with the kernel page table + * pointer above. + * + * Note that the if() condition is resolved statically + * by using the function argument instead of + * runtime-evaluated invpcid_works value. + */ + if (invpcid_works1) { + d.pcid = PMAP_PCID_USER_PT | + pmap->pm_pcids[cpuid].pm_pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + } else { + pmap_pti_pcid_invalidate(ucr3, kcr3); + } + } + + PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); + PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); + if (cached) + PCPU_INC(pm_save_cnt); +} + +static void +pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid) +{ + + pmap_activate_sw_pcid_pti(pmap, cpuid, true); + pmap_activate_sw_pti_post(pmap); +} + +static void +pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid) +{ + register_t rflags; + + /* + * If the INVPCID instruction is not available, + * invltlb_pcid_handler() is used to handle an invalidate_all + * IPI, which checks for curpmap == smp_tlb_pmap. The below + * sequence of operations has a window where %CR3 is loaded + * with the new pmap's PML4 address, but the curpmap value has + * not yet been updated. This causes the invltlb IPI handler, + * which is called between the updates, to execute as a NOP, + * which leaves stale TLB entries. + * + * Note that the most typical use of pmap_activate_sw(), from + * the context switch, is immune to this race, because + * interrupts are disabled (while the thread lock is owned), + * and the IPI happens after curpmap is updated. Protect + * other callers in a similar way, by disabling interrupts + * around the %cr3 register reload and curpmap assignment. + */ + rflags = intr_disable(); + pmap_activate_sw_pcid_pti(pmap, cpuid, false); + intr_restore(rflags); + pmap_activate_sw_pti_post(pmap); +} + +static void +pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid) +{ + uint64_t cached, cr3; + + cached = pmap_pcid_alloc_checked(pmap, cpuid); + cr3 = rcr3(); + if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) + load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | + cached); + PCPU_SET(curpmap, pmap); + if (cached) + PCPU_INC(pm_save_cnt); +} + +static void +pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid) +{ + register_t rflags; + + rflags = intr_disable(); + pmap_activate_sw_pcid_nopti(pmap, cpuid); + intr_restore(rflags); +} + +static void +pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused) +{ + + load_cr3(pmap->pm_cr3); + PCPU_SET(curpmap, pmap); +} + +static void +pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused) +{ + + pmap_activate_sw_nopcid_nopti(pmap, cpuid); + PCPU_SET(kcr3, pmap->pm_cr3); + PCPU_SET(ucr3, pmap->pm_ucr3); + pmap_activate_sw_pti_post(pmap); +} + +DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static) +{ + + if (pmap_pcid_enabled && pti && invpcid_works) + return (pmap_activate_sw_pcid_invpcid_pti); + else if (pmap_pcid_enabled && pti && !invpcid_works) + return (pmap_activate_sw_pcid_noinvpcid_pti); + else if (pmap_pcid_enabled && !pti && invpcid_works) + return (pmap_activate_sw_pcid_nopti); + else if (pmap_pcid_enabled && !pti && !invpcid_works) + return (pmap_activate_sw_pcid_noinvpcid_nopti); + else if (!pmap_pcid_enabled && pti) + return (pmap_activate_sw_nopcid_pti); + else /* if (!pmap_pcid_enabled && !pti) */ + return (pmap_activate_sw_nopcid_nopti); +} + void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; - struct invpcid_descr d; - uint64_t cached, cr3, kcr3, kern_pti_cached, rsp0, ucr3; - register_t rflags; u_int cpuid; - struct amd64tss *tssp; - rflags = 0; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) @@ -7462,91 +7748,7 @@ pmap_activate_sw(struct thread *td) #else CPU_SET(cpuid, &pmap->pm_active); #endif - cr3 = rcr3(); - if (pmap_pcid_enabled) { - cached = pmap_pcid_alloc(pmap, cpuid); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && - pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, - ("pmap %p cpu %d pcid %#x", pmap, cpuid, - pmap->pm_pcids[cpuid].pm_pcid)); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || - pmap == kernel_pmap, - ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", - td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); - - /* - * If the INVPCID instruction is not available, - * invltlb_pcid_handler() is used for handle - * invalidate_all IPI, which checks for curpmap == - * smp_tlb_pmap. Below operations sequence has a - * window where %CR3 is loaded with the new pmap's - * PML4 address, but curpmap value is not yet updated. - * This causes invltlb IPI handler, called between the - * updates, to execute as NOP, which leaves stale TLB - * entries. - * - * Note that the most typical use of - * pmap_activate_sw(), from the context switch, is - * immune to this race, because interrupts are - * disabled (while the thread lock is owned), and IPI - * happens after curpmap is updated. Protect other - * callers in a similar way, by disabling interrupts - * around the %cr3 register reload and curpmap - * assignment. - */ - if (!invpcid_works) - rflags = intr_disable(); - - kern_pti_cached = pti ? 0 : cached; - if (!kern_pti_cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { - load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | - kern_pti_cached); - } - PCPU_SET(curpmap, pmap); - if (pti) { - kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; - ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | - PMAP_PCID_USER_PT; - - if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { - /* - * Manually invalidate translations cached - * from the user page table. They are not - * flushed by reload of cr3 with the kernel - * page table pointer above. - */ - if (invpcid_works) { - d.pcid = PMAP_PCID_USER_PT | - pmap->pm_pcids[cpuid].pm_pcid; - d.pad = 0; - d.addr = 0; - invpcid(&d, INVPCID_CTX); - } else { - pmap_pti_pcid_invalidate(ucr3, kcr3); - } - } - - PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); - PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); - } - if (!invpcid_works) - intr_restore(rflags); - if (cached) - PCPU_INC(pm_save_cnt); - } else { - load_cr3(pmap->pm_cr3); - PCPU_SET(curpmap, pmap); - if (pti) { - PCPU_SET(kcr3, pmap->pm_cr3); - PCPU_SET(ucr3, pmap->pm_ucr3); - } - } - if (pmap->pm_ucr3 != PMAP_NO_CR3) { - rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) + - PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful; - tssp = PCPU_GET(tssp); - tssp->tss_rsp0 = rsp0; - } + pmap_activate_sw_mode(pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index db89a6fadd3c..f0086e298ea5 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -41,7 +41,7 @@ .text /* Address: %rdi */ -ENTRY(pagezero) +ENTRY(pagezero_std) PUSH_FRAME_POINTER movq $PAGE_SIZE/8,%rcx xorl %eax,%eax @@ -49,7 +49,17 @@ ENTRY(pagezero) stosq POP_FRAME_POINTER ret -END(pagezero) +END(pagezero_std) + +ENTRY(pagezero_erms) + PUSH_FRAME_POINTER + movq $PAGE_SIZE,%rcx + xorl %eax,%eax + rep + stosb + POP_FRAME_POINTER + ret +END(pagezero_erms) /* * pagecopy(%rdi=from, %rsi=to) @@ -91,6 +101,100 @@ ENTRY(sse2_pagezero) END(sse2_pagezero) /* + * memcmpy(b1, b2, len) + * rdi,rsi,len + */ +ENTRY(memcmp) + PUSH_FRAME_POINTER + cmpq $16,%rdx + jae 5f +1: + testq %rdx,%rdx + je 3f + xorl %ecx,%ecx +2: + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jne 2b +3: + xorl %eax,%eax + POP_FRAME_POINTER + ret +4: + subl %r8d,%eax + POP_FRAME_POINTER + ret +5: + cmpq $32,%rdx + jae 7f +6: + /* + * 8 bytes + */ + movq (%rdi),%r8 + movq (%rsi),%r9 + cmpq %r8,%r9 + jne 1b + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + subq $8,%rdx + cmpq $8,%rdx + jae 6b + jl 1b + jmp 3b +7: + /* + * 32 bytes + */ + movq (%rsi),%r8 + movq 8(%rsi),%r9 + subq (%rdi),%r8 + subq 8(%rdi),%r9 + or %r8,%r9 + jnz 1b + + movq 16(%rsi),%r8 + movq 24(%rsi),%r9 + subq 16(%rdi),%r8 + subq 24(%rdi),%r9 + or %r8,%r9 + jnz 1b + + leaq 32(%rdi),%rdi + leaq 32(%rsi),%rsi + subq $32,%rdx + cmpq $32,%rdx + jae 7b + jnz 1b + jmp 3b +END(memcmp) + +/* * memmove(dst, src, cnt) * rdi, rsi, rdx * Adapted from bcopy written by: @@ -98,40 +202,43 @@ END(sse2_pagezero) */ ENTRY(memmove_std) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ - jb 1f + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ + jb 2f + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ - jne 2f - movq %r9,%rax + jne 1f POP_FRAME_POINTER ret -2: + ALIGN_TEXT +1: rep movsb - movq %r9,%rax POP_FRAME_POINTER ret /* ALIGN_TEXT */ -1: +2: addq %rcx,%rdi /* copy backwards */ addq %rcx,%rsi decq %rdi decq %rsi - andq $7,%rcx /* any fractional bytes? */ std + andq $7,%rcx /* any fractional bytes? */ + je 3f rep movsb +3: movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi @@ -139,24 +246,22 @@ ENTRY(memmove_std) rep movsq cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_std) ENTRY(memmove_erms) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 1f rep movsb - movq %r9,%rax POP_FRAME_POINTER ret @@ -169,7 +274,6 @@ ENTRY(memmove_erms) rep movsb cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_erms) @@ -184,6 +288,8 @@ ENTRY(memcpy_std) PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq @@ -192,6 +298,7 @@ ENTRY(memcpy_std) jne 1f POP_FRAME_POINTER ret + ALIGN_TEXT 1: rep movsb @@ -220,6 +327,8 @@ ENTRY(memset_std) movzbq %sil,%r8 movabs $0x0101010101010101,%rax imulq %r8,%rax + cmpq $15,%rcx + jbe 1f shrq $3,%rcx rep stosq @@ -229,6 +338,7 @@ ENTRY(memset_std) movq %r9,%rax POP_FRAME_POINTER ret + ALIGN_TEXT 1: rep stosb @@ -274,62 +384,27 @@ END(fillw) * returns to *curpcb->pcb_onfault instead of the function. */ +.macro SMAP_DISABLE smap +.if \smap + stac +.endif +.endm + + +.macro SMAP_ENABLE smap +.if \smap + clac +.endif +.endm + /* * copyout(from_kernel, to_user, len) * %rdi, %rsi, %rdx */ -ENTRY(copyout_nosmap) +.macro COPYOUT smap erms PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyout_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. This check is essential - * because it prevents usermode from writing into the kernel. We do - * not verify anywhere else that the user did not specify a rogue - * address. - */ - /* - * First, prevent address wrapping. - */ - movq %rsi,%rax - addq %rdx,%rax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. - */ - movq $VM_MAXUSER_ADDRESS,%rcx - cmpq %rcx,%rax - ja copyout_fault - - xchgq %rdi,%rsi - /* bcopy(%rsi, %rdi, %rdx) */ - movq %rdx,%rcx - - shrq $3,%rcx - rep - movsq - movb %dl,%cl - andb $7,%cl - je done_copyout - rep - movsb - - jmp done_copyout -END(copyout_nosmap) - -ENTRY(copyout_smap) - PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - /* Trap entry clears PSL.AC */ - movq $copyout_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyout + movq PCPU(CURPCB),%r9 + movq $copy_fault,PCB_ONFAULT(%r9) /* * Check explicitly for non-user addresses. If 486 write protection @@ -343,7 +418,7 @@ ENTRY(copyout_smap) */ movq %rsi,%rax addq %rdx,%rax - jc copyout_fault + jc copy_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it @@ -352,119 +427,137 @@ ENTRY(copyout_smap) */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax - ja copyout_fault + ja copy_fault - xchgq %rdi,%rsi - /* bcopy(%rsi, %rdi, %rdx) */ + /* + * Set up arguments for rep movs*. + */ + movq %rdi,%r8 + movq %rsi,%rdi + movq %r8,%rsi movq %rdx,%rcx + /* + * Set return value to zero. Remaining failure mode goes through + * copy_fault. + */ + xorl %eax,%eax + + SMAP_DISABLE \smap +.if \erms == 0 + cmpq $15,%rcx + jbe 1f shrq $3,%rcx - stac rep movsq movb %dl,%cl andb $7,%cl - je 1f + jne 1f + SMAP_ENABLE \smap + movq %rax,PCB_ONFAULT(%r9) + POP_FRAME_POINTER + ret + ALIGN_TEXT +1: +.endif rep movsb -1: clac -done_copyout: - xorl %eax,%eax - movq PCPU(CURPCB),%rdx - movq %rax,PCB_ONFAULT(%rdx) + SMAP_ENABLE \smap + movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret +.endm - ALIGN_TEXT -copyout_fault: - movq PCPU(CURPCB),%rdx - movq $0,PCB_ONFAULT(%rdx) - movq $EFAULT,%rax - POP_FRAME_POINTER - ret -END(copyout_smap) +ENTRY(copyout_nosmap_std) + COPYOUT smap=0 erms=0 +END(copyout_nosmap_std) + +ENTRY(copyout_smap_std) + COPYOUT smap=1 erms=0 +END(copyout_smap_std) + +ENTRY(copyout_nosmap_erms) + COPYOUT smap=0 erms=1 +END(copyout_nosmap_erms) + +ENTRY(copyout_smap_erms) + COPYOUT smap=1 erms=1 +END(copyout_smap_erms) /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ -ENTRY(copyin_nosmap) +.macro COPYIN smap erms PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyin_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyin + movq PCPU(CURPCB),%r9 + movq $copy_fault,PCB_ONFAULT(%r9) /* * make sure address is valid */ movq %rdi,%rax addq %rdx,%rax - jc copyin_fault + jc copy_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax - ja copyin_fault + ja copy_fault - xchgq %rdi,%rsi + movq %rdi,%r8 + movq %rsi,%rdi + movq %r8,%rsi movq %rdx,%rcx - movb %cl,%al - shrq $3,%rcx /* copy longword-wise */ - rep - movsq - movb %al,%cl - andb $7,%cl /* copy remaining bytes */ - je done_copyin - rep - movsb - - jmp done_copyin -END(copyin_nosmap) -ENTRY(copyin_smap) - PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyin_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyin - - /* - * make sure address is valid - */ - movq %rdi,%rax - addq %rdx,%rax - jc copyin_fault - movq $VM_MAXUSER_ADDRESS,%rcx - cmpq %rcx,%rax - ja copyin_fault + xorl %eax,%eax - xchgq %rdi,%rsi - movq %rdx,%rcx - movb %cl,%al + SMAP_DISABLE \smap +.if \erms == 0 + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy longword-wise */ - stac rep movsq - movb %al,%cl + movb %dl,%cl andb $7,%cl /* copy remaining bytes */ - je 1f + jne 1f + SMAP_ENABLE \smap + movq %rax,PCB_ONFAULT(%r9) + POP_FRAME_POINTER + ret + ALIGN_TEXT +1: +.endif rep movsb -1: clac -done_copyin: - xorl %eax,%eax - movq PCPU(CURPCB),%rdx - movq %rax,PCB_ONFAULT(%rdx) + SMAP_ENABLE \smap + movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret -END(copyin_smap) +.endm + +ENTRY(copyin_nosmap_std) + COPYIN smap=0 erms=0 +END(copyin_nosmap_std) + +ENTRY(copyin_smap_std) + COPYIN smap=1 erms=0 +END(copyin_smap_std) + +ENTRY(copyin_nosmap_erms) + COPYIN smap=0 erms=1 +END(copyin_nosmap_erms) + +ENTRY(copyin_smap_erms) + COPYIN smap=1 erms=1 +END(copyin_smap_erms) ALIGN_TEXT -copyin_fault: - movq PCPU(CURPCB),%rdx - movq $0,PCB_ONFAULT(%rdx) - movq $EFAULT,%rax + /* Trap entry clears PSL.AC */ +copy_fault: + movq $0,PCB_ONFAULT(%r9) + movl $EFAULT,%eax POP_FRAME_POINTER ret @@ -748,16 +841,6 @@ ENTRY(fubyte_smap) ret END(fubyte_smap) - ALIGN_TEXT - /* Fault entry clears PSL.AC */ -fusufault: - movq PCPU(CURPCB),%rcx - xorl %eax,%eax - movq %rax,PCB_ONFAULT(%rcx) - decq %rax - POP_FRAME_POINTER - ret - /* * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to * user memory. @@ -909,6 +992,16 @@ ENTRY(subyte_smap) ret END(subyte_smap) + ALIGN_TEXT + /* Fault entry clears PSL.AC */ +fusufault: + movq PCPU(CURPCB),%rcx + xorl %eax,%eax + movq %rax,PCB_ONFAULT(%rcx) + decq %rax + POP_FRAME_POINTER + ret + /* * copyinstr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx @@ -918,109 +1011,97 @@ END(subyte_smap) * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ -ENTRY(copyinstr_nosmap) +.macro COPYINSTR smap PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ - movq %rcx,%r9 /* %r9 = *len */ - xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ - movq PCPU(CURPCB),%rcx - movq $cpystrflt,PCB_ONFAULT(%rcx) + movq PCPU(CURPCB),%r9 + movq $cpystrflt,PCB_ONFAULT(%r9) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ - subq %rsi,%rax + subq %rdi,%rax jbe cpystrflt + SMAP_DISABLE \smap + /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax - jae 1f - movq %rax,%rdx - movq %rax,%r8 + jb 8f 1: incq %rdx - 2: decq %rdx +.if \smap == 0 jz copyinstr_toolong +.else + jz copyinstr_toolong_smap +.endif - lodsb - stosb - orb %al,%al + movb (%rdi),%al + movb %al,(%rsi) + incq %rsi + incq %rdi + testb %al,%al jnz 2b - jmp copyinstr_succ -END(copyinstr_nosmap) + SMAP_ENABLE \smap -ENTRY(copyinstr_smap) - PUSH_FRAME_POINTER - movq %rdx,%r8 /* %r8 = maxlen */ - movq %rcx,%r9 /* %r9 = *len */ - xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ - movq PCPU(CURPCB),%rcx - movq $cpystrflt,PCB_ONFAULT(%rcx) - - movq $VM_MAXUSER_ADDRESS,%rax - - /* make sure 'from' is within bounds */ - subq %rsi,%rax - jbe cpystrflt + /* Success -- 0 byte reached */ + decq %rdx + xorl %eax,%eax - stac + /* set *lencopied and return %eax */ + movq %rax,PCB_ONFAULT(%r9) - /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ - cmpq %rdx,%rax - jae 1f + testq %rcx,%rcx + jz 3f + subq %rdx,%r8 + movq %r8,(%rcx) +3: + POP_FRAME_POINTER + ret + ALIGN_TEXT +8: movq %rax,%rdx movq %rax,%r8 -1: - incq %rdx - -2: - decq %rdx - jz copyinstr_toolong_smap + jmp 1b - lodsb - stosb - orb %al,%al - jnz 2b +.endm - clac +ENTRY(copyinstr_nosmap) + COPYINSTR smap=0 +END(copyinstr_nosmap) -copyinstr_succ: - /* Success -- 0 byte reached */ - decq %rdx - xorl %eax,%eax +ENTRY(copyinstr_smap) + COPYINSTR smap=1 +END(copyinstr_smap) +cpystrflt: + /* Fault entry clears PSL.AC */ + movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ - movq PCPU(CURPCB),%rcx - movq $0,PCB_ONFAULT(%rcx) + movq $0,PCB_ONFAULT(%r9) - testq %r9,%r9 + testq %rcx,%rcx jz 1f subq %rdx,%r8 - movq %r8,(%r9) + movq %r8,(%rcx) 1: POP_FRAME_POINTER ret - /* Fault entry clears PSL.AC */ -cpystrflt: - movq $EFAULT,%rax - jmp cpystrflt_x copyinstr_toolong_smap: clac copyinstr_toolong: /* rdx is zero - return ENAMETOOLONG or EFAULT */ movq $VM_MAXUSER_ADDRESS,%rax - cmpq %rax,%rsi + cmpq %rax,%rdi jae cpystrflt - movq $ENAMETOOLONG,%rax + movl $ENAMETOOLONG,%eax jmp cpystrflt_x -END(copyinstr_smap) - /* * copystr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx @@ -1029,34 +1110,33 @@ ENTRY(copystr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ - xchgq %rdi,%rsi - incq %rdx + incq %rdx 1: decq %rdx jz 4f - lodsb - stosb - orb %al,%al + movb (%rdi),%al + movb %al,(%rsi) + incq %rsi + incq %rdi + testb %al,%al jnz 1b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax - jmp 6f -4: - /* rdx is zero -- return ENAMETOOLONG */ - movq $ENAMETOOLONG,%rax - -6: - +2: testq %rcx,%rcx - jz 7f + jz 3f /* set *lencopied and return %rax */ subq %rdx,%r8 movq %r8,(%rcx) -7: +3: POP_FRAME_POINTER ret +4: + /* rdx is zero -- return ENAMETOOLONG */ + movl $ENAMETOOLONG,%eax + jmp 2b END(copystr) /* diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 4d03da234f19..1a8e5d23ff3a 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -659,12 +659,6 @@ trap(struct trapframe *frame) KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); - /* - * Clear any pending debug exceptions after allowing a - * debugger to read DR6 while stopped in trapsignal(). - */ - if (type == T_TRCTRAP) - load_dr6(0); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), @@ -705,6 +699,17 @@ trap_is_smap(struct trapframe *frame) PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } +static bool +trap_is_pti(struct trapframe *frame) +{ + + return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && + pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | + PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && + (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) == + (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)); +} + static int trap_pfault(struct trapframe *frame, int usermode) { @@ -806,12 +811,8 @@ trap_pfault(struct trapframe *frame, int usermode) * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ - if (usermode && PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && - pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | - PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && - (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)== - (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)) - panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid, + if (usermode && trap_is_pti(frame)) + panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); /* diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 34c3fb868ce3..5b83307bbb68 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -431,8 +431,8 @@ void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); -void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, - boolean_t force); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); +void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 6b8a14224874..b66eda0f4768 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -557,6 +557,7 @@ enum vm_exitcode { VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, VM_EXITCODE_MAX }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index b40846cae6e6..61871e9338eb 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -267,6 +267,9 @@ SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); @@ -2638,6 +2641,19 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6f0e0b2b9554..aecd21f13b4a 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1737,6 +1737,7 @@ restart: break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: vm_inject_ud(vm, vcpuid); break; default: diff --git a/sys/arm/conf/std.armv6 b/sys/arm/conf/std.armv6 index 2f6f9c93af4f..52685a9b13bf 100644 --- a/sys/arm/conf/std.armv6 +++ b/sys/arm/conf/std.armv6 @@ -41,6 +41,8 @@ options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options CAPABILITY_MODE # Capsicum capability mode +options CAPABILITIES # Capsicum capabilites options FREEBSD_BOOT_LOADER # Process metadata passed from loader(8) options VFP # Enable floating point hardware support options MAC # Support for Mandatory Access Control (MAC) diff --git a/sys/arm/conf/std.armv7 b/sys/arm/conf/std.armv7 index 5754f4780fea..c3ab6852e615 100644 --- a/sys/arm/conf/std.armv7 +++ b/sys/arm/conf/std.armv7 @@ -41,6 +41,8 @@ options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options CAPABILITY_MODE # Capsicum capability mode +options CAPABILITIES # Capsicum capabilites options FREEBSD_BOOT_LOADER # Process metadata passed from loader(8) options VFP # Enable floating point hardware support options MAC # Support for Mandatory Access Control (MAC) diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c index 67244adfb0c9..f39eb3558b5f 100644 --- a/sys/arm64/arm64/elf_machdep.c +++ b/sys/arm64/arm64/elf_machdep.c @@ -133,14 +133,14 @@ bool elf_is_ifunc_reloc(Elf_Size r_info __unused) { - return (false); + return (ELF_R_TYPE(r_info) == R_AARCH64_IRELATIVE); } static int elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, int local, elf_lookup_fn lookup) { - Elf_Addr *where, addr, addend; + Elf_Addr *where, addr, addend, val; Elf_Word rtype, symidx; const Elf_Rel *rel; const Elf_Rela *rela; @@ -183,6 +183,12 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, return (-1); *where = addr + addend; break; + case R_AARCH64_IRELATIVE: + addr = relocbase + addend; + val = ((Elf64_Addr (*)(void))addr)(); + if (*where != val) + *where = val; + break; default: printf("kldload: unexpected relocation type %d\n", rtype); return (-1); diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c index a79490441e91..b9e7826e3435 100644 --- a/sys/arm64/arm64/identcpu.c +++ b/sys/arm64/arm64/identcpu.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <machine/atomic.h> #include <machine/cpu.h> #include <machine/cpufunc.h> +#include <machine/undefined.h> static int ident_lock; @@ -87,6 +88,7 @@ struct cpu_desc { }; struct cpu_desc cpu_desc[MAXCPU]; +struct cpu_desc user_cpu_desc; static u_int cpu_print_regs; #define PRINT_ID_AA64_AFR0 0x00000001 #define PRINT_ID_AA64_AFR1 0x00000002 @@ -162,14 +164,249 @@ const struct cpu_implementers cpu_implementers[] = { CPU_IMPLEMENTER_NONE, }; +#define MRS_TYPE_MASK 0xf +#define MRS_INVALID 0 +#define MRS_EXACT 1 +#define MRS_EXACT_VAL(x) (MRS_EXACT | ((x) << 4)) +#define MRS_EXACT_FIELD(x) ((x) >> 4) +#define MRS_LOWER 2 + +struct mrs_field { + bool sign; + u_int type; + u_int shift; +}; + +#define MRS_FIELD(_sign, _type, _shift) \ + { \ + .sign = (_sign), \ + .type = (_type), \ + .shift = (_shift), \ + } + +#define MRS_FIELD_END { .type = MRS_INVALID, } + +static struct mrs_field id_aa64isar0_fields[] = { + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_DP_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SM4_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SM3_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA3_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_RDM_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_ATOMIC_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_CRC32_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA2_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA1_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_AES_SHIFT), + MRS_FIELD_END, +}; + +static struct mrs_field id_aa64isar1_fields[] = { + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_GPI_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_GPA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_LRCPC_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_FCMA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_JSCVT_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_API_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_APA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_DPB_SHIFT), + MRS_FIELD_END, +}; + +static struct mrs_field id_aa64pfr0_fields[] = { + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_SVE_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_RAS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_GIC_SHIFT), + MRS_FIELD(true, MRS_LOWER, ID_AA64PFR0_ADV_SIMD_SHIFT), + MRS_FIELD(true, MRS_LOWER, ID_AA64PFR0_FP_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_EL3_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_EL2_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64PFR0_EL1_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64PFR0_EL0_SHIFT), + MRS_FIELD_END, +}; + +static struct mrs_field id_aa64dfr0_fields[] = { + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_PMS_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_CTX_CMPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_WRPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_BRPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_PMU_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_TRACE_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT_VAL(0x6), ID_AA64DFR0_DEBUG_VER_SHIFT), + MRS_FIELD_END, +}; + +struct mrs_user_reg { + u_int CRm; + u_int Op2; + size_t offset; + struct mrs_field *fields; +}; + +static struct mrs_user_reg user_regs[] = { + { /* id_aa64isar0_el1 */ + .CRm = 6, + .Op2 = 0, + .offset = __offsetof(struct cpu_desc, id_aa64isar0), + .fields = id_aa64isar0_fields, + }, + { /* id_aa64isar1_el1 */ + .CRm = 6, + .Op2 = 1, + .offset = __offsetof(struct cpu_desc, id_aa64isar1), + .fields = id_aa64isar1_fields, + }, + { /* id_aa64pfr0_el1 */ + .CRm = 4, + .Op2 = 0, + .offset = __offsetof(struct cpu_desc, id_aa64pfr0), + .fields = id_aa64pfr0_fields, + }, + { /* id_aa64dfr0_el1 */ + .CRm = 5, + .Op2 = 0, + .offset = __offsetof(struct cpu_desc, id_aa64dfr0), + .fields = id_aa64dfr0_fields, + }, +}; + +#define CPU_DESC_FIELD(desc, idx) \ + *(uint64_t *)((char *)&(desc) + user_regs[(idx)].offset) + +static int +user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, + uint32_t esr) +{ + uint64_t value; + int CRm, Op2, i, reg; + + if ((insn & MRS_MASK) != MRS_VALUE) + return (0); + + /* + * We only emulate Op0 == 3, Op1 == 0, CRn == 0, CRm == {0, 4-7}. + * These are in the EL1 CPU identification space. + * CRm == 0 holds MIDR_EL1, MPIDR_EL1, and REVID_EL1. + * CRm == {4-7} holds the ID_AA64 registers. + * + * For full details see the ARMv8 ARM (ARM DDI 0487C.a) + * Table D9-2 System instruction encodings for non-Debug System + * register accesses. + */ + if (mrs_Op0(insn) != 3 || mrs_Op1(insn) != 0 || mrs_CRn(insn) != 0) + return (0); + + CRm = mrs_CRm(insn); + if (CRm > 7 || (CRm < 4 && CRm != 0)) + return (0); + + Op2 = mrs_Op2(insn); + value = 0; + + for (i = 0; i < nitems(user_regs); i++) { + if (user_regs[i].CRm == CRm && user_regs[i].Op2 == Op2) { + value = CPU_DESC_FIELD(user_cpu_desc, i); + break; + } + } + + if (CRm == 0) { + switch (Op2) { + case 0: + value = READ_SPECIALREG(midr_el1); + break; + case 5: + value = READ_SPECIALREG(mpidr_el1); + break; + case 6: + value = READ_SPECIALREG(revidr_el1); + break; + default: + return (0); + } + } + + /* + * We will handle this instruction, move to the next so we + * don't trap here again. + */ + frame->tf_elr += INSN_SIZE; + + reg = MRS_REGISTER(insn); + /* If reg is 31 then write to xzr, i.e. do nothing */ + if (reg == 31) + return (1); + + if (reg < nitems(frame->tf_x)) + frame->tf_x[reg] = value; + else if (reg == 30) + frame->tf_lr = value; + + return (1); +} + +static void +update_user_regs(u_int cpu) +{ + struct mrs_field *fields; + uint64_t cur, value; + int i, j, cur_field, new_field; + + for (i = 0; i < nitems(user_regs); i++) { + value = CPU_DESC_FIELD(cpu_desc[cpu], i); + if (cpu == 0) + cur = value; + else + cur = CPU_DESC_FIELD(user_cpu_desc, i); + + fields = user_regs[i].fields; + for (j = 0; fields[j].type != 0; j++) { + switch (fields[j].type & MRS_TYPE_MASK) { + case MRS_EXACT: + cur &= ~(0xfu << fields[j].shift); + cur |= + (uint64_t)MRS_EXACT_FIELD(fields[j].type) << + fields[j].shift; + break; + case MRS_LOWER: + new_field = (value >> fields[j].shift) & 0xf; + cur_field = (cur >> fields[j].shift) & 0xf; + if ((fields[j].sign && + (int)new_field < (int)cur_field) || + (!fields[j].sign && + (u_int)new_field < (u_int)cur_field)) { + cur &= ~(0xfu << fields[j].shift); + cur |= new_field << fields[j].shift; + } + break; + default: + panic("Invalid field type: %d", fields[j].type); + } + } + + CPU_DESC_FIELD(user_cpu_desc, i) = cur; + } +} + static void identify_cpu_sysinit(void *dummy __unused) { int cpu; + /* Create a user visible cpu description with safe values */ + memset(&user_cpu_desc, 0, sizeof(user_cpu_desc)); + /* Safe values for these registers */ + user_cpu_desc.id_aa64pfr0 = ID_AA64PFR0_ADV_SIMD_NONE | + ID_AA64PFR0_FP_NONE | ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64; + user_cpu_desc.id_aa64dfr0 = ID_AA64DFR0_DEBUG_VER_8; + + CPU_FOREACH(cpu) { print_cpu_features(cpu); + update_user_regs(cpu); } + + install_undef_handler(true, user_mrs_handler); } SYSINIT(idenrity_cpu, SI_SUB_SMP, SI_ORDER_ANY, identify_cpu_sysinit, NULL); diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index dc243c958a63..613dcbc53a09 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -236,7 +236,7 @@ fill_fpregs(struct thread *td, struct fpreg *regs) regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr; } else #endif - memset(regs->fp_q, 0, sizeof(regs->fp_q)); + memset(regs, 0, sizeof(*regs)); return (0); } @@ -1004,6 +1004,7 @@ initarm(struct arm64_bootparams *abp) boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); + link_elf_ireloc(kmdp); #ifdef FDT try_load_dtb(kmdp); diff --git a/sys/arm64/arm64/undefined.c b/sys/arm64/arm64/undefined.c index a96d5f88ddd5..753558a085dd 100644 --- a/sys/arm64/arm64/undefined.c +++ b/sys/arm64/arm64/undefined.c @@ -53,135 +53,6 @@ struct undef_handler { */ LIST_HEAD(, undef_handler) undef_handlers[2]; -#define MRS_MASK 0xfff00000 -#define MRS_VALUE 0xd5300000 -#define MRS_SPECIAL(insn) ((insn) & 0x000fffe0) -#define MRS_REGISTER(insn) ((insn) & 0x0000001f) -#define MRS_Op0_SHIFT 19 -#define MRS_Op0_MASK 0x00080000 -#define MRS_Op1_SHIFT 16 -#define MRS_Op1_MASK 0x00070000 -#define MRS_CRn_SHIFT 12 -#define MRS_CRn_MASK 0x0000f000 -#define MRS_CRm_SHIFT 8 -#define MRS_CRm_MASK 0x00000f00 -#define MRS_Op2_SHIFT 5 -#define MRS_Op2_MASK 0x000000e0 -#define MRS_Rt_SHIFT 0 -#define MRS_Rt_MASK 0x0000001f - -static inline int -mrs_Op0(uint32_t insn) -{ - - /* op0 is encoded without the top bit in a mrs instruction */ - return (2 | ((insn & MRS_Op0_MASK) >> MRS_Op0_SHIFT)); -} - -#define MRS_GET(op) \ -static inline int \ -mrs_##op(uint32_t insn) \ -{ \ - \ - return ((insn & MRS_##op##_MASK) >> MRS_##op##_SHIFT); \ -} -MRS_GET(Op1) -MRS_GET(CRn) -MRS_GET(CRm) -MRS_GET(Op2) - -struct mrs_safe_value { - u_int CRm; - u_int Op2; - uint64_t value; -}; - -static struct mrs_safe_value safe_values[] = { - { /* id_aa64pfr0_el1 */ - .CRm = 4, - .Op2 = 0, - .value = ID_AA64PFR0_ADV_SIMD_NONE | ID_AA64PFR0_FP_NONE | - ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, - }, - { /* id_aa64dfr0_el1 */ - .CRm = 5, - .Op2 = 0, - .value = ID_AA64DFR0_DEBUG_VER_8, - }, -}; - -static int -user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, - uint32_t esr) -{ - uint64_t value; - int CRm, Op2, i, reg; - - if ((insn & MRS_MASK) != MRS_VALUE) - return (0); - - /* - * We only emulate Op0 == 3, Op1 == 0, CRn == 0, CRm == {0, 4-7}. - * These are in the EL1 CPU identification space. - * CRm == 0 holds MIDR_EL1, MPIDR_EL1, and REVID_EL1. - * CRm == {4-7} holds the ID_AA64 registers. - * - * For full details see the ARMv8 ARM (ARM DDI 0487C.a) - * Table D9-2 System instruction encodings for non-Debug System - * register accesses. - */ - if (mrs_Op0(insn) != 3 || mrs_Op1(insn) != 0 || mrs_CRn(insn) != 0) - return (0); - - CRm = mrs_CRm(insn); - if (CRm > 7 || (CRm < 4 && CRm != 0)) - return (0); - - Op2 = mrs_Op2(insn); - value = 0; - - for (i = 0; i < nitems(safe_values); i++) { - if (safe_values[i].CRm == CRm && safe_values[i].Op2 == Op2) { - value = safe_values[i].value; - break; - } - } - - if (CRm == 0) { - switch (Op2) { - case 0: - value = READ_SPECIALREG(midr_el1); - break; - case 5: - value = READ_SPECIALREG(mpidr_el1); - break; - case 6: - value = READ_SPECIALREG(revidr_el1); - break; - default: - return (0); - } - } - - /* - * We will handle this instruction, move to the next so we - * don't trap here again. - */ - frame->tf_elr += INSN_SIZE; - - reg = MRS_REGISTER(insn); - /* If reg is 31 then write to xzr, i.e. do nothing */ - if (reg == 31) - return (1); - - if (reg < nitems(frame->tf_x)) - frame->tf_x[reg] = value; - else if (reg == 30) - frame->tf_lr = value; - - return (1); -} - /* * Work around a bug in QEMU prior to 2.5.1 where reading unknown ID * registers would raise an exception when they should return 0. @@ -219,7 +90,6 @@ undef_init(void) LIST_INIT(&undef_handlers[0]); LIST_INIT(&undef_handlers[1]); - install_undef_handler(true, user_mrs_handler); install_undef_handler(false, id_aa64mmfr2_handler); } diff --git a/sys/arm64/conf/GENERIC-MMCCAM b/sys/arm64/conf/GENERIC-MMCCAM index 0d1e91cd58c8..ab45fcb8168d 100644 --- a/sys/arm64/conf/GENERIC-MMCCAM +++ b/sys/arm64/conf/GENERIC-MMCCAM @@ -9,6 +9,7 @@ #NO_UNIVERSE include GENERIC +ident GENERIC-MMCCAM # Add CAMDEBUG stuff options CAMDEBUG diff --git a/sys/arm64/include/ifunc.h b/sys/arm64/include/ifunc.h new file mode 100644 index 000000000000..2ce29154180b --- /dev/null +++ b/sys/arm64/include/ifunc.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2015-2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __ARM64_IFUNC_H +#define __ARM64_IFUNC_H + +#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(void))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver(void))args + +#define DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(uint64_t, uint64_t, \ + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, \ + uint64_t))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver(uint64_t _arg1 __unused, \ + uint64_t _arg2 __unused, uint64_t _arg3 __unused, \ + uint64_t _arg4 __unused, uint64_t _arg5 __unused, \ + uint64_t _arg6 __unused, uint64_t _arg7 __unused, \ + uint64_t _arg8 __unused))args + +#endif diff --git a/sys/arm64/include/pte.h b/sys/arm64/include/pte.h index acd3f81ab41c..7aa216e92b43 100644 --- a/sys/arm64/include/pte.h +++ b/sys/arm64/include/pte.h @@ -109,7 +109,7 @@ typedef uint64_t pt_entry_t; /* page table entry */ /* 0x2 also marks an invalid address */ #define L3_PAGE 0x3 -#define PMAP_MAPDEV_EARLY_SIZE (L2_SIZE * 4) +#define PMAP_MAPDEV_EARLY_SIZE (L2_SIZE * 8) #define L0_ENTRIES_SHIFT 9 #define L0_ENTRIES (1 << L0_ENTRIES_SHIFT) diff --git a/sys/arm64/include/undefined.h b/sys/arm64/include/undefined.h index 7fded28c31f2..ee25af6e3f9c 100644 --- a/sys/arm64/include/undefined.h +++ b/sys/arm64/include/undefined.h @@ -36,6 +36,43 @@ typedef int (*undef_handler_t)(vm_offset_t, uint32_t, struct trapframe *, uint32_t); +#define MRS_MASK 0xfff00000 +#define MRS_VALUE 0xd5300000 +#define MRS_SPECIAL(insn) ((insn) & 0x000fffe0) +#define MRS_REGISTER(insn) ((insn) & 0x0000001f) +#define MRS_Op0_SHIFT 19 +#define MRS_Op0_MASK 0x00080000 +#define MRS_Op1_SHIFT 16 +#define MRS_Op1_MASK 0x00070000 +#define MRS_CRn_SHIFT 12 +#define MRS_CRn_MASK 0x0000f000 +#define MRS_CRm_SHIFT 8 +#define MRS_CRm_MASK 0x00000f00 +#define MRS_Op2_SHIFT 5 +#define MRS_Op2_MASK 0x000000e0 +#define MRS_Rt_SHIFT 0 +#define MRS_Rt_MASK 0x0000001f + +static inline int +mrs_Op0(uint32_t insn) +{ + + /* op0 is encoded without the top bit in a mrs instruction */ + return (2 | ((insn & MRS_Op0_MASK) >> MRS_Op0_SHIFT)); +} + +#define MRS_GET(op) \ +static inline int \ +mrs_##op(uint32_t insn) \ +{ \ + \ + return ((insn & MRS_##op##_MASK) >> MRS_##op##_SHIFT); \ +} +MRS_GET(Op1) +MRS_GET(CRn) +MRS_GET(CRm) +MRS_GET(Op2) + void undef_init(void); void *install_undef_handler(bool, undef_handler_t); void remove_undef_handler(void *); diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c index b0d4fe5d11c5..599ebc272629 100644 --- a/sys/cam/scsi/scsi_cd.c +++ b/sys/cam/scsi/scsi_cd.c @@ -63,8 +63,10 @@ __FBSDID("$FreeBSD$"); #include <sys/cdrio.h> #include <sys/dvdio.h> #include <sys/devicestat.h> +#include <sys/proc.h> #include <sys/sbuf.h> #include <sys/sysctl.h> +#include <sys/sysent.h> #include <sys/taskqueue.h> #include <geom/geom_disk.h> @@ -210,6 +212,17 @@ static struct cd_quirk_entry cd_quirk_table[] = } }; +#ifdef COMPAT_FREEBSD32 +struct ioc_read_toc_entry32 { + u_char address_format; + u_char starting_track; + u_short data_len; + uint32_t data; /* (struct cd_toc_entry *) */ +}; +#define CDIOREADTOCENTRYS_32 \ + _IOC_NEWTYPE(CDIOREADTOCENTRYS, struct ioc_read_toc_entry32) +#endif + static disk_open_t cdopen; static disk_close_t cdclose; static disk_ioctl_t cdioctl; @@ -1272,6 +1285,29 @@ cdgetpagesize(int page_num) return (-1); } +static struct cd_toc_entry * +te_data_get_ptr(void *irtep, u_long cmd) +{ + union { + struct ioc_read_toc_entry irte; +#ifdef COMPAT_FREEBSD32 + struct ioc_read_toc_entry32 irte32; +#endif + } *irteup; + + irteup = irtep; + switch (IOCPARM_LEN(cmd)) { + case sizeof(irteup->irte): + return (irteup->irte.data); +#ifdef COMPAT_FREEBSD32 + case sizeof(irteup->irte32): + return ((struct cd_toc_entry *)(uintptr_t)irteup->irte32.data); +#endif + default: + panic("Unhandled ioctl command %ld", cmd); + } +} + static int cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { @@ -1587,6 +1623,9 @@ cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) } break; case CDIOREADTOCENTRYS: +#ifdef COMPAT_FREEBSD32 + case CDIOREADTOCENTRYS_32: +#endif { struct cd_tocdata *data; struct cd_toc_single *lead; @@ -1712,7 +1751,8 @@ cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) } cam_periph_unlock(periph); - error = copyout(data->entries, te->data, len); + error = copyout(data->entries, te_data_get_ptr(te, cmd), + len); free(data, M_SCSICD); free(lead, M_SCSICD); } diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 40f593f9b629..80dcb34a0a78 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -5674,6 +5674,9 @@ dadone_probezone(struct cam_periph *periph, union ccb *done_ccb) } } } + + free(csio->data_ptr, M_SCSIDA); + daprobedone(periph, done_ccb); return; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 0a352b1e9798..9012baa0a994 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -292,8 +292,15 @@ uint_t dbuf_cache_lowater_pct = 10; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, + &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, + &dbuf_metadata_cache_shift, 0, + "dbuf metadata cache size as log2 fraction of ARC"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, + &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 2b5547ce5986..3a25d8463d33 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -28,7 +28,7 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <tsoome@me.com> - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -6923,6 +6923,7 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. + * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; @@ -6944,6 +6945,8 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) !vdev_dtl_required(oldvd)) return (oldvd); + vdev_propagate_state(vd); + /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index 8cd6360e4cf5..bbfb79ce24d3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -46,6 +46,8 @@ struct zfsvfs { zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_root; /* id of root znode */ + struct vnode *z_rootvnode; /* root vnode */ + struct rmlock z_rootvnodelock;/* protection for root vnode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index aa8a400f2d78..7794bd505525 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -415,9 +415,10 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, * least one valid label was found. */ static int -vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) { struct g_provider *pp; + nvlist_t *config; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *buf; size_t buflen; @@ -442,7 +443,6 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) buflen = sizeof(vdev_lists[0]->vp_nvlist); - *config = NULL; /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; @@ -458,6 +458,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) VDEV_LABELS); /* Parse the labels */ + config = *configp = NULL; nlabels = 0; for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) @@ -465,25 +466,27 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) buf = vdev_lists[l]->vp_nvlist; - if (nvlist_unpack(buf, buflen, config, 0) != 0) + if (nvlist_unpack(buf, buflen, &config, 0) != 0) continue; - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(*config); - *config = NULL; + nvlist_free(config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { - nvlist_free(*config); - *config = NULL; + nvlist_free(config); continue; } + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 3fad5a76957b..cf75ab0856c0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -958,6 +958,15 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) vdev_queue_t *vq = &zio->io_vd->vdev_queue; avl_tree_t *tree; + /* + * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio + * code to issue IOs without adding them to the vdev queue. In this + * case, the zio is already going to be issued as quickly as possible + * and so it doesn't need any reprioitization to help. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW) + return; + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 92f51420e321..1ab51ba77a1f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -65,6 +65,7 @@ #include <sys/spa_boot.h> #include <sys/jail.h> #include <ufs/ufs/quota.h> +#include <sys/rmlock.h> #include "zfs_comutil.h" @@ -92,6 +93,9 @@ static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); +static int zfs_root_setvnode(zfsvfs_t *zfsvfs); +static void zfs_root_dropvnode(zfsvfs_t *zfsvfs); + static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); @@ -198,6 +202,8 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) break; default: error = EINVAL; + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); goto done; } } @@ -255,9 +261,11 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; + vfs_unbusy(vfsp); break; case Q_QUOTAOFF: error = ENOTSUP; + vfs_unbusy(vfsp); break; case Q_SETQUOTA: error = copyin(&dqblk, arg, sizeof(dqblk)); @@ -1205,6 +1213,8 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + rm_init(&zfsvfs->z_rootvnodelock, "zfs root vnode lock"); + error = zfsvfs_init(zfsvfs, os); if (error != 0) { *zfvp = NULL; @@ -1311,6 +1321,8 @@ zfsvfs_free(zfsvfs_t *zfsvfs) rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); + rm_destroy(&zfsvfs->z_rootvnodelock); + zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); @@ -1917,6 +1929,8 @@ zfs_mount(vfs_t *vfsp) error = zfs_domount(vfsp, osname); PICKUP_GIANT(); + zfs_root_setvnode((zfsvfs_t *)vfsp->vfs_data); + #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't @@ -1989,14 +2003,65 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) } static int +zfs_root_setvnode(zfsvfs_t *zfsvfs) +{ + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error != 0) + panic("could not zfs_zget for root vnode"); + ZFS_EXIT(zfsvfs); + + rm_wlock(&zfsvfs->z_rootvnodelock); + if (zfsvfs->z_rootvnode != NULL) + panic("zfs mount point already has a root vnode: %p\n", + zfsvfs->z_rootvnode); + zfsvfs->z_rootvnode = ZTOV(rootzp); + rm_wunlock(&zfsvfs->z_rootvnodelock); + return (0); +} + +static void +zfs_root_putvnode(zfsvfs_t *zfsvfs) +{ + struct vnode *vp; + + rm_wlock(&zfsvfs->z_rootvnodelock); + vp = zfsvfs->z_rootvnode; + zfsvfs->z_rootvnode = NULL; + rm_wunlock(&zfsvfs->z_rootvnodelock); + if (vp != NULL) + vrele(vp); +} + +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { + struct rm_priotracker tracker; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + rm_rlock(&zfsvfs->z_rootvnodelock, &tracker); + *vpp = zfsvfs->z_rootvnode; + if (*vpp != NULL && (((*vpp)->v_iflag & VI_DOOMED) == 0)) { + vrefact(*vpp); + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + goto lock; + } + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + /* + * We found the vnode but did not like it. + */ + if (*vpp != NULL) { + *vpp = NULL; + zfs_root_putvnode(zfsvfs); + } + + ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); @@ -2004,6 +2069,7 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) ZFS_EXIT(zfsvfs); if (error == 0) { +lock: error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); @@ -2122,6 +2188,8 @@ zfs_umount(vfs_t *vfsp, int fflag) cred_t *cr = td->td_ucred; int ret; + zfs_root_putvnode(zfsvfs); + ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index a3226cdcd9a0..414178a7bffd 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -56,37 +56,7 @@ __FBSDID("$FreeBSD$"); #include <compat/freebsd32/freebsd32_misc.h> #include <compat/freebsd32/freebsd32_proto.h> -CTASSERT(sizeof(struct ioc_read_toc_entry32) == 8); CTASSERT(sizeof(struct mem_range_op32) == 12); -CTASSERT(sizeof(struct pci_conf_io32) == 36); -CTASSERT(sizeof(struct pci_match_conf32) == 44); -CTASSERT(sizeof(struct pci_conf32) == 44); - -static int -freebsd32_ioctl_ioc_read_toc(struct thread *td, - struct freebsd32_ioctl_args *uap, struct file *fp) -{ - struct ioc_read_toc_entry toce; - struct ioc_read_toc_entry32 toce32; - int error; - - if ((error = copyin(uap->data, &toce32, sizeof(toce32)))) - return (error); - CP(toce32, toce, address_format); - CP(toce32, toce, starting_track); - CP(toce32, toce, data_len); - PTRIN_CP(toce32, toce, data); - - if ((error = fo_ioctl(fp, CDIOREADTOCENTRYS, (caddr_t)&toce, - td->td_ucred, td))) { - CP(toce, toce32, address_format); - CP(toce, toce32, starting_track); - CP(toce, toce32, data_len); - PTROUT_CP(toce, toce32, data); - error = copyout(&toce32, uap->data, sizeof(toce32)); - } - return error; -} static int freebsd32_ioctl_fiodgname(struct thread *td, @@ -148,108 +118,6 @@ freebsd32_ioctl_memrange(struct thread *td, } static int -freebsd32_ioctl_pciocgetconf(struct thread *td, - struct freebsd32_ioctl_args *uap, struct file *fp) -{ - struct pci_conf_io pci; - struct pci_conf_io32 pci32; - struct pci_match_conf32 pmc32; - struct pci_match_conf32 *pmc32p; - struct pci_match_conf pmc; - struct pci_match_conf *pmcp; - struct pci_conf32 pc32; - struct pci_conf32 *pc32p; - struct pci_conf pc; - struct pci_conf *pcp; - u_int32_t i; - u_int32_t npat_to_convert; - u_int32_t nmatch_to_convert; - vm_offset_t addr; - int error; - - if ((error = copyin(uap->data, &pci32, sizeof(pci32))) != 0) - return (error); - - CP(pci32, pci, num_patterns); - CP(pci32, pci, offset); - CP(pci32, pci, generation); - - npat_to_convert = pci32.pat_buf_len / sizeof(struct pci_match_conf32); - pci.pat_buf_len = npat_to_convert * sizeof(struct pci_match_conf); - pci.patterns = NULL; - nmatch_to_convert = pci32.match_buf_len / sizeof(struct pci_conf32); - pci.match_buf_len = nmatch_to_convert * sizeof(struct pci_conf); - pci.matches = NULL; - - if ((error = copyout_map(td, &addr, pci.pat_buf_len)) != 0) - goto cleanup; - pci.patterns = (struct pci_match_conf *)addr; - if ((error = copyout_map(td, &addr, pci.match_buf_len)) != 0) - goto cleanup; - pci.matches = (struct pci_conf *)addr; - - npat_to_convert = min(npat_to_convert, pci.num_patterns); - - for (i = 0, pmc32p = (struct pci_match_conf32 *)PTRIN(pci32.patterns), - pmcp = pci.patterns; - i < npat_to_convert; i++, pmc32p++, pmcp++) { - if ((error = copyin(pmc32p, &pmc32, sizeof(pmc32))) != 0) - goto cleanup; - CP(pmc32,pmc,pc_sel); - strlcpy(pmc.pd_name, pmc32.pd_name, sizeof(pmc.pd_name)); - CP(pmc32,pmc,pd_unit); - CP(pmc32,pmc,pc_vendor); - CP(pmc32,pmc,pc_device); - CP(pmc32,pmc,pc_class); - CP(pmc32,pmc,flags); - if ((error = copyout(&pmc, pmcp, sizeof(pmc))) != 0) - goto cleanup; - } - - if ((error = fo_ioctl(fp, PCIOCGETCONF, (caddr_t)&pci, - td->td_ucred, td)) != 0) - goto cleanup; - - nmatch_to_convert = min(nmatch_to_convert, pci.num_matches); - - for (i = 0, pcp = pci.matches, - pc32p = (struct pci_conf32 *)PTRIN(pci32.matches); - i < nmatch_to_convert; i++, pcp++, pc32p++) { - if ((error = copyin(pcp, &pc, sizeof(pc))) != 0) - goto cleanup; - CP(pc,pc32,pc_sel); - CP(pc,pc32,pc_hdr); - CP(pc,pc32,pc_subvendor); - CP(pc,pc32,pc_subdevice); - CP(pc,pc32,pc_vendor); - CP(pc,pc32,pc_device); - CP(pc,pc32,pc_class); - CP(pc,pc32,pc_subclass); - CP(pc,pc32,pc_progif); - CP(pc,pc32,pc_revid); - strlcpy(pc32.pd_name, pc.pd_name, sizeof(pc32.pd_name)); - CP(pc,pc32,pd_unit); - if ((error = copyout(&pc32, pc32p, sizeof(pc32))) != 0) - goto cleanup; - } - - CP(pci, pci32, num_matches); - CP(pci, pci32, offset); - CP(pci, pci32, generation); - CP(pci, pci32, status); - - error = copyout(&pci32, uap->data, sizeof(pci32)); - -cleanup: - if (pci.patterns) - copyout_unmap(td, (vm_offset_t)pci.patterns, pci.pat_buf_len); - if (pci.matches) - copyout_unmap(td, (vm_offset_t)pci.matches, pci.match_buf_len); - - return (error); -} - -static int freebsd32_ioctl_barmmap(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) { @@ -369,10 +237,6 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) } switch (uap->com) { - case CDIOREADTOCENTRYS_32: - error = freebsd32_ioctl_ioc_read_toc(td, uap, fp); - break; - case FIODGNAME_32: error = freebsd32_ioctl_fiodgname(td, uap, fp); break; @@ -382,10 +246,6 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) error = freebsd32_ioctl_memrange(td, uap, fp); break; - case PCIOCGETCONF_32: - error = freebsd32_ioctl_pciocgetconf(td, uap, fp); - break; - case SG_IO_32: error = freebsd32_ioctl_sg(td, uap, fp); break; diff --git a/sys/compat/freebsd32/freebsd32_ioctl.h b/sys/compat/freebsd32/freebsd32_ioctl.h index fc9c93a7a29b..1d2312b41c14 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.h +++ b/sys/compat/freebsd32/freebsd32_ioctl.h @@ -38,13 +38,6 @@ typedef __uint32_t caddr_t32; -struct ioc_read_toc_entry32 { - u_char address_format; - u_char starting_track; - u_short data_len; - uint32_t data; /* struct cd_toc_entry* */ -}; - struct fiodgname_arg32 { int len; caddr_t32 buf; @@ -56,45 +49,6 @@ struct mem_range_op32 int mo_arg[2]; }; -struct pci_conf32 { - struct pcisel pc_sel; /* domain+bus+slot+function */ - u_int8_t pc_hdr; /* PCI header type */ - u_int16_t pc_subvendor; /* card vendor ID */ - u_int16_t pc_subdevice; /* card device ID, assigned by - card vendor */ - u_int16_t pc_vendor; /* chip vendor ID */ - u_int16_t pc_device; /* chip device ID, assigned by - chip vendor */ - u_int8_t pc_class; /* chip PCI class */ - u_int8_t pc_subclass; /* chip PCI subclass */ - u_int8_t pc_progif; /* chip PCI programming interface */ - u_int8_t pc_revid; /* chip revision ID */ - char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ - u_int32_t pd_unit; /* device unit number */ -}; - -struct pci_match_conf32 { - struct pcisel pc_sel; /* domain+bus+slot+function */ - char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ - u_int32_t pd_unit; /* Unit number */ - u_int16_t pc_vendor; /* PCI Vendor ID */ - u_int16_t pc_device; /* PCI Device ID */ - u_int8_t pc_class; /* PCI class */ - u_int32_t flags; /* Matching expression */ -}; - -struct pci_conf_io32 { - u_int32_t pat_buf_len; /* pattern buffer length */ - u_int32_t num_patterns; /* number of patterns */ - caddr_t32 patterns; /* struct pci_match_conf ptr */ - u_int32_t match_buf_len; /* match buffer length */ - u_int32_t num_matches; /* number of matches returned */ - caddr_t32 matches; /* struct pci_conf ptr */ - u_int32_t offset; /* offset into device list */ - u_int32_t generation; /* device list generation */ - u_int32_t status; /* request status */ -}; - struct pci_bar_mmap32 { uint32_t pbm_map_base; uint32_t pbm_map_length; @@ -106,11 +60,9 @@ struct pci_bar_mmap32 { int pbm_memattr; }; -#define CDIOREADTOCENTRYS_32 _IOWR('c', 5, struct ioc_read_toc_entry32) #define FIODGNAME_32 _IOW('f', 120, struct fiodgname_arg32) #define MEMRANGE_GET32 _IOWR('m', 50, struct mem_range_op32) #define MEMRANGE_SET32 _IOW('m', 51, struct mem_range_op32) -#define PCIOCGETCONF_32 _IOWR('p', 5, struct pci_conf_io32) #define SG_IO_32 _IOWR(SGIOC, 0x85, struct sg_io_hdr32) #define PCIOCBARMMAP_32 _IOWR('p', 8, struct pci_bar_mmap32) diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index 6cc72e423997..66ab072edab8 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -171,6 +171,10 @@ #define FREEBSD32_SYS_setgid 181 #define FREEBSD32_SYS_setegid 182 #define FREEBSD32_SYS_seteuid 183 + /* 184 is obsolete lfs_bmapv */ + /* 185 is obsolete lfs_markv */ + /* 186 is obsolete lfs_segclean */ + /* 187 is obsolete lfs_segwait */ #define FREEBSD32_SYS_freebsd11_freebsd32_stat 188 #define FREEBSD32_SYS_freebsd11_freebsd32_fstat 189 #define FREEBSD32_SYS_freebsd11_freebsd32_lstat 190 @@ -194,6 +198,7 @@ #define FREEBSD32_SYS_freebsd7_freebsd32_semctl 220 #define FREEBSD32_SYS_semget 221 #define FREEBSD32_SYS_semop 222 + /* 223 is obsolete semconfig */ #define FREEBSD32_SYS_freebsd7_freebsd32_msgctl 224 #define FREEBSD32_SYS_msgget 225 #define FREEBSD32_SYS_freebsd32_msgsnd 226 @@ -300,12 +305,24 @@ #define FREEBSD32_SYS_getresgid 361 #define FREEBSD32_SYS_kqueue 362 #define FREEBSD32_SYS_freebsd11_freebsd32_kevent 363 + /* 364 is obsolete __cap_get_proc */ + /* 365 is obsolete __cap_set_proc */ + /* 366 is obsolete __cap_get_fd */ + /* 367 is obsolete __cap_get_file */ + /* 368 is obsolete __cap_set_fd */ + /* 369 is obsolete __cap_set_file */ #define FREEBSD32_SYS_extattr_set_fd 371 #define FREEBSD32_SYS_extattr_get_fd 372 #define FREEBSD32_SYS_extattr_delete_fd 373 #define FREEBSD32_SYS___setugid 374 + /* 375 is obsolete nfsclnt */ #define FREEBSD32_SYS_eaccess 376 #define FREEBSD32_SYS_freebsd32_nmount 378 + /* 379 is obsolete kse_exit */ + /* 380 is obsolete kse_wakeup */ + /* 381 is obsolete kse_create */ + /* 382 is obsolete kse_thr_interrupt */ + /* 383 is obsolete kse_release */ #define FREEBSD32_SYS_kenv 390 #define FREEBSD32_SYS_lchflags 391 #define FREEBSD32_SYS_uuidgen 392 @@ -343,6 +360,7 @@ #define FREEBSD32_SYS_extattr_list_fd 437 #define FREEBSD32_SYS_extattr_list_file 438 #define FREEBSD32_SYS_extattr_list_link 439 + /* 440 is obsolete kse_switchin */ #define FREEBSD32_SYS_freebsd32_ksem_timedwait 441 #define FREEBSD32_SYS_freebsd32_thr_suspend 442 #define FREEBSD32_SYS_thr_wake 443 @@ -455,6 +473,8 @@ #define FREEBSD32_SYS_freebsd32_ppoll 545 #define FREEBSD32_SYS_freebsd32_futimens 546 #define FREEBSD32_SYS_freebsd32_utimensat 547 + /* 548 is obsolete numa_getaffinity */ + /* 549 is obsolete numa_setaffinity */ #define FREEBSD32_SYS_fdatasync 550 #define FREEBSD32_SYS_freebsd32_fstat 551 #define FREEBSD32_SYS_freebsd32_fstatat 552 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index cc554b0efb64..c186771f0704 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -193,10 +193,10 @@ const char *freebsd32_syscallnames[] = { "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ - "#184", /* 184 = lfs_bmapv */ - "#185", /* 185 = lfs_markv */ - "#186", /* 186 = lfs_segclean */ - "#187", /* 187 = lfs_segwait */ + "obs_lfs_bmapv", /* 184 = obsolete lfs_bmapv */ + "obs_lfs_markv", /* 185 = obsolete lfs_markv */ + "obs_lfs_segclean", /* 186 = obsolete lfs_segclean */ + "obs_lfs_segwait", /* 187 = obsolete lfs_segwait */ "compat11.freebsd32_stat", /* 188 = freebsd11 freebsd32_stat */ "compat11.freebsd32_fstat", /* 189 = freebsd11 freebsd32_fstat */ "compat11.freebsd32_lstat", /* 190 = freebsd11 freebsd32_lstat */ @@ -232,7 +232,7 @@ const char *freebsd32_syscallnames[] = { "compat7.freebsd32_semctl", /* 220 = freebsd7 freebsd32_semctl */ "semget", /* 221 = semget */ "semop", /* 222 = semop */ - "#223", /* 223 = semconfig */ + "obs_semconfig", /* 223 = obsolete semconfig */ "compat7.freebsd32_msgctl", /* 224 = freebsd7 freebsd32_msgctl */ "msgget", /* 225 = msgget */ "freebsd32_msgsnd", /* 226 = freebsd32_msgsnd */ @@ -373,26 +373,26 @@ const char *freebsd32_syscallnames[] = { "getresgid", /* 361 = getresgid */ "kqueue", /* 362 = kqueue */ "compat11.freebsd32_kevent", /* 363 = freebsd11 freebsd32_kevent */ - "#364", /* 364 = __cap_get_proc */ - "#365", /* 365 = __cap_set_proc */ - "#366", /* 366 = __cap_get_fd */ - "#367", /* 367 = __cap_get_file */ - "#368", /* 368 = __cap_set_fd */ - "#369", /* 369 = __cap_set_file */ + "obs___cap_get_proc", /* 364 = obsolete __cap_get_proc */ + "obs___cap_set_proc", /* 365 = obsolete __cap_set_proc */ + "obs___cap_get_fd", /* 366 = obsolete __cap_get_fd */ + "obs___cap_get_file", /* 367 = obsolete __cap_get_file */ + "obs___cap_set_fd", /* 368 = obsolete __cap_set_fd */ + "obs___cap_set_file", /* 369 = obsolete __cap_set_file */ "#370", /* 370 = nosys */ "extattr_set_fd", /* 371 = extattr_set_fd */ "extattr_get_fd", /* 372 = extattr_get_fd */ "extattr_delete_fd", /* 373 = extattr_delete_fd */ "__setugid", /* 374 = __setugid */ - "#375", /* 375 = nfsclnt */ + "obs_nfsclnt", /* 375 = obsolete nfsclnt */ "eaccess", /* 376 = eaccess */ "#377", /* 377 = afs_syscall */ "freebsd32_nmount", /* 378 = freebsd32_nmount */ - "#379", /* 379 = kse_exit */ - "#380", /* 380 = kse_wakeup */ - "#381", /* 381 = kse_create */ - "#382", /* 382 = kse_thr_interrupt */ - "#383", /* 383 = kse_release */ + "obs_kse_exit", /* 379 = obsolete kse_exit */ + "obs_kse_wakeup", /* 380 = obsolete kse_wakeup */ + "obs_kse_create", /* 381 = obsolete kse_create */ + "obs_kse_thr_interrupt", /* 382 = obsolete kse_thr_interrupt */ + "obs_kse_release", /* 383 = obsolete kse_release */ "#384", /* 384 = __mac_get_proc */ "#385", /* 385 = __mac_set_proc */ "#386", /* 386 = __mac_get_fd */ @@ -449,7 +449,7 @@ const char *freebsd32_syscallnames[] = { "extattr_list_fd", /* 437 = extattr_list_fd */ "extattr_list_file", /* 438 = extattr_list_file */ "extattr_list_link", /* 439 = extattr_list_link */ - "#440", /* 440 = kse_switchin */ + "obs_kse_switchin", /* 440 = obsolete kse_switchin */ "freebsd32_ksem_timedwait", /* 441 = freebsd32_ksem_timedwait */ "freebsd32_thr_suspend", /* 442 = freebsd32_thr_suspend */ "thr_wake", /* 443 = thr_wake */ @@ -580,8 +580,8 @@ const char *freebsd32_syscallnames[] = { "freebsd32_ppoll", /* 545 = freebsd32_ppoll */ "freebsd32_futimens", /* 546 = freebsd32_futimens */ "freebsd32_utimensat", /* 547 = freebsd32_utimensat */ - "#548", /* 548 = numa_getaffinity */ - "#549", /* 549 = numa_setaffinity */ + "obs_numa_getaffinity", /* 548 = obsolete numa_getaffinity */ + "obs_numa_setaffinity", /* 549 = obsolete numa_setaffinity */ "fdatasync", /* 550 = fdatasync */ "freebsd32_fstat", /* 551 = freebsd32_fstat */ "freebsd32_fstatat", /* 552 = freebsd32_fstatat */ diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 27caed81eefa..1a6ff314dfa2 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -240,10 +240,10 @@ struct sysent freebsd32_sysent[] = { { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = obsolete lfs_bmapv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = obsolete lfs_markv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = obsolete lfs_segclean */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = obsolete lfs_segwait */ { compat11(AS(freebsd11_freebsd32_stat_args),freebsd32_stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = freebsd11 freebsd32_stat */ { compat11(AS(freebsd11_freebsd32_fstat_args),freebsd32_fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = freebsd11 freebsd32_fstat */ { compat11(AS(freebsd11_freebsd32_lstat_args),freebsd32_lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = freebsd11 freebsd32_lstat */ @@ -279,7 +279,7 @@ struct sysent freebsd32_sysent[] = { { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 freebsd32_semctl */ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = obsolete semconfig */ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 freebsd32_msgctl */ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */ { AS(freebsd32_msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = freebsd32_msgsnd */ @@ -420,26 +420,26 @@ struct sysent freebsd32_sysent[] = { { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */ { compat11(AS(freebsd11_freebsd32_kevent_args),freebsd32_kevent), AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = freebsd11 freebsd32_kevent */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = obsolete __cap_get_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = obsolete __cap_set_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = obsolete __cap_get_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = obsolete __cap_get_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = obsolete __cap_set_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = obsolete __cap_set_file */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_SETUGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = obsolete nfsclnt */ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs_syscall */ { AS(freebsd32_nmount_args), (sy_call_t *)freebsd32_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = freebsd32_nmount */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = obsolete kse_exit */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = obsolete kse_wakeup */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = obsolete kse_create */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = obsolete kse_thr_interrupt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = obsolete kse_release */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 384 = __mac_get_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 385 = __mac_set_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 386 = __mac_get_fd */ @@ -496,7 +496,7 @@ struct sysent freebsd32_sysent[] = { { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = obsolete kse_switchin */ { AS(freebsd32_ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = freebsd32_ksem_timedwait */ { AS(freebsd32_thr_suspend_args), (sy_call_t *)freebsd32_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = freebsd32_thr_suspend */ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */ @@ -627,8 +627,8 @@ struct sysent freebsd32_sysent[] = { { AS(freebsd32_ppoll_args), (sy_call_t *)freebsd32_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = freebsd32_ppoll */ { AS(freebsd32_futimens_args), (sy_call_t *)freebsd32_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = freebsd32_futimens */ { AS(freebsd32_utimensat_args), (sy_call_t *)freebsd32_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = freebsd32_utimensat */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = obsolete numa_getaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = obsolete numa_setaffinity */ { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */ { AS(freebsd32_fstat_args), (sy_call_t *)freebsd32_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = freebsd32_fstat */ { AS(freebsd32_fstatat_args), (sy_call_t *)freebsd32_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = freebsd32_fstatat */ diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 99103a3553b5..9b5aafe41244 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -343,10 +343,10 @@ 181 AUE_SETGID NOPROTO { int setgid(gid_t gid); } 182 AUE_SETEGID NOPROTO { int setegid(gid_t egid); } 183 AUE_SETEUID NOPROTO { int seteuid(uid_t euid); } -184 AUE_NULL UNIMPL lfs_bmapv -185 AUE_NULL UNIMPL lfs_markv -186 AUE_NULL UNIMPL lfs_segclean -187 AUE_NULL UNIMPL lfs_segwait +184 AUE_NULL OBSOL lfs_bmapv +185 AUE_NULL OBSOL lfs_markv +186 AUE_NULL OBSOL lfs_segclean +187 AUE_NULL OBSOL lfs_segwait 188 AUE_STAT COMPAT11 { int freebsd32_stat(char *path, \ struct freebsd11_stat32 *ub); } 189 AUE_FSTAT COMPAT11 { int freebsd32_fstat(int fd, \ @@ -414,7 +414,7 @@ int semflg); } 222 AUE_SEMOP NOSTD|NOPROTO { int semop(int semid, \ struct sembuf *sops, u_int nsops); } -223 AUE_NULL UNIMPL semconfig +223 AUE_NULL OBSOL semconfig 224 AUE_MSGCTL COMPAT7|NOSTD { int freebsd32_msgctl( \ int msqid, int cmd, \ struct msqid_ds32_old *buf); } @@ -662,12 +662,12 @@ struct kevent32_freebsd11 *eventlist, \ int nevents, \ const struct timespec32 *timeout); } -364 AUE_NULL UNIMPL __cap_get_proc -365 AUE_NULL UNIMPL __cap_set_proc -366 AUE_NULL UNIMPL __cap_get_fd -367 AUE_NULL UNIMPL __cap_get_file -368 AUE_NULL UNIMPL __cap_set_fd -369 AUE_NULL UNIMPL __cap_set_file +364 AUE_NULL OBSOL __cap_get_proc +365 AUE_NULL OBSOL __cap_set_proc +366 AUE_NULL OBSOL __cap_get_fd +367 AUE_NULL OBSOL __cap_get_file +368 AUE_NULL OBSOL __cap_set_fd +369 AUE_NULL OBSOL __cap_set_file 370 AUE_NULL UNIMPL nosys 371 AUE_EXTATTR_SET_FD NOPROTO { ssize_t extattr_set_fd(int fd, \ int attrnamespace, const char *attrname, \ @@ -679,16 +679,16 @@ int attrnamespace, \ const char *attrname); } 374 AUE_SETUGID NOPROTO { int __setugid(int flag); } -375 AUE_NULL UNIMPL nfsclnt +375 AUE_NULL OBSOL nfsclnt 376 AUE_EACCESS NOPROTO { int eaccess(char *path, int amode); } 377 AUE_NULL UNIMPL afs_syscall 378 AUE_NMOUNT STD { int freebsd32_nmount(struct iovec32 *iovp, \ unsigned int iovcnt, int flags); } -379 AUE_NULL UNIMPL kse_exit -380 AUE_NULL UNIMPL kse_wakeup -381 AUE_NULL UNIMPL kse_create -382 AUE_NULL UNIMPL kse_thr_interrupt -383 AUE_NULL UNIMPL kse_release +379 AUE_NULL OBSOL kse_exit +380 AUE_NULL OBSOL kse_wakeup +381 AUE_NULL OBSOL kse_create +382 AUE_NULL OBSOL kse_thr_interrupt +383 AUE_NULL OBSOL kse_release 384 AUE_NULL UNIMPL __mac_get_proc 385 AUE_NULL UNIMPL __mac_set_proc 386 AUE_NULL UNIMPL __mac_get_fd @@ -787,7 +787,7 @@ 439 AUE_EXTATTR_LIST_LINK NOPROTO { ssize_t extattr_list_link( \ const char *path, int attrnamespace, \ void *data, size_t nbytes); } -440 AUE_NULL UNIMPL kse_switchin +440 AUE_NULL OBSOL kse_switchin 441 AUE_SEMWAIT NOSTD { int freebsd32_ksem_timedwait(semid_t id, \ const struct timespec32 *abstime); } 442 AUE_NULL STD { int freebsd32_thr_suspend( \ @@ -1074,8 +1074,8 @@ 547 AUE_FUTIMESAT STD { int freebsd32_utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } -548 AUE_NULL UNIMPL numa_getaffinity -549 AUE_NULL UNIMPL numa_setaffinity +548 AUE_NULL OBSOL numa_getaffinity +549 AUE_NULL OBSOL numa_setaffinity 550 AUE_FSYNC NOPROTO { int fdatasync(int fd); } 551 AUE_FSTAT STD { int freebsd32_fstat(int fd, \ struct stat32 *ub); } diff --git a/sys/conf/files b/sys/conf/files index b2fcc65773e0..ee18125a034f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4041,7 +4041,6 @@ libkern/murmur3_32.c standard libkern/mcount.c optional profiling-routine libkern/memcchr.c standard libkern/memchr.c standard -libkern/memcmp.c standard libkern/memmem.c optional gdb libkern/qsort.c standard libkern/qsort_r.c standard diff --git a/sys/conf/files.arm b/sys/conf/files.arm index 98d452a8f7ab..087f4c695fa1 100644 --- a/sys/conf/files.arm +++ b/sys/conf/files.arm @@ -163,6 +163,7 @@ libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/lshrdi3.c standard +libkern/memcmp.c standard libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 76e9e8e36479..010fbc7460c7 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -244,6 +244,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard libkern/arm64/crc32c_armv8.S standard cddl/contrib/opensolaris/common/atomic/aarch64/opensolaris_atomic.S optional zfs | dtrace compile-with "${CDDL_C}" diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index d2591137a990..f7a86cf5ee49 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -548,6 +548,7 @@ kern/subr_sfbuf.c standard libkern/divdi3.c standard libkern/ffsll.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard diff --git a/sys/conf/files.mips b/sys/conf/files.mips index 07448f44497c..1977fb9dcce2 100644 --- a/sys/conf/files.mips +++ b/sys/conf/files.mips @@ -65,6 +65,7 @@ libkern/cmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ashldi3.c standard libkern/ashrdi3.c standard +libkern/memcmp.c standard # cfe support dev/cfe/cfe_api.c optional cfe diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index d6df0c878641..faa057bfc882 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -98,6 +98,7 @@ libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/lshrdi3.c optional powerpc | powerpcspe +libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc | powerpcspe libkern/qdivrem.c optional powerpc | powerpcspe diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index daba1826331b..8a70023eb91a 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -22,6 +22,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard riscv/riscv/autoconf.c standard riscv/riscv/bus_machdep.c standard diff --git a/sys/conf/files.sparc64 b/sys/conf/files.sparc64 index 7d8cc3c5cb88..ed61aa49648e 100644 --- a/sys/conf/files.sparc64 +++ b/sys/conf/files.sparc64 @@ -71,6 +71,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard sparc64/central/central.c optional central sparc64/ebus/ebus.c optional ebus sparc64/ebus/epic.c optional epic ebus diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index a663deb59d1f..55072f1046f8 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -121,10 +121,12 @@ CFLAGS+= ${CONF_CFLAGS} LDFLAGS+= -Wl,--build-id=sha1 .endif -.if ${MACHINE_CPUARCH} == "amd64" -.if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" -.error amd64 kernel requires linker ifunc support +.if (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ + ${MACHINE_CPUARCH} == "i386") && \ + defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" +.error amd64/arm64/i386 kernel requires linker ifunc support .endif +.if ${MACHINE_CPUARCH} == "amd64" LDFLAGS+= -Wl,-z max-page-size=2097152 -Wl,-z common-page-size=4096 -Wl,-z -Wl,ifunc-noplt .endif diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index 9e83b6379f51..5e749d52126d 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -46,7 +46,7 @@ TYPE="FreeBSD" REVISION="12.0" -BRANCH="ALPHA5" +BRANCH="ALPHA8" if [ -n "${BRANCH_OVERRIDE}" ]; then BRANCH=${BRANCH_OVERRIDE} fi @@ -183,7 +183,7 @@ done if findvcs .git; then for dir in /usr/bin /usr/local/bin; do if [ -x "${dir}/git" ] ; then - git_cmd="${dir}/git --git-dir=${VCSDIR}" + git_cmd="${dir}/git -c help.autocorrect=0 --git-dir=${VCSDIR}" break fi done @@ -293,7 +293,7 @@ done shift $((OPTIND - 1)) if [ -z "${include_metadata}" ]; then - VERINFO="${VERSION} ${svn}${git}${hg}${p4version}" + VERINFO="${VERSION}${svn}${git}${hg}${p4version} ${i}" VERSTR="${VERINFO}\\n" else VERINFO="${VERSION} #${v}${svn}${git}${hg}${p4version}: ${t}" diff --git a/sys/crypto/ccp/ccp.c b/sys/crypto/ccp/ccp.c index 163fa748b2ef..1983eac79444 100644 --- a/sys/crypto/ccp/ccp.c +++ b/sys/crypto/ccp/ccp.c @@ -735,7 +735,7 @@ MODULE_VERSION(ccp, 1); MODULE_DEPEND(ccp, crypto, 1, 1, 1); MODULE_DEPEND(ccp, random_device, 1, 1, 1); #if 0 /* There are enough known issues that we shouldn't load automatically */ -MODULE_PNP_INFO("W32:vendor/device", pci, ccp, ccp_ids, sizeof(ccp_ids[0]), +MODULE_PNP_INFO("W32:vendor/device", pci, ccp, ccp_ids, nitems(ccp_ids)); #endif diff --git a/sys/dev/aac/aac_pci.c b/sys/dev/aac/aac_pci.c index 399e55a9a47f..738bfbf79167 100644 --- a/sys/dev/aac/aac_pci.c +++ b/sys/dev/aac/aac_pci.c @@ -494,7 +494,7 @@ static driver_t aacch_driver = { static devclass_t aacch_devclass; DRIVER_MODULE(aacch, pci, aacch_driver, aacch_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;", pci, aac, - aac_identifiers, sizeof(aac_identifiers[0]), nitems(aac_identifiers) - 1); + aac_identifiers, nitems(aac_identifiers) - 1); static int aacch_probe(device_t dev) diff --git a/sys/dev/aacraid/aacraid_pci.c b/sys/dev/aacraid/aacraid_pci.c index 74ac0ee23312..f445a073b6d5 100644 --- a/sys/dev/aacraid/aacraid_pci.c +++ b/sys/dev/aacraid/aacraid_pci.c @@ -106,7 +106,7 @@ struct aac_ident DRIVER_MODULE(aacraid, pci, aacraid_pci_driver, aacraid_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, aacraid, - aacraid_family_identifiers, sizeof(aacraid_family_identifiers[0]), + aacraid_family_identifiers, nitems(aacraid_family_identifiers) - 1); MODULE_DEPEND(aacraid, pci, 1, 1, 1); diff --git a/sys/dev/adlink/adlink.c b/sys/dev/adlink/adlink.c index 2ae3d768c2fd..19b2f9388c6f 100644 --- a/sys/dev/adlink/adlink.c +++ b/sys/dev/adlink/adlink.c @@ -438,6 +438,6 @@ static driver_t adlink_driver = { }; DRIVER_MODULE(adlink, pci, adlink_driver, adlink_devclass, 0, 0); -MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, adlink, adlink_id, sizeof(adlink_id[0]), +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, adlink, adlink_id, nitems(adlink_id)); #endif /* _KERNEL */ diff --git a/sys/dev/ae/if_ae.c b/sys/dev/ae/if_ae.c index f70afb27fa56..cb9474db8553 100644 --- a/sys/dev/ae/if_ae.c +++ b/sys/dev/ae/if_ae.c @@ -178,7 +178,7 @@ static devclass_t ae_devclass; DRIVER_MODULE(ae, pci, ae_driver, ae_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ae, ae_devs, - sizeof(ae_devs[0]), nitems(ae_devs)); + nitems(ae_devs)); DRIVER_MODULE(miibus, ae, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(ae, pci, 1, 1, 1); MODULE_DEPEND(ae, ether, 1, 1, 1); diff --git a/sys/dev/age/if_age.c b/sys/dev/age/if_age.c index 764363f19856..ee52564d7cc8 100644 --- a/sys/dev/age/if_age.c +++ b/sys/dev/age/if_age.c @@ -184,7 +184,7 @@ static devclass_t age_devclass; DRIVER_MODULE(age, pci, age_driver, age_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, age, age_devs, - sizeof(age_devs[0]), nitems(age_devs)); + nitems(age_devs)); DRIVER_MODULE(miibus, age, miibus_driver, miibus_devclass, 0, 0); static struct resource_spec age_res_spec_mem[] = { diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c index a117a5720c86..2bc2a243837b 100644 --- a/sys/dev/ahci/ahci_pci.c +++ b/sys/dev/ahci/ahci_pci.c @@ -667,7 +667,7 @@ static driver_t ahci_driver = { DRIVER_MODULE(ahci, pci, ahci_driver, ahci_devclass, NULL, NULL); /* Also matches class / subclass / progid XXX need to add when we have masking support */ MODULE_PNP_INFO("W32:vendor/device", pci, ahci, ahci_ids, - sizeof(ahci_ids[0]), nitems(ahci_ids) - 1); + nitems(ahci_ids) - 1); static device_method_t ahci_ata_methods[] = { DEVMETHOD(device_probe, ahci_ata_probe), DEVMETHOD(device_attach, ahci_pci_attach), diff --git a/sys/dev/alc/if_alc.c b/sys/dev/alc/if_alc.c index ede7bd49c193..1bafdab235d0 100644 --- a/sys/dev/alc/if_alc.c +++ b/sys/dev/alc/if_alc.c @@ -244,7 +244,7 @@ static devclass_t alc_devclass; DRIVER_MODULE(alc, pci, alc_driver, alc_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, alc, alc_ident_table, - sizeof(alc_ident_table[0]), nitems(alc_ident_table) - 1); + nitems(alc_ident_table) - 1); DRIVER_MODULE(miibus, alc, miibus_driver, miibus_devclass, 0, 0); static struct resource_spec alc_res_spec_mem[] = { diff --git a/sys/dev/ale/if_ale.c b/sys/dev/ale/if_ale.c index 987bd2db7421..802fdafd63cf 100644 --- a/sys/dev/ale/if_ale.c +++ b/sys/dev/ale/if_ale.c @@ -179,7 +179,7 @@ static devclass_t ale_devclass; DRIVER_MODULE(ale, pci, ale_driver, ale_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ale, ale_devs, - sizeof(ale_devs[0]), nitems(ale_devs)); + nitems(ale_devs)); DRIVER_MODULE(miibus, ale, miibus_driver, miibus_devclass, NULL, NULL); static struct resource_spec ale_res_spec_mem[] = { diff --git a/sys/dev/amdsmn/amdsmn.c b/sys/dev/amdsmn/amdsmn.c index fb2c8b17328c..17792dd922cd 100644 --- a/sys/dev/amdsmn/amdsmn.c +++ b/sys/dev/amdsmn/amdsmn.c @@ -90,7 +90,7 @@ static devclass_t amdsmn_devclass; DRIVER_MODULE(amdsmn, hostb, amdsmn_driver, amdsmn_devclass, NULL, NULL); MODULE_VERSION(amdsmn, 1); MODULE_PNP_INFO("W32:vendor/device", pci, amdsmn, amdsmn_ids, - sizeof(amdsmn_ids[0]), nitems(amdsmn_ids)); + nitems(amdsmn_ids)); static bool amdsmn_match(device_t parent) diff --git a/sys/dev/amdtemp/amdtemp.c b/sys/dev/amdtemp/amdtemp.c index 45f8d6397ed0..2463212c25f5 100644 --- a/sys/dev/amdtemp/amdtemp.c +++ b/sys/dev/amdtemp/amdtemp.c @@ -167,7 +167,7 @@ DRIVER_MODULE(amdtemp, hostb, amdtemp_driver, amdtemp_devclass, NULL, NULL); MODULE_VERSION(amdtemp, 1); MODULE_DEPEND(amdtemp, amdsmn, 1, 1, 1); MODULE_PNP_INFO("U16:vendor;U16:device", pci, amdtemp, amdtemp_products, - sizeof(amdtemp_products[0]), nitems(amdtemp_products)); + nitems(amdtemp_products)); static int amdtemp_match(device_t dev) diff --git a/sys/dev/amr/amr_pci.c b/sys/dev/amr/amr_pci.c index 80cd58b5ccd3..25b37eda3895 100644 --- a/sys/dev/amr/amr_pci.c +++ b/sys/dev/amr/amr_pci.c @@ -142,7 +142,7 @@ static struct amr_ident static devclass_t amr_devclass; DRIVER_MODULE(amr, pci, amr_pci_driver, amr_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, amr, amr_device_ids, - sizeof(amr_device_ids[0]), nitems(amr_device_ids) - 1); + nitems(amr_device_ids) - 1); MODULE_DEPEND(amr, pci, 1, 1, 1); MODULE_DEPEND(amr, cam, 1, 1, 1); diff --git a/sys/dev/an/if_an_pci.c b/sys/dev/an/if_an_pci.c index 1105a963128e..15ae3e320e54 100644 --- a/sys/dev/an/if_an_pci.c +++ b/sys/dev/an/if_an_pci.c @@ -274,6 +274,6 @@ static devclass_t an_devclass; DRIVER_MODULE(an, pci, an_pci_driver, an_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, an, - an_devs, sizeof(an_devs[0]), nitems(an_devs) - 1); + an_devs, nitems(an_devs) - 1); MODULE_DEPEND(an, pci, 1, 1, 1); MODULE_DEPEND(an, wlan, 1, 1, 1); diff --git a/sys/dev/bce/if_bce.c b/sys/dev/bce/if_bce.c index 37eed4abcbc0..3d5c0742580c 100644 --- a/sys/dev/bce/if_bce.c +++ b/sys/dev/bce/if_bce.c @@ -530,7 +530,7 @@ MODULE_DEPEND(bce, miibus, 1, 1, 1); DRIVER_MODULE(bce, pci, bce_driver, bce_devclass, NULL, NULL); DRIVER_MODULE(miibus, bce, miibus_driver, miibus_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;U16:#;U16:#;D:#", pci, bce, - bce_devs, sizeof(bce_devs[0]), nitems(bce_devs) - 1); + bce_devs, nitems(bce_devs) - 1); /****************************************************************************/ /* Tunable device values */ diff --git a/sys/dev/bfe/if_bfe.c b/sys/dev/bfe/if_bfe.c index fb0c2949c82d..6a86f3488b4c 100644 --- a/sys/dev/bfe/if_bfe.c +++ b/sys/dev/bfe/if_bfe.c @@ -158,7 +158,7 @@ static devclass_t bfe_devclass; DRIVER_MODULE(bfe, pci, bfe_driver, bfe_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bfe, bfe_devs, - sizeof(bfe_devs[0]), nitems(bfe_devs) - 1); + nitems(bfe_devs) - 1); DRIVER_MODULE(miibus, bfe, miibus_driver, miibus_devclass, 0, 0); /* diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c index c7134b9a2abd..193116e10681 100644 --- a/sys/dev/bge/if_bge.c +++ b/sys/dev/bge/if_bge.c @@ -548,7 +548,7 @@ static devclass_t bge_devclass; DRIVER_MODULE(bge, pci, bge_driver, bge_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, bge, bge_devs, - sizeof(bge_devs[0]), nitems(bge_devs) - 1); + nitems(bge_devs) - 1); DRIVER_MODULE(miibus, bge, miibus_driver, miibus_devclass, 0, 0); static int bge_allow_asf = 1; diff --git a/sys/dev/bwi/if_bwi_pci.c b/sys/dev/bwi/if_bwi_pci.c index 63378be0452d..f95ef854ceaa 100644 --- a/sys/dev/bwi/if_bwi_pci.c +++ b/sys/dev/bwi/if_bwi_pci.c @@ -257,7 +257,7 @@ static driver_t bwi_driver = { static devclass_t bwi_devclass; DRIVER_MODULE(bwi, pci, bwi_driver, bwi_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwi, bwi_devices, - sizeof(bwi_devices[0]), nitems(bwi_devices) - 1); + nitems(bwi_devices) - 1); MODULE_DEPEND(bwi, wlan, 1, 1, 1); /* 802.11 media layer */ MODULE_DEPEND(bwi, firmware, 1, 1, 1); /* firmware support */ MODULE_DEPEND(bwi, wlan_amrr, 1, 1, 1); diff --git a/sys/dev/bwn/if_bwn_pci.c b/sys/dev/bwn/if_bwn_pci.c index dfe7b50e7bc0..610f8bdeb823 100644 --- a/sys/dev/bwn/if_bwn_pci.c +++ b/sys/dev/bwn/if_bwn_pci.c @@ -296,9 +296,9 @@ DEFINE_CLASS_0(bwn_pci, bwn_pci_driver, bwn_pci_methods, DRIVER_MODULE_ORDERED(bwn_pci, pci, bwn_pci_driver, bwn_pci_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwn_siba, - siba_devices, sizeof(siba_devices[0]), nitems(siba_devices) - 1); + siba_devices, nitems(siba_devices) - 1); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwn_bcma, - bcma_devices, sizeof(bcma_devices[0]), nitems(bcma_devices) - 1); + bcma_devices, nitems(bcma_devices) - 1); DRIVER_MODULE(bhndb, bwn_pci, bhndb_pci_driver, bhndb_devclass, NULL, NULL); MODULE_DEPEND(bwn_pci, bwn, 1, 1, 1); diff --git a/sys/dev/bxe/bxe.c b/sys/dev/bxe/bxe.c index 393b78f540ff..4ef92e39d1f9 100644 --- a/sys/dev/bxe/bxe.c +++ b/sys/dev/bxe/bxe.c @@ -7076,13 +7076,13 @@ bxe_link_attn(struct bxe_softc *sc) if (sc->state == BXE_STATE_OPEN) { bxe_stats_handle(sc, STATS_EVENT_LINK_UP); + /* Restart tx when the link comes back. */ + FOR_EACH_ETH_QUEUE(sc, i) { + fp = &sc->fp[i]; + taskqueue_enqueue(fp->tq, &fp->tx_task); + } } - /* Restart tx when the link comes back. */ - FOR_EACH_ETH_QUEUE(sc, i) { - fp = &sc->fp[i]; - taskqueue_enqueue(fp->tq, &fp->tx_task); - } } if (sc->link_vars.link_up && sc->link_vars.line_speed) { @@ -16279,9 +16279,11 @@ bxe_shutdown(device_t dev) /* stop the periodic callout */ bxe_periodic_stop(sc); - BXE_CORE_LOCK(sc); - bxe_nic_unload(sc, UNLOAD_NORMAL, FALSE); - BXE_CORE_UNLOCK(sc); + if (sc->state != BXE_STATE_CLOSED) { + BXE_CORE_LOCK(sc); + bxe_nic_unload(sc, UNLOAD_NORMAL, FALSE); + BXE_CORE_UNLOCK(sc); + } return (0); } diff --git a/sys/dev/cas/if_cas.c b/sys/dev/cas/if_cas.c index 3e6dbfe7b463..d1b3761302a1 100644 --- a/sys/dev/cas/if_cas.c +++ b/sys/dev/cas/if_cas.c @@ -2617,7 +2617,7 @@ static const struct cas_pci_dev { DRIVER_MODULE(cas, pci, cas_pci_driver, cas_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device", pci, cas, cas_pci_devlist, - sizeof(cas_pci_devlist[0]), nitems(cas_pci_devlist) - 1); + nitems(cas_pci_devlist) - 1); DRIVER_MODULE(miibus, cas, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(cas, pci, 1, 1, 1); diff --git a/sys/dev/ciss/ciss.c b/sys/dev/ciss/ciss.c index 94bd9c69e1c5..e4b887a38e60 100644 --- a/sys/dev/ciss/ciss.c +++ b/sys/dev/ciss/ciss.c @@ -365,7 +365,7 @@ static struct static devclass_t ciss_devclass; DRIVER_MODULE(ciss, pci, ciss_pci_driver, ciss_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;", pci, ciss, ciss_vendor_data, - sizeof(ciss_vendor_data[0]), nitems(ciss_vendor_data) - 1); + nitems(ciss_vendor_data) - 1); MODULE_DEPEND(ciss, cam, 1, 1, 1); MODULE_DEPEND(ciss, pci, 1, 1, 1); diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c index 33ac83eb731a..4f91b4fbbf28 100644 --- a/sys/dev/cpuctl/cpuctl.c +++ b/sys/dev/cpuctl/cpuctl.c @@ -362,7 +362,7 @@ update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td) set_cpu(cpu, td); critical_enter(); - ret = ucode_intel_load(ptr, true); + ret = ucode_intel_load(ptr, true, NULL, NULL); critical_exit(); restore_cpu(oldcpu, is_bound, td); diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index 127a61d61c21..a47aa909c237 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -133,6 +133,30 @@ static void cxgb_update_mac_settings(struct port_info *p); static int toe_capability(struct port_info *, int); #endif +/* Table for probing the cards. The desc field isn't actually used */ +struct cxgb_ident { + uint16_t vendor; + uint16_t device; + int index; + char *desc; +} cxgb_identifiers[] = { + {PCI_VENDOR_ID_CHELSIO, 0x0020, 0, "PE9000"}, + {PCI_VENDOR_ID_CHELSIO, 0x0021, 1, "T302E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0022, 2, "T310E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0023, 3, "T320X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0024, 1, "T302X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0025, 3, "T320E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0026, 2, "T310X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0030, 2, "T3B10"}, + {PCI_VENDOR_ID_CHELSIO, 0x0031, 3, "T3B20"}, + {PCI_VENDOR_ID_CHELSIO, 0x0032, 1, "T3B02"}, + {PCI_VENDOR_ID_CHELSIO, 0x0033, 4, "T3B04"}, + {PCI_VENDOR_ID_CHELSIO, 0x0035, 6, "T3C10"}, + {PCI_VENDOR_ID_CHELSIO, 0x0036, 3, "S320E-CR"}, + {PCI_VENDOR_ID_CHELSIO, 0x0037, 7, "N320E-G2"}, + {0, 0, 0, NULL} +}; + static device_method_t cxgb_controller_methods[] = { DEVMETHOD(device_probe, cxgb_controller_probe), DEVMETHOD(device_attach, cxgb_controller_attach), @@ -151,6 +175,8 @@ static int cxgbc_mod_event(module_t, int, void *); static devclass_t cxgb_controller_devclass; DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass, cxgbc_mod_event, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, cxgbc, cxgb_identifiers, + nitems(cxgb_identifiers) - 1); MODULE_VERSION(cxgbc, 1); MODULE_DEPEND(cxgbc, firmware, 1, 1, 1); @@ -280,29 +306,6 @@ enum { FILTER_NO_VLAN_PRI = 7 }; #define PORT_MASK ((1 << MAX_NPORTS) - 1) -/* Table for probing the cards. The desc field isn't actually used */ -struct cxgb_ident { - uint16_t vendor; - uint16_t device; - int index; - char *desc; -} cxgb_identifiers[] = { - {PCI_VENDOR_ID_CHELSIO, 0x0020, 0, "PE9000"}, - {PCI_VENDOR_ID_CHELSIO, 0x0021, 1, "T302E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0022, 2, "T310E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0023, 3, "T320X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0024, 1, "T302X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0025, 3, "T320E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0026, 2, "T310X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0030, 2, "T3B10"}, - {PCI_VENDOR_ID_CHELSIO, 0x0031, 3, "T3B20"}, - {PCI_VENDOR_ID_CHELSIO, 0x0032, 1, "T3B02"}, - {PCI_VENDOR_ID_CHELSIO, 0x0033, 4, "T3B04"}, - {PCI_VENDOR_ID_CHELSIO, 0x0035, 6, "T3C10"}, - {PCI_VENDOR_ID_CHELSIO, 0x0036, 3, "S320E-CR"}, - {PCI_VENDOR_ID_CHELSIO, 0x0037, 7, "N320E-G2"}, - {0, 0, 0, NULL} -}; static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset); diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 866bace84c36..f23d16479ee8 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -289,7 +289,6 @@ struct port_info { uint8_t rx_e_chan_map; /* rx TP e-channel bitmap */ struct link_config link_cfg; - struct link_config old_link_cfg; struct ifmedia media; struct timeval last_refreshed; @@ -1073,52 +1072,6 @@ t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[]) bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN); } -static inline bool -is_10G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0); -} - -static inline bool -is_25G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_25G) != 0); -} - -static inline bool -is_40G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) != 0); -} - -static inline bool -is_100G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) != 0); -} - -static inline int -port_top_speed(const struct port_info *pi) -{ - - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) - return (100); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) - return (40); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_25G) - return (25); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) - return (10); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G) - return (1); - - return (0); -} - static inline int tx_resume_threshold(struct sge_eq *eq) { diff --git a/sys/dev/cxgbe/common/common.h b/sys/dev/cxgbe/common/common.h index 044582db6ede..ada4fa4fa775 100644 --- a/sys/dev/cxgbe/common/common.h +++ b/sys/dev/cxgbe/common/common.h @@ -66,9 +66,10 @@ enum { }; enum { + FEC_NONE = 0, FEC_RS = 1 << 0, FEC_BASER_RS = 1 << 1, - FEC_RESERVED = 1 << 2, + FEC_AUTO = 1 << 5, /* M_FW_PORT_CAP32_FEC + 1 */ }; enum t4_bar2_qtype { T4_BAR2_QTYPE_EGRESS, T4_BAR2_QTYPE_INGRESS }; @@ -368,6 +369,7 @@ struct adapter_params { unsigned int ethoffload:1; unsigned int hash_filter:1; unsigned int filter2_wr_support:1; + unsigned int port_caps32:1; unsigned int ofldq_wr_cred; unsigned int eo_wr_cred; @@ -409,20 +411,21 @@ struct trace_params { }; struct link_config { - /* OS-specific code owns all the requested_* fields */ - unsigned char requested_aneg; /* link aneg user has requested */ - unsigned char requested_fc; /* flow control user has requested */ - unsigned char requested_fec; /* FEC user has requested */ - unsigned int requested_speed; /* speed user has requested (Mbps) */ + /* OS-specific code owns all the requested_* fields. */ + int8_t requested_aneg; /* link autonegotiation */ + int8_t requested_fc; /* flow control */ + int8_t requested_fec; /* FEC */ + u_int requested_speed; /* speed (Mbps) */ - unsigned short supported; /* link capabilities */ - unsigned short advertising; /* advertised capabilities */ - unsigned short lp_advertising; /* peer advertised capabilities */ - unsigned int speed; /* actual link speed (Mbps) */ - unsigned char fc; /* actual link flow control */ - unsigned char fec; /* actual FEC */ - unsigned char link_ok; /* link up? */ - unsigned char link_down_rc; /* link down reason */ + uint32_t supported; /* link capabilities */ + uint32_t advertising; /* advertised capabilities */ + uint32_t lp_advertising; /* peer advertised capabilities */ + uint32_t fec_hint; /* use this fec */ + u_int speed; /* actual link speed (Mbps) */ + int8_t fc; /* actual link flow control */ + int8_t fec; /* actual FEC */ + bool link_ok; /* link up? */ + uint8_t link_down_rc; /* link down reason */ }; #include "adapter.h" @@ -874,5 +877,16 @@ int t4vf_prep_adapter(struct adapter *adapter); int t4_bar2_sge_qregs(struct adapter *adapter, unsigned int qid, enum t4_bar2_qtype qtype, int user, u64 *pbar2_qoffset, unsigned int *pbar2_qid); +unsigned int fwcap_to_speed(uint32_t caps); +uint32_t speed_to_fwcap(unsigned int speed); +uint32_t fwcap_top_speed(uint32_t caps); + +static inline int +port_top_speed(const struct port_info *pi) +{ + + /* Mbps -> Gbps */ + return (fwcap_to_speed(pi->link_cfg.supported) / 1000); +} #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index 2312b66cf7a8..cec8ebbf00e7 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -3756,6 +3756,93 @@ void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf) } /** + * fwcaps16_to_caps32 - convert 16-bit Port Capabilities to 32-bits + * @caps16: a 16-bit Port Capabilities value + * + * Returns the equivalent 32-bit Port Capabilities value. + */ +static uint32_t fwcaps16_to_caps32(uint16_t caps16) +{ + uint32_t caps32 = 0; + + #define CAP16_TO_CAP32(__cap) \ + do { \ + if (caps16 & FW_PORT_CAP_##__cap) \ + caps32 |= FW_PORT_CAP32_##__cap; \ + } while (0) + + CAP16_TO_CAP32(SPEED_100M); + CAP16_TO_CAP32(SPEED_1G); + CAP16_TO_CAP32(SPEED_25G); + CAP16_TO_CAP32(SPEED_10G); + CAP16_TO_CAP32(SPEED_40G); + CAP16_TO_CAP32(SPEED_100G); + CAP16_TO_CAP32(FC_RX); + CAP16_TO_CAP32(FC_TX); + CAP16_TO_CAP32(ANEG); + CAP16_TO_CAP32(FORCE_PAUSE); + CAP16_TO_CAP32(MDIAUTO); + CAP16_TO_CAP32(MDISTRAIGHT); + CAP16_TO_CAP32(FEC_RS); + CAP16_TO_CAP32(FEC_BASER_RS); + CAP16_TO_CAP32(802_3_PAUSE); + CAP16_TO_CAP32(802_3_ASM_DIR); + + #undef CAP16_TO_CAP32 + + return caps32; +} + +/** + * fwcaps32_to_caps16 - convert 32-bit Port Capabilities to 16-bits + * @caps32: a 32-bit Port Capabilities value + * + * Returns the equivalent 16-bit Port Capabilities value. Note that + * not all 32-bit Port Capabilities can be represented in the 16-bit + * Port Capabilities and some fields/values may not make it. + */ +static uint16_t fwcaps32_to_caps16(uint32_t caps32) +{ + uint16_t caps16 = 0; + + #define CAP32_TO_CAP16(__cap) \ + do { \ + if (caps32 & FW_PORT_CAP32_##__cap) \ + caps16 |= FW_PORT_CAP_##__cap; \ + } while (0) + + CAP32_TO_CAP16(SPEED_100M); + CAP32_TO_CAP16(SPEED_1G); + CAP32_TO_CAP16(SPEED_10G); + CAP32_TO_CAP16(SPEED_25G); + CAP32_TO_CAP16(SPEED_40G); + CAP32_TO_CAP16(SPEED_100G); + CAP32_TO_CAP16(FC_RX); + CAP32_TO_CAP16(FC_TX); + CAP32_TO_CAP16(802_3_PAUSE); + CAP32_TO_CAP16(802_3_ASM_DIR); + CAP32_TO_CAP16(ANEG); + CAP32_TO_CAP16(FORCE_PAUSE); + CAP32_TO_CAP16(MDIAUTO); + CAP32_TO_CAP16(MDISTRAIGHT); + CAP32_TO_CAP16(FEC_RS); + CAP32_TO_CAP16(FEC_BASER_RS); + + #undef CAP32_TO_CAP16 + + return caps16; +} + +static bool +is_bt(struct port_info *pi) +{ + + return (pi->port_type == FW_PORT_TYPE_BT_SGMII || + pi->port_type == FW_PORT_TYPE_BT_XFI || + pi->port_type == FW_PORT_TYPE_BT_XAUI); +} + +/** * t4_link_l1cfg - apply link configuration to MAC/PHY * @phy: the PHY to setup * @mac: the MAC to setup @@ -3772,52 +3859,44 @@ int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, struct link_config *lc) { struct fw_port_cmd c; - unsigned int mdi = V_FW_PORT_CAP_MDI(FW_PORT_CAP_MDI_AUTO); + unsigned int mdi = V_FW_PORT_CAP32_MDI(FW_PORT_CAP32_MDI_AUTO); unsigned int aneg, fc, fec, speed, rcap; fc = 0; if (lc->requested_fc & PAUSE_RX) - fc |= FW_PORT_CAP_FC_RX; + fc |= FW_PORT_CAP32_FC_RX; if (lc->requested_fc & PAUSE_TX) - fc |= FW_PORT_CAP_FC_TX; + fc |= FW_PORT_CAP32_FC_TX; + if (!(lc->requested_fc & PAUSE_AUTONEG)) + fc |= FW_PORT_CAP32_FORCE_PAUSE; fec = 0; - if (lc->requested_fec & FEC_RS) - fec = FW_PORT_CAP_FEC_RS; - else if (lc->requested_fec & FEC_BASER_RS) - fec = FW_PORT_CAP_FEC_BASER_RS; + if (lc->requested_fec == FEC_AUTO) + fec = lc->fec_hint; + else { + if (lc->requested_fec & FEC_RS) + fec |= FW_PORT_CAP32_FEC_RS; + if (lc->requested_fec & FEC_BASER_RS) + fec |= FW_PORT_CAP32_FEC_BASER_RS; + } - if (!(lc->supported & FW_PORT_CAP_ANEG) || - lc->requested_aneg == AUTONEG_DISABLE) { + if (lc->requested_aneg == AUTONEG_DISABLE) aneg = 0; - switch (lc->requested_speed) { - case 100000: - speed = FW_PORT_CAP_SPEED_100G; - break; - case 40000: - speed = FW_PORT_CAP_SPEED_40G; - break; - case 25000: - speed = FW_PORT_CAP_SPEED_25G; - break; - case 10000: - speed = FW_PORT_CAP_SPEED_10G; - break; - case 1000: - speed = FW_PORT_CAP_SPEED_1G; - break; - case 100: - speed = FW_PORT_CAP_SPEED_100M; - break; - default: - return -EINVAL; - break; - } - } else { - aneg = FW_PORT_CAP_ANEG; - speed = lc->supported & - V_FW_PORT_CAP_SPEED(M_FW_PORT_CAP_SPEED); - } + else if (lc->requested_aneg == AUTONEG_ENABLE) + aneg = FW_PORT_CAP32_ANEG; + else + aneg = lc->supported & FW_PORT_CAP32_ANEG; + + if (aneg) { + speed = lc->supported & V_FW_PORT_CAP32_SPEED(M_FW_PORT_CAP32_SPEED); + } else if (lc->requested_speed != 0) + speed = speed_to_fwcap(lc->requested_speed); + else + speed = fwcap_top_speed(lc->supported); + + /* Force AN on for BT cards. */ + if (is_bt(adap->port[port])) + aneg = lc->supported & FW_PORT_CAP32_ANEG; rcap = aneg | speed | fc | fec; if ((rcap | lc->supported) != lc->supported) { @@ -3833,10 +3912,17 @@ int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, c.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_EXEC | V_FW_PORT_CMD_PORTID(port)); - c.action_to_len16 = - cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | + if (adap->params.port_caps32) { + c.action_to_len16 = + cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG32) | + FW_LEN16(c)); + c.u.l1cfg32.rcap32 = cpu_to_be32(rcap); + } else { + c.action_to_len16 = + cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | FW_LEN16(c)); - c.u.l1cfg.rcap = cpu_to_be32(rcap); + c.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap)); + } return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL); } @@ -7736,56 +7822,205 @@ const char *t4_link_down_rc_str(unsigned char link_down_rc) } /* + * Return the highest speed set in the port capabilities, in Mb/s. + */ +unsigned int fwcap_to_speed(uint32_t caps) +{ + #define TEST_SPEED_RETURN(__caps_speed, __speed) \ + do { \ + if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return __speed; \ + } while (0) + + TEST_SPEED_RETURN(400G, 400000); + TEST_SPEED_RETURN(200G, 200000); + TEST_SPEED_RETURN(100G, 100000); + TEST_SPEED_RETURN(50G, 50000); + TEST_SPEED_RETURN(40G, 40000); + TEST_SPEED_RETURN(25G, 25000); + TEST_SPEED_RETURN(10G, 10000); + TEST_SPEED_RETURN(1G, 1000); + TEST_SPEED_RETURN(100M, 100); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/* + * Return the port capabilities bit for the given speed, which is in Mb/s. + */ +uint32_t speed_to_fwcap(unsigned int speed) +{ + #define TEST_SPEED_RETURN(__caps_speed, __speed) \ + do { \ + if (speed == __speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G, 400000); + TEST_SPEED_RETURN(200G, 200000); + TEST_SPEED_RETURN(100G, 100000); + TEST_SPEED_RETURN(50G, 50000); + TEST_SPEED_RETURN(40G, 40000); + TEST_SPEED_RETURN(25G, 25000); + TEST_SPEED_RETURN(10G, 10000); + TEST_SPEED_RETURN(1G, 1000); + TEST_SPEED_RETURN(100M, 100); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/* + * Return the port capabilities bit for the highest speed in the capabilities. + */ +uint32_t fwcap_top_speed(uint32_t caps) +{ + #define TEST_SPEED_RETURN(__caps_speed) \ + do { \ + if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G); + TEST_SPEED_RETURN(200G); + TEST_SPEED_RETURN(100G); + TEST_SPEED_RETURN(50G); + TEST_SPEED_RETURN(40G); + TEST_SPEED_RETURN(25G); + TEST_SPEED_RETURN(10G); + TEST_SPEED_RETURN(1G); + TEST_SPEED_RETURN(100M); + + #undef TEST_SPEED_RETURN + + return 0; +} + + +/** + * lstatus_to_fwcap - translate old lstatus to 32-bit Port Capabilities + * @lstatus: old FW_PORT_ACTION_GET_PORT_INFO lstatus value + * + * Translates old FW_PORT_ACTION_GET_PORT_INFO lstatus field into new + * 32-bit Port Capabilities value. + */ +static uint32_t lstatus_to_fwcap(u32 lstatus) +{ + uint32_t linkattr = 0; + + /* + * Unfortunately the format of the Link Status in the old + * 16-bit Port Information message isn't the same as the + * 16-bit Port Capabilities bitfield used everywhere else ... + */ + if (lstatus & F_FW_PORT_CMD_RXPAUSE) + linkattr |= FW_PORT_CAP32_FC_RX; + if (lstatus & F_FW_PORT_CMD_TXPAUSE) + linkattr |= FW_PORT_CAP32_FC_TX; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M)) + linkattr |= FW_PORT_CAP32_SPEED_100M; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G)) + linkattr |= FW_PORT_CAP32_SPEED_1G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G)) + linkattr |= FW_PORT_CAP32_SPEED_10G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_25G)) + linkattr |= FW_PORT_CAP32_SPEED_25G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G)) + linkattr |= FW_PORT_CAP32_SPEED_40G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100G)) + linkattr |= FW_PORT_CAP32_SPEED_100G; + + return linkattr; +} + +/* * Updates all fields owned by the common code in port_info and link_config * based on information provided by the firmware. Does not touch any * requested_* field. */ -static void handle_port_info(struct port_info *pi, const struct fw_port_info *p) +static void handle_port_info(struct port_info *pi, const struct fw_port_cmd *p, + enum fw_port_action action, bool *mod_changed, bool *link_changed) { - struct link_config *lc = &pi->link_cfg; - int speed; + struct link_config old_lc, *lc = &pi->link_cfg; unsigned char fc, fec; - u32 stat = be32_to_cpu(p->lstatus_to_modtype); + u32 stat, linkattr; + int old_ptype, old_mtype; - pi->port_type = G_FW_PORT_CMD_PTYPE(stat); - pi->mod_type = G_FW_PORT_CMD_MODTYPE(stat); - pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP ? - G_FW_PORT_CMD_MDIOADDR(stat) : -1; + old_ptype = pi->port_type; + old_mtype = pi->mod_type; + old_lc = *lc; + if (action == FW_PORT_ACTION_GET_PORT_INFO) { + stat = be32_to_cpu(p->u.info.lstatus_to_modtype); - lc->supported = be16_to_cpu(p->pcap); - lc->advertising = be16_to_cpu(p->acap); - lc->lp_advertising = be16_to_cpu(p->lpacap); - lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS) != 0; - lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC(stat); + pi->port_type = G_FW_PORT_CMD_PTYPE(stat); + pi->mod_type = G_FW_PORT_CMD_MODTYPE(stat); + pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP ? + G_FW_PORT_CMD_MDIOADDR(stat) : -1; - speed = 0; - if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M)) - speed = 100; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G)) - speed = 1000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G)) - speed = 10000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_25G)) - speed = 25000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G)) - speed = 40000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100G)) - speed = 100000; - lc->speed = speed; + lc->supported = fwcaps16_to_caps32(be16_to_cpu(p->u.info.pcap)); + lc->advertising = fwcaps16_to_caps32(be16_to_cpu(p->u.info.acap)); + lc->lp_advertising = fwcaps16_to_caps32(be16_to_cpu(p->u.info.lpacap)); + lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS) != 0; + lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC(stat); + + linkattr = lstatus_to_fwcap(stat); + } else if (action == FW_PORT_ACTION_GET_PORT_INFO32) { + stat = be32_to_cpu(p->u.info32.lstatus32_to_cbllen32); + + pi->port_type = G_FW_PORT_CMD_PORTTYPE32(stat); + pi->mod_type = G_FW_PORT_CMD_MODTYPE32(stat); + pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP32 ? + G_FW_PORT_CMD_MDIOADDR32(stat) : -1; + + lc->supported = be32_to_cpu(p->u.info32.pcaps32); + lc->advertising = be32_to_cpu(p->u.info32.acaps32); + lc->lp_advertising = be16_to_cpu(p->u.info32.lpacaps32); + lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS32) != 0; + lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC32(stat); + + linkattr = be32_to_cpu(p->u.info32.linkattr32); + } else { + CH_ERR(pi->adapter, "bad port_info action 0x%x\n", action); + return; + } + + lc->speed = fwcap_to_speed(linkattr); fc = 0; - if (stat & F_FW_PORT_CMD_RXPAUSE) + if (linkattr & FW_PORT_CAP32_FC_RX) fc |= PAUSE_RX; - if (stat & F_FW_PORT_CMD_TXPAUSE) + if (linkattr & FW_PORT_CAP32_FC_TX) fc |= PAUSE_TX; lc->fc = fc; - fec = 0; - if (lc->advertising & FW_PORT_CAP_FEC_RS) - fec = FEC_RS; - else if (lc->advertising & FW_PORT_CAP_FEC_BASER_RS) - fec = FEC_BASER_RS; + fec = FEC_NONE; + if (linkattr & FW_PORT_CAP32_FEC_RS) + fec |= FEC_RS; + if (linkattr & FW_PORT_CAP32_FEC_BASER_RS) + fec |= FEC_BASER_RS; lc->fec = fec; + + if (mod_changed != NULL) + *mod_changed = false; + if (link_changed != NULL) + *link_changed = false; + if (old_ptype != pi->port_type || old_mtype != pi->mod_type || + old_lc.supported != lc->supported) { + if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { + lc->fec_hint = lc->advertising & + V_FW_PORT_CAP32_FEC(M_FW_PORT_CAP32_FEC); + } + if (mod_changed != NULL) + *mod_changed = true; + } + if (old_lc.link_ok != lc->link_ok || old_lc.speed != lc->speed || + old_lc.fec != lc->fec || old_lc.fc != lc->fc) { + if (link_changed != NULL) + *link_changed = true; + } } /** @@ -7798,22 +8033,24 @@ static void handle_port_info(struct port_info *pi, const struct fw_port_info *p) */ int t4_update_port_info(struct port_info *pi) { - struct fw_port_cmd port_cmd; + struct adapter *sc = pi->adapter; + struct fw_port_cmd cmd; + enum fw_port_action action; int ret; - memset(&port_cmd, 0, sizeof port_cmd); - port_cmd.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | - F_FW_CMD_REQUEST | F_FW_CMD_READ | - V_FW_PORT_CMD_PORTID(pi->tx_chan)); - port_cmd.action_to_len16 = cpu_to_be32( - V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_GET_PORT_INFO) | - FW_LEN16(port_cmd)); - ret = t4_wr_mbox_ns(pi->adapter, pi->adapter->mbox, - &port_cmd, sizeof(port_cmd), &port_cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | + F_FW_CMD_REQUEST | F_FW_CMD_READ | + V_FW_PORT_CMD_PORTID(pi->tx_chan)); + action = sc->params.port_caps32 ? FW_PORT_ACTION_GET_PORT_INFO32 : + FW_PORT_ACTION_GET_PORT_INFO; + cmd.action_to_len16 = cpu_to_be32(V_FW_PORT_CMD_ACTION(action) | + FW_LEN16(cmd)); + ret = t4_wr_mbox_ns(sc, sc->mbox, &cmd, sizeof(cmd), &cmd); if (ret) return ret; - handle_port_info(pi, &port_cmd.u.info); + handle_port_info(pi, &cmd, action, NULL, NULL); return 0; } @@ -7828,15 +8065,18 @@ int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_port_cmd *p = (const void *)rpl; - unsigned int action = - G_FW_PORT_CMD_ACTION(be32_to_cpu(p->action_to_len16)); + enum fw_port_action action = + G_FW_PORT_CMD_ACTION(be32_to_cpu(p->action_to_len16)); + bool mod_changed, link_changed; - if (opcode == FW_PORT_CMD && action == FW_PORT_ACTION_GET_PORT_INFO) { + if (opcode == FW_PORT_CMD && + (action == FW_PORT_ACTION_GET_PORT_INFO || + action == FW_PORT_ACTION_GET_PORT_INFO32)) { /* link/module state change message */ - int i, old_ptype, old_mtype; + int i; int chan = G_FW_PORT_CMD_PORTID(be32_to_cpu(p->op_to_portid)); struct port_info *pi = NULL; - struct link_config *lc, *old_lc; + struct link_config *lc; for_each_port(adap, i) { pi = adap2pinfo(adap, i); @@ -7846,23 +8086,15 @@ int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) lc = &pi->link_cfg; PORT_LOCK(pi); - old_lc = &pi->old_link_cfg; - old_ptype = pi->port_type; - old_mtype = pi->mod_type; - handle_port_info(pi, &p->u.info); + handle_port_info(pi, p, action, &mod_changed, &link_changed); PORT_UNLOCK(pi); - if (old_ptype != pi->port_type || old_mtype != pi->mod_type) { + if (mod_changed) t4_os_portmod_changed(pi); - } - PORT_LOCK(pi); - if (old_lc->link_ok != lc->link_ok || - old_lc->speed != lc->speed || - old_lc->fec != lc->fec || - old_lc->fc != lc->fc) { + if (link_changed) { + PORT_LOCK(pi); t4_os_link_changed(pi); - *old_lc = *lc; + PORT_UNLOCK(pi); } - PORT_UNLOCK(pi); } else { CH_WARN_RATELIMIT(adap, "Unknown firmware reply %d\n", opcode); return -EINVAL; @@ -8595,6 +8827,11 @@ int t4_port_init(struct adapter *adap, int mbox, int pf, int vf, int port_id) } while ((adap->params.portvec & (1 << j)) == 0); } + p->tx_chan = j; + p->mps_bg_map = t4_get_mps_bg_map(adap, j); + p->rx_e_chan_map = t4_get_rx_e_chan_map(adap, j); + p->lport = j; + if (!(adap->flags & IS_VF) || adap->params.vfres.r_caps & FW_CMD_CAP_PORT) { t4_update_port_info(p); @@ -8609,10 +8846,6 @@ int t4_port_init(struct adapter *adap, int mbox, int pf, int vf, int port_id) p->vi[0].smt_idx = (ret & 0x7f) << 1; else p->vi[0].smt_idx = (ret & 0x7f); - p->tx_chan = j; - p->mps_bg_map = t4_get_mps_bg_map(adap, j); - p->rx_e_chan_map = t4_get_rx_e_chan_map(adap, j); - p->lport = j; p->vi[0].rss_size = rss_size; t4_os_set_hw_addr(p, addr); diff --git a/sys/dev/cxgbe/firmware/t4fw_cfg.txt b/sys/dev/cxgbe/firmware/t4fw_cfg.txt index 43820a0b44cb..11721a1ed648 100644 --- a/sys/dev/cxgbe/firmware/t4fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t4fw_cfg.txt @@ -110,6 +110,7 @@ nexactf = 280 cmask = all pmask = all + nethofld = 2048 # driver will mask off features it won't use protocol = ofld, rddp, rdmac, iscsi_initiator_pdu, iscsi_target_pdu @@ -245,7 +246,7 @@ [fini] version = 0x1 - checksum = 0xbec0621 + checksum = 0x159b9295 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/firmware/t5fw_cfg.txt b/sys/dev/cxgbe/firmware/t5fw_cfg.txt index 721ff372ef80..f42966c62e49 100644 --- a/sys/dev/cxgbe/firmware/t5fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t5fw_cfg.txt @@ -155,6 +155,7 @@ nexactf = 456 cmask = all pmask = all + nethofld = 8192 # driver will mask off features it won't use protocol = ofld, rddp, rdmac, iscsi_initiator_pdu, iscsi_target_pdu, iscsi_t10dif @@ -290,7 +291,7 @@ [fini] version = 0x1 - checksum = 0x89c83d98 + checksum = 0x30b6a157 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/firmware/t6fw_cfg.txt b/sys/dev/cxgbe/firmware/t6fw_cfg.txt index 3b62ed83b344..c542cfcadf5e 100644 --- a/sys/dev/cxgbe/firmware/t6fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t6fw_cfg.txt @@ -155,6 +155,7 @@ pmask = all ncrypto_lookaside = 16 nclip = 320 + nethofld = 8192 # TCAM has 6K cells; each region must start at a multiple of 128 cell. # Each entry in these categories takes 2 cells each. nhash will use the @@ -275,7 +276,7 @@ [fini] version = 0x1 - checksum = 0x9e8952d2 + checksum = 0xf3e93001 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/osdep.h b/sys/dev/cxgbe/osdep.h index 5cd3e88803ad..6eec287db1a7 100644 --- a/sys/dev/cxgbe/osdep.h +++ b/sys/dev/cxgbe/osdep.h @@ -108,6 +108,7 @@ typedef boolean_t bool; #define DUPLEX_HALF 0 #define DUPLEX_FULL 1 +#define AUTONEG_AUTO (-1) #define AUTONEG_DISABLE 0 #define AUTONEG_ENABLE 1 diff --git a/sys/dev/cxgbe/t4_filter.c b/sys/dev/cxgbe/t4_filter.c index fe26b47566e1..4f325bdd235d 100644 --- a/sys/dev/cxgbe/t4_filter.c +++ b/sys/dev/cxgbe/t4_filter.c @@ -593,13 +593,8 @@ set_tcamfilter(struct adapter *sc, struct t4_filter *t, struct l2t_entry *l2te, } } mtx_unlock(&sc->tids.ftid_lock); - if (rc != 0) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); + if (rc != 0) return (rc); - } /* * Can't fail now. A set-filter WR will definitely be sent. @@ -817,8 +812,8 @@ int set_filter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; - struct l2t_entry *l2te; - struct smt_entry *smt; + struct l2t_entry *l2te = NULL; + struct smt_entry *smt = NULL; uint64_t ftuple; int rc; @@ -942,43 +937,41 @@ done: * Allocate L2T entry, SMT entry, etc. */ - l2te = NULL; if (t->fs.newdmac || t->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ - l2te = t4_l2t_alloc_switching(sc->l2t); - if (__predict_false(l2te == NULL)) - return (EAGAIN); - rc = t4_l2t_set_switching(sc, l2te, t->fs.vlan, t->fs.eport, + l2te = t4_l2t_alloc_switching(sc, t->fs.vlan, t->fs.eport, t->fs.dmac); - if (rc) { - t4_l2t_release(l2te); - return (ENOMEM); + if (__predict_false(l2te == NULL)) { + rc = EAGAIN; + goto error; } } - smt = NULL; if (t->fs.newsmac) { /* This filter needs an SMT entry; allocate one. */ smt = t4_smt_alloc_switching(sc->smt, t->fs.smac); if (__predict_false(smt == NULL)) { - if (l2te != NULL) - t4_l2t_release(l2te); - return (EAGAIN); + rc = EAGAIN; + goto error; } rc = t4_smt_set_switching(sc, smt, 0x0, t->fs.smac); - if (rc) { - t4_smt_release(smt); - if (l2te != NULL) - t4_l2t_release(l2te); - return (rc); - } + if (rc) + goto error; } if (t->fs.hash) - return (set_hashfilter(sc, t, ftuple, l2te, smt)); + rc = set_hashfilter(sc, t, ftuple, l2te, smt); else - return (set_tcamfilter(sc, t, l2te, smt)); + rc = set_tcamfilter(sc, t, l2te, smt); + if (rc != 0 && rc != EINPROGRESS) { +error: + if (l2te) + t4_l2t_release(l2te); + if (smt) + t4_smt_release(smt); + } + return (rc); } static int @@ -1552,10 +1545,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, f = malloc(sizeof(*f), M_CXGBE, M_ZERO | M_NOWAIT); if (__predict_false(f == NULL)) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); rc = ENOMEM; goto done; } @@ -1565,10 +1554,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, atid = alloc_atid(sc, f); if (__predict_false(atid) == -1) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); free(f, M_CXGBE); rc = EAGAIN; goto done; @@ -1579,10 +1564,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, &cookie); if (wr == NULL) { free_atid(sc, atid); - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); free(f, M_CXGBE); rc = ENOMEM; goto done; diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 22e84d320aa2..5e96579f1717 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -108,6 +108,44 @@ found: return (e); } +static struct l2t_entry * +find_or_alloc_l2e(struct l2t_data *d, uint16_t vlan, uint8_t port, uint8_t *dmac) +{ + struct l2t_entry *end, *e, **p; + struct l2t_entry *first_free = NULL; + + for (e = &d->l2tab[0], end = &d->l2tab[d->l2t_size]; e != end; ++e) { + if (atomic_load_acq_int(&e->refcnt) == 0) { + if (!first_free) + first_free = e; + } else if (e->state == L2T_STATE_SWITCHING && + memcmp(e->dmac, dmac, ETHER_ADDR_LEN) == 0 && + e->vlan == vlan && e->lport == port) + return (e); /* Found existing entry that matches. */ + } + + if (first_free == NULL) + return (NULL); /* No match and no room for a new entry. */ + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + e = first_free; + if (e->state < L2T_STATE_SWITCHING) { + for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) { + if (*p == e) { + *p = e->next; + e->next = NULL; + break; + } + } + } + e->state = L2T_STATE_UNUSED; + return (e); +} + + /* * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. @@ -154,41 +192,38 @@ t4_write_l2e(struct l2t_entry *e, int sync) * address resolution updates do not see them. */ struct l2t_entry * -t4_l2t_alloc_switching(struct l2t_data *d) +t4_l2t_alloc_switching(struct adapter *sc, uint16_t vlan, uint8_t port, + uint8_t *eth_addr) { + struct l2t_data *d = sc->l2t; struct l2t_entry *e; + int rc; rw_wlock(&d->lock); - e = t4_alloc_l2e(d); + e = find_or_alloc_l2e(d, vlan, port, eth_addr); if (e) { - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - e->state = L2T_STATE_SWITCHING; - atomic_store_rel_int(&e->refcnt, 1); - mtx_unlock(&e->lock); + if (atomic_load_acq_int(&e->refcnt) == 0) { + mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->wrq = &sc->sge.ctrlq[0]; + e->iqid = sc->sge.fwq.abs_id; + e->state = L2T_STATE_SWITCHING; + e->vlan = vlan; + e->lport = port; + memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); + atomic_store_rel_int(&e->refcnt, 1); + atomic_subtract_int(&d->nfree, 1); + rc = t4_write_l2e(e, 0); + mtx_unlock(&e->lock); + if (rc != 0) + e = NULL; + } else { + MPASS(e->vlan == vlan); + MPASS(e->lport == port); + atomic_add_int(&e->refcnt, 1); + } } rw_wunlock(&d->lock); - return e; -} - -/* - * Sets/updates the contents of a switching L2T entry that has been allocated - * with an earlier call to @t4_l2t_alloc_switching. - */ -int -t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan, - uint8_t port, uint8_t *eth_addr) -{ - int rc; - - e->vlan = vlan; - e->lport = port; - e->wrq = &sc->sge.ctrlq[0]; - e->iqid = sc->sge.fwq.abs_id; - memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); - mtx_lock(&e->lock); - rc = t4_write_l2e(e, 0); - mtx_unlock(&e->lock); - return (rc); + return (e); } int diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 21c392018fe8..188669e80261 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -91,7 +91,8 @@ struct l2t_data { int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); struct l2t_entry *t4_alloc_l2e(struct l2t_data *); -struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *); +struct l2t_entry *t4_l2t_alloc_switching(struct adapter *, uint16_t, uint8_t, + uint8_t *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); int t4_write_l2e(struct l2t_entry *, int); diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index d6bf50121a15..898cbf4db465 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -390,18 +390,20 @@ static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* - * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). + * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. + * pause_autoneg = 1 means PAUSE will be negotiated if possible and the + * negotiated settings will override rx_pause/tx_pause. + * Otherwise rx_pause/tx_pause are applied forcibly. */ -static int t4_pause_settings = PAUSE_TX | PAUSE_RX; +static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG; TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); /* - * Forward Error Correction settings (bit 0, 1, 2 = FEC_RS, FEC_BASER_RS, - * FEC_RESERVED respectively). - * -1 to run with the firmware default. + * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively). + * -1 to run with the firmware default. Same as FEC_AUTO (bit 5) * 0 to disable FEC. */ static int t4_fec = -1; @@ -437,8 +439,13 @@ static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; TUNABLE_INT("hw.cxgbe.switchcaps_allowed", &t4_switchcaps_allowed); +#ifdef RATELIMIT static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD; +#else +static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | + FW_CAPS_CONFIG_NIC_HASHFILTER; +#endif TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; @@ -526,9 +533,11 @@ static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); -static void build_medialist(struct port_info *, struct ifmedia *); -static void init_l1cfg(struct port_info *); -static int apply_l1cfg(struct port_info *); +static bool fixed_ifmedia(struct port_info *); +static void build_medialist(struct port_info *); +static void init_link_config(struct port_info *); +static int fixup_link_config(struct port_info *); +static int apply_link_config(struct port_info *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static void quiesce_txq(struct adapter *, struct sge_txq *); @@ -1018,6 +1027,14 @@ t4_attach(device_t dev) ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); + PORT_LOCK(pi); + init_link_config(pi); + fixup_link_config(pi); + build_medialist(pi); + if (fixed_ifmedia(pi)) + pi->flags |= FIXED_IFMEDIA; + PORT_UNLOCK(pi); + pi->dev = device_add_child(dev, sc->names->ifnet_name, -1); if (pi->dev == NULL) { device_printf(dev, @@ -1500,6 +1517,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) #endif ifp->if_capabilities = T4_CAP; + ifp->if_capenable = T4_CAP_ENABLE; #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; @@ -1509,10 +1527,11 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) ifp->if_capabilities |= IFCAP_NETMAP; #endif #ifdef RATELIMIT - if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) + if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) { ifp->if_capabilities |= IFCAP_TXRTLMT; + ifp->if_capenable |= IFCAP_TXRTLMT; + } #endif - ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; @@ -1883,7 +1902,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ - if (__predict_false(pi->link_cfg.link_ok == 0)) { + if (__predict_false(pi->link_cfg.link_ok == false)) { m_freem(m); return (ENETDOWN); } @@ -2061,8 +2080,8 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c) } /* - * The kernel picks a media from the list we had provided so we do not have to - * validate the request. + * The kernel picks a media from the list we had provided but we still validate + * the requeste. */ int cxgbe_media_change(struct ifnet *ifp) @@ -2079,8 +2098,14 @@ cxgbe_media_change(struct ifnet *ifp) return (rc); PORT_LOCK(pi); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) { - MPASS(lc->supported & FW_PORT_CAP_ANEG); + /* ifconfig .. media autoselect */ + if (!(lc->supported & FW_PORT_CAP32_ANEG)) { + rc = ENOTSUP; /* AN not supported by transceiver */ + goto done; + } lc->requested_aneg = AUTONEG_ENABLE; + lc->requested_speed = 0; + lc->requested_fc |= PAUSE_AUTONEG; } else { lc->requested_aneg = AUTONEG_DISABLE; lc->requested_speed = @@ -2091,47 +2116,25 @@ cxgbe_media_change(struct ifnet *ifp) if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) lc->requested_fc |= PAUSE_TX; } - if (pi->up_vis > 0) - rc = apply_l1cfg(pi); + if (pi->up_vis > 0) { + fixup_link_config(pi); + rc = apply_link_config(pi); + } +done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } /* - * Mbps to FW_PORT_CAP_SPEED_* bit. - */ -static uint16_t -speed_to_fwspeed(int speed) -{ - - switch (speed) { - case 100000: - return (FW_PORT_CAP_SPEED_100G); - case 40000: - return (FW_PORT_CAP_SPEED_40G); - case 25000: - return (FW_PORT_CAP_SPEED_25G); - case 10000: - return (FW_PORT_CAP_SPEED_10G); - case 1000: - return (FW_PORT_CAP_SPEED_1G); - case 100: - return (FW_PORT_CAP_SPEED_100M); - } - - return (0); -} - -/* * Base media word (without ETHER, pause, link active, etc.) for the port at the * given speed. */ static int -port_mword(struct port_info *pi, uint16_t speed) +port_mword(struct port_info *pi, uint32_t speed) { - MPASS(speed & M_FW_PORT_CAP_SPEED); + MPASS(speed & M_FW_PORT_CAP32_SPEED); MPASS(powerof2(speed)); switch(pi->port_type) { @@ -2140,24 +2143,24 @@ port_mword(struct port_info *pi, uint16_t speed) case FW_PORT_TYPE_BT_XAUI: /* BaseT */ switch (speed) { - case FW_PORT_CAP_SPEED_100M: + case FW_PORT_CAP32_SPEED_100M: return (IFM_100_T); - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_T); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_T); } break; case FW_PORT_TYPE_KX4: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_KX4); break; case FW_PORT_TYPE_CX4: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_CX4); break; case FW_PORT_TYPE_KX: - if (speed == FW_PORT_CAP_SPEED_1G) + if (speed == FW_PORT_CAP32_SPEED_1G) return (IFM_1000_KX); break; case FW_PORT_TYPE_KR: @@ -2168,15 +2171,17 @@ port_mword(struct port_info *pi, uint16_t speed) case FW_PORT_TYPE_KR_SFP28: case FW_PORT_TYPE_KR_XLAUI: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_KX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_KR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_KR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_KR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_KR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_KR4); } break; @@ -2194,53 +2199,59 @@ port_mword(struct port_info *pi, uint16_t speed) switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_LX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_LR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_LR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_LR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_LR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_LR4); } break; case FW_PORT_MOD_TYPE_SR: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_SX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_SR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_SR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_SR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_SR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_SR4); } break; case FW_PORT_MOD_TYPE_ER: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_ER); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_CX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_TWINAX); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_CR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_CR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_CR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_CR4); } break; case FW_PORT_MOD_TYPE_LRM: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_NA: @@ -2282,12 +2293,12 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); - build_medialist(pi, &pi->media); + build_medialist(pi); } /* ifm_status */ ifmr->ifm_status = IFM_AVALID; - if (lc->link_ok == 0) + if (lc->link_ok == false) goto done; ifmr->ifm_status |= IFM_ACTIVE; @@ -2298,7 +2309,7 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (lc->fc & PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; - ifmr->ifm_active |= port_mword(pi, speed_to_fwspeed(lc->speed)); + ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed)); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); @@ -3845,7 +3856,7 @@ get_params__post_init(struct adapter *sc) sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; - if (val[3] > val[2]) { + if ((int)val[3] > (int)val[2]) { sc->tids.ftid_base = val[2]; sc->tids.ftid_end = val[3]; sc->tids.nftids = val[3] - val[2] + 1; @@ -3954,7 +3965,7 @@ get_params__post_init(struct adapter *sc) sc->toecaps = 0; param[0] = FW_PARAM_DEV(NTID); - rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); + rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query HASHFILTER parameters: %d.\n", rc); @@ -3980,7 +3991,7 @@ get_params__post_init(struct adapter *sc) "failed to query NIC parameters: %d.\n", rc); return (rc); } - if (val[1] > val[0]) { + if ((int)val[1] > (int)val[0]) { sc->tids.etid_base = val[0]; sc->tids.etid_end = val[1]; sc->tids.netids = val[1] - val[0] + 1; @@ -4010,7 +4021,7 @@ get_params__post_init(struct adapter *sc) sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); - if (val[2] > val[1]) { + if ((int)val[2] > (int)val[1]) { sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; } @@ -4133,6 +4144,12 @@ set_params__post_init(struct adapter *sc) val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); + /* Enable 32b port caps if the firmware supports it. */ + param = FW_PARAM_PFVF(PORT_CAPS32); + val = 1; + if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val) == 0) + sc->params.port_caps32 = 1; + #ifdef TCP_OFFLOAD /* * Override the TOE timers with user provided tunables. This is not the @@ -4215,22 +4232,30 @@ ifmedia_add4(struct ifmedia *ifm, int m) ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL); } +/* + * This is the selected media, which is not quite the same as the active media. + * The media line in ifconfig is "media: Ethernet selected (active)" if selected + * and active are not the same, and "media: Ethernet selected" otherwise. + */ static void -set_current_media(struct port_info *pi, struct ifmedia *ifm) +set_current_media(struct port_info *pi) { struct link_config *lc; + struct ifmedia *ifm; int mword; + u_int speed; PORT_LOCK_ASSERT_OWNED(pi); /* Leave current media alone if it's already set to IFM_NONE. */ + ifm = &pi->media; if (ifm->ifm_cur != NULL && IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE) return; lc = &pi->link_cfg; - if (lc->requested_aneg == AUTONEG_ENABLE && - lc->supported & FW_PORT_CAP_ANEG) { + if (lc->requested_aneg != AUTONEG_DISABLE && + lc->supported & FW_PORT_CAP32_ANEG) { ifmedia_set(ifm, IFM_ETHER | IFM_AUTO); return; } @@ -4239,16 +4264,42 @@ set_current_media(struct port_info *pi, struct ifmedia *ifm) mword |= IFM_ETH_TXPAUSE; if (lc->requested_fc & PAUSE_RX) mword |= IFM_ETH_RXPAUSE; - mword |= port_mword(pi, speed_to_fwspeed(lc->requested_speed)); + if (lc->requested_speed == 0) + speed = port_top_speed(pi) * 1000; /* Gbps -> Mbps */ + else + speed = lc->requested_speed; + mword |= port_mword(pi, speed_to_fwcap(speed)); ifmedia_set(ifm, mword); } +/* + * Returns true if the ifmedia list for the port cannot change. + */ +static bool +fixed_ifmedia(struct port_info *pi) +{ + + return (pi->port_type == FW_PORT_TYPE_BT_SGMII || + pi->port_type == FW_PORT_TYPE_BT_XFI || + pi->port_type == FW_PORT_TYPE_BT_XAUI || + pi->port_type == FW_PORT_TYPE_KX4 || + pi->port_type == FW_PORT_TYPE_KX || + pi->port_type == FW_PORT_TYPE_KR || + pi->port_type == FW_PORT_TYPE_BP_AP || + pi->port_type == FW_PORT_TYPE_BP4_AP || + pi->port_type == FW_PORT_TYPE_BP40_BA || + pi->port_type == FW_PORT_TYPE_KR4_100G || + pi->port_type == FW_PORT_TYPE_KR_SFP28 || + pi->port_type == FW_PORT_TYPE_KR_XLAUI); +} + static void -build_medialist(struct port_info *pi, struct ifmedia *ifm) +build_medialist(struct port_info *pi) { - uint16_t ss, speed; + uint32_t ss, speed; int unknown, mword, bit; struct link_config *lc; + struct ifmedia *ifm; PORT_LOCK_ASSERT_OWNED(pi); @@ -4256,18 +4307,12 @@ build_medialist(struct port_info *pi, struct ifmedia *ifm) return; /* - * First setup all the requested_ fields so that they comply with what's - * supported by the port + transceiver. Note that this clobbers any - * user preferences set via sysctl_pause_settings or sysctl_autoneg. - */ - init_l1cfg(pi); - - /* - * Now (re)build the ifmedia list. + * Rebuild the ifmedia list. */ + ifm = &pi->media; ifmedia_removeall(ifm); lc = &pi->link_cfg; - ss = G_FW_PORT_CAP_SPEED(lc->supported); /* Supported Speeds */ + ss = G_FW_PORT_CAP32_SPEED(lc->supported); /* Supported Speeds */ if (__predict_false(ss == 0)) { /* not supposed to happen. */ MPASS(ss != 0); no_media: @@ -4278,9 +4323,9 @@ no_media: } unknown = 0; - for (bit = 0; bit < fls(ss); bit++) { + for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) { speed = 1 << bit; - MPASS(speed & M_FW_PORT_CAP_SPEED); + MPASS(speed & M_FW_PORT_CAP32_SPEED); if (ss & speed) { mword = port_mword(pi, speed); if (mword == IFM_NONE) { @@ -4293,86 +4338,134 @@ no_media: } if (unknown > 0) /* Add one unknown for all unknown media types. */ ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN); - if (lc->supported & FW_PORT_CAP_ANEG) + if (lc->supported & FW_PORT_CAP32_ANEG) ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL); - set_current_media(pi, ifm); + set_current_media(pi); } /* - * Update all the requested_* fields in the link config to something valid (and - * reasonable). + * Initialize the requested fields in the link config based on driver tunables. */ static void -init_l1cfg(struct port_info *pi) +init_link_config(struct port_info *pi) { struct link_config *lc = &pi->link_cfg; PORT_LOCK_ASSERT_OWNED(pi); - /* Gbps -> Mbps */ - lc->requested_speed = port_top_speed(pi) * 1000; + lc->requested_speed = 0; - if (t4_autoneg != 0 && lc->supported & FW_PORT_CAP_ANEG) { - lc->requested_aneg = AUTONEG_ENABLE; - } else { + if (t4_autoneg == 0) lc->requested_aneg = AUTONEG_DISABLE; + else if (t4_autoneg == 1) + lc->requested_aneg = AUTONEG_ENABLE; + else + lc->requested_aneg = AUTONEG_AUTO; + + lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX | + PAUSE_AUTONEG); + + if (t4_fec == -1 || t4_fec & FEC_AUTO) + lc->requested_fec = FEC_AUTO; + else { + lc->requested_fec = FEC_NONE; + if (t4_fec & FEC_RS) + lc->requested_fec |= FEC_RS; + if (t4_fec & FEC_BASER_RS) + lc->requested_fec |= FEC_BASER_RS; } +} - lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX); +/* + * Makes sure that all requested settings comply with what's supported by the + * port. Returns the number of settings that were invalid and had to be fixed. + */ +static int +fixup_link_config(struct port_info *pi) +{ + int n = 0; + struct link_config *lc = &pi->link_cfg; + uint32_t fwspeed; - if (t4_fec != -1) { - if (t4_fec & FEC_RS && lc->supported & FW_PORT_CAP_FEC_RS) { - lc->requested_fec = FEC_RS; - } else if (t4_fec & FEC_BASER_RS && - lc->supported & FW_PORT_CAP_FEC_BASER_RS) { - lc->requested_fec = FEC_BASER_RS; - } else { - lc->requested_fec = 0; - } - } else { - /* Use the suggested value provided by the firmware in acaps */ - if (lc->advertising & FW_PORT_CAP_FEC_RS && - lc->supported & FW_PORT_CAP_FEC_RS) { - lc->requested_fec = FEC_RS; - } else if (lc->advertising & FW_PORT_CAP_FEC_BASER_RS && - lc->supported & FW_PORT_CAP_FEC_BASER_RS) { - lc->requested_fec = FEC_BASER_RS; - } else { - lc->requested_fec = 0; + PORT_LOCK_ASSERT_OWNED(pi); + + /* Speed (when not autonegotiating) */ + if (lc->requested_speed != 0) { + fwspeed = speed_to_fwcap(lc->requested_speed); + if ((fwspeed & lc->supported) == 0) { + n++; + lc->requested_speed = 0; } } + + /* Link autonegotiation */ + MPASS(lc->requested_aneg == AUTONEG_ENABLE || + lc->requested_aneg == AUTONEG_DISABLE || + lc->requested_aneg == AUTONEG_AUTO); + if (lc->requested_aneg == AUTONEG_ENABLE && + !(lc->supported & FW_PORT_CAP32_ANEG)) { + n++; + lc->requested_aneg = AUTONEG_AUTO; + } + + /* Flow control */ + MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0); + if (lc->requested_fc & PAUSE_TX && + !(lc->supported & FW_PORT_CAP32_FC_TX)) { + n++; + lc->requested_fc &= ~PAUSE_TX; + } + if (lc->requested_fc & PAUSE_RX && + !(lc->supported & FW_PORT_CAP32_FC_RX)) { + n++; + lc->requested_fc &= ~PAUSE_RX; + } + if (!(lc->requested_fc & PAUSE_AUTONEG) && + !(lc->supported & FW_PORT_CAP32_FORCE_PAUSE)) { + n++; + lc->requested_fc |= PAUSE_AUTONEG; + } + + /* FEC */ + if ((lc->requested_fec & FEC_RS && + !(lc->supported & FW_PORT_CAP32_FEC_RS)) || + (lc->requested_fec & FEC_BASER_RS && + !(lc->supported & FW_PORT_CAP32_FEC_BASER_RS))) { + n++; + lc->requested_fec = FEC_AUTO; + } + + return (n); } /* - * Apply the settings in requested_* to the hardware. The parameters are - * expected to be sane. + * Apply the requested L1 settings, which are expected to be valid, to the + * hardware. */ static int -apply_l1cfg(struct port_info *pi) +apply_link_config(struct port_info *pi) { struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; -#ifdef INVARIANTS - uint16_t fwspeed; +#ifdef INVARIANTS ASSERT_SYNCHRONIZED_OP(sc); PORT_LOCK_ASSERT_OWNED(pi); if (lc->requested_aneg == AUTONEG_ENABLE) - MPASS(lc->supported & FW_PORT_CAP_ANEG); + MPASS(lc->supported & FW_PORT_CAP32_ANEG); + if (!(lc->requested_fc & PAUSE_AUTONEG)) + MPASS(lc->supported & FW_PORT_CAP32_FORCE_PAUSE); if (lc->requested_fc & PAUSE_TX) - MPASS(lc->supported & FW_PORT_CAP_FC_TX); + MPASS(lc->supported & FW_PORT_CAP32_FC_TX); if (lc->requested_fc & PAUSE_RX) - MPASS(lc->supported & FW_PORT_CAP_FC_RX); - if (lc->requested_fec == FEC_RS) - MPASS(lc->supported & FW_PORT_CAP_FEC_RS); - if (lc->requested_fec == FEC_BASER_RS) - MPASS(lc->supported & FW_PORT_CAP_FEC_BASER_RS); - fwspeed = speed_to_fwspeed(lc->requested_speed); - MPASS(fwspeed != 0); - MPASS(lc->supported & fwspeed); + MPASS(lc->supported & FW_PORT_CAP32_FC_RX); + if (lc->requested_fec & FEC_RS) + MPASS(lc->supported & FW_PORT_CAP32_FEC_RS); + if (lc->requested_fec & FEC_BASER_RS) + MPASS(lc->supported & FW_PORT_CAP32_FEC_BASER_RS); #endif rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); if (rc != 0) { @@ -4380,8 +4473,17 @@ apply_l1cfg(struct port_info *pi) if (!(sc->flags & IS_VF) || rc != FW_EPERM) device_printf(pi->dev, "l1cfg failed: %d\n", rc); } else { - lc->fc = lc->requested_fc; - lc->fec = lc->requested_fec; + /* + * An L1_CFG will almost always result in a link-change event if + * the link is up, and the driver will refresh the actual + * fec/fc/etc. when the notification is processed. If the link + * is down then the actual settings are meaningless. + * + * This takes care of the case where a change in the L1 settings + * may not result in a notification. + */ + if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG)) + lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX); } return (rc); } @@ -4635,9 +4737,18 @@ cxgbe_init_synchronized(struct vi_info *vi) if (rc) goto done; /* error message displayed already */ + PORT_LOCK(pi); + if (pi->up_vis == 0) { + t4_update_port_info(pi); + fixup_link_config(pi); + build_medialist(pi); + apply_link_config(pi); + } + rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); + PORT_UNLOCK(pi); goto done; } @@ -4664,12 +4775,7 @@ cxgbe_init_synchronized(struct vi_info *vi) } /* all ok */ - PORT_LOCK(pi); - if (pi->up_vis++ == 0) { - t4_update_port_info(pi); - build_medialist(pi, &pi->media); - apply_l1cfg(pi); - } + pi->up_vis++; ifp->if_drv_flags |= IFF_DRV_RUNNING; if (pi->nvi > 1 || sc->flags & IS_VF) @@ -4744,11 +4850,10 @@ cxgbe_uninit_synchronized(struct vi_info *vi) return (0); } - pi->link_cfg.link_ok = 0; + pi->link_cfg.link_ok = false; pi->link_cfg.speed = 0; pi->link_cfg.link_down_rc = 255; t4_os_link_changed(pi); - pi->old_link_cfg = pi->link_cfg; PORT_UNLOCK(pi); return (0); @@ -6512,7 +6617,7 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (req->newptr == NULL) { struct sbuf *sb; - static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; + static char *bits = "\20\1RX\2TX\3AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) @@ -6522,14 +6627,21 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); + if (lc->link_ok) { + sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) | + (lc->requested_fc & PAUSE_AUTONEG), bits); + } else { + sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX | + PAUSE_RX | PAUSE_AUTONEG), bits); + } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; - s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); + s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX | + PAUSE_AUTONEG)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); @@ -6541,7 +6653,7 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; - if (n & ~(PAUSE_TX | PAUSE_RX)) + if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, @@ -6549,15 +6661,11 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (rc) return (rc); PORT_LOCK(pi); - if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { - lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); - lc->requested_fc |= n; - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc == 0) { - lc->fc = lc->requested_fc; - set_current_media(pi, &pi->media); - } - } + lc->requested_fc = n; + fixup_link_config(pi); + if (pi->up_vis > 0) + rc = apply_link_config(pi); + set_current_media(pi); PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } @@ -6572,10 +6680,11 @@ sysctl_fec(SYSCTL_HANDLER_ARGS) struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; + int8_t old; if (req->newptr == NULL) { struct sbuf *sb; - static char *bits = "\20\1RS\2BASER_RS\3RESERVED"; + static char *bits = "\20\1RS\2BASE-R\3RSVD1\4RSVD2\5RSVD3\6AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) @@ -6585,43 +6694,68 @@ sysctl_fec(SYSCTL_HANDLER_ARGS) if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "%b", lc->fec & M_FW_PORT_CAP_FEC, bits); + /* + * Display the requested_fec when the link is down -- the actual + * FEC makes sense only when the link is up. + */ + if (lc->link_ok) { + sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) | + (lc->requested_fec & FEC_AUTO), bits); + } else { + sbuf_printf(sb, "%b", lc->requested_fec, bits); + } rc = sbuf_finish(sb); sbuf_delete(sb); } else { - char s[2]; + char s[3]; int n; - s[0] = '0' + (lc->requested_fec & M_FW_PORT_CAP_FEC); - s[1] = 0; + snprintf(s, sizeof(s), "%d", + lc->requested_fec == FEC_AUTO ? -1 : + lc->requested_fec & M_FW_PORT_CAP32_FEC); rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); - if (s[1] != 0) - return (EINVAL); - if (s[0] < '0' || s[0] > '9') - return (EINVAL); /* not a number */ - n = s[0] - '0'; - if (n & ~M_FW_PORT_CAP_FEC) - return (EINVAL); /* some other bit is set too */ - if (!powerof2(n)) - return (EINVAL); /* one bit can be set at most */ + n = strtol(&s[0], NULL, 0); + if (n < 0 || n & FEC_AUTO) + n = FEC_AUTO; + else { + if (n & ~M_FW_PORT_CAP32_FEC) + return (EINVAL);/* some other bit is set too */ + if (!powerof2(n)) + return (EINVAL);/* one bit can be set at most */ + } rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4fec"); if (rc) return (rc); PORT_LOCK(pi); - if ((lc->requested_fec & M_FW_PORT_CAP_FEC) != n) { - lc->requested_fec = n & - G_FW_PORT_CAP_FEC(lc->supported); - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc == 0) { - lc->fec = lc->requested_fec; + old = lc->requested_fec; + if (n == FEC_AUTO) + lc->requested_fec = FEC_AUTO; + else if (n == 0) + lc->requested_fec = FEC_NONE; + else { + if ((lc->supported | V_FW_PORT_CAP32_FEC(n)) != + lc->supported) { + rc = ENOTSUP; + goto done; + } + lc->requested_fec = n; + } + fixup_link_config(pi); + if (pi->up_vis > 0) { + rc = apply_link_config(pi); + if (rc != 0) { + lc->requested_fec = old; + if (rc == FW_EPROTO) + rc = ENOTSUP; } } +done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } @@ -6635,10 +6769,10 @@ sysctl_autoneg(SYSCTL_HANDLER_ARGS) struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; - int rc, val, old; + int rc, val; - if (lc->supported & FW_PORT_CAP_ANEG) - val = lc->requested_aneg == AUTONEG_ENABLE ? 1 : 0; + if (lc->supported & FW_PORT_CAP32_ANEG) + val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1; else val = -1; rc = sysctl_handle_int(oidp, &val, 0, req); @@ -6649,28 +6783,22 @@ sysctl_autoneg(SYSCTL_HANDLER_ARGS) else if (val == 1) val = AUTONEG_ENABLE; else - return (EINVAL); + val = AUTONEG_AUTO; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4aneg"); if (rc) return (rc); PORT_LOCK(pi); - if ((lc->supported & FW_PORT_CAP_ANEG) == 0) { + if (val == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; goto done; } - if (lc->requested_aneg == val) { - rc = 0; /* no change, do nothing. */ - goto done; - } - old = lc->requested_aneg; lc->requested_aneg = val; - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc != 0) - lc->requested_aneg = old; - else - set_current_media(pi, &pi->media); + fixup_link_config(pi); + if (pi->up_vis > 0) + rc = apply_link_config(pi); + set_current_media(pi); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); @@ -9407,13 +9535,17 @@ t4_os_portmod_changed(struct port_info *pi) NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; - MPASS((pi->flags & FIXED_IFMEDIA) == 0); + KASSERT((pi->flags & FIXED_IFMEDIA) == 0, + ("%s: port_type %u", __func__, pi->port_type)); vi = &pi->vi[0]; if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) { PORT_LOCK(pi); - build_medialist(pi, &pi->media); - apply_l1cfg(pi); + build_medialist(pi); + if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { + fixup_link_config(pi); + apply_link_config(pi); + } PORT_UNLOCK(pi); end_synchronized_op(sc, LOCK_HELD); } diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 4a6ac900962b..684606a8d5b5 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -634,7 +634,7 @@ write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, if (txalign > 0) { struct tcpcb *tp = intotcpcb(toep->inp); - if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi)) + if (plen < 2 * tp->t_maxseg) txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); else diff --git a/sys/dev/dc/if_dc.c b/sys/dev/dc/if_dc.c index e1593b02db25..a4394de9c552 100644 --- a/sys/dev/dc/if_dc.c +++ b/sys/dev/dc/if_dc.c @@ -360,7 +360,7 @@ static devclass_t dc_devclass; DRIVER_MODULE_ORDERED(dc, pci, dc_driver, dc_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("W32:vendor/device;U8:revision;D:#", pci, dc, dc_devs, - sizeof(dc_devs[0]), nitems(dc_devs) - 1); + nitems(dc_devs) - 1); DRIVER_MODULE(miibus, dc, miibus_driver, miibus_devclass, NULL, NULL); #define DC_SETBIT(sc, reg, x) \ diff --git a/sys/dev/drm2/drm_os_freebsd.c b/sys/dev/drm2/drm_os_freebsd.c index 8489ca848027..c7b0bce2c7ab 100644 --- a/sys/dev/drm2/drm_os_freebsd.c +++ b/sys/dev/drm2/drm_os_freebsd.c @@ -395,8 +395,8 @@ drm_clflush_virt_range(char *addr, unsigned long length) { #if defined(__i386__) || defined(__amd64__) - pmap_invalidate_cache_range((vm_offset_t)addr, - (vm_offset_t)addr + length, TRUE); + pmap_force_invalidate_cache_range((vm_offset_t)addr, + (vm_offset_t)addr + length); #else DRM_ERROR("drm_clflush_virt_range not implemented on this architecture"); #endif diff --git a/sys/dev/drm2/i915/i915_drv.c b/sys/dev/drm2/i915/i915_drv.c index f0c8867501b3..8a7168861aca 100644 --- a/sys/dev/drm2/i915/i915_drv.c +++ b/sys/dev/drm2/i915/i915_drv.c @@ -1237,7 +1237,7 @@ MODULE_DEPEND(i915kms, iicbus, 1, 1, 1); MODULE_DEPEND(i915kms, iic, 1, 1, 1); MODULE_DEPEND(i915kms, iicbb, 1, 1, 1); MODULE_PNP_INFO("U32:vendor;U32:device;P:#;D:#", vgapci, i915, pciidlist, - sizeof(pciidlist[0]), nitems(pciidlist) - 1); + nitems(pciidlist) - 1); /* We give fast paths for the really cool registers */ #define NEEDS_FORCE_WAKE(dev_priv, reg) \ diff --git a/sys/dev/drm2/i915/intel_ringbuffer.c b/sys/dev/drm2/i915/intel_ringbuffer.c index 92a792746b1b..c6c242c875c3 100644 --- a/sys/dev/drm2/i915/intel_ringbuffer.c +++ b/sys/dev/drm2/i915/intel_ringbuffer.c @@ -471,8 +471,8 @@ init_pipe_control(struct intel_ring_buffer *ring) if (pc->cpu_page == NULL) goto err_unpin; pmap_qenter((uintptr_t)pc->cpu_page, &obj->pages[0], 1); - pmap_invalidate_cache_range((vm_offset_t)pc->cpu_page, - (vm_offset_t)pc->cpu_page + PAGE_SIZE, FALSE); + pmap_force_invalidate_cache_range((vm_offset_t)pc->cpu_page, + (vm_offset_t)pc->cpu_page + PAGE_SIZE); pc->obj = obj; ring->private = pc; @@ -1102,8 +1102,9 @@ static int init_status_page(struct intel_ring_buffer *ring) } pmap_qenter((vm_offset_t)ring->status_page.page_addr, &obj->pages[0], 1); - pmap_invalidate_cache_range((vm_offset_t)ring->status_page.page_addr, - (vm_offset_t)ring->status_page.page_addr + PAGE_SIZE, FALSE); + pmap_force_invalidate_cache_range( + (vm_offset_t)ring->status_page.page_addr, + (vm_offset_t)ring->status_page.page_addr + PAGE_SIZE); ring->status_page.obj = obj; memset(ring->status_page.page_addr, 0, PAGE_SIZE); diff --git a/sys/dev/drm2/radeon/radeon_drv.c b/sys/dev/drm2/radeon/radeon_drv.c index bf3dd063c178..73f83ccef0cb 100644 --- a/sys/dev/drm2/radeon/radeon_drv.c +++ b/sys/dev/drm2/radeon/radeon_drv.c @@ -402,4 +402,4 @@ MODULE_DEPEND(radeonkms, iic, 1, 1, 1); MODULE_DEPEND(radeonkms, iicbb, 1, 1, 1); MODULE_DEPEND(radeonkms, firmware, 1, 1, 1); MODULE_PNP_INFO("U32:vendor;U32:device;P:#;D:#", vgapci, radeonkms, - pciidlist, sizeof(pciidlist[0]), nitems(pciidlist) - 1); + pciidlist, nitems(pciidlist) - 1); diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index a95d606ac34f..93714cd10710 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -709,7 +709,8 @@ em_set_num_queues(if_ctx_t ctx) #define IGB_CAPS \ IFCAP_HWCSUM | IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | \ IFCAP_VLAN_HWCSUM | IFCAP_WOL | IFCAP_VLAN_HWFILTER | IFCAP_TSO4 | \ - IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_JUMBO_MTU | IFCAP_HWCSUM_IPV6; + IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_JUMBO_MTU | IFCAP_HWCSUM_IPV6 |\ + IFCAP_TSO6 /********************************************************************* * Device initialization routine diff --git a/sys/dev/ed/if_ed_pci.c b/sys/dev/ed/if_ed_pci.c index b487b97de820..8ada958abcbe 100644 --- a/sys/dev/ed/if_ed_pci.c +++ b/sys/dev/ed/if_ed_pci.c @@ -145,5 +145,5 @@ static driver_t ed_pci_driver = { DRIVER_MODULE(ed, pci, ed_pci_driver, ed_devclass, 0, 0); MODULE_DEPEND(ed, pci, 1, 1, 1); MODULE_DEPEND(ed, ether, 1, 1, 1); -MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ed, pci_ids, sizeof(pci_ids[0]), +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ed, pci_ids, nitems(pci_ids) - 1); diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c index cadc8bdbf677..a27de7db593f 100644 --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -3948,7 +3948,7 @@ static driver_t ena_driver = { devclass_t ena_devclass; DRIVER_MODULE(ena, pci, ena_driver, ena_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ena, ena_vendor_info_array, - sizeof(ena_vendor_info_array[0]), nitems(ena_vendor_info_array) - 1); + nitems(ena_vendor_info_array) - 1); MODULE_DEPEND(ena, pci, 1, 1, 1); MODULE_DEPEND(ena, ether, 1, 1, 1); diff --git a/sys/dev/et/if_et.c b/sys/dev/et/if_et.c index a5ce9e452872..294d9d5e55aa 100644 --- a/sys/dev/et/if_et.c +++ b/sys/dev/et/if_et.c @@ -189,7 +189,7 @@ static devclass_t et_devclass; DRIVER_MODULE(et, pci, et_driver, et_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, et, et_devices, - sizeof(et_devices[0]), nitems(et_devices) - 1); + nitems(et_devices) - 1); DRIVER_MODULE(miibus, et, miibus_driver, miibus_devclass, 0, 0); static int et_rx_intr_npkts = 32; diff --git a/sys/dev/ffec/if_ffec.c b/sys/dev/ffec/if_ffec.c index ea291ab8df2d..d52bf9a4e3d5 100644 --- a/sys/dev/ffec/if_ffec.c +++ b/sys/dev/ffec/if_ffec.c @@ -801,7 +801,8 @@ ffec_alloc_mbufcl(struct ffec_softc *sc) struct mbuf *m; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); - m->m_pkthdr.len = m->m_len = m->m_ext.ext_size; + if (m != NULL) + m->m_pkthdr.len = m->m_len = m->m_ext.ext_size; return (m); } diff --git a/sys/dev/fxp/if_fxp.c b/sys/dev/fxp/if_fxp.c index b89f8329a4f2..2c3297a05bea 100644 --- a/sys/dev/fxp/if_fxp.c +++ b/sys/dev/fxp/if_fxp.c @@ -308,7 +308,7 @@ static devclass_t fxp_devclass; DRIVER_MODULE_ORDERED(fxp, pci, fxp_driver, fxp_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("U16:vendor;U16:device", pci, fxp, fxp_ident_table, - sizeof(fxp_ident_table[0]), nitems(fxp_ident_table) - 1); + nitems(fxp_ident_table) - 1); DRIVER_MODULE(miibus, fxp, miibus_driver, miibus_devclass, NULL, NULL); static struct resource_spec fxp_res_spec_mem[] = { diff --git a/sys/dev/gem/if_gem_pci.c b/sys/dev/gem/if_gem_pci.c index af9db3da9c3e..ce3027fb3441 100644 --- a/sys/dev/gem/if_gem_pci.c +++ b/sys/dev/gem/if_gem_pci.c @@ -116,7 +116,7 @@ static driver_t gem_pci_driver = { DRIVER_MODULE(gem, pci, gem_pci_driver, gem_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device", pci, gem, gem_pci_devlist, - sizeof(gem_pci_devlist[0]), nitems(gem_pci_devlist) - 1); + nitems(gem_pci_devlist) - 1); MODULE_DEPEND(gem, pci, 1, 1, 1); MODULE_DEPEND(gem, ether, 1, 1, 1); diff --git a/sys/dev/hwpmc/hwpmc_logging.c b/sys/dev/hwpmc/hwpmc_logging.c index 4d2e08fe157a..8764ac9e922c 100644 --- a/sys/dev/hwpmc/hwpmc_logging.c +++ b/sys/dev/hwpmc/hwpmc_logging.c @@ -234,7 +234,7 @@ static void pmclog_loop(void *arg); static void pmclog_release(struct pmc_owner *po); static uint32_t *pmclog_reserve(struct pmc_owner *po, int length); static void pmclog_schedule_io(struct pmc_owner *po, int wakeup); -static void pmclog_schedule_all(struct pmc_owner *po, int force); +static void pmclog_schedule_all(struct pmc_owner *po); static void pmclog_stop_kthread(struct pmc_owner *po); /* @@ -842,7 +842,7 @@ pmclog_flush(struct pmc_owner *po, int force) goto error; } - pmclog_schedule_all(po, force); + pmclog_schedule_all(po); error: mtx_unlock(&pmc_kthread_mtx); @@ -850,7 +850,7 @@ pmclog_flush(struct pmc_owner *po, int force) } static void -pmclog_schedule_one_cond(struct pmc_owner *po, int force) +pmclog_schedule_one_cond(struct pmc_owner *po) { struct pmclog_buffer *plb; int cpu; @@ -860,8 +860,7 @@ pmclog_schedule_one_cond(struct pmc_owner *po, int force) /* tell hardclock not to run again */ if (PMC_CPU_HAS_SAMPLES(cpu)) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); - if (force) - pmc_flush_samples(cpu); + plb = po->po_curbuf[cpu]; if (plb && plb->plb_ptr != plb->plb_base) pmclog_schedule_io(po, 1); @@ -869,7 +868,7 @@ pmclog_schedule_one_cond(struct pmc_owner *po, int force) } static void -pmclog_schedule_all(struct pmc_owner *po, int force) +pmclog_schedule_all(struct pmc_owner *po) { /* * Schedule the current buffer if any and not empty. @@ -878,7 +877,7 @@ pmclog_schedule_all(struct pmc_owner *po, int force) thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); - pmclog_schedule_one_cond(po, force); + pmclog_schedule_one_cond(po); } thread_lock(curthread); sched_unbind(curthread); @@ -905,7 +904,7 @@ pmclog_close(struct pmc_owner *po) /* * Schedule the current buffer. */ - pmclog_schedule_all(po, 0); + pmclog_schedule_all(po); wakeup_one(po); mtx_unlock(&pmc_kthread_mtx); diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c index 4d7a4535d27b..1fad25c55702 100644 --- a/sys/dev/hwpmc/hwpmc_mod.c +++ b/sys/dev/hwpmc/hwpmc_mod.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sx.h> #include <sys/sysctl.h> #include <sys/sysent.h> +#include <sys/syslog.h> #include <sys/systm.h> #include <sys/vnode.h> @@ -210,7 +211,7 @@ static int pmc_debugflags_parse(char *newstr, char *fence); #endif static int load(struct module *module, int cmd, void *arg); -static int pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf); +static int pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf); static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp); static int pmc_attach_process(struct proc *p, struct pmc *pm); @@ -248,7 +249,7 @@ static void pmc_process_csw_out(struct thread *td); static void pmc_process_exit(void *arg, struct proc *p); static void pmc_process_fork(void *arg, struct proc *p1, struct proc *p2, int n); -static void pmc_process_samples(int cpu, int soft); +static void pmc_process_samples(int cpu, ring_type_t soft); static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_process_thread_add(struct thread *td); static void pmc_process_thread_delete(struct thread *td); @@ -341,6 +342,7 @@ static int pmc_nsamples = PMC_NSAMPLES; SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_RDTUN, &pmc_nsamples, 0, "number of PC samples per CPU"); +static uint64_t pmc_sample_mask = PMC_NSAMPLES-1; /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool. @@ -1401,6 +1403,10 @@ pmc_process_csw_in(struct thread *td) if (pm->pm_state != PMC_STATE_RUNNING) continue; + KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + /* increment PMC runcount */ counter_u64_add(pm->pm_runcount, 1); @@ -1595,6 +1601,10 @@ pmc_process_csw_out(struct thread *td) if (pm->pm_pcpu_state[cpu].pps_stalled == 0) pcd->pcd_stop_pmc(cpu, adjri); + KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + /* reduce this PMC's runcount */ counter_u64_add(pm->pm_runcount, -1); @@ -2724,7 +2734,7 @@ pmc_destroy_pmc_descriptor(struct pmc *pm) static void pmc_wait_for_pmc_idle(struct pmc *pm) { -#ifdef HWPMC_DEBUG +#ifdef INVARIANTS volatile int maxloop; maxloop = 100 * pmc_cpu_max(); @@ -2736,7 +2746,7 @@ pmc_wait_for_pmc_idle(struct pmc *pm) pmclog_flush(pm->pm_owner, 1); while (counter_u64_fetch(pm->pm_runcount) > 0) { pmclog_flush(pm->pm_owner, 1); -#ifdef HWPMC_DEBUG +#ifdef INVARIANTS maxloop--; KASSERT(maxloop > 0, ("[pmc,%d] (ri%d, rc%ld) waiting too long for " @@ -3942,9 +3952,16 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) pmc->pm_flags = pa.pm_flags; /* XXX set lower bound on sampling for process counters */ - if (PMC_IS_SAMPLING_MODE(mode)) - pmc->pm_sc.pm_reloadcount = pa.pm_count; - else + if (PMC_IS_SAMPLING_MODE(mode)) { + /* + * Don't permit requested sample rate to be less than 1000 + */ + if (pa.pm_count < 1000) + log(LOG_WARNING, + "pmcallocate: passed sample rate %ju - setting to 1000\n", + (uintmax_t)pa.pm_count); + pmc->pm_sc.pm_reloadcount = MAX(1000, pa.pm_count); + } else pmc->pm_sc.pm_initial = pa.pm_count; /* switch thread to CPU 'cpu' */ @@ -4460,9 +4477,16 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) break; } - if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) - pm->pm_sc.pm_reloadcount = sc.pm_count; - else + if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { + /* + * Don't permit requested sample rate to be less than 1000 + */ + if (sc.pm_count < 1000) + log(LOG_WARNING, + "pmcsetcount: passed sample rate %ju - setting to 1000\n", + (uintmax_t)sc.pm_count); + pm->pm_sc.pm_reloadcount = MAX(1000, sc.pm_count); + } else pm->pm_sc.pm_initial = sc.pm_count; } break; @@ -4642,7 +4666,7 @@ pmc_post_callchain_callback(void) */ static int -pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) +pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf) { int error, cpu, callchaindepth, inuserspace; struct thread *td; @@ -4657,18 +4681,15 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) cpu = curcpu; psb = pmc_pcpu[cpu]->pc_sb[ring]; inuserspace = TRAPF_USERMODE(tf); - ps = psb->ps_write; - if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { - counter_u64_add(ps->ps_pmc->pm_runcount, -1); - counter_u64_add(pmc_stats.pm_overwrites, 1); - ps->ps_nsamples = 0; - } else if (ps->ps_nsamples) { /* in use, reader hasn't caught up */ + ps = PMC_PROD_SAMPLE(psb); + if (psb->ps_considx != psb->ps_prodidx && + ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_pcpu_state[cpu].pps_stalled = 1; counter_u64_add(pmc_stats.pm_intr_bufferfull, 1); PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); callchaindepth = 1; error = ENOMEM; goto done; @@ -4677,14 +4698,8 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) /* Fill in entry. */ PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); - - KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, - ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, - (unsigned long)counter_u64_fetch(pm->pm_runcount))); - - counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); td = curthread; ps->ps_pmc = pm; @@ -4692,13 +4707,14 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) ps->ps_pid = td->td_proc->p_pid; ps->ps_tid = td->td_tid; ps->ps_tsc = pmc_rdtsc(); - + ps->ps_ticks = ticks; ps->ps_cpu = cpu; ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? pmc_callchaindepth : 1; + MPASS(ps->ps_pc != NULL); if (callchaindepth == 1) ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); else { @@ -4712,26 +4728,27 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) callchaindepth, tf); } else { pmc_post_callchain_callback(); - callchaindepth = PMC_SAMPLE_INUSE; + callchaindepth = PMC_USER_CALLCHAIN_PENDING; } } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ if (ring == PMC_UR) { ps->ps_nsamples_actual = callchaindepth; /* mark entry as in use */ - ps->ps_nsamples = PMC_SAMPLE_INUSE; + ps->ps_nsamples = PMC_USER_CALLCHAIN_PENDING; } else ps->ps_nsamples = callchaindepth; /* mark entry as in use */ - /* increment write pointer, modulo ring buffer size */ - ps++; - if (ps == psb->ps_fence) - psb->ps_write = psb->ps_samples; - else - psb->ps_write = ps; + KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + + counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ + /* increment write pointer */ + psb->ps_prodidx++; done: /* mark CPU as needing processing */ - if (callchaindepth != PMC_SAMPLE_INUSE) + if (callchaindepth != PMC_USER_CALLCHAIN_PENDING) DPCPU_SET(pmc_sampled, 1); return (error); @@ -4770,14 +4787,15 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) { struct pmc *pm; struct thread *td; - struct pmc_sample *ps, *ps_end; + struct pmc_sample *ps; struct pmc_samplebuffer *psb; - int nsamples, nrecords, pass; + uint64_t considx, prodidx; + int nsamples, nrecords, pass, iter; #ifdef INVARIANTS int ncallchains; int nfree; + int start_ticks = ticks; #endif - psb = pmc_pcpu[cpu]->pc_sb[ring]; td = curthread; @@ -4795,29 +4813,30 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) if (ring == PMC_UR) nrecords = atomic_readandclear_32(&td->td_pmcpend); + for (iter = 0, considx = psb->ps_considx, prodidx = psb->ps_prodidx; + considx < prodidx && iter < pmc_nsamples; considx++, iter++) { + ps = PMC_CONS_SAMPLE_OFF(psb, considx); + /* * Iterate through all deferred callchain requests. * Walk from the current read pointer to the current * write pointer. */ - ps = psb->ps_read; - ps_end = psb->ps_write; - do { #ifdef INVARIANTS if (ps->ps_nsamples == PMC_SAMPLE_FREE) { nfree++; - goto next; + continue; } if ((ps->ps_pmc == NULL) || (ps->ps_pmc->pm_state != PMC_STATE_RUNNING)) nfree++; #endif - if (ps->ps_nsamples != PMC_SAMPLE_INUSE) - goto next; - if (ps->ps_td != td) - goto next; + if (ps->ps_td != td || + ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING || + ps->ps_pmc->pm_state != PMC_STATE_RUNNING) + continue; KASSERT(ps->ps_cpu == cpu, ("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__, @@ -4850,15 +4869,28 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) if (__predict_true(nsamples < pmc_callchaindepth - 1)) nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples, pmc_callchaindepth - nsamples - 1, tf); - wmb(); - ps->ps_nsamples = nsamples; + + /* + * We have to prevent hardclock from potentially overwriting + * this sample between when we read the value and when we set + * it + */ + spinlock_enter(); + /* + * Verify that the sample hasn't been dropped in the meantime + */ + if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { + ps->ps_nsamples = nsamples; + /* + * If we couldn't get a sample, simply drop the reference + */ + if (nsamples == 0) + counter_u64_add(pm->pm_runcount, -1); + } + spinlock_exit(); if (nrecords-- == 1) break; -next: - /* increment the pointer, modulo sample ring size */ - if (++ps == psb->ps_fence) - ps = psb->ps_samples; - } while (ps != ps_end); + } if (__predict_false(ring == PMC_UR && td->td_pmcpend)) { if (pass == 0) { pass = 1; @@ -4869,60 +4901,20 @@ next: } #ifdef INVARIANTS - if (ring == PMC_HR) - KASSERT(ncallchains > 0 || nfree > 0, - ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__, - cpu)); + if ((ticks - start_ticks) > hz) + log(LOG_ERR, "%s took %d ticks\n", __func__, (ticks - start_ticks)); #endif /* mark CPU as needing processing */ DPCPU_SET(pmc_sampled, 1); } - -static void -pmc_flush_ring(int cpu, int ring) -{ - struct pmc *pm; - struct pmc_sample *ps; - struct pmc_samplebuffer *psb; - int n; - - psb = pmc_pcpu[cpu]->pc_sb[ring]; - - for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ - - ps = psb->ps_read; - if (ps->ps_nsamples == PMC_SAMPLE_FREE) - goto next; - pm = ps->ps_pmc; - counter_u64_add(pm->pm_runcount, -1); - ps->ps_nsamples = PMC_SAMPLE_FREE; - /* increment read pointer, modulo sample size */ - next: - if (++ps == psb->ps_fence) - psb->ps_read = psb->ps_samples; - else - psb->ps_read = ps; - } -} - -void -pmc_flush_samples(int cpu) -{ - int n; - - for (n = 0; n < PMC_NUM_SR; n++) - pmc_flush_ring(cpu, n); -} - - /* * Process saved PC samples. */ static void -pmc_process_samples(int cpu, int ring) +pmc_process_samples(int cpu, ring_type_t ring) { struct pmc *pm; int adjri, n; @@ -4931,20 +4923,25 @@ pmc_process_samples(int cpu, int ring) struct pmc_sample *ps; struct pmc_classdep *pcd; struct pmc_samplebuffer *psb; + uint64_t delta; KASSERT(PCPU_GET(cpuid) == cpu, ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__, PCPU_GET(cpuid), cpu)); psb = pmc_pcpu[cpu]->pc_sb[ring]; + delta = psb->ps_prodidx - psb->ps_considx; + MPASS(delta <= pmc_nsamples); + MPASS(psb->ps_considx <= psb->ps_prodidx); + for (n = 0; psb->ps_considx < psb->ps_prodidx; psb->ps_considx++, n++) { + ps = PMC_CONS_SAMPLE(psb); - for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ - - ps = psb->ps_read; - if (ps->ps_nsamples == PMC_SAMPLE_FREE) - break; - + if (__predict_false(ps->ps_nsamples == PMC_SAMPLE_FREE)) + continue; pm = ps->ps_pmc; + /* skip non-running samples */ + if (pm->pm_state != PMC_STATE_RUNNING) + goto entrydone; KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, @@ -4956,12 +4953,19 @@ pmc_process_samples(int cpu, int ring) ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__, pm, PMC_TO_MODE(pm))); - /* Ignore PMCs that have been switched off */ - if (pm->pm_state != PMC_STATE_RUNNING) - goto entrydone; /* If there is a pending AST wait for completion */ - if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { + if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { + /* if sample is more than 65 ms old, drop it */ + if (ticks - ps->ps_ticks > (hz >> 4)) { + /* + * track how often we hit this as it will + * preferentially lose user samples + * for long running system calls + */ + counter_u64_add(pmc_stats.pm_overwrites, 1); + goto entrydone; + } /* Need a rescan at a later time. */ DPCPU_SET(pmc_sampled, 1); break; @@ -4969,8 +4973,8 @@ pmc_process_samples(int cpu, int ring) PMCDBG6(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu, pm, ps->ps_nsamples, ps->ps_flags, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); /* * If this is a process-mode PMC that is attached to @@ -4993,13 +4997,11 @@ pmc_process_samples(int cpu, int ring) entrydone: ps->ps_nsamples = 0; /* mark entry as free */ - counter_u64_add(pm->pm_runcount, -1); + KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); - /* increment read pointer, modulo sample size */ - if (++ps == psb->ps_fence) - psb->ps_read = psb->ps_samples; - else - psb->ps_read = ps; + counter_u64_add(pm->pm_runcount, -1); } counter_u64_add(pmc_stats.pm_log_sweeps, 1); @@ -5182,11 +5184,11 @@ pmc_process_exit(void *arg __unused, struct proc *p) } } - counter_u64_add(pm->pm_runcount, -1); - - KASSERT((int) counter_u64_fetch(pm->pm_runcount) >= 0, + KASSERT((int64_t) counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] runcount is %d", __LINE__, ri)); + counter_u64_add(pm->pm_runcount, -1); + (void) pcd->pcd_config_pmc(cpu, adjri, NULL); } @@ -5568,6 +5570,7 @@ pmc_initialize(void) "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } + pmc_sample_mask = pmc_nsamples-1; if (pmc_callchaindepth <= 0 || pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { @@ -5643,8 +5646,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); @@ -5661,8 +5662,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); @@ -5679,8 +5678,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); diff --git a/sys/dev/ichiic/ig4_iic.c b/sys/dev/ichiic/ig4_iic.c index 237e24c10706..11beb743ed4b 100644 --- a/sys/dev/ichiic/ig4_iic.c +++ b/sys/dev/ichiic/ig4_iic.c @@ -729,9 +729,9 @@ ig4iic_intr(void *cookie) * Workaround to trigger pending interrupt if IG4_REG_INTR_STAT * is changed after clearing it */ - if(sc->access_intr_mask) { + if (sc->access_intr_mask != 0) { status = reg_read(sc, IG4_REG_INTR_MASK); - if(status) { + if (status != 0) { reg_write(sc, IG4_REG_INTR_MASK, 0); reg_write(sc, IG4_REG_INTR_MASK, status); } diff --git a/sys/dev/ichiic/ig4_pci.c b/sys/dev/ichiic/ig4_pci.c index eed7d651bfde..ae73f3b2abca 100644 --- a/sys/dev/ichiic/ig4_pci.c +++ b/sys/dev/ichiic/ig4_pci.c @@ -112,8 +112,8 @@ static struct ig4iic_pci_device ig4iic_pci_devices[] = { { PCI_CHIP_SKYLAKE_I2C_3, "Intel Sunrise Point-LP I2C Controller-3", IG4_SKYLAKE}, { PCI_CHIP_SKYLAKE_I2C_4, "Intel Sunrise Point-LP I2C Controller-4", IG4_SKYLAKE}, { PCI_CHIP_SKYLAKE_I2C_5, "Intel Sunrise Point-LP I2C Controller-5", IG4_SKYLAKE}, - { PCI_CHIP_KABYLAKE_I2C_0, "Intel Sunrise Point-LP I2C Controller-0", IG4_SKYLAKE}, - { PCI_CHIP_KABYLAKE_I2C_1, "Intel Sunrise Point-LP I2C Controller-1", IG4_SKYLAKE}, + { PCI_CHIP_KABYLAKE_I2C_0, "Intel Sunrise Point-H I2C Controller-0", IG4_SKYLAKE}, + { PCI_CHIP_KABYLAKE_I2C_1, "Intel Sunrise Point-H I2C Controller-1", IG4_SKYLAKE}, { PCI_CHIP_APL_I2C_0, "Intel Apollo Lake I2C Controller-0", IG4_APL}, { PCI_CHIP_APL_I2C_1, "Intel Apollo Lake I2C Controller-1", IG4_APL}, { PCI_CHIP_APL_I2C_2, "Intel Apollo Lake I2C Controller-2", IG4_APL}, diff --git a/sys/dev/ida/ida_pci.c b/sys/dev/ida/ida_pci.c index 63a18ecf0259..3911a70c0b46 100644 --- a/sys/dev/ida/ida_pci.c +++ b/sys/dev/ida/ida_pci.c @@ -306,3 +306,5 @@ ida_pci_attach(device_t dev) } DRIVER_MODULE(ida, pci, ida_pci_driver, ida_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ida, board_id, + nitems(board_id) - 1); diff --git a/sys/dev/intpm/intpm.c b/sys/dev/intpm/intpm.c index 4b4cf9ab10c8..15da5e861a07 100644 --- a/sys/dev/intpm/intpm.c +++ b/sys/dev/intpm/intpm.c @@ -896,4 +896,4 @@ DRIVER_MODULE(smbus, intsmb, smbus_driver, smbus_devclass, 0, 0); MODULE_DEPEND(intsmb, smbus, SMBUS_MINVER, SMBUS_PREFVER, SMBUS_MAXVER); MODULE_VERSION(intsmb, 1); MODULE_PNP_INFO("W32:vendor/device;D:#", pci, intpm, intsmb_products, - sizeof(intsmb_products[0]), nitems(intsmb_products)); + nitems(intsmb_products)); diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c index 744c26dfafdc..b03de58fcb55 100644 --- a/sys/dev/ioat/ioat.c +++ b/sys/dev/ioat/ioat.c @@ -241,7 +241,7 @@ static struct _pcsid }; MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ioat, pci_ids, - sizeof(pci_ids[0]), nitems(pci_ids)); + nitems(pci_ids)); /* * OS <-> Driver linkage functions diff --git a/sys/dev/ipw/if_ipw.c b/sys/dev/ipw/if_ipw.c index 1da31934f738..e99d5aedc714 100644 --- a/sys/dev/ipw/if_ipw.c +++ b/sys/dev/ipw/if_ipw.c @@ -203,7 +203,7 @@ static devclass_t ipw_devclass; DRIVER_MODULE(ipw, pci, ipw_driver, ipw_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ipw, ipw_ident_table, - sizeof(ipw_ident_table[0]), nitems(ipw_ident_table) - 1); + nitems(ipw_ident_table) - 1); MODULE_VERSION(ipw, 1); diff --git a/sys/dev/iwm/if_iwm.c b/sys/dev/iwm/if_iwm.c index 9daa85257ac9..4e08dacf8c4f 100644 --- a/sys/dev/iwm/if_iwm.c +++ b/sys/dev/iwm/if_iwm.c @@ -6460,6 +6460,8 @@ static driver_t iwm_pci_driver = { static devclass_t iwm_devclass; DRIVER_MODULE(iwm, pci, iwm_pci_driver, iwm_devclass, NULL, NULL); +MODULE_PNP_INFO("U16:device;P:#;T:vendor=0x8086", pci, iwm_pci_driver, + iwm_devices, nitems(iwm_devices)); MODULE_DEPEND(iwm, firmware, 1, 1, 1); MODULE_DEPEND(iwm, pci, 1, 1, 1); MODULE_DEPEND(iwm, wlan, 1, 1, 1); diff --git a/sys/dev/iwn/if_iwn.c b/sys/dev/iwn/if_iwn.c index e46082c0b3b8..3e79f4b400c0 100644 --- a/sys/dev/iwn/if_iwn.c +++ b/sys/dev/iwn/if_iwn.c @@ -372,7 +372,8 @@ static driver_t iwn_driver = { static devclass_t iwn_devclass; DRIVER_MODULE(iwn, pci, iwn_driver, iwn_devclass, NULL, NULL); - +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, iwn, iwn_ident_table, + nitems(iwn_ident_table) - 1); MODULE_VERSION(iwn, 1); MODULE_DEPEND(iwn, firmware, 1, 1, 1); diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index cca610664065..44843ff3fc98 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -238,7 +238,7 @@ static driver_t ix_driver = { devclass_t ix_devclass; DRIVER_MODULE(ix, pci, ix_driver, ix_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ix, ixgbe_vendor_info_array, - sizeof(ixgbe_vendor_info_array[0]), nitems(ixgbe_vendor_info_array) - 1); + nitems(ixgbe_vendor_info_array) - 1); MODULE_DEPEND(ix, pci, 1, 1, 1); MODULE_DEPEND(ix, ether, 1, 1, 1); diff --git a/sys/dev/ixgbe/if_ixv.c b/sys/dev/ixgbe/if_ixv.c index a6a3465b60d7..91f2f8ef755e 100644 --- a/sys/dev/ixgbe/if_ixv.c +++ b/sys/dev/ixgbe/if_ixv.c @@ -144,7 +144,7 @@ static driver_t ixv_driver = { devclass_t ixv_devclass; DRIVER_MODULE(ixv, pci, ixv_driver, ixv_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ixv, ixv_vendor_info_array, - sizeof(ixv_vendor_info_array[0]), nitems(ixv_vendor_info_array) - 1); + nitems(ixv_vendor_info_array) - 1); MODULE_DEPEND(ixv, pci, 1, 1, 1); MODULE_DEPEND(ixv, ether, 1, 1, 1); #ifdef DEV_NETMAP diff --git a/sys/dev/ixl/if_ixl.c b/sys/dev/ixl/if_ixl.c index 6d83545ae941..23c9d13c695d 100644 --- a/sys/dev/ixl/if_ixl.c +++ b/sys/dev/ixl/if_ixl.c @@ -150,6 +150,7 @@ static driver_t ixl_driver = { devclass_t ixl_devclass; DRIVER_MODULE(ixl, pci, ixl_driver, ixl_devclass, 0, 0); +IFLIB_PNP_INFO(pci, ixl, ixl_vendor_info_array); MODULE_VERSION(ixl, 3); MODULE_DEPEND(ixl, pci, 1, 1, 1); diff --git a/sys/dev/ixl/if_ixlv.c b/sys/dev/ixl/if_ixlv.c index f0a91b761f87..dd72d52ab187 100644 --- a/sys/dev/ixl/if_ixlv.c +++ b/sys/dev/ixl/if_ixlv.c @@ -149,7 +149,9 @@ static driver_t ixlv_driver = { devclass_t ixlv_devclass; DRIVER_MODULE(ixlv, pci, ixlv_driver, ixlv_devclass, 0, 0); - +MODULE_PNP_INFO("U32:vendor;U32:device;U32:subvendor;U32:subdevice;U32:revision", + pci, ixlv, ixlv_vendor_info_array, + nitems(ixlv_vendor_info_array) - 1); MODULE_DEPEND(ixlv, pci, 1, 1, 1); MODULE_DEPEND(ixlv, ether, 1, 1, 1); MODULE_DEPEND(ixlv, iflib, 1, 1, 1); diff --git a/sys/dev/mfi/mfi_pci.c b/sys/dev/mfi/mfi_pci.c index d63c5ae6435f..ee609b328990 100644 --- a/sys/dev/mfi/mfi_pci.c +++ b/sys/dev/mfi/mfi_pci.c @@ -106,8 +106,6 @@ static driver_t mfi_pci_driver = { }; static devclass_t mfi_devclass; -DRIVER_MODULE(mfi, pci, mfi_pci_driver, mfi_devclass, 0, 0); -MODULE_VERSION(mfi, 1); static int mfi_msi = 1; SYSCTL_INT(_hw_mfi, OID_AUTO, msi, CTLFLAG_RDTUN, &mfi_msi, 0, @@ -159,6 +157,11 @@ struct mfi_ident { {0, 0, 0, 0, 0, NULL} }; +DRIVER_MODULE(mfi, pci, mfi_pci_driver, mfi_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice", pci, mfi, + mfi_identifiers, nitems(mfi_identifiers) - 1); +MODULE_VERSION(mfi, 1); + static struct mfi_ident * mfi_find_ident(device_t dev) { diff --git a/sys/dev/mpr/mpr_pci.c b/sys/dev/mpr/mpr_pci.c index 03f3143040c2..e86bd9e102ba 100644 --- a/sys/dev/mpr/mpr_pci.c +++ b/sys/dev/mpr/mpr_pci.c @@ -88,9 +88,6 @@ static driver_t mpr_pci_driver = { sizeof(struct mpr_softc) }; -static devclass_t mpr_devclass; -DRIVER_MODULE(mpr, pci, mpr_pci_driver, mpr_devclass, 0, 0); -MODULE_DEPEND(mpr, cam, 1, 1, 1); struct mpr_ident { uint16_t vendor; @@ -154,6 +151,14 @@ struct mpr_ident { { 0, 0, 0, 0, 0, NULL } }; + +static devclass_t mpr_devclass; +DRIVER_MODULE(mpr, pci, mpr_pci_driver, mpr_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice;D:#", pci, + mpr, mpr_identifiers, nitems(mpr_identifiers) - 1); + +MODULE_DEPEND(mpr, cam, 1, 1, 1); + static struct mpr_ident * mpr_find_ident(device_t dev) { diff --git a/sys/dev/mps/mps_pci.c b/sys/dev/mps/mps_pci.c index a81fe8a7300f..1e12c9a6906a 100644 --- a/sys/dev/mps/mps_pci.c +++ b/sys/dev/mps/mps_pci.c @@ -88,10 +88,6 @@ static driver_t mps_pci_driver = { sizeof(struct mps_softc) }; -static devclass_t mps_devclass; -DRIVER_MODULE(mps, pci, mps_pci_driver, mps_devclass, 0, 0); -MODULE_DEPEND(mps, cam, 1, 1, 1); - struct mps_ident { uint16_t vendor; uint16_t device; @@ -147,6 +143,10 @@ struct mps_ident { { 0, 0, 0, 0, 0, NULL } }; +static devclass_t mps_devclass; +DRIVER_MODULE(mps, pci, mps_pci_driver, mps_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice", pci, mps, + mps_identifiers, nitems(mps_identifiers) - 1); static struct mps_ident * mps_find_ident(device_t dev) { diff --git a/sys/dev/mvs/mvs_pci.c b/sys/dev/mvs/mvs_pci.c index cd6afce40f5c..4774390b272c 100644 --- a/sys/dev/mvs/mvs_pci.c +++ b/sys/dev/mvs/mvs_pci.c @@ -521,6 +521,8 @@ static driver_t mvs_driver = { sizeof(struct mvs_controller) }; DRIVER_MODULE(mvs, pci, mvs_driver, mvs_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device", pci, mvs, mvs_ids, + nitems(mvs_ids) - 1); MODULE_VERSION(mvs, 1); MODULE_DEPEND(mvs, cam, 1, 1, 1); diff --git a/sys/dev/my/if_my.c b/sys/dev/my/if_my.c index b1d5e3586c9d..ef5ef7613b99 100644 --- a/sys/dev/my/if_my.c +++ b/sys/dev/my/if_my.c @@ -163,6 +163,8 @@ static driver_t my_driver = { static devclass_t my_devclass; DRIVER_MODULE(my, pci, my_driver, my_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, my, my_devs, + nitems(my_devs) - 1); MODULE_DEPEND(my, pci, 1, 1, 1); MODULE_DEPEND(my, ether, 1, 1, 1); diff --git a/sys/dev/ncr/ncr.c b/sys/dev/ncr/ncr.c index 5cfb413731c2..58bc2a6af01c 100644 --- a/sys/dev/ncr/ncr.c +++ b/sys/dev/ncr/ncr.c @@ -7109,7 +7109,7 @@ static devclass_t ncr_devclass; DRIVER_MODULE(ncr, pci, ncr_driver, ncr_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device;U16:#;D:#", pci, ncr, ncr_chip_table, - sizeof(ncr_chip_table[0]), nitems(ncr_chip_table)); + nitems(ncr_chip_table)); MODULE_DEPEND(ncr, cam, 1, 1, 1); MODULE_DEPEND(ncr, pci, 1, 1, 1); diff --git a/sys/dev/ntb/ntb_hw/ntb_hw_intel.c b/sys/dev/ntb/ntb_hw/ntb_hw_intel.c index 3bb57fcb71d0..d61f664d8cff 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw_intel.c +++ b/sys/dev/ntb/ntb_hw/ntb_hw_intel.c @@ -3120,4 +3120,4 @@ DRIVER_MODULE(ntb_hw_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL); MODULE_DEPEND(ntb_hw_intel, ntb, 1, 1, 1); MODULE_VERSION(ntb_hw_intel, 1); MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ntb_hw_intel, pci_ids, - sizeof(pci_ids[0]), nitems(pci_ids)); + nitems(pci_ids)); diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c index 23e32a59de49..0b7aa4da2372 100644 --- a/sys/dev/oce/oce_if.c +++ b/sys/dev/oce/oce_if.c @@ -214,12 +214,6 @@ static driver_t oce_driver = { static devclass_t oce_devclass; -DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); -MODULE_DEPEND(oce, pci, 1, 1, 1); -MODULE_DEPEND(oce, ether, 1, 1, 1); -MODULE_VERSION(oce, 1); - - /* global vars */ const char component_revision[32] = {"///" COMPONENT_REVISION "///"}; @@ -242,6 +236,15 @@ static uint32_t supportedDevices[] = { (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_SH }; + +DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device", pci, oce, supportedDevices, + nitems(supportedDevices)); +MODULE_DEPEND(oce, pci, 1, 1, 1); +MODULE_DEPEND(oce, ether, 1, 1, 1); +MODULE_VERSION(oce, 1); + + POCE_SOFTC softc_head = NULL; POCE_SOFTC softc_tail = NULL; diff --git a/sys/dev/ofw/ofw_bus_subr.h b/sys/dev/ofw/ofw_bus_subr.h index 537856b97ef7..edc90ed7bf3c 100644 --- a/sys/dev/ofw/ofw_bus_subr.h +++ b/sys/dev/ofw/ofw_bus_subr.h @@ -67,7 +67,7 @@ struct intr_map_data_fdt { #define SIMPLEBUS_PNP_DESCR "Z:compat;P:#;" #define SIMPLEBUS_PNP_INFO(t) \ - MODULE_PNP_INFO(SIMPLEBUS_PNP_DESCR, simplebus, t, t, sizeof(t[0]), sizeof(t) / sizeof(t[0])); + MODULE_PNP_INFO(SIMPLEBUS_PNP_DESCR, simplebus, t, t, sizeof(t) / sizeof(t[0])); /* Generic implementation of ofw_bus_if.m methods and helper routines */ int ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *, phandle_t); diff --git a/sys/dev/pccard/pccardvar.h b/sys/dev/pccard/pccardvar.h index 5138c243f5b7..87f41850ec5c 100644 --- a/sys/dev/pccard/pccardvar.h +++ b/sys/dev/pccard/pccardvar.h @@ -102,7 +102,7 @@ struct pccard_product { */ #define PCCARD_PNP_DESCR "D:#;V32:manufacturer;V32:product;Z:cisvendor;Z:cisproduct;" #define PCCARD_PNP_INFO(t) \ - MODULE_PNP_INFO(PCCARD_PNP_DESCR, pccard, t, t, sizeof(t[0]), nitems(t) - 1); \ + MODULE_PNP_INFO(PCCARD_PNP_DESCR, pccard, t, t, nitems(t) - 1) typedef int (*pccard_product_match_fn) (device_t dev, const struct pccard_product *ent, int vpfmatch); diff --git a/sys/dev/pccbb/pccbb_pci.c b/sys/dev/pccbb/pccbb_pci.c index bbffb8b53200..f2f6387c8aaa 100644 --- a/sys/dev/pccbb/pccbb_pci.c +++ b/sys/dev/pccbb/pccbb_pci.c @@ -983,4 +983,6 @@ static driver_t cbb_driver = { }; DRIVER_MODULE(cbb, pci, cbb_driver, cbb_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, cbb, yc_chipsets, + nitems(yc_chipsets) - 1); MODULE_DEPEND(cbb, exca, 1, 1, 1); diff --git a/sys/dev/pci/pci_user.c b/sys/dev/pci/pci_user.c index 4e40a8167362..380beff0d310 100644 --- a/sys/dev/pci/pci_user.c +++ b/sys/dev/pci/pci_user.c @@ -66,6 +66,49 @@ __FBSDID("$FreeBSD$"); #include "pcib_if.h" #include "pci_if.h" +#ifdef COMPAT_FREEBSD32 +struct pci_conf32 { + struct pcisel pc_sel; /* domain+bus+slot+function */ + u_int8_t pc_hdr; /* PCI header type */ + u_int16_t pc_subvendor; /* card vendor ID */ + u_int16_t pc_subdevice; /* card device ID, assigned by + card vendor */ + u_int16_t pc_vendor; /* chip vendor ID */ + u_int16_t pc_device; /* chip device ID, assigned by + chip vendor */ + u_int8_t pc_class; /* chip PCI class */ + u_int8_t pc_subclass; /* chip PCI subclass */ + u_int8_t pc_progif; /* chip PCI programming interface */ + u_int8_t pc_revid; /* chip revision ID */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_int32_t pd_unit; /* device unit number */ +}; + +struct pci_match_conf32 { + struct pcisel pc_sel; /* domain+bus+slot+function */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_int32_t pd_unit; /* Unit number */ + u_int16_t pc_vendor; /* PCI Vendor ID */ + u_int16_t pc_device; /* PCI Device ID */ + u_int8_t pc_class; /* PCI class */ + u_int32_t flags; /* Matching expression */ +}; + +struct pci_conf_io32 { + u_int32_t pat_buf_len; /* pattern buffer length */ + u_int32_t num_patterns; /* number of patterns */ + u_int32_t patterns; /* struct pci_match_conf ptr */ + u_int32_t match_buf_len; /* match buffer length */ + u_int32_t num_matches; /* number of matches returned */ + u_int32_t matches; /* struct pci_conf ptr */ + u_int32_t offset; /* offset into device list */ + u_int32_t generation; /* device list generation */ + u_int32_t status; /* request status */ +}; + +#define PCIOCGETCONF32 _IOC_NEWTYPE(PCIOCGETCONF, struct pci_conf_io32) +#endif + /* * This is the user interface to PCI configuration space. */ @@ -175,6 +218,73 @@ pci_conf_match_native(struct pci_match_conf *matches, int num_matches, return(1); } +#ifdef COMPAT_FREEBSD32 +static int +pci_conf_match32(struct pci_match_conf32 *matches, int num_matches, + struct pci_conf *match_buf) +{ + int i; + + if ((matches == NULL) || (match_buf == NULL) || (num_matches <= 0)) + return(1); + + for (i = 0; i < num_matches; i++) { + /* + * I'm not sure why someone would do this...but... + */ + if (matches[i].flags == PCI_GETCONF_NO_MATCH) + continue; + + /* + * Look at each of the match flags. If it's set, do the + * comparison. If the comparison fails, we don't have a + * match, go on to the next item if there is one. + */ + if (((matches[i].flags & PCI_GETCONF_MATCH_DOMAIN) != 0) + && (match_buf->pc_sel.pc_domain != + matches[i].pc_sel.pc_domain)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_BUS) != 0) + && (match_buf->pc_sel.pc_bus != matches[i].pc_sel.pc_bus)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_DEV) != 0) + && (match_buf->pc_sel.pc_dev != matches[i].pc_sel.pc_dev)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_FUNC) != 0) + && (match_buf->pc_sel.pc_func != matches[i].pc_sel.pc_func)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_VENDOR) != 0) + && (match_buf->pc_vendor != matches[i].pc_vendor)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_DEVICE) != 0) + && (match_buf->pc_device != matches[i].pc_device)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_CLASS) != 0) + && (match_buf->pc_class != matches[i].pc_class)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_UNIT) != 0) + && (match_buf->pd_unit != matches[i].pd_unit)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_NAME) != 0) + && (strncmp(matches[i].pd_name, match_buf->pd_name, + sizeof(match_buf->pd_name)) != 0)) + continue; + + return(0); + } + + return(1); +} +#endif /* COMPAT_FREEBSD32 */ + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) #define PRE7_COMPAT @@ -259,20 +369,6 @@ struct pci_match_conf_old32 { pci_getconf_flags_old flags; /* Matching expression */ }; -struct pci_conf_io32 { - uint32_t pat_buf_len; /* pattern buffer length */ - uint32_t num_patterns; /* number of patterns */ - uint32_t patterns; /* pattern buffer - (struct pci_match_conf_old32 *) */ - uint32_t match_buf_len; /* match buffer length */ - uint32_t num_matches; /* number of matches returned */ - uint32_t matches; /* match buffer - (struct pci_conf_old32 *) */ - uint32_t offset; /* offset into device list */ - uint32_t generation; /* device list generation */ - pci_getconf_status status; /* request status */ -}; - #define PCIOCGETCONF_OLD32 _IOWR('p', 1, struct pci_conf_io32) #endif /* COMPAT_FREEBSD32 */ @@ -411,6 +507,9 @@ pci_conf_match_old32(struct pci_match_conf_old32 *matches, int num_matches, union pci_conf_union { struct pci_conf pc; +#ifdef COMPAT_FREEBSD32 + struct pci_conf32 pc32; +#endif #ifdef PRE7_COMPAT struct pci_conf_old pco; #ifdef COMPAT_FREEBSD32 @@ -428,6 +527,11 @@ pci_conf_match(u_long cmd, struct pci_match_conf *matches, int num_matches, case PCIOCGETCONF: return (pci_conf_match_native( (struct pci_match_conf *)matches, num_matches, match_buf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (pci_conf_match32((struct pci_match_conf32 *)matches, + num_matches, match_buf)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (pci_conf_match_old( @@ -544,6 +648,10 @@ pci_match_conf_size(u_long cmd) switch (cmd) { case PCIOCGETCONF: return (sizeof(struct pci_match_conf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (sizeof(struct pci_match_conf32)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (sizeof(struct pci_match_conf_old)); @@ -565,6 +673,10 @@ pci_conf_size(u_long cmd) switch (cmd) { case PCIOCGETCONF: return (sizeof(struct pci_conf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (sizeof(struct pci_conf32)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (sizeof(struct pci_conf_old)); @@ -582,7 +694,7 @@ pci_conf_size(u_long cmd) static void pci_conf_io_init(struct pci_conf_io *cio, caddr_t data, u_long cmd) { -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#if defined(COMPAT_FREEBSD32) struct pci_conf_io32 *cio32; #endif @@ -594,8 +706,11 @@ pci_conf_io_init(struct pci_conf_io *cio, caddr_t data, u_long cmd) *cio = *(struct pci_conf_io *)data; return; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: +#ifdef PRE7_COMPAT case PCIOCGETCONF_OLD32: +#endif cio32 = (struct pci_conf_io32 *)data; cio->pat_buf_len = cio32->pat_buf_len; cio->num_patterns = cio32->num_patterns; @@ -620,7 +735,7 @@ pci_conf_io_update_data(const struct pci_conf_io *cio, caddr_t data, u_long cmd) { struct pci_conf_io *d_cio; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#if defined(COMPAT_FREEBSD32) struct pci_conf_io32 *cio32; #endif @@ -636,8 +751,11 @@ pci_conf_io_update_data(const struct pci_conf_io *cio, caddr_t data, d_cio->num_matches = cio->num_matches; return; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: +#ifdef PRE7_COMPAT case PCIOCGETCONF_OLD32: +#endif cio32 = (struct pci_conf_io32 *)data; cio32->status = cio->status; @@ -665,6 +783,24 @@ pci_conf_for_copyout(const struct pci_conf *pcp, union pci_conf_union *pcup, pcup->pc = *pcp; return; +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + pcup->pc32.pc_sel = pcp->pc_sel; + pcup->pc32.pc_hdr = pcp->pc_hdr; + pcup->pc32.pc_subvendor = pcp->pc_subvendor; + pcup->pc32.pc_subdevice = pcp->pc_subdevice; + pcup->pc32.pc_vendor = pcp->pc_vendor; + pcup->pc32.pc_device = pcp->pc_device; + pcup->pc32.pc_class = pcp->pc_class; + pcup->pc32.pc_subclass = pcp->pc_subclass; + pcup->pc32.pc_progif = pcp->pc_progif; + pcup->pc32.pc_revid = pcp->pc_revid; + strlcpy(pcup->pc32.pd_name, pcp->pd_name, + sizeof(pcup->pc32.pd_name)); + pcup->pc32.pd_unit = (uint32_t)pcp->pd_unit; + return; +#endif + #ifdef PRE7_COMPAT #ifdef COMPAT_FREEBSD32 case PCIOCGETCONF_OLD32: diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h index 00589c4b83da..edec95c8e67f 100644 --- a/sys/dev/pci/pcireg.h +++ b/sys/dev/pci/pcireg.h @@ -122,6 +122,9 @@ #define PCIM_MFDEV 0x80 #define PCIR_BIST 0x0f +/* PCI Spec rev 2.2: 0FFFFh is an invalid value for Vendor ID. */ +#define PCIV_INVALID 0xffff + /* Capability Register Offsets */ #define PCICAP_ID 0x0 diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h index c5a8afb4ed23..376fb96594ce 100644 --- a/sys/dev/pci/pcivar.h +++ b/sys/dev/pci/pcivar.h @@ -311,7 +311,7 @@ struct pci_device_table { "M16:mask;U16:vendor;U16:device;U16:subvendor;U16:subdevice;" \ "U16:class;U16:subclass;U16:revid;" #define PCI_PNP_INFO(table) \ - MODULE_PNP_INFO(PCI_PNP_STR, pci, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(PCI_PNP_STR, pci, table, table, \ sizeof(table) / sizeof(table[0])) const struct pci_device_table *pci_match_device(device_t child, diff --git a/sys/dev/pcn/if_pcn.c b/sys/dev/pcn/if_pcn.c index 6073310b2ac9..0e91df03ea18 100644 --- a/sys/dev/pcn/if_pcn.c +++ b/sys/dev/pcn/if_pcn.c @@ -193,6 +193,8 @@ static driver_t pcn_driver = { static devclass_t pcn_devclass; DRIVER_MODULE(pcn, pci, pcn_driver, pcn_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, pcn, pcn_devs, + nitems(pcn_devs) - 1); DRIVER_MODULE(miibus, pcn, miibus_driver, miibus_devclass, 0, 0); #define PCN_CSR_SETBIT(sc, reg, x) \ diff --git a/sys/dev/puc/puc_pci.c b/sys/dev/puc/puc_pci.c index f0c6aa81dde4..012a16dc9e9b 100644 --- a/sys/dev/puc/puc_pci.c +++ b/sys/dev/puc/puc_pci.c @@ -200,4 +200,4 @@ static driver_t puc_pci_driver = { DRIVER_MODULE(puc, pci, puc_pci_driver, puc_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;U16:#;U16:#;D:#", pci, puc, - puc_pci_devices, sizeof(puc_pci_devices[0]), nitems(puc_pci_devices) - 1); + puc_pci_devices, nitems(puc_pci_devices) - 1); diff --git a/sys/dev/ral/if_ral_pci.c b/sys/dev/ral/if_ral_pci.c index 6d9de66e53c8..41e81d573536 100644 --- a/sys/dev/ral/if_ral_pci.c +++ b/sys/dev/ral/if_ral_pci.c @@ -178,6 +178,8 @@ static driver_t ral_pci_driver = { static devclass_t ral_devclass; DRIVER_MODULE(ral, pci, ral_pci_driver, ral_devclass, NULL, NULL); +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ral, ral_pci_ids, + nitems(ral_pci_ids) - 1); static int ral_pci_probe(device_t dev) diff --git a/sys/dev/rl/if_rl.c b/sys/dev/rl/if_rl.c index 7e8c63131607..e5d4787ebbe9 100644 --- a/sys/dev/rl/if_rl.c +++ b/sys/dev/rl/if_rl.c @@ -259,6 +259,8 @@ static driver_t rl_driver = { static devclass_t rl_devclass; DRIVER_MODULE(rl, pci, rl_driver, rl_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, rl, rl_devs, + nitems(rl_devs) - 1); DRIVER_MODULE(rl, cardbus, rl_driver, rl_devclass, 0, 0); DRIVER_MODULE(miibus, rl, miibus_driver, miibus_devclass, 0, 0); diff --git a/sys/dev/sdhci/sdhci_acpi.c b/sys/dev/sdhci/sdhci_acpi.c index 844be21d64bc..c202ba054cfc 100644 --- a/sys/dev/sdhci/sdhci_acpi.c +++ b/sys/dev/sdhci/sdhci_acpi.c @@ -79,6 +79,8 @@ static const struct sdhci_acpi_device { SDHCI_QUIRK_MMC_DDR52 | SDHCI_QUIRK_CAPS_BIT63_FOR_MMC_HS400 | SDHCI_QUIRK_PRESET_VALUE_BROKEN }, + { "AMDI0040", 0, "AMD eMMC 5.0 Controller", + SDHCI_QUIRK_32BIT_DMA_SIZE }, { NULL, 0, NULL, 0} }; @@ -87,6 +89,7 @@ static char *sdhci_ids[] = { "80860F16", "80865ACA", "80865ACC", + "AMDI0040", NULL }; diff --git a/sys/dev/spibus/spi.h b/sys/dev/spibus/spi.h index 1a9c1496a77d..0c12929bee15 100644 --- a/sys/dev/spibus/spi.h +++ b/sys/dev/spibus/spi.h @@ -43,4 +43,4 @@ struct spi_command { #define SPIBUS_PNP_DESCR "Z:compat;P:#;" #define SPIBUS_PNP_INFO(t) \ - MODULE_PNP_INFO(SPIBUS_PNP_DESCR, spibus, t, t, sizeof(t[0]), sizeof(t) / sizeof(t[0])); + MODULE_PNP_INFO(SPIBUS_PNP_DESCR, spibus, t, t, sizeof(t) / sizeof(t[0])); diff --git a/sys/dev/uart/uart_bus_pccard.c b/sys/dev/uart/uart_bus_pccard.c index 3a8446a871f6..8b672ebeadb0 100644 --- a/sys/dev/uart/uart_bus_pccard.c +++ b/sys/dev/uart/uart_bus_pccard.c @@ -103,4 +103,4 @@ uart_pccard_attach(device_t dev) DRIVER_MODULE(uart, pccard, uart_pccard_driver, uart_devclass, 0, 0); MODULE_PNP_INFO("U32:function_type;", pccard, uart, &uart_pccard_function, - sizeof(uart_pccard_function), 1); + 1); diff --git a/sys/dev/uart/uart_bus_pci.c b/sys/dev/uart/uart_bus_pci.c index f4be108ab7b2..15b4472a37b2 100644 --- a/sys/dev/uart/uart_bus_pci.c +++ b/sys/dev/uart/uart_bus_pci.c @@ -125,6 +125,7 @@ static const struct pci_id pci_ns8250_ids[] = { 128 * DEFAULT_RCLK, 2}, { 0x14e4, 0x4344, 0xffff, 0, "Sony Ericsson GC89 PC Card", 0x10}, { 0x151f, 0x0000, 0xffff, 0, "TOPIC Semiconductor TP560 56k modem", 0x10 }, +{ 0x1d0f, 0x8250, 0x1d0f, 0, "Amazon PCI serial device", 0x10 }, { 0x1fd4, 0x1999, 0x1fd4, 0x0001, "Sunix SER5xxxx Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x8086, 0x0f0a, 0xffff, 0, "Intel ValleyView LPIO1 HSUART#1", 0x10, diff --git a/sys/dev/usb/net/if_ure.c b/sys/dev/usb/net/if_ure.c index 2bf0609697ed..24ce36b64a62 100644 --- a/sys/dev/usb/net/if_ure.c +++ b/sys/dev/usb/net/if_ure.c @@ -169,6 +169,7 @@ MODULE_DEPEND(ure, usb, 1, 1, 1); MODULE_DEPEND(ure, ether, 1, 1, 1); MODULE_DEPEND(ure, miibus, 1, 1, 1); MODULE_VERSION(ure, 1); +USB_PNP_HOST_INFO(ure_devs); static const struct usb_ether_methods ure_ue_methods = { .ue_attach_post = ure_attach_post, diff --git a/sys/dev/usb/usbdi.h b/sys/dev/usb/usbdi.h index 147b5d5e71b8..d5648c0301ea 100644 --- a/sys/dev/usb/usbdi.h +++ b/sys/dev/usb/usbdi.h @@ -342,13 +342,13 @@ struct usb_device_id { #define USB_STD_PNP_HOST_INFO USB_STD_PNP_INFO "T:mode=host;" #define USB_STD_PNP_DEVICE_INFO USB_STD_PNP_INFO "T:mode=device;" #define USB_PNP_HOST_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) #define USB_PNP_DEVICE_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) #define USB_PNP_DUAL_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) /* check that the size of the structure above is correct */ diff --git a/sys/dev/xl/if_xl.c b/sys/dev/xl/if_xl.c index 0afe62a522de..29d323399894 100644 --- a/sys/dev/xl/if_xl.c +++ b/sys/dev/xl/if_xl.c @@ -334,7 +334,7 @@ static devclass_t xl_devclass; DRIVER_MODULE_ORDERED(xl, pci, xl_driver, xl_devclass, NULL, NULL, SI_ORDER_ANY); DRIVER_MODULE(miibus, xl, miibus_driver, miibus_devclass, NULL, NULL); -MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, xl, xl_devs, sizeof(xl_devs[0]), +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, xl, xl_devs, nitems(xl_devs) - 1); static void diff --git a/sys/geom/raid/tr_raid0.c b/sys/geom/raid/tr_raid0.c index 78fa1b920f8a..33a802103f0c 100644 --- a/sys/geom/raid/tr_raid0.c +++ b/sys/geom/raid/tr_raid0.c @@ -323,7 +323,7 @@ g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr, pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; - g_raid_iodone(pbp, bp->bio_error); + g_raid_iodone(pbp, pbp->bio_error); } } diff --git a/sys/i386/i386/npx.c b/sys/i386/i386/npx.c index a2edacec987e..665b55b1c35b 100644 --- a/sys/i386/i386/npx.c +++ b/sys/i386/i386/npx.c @@ -67,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include <machine/specialreg.h> #include <machine/segments.h> #include <machine/ucontext.h> +#include <x86/ifunc.h> #include <machine/intr_machdep.h> @@ -183,7 +184,6 @@ CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savexmm, sv_pad) && static void fpu_clean_state(void); -static void fpusave(union savefpu *); static void fpurstor(union savefpu *); int hw_float; @@ -206,8 +206,6 @@ struct xsave_area_elm_descr { u_int size; } *xsave_area_desc; -static int use_xsaveopt; - static volatile u_int npx_traps_while_probing; alias_for_inthand_t probetrap; @@ -314,6 +312,69 @@ cleanup: return (hw_float); } +static void +npxsave_xsaveopt(union savefpu *addr) +{ + + xsaveopt((char *)addr, xsave_mask); +} + +static void +fpusave_xsave(union savefpu *addr) +{ + + xsave((char *)addr, xsave_mask); +} + +static void +fpusave_fxsave(union savefpu *addr) +{ + + fxsave((char *)addr); +} + +static void +fpusave_fnsave(union savefpu *addr) +{ + + fnsave((char *)addr); +} + +static void +init_xsave(void) +{ + + if (use_xsave) + return; + if (!cpu_fxsr || (cpu_feature2 & CPUID2_XSAVE) == 0) + return; + use_xsave = 1; + TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); +} + +DEFINE_IFUNC(, void, npxsave_core, (union savefpu *), static) +{ + + init_xsave(); + if (use_xsave) + return ((cpu_stdext_feature & CPUID_EXTSTATE_XSAVEOPT) != 0 ? + npxsave_xsaveopt : fpusave_xsave); + if (cpu_fxsr) + return (fpusave_fxsave); + return (fpusave_fnsave); +} + +DEFINE_IFUNC(, void, fpusave, (union savefpu *), static) +{ + + init_xsave(); + if (use_xsave) + return (fpusave_xsave); + if (cpu_fxsr) + return (fpusave_fxsave); + return (fpusave_fnsave); +} + /* * Enable XSAVE if supported and allowed by user. * Calculate the xsave_mask. @@ -325,13 +386,8 @@ npxinit_bsp1(void) uint64_t xsave_mask_user; TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch); - if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) { - use_xsave = 1; - TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); - } if (!use_xsave) return; - cpuid_count(0xd, 0x0, cp); xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; if ((cp[0] & xsave_mask) != xsave_mask) @@ -345,14 +401,9 @@ npxinit_bsp1(void) xsave_mask &= ~XFEATURE_AVX512; if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX) xsave_mask &= ~XFEATURE_MPX; - - cpuid_count(0xd, 0x1, cp); - if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) - use_xsaveopt = 1; } /* - * Calculate the fpu save area size. */ static void @@ -867,15 +918,11 @@ npxdna(void) * npxsave() atomically with checking fpcurthread. */ void -npxsave(addr) - union savefpu *addr; +npxsave(union savefpu *addr) { stop_emulating(); - if (use_xsaveopt) - xsaveopt((char *)addr, xsave_mask); - else - fpusave(addr); + npxsave_core(addr); } void npxswitch(struct thread *td, struct pcb *pcb); @@ -1100,19 +1147,6 @@ npxsetregs(struct thread *td, union savefpu *addr, char *xfpustate, } static void -fpusave(addr) - union savefpu *addr; -{ - - if (use_xsave) - xsave((char *)addr, xsave_mask); - else if (cpu_fxsr) - fxsave(addr); - else - fnsave(addr); -} - -static void npx_fill_fpregs_xmm1(struct savexmm *sv_xmm, struct save87 *sv_87) { struct env87 *penv_87; diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 0c1437df5187..9dd80ad72d49 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -148,6 +148,7 @@ __FBSDID("$FreeBSD$"); #include <machine/intr_machdep.h> #include <x86/apicvar.h> #endif +#include <x86/ifunc.h> #include <machine/bootinfo.h> #include <machine/cpu.h> #include <machine/cputypes.h> @@ -314,6 +315,10 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, + vm_offset_t eva); +static void pmap_invalidate_cache_range_all(vm_offset_t sva, + vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); @@ -574,8 +579,11 @@ pmap_bootstrap(vm_paddr_t firstaddr) vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; + u_long res; int i; + res = atop(firstaddr - (vm_paddr_t)KERNLOAD); + /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures @@ -593,11 +601,12 @@ pmap_bootstrap(vm_paddr_t firstaddr) * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t)firstaddr; - virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). + * Count bootstrap data as being resident in case any of this data is + * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = IdlePTD; @@ -605,6 +614,7 @@ pmap_bootstrap(vm_paddr_t firstaddr) kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ + kernel_pmap->pm_stats.resident_count = res; TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* @@ -1407,37 +1417,64 @@ pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) pmap_invalidate_page(pmap, va); } +DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t, vm_offset_t), + static) +{ + + if ((cpu_feature & CPUID_SS) != 0) + return (pmap_invalidate_cache_range_selfsnoop); + if ((cpu_feature & CPUID_CLFSH) != 0) + return (pmap_force_invalidate_cache_range); + return (pmap_invalidate_cache_range_all); +} + #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) +static void +pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) +{ + + KASSERT((sva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: sva not page-aligned")); + KASSERT((eva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: eva not page-aligned")); +} + +static void +pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); +} + void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) +pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { - if (force) { - sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); + sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); + if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) { + /* + * The supplied range is bigger than 2MB. + * Globally invalidate cache. + */ + pmap_invalidate_cache(); + return; } - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported and allowed, do nothing. */ - else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC - /* - * XXX: Some CPUs fault, hang, or trash the local APIC - * registers if we use CLFLUSH on the local APIC - * range. The local APIC is always uncached, so we - * don't need to flush for that range anyway. - */ - if (pmap_kextract(sva) == lapic_paddr) - return; + /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC + * range. The local APIC is always uncached, so we + * don't need to flush for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; #endif + + if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* - * Otherwise, do per-cache line flush. Use the sfence + * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache @@ -1447,12 +1484,7 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); - } else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { -#ifdef DEV_APIC - if (pmap_kextract(sva) == lapic_paddr) - return; -#endif + } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ @@ -1462,17 +1494,17 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); } } +static void +pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); + pmap_invalidate_cache(); +} + void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { @@ -5479,7 +5511,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + size, FALSE); + pmap_invalidate_cache_range(va, va + size); return ((void *)(va + offset)); } @@ -5718,7 +5750,7 @@ pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); + pmap_invalidate_cache_range(base, tmpva); } return (0); } diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 19086221b11b..4b62484533c4 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -762,12 +762,6 @@ kernel_trctrap: KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); - /* - * Clear any pending debug exceptions after allowing a - * debugger to read DR6 while stopped in trapsignal(). - */ - if (type == T_TRCTRAP) - load_dr6(0); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index c3ed95dd4d02..250689459643 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -650,7 +650,7 @@ sf_buf_invalidate(struct sf_buf *sf) * settings are recalculated. */ pmap_qenter(sf->kva, &m, 1); - pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE, FALSE); + pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE); } /* diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index da9ae6588189..e48c0d3c6af5 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -394,8 +394,8 @@ void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); -void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, - boolean_t force); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); +void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void *pmap_trm_alloc(size_t size, int flags); void pmap_trm_free(void *addr, size_t size); diff --git a/sys/isa/isavar.h b/sys/isa/isavar.h index 1a3e661b67a4..d95a9c1ab3f2 100644 --- a/sys/isa/isavar.h +++ b/sys/isa/isavar.h @@ -142,7 +142,7 @@ enum isa_device_ivars { #define ISA_PNP_DESCR "E:pnpid;D:#" #define ISA_PNP_INFO(t) \ - MODULE_PNP_INFO(ISA_PNP_DESCR, isa, t, t, sizeof(t[0]), nitems(t) - 1); \ + MODULE_PNP_INFO(ISA_PNP_DESCR, isa, t, t, nitems(t) - 1); \ /* * Simplified accessors for isa devices diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 5ea572ff0381..885b5dc6f030 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -233,10 +233,10 @@ struct sysent sysent[] = { { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = obsolete lfs_bmapv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = obsolete lfs_markv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = obsolete lfs_segclean */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = obsolete lfs_segwait */ { compat11(AS(freebsd11_stat_args),stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = freebsd11 stat */ { compat11(AS(freebsd11_fstat_args),fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = freebsd11 fstat */ { compat11(AS(freebsd11_lstat_args),lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = freebsd11 lstat */ @@ -272,7 +272,7 @@ struct sysent sysent[] = { { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 __semctl */ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = obsolete semconfig */ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 msgctl */ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */ { AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = msgsnd */ @@ -413,26 +413,26 @@ struct sysent sysent[] = { { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */ { compat11(AS(freebsd11_kevent_args),kevent), AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = freebsd11 kevent */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = obsolete __cap_get_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = obsolete __cap_set_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = obsolete __cap_get_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = obsolete __cap_get_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = obsolete __cap_set_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = obsolete __cap_set_file */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_SETUGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = obsolete nfsclnt */ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */ { AS(afs3_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs3_syscall */ { AS(nmount_args), (sy_call_t *)sys_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = nmount */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = obsolete kse_exit */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = obsolete kse_wakeup */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = obsolete kse_create */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = obsolete kse_thr_interrupt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = obsolete kse_release */ { AS(__mac_get_proc_args), (sy_call_t *)sys___mac_get_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 384 = __mac_get_proc */ { AS(__mac_set_proc_args), (sy_call_t *)sys___mac_set_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 385 = __mac_set_proc */ { AS(__mac_get_fd_args), (sy_call_t *)sys___mac_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 386 = __mac_get_fd */ @@ -489,7 +489,7 @@ struct sysent sysent[] = { { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = obsolete kse_switchin */ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = ksem_timedwait */ { AS(thr_suspend_args), (sy_call_t *)sys_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = thr_suspend */ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */ @@ -597,8 +597,8 @@ struct sysent sysent[] = { { AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */ { AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */ { AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = obsolete numa_getaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = obsolete numa_setaffinity */ { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */ { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = fstat */ { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = fstatat */ diff --git a/sys/kern/kern_context.c b/sys/kern/kern_context.c index 5afb371bfb27..3bd5f31082ac 100644 --- a/sys/kern/kern_context.c +++ b/sys/kern/kern_context.c @@ -70,6 +70,7 @@ sys_getcontext(struct thread *td, struct getcontext_args *uap) if (uap->ucp == NULL) ret = EINVAL; else { + bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; @@ -110,6 +111,7 @@ sys_swapcontext(struct thread *td, struct swapcontext_args *uap) if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { + bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); bzero(uc.__spare__, sizeof(uc.__spare__)); PROC_LOCK(td->td_proc); diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index b1612b11f770..3f4a81ff70d7 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <vm/vm.h> #include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> #include <vm/vm_extern.h> +#include <vm/vm_param.h> +#include <vm/vm_phys.h> +#include <vm/vm_pagequeue.h> #ifdef DDB #include <ddb/ddb.h> @@ -479,6 +484,26 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist) } /* + * Are any of the domains in the mask empty? If so, silently + * remove them. If only empty domains are present, we must + * return failure. + */ +static bool +domainset_empty_vm(struct domainset *domain) +{ + int i, max; + + max = DOMAINSET_FLS(&domain->ds_mask) + 1; + for (i = 0; i < max; i++) { + if (DOMAINSET_ISSET(i, &domain->ds_mask) && + VM_DOMAIN_EMPTY(i)) + DOMAINSET_CLR(i, &domain->ds_mask); + } + + return (DOMAINSET_EMPTY(&domain->ds_mask)); +} + +/* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * @@ -1360,6 +1385,7 @@ domainset_zero(void) DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; + (void)domainset_empty_vm(dset); curthread->td_domain.dr_policy = _domainset_create(dset, NULL); domainset_copy(dset, &domainset2); @@ -2087,6 +2113,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, DOMAINSET_FILL(&domain.ds_mask); } + /* + * When given an impossible policy, fall back to interleaving + * across all domains + */ + if (domainset_empty_vm(&domain)) + domainset_copy(&domainset2, &domain); + switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index ca1f7326fe3f..08a704d904b1 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2936,8 +2936,11 @@ fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, /* * Handle the last reference to a file being closed. + * + * Without the noinline attribute clang keeps inlining the func thorough this + * file when fdrop is used. */ -int +int __noinline _fdrop(struct file *fp, struct thread *td) { int error; diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index c102eb52e5bd..a8d42a85d949 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <sys/vmmeter.h> #include <sys/proc.h> #include <sys/sbuf.h> +#include <sys/smp.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/vmem.h> @@ -173,6 +174,7 @@ struct { * declare malloc types. */ static uma_zone_t mt_zone; +static uma_zone_t mt_stats_zone; u_long vm_kmem_size; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0, @@ -368,7 +370,7 @@ malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size, critical_enter(); mtip = mtp->ks_handle; - mtsp = &mtip->mti_stats[curcpu]; + mtsp = zpcpu_get(mtip->mti_stats); if (size > 0) { mtsp->mts_memalloced += size; mtsp->mts_numallocs++; @@ -411,7 +413,7 @@ malloc_type_freed(struct malloc_type *mtp, unsigned long size) critical_enter(); mtip = mtp->ks_handle; - mtsp = &mtip->mti_stats[curcpu]; + mtsp = zpcpu_get(mtip->mti_stats); mtsp->mts_memfreed += size; mtsp->mts_numfrees++; @@ -953,6 +955,9 @@ mallocinit(void *dummy) if (kmem_zmax < PAGE_SIZE || kmem_zmax > KMEM_ZMAX) kmem_zmax = KMEM_ZMAX; + mt_stats_zone = uma_zcreate("mt_stats_zone", + sizeof(struct malloc_type_stats), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_PCPU); mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal), #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, @@ -995,6 +1000,7 @@ malloc_init(void *data) panic("malloc_init: bad malloc type magic"); mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO); + mtip->mti_stats = uma_zalloc_pcpu(mt_stats_zone, M_WAITOK | M_ZERO); mtp->ks_handle = mtip; mtp_set_subzone(mtp); @@ -1042,8 +1048,8 @@ malloc_uninit(void *data) * Look for memory leaks. */ temp_allocs = temp_bytes = 0; - for (i = 0; i < MAXCPU; i++) { - mtsp = &mtip->mti_stats[i]; + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); temp_allocs += mtsp->mts_numallocs; temp_allocs -= mtsp->mts_numfrees; temp_bytes += mtsp->mts_memalloced; @@ -1056,6 +1062,7 @@ malloc_uninit(void *data) } slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK)); + uma_zfree_pcpu(mt_stats_zone, mtip->mti_stats); uma_zfree_arg(mt_zone, mtip, slab); } @@ -1077,6 +1084,7 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) { struct malloc_type_stream_header mtsh; struct malloc_type_internal *mtip; + struct malloc_type_stats *mtsp, zeromts; struct malloc_type_header mth; struct malloc_type *mtp; int error, i; @@ -1089,6 +1097,8 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); mtx_lock(&malloc_mtx); + bzero(&zeromts, sizeof(zeromts)); + /* * Insert stream header. */ @@ -1114,10 +1124,17 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) /* * Insert type statistics for each CPU. */ - for (i = 0; i < MAXCPU; i++) { - (void)sbuf_bcat(&sbuf, &mtip->mti_stats[i], - sizeof(mtip->mti_stats[i])); + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); + (void)sbuf_bcat(&sbuf, mtsp, sizeof(*mtsp)); } + /* + * Fill in the missing CPUs. + */ + for (; i < MAXCPU; i++) { + (void)sbuf_bcat(&sbuf, &zeromts, sizeof(zeromts)); + } + } mtx_unlock(&malloc_mtx); error = sbuf_finish(&sbuf); @@ -1170,6 +1187,7 @@ restart: DB_SHOW_COMMAND(malloc, db_show_malloc) { struct malloc_type_internal *mtip; + struct malloc_type_internal *mtsp; struct malloc_type *mtp; uint64_t allocs, frees; uint64_t alloced, freed; @@ -1183,7 +1201,8 @@ DB_SHOW_COMMAND(malloc, db_show_malloc) frees = 0; alloced = 0; freed = 0; - for (i = 0; i < MAXCPU; i++) { + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); allocs += mtip->mti_stats[i].mts_numallocs; frees += mtip->mti_stats[i].mts_numfrees; alloced += mtip->mti_stats[i].mts_memalloced; diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index bed34bf4a7fe..271339e5c4c4 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1276,7 +1276,6 @@ uifind(uid_t uid) racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; - mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* @@ -1291,7 +1290,6 @@ uifind(uid_t uid) } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); - mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); @@ -1352,7 +1350,6 @@ uifree(struct uidinfo *uip) if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); - mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 94fe5c407aeb..9338b06bb268 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -1653,7 +1653,7 @@ link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) return (ef->ddbstrcnt); } -#if defined(__i386__) || defined(__amd64__) +#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) /* * Use this lookup routine when performing relocations early during boot. * The generic lookup routine depends on kobj, which is not initialized diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 3c83c777b6a9..8529266f381c 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -241,9 +241,6 @@ static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; /* kernel and kmem arenas are aliased for backwards KPI compat. */ vmem_t *kernel_arena = &kernel_arena_storage; -#if VM_NRESERVLEVEL > 0 -vmem_t *kernel_rwx_arena = NULL; -#endif vmem_t *kmem_arena = &kernel_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 649cfb191b7f..455220c36752 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1031,8 +1031,9 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ - bzero((char *)ibits[x] + ncpubytes, \ - ncpbytes - ncpubytes); \ + if (ncpbytes != ncpubytes) \ + bzero((char *)ibits[x] + ncpubytes, \ + ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index a107b5571b3c..8e74163fe6d8 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -190,10 +190,10 @@ const char *syscallnames[] = { "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ - "#184", /* 184 = lfs_bmapv */ - "#185", /* 185 = lfs_markv */ - "#186", /* 186 = lfs_segclean */ - "#187", /* 187 = lfs_segwait */ + "obs_lfs_bmapv", /* 184 = obsolete lfs_bmapv */ + "obs_lfs_markv", /* 185 = obsolete lfs_markv */ + "obs_lfs_segclean", /* 186 = obsolete lfs_segclean */ + "obs_lfs_segwait", /* 187 = obsolete lfs_segwait */ "compat11.stat", /* 188 = freebsd11 stat */ "compat11.fstat", /* 189 = freebsd11 fstat */ "compat11.lstat", /* 190 = freebsd11 lstat */ @@ -229,7 +229,7 @@ const char *syscallnames[] = { "compat7.__semctl", /* 220 = freebsd7 __semctl */ "semget", /* 221 = semget */ "semop", /* 222 = semop */ - "#223", /* 223 = semconfig */ + "obs_semconfig", /* 223 = obsolete semconfig */ "compat7.msgctl", /* 224 = freebsd7 msgctl */ "msgget", /* 225 = msgget */ "msgsnd", /* 226 = msgsnd */ @@ -370,26 +370,26 @@ const char *syscallnames[] = { "getresgid", /* 361 = getresgid */ "kqueue", /* 362 = kqueue */ "compat11.kevent", /* 363 = freebsd11 kevent */ - "#364", /* 364 = __cap_get_proc */ - "#365", /* 365 = __cap_set_proc */ - "#366", /* 366 = __cap_get_fd */ - "#367", /* 367 = __cap_get_file */ - "#368", /* 368 = __cap_set_fd */ - "#369", /* 369 = __cap_set_file */ + "obs___cap_get_proc", /* 364 = obsolete __cap_get_proc */ + "obs___cap_set_proc", /* 365 = obsolete __cap_set_proc */ + "obs___cap_get_fd", /* 366 = obsolete __cap_get_fd */ + "obs___cap_get_file", /* 367 = obsolete __cap_get_file */ + "obs___cap_set_fd", /* 368 = obsolete __cap_set_fd */ + "obs___cap_set_file", /* 369 = obsolete __cap_set_file */ "#370", /* 370 = nosys */ "extattr_set_fd", /* 371 = extattr_set_fd */ "extattr_get_fd", /* 372 = extattr_get_fd */ "extattr_delete_fd", /* 373 = extattr_delete_fd */ "__setugid", /* 374 = __setugid */ - "#375", /* 375 = nfsclnt */ + "obs_nfsclnt", /* 375 = obsolete nfsclnt */ "eaccess", /* 376 = eaccess */ "afs3_syscall", /* 377 = afs3_syscall */ "nmount", /* 378 = nmount */ - "#379", /* 379 = kse_exit */ - "#380", /* 380 = kse_wakeup */ - "#381", /* 381 = kse_create */ - "#382", /* 382 = kse_thr_interrupt */ - "#383", /* 383 = kse_release */ + "obs_kse_exit", /* 379 = obsolete kse_exit */ + "obs_kse_wakeup", /* 380 = obsolete kse_wakeup */ + "obs_kse_create", /* 381 = obsolete kse_create */ + "obs_kse_thr_interrupt", /* 382 = obsolete kse_thr_interrupt */ + "obs_kse_release", /* 383 = obsolete kse_release */ "__mac_get_proc", /* 384 = __mac_get_proc */ "__mac_set_proc", /* 385 = __mac_set_proc */ "__mac_get_fd", /* 386 = __mac_get_fd */ @@ -446,7 +446,7 @@ const char *syscallnames[] = { "extattr_list_fd", /* 437 = extattr_list_fd */ "extattr_list_file", /* 438 = extattr_list_file */ "extattr_list_link", /* 439 = extattr_list_link */ - "#440", /* 440 = kse_switchin */ + "obs_kse_switchin", /* 440 = obsolete kse_switchin */ "ksem_timedwait", /* 441 = ksem_timedwait */ "thr_suspend", /* 442 = thr_suspend */ "thr_wake", /* 443 = thr_wake */ @@ -554,8 +554,8 @@ const char *syscallnames[] = { "ppoll", /* 545 = ppoll */ "futimens", /* 546 = futimens */ "utimensat", /* 547 = utimensat */ - "#548", /* 548 = numa_getaffinity */ - "#549", /* 549 = numa_setaffinity */ + "obs_numa_getaffinity", /* 548 = obsolete numa_getaffinity */ + "obs_numa_setaffinity", /* 549 = obsolete numa_setaffinity */ "fdatasync", /* 550 = fdatasync */ "fstat", /* 551 = fstat */ "fstatat", /* 552 = fstatat */ diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 9ea0f1b5353b..08b254655ee5 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -459,10 +459,10 @@ 181 AUE_SETGID STD { int setgid(gid_t gid); } 182 AUE_SETEGID STD { int setegid(gid_t egid); } 183 AUE_SETEUID STD { int seteuid(uid_t euid); } -184 AUE_NULL UNIMPL lfs_bmapv -185 AUE_NULL UNIMPL lfs_markv -186 AUE_NULL UNIMPL lfs_segclean -187 AUE_NULL UNIMPL lfs_segwait +184 AUE_NULL OBSOL lfs_bmapv +185 AUE_NULL OBSOL lfs_markv +186 AUE_NULL OBSOL lfs_segclean +187 AUE_NULL OBSOL lfs_segwait 188 AUE_STAT COMPAT11 { int stat(_In_z_ char *path, \ _Out_ struct freebsd11_stat *ub); } 189 AUE_FSTAT COMPAT11 { int fstat(int fd, \ @@ -536,7 +536,7 @@ 222 AUE_SEMOP NOSTD { int semop(int semid, \ _In_reads_(nsops) struct sembuf *sops, \ size_t nsops); } -223 AUE_NULL UNIMPL semconfig +223 AUE_NULL OBSOL semconfig 224 AUE_MSGCTL COMPAT7|NOSTD { int msgctl(int msqid, int cmd, \ struct msqid_ds_old *buf); } 225 AUE_MSGGET NOSTD { int msgget(key_t key, int msgflg); } @@ -821,12 +821,12 @@ struct kevent_freebsd11 *eventlist, \ int nevents, \ _In_opt_ const struct timespec *timeout); } -364 AUE_NULL UNIMPL __cap_get_proc -365 AUE_NULL UNIMPL __cap_set_proc -366 AUE_NULL UNIMPL __cap_get_fd -367 AUE_NULL UNIMPL __cap_get_file -368 AUE_NULL UNIMPL __cap_set_fd -369 AUE_NULL UNIMPL __cap_set_file +364 AUE_NULL OBSOL __cap_get_proc +365 AUE_NULL OBSOL __cap_set_proc +366 AUE_NULL OBSOL __cap_get_fd +367 AUE_NULL OBSOL __cap_get_file +368 AUE_NULL OBSOL __cap_set_fd +369 AUE_NULL OBSOL __cap_set_file 370 AUE_NULL UNIMPL nosys 371 AUE_EXTATTR_SET_FD STD { ssize_t extattr_set_fd(int fd, \ int attrnamespace, \ @@ -842,7 +842,7 @@ int attrnamespace, \ _In_z_ const char *attrname); } 374 AUE_SETUGID STD { int __setugid(int flag); } -375 AUE_NULL UNIMPL nfsclnt +375 AUE_NULL OBSOL nfsclnt 376 AUE_EACCESS STD { int eaccess(_In_z_ char *path, int amode); } 377 AUE_NULL NOSTD|NOTSTATIC { int afs3_syscall(long syscall, \ long parm1, long parm2, long parm3, \ @@ -850,11 +850,11 @@ 378 AUE_NMOUNT STD { int nmount( \ _In_reads_(iovcnt) struct iovec *iovp, \ unsigned int iovcnt, int flags); } -379 AUE_NULL UNIMPL kse_exit -380 AUE_NULL UNIMPL kse_wakeup -381 AUE_NULL UNIMPL kse_create -382 AUE_NULL UNIMPL kse_thr_interrupt -383 AUE_NULL UNIMPL kse_release +379 AUE_NULL OBSOL kse_exit +380 AUE_NULL OBSOL kse_wakeup +381 AUE_NULL OBSOL kse_create +382 AUE_NULL OBSOL kse_thr_interrupt +383 AUE_NULL OBSOL kse_release 384 AUE_NULL STD { int __mac_get_proc( \ _In_ struct mac *mac_p); } 385 AUE_NULL STD { int __mac_set_proc( \ @@ -994,7 +994,7 @@ int attrnamespace, \ _Out_writes_bytes_opt_(nbytes) \ void *data, size_t nbytes); } -440 AUE_NULL UNIMPL kse_switchin +440 AUE_NULL OBSOL kse_switchin 441 AUE_SEMWAIT NOSTD { int ksem_timedwait(semid_t id, \ _In_opt_ const struct timespec *abstime); } 442 AUE_NULL STD { int thr_suspend( \ @@ -1295,8 +1295,8 @@ _In_reads_(2) \ struct timespec *times, \ int flag); } -548 AUE_NULL UNIMPL numa_getaffinity -549 AUE_NULL UNIMPL numa_setaffinity +548 AUE_NULL OBSOL numa_getaffinity +549 AUE_NULL OBSOL numa_setaffinity 550 AUE_FSYNC STD { int fdatasync(int fd); } 551 AUE_FSTAT STD { int fstat(int fd, _Out_ struct stat *sb); } 552 AUE_FSTATAT STD { int fstatat(int fd, _In_z_ char *path, \ diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 18a4c6c9e835..af2fd1ff65f7 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -917,12 +917,13 @@ solisten_dequeue(struct socket *head, struct socket **ret, int flags) if (head->so_error) { error = head->so_error; head->so_error = 0; + } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) + error = EWOULDBLOCK; + else + error = 0; + if (error) { SOLISTEN_UNLOCK(head); return (error); - } - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { - SOLISTEN_UNLOCK(head); - return (EWOULDBLOCK); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); @@ -2585,11 +2586,20 @@ soshutdown(struct socket *so, int how) * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ - if (so->so_type != SOCK_DGRAM) + if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } + if (SOLISTENING(so)) { + if (how != SHUT_WR) { + SOLISTEN_LOCK(so); + so->so_error = ECONNABORTED; + solisten_wakeup(so); /* unlocks so */ + } + goto done; + } + CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); @@ -2604,6 +2614,7 @@ soshutdown(struct socket *so, int how) wakeup(&so->so_timeo); CURVNET_RESTORE(); +done: return (soerror_enotconn ? ENOTCONN : 0); } @@ -3279,6 +3290,8 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); + else if ((events & POLLINIGNEOF) == 0 && so->so_error) + revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; @@ -3555,6 +3568,11 @@ filt_soread(struct knote *kn, long hint) if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; + if (so->so_error) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } return (!TAILQ_EMPTY(&so->sol_comp)); } diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index cdfb84e42447..675c0aa02ac3 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -160,10 +160,6 @@ nameiinit(void *dummy __unused) } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); -static int lookup_shared = 1; -SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0, - "enables shared locks for path name translation"); - static int lookup_cap_dotdot = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, &lookup_cap_dotdot, 0, @@ -307,8 +303,6 @@ namei(struct nameidata *ndp) ("namei: flags contaminated with nameiops")); MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || ndp->ni_startdir->v_type == VBAD); - if (!lookup_shared) - cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; TAILQ_INIT(&ndp->ni_cap_tracker); ndp->ni_lcf = 0; @@ -660,10 +654,7 @@ lookup(struct nameidata *ndp) * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ - if (lookup_shared) - cnp->cn_lkflags = LK_SHARED; - else - cnp->cn_lkflags = LK_EXCLUSIVE; + cnp->cn_lkflags = LK_SHARED; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, @@ -1087,7 +1078,7 @@ nextname: VOP_UNLOCK(dp, 0); success: /* - * Because of lookup_shared we may have the vnode shared locked, but + * Because of shared lookup we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 91d93bb89207..a0ddefaa15df 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -190,7 +190,8 @@ sys_quotactl(struct thread *td, struct quotactl_args *uap) * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ - if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON) + if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && + (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) vfs_unbusy(mp); return (error); } diff --git a/sys/net/if.c b/sys/net/if.c index 3147eafb7c78..2acea128cdfd 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -264,7 +264,6 @@ static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); -static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); @@ -2512,7 +2511,7 @@ ifr_data_get_ptr(void *ifrp) /* * Hardware specific interface ioctls. */ -static int +int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c index 591396c588dd..a2b923ee04e9 100644 --- a/sys/net/if_gre.c +++ b/sys/net/if_gre.c @@ -569,6 +569,8 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) goto drop; } af = m->m_pkthdr.csum_data; + BPF_MTAP2(ifp, &af, sizeof(af), m); + m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 2bce8c1e7aad..1efd5ea20c6e 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -723,10 +723,12 @@ tapifstart(struct ifnet *ifp) static int tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { + struct ifreq ifr; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct tapinfo *tapp = NULL; int f; + int error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; @@ -738,7 +740,18 @@ tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td if (ifp->if_type != tapp->type) return (EPROTOTYPE); mtx_lock(&tp->tap_mtx); - ifp->if_mtu = tapp->mtu; + if (ifp->if_mtu != tapp->mtu) { + strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); + ifr.ifr_mtu = tapp->mtu; + CURVNET_SET(ifp->if_vnet); + error = ifhwioctl(SIOCSIFMTU, ifp, + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + mtx_unlock(&tp->tap_mtx); + return (error); + } + } ifp->if_baudrate = tapp->baudrate; mtx_unlock(&tp->tap_mtx); break; diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index cf404012a2e8..d47117f17b8e 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -662,24 +662,29 @@ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { - int error; + struct ifreq ifr; struct tun_softc *tp = dev->si_drv1; struct tuninfo *tunp; + int error; switch (cmd) { case TUNSIFINFO: tunp = (struct tuninfo *)data; - if (tunp->mtu < IF_MINMTU) - return (EINVAL); - if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - error = priv_check(td, PRIV_NET_SETIFMTU); - if (error) - return (error); - } if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); mtx_lock(&tp->tun_mtx); - TUN2IFP(tp)->if_mtu = tunp->mtu; + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + ifr.ifr_mtu = tunp->mtu; + CURVNET_SET(TUN2IFP(tp)->if_vnet); + error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + mtx_unlock(&tp->tun_mtx); + return (error); + } + } TUN2IFP(tp)->if_baudrate = tunp->baudrate; mtx_unlock(&tp->tun_mtx); break; diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 480df7433a6d..c9452406a151 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -411,6 +411,7 @@ struct ifnet { #define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) #define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) /* @@ -759,6 +760,8 @@ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); +int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); + #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 11baf8a018b4..5170ee6ae35c 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -87,11 +87,11 @@ __FBSDID("$FreeBSD$"); #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) -LIST_HEAD(ifvlanhead, ifvlan); +CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ - struct rmlock lock; + struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ @@ -117,7 +117,7 @@ struct ifvlantrunk { struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ - LIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) + CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* @@ -146,13 +146,13 @@ struct ifvlantrunk { for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ - if (((_ifv) = LIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ + if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; - SLIST_ENTRY(vlan_mc_entry) mc_entries; + CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; }; struct ifvlan { @@ -173,9 +173,9 @@ struct ifvlan { uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ } ifv_mib; struct task lladdr_task; - SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; + CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY - LIST_ENTRY(ifvlan) ifv_list; + CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; #define ifv_proto ifv_mib.ifvm_proto @@ -205,55 +205,36 @@ static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* - * if_vlan uses two module-level locks to allow concurrent modification of vlan - * interfaces and (mostly) allow for vlans to be destroyed while they are being - * used for tx/rx. To accomplish this in a way that has acceptable performance - * and cooperation with other parts of the network stack there is a - * non-sleepable rmlock(9) and an sx(9). Both locks are exclusively acquired - * when destroying a vlan interface, i.e. when the if_vlantrunk field of struct - * ifnet is de-allocated and NULL'd. Thus a reader holding either lock has a - * guarantee that the struct ifvlantrunk references a valid vlan trunk. + * if_vlan uses two module-level synchronizations primitives to allow concurrent + * modification of vlan interfaces and (mostly) allow for vlans to be destroyed + * while they are being used for tx/rx. To accomplish this in a way that has + * acceptable performance and cooperation with other parts of the network stack + * there is a non-sleepable epoch(9) and an sx(9). * - * The performance-sensitive paths that warrant using the rmlock(9) are + * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use - * of an rmlock(9) gives a measureable improvement in performance. + * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * - * Note that despite these protections, there is still an inherent race in the - * destruction of vlans since there's no guarantee that the ifnet hasn't been - * freed/reused when the tx/rx functions are called by the stack. This can only - * be fixed by addressing ifnet's lifetime issues. */ -#define _VLAN_RM_ID ifv_rm_lock #define _VLAN_SX_ID ifv_sx -static struct rmlock _VLAN_RM_ID; static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ - rm_init(&_VLAN_RM_ID, "vlan_rm"); \ sx_init(&_VLAN_SX_ID, "vlan_sx") #define VLAN_LOCKING_DESTROY() \ - rm_destroy(&_VLAN_RM_ID); \ sx_destroy(&_VLAN_SX_ID) -#define _VLAN_RM_TRACKER _vlan_rm_tracker -#define VLAN_RLOCK() rm_rlock(&_VLAN_RM_ID, \ - &_VLAN_RM_TRACKER) -#define VLAN_RUNLOCK() rm_runlock(&_VLAN_RM_ID, \ - &_VLAN_RM_TRACKER) -#define VLAN_WLOCK() rm_wlock(&_VLAN_RM_ID) -#define VLAN_WUNLOCK() rm_wunlock(&_VLAN_RM_ID) -#define VLAN_RLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_RLOCKED) -#define VLAN_WLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_WLOCKED) -#define VLAN_RWLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_LOCKED) -#define VLAN_LOCK_READER struct rm_priotracker _VLAN_RM_TRACKER +#define VLAN_RLOCK() NET_EPOCH_ENTER(); +#define VLAN_RUNLOCK() NET_EPOCH_EXIT(); +#define VLAN_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) @@ -265,25 +246,18 @@ static struct sx _VLAN_SX_ID; /* - * We also have a per-trunk rmlock(9), that is locked shared on packet - * processing and exclusive when configuration is changed. Note: This should - * only be acquired while there is a shared lock on either of the global locks - * via VLAN_SLOCK or VLAN_RLOCK. Thus, an exclusive lock on the global locks - * makes a call to TRUNK_RLOCK/TRUNK_WLOCK technically superfluous. + * We also have a per-trunk mutex that should be acquired when changing + * its state. */ -#define _TRUNK_RM_TRACKER _trunk_rm_tracker -#define TRUNK_LOCK_INIT(trunk) rm_init(&(trunk)->lock, vlanname) -#define TRUNK_LOCK_DESTROY(trunk) rm_destroy(&(trunk)->lock) -#define TRUNK_RLOCK(trunk) rm_rlock(&(trunk)->lock, \ - &_TRUNK_RM_TRACKER) -#define TRUNK_WLOCK(trunk) rm_wlock(&(trunk)->lock) -#define TRUNK_RUNLOCK(trunk) rm_runlock(&(trunk)->lock, \ - &_TRUNK_RM_TRACKER) -#define TRUNK_WUNLOCK(trunk) rm_wunlock(&(trunk)->lock) -#define TRUNK_RLOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_RLOCKED) -#define TRUNK_LOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_LOCKED) -#define TRUNK_WLOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_WLOCKED) -#define TRUNK_LOCK_READER struct rm_priotracker _TRUNK_RM_TRACKER +#define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) +#define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) +#define TRUNK_RLOCK(trunk) NET_EPOCH_ENTER() +#define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) +#define TRUNK_RUNLOCK(trunk) NET_EPOCH_EXIT(); +#define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) +#define TRUNK_RLOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt)) +#define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) +#define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array @@ -361,7 +335,7 @@ vlan_inithash(struct ifvlantrunk *trunk) trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) - LIST_INIT(&trunk->hash[i]); + CK_SLIST_INIT(&trunk->hash[i]); } static void @@ -372,7 +346,7 @@ vlan_freehash(struct ifvlantrunk *trunk) KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) - KASSERT(LIST_EMPTY(&trunk->hash[i]), + KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); @@ -386,12 +360,12 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) int i, b; struct ifvlan *ifv2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); - LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); @@ -404,7 +378,7 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } - LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); + CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); @@ -416,15 +390,15 @@ vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) int i, b; struct ifvlan *ifv2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); - LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; - LIST_REMOVE(ifv2, ifv_list); + CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); @@ -444,7 +418,7 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { @@ -460,21 +434,21 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) if (hwidth2 < VLAN_DEF_HWIDTH) return; - /* M_NOWAIT because we're called with trunk mutex held */ - hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_NOWAIT); + hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) - LIST_INIT(&hash2[j]); + CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) - while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) { - LIST_REMOVE(ifv, ifv_list); + while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { + CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); - LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); + CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } + NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; @@ -492,7 +466,7 @@ vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) TRUNK_RLOCK_ASSERT(trunk); - LIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) + CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); @@ -508,7 +482,7 @@ vlan_dumphash(struct ifvlantrunk *trunk) for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); - LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } @@ -561,7 +535,6 @@ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); - VLAN_WLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; @@ -587,23 +560,19 @@ vlan_setmulti(struct ifnet *ifp) struct vlan_mc_entry *mc; int error; - /* - * XXX This stupidly needs the rmlock to avoid sleeping while holding - * the in6_multi_mtx (see in6_mc_join_locked). - */ - VLAN_RWLOCK_ASSERT(); + VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; - TRUNK_WLOCK_ASSERT(TRUNK(sc)); ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ - while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { - SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); + while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { + CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); + NET_EPOCH_WAIT(); free(mc, M_VLAN); } @@ -619,10 +588,10 @@ vlan_setmulti(struct ifnet *ifp) } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; - SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); + CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); - SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { + CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) @@ -645,7 +614,6 @@ vlan_iflladdr(void *arg __unused, struct ifnet *ifp) struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; - VLAN_LOCK_READER; /* Need the rmlock since this is run on taskqueue_swi. */ VLAN_RLOCK(); @@ -724,12 +692,10 @@ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; - VLAN_LOCK_READER; if (ifp->if_type != IFT_L2VLAN) return (NULL); - /* Not clear if callers are sleepable, so acquire the rmlock. */ VLAN_RLOCK(); ifv = ifp->if_softc; ifp = NULL; @@ -809,10 +775,7 @@ vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; - TRUNK_LOCK_READER; - /* Not clear if callers are sleepable, so acquire the rmlock. */ VLAN_RLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { @@ -820,11 +783,9 @@ vlan_devat(struct ifnet *ifp, uint16_t vid) return (NULL); } ifp = NULL; - TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; - TRUNK_RUNLOCK(trunk); VLAN_RUNLOCK(); return (ifp); } @@ -1076,7 +1037,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) if_rele(p); return (ENOSPC); } - SLIST_INIT(&ifv->vlan_mc_listhead); + CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because @@ -1143,6 +1104,7 @@ vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); + NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); @@ -1167,7 +1129,6 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; - VLAN_LOCK_READER; VLAN_RLOCK(); ifv = ifp->if_softc; @@ -1227,8 +1188,6 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; - TRUNK_LOCK_READER; struct m_tag *mtag; uint16_t vid, tag; @@ -1289,16 +1248,13 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) vid = EVL_VLANOFTAG(tag); - TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { - TRUNK_RUNLOCK(trunk); - if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); VLAN_RUNLOCK(); + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } - TRUNK_RUNLOCK(trunk); if (vlan_mtag_pcp) { /* @@ -1369,22 +1325,19 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) if (ifv->ifv_trunk) return (EBUSY); - /* Acquire rmlock after the branch so we can M_WAITOK. */ VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); - VLAN_WLOCK(); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); + TRUNK_WUNLOCK(trunk); } else { - VLAN_WLOCK(); trunk = p->if_vlantrunk; - TRUNK_WLOCK(trunk); } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ @@ -1448,7 +1401,9 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_link_state = p->if_link_state; + TRUNK_RLOCK(TRUNK(ifv)); vlan_capabilities(ifv); + TRUNK_RUNLOCK(TRUNK(ifv)); /* * Set up our interface address to reflect the underlying @@ -1458,12 +1413,6 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; - /* - * Configure multicast addresses that may already be - * joined on the vlan device. - */ - (void)vlan_setmulti(ifp); - TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); /* We are ready for operation now. */ @@ -1471,13 +1420,14 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); -done: + /* - * We need to drop the non-sleepable rmlock so that the underlying - * devices can sleep in their vlan_config hooks. + * Configure multicast addresses that may already be + * joined on the vlan device. */ - TRUNK_WUNLOCK(trunk); - VLAN_WUNLOCK(); + (void)vlan_setmulti(ifp); + +done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); @@ -1510,13 +1460,6 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) parent = NULL; if (trunk != NULL) { - /* - * Both vlan_transmit and vlan_input rely on the trunk fields - * being NULL to determine whether to bail, so we need to get - * an exclusive lock here to prevent them from using bad - * ifvlans. - */ - VLAN_WLOCK(); parent = trunk->parent; /* @@ -1524,7 +1467,7 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ - while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { + while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already @@ -1541,19 +1484,14 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) "Failed to delete multicast address from parent: %d\n", error); } - SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); + CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); + NET_EPOCH_WAIT(); free(mc, M_VLAN); } vlan_setflags(ifp, 0); /* clear special flags on parent */ - /* - * The trunk lock isn't actually required here, but - * vlan_remhash expects it. - */ - TRUNK_WLOCK(trunk); vlan_remhash(trunk, ifv); - TRUNK_WUNLOCK(trunk); ifv->ifv_trunk = NULL; /* @@ -1561,9 +1499,9 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; + NET_EPOCH_WAIT(); trunk_destroy(trunk); } - VLAN_WUNLOCK(); } /* Disconnect from parent. */ @@ -1640,7 +1578,6 @@ vlan_link_state(struct ifnet *ifp) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; /* Called from a taskqueue_swi task, so we cannot sleep. */ VLAN_RLOCK(); @@ -1670,7 +1607,7 @@ vlan_capabilities(struct ifvlan *ifv) u_long hwa = 0; VLAN_SXLOCK_ASSERT(); - TRUNK_WLOCK_ASSERT(TRUNK(ifv)); + TRUNK_RLOCK_ASSERT(TRUNK(ifv)); p = PARENT(ifv); ifp = ifv->ifv_ifp; @@ -1771,11 +1708,11 @@ vlan_trunk_capabilities(struct ifnet *ifp) VLAN_SUNLOCK(); return; } - TRUNK_WLOCK(trunk); + TRUNK_RLOCK(trunk); VLAN_FOREACH(ifv, trunk) { vlan_capabilities(ifv); } - TRUNK_WUNLOCK(trunk); + TRUNK_RUNLOCK(trunk); VLAN_SUNLOCK(); } @@ -1789,7 +1726,6 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; - VLAN_LOCK_READER; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; @@ -1925,16 +1861,13 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ - VLAN_RLOCK(); + VLAN_XLOCK(); trunk = TRUNK(ifv); - if (trunk != NULL) { - TRUNK_WLOCK(trunk); + if (trunk != NULL) error = vlan_setmulti(ifp); - TRUNK_WUNLOCK(trunk); - } - VLAN_RUNLOCK(); - break; + VLAN_XUNLOCK(); + break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { @@ -1971,9 +1904,9 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { - TRUNK_WLOCK(trunk); + TRUNK_RLOCK(trunk); vlan_capabilities(ifv); - TRUNK_WUNLOCK(trunk); + TRUNK_RUNLOCK(trunk); } VLAN_SUNLOCK(); break; diff --git a/sys/net/iflib.c b/sys/net/iflib.c index 65bf07a0b2c1..d9a6e3b962d7 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -4100,9 +4100,10 @@ iflib_if_qflush(if_t ifp) } -#define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ - IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ - IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) +#define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ + IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ + IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \ + IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) @@ -4223,39 +4224,51 @@ iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) } case SIOCSIFCAP: { - int mask, setmask; + int mask, setmask, oldmask; - mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); + oldmask = if_getcapenable(ifp); + mask = ifr->ifr_reqcap ^ oldmask; + mask &= ctx->ifc_softc_ctx.isc_capabilities; setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); + setmask |= (mask & IFCAP_WOL); + + /* + * If we're disabling any RX csum, disable all the ones + * the driver supports. This assumes all supported are + * enabled. + * + * Otherwise, if they've changed, enable all of them. + */ + if ((setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) < + (oldmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))) + setmask &= ~(IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); + else if ((setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) != + (oldmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))) + setmask |= (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)); - if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) - setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); - if ((mask & IFCAP_WOL) && - (if_getcapabilities(ifp) & IFCAP_WOL) != 0) - setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); - if_vlancap(ifp); /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); - if (bits & IFF_DRV_RUNNING) + if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_stop(ctx); STATE_LOCK(ctx); if_togglecapenable(ifp, setmask); STATE_UNLOCK(ctx); - if (bits & IFF_DRV_RUNNING) + if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); } + if_vlancap(ifp); break; } case SIOCGPRIVATE_0: diff --git a/sys/net/iflib.h b/sys/net/iflib.h index 6e1eee633a2c..bda5ad45dfc8 100644 --- a/sys/net/iflib.h +++ b/sys/net/iflib.h @@ -173,7 +173,7 @@ typedef struct pci_vendor_info { #define IFLIB_PNP_DESCR "U32:vendor;U32:device;U32:subvendor;U32:subdevice;" \ "U32:revision;U32:class;D:#" #define IFLIB_PNP_INFO(b, u, t) \ - MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, sizeof(t[0]), nitems(t) - 1) + MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, nitems(t) - 1) typedef struct if_txrx { int (*ift_txd_encap) (void *, if_pkt_info_t); diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 86c9705cb905..6d2c86d5014e 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -642,6 +642,8 @@ int inp_so_options(const struct inpcb *inp); #define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) +#define INP_INFO_WUNLOCK_ASSERT(ipi) \ + mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) #define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index f3d1d3afcab8..65ac922fc1bb 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -48,12 +48,15 @@ typedef int (*encap_input_t)(struct mbuf *, int , int, void *); struct encap_config { int proto; /* protocol */ int min_length; /* minimum packet length */ + int max_hdrsize; /* maximum header size */ int exact_match; /* a packet is exactly matched */ #define ENCAP_DRV_LOOKUP 0x7fffffff encap_lookup_t lookup; encap_check_t check; encap_input_t input; + + void *pad[3]; }; struct encaptab; diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 9d7b9cbe8661..a08806a686bd 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -932,10 +932,11 @@ in_delayed_cksum(struct mbuf *m) if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if udp header is not in the first mbuf copy udplen */ - if (offset + sizeof(struct udphdr) > m->m_len) + if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); - else { + cklen = ntohs(cklen); + } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } diff --git a/sys/netinet/sctp_asconf.c b/sys/netinet/sctp_asconf.c index 26b1ba6f508b..611280f1ae42 100644 --- a/sys/netinet/sctp_asconf.c +++ b/sys/netinet/sctp_asconf.c @@ -670,6 +670,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get lookup addr!\n"); /* respond with a missing/invalid mandatory parameter error */ + sctp_m_freem(m_ack); return; } /* param_length is already validated in process_control... */ diff --git a/sys/netinet/sctp_auth.c b/sys/netinet/sctp_auth.c index 3150306356dc..d379dd0a143e 100644 --- a/sys/netinet/sctp_auth.c +++ b/sys/netinet/sctp_auth.c @@ -1060,40 +1060,6 @@ sctp_hmac_m(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, return (digestlen); } -/*- - * verify the HMAC digest using the desired hash key, text, and HMAC - * algorithm. - * Returns -1 on error, 0 on success. - */ -int -sctp_verify_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, - uint8_t *text, uint32_t textlen, - uint8_t *digest, uint32_t digestlen) -{ - uint32_t len; - uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; - - /* sanity check the material and length */ - if ((key == NULL) || (keylen == 0) || - (text == NULL) || (textlen == 0) || (digest == NULL)) { - /* can't do HMAC with empty key or text or digest */ - return (-1); - } - len = sctp_get_hmac_digest_len(hmac_algo); - if ((len == 0) || (digestlen != len)) - return (-1); - - /* compute the expected hash */ - if (sctp_hmac(hmac_algo, key, keylen, text, textlen, temp) != len) - return (-1); - - if (memcmp(digest, temp, digestlen) != 0) - return (-1); - else - return (0); -} - - /* * computes the requested HMAC using a key struct (which may be modified if * the keylen exceeds the HMAC block len). @@ -1740,7 +1706,7 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, m, offset, computed_digest); /* compare the computed digest with the one in the AUTH chunk */ - if (memcmp(digest, computed_digest, digestlen) != 0) { + if (timingsafe_bcmp(digest, computed_digest, digestlen) != 0) { SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: HMAC digest check failed\n"); diff --git a/sys/netinet/sctp_auth.h b/sys/netinet/sctp_auth.h index 44126e3e590f..5c22cc749c65 100644 --- a/sys/netinet/sctp_auth.h +++ b/sys/netinet/sctp_auth.h @@ -178,9 +178,6 @@ extern uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo); extern uint32_t sctp_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, uint8_t *text, uint32_t textlen, uint8_t *digest); -extern int -sctp_verify_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, - uint8_t *text, uint32_t textlen, uint8_t *digest, uint32_t digestlen); extern uint32_t sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t *key, uint8_t *text, uint32_t textlen, uint8_t *digest); diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index b77abf4768ce..86656a7f7eb2 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -2554,7 +2554,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, return (NULL); } /* compare the received digest with the computed digest */ - if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { + if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { @@ -2563,7 +2563,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); /* compare */ - if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) + if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) cookie_ok = 1; } } else { @@ -5669,7 +5669,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt vrf_id, port); goto out; } - } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index e7807b331629..34e91d8b0af6 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -3572,7 +3572,6 @@ static int sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) { struct cmsghdr cmh; - int tlen, at; struct sctp_initmsg initmsg; #ifdef INET struct sockaddr_in sin; @@ -3580,34 +3579,37 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er #ifdef INET6 struct sockaddr_in6 sin6; #endif + int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; - tlen = SCTP_BUF_LEN(control); - at = 0; - while (at < tlen) { - if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + tot_len = SCTP_BUF_LEN(control); + for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { + rem_len = tot_len - off; + if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (1); } - m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (1); } - if (((int)cmh.cmsg_len + at) > tlen) { + if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (1); } + cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); + cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { case SCTP_INIT: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_initmsg)) { + if (cmsg_data_len < (int)sizeof(struct sctp_initmsg)) { *error = EINVAL; return (1); } - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_initmsg), (caddr_t)&initmsg); + m_copydata(control, cmsg_data_off, sizeof(struct sctp_initmsg), (caddr_t)&initmsg); if (initmsg.sinit_max_attempts) stcb->asoc.max_init_times = initmsg.sinit_max_attempts; if (initmsg.sinit_num_ostreams) @@ -3662,7 +3664,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er break; #ifdef INET case SCTP_DSTADDRV4: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { + if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (1); } @@ -3670,7 +3672,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = stcb->rport; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { @@ -3686,7 +3688,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er #endif #ifdef INET6 case SCTP_DSTADDRV6: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { + if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (1); } @@ -3694,7 +3696,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { *error = EINVAL; @@ -3727,7 +3729,6 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er break; } } - at += CMSG_ALIGN(cmh.cmsg_len); } return (0); } @@ -3740,7 +3741,6 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, int *error) { struct cmsghdr cmh; - int tlen, at; struct sctp_tcb *stcb; struct sockaddr *addr; #ifdef INET @@ -3749,31 +3749,34 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, #ifdef INET6 struct sockaddr_in6 sin6; #endif + int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; - tlen = SCTP_BUF_LEN(control); - at = 0; - while (at < tlen) { - if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + tot_len = SCTP_BUF_LEN(control); + for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { + rem_len = tot_len - off; + if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (NULL); } - m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (NULL); } - if (((int)cmh.cmsg_len + at) > tlen) { + if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (NULL); } + cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); + cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { #ifdef INET case SCTP_DSTADDRV4: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { + if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (NULL); } @@ -3781,13 +3784,13 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = port; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); addr = (struct sockaddr *)&sin; break; #endif #ifdef INET6 case SCTP_DSTADDRV6: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { + if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (NULL); } @@ -3795,7 +3798,7 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = port; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); @@ -3816,7 +3819,6 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, } } } - at += CMSG_ALIGN(cmh.cmsg_len); } return (NULL); } @@ -4263,6 +4265,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, atomic_subtract_int(&stcb->asoc.refcnt, 1); } #endif + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) @@ -4362,6 +4367,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } if (net == NULL) { @@ -4426,6 +4432,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } /* Cache the source address */ @@ -4452,6 +4459,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } if (over_addr == NULL) { @@ -4603,6 +4611,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, sin6->sin6_port = prev_port; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) { @@ -7210,7 +7221,7 @@ one_more_time: if ((sp->msg_is_complete) && (sp->length == 0)) { if (sp->sender_all_done) { /* - * We are doing differed cleanup. Last time through + * We are doing deferred cleanup. Last time through * when we took all the data the sender_all_done was * not set. */ @@ -8964,14 +8975,15 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) return; } chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; + chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->send_size = (uint16_t)chunk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = op_err; chk->whoTo = NULL; - chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; - chk->rec.chunk_id.can_take_data = 0; hdr = mtod(op_err, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_OPERATION_ERROR; hdr->chunk_flags = 0; @@ -9193,7 +9205,6 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown_ack; chk->whoTo = net; @@ -9248,7 +9259,6 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown; chk->whoTo = net; @@ -11290,6 +11300,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, return; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); @@ -12154,7 +12167,6 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index 12047ad91293..5fc57fe139ea 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -3700,7 +3700,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, return; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || (notification == SCTP_NOTIFY_INTERFACE_UP) || (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { @@ -7391,8 +7391,8 @@ sctp_set_state(struct sctp_tcb *stcb, int new_state) #endif KASSERT((new_state & ~SCTP_STATE_MASK) == 0, - ("sctp_set_state: Can't set substate (new_state = %x)", - new_state)); + ("sctp_set_state: Can't set substate (new_state = %x)", + new_state)); stcb->asoc.state = (stcb->asoc.state & ~SCTP_STATE_MASK) | new_state; if ((new_state == SCTP_STATE_SHUTDOWN_RECEIVED) || (new_state == SCTP_STATE_SHUTDOWN_SENT) || @@ -7402,7 +7402,7 @@ sctp_set_state(struct sctp_tcb *stcb, int new_state) #if defined(KDTRACE_HOOKS) if (((old_state & SCTP_STATE_MASK) != new_state) && !(((old_state & SCTP_STATE_MASK) == SCTP_STATE_EMPTY) && - (new_state == SCTP_STATE_INUSE))) { + (new_state == SCTP_STATE_INUSE))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif @@ -7416,14 +7416,14 @@ sctp_add_substate(struct sctp_tcb *stcb, int substate) #endif KASSERT((substate & SCTP_STATE_MASK) == 0, - ("sctp_add_substate: Can't set state (substate = %x)", - substate)); + ("sctp_add_substate: Can't set state (substate = %x)", + substate)); stcb->asoc.state |= substate; #if defined(KDTRACE_HOOKS) if (((substate & SCTP_STATE_ABOUT_TO_BE_FREED) && - ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || + ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || ((substate & SCTP_STATE_SHUTDOWN_PENDING) && - ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { + ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 83af89570d5e..bc59e312bb00 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -710,7 +710,7 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, struct inpcb *inp; /* We need the tcbinfo lock. */ - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 32047180f883..f3737888f3d0 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -1282,7 +1282,7 @@ out: * lock again but we also need some kasserts * here. */ - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); m = n; if (m) @@ -1324,7 +1324,7 @@ out: INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index b61161c3eed8..bbb031439fc8 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -800,7 +800,7 @@ findpcb: if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif #ifdef INET6 @@ -1358,7 +1358,7 @@ tfo_socket_result: INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* @@ -1405,7 +1405,7 @@ dropwithreset: else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif @@ -1429,7 +1429,7 @@ dropunlock: else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif @@ -1437,7 +1437,7 @@ dropunlock: INP_WUNLOCK(inp); drop: - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index e5a184fb713e..9eefd0d948d7 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -130,7 +130,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -static int syncache_respond(struct syncache *, struct syncache_head *, int, +static int syncache_respond(struct syncache *, struct syncache_head *, const struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); @@ -489,7 +489,7 @@ syncache_timer(void *xsch) free(s, M_TCPLOG); } - syncache_respond(sc, sch, 1, NULL); + syncache_respond(sc, sch, NULL); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } @@ -1413,7 +1413,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (syncache_respond(sc, sch, 1, m) == 0) { + if (syncache_respond(sc, sch, m) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1577,7 +1577,7 @@ skip_alloc: /* * Do a standard 3-way handshake. */ - if (syncache_respond(sc, sch, 0, m) == 0) { + if (syncache_respond(sc, sch, m) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1622,7 +1622,7 @@ tfo_expanded: * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int -syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, +syncache_respond(struct syncache *sc, struct syncache_head *sch, const struct mbuf *m0) { struct ip *ip = NULL; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index cae044c066c3..429f195ee954 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -756,13 +756,7 @@ struct inpcb * udp_notify(struct inpcb *inp, int errno) { - /* - * While udp_ctlinput() always calls udp_notify() with a read lock - * when invoking it directly, in_pcbnotifyall() currently uses write - * locks due to sharing code with TCP. For now, accept either a read - * or a write lock, but a read lock is sufficient. - */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); @@ -808,13 +802,13 @@ udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); + ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index 0bb5636602ff..93cbfe66140b 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -1936,6 +1936,10 @@ icmp6_rip6_input(struct mbuf **mp, int off) !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); + if (__predict_false(in6p->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(in6p); + continue; + } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index a9c73798b18c..b9d8e9e3187a 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -809,6 +809,10 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); + if (__predict_false(in6p->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(in6p); + continue; + } im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index b32e88663172..93aebbd3440b 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -698,7 +698,7 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, u_int32_t ulen, plen; uint16_t cscov; u_short fport; - uint8_t nxt, unlock_udbinfo; + uint8_t nxt, unlock_inp, unlock_udbinfo; /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; @@ -734,7 +734,22 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); - INP_RLOCK(inp); + /* + * In the following cases we want a write lock on the inp for either + * local operations or for possible route cache updates in the IPv6 + * output path: + * - on connected sockets (sin6 is NULL) for route cache updates, + * - when we are not bound to an address and source port (it is + * in6_pcbsetport() which will require the write lock). + */ + if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + inp->inp_lport == 0)) { + INP_WLOCK(inp); + unlock_inp = UH_WLOCKED; + } else { + INP_RLOCK(inp); + unlock_inp = UH_RLOCKED; + } nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; @@ -758,7 +773,10 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); pru = inetsw[ip_protox[nxt]].pr_usrreqs; @@ -766,13 +784,28 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, return ((*pru->pru_send)(so, flags_arg, m, (struct sockaddr *)sin6, control, td)); } - } + } else #endif + if (sin6 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Given this is either an IPv6-only socket or no INET is + * supported we will fail the send if the given destination + * address is a v4mapped address. + */ + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + return (EINVAL); + } if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); ip6_clearpktopts(&opt, -1); if (control) m_freem(control); @@ -786,12 +819,6 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); if (sin6 != NULL && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) { - INP_RUNLOCK(inp); - /* - * XXX there is a short window here which could lead to a race; - * should we re-check that what got us here is still valid? - */ - INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if (sin6 != NULL && @@ -972,9 +999,10 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); - error = ip6_output(m, optp, &inp->inp_route6, flags, + error = ip6_output(m, optp, + (unlock_inp == UH_WLOCKED) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); - if (unlock_udbinfo == UH_WLOCKED) + if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); @@ -987,12 +1015,20 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, release: if (unlock_udbinfo == UH_WLOCKED) { + KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, " + "non-excl inp lock: pcbinfo %p %#x inp %p %#x", + __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { + KASSERT(unlock_inp == UH_RLOCKED, ("%s: non-excl udbinfo lock, " + "excl inp lock: pcbinfo %p %#x inp %p %#x", + __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); - } else + } else if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else INP_RUNLOCK(inp); if (control) { ip6_clearpktopts(&opt, -1); diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 8f5004a0613c..1e027bf6076e 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -534,14 +534,6 @@ MALLOC_DEFINE(M_IPSEC_SPDCACHE, "ipsec-spdcache", "ipsec SPD cache"); VNET_DEFINE_STATIC(uma_zone_t, key_lft_zone); #define V_key_lft_zone VNET(key_lft_zone) -static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER(); -static struct mtx xforms_lock; -#define XFORMS_LOCK_INIT() \ - mtx_init(&xforms_lock, "xforms_list", "IPsec transforms list", MTX_DEF) -#define XFORMS_LOCK_DESTROY() mtx_destroy(&xforms_lock) -#define XFORMS_LOCK() mtx_lock(&xforms_lock) -#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock) - /* * set parameters into secpolicyindex buffer. * Must allocate secpolicyindex buffer passed to this function. @@ -717,7 +709,6 @@ static int key_delete(struct socket *, struct mbuf *, const struct sadb_msghdr *); static int key_delete_all(struct socket *, struct mbuf *, const struct sadb_msghdr *, struct secasindex *); -static void key_delete_xform(const struct xformsw *); static int key_get(struct socket *, struct mbuf *, const struct sadb_msghdr *); @@ -750,7 +741,6 @@ static int key_validate_ext(const struct sadb_ext *, int); static int key_align(struct mbuf *, struct sadb_msghdr *); static struct mbuf *key_setlifetime(struct seclifetime *, uint16_t); static struct mbuf *key_setkey(struct seckey *, uint16_t); -static int xform_init(struct secasvar *, u_short); static void spdcache_init(void); static void spdcache_clear(void); @@ -6167,7 +6157,7 @@ key_delete_all(struct socket *so, struct mbuf *m, * Larval SAs have not initialized tdb_xform, so it is safe to leave them * here when xform disappears. */ -static void +void key_delete_xform(const struct xformsw *xsp) { struct secasvar_queue drainq; @@ -8335,7 +8325,6 @@ key_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; - XFORMS_LOCK_INIT(); SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); @@ -8458,7 +8447,6 @@ key_destroy(void) #ifndef IPSEC_DEBUG2 callout_drain(&key_timer); #endif - XFORMS_LOCK_DESTROY(); SPTREE_LOCK_DESTROY(); REGTREE_LOCK_DESTROY(); SAHTREE_LOCK_DESTROY(); @@ -8617,70 +8605,3 @@ comp_algorithm_lookup(int alg) return (NULL); } -/* - * Register a transform. - */ -static int -xform_register(struct xformsw* xsp) -{ - struct xformsw *entry; - - XFORMS_LOCK(); - LIST_FOREACH(entry, &xforms, chain) { - if (entry->xf_type == xsp->xf_type) { - XFORMS_UNLOCK(); - return (EEXIST); - } - } - LIST_INSERT_HEAD(&xforms, xsp, chain); - XFORMS_UNLOCK(); - return (0); -} - -void -xform_attach(void *data) -{ - struct xformsw *xsp = (struct xformsw *)data; - - if (xform_register(xsp) != 0) - printf("%s: failed to register %s xform\n", __func__, - xsp->xf_name); -} - -void -xform_detach(void *data) -{ - struct xformsw *xsp = (struct xformsw *)data; - - XFORMS_LOCK(); - LIST_REMOVE(xsp, chain); - XFORMS_UNLOCK(); - - /* Delete all SAs related to this xform. */ - key_delete_xform(xsp); -} - -/* - * Initialize transform support in an sav. - */ -static int -xform_init(struct secasvar *sav, u_short xftype) -{ - struct xformsw *entry; - int ret; - - IPSEC_ASSERT(sav->tdb_xform == NULL, - ("tdb_xform is already initialized")); - - ret = EINVAL; - XFORMS_LOCK(); - LIST_FOREACH(entry, &xforms, chain) { - if (entry->xf_type == xftype) { - ret = (*entry->xf_init)(sav, entry); - break; - } - } - XFORMS_UNLOCK(); - return (ret); -} - diff --git a/sys/netipsec/key.h b/sys/netipsec/key.h index 6c3e05c039e8..7d7ae69f379d 100644 --- a/sys/netipsec/key.h +++ b/sys/netipsec/key.h @@ -46,6 +46,7 @@ struct sadb_msg; struct sadb_x_policy; struct secasindex; union sockaddr_union; +struct xformsw; struct secpolicy *key_newsp(void); struct secpolicy *key_allocsp(struct secpolicyindex *, u_int); @@ -74,6 +75,8 @@ int key_sockaddrcmp_withmask(const struct sockaddr *, const struct sockaddr *, int key_register_ifnet(struct secpolicy **, u_int); void key_unregister_ifnet(struct secpolicy **, u_int); +void key_delete_xform(const struct xformsw *); + extern u_long key_random(void); extern void key_randomfill(void *, size_t); extern void key_freereg(struct socket *); diff --git a/sys/netipsec/subr_ipsec.c b/sys/netipsec/subr_ipsec.c index acfe66acf458..37b686a7e91e 100644 --- a/sys/netipsec/subr_ipsec.c +++ b/sys/netipsec/subr_ipsec.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <netipsec/ipsec6.h> #include <netipsec/key.h> #include <netipsec/key_debug.h> +#include <netipsec/xform.h> #include <machine/atomic.h> /* @@ -124,14 +125,6 @@ ipsec6_setsockaddrs(const struct mbuf *m, union sockaddr_union *src, } #endif -#ifdef IPSEC_SUPPORT -/* - * IPSEC_SUPPORT - loading of ipsec.ko and tcpmd5.ko is supported. - * IPSEC + IPSEC_SUPPORT - loading tcpmd5.ko is supported. - * IPSEC + TCP_SIGNATURE - all is build in the kernel, do not build - * IPSEC_SUPPORT. - */ -#if !defined(IPSEC) || !defined(TCP_SIGNATURE) #define IPSEC_MODULE_INCR 2 static int ipsec_kmod_enter(volatile u_int *cntr) @@ -171,6 +164,83 @@ ipsec_kmod_drain(volatile u_int *cntr) pause("ipsecd", hz/2); } +static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER(); +static struct mtx xforms_lock; +MTX_SYSINIT(xfroms_list, &xforms_lock, "IPsec transforms list", MTX_DEF); +#define XFORMS_LOCK() mtx_lock(&xforms_lock) +#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock) + +void +xform_attach(void *data) +{ + struct xformsw *xsp, *entry; + + xsp = (struct xformsw *)data; + XFORMS_LOCK(); + LIST_FOREACH(entry, &xforms, chain) { + if (entry->xf_type == xsp->xf_type) { + XFORMS_UNLOCK(); + printf("%s: failed to register %s xform\n", + __func__, xsp->xf_name); + return; + } + } + LIST_INSERT_HEAD(&xforms, xsp, chain); + xsp->xf_cntr = IPSEC_MODULE_ENABLED; + XFORMS_UNLOCK(); +} + +void +xform_detach(void *data) +{ + struct xformsw *xsp = (struct xformsw *)data; + + XFORMS_LOCK(); + LIST_REMOVE(xsp, chain); + XFORMS_UNLOCK(); + + /* Delete all SAs related to this xform. */ + key_delete_xform(xsp); + if (xsp->xf_cntr & IPSEC_MODULE_ENABLED) + ipsec_kmod_drain(&xsp->xf_cntr); +} + +/* + * Initialize transform support in an sav. + */ +int +xform_init(struct secasvar *sav, u_short xftype) +{ + struct xformsw *entry; + int ret; + + IPSEC_ASSERT(sav->tdb_xform == NULL, + ("tdb_xform is already initialized")); + + XFORMS_LOCK(); + LIST_FOREACH(entry, &xforms, chain) { + if (entry->xf_type == xftype) { + ret = ipsec_kmod_enter(&entry->xf_cntr); + XFORMS_UNLOCK(); + if (ret != 0) + return (ret); + ret = (*entry->xf_init)(sav, entry); + ipsec_kmod_exit(&entry->xf_cntr); + return (ret); + } + } + XFORMS_UNLOCK(); + return (EINVAL); +} + +#ifdef IPSEC_SUPPORT +/* + * IPSEC_SUPPORT - loading of ipsec.ko and tcpmd5.ko is supported. + * IPSEC + IPSEC_SUPPORT - loading tcpmd5.ko is supported. + * IPSEC + TCP_SIGNATURE - all is build in the kernel, do not build + * IPSEC_SUPPORT. + */ +#if !defined(IPSEC) || !defined(TCP_SIGNATURE) #define METHOD_DECL(...) __VA_ARGS__ #define METHOD_ARGS(...) __VA_ARGS__ #define IPSEC_KMOD_METHOD(type, name, sc, method, decl, args) \ diff --git a/sys/netipsec/xform.h b/sys/netipsec/xform.h index 389d0b66850b..910a88a706f3 100644 --- a/sys/netipsec/xform.h +++ b/sys/netipsec/xform.h @@ -86,14 +86,16 @@ struct xform_data { #define XF_IPCOMP 6 /* IPCOMP */ struct xformsw { - u_short xf_type; /* xform ID */ - char *xf_name; /* human-readable name */ + u_short xf_type; /* xform ID */ + const char *xf_name; /* human-readable name */ int (*xf_init)(struct secasvar*, struct xformsw*); /* setup */ int (*xf_zeroize)(struct secasvar*); /* cleanup */ int (*xf_input)(struct mbuf*, struct secasvar*, /* input */ int, int); int (*xf_output)(struct mbuf*, /* output */ struct secpolicy *, struct secasvar *, u_int, int, int); + + volatile u_int xf_cntr; LIST_ENTRY(xformsw) chain; }; @@ -103,6 +105,7 @@ const struct comp_algo * comp_algorithm_lookup(int); void xform_attach(void *); void xform_detach(void *); +int xform_init(struct secasvar *, u_short); struct cryptoini; /* XF_AH */ diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 08eea294b4e2..13c3c6463c0c 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -1719,24 +1719,28 @@ pf_purge_expired_states(u_int i, int maxcheck) while (maxcheck > 0) { ih = &V_pf_idhash[i]; + + /* only take the lock if we expect to do work */ + if (!LIST_EMPTY(&ih->states)) { relock: - PF_HASHROW_LOCK(ih); - LIST_FOREACH(s, &ih->states, entry) { - if (pf_state_expires(s) <= time_uptime) { - V_pf_status.states -= - pf_unlink_state(s, PF_ENTER_LOCKED); - goto relock; + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + if (pf_state_expires(s) <= time_uptime) { + V_pf_status.states -= + pf_unlink_state(s, PF_ENTER_LOCKED); + goto relock; + } + s->rule.ptr->rule_flag |= PFRULE_REFS; + if (s->nat_rule.ptr != NULL) + s->nat_rule.ptr->rule_flag |= PFRULE_REFS; + if (s->anchor.ptr != NULL) + s->anchor.ptr->rule_flag |= PFRULE_REFS; + s->kif->pfik_flags |= PFI_IFLAG_REFS; + if (s->rt_kif) + s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } - s->rule.ptr->rule_flag |= PFRULE_REFS; - if (s->nat_rule.ptr != NULL) - s->nat_rule.ptr->rule_flag |= PFRULE_REFS; - if (s->anchor.ptr != NULL) - s->anchor.ptr->rule_flag |= PFRULE_REFS; - s->kif->pfik_flags |= PFI_IFLAG_REFS; - if (s->rt_kif) - s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; + PF_HASHROW_UNLOCK(ih); } - PF_HASHROW_UNLOCK(ih); /* Return when we hit end of hash. */ if (++i > pf_hashmask) { diff --git a/sys/opencrypto/cryptosoft.c b/sys/opencrypto/cryptosoft.c index a4a719b5fbed..0a5a20640a66 100644 --- a/sys/opencrypto/cryptosoft.c +++ b/sys/opencrypto/cryptosoft.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <sys/rwlock.h> #include <sys/endian.h> #include <sys/limits.h> +#include <sys/mutex.h> #include <crypto/blowfish/blowfish.h> #include <crypto/sha1.h> @@ -765,6 +766,7 @@ swcr_newsession(device_t dev, crypto_session_t cses, struct cryptoini *cri) return EINVAL; ses = crypto_get_driver_session(cses); + mtx_init(&ses->swcr_lock, "swcr session lock", NULL, MTX_DEF); for (i = 0; cri != NULL && i < nitems(ses->swcr_algorithms); i++) { swd = &ses->swcr_algorithms[i]; @@ -1022,6 +1024,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) ses = crypto_get_driver_session(cses); + mtx_destroy(&ses->swcr_lock); for (i = 0; i < nitems(ses->swcr_algorithms); i++) { swd = &ses->swcr_algorithms[i]; @@ -1109,7 +1112,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) static int swcr_process(device_t dev, struct cryptop *crp, int hint) { - struct swcr_session *ses; + struct swcr_session *ses = NULL; struct cryptodesc *crd; struct swcr_data *sw; size_t i; @@ -1124,6 +1127,7 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) } ses = crypto_get_driver_session(crp->crp_session); + mtx_lock(&ses->swcr_lock); /* Go through crypto descriptors, processing as we go */ for (crd = crp->crp_desc; crd; crd = crd->crd_next) { @@ -1213,6 +1217,8 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) } done: + if (ses) + mtx_unlock(&ses->swcr_lock); crypto_done(crp); return 0; } diff --git a/sys/opencrypto/cryptosoft.h b/sys/opencrypto/cryptosoft.h index d88b09d4e1c0..d787dc243ae6 100644 --- a/sys/opencrypto/cryptosoft.h +++ b/sys/opencrypto/cryptosoft.h @@ -58,6 +58,7 @@ struct swcr_data { }; struct swcr_session { + struct mtx swcr_lock; struct swcr_data swcr_algorithms[2]; unsigned swcr_nalgs; }; diff --git a/sys/powerpc/conf/GENERIC64 b/sys/powerpc/conf/GENERIC64 index ec8dc06f0980..5913100d037a 100644 --- a/sys/powerpc/conf/GENERIC64 +++ b/sys/powerpc/conf/GENERIC64 @@ -120,6 +120,7 @@ device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # NVM Express (NVMe) support device nvme # base NVMe driver +options NVME_USE_NVD=0 # prefer the cam(4) based nda(4) driver device nvd # expose NVMe namespaces as disks, depends on nvme # SCSI Controllers diff --git a/sys/powerpc/ofw/ofw_machdep.c b/sys/powerpc/ofw/ofw_machdep.c index 7e231373d829..c647ce817905 100644 --- a/sys/powerpc/ofw/ofw_machdep.c +++ b/sys/powerpc/ofw/ofw_machdep.c @@ -69,6 +69,10 @@ __FBSDID("$FreeBSD$"); #include <contrib/libfdt/libfdt.h> +#ifdef POWERNV +#include <powerpc/powernv/opal.h> +#endif + static void *fdt; int ofw_real_mode; @@ -338,6 +342,34 @@ excise_initrd_region(struct mem_region *avail, int asz) return (asz); } +#ifdef POWERNV +static int +excise_msi_region(struct mem_region *avail, int asz) +{ + uint64_t start, end; + struct mem_region initrdmap[1]; + + /* + * This range of physical addresses is used to implement optimized + * 32 bit MSI interrupts on POWER9. Exclude it to avoid accidentally + * using it for DMA, as this will cause an immediate PHB fence. + * While we could theoretically turn off this behavior in the ETU, + * doing so would break 32-bit MSI, so just reserve the range in + * the physical map instead. + * See section 4.4.2.8 of the PHB4 specification. + */ + start = 0x00000000ffff0000ul; + end = 0x00000000fffffffful; + + initrdmap[0].mr_start = start; + initrdmap[0].mr_size = end - start; + + asz = excise_reserved_regions(avail, asz, initrdmap, 1); + + return (asz); +} +#endif + static int excise_fdt_reserved(struct mem_region *avail, int asz) { @@ -430,6 +462,11 @@ ofw_mem_regions(struct mem_region *memp, int *memsz, asz = excise_initrd_region(availp, asz); #endif +#ifdef POWERNV + if (opal_check() == 0) + asz = excise_msi_region(availp, asz); +#endif + *memsz = msz; *availsz = asz; } diff --git a/sys/riscv/include/fpe.h b/sys/riscv/include/fpe.h index 1294ab0de8fa..a519094d4272 100644 --- a/sys/riscv/include/fpe.h +++ b/sys/riscv/include/fpe.h @@ -34,5 +34,6 @@ #define _MACHINE_FPE_H_ void fpe_state_save(struct thread *td); +void fpe_state_clear(void); #endif /* !_MACHINE_FPE_H_ */ diff --git a/sys/riscv/riscv/machdep.c b/sys/riscv/riscv/machdep.c index 64d20126f3ad..430336408368 100644 --- a/sys/riscv/riscv/machdep.c +++ b/sys/riscv/riscv/machdep.c @@ -204,13 +204,14 @@ fill_fpregs(struct thread *td, struct fpreg *regs) * If we have just been running FPE instructions we will * need to save the state to memcpy it below. */ - fpe_state_save(td); + if (td == curthread) + fpe_state_save(td); memcpy(regs->fp_x, pcb->pcb_x, sizeof(regs->fp_x)); regs->fp_fcsr = pcb->pcb_fcsr; } else #endif - memset(regs->fp_x, 0, sizeof(regs->fp_x)); + memset(regs, 0, sizeof(*regs)); return (0); } @@ -219,12 +220,17 @@ int set_fpregs(struct thread *td, struct fpreg *regs) { #ifdef FPE + struct trapframe *frame; struct pcb *pcb; + frame = td->td_frame; pcb = td->td_pcb; memcpy(pcb->pcb_x, regs->fp_x, sizeof(regs->fp_x)); pcb->pcb_fcsr = regs->fp_fcsr; + pcb->pcb_fpflags |= PCB_FP_STARTED; + frame->tf_sstatus &= ~SSTATUS_FS_MASK; + frame->tf_sstatus |= SSTATUS_FS_CLEAN; #endif return (0); diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 8582667b27b9..b6a58ed796b6 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -3244,11 +3244,27 @@ pmap_activate(struct thread *td) critical_exit(); } +static void +pmap_sync_icache_one(void *arg __unused) +{ + + __asm __volatile("fence.i"); +} + void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { - panic("RISCVTODO: pmap_sync_icache"); + /* + * From the RISC-V User-Level ISA V2.2: + * + * "To make a store to instruction memory visible to all + * RISC-V harts, the writing hart has to execute a data FENCE + * before requesting that all remote RISC-V harts execute a + * FENCE.I." + */ + __asm __volatile("fence"); + smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* diff --git a/sys/riscv/riscv/swtch.S b/sys/riscv/riscv/swtch.S index b17de23ad395..8192abaf9869 100644 --- a/sys/riscv/riscv/swtch.S +++ b/sys/riscv/riscv/swtch.S @@ -154,6 +154,59 @@ END(fpe_state_save) #endif /* FPE */ /* + * void + * fpe_state_clear(void) + */ +ENTRY(fpe_state_clear) + /* + * Enable FPE usage in supervisor mode, + * so we can access registers. + */ + li t0, SSTATUS_FS_INITIAL + csrs sstatus, t0 + + fscsr zero + fcvt.d.l f0, zero + fcvt.d.l f1, zero + fcvt.d.l f2, zero + fcvt.d.l f3, zero + fcvt.d.l f4, zero + fcvt.d.l f5, zero + fcvt.d.l f6, zero + fcvt.d.l f7, zero + fcvt.d.l f8, zero + fcvt.d.l f9, zero + fcvt.d.l f10, zero + fcvt.d.l f11, zero + fcvt.d.l f12, zero + fcvt.d.l f13, zero + fcvt.d.l f14, zero + fcvt.d.l f15, zero + fcvt.d.l f16, zero + fcvt.d.l f17, zero + fcvt.d.l f18, zero + fcvt.d.l f19, zero + fcvt.d.l f20, zero + fcvt.d.l f21, zero + fcvt.d.l f22, zero + fcvt.d.l f23, zero + fcvt.d.l f24, zero + fcvt.d.l f25, zero + fcvt.d.l f26, zero + fcvt.d.l f27, zero + fcvt.d.l f28, zero + fcvt.d.l f29, zero + fcvt.d.l f30, zero + fcvt.d.l f31, zero + + /* Disable FPE usage in supervisor mode. */ + li t0, SSTATUS_FS_MASK + csrc sstatus, t0 + + ret +END(fpe_state_clear) + +/* * void cpu_throw(struct thread *old, struct thread *new) */ ENTRY(cpu_throw) diff --git a/sys/riscv/riscv/trap.c b/sys/riscv/riscv/trap.c index ced05c52588f..57f5c558d286 100644 --- a/sys/riscv/riscv/trap.c +++ b/sys/riscv/riscv/trap.c @@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_param.h> #include <vm/vm_extern.h> +#ifdef FPE +#include <machine/fpe.h> +#endif #include <machine/frame.h> #include <machine/pcb.h> #include <machine/pcpu.h> @@ -363,7 +366,9 @@ do_trap_user(struct trapframe *frame) * May be a FPE trap. Enable FPE usage * for this thread and try again. */ - frame->tf_sstatus |= SSTATUS_FS_INITIAL; + fpe_state_clear(); + frame->tf_sstatus &= ~SSTATUS_FS_MASK; + frame->tf_sstatus |= SSTATUS_FS_CLEAN; pcb->pcb_fpflags |= PCB_FP_STARTED; break; } diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c index 87ceeb670e76..c00ddc126f5a 100644 --- a/sys/security/audit/audit.c +++ b/sys/security/audit/audit.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2005 Apple Inc. - * Copyright (c) 2006-2007, 2016-2017 Robert N. M. Watson + * Copyright (c) 2006-2007, 2016-2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -98,8 +98,12 @@ static SYSCTL_NODE(_security, OID_AUTO, audit, CTLFLAG_RW, 0, * * Define the audit control flags. */ -int __read_frequently audit_enabled; -int audit_suspended; +int audit_trail_enabled; +int audit_trail_suspended; +#ifdef KDTRACE_HOOKS +u_int audit_dtrace_enabled; +#endif +int __read_frequently audit_syscalls_enabled; /* * Flags controlling behavior in low storage situations. Should we panic if @@ -198,6 +202,33 @@ static struct rwlock audit_kinfo_lock; #define KINFO_RUNLOCK() rw_runlock(&audit_kinfo_lock) #define KINFO_WUNLOCK() rw_wunlock(&audit_kinfo_lock) +/* + * Check various policies to see if we should enable system-call audit hooks. + * Note that despite the mutex being held, we want to assign a value exactly + * once, as checks of the flag are performed lock-free for performance + * reasons. The mutex is used to get a consistent snapshot of policy state -- + * e.g., safely accessing the two audit_trail flags. + */ +void +audit_syscalls_enabled_update(void) +{ + + mtx_lock(&audit_mtx); +#ifdef KDTRACE_HOOKS + if (audit_dtrace_enabled) + audit_syscalls_enabled = 1; + else { +#endif + if (audit_trail_enabled && !audit_trail_suspended) + audit_syscalls_enabled = 1; + else + audit_syscalls_enabled = 0; +#ifdef KDTRACE_HOOKS + } +#endif + mtx_unlock(&audit_mtx); +} + void audit_set_kinfo(struct auditinfo_addr *ak) { @@ -303,8 +334,9 @@ static void audit_init(void) { - audit_enabled = 0; - audit_suspended = 0; + audit_trail_enabled = 0; + audit_trail_suspended = 0; + audit_syscalls_enabled = 0; audit_panic_on_write_fail = 0; audit_fail_stop = 0; audit_in_failure = 0; @@ -337,6 +369,9 @@ audit_init(void) sizeof(struct kaudit_record), audit_record_ctor, audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); + /* First initialisation of audit_syscalls_enabled. */ + audit_syscalls_enabled_update(); + /* Initialize the BSM audit subsystem. */ kau_init(); @@ -378,10 +413,6 @@ currecord(void) } /* - * XXXAUDIT: There are a number of races present in the code below due to - * release and re-grab of the mutex. The code should be revised to become - * slightly less racy. - * * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available * pre_q space, suspending the system call until there is room? */ @@ -389,13 +420,6 @@ struct kaudit_record * audit_new(int event, struct thread *td) { struct kaudit_record *ar; - int no_record; - - mtx_lock(&audit_mtx); - no_record = (audit_suspended || !audit_enabled); - mtx_unlock(&audit_mtx); - if (no_record) - return (NULL); /* * Note: the number of outstanding uncommitted audit records is @@ -529,9 +553,13 @@ audit_commit(struct kaudit_record *ar, int error, int retval) /* * Note: it could be that some records initiated while audit was * enabled should still be committed? + * + * NB: The check here is not for audit_syscalls because any + * DTrace-related obligations have been fulfilled above -- we're just + * down to the trail and pipes now. */ mtx_lock(&audit_mtx); - if (audit_suspended || !audit_enabled) { + if (audit_trail_suspended || !audit_trail_enabled) { audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); @@ -557,6 +585,10 @@ audit_commit(struct kaudit_record *ar, int error, int retval) * responsible for deciding whether or not to audit the call (preselection), * and if so, allocating a per-thread audit record. audit_new() will fill in * basic thread/credential properties. + * + * This function will be entered only if audit_syscalls_enabled was set in the + * macro wrapper for this function. It could be cleared by the time this + * function runs, but that is an acceptable race. */ void audit_syscall_enter(unsigned short code, struct thread *td) diff --git a/sys/security/audit/audit.h b/sys/security/audit/audit.h index 055194d3a88d..f24bc1e503b2 100644 --- a/sys/security/audit/audit.h +++ b/sys/security/audit/audit.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2005 Apple Inc. - * Copyright (c) 2016-2017 Robert N. M. Watson + * Copyright (c) 2016-2018 Robert N. M. Watson * All rights reserved. * * This software was developed by BAE Systems, the University of Cambridge @@ -55,14 +55,23 @@ #include <sys/sysctl.h> /* - * Audit subsystem condition flags. The audit_enabled flag is set and + * Audit subsystem condition flags. The audit_trail_enabled flag is set and * removed automatically as a result of configuring log files, and can be * observed but should not be directly manipulated. The audit suspension * flag permits audit to be temporarily disabled without reconfiguring the * audit target. + * + * As DTrace can also request system-call auditing, a further + * audit_syscalls_enabled flag tracks whether newly entering system calls + * should be considered for auditing or not. + * + * XXXRW: Move trail flags to audit_private.h, as they no longer need to be + * visible outside the audit code...? */ -extern int audit_enabled; -extern int audit_suspended; +extern u_int audit_dtrace_enabled; +extern int audit_trail_enabled; +extern int audit_trail_suspended; +extern int audit_syscalls_enabled; void audit_syscall_enter(unsigned short code, struct thread *td); void audit_syscall_exit(int error, struct thread *td); @@ -139,7 +148,7 @@ void audit_thread_free(struct thread *td); /* * Define macros to wrap the audit_arg_* calls by checking the global - * audit_enabled flag before performing the actual call. + * audit_syscalls_enabled flag before performing the actual call. */ #define AUDITING_TD(td) ((td)->td_pflags & TDP_AUDITREC) @@ -369,7 +378,7 @@ void audit_thread_free(struct thread *td); } while (0) #define AUDIT_SYSCALL_ENTER(code, td) do { \ - if (audit_enabled) { \ + if (audit_syscalls_enabled) { \ audit_syscall_enter(code, td); \ } \ } while (0) @@ -377,7 +386,7 @@ void audit_thread_free(struct thread *td); /* * Wrap the audit_syscall_exit() function so that it is called only when * we have a audit record on the thread. Audit records can persist after - * auditing is disabled, so we don't just check audit_enabled here. + * auditing is disabled, so we don't just check audit_syscalls_enabled here. */ #define AUDIT_SYSCALL_EXIT(error, td) do { \ if (td->td_pflags & TDP_AUDITREC) \ diff --git a/sys/security/audit/audit_dtrace.c b/sys/security/audit/audit_dtrace.c index c456ab71400a..985baf142ab3 100644 --- a/sys/security/audit/audit_dtrace.c +++ b/sys/security/audit/audit_dtrace.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * This software was developed by BAE Systems, the University of Cambridge @@ -147,8 +147,12 @@ static dtrace_provider_id_t dtaudit_id; * maintain a global flag tracking whether any dtaudit probes are enabled. If * not, don't bother doing all that work whenever potential queries about * events turn up during preselection or commit. + * + * NB: We used to maintain our own variable in dtaudit, but now use the + * centralized audit_dtrace_enabled variable imported from the audit code. + * + * static uint_t dtaudit_probes_enabled; */ -static uint_t dtaudit_probes_enabled; /* * Check dtaudit policy for the event to see whether this is an event we would @@ -179,7 +183,7 @@ dtaudit_preselect(au_id_t auid, au_event_t event, au_class_t class) * NB: Lockless reads here may return a slightly stale value; this is * considered better than acquiring a lock, however. */ - if (!dtaudit_probes_enabled) + if (!audit_dtrace_enabled) return (NULL); ene = au_evnamemap_lookup(event); if (ene == NULL) @@ -457,7 +461,8 @@ dtaudit_enable(void *arg, dtrace_id_t id, void *parg) ene->ene_commit_probe_enabled = 1; else ene->ene_bsm_probe_enabled = 1; - refcount_acquire(&dtaudit_probes_enabled); + refcount_acquire(&audit_dtrace_enabled); + audit_syscalls_enabled_update(); } static void @@ -474,7 +479,8 @@ dtaudit_disable(void *arg, dtrace_id_t id, void *parg) ene->ene_commit_probe_enabled = 0; else ene->ene_bsm_probe_enabled = 0; - (void)refcount_release(&dtaudit_probes_enabled); + (void)refcount_release(&audit_dtrace_enabled); + audit_syscalls_enabled_update(); } static void diff --git a/sys/security/audit/audit_private.h b/sys/security/audit/audit_private.h index 5e7a3934835e..4aa811bc1516 100644 --- a/sys/security/audit/audit_private.h +++ b/sys/security/audit/audit_private.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2009 Apple Inc. - * Copyright (c) 2016-2017 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -344,6 +344,13 @@ void audit_commit(struct kaudit_record *ar, int error, struct kaudit_record *audit_new(int event, struct thread *td); /* + * Function to update the audit_syscalls_enabled flag, whose value is affected + * by configuration of the audit trail/pipe mechanism and DTrace. Call this + * function when any of the inputs to that policy change. + */ +void audit_syscalls_enabled_update(void); + +/* * Functions relating to the conversion of internal kernel audit records to * the BSM file format. */ diff --git a/sys/security/audit/audit_syscalls.c b/sys/security/audit/audit_syscalls.c index 89af1a7d2817..a092e27b0a74 100644 --- a/sys/security/audit/audit_syscalls.c +++ b/sys/security/audit/audit_syscalls.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2009 Apple Inc. - * Copyright (c) 2016 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -368,7 +368,7 @@ sys_auditon(struct thread *td, struct auditon_args *uap) case A_OLDGETCOND: case A_GETCOND: if (uap->length == sizeof(udata.au_cond64)) { - if (audit_enabled && !audit_suspended) + if (audit_trail_enabled && !audit_trail_suspended) udata.au_cond64 = AUC_AUDITING; else udata.au_cond64 = AUC_NOAUDIT; @@ -376,7 +376,7 @@ sys_auditon(struct thread *td, struct auditon_args *uap) } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); - if (audit_enabled && !audit_suspended) + if (audit_trail_enabled && !audit_trail_suspended) udata.au_cond = AUC_AUDITING; else udata.au_cond = AUC_NOAUDIT; @@ -386,25 +386,27 @@ sys_auditon(struct thread *td, struct auditon_args *uap) case A_SETCOND: if (uap->length == sizeof(udata.au_cond64)) { if (udata.au_cond64 == AUC_NOAUDIT) - audit_suspended = 1; + audit_trail_suspended = 1; if (udata.au_cond64 == AUC_AUDITING) - audit_suspended = 0; + audit_trail_suspended = 0; if (udata.au_cond64 == AUC_DISABLED) { - audit_suspended = 1; + audit_trail_suspended = 1; audit_shutdown(NULL, 0); } + audit_syscalls_enabled_update(); break; } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); if (udata.au_cond == AUC_NOAUDIT) - audit_suspended = 1; + audit_trail_suspended = 1; if (udata.au_cond == AUC_AUDITING) - audit_suspended = 0; + audit_trail_suspended = 0; if (udata.au_cond == AUC_DISABLED) { - audit_suspended = 1; + audit_trail_suspended = 1; audit_shutdown(NULL, 0); } + audit_syscalls_enabled_update(); break; case A_GETCLASS: @@ -826,10 +828,11 @@ sys_auditctl(struct thread *td, struct auditctl_args *uap) crhold(cred); /* - * XXXAUDIT: Should audit_suspended actually be cleared by + * XXXAUDIT: Should audit_trail_suspended actually be cleared by * audit_worker? */ - audit_suspended = 0; + audit_trail_suspended = 0; + audit_syscalls_enabled_update(); audit_rotate_vnode(cred, vp); diff --git a/sys/security/audit/audit_worker.c b/sys/security/audit/audit_worker.c index fa6fe66db259..f169ab8d9dd1 100644 --- a/sys/security/audit/audit_worker.c +++ b/sys/security/audit/audit_worker.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2008 Apple Inc. - * Copyright (c) 2006-2008, 2016 Robert N. M. Watson + * Copyright (c) 2006-2008, 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -305,7 +305,8 @@ fail_enospc: "Audit log space exhausted and fail-stop set."); } (void)audit_send_trigger(AUDIT_TRIGGER_NO_SPACE); - audit_suspended = 1; + audit_trail_suspended = 1; + audit_syscalls_enabled_update(); /* FALLTHROUGH */ fail: @@ -518,7 +519,8 @@ audit_rotate_vnode(struct ucred *cred, struct vnode *vp) audit_vp = vp; audit_size = vattr.va_size; audit_file_rotate_wait = 0; - audit_enabled = (audit_vp != NULL); + audit_trail_enabled = (audit_vp != NULL); + audit_syscalls_enabled_update(); AUDIT_WORKER_UNLOCK(); /* diff --git a/sys/sys/_domainset.h b/sys/sys/_domainset.h index 30d8501c8e4a..34d8f61ca9fc 100644 --- a/sys/sys/_domainset.h +++ b/sys/sys/_domainset.h @@ -54,7 +54,7 @@ typedef struct _domainset domainset_t; struct domainset; struct domainset_ref { struct domainset * volatile dr_policy; - int dr_iterator; + unsigned int dr_iterator; }; #endif /* !_SYS__DOMAINSET_H_ */ diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index 68595091a580..520f18c5a969 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -101,7 +101,7 @@ struct malloc_type_internal { uint32_t mti_probes[DTMALLOC_PROBE_MAX]; /* DTrace probe ID array. */ u_char mti_zone; - struct malloc_type_stats mti_stats[MAXCPU]; + struct malloc_type_stats *mti_stats; }; /* diff --git a/sys/sys/module.h b/sys/sys/module.h index b40870d32941..89377df401a8 100644 --- a/sys/sys/module.h +++ b/sys/sys/module.h @@ -178,12 +178,12 @@ struct mod_pnp_match_info * to allow external tools to parse their internal device tables * to make an informed guess about what driver(s) to load. */ -#define MODULE_PNP_INFO(d, b, unique, t, l, n) \ +#define MODULE_PNP_INFO(d, b, unique, t, n) \ static const struct mod_pnp_match_info _module_pnp_##b##_##unique = { \ .descr = d, \ .bus = #b, \ .table = t, \ - .entry_len = l, \ + .entry_len = sizeof((t)[0]), \ .num_entry = n \ }; \ MODULE_METADATA(_md_##b##_pnpinfo_##unique, MDT_PNP_INFO, \ diff --git a/sys/sys/pmc.h b/sys/sys/pmc.h index 0c7a3a331abd..be4c1cda03b0 100644 --- a/sys/sys/pmc.h +++ b/sys/sys/pmc.h @@ -936,6 +936,8 @@ struct pmc_sample { uint16_t ps_flags; /* other flags */ lwpid_t ps_tid; /* thread id */ pid_t ps_pid; /* process PID or -1 */ + int ps_ticks; /* ticks at sample time */ + /* pad */ struct thread *ps_td; /* which thread */ struct pmc *ps_pmc; /* interrupting PMC */ uintptr_t *ps_pc; /* (const) callchain start */ @@ -943,16 +945,23 @@ struct pmc_sample { }; #define PMC_SAMPLE_FREE ((uint16_t) 0) -#define PMC_SAMPLE_INUSE ((uint16_t) 0xFFFF) +#define PMC_USER_CALLCHAIN_PENDING ((uint16_t) 0xFFFF) struct pmc_samplebuffer { - struct pmc_sample * volatile ps_read; /* read pointer */ - struct pmc_sample * volatile ps_write; /* write pointer */ + volatile uint64_t ps_prodidx; /* producer index */ + volatile uint64_t ps_considx; /* consumer index */ uintptr_t *ps_callchains; /* all saved call chains */ - struct pmc_sample *ps_fence; /* one beyond ps_samples[] */ struct pmc_sample ps_samples[]; /* array of sample entries */ }; +#define PMC_CONS_SAMPLE(psb) \ + (&(psb)->ps_samples[(psb)->ps_considx & pmc_sample_mask]) + +#define PMC_CONS_SAMPLE_OFF(psb, off) \ + (&(psb)->ps_samples[(off) & pmc_sample_mask]) + +#define PMC_PROD_SAMPLE(psb) \ + (&(psb)->ps_samples[(psb)->ps_prodidx & pmc_sample_mask]) /* * struct pmc_cpustate @@ -1216,7 +1225,6 @@ int pmc_save_user_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); struct pmc_mdep *pmc_mdep_alloc(int nclasses); void pmc_mdep_free(struct pmc_mdep *md); -void pmc_flush_samples(int cpu); uint64_t pmc_rdtsc(void); #endif /* _KERNEL */ #endif /* _SYS_PMC_H_ */ diff --git a/sys/sys/pmckern.h b/sys/sys/pmckern.h index 27e02af73cf2..e892d658a1ca 100644 --- a/sys/sys/pmckern.h +++ b/sys/sys/pmckern.h @@ -67,10 +67,12 @@ #define PMC_FN_THR_EXIT_LOG 16 #define PMC_FN_PROC_CREATE_LOG 17 -#define PMC_HR 0 /* Hardware ring buffer */ -#define PMC_SR 1 /* Software ring buffer */ -#define PMC_UR 2 /* userret ring buffer */ -#define PMC_NUM_SR (PMC_UR+1) +typedef enum ring_type { + PMC_HR = 0, /* Hardware ring buffer */ + PMC_SR = 1, /* Software ring buffer */ + PMC_UR = 2, /* userret ring buffer */ + PMC_NUM_SR = PMC_UR+1 +} ring_type_t; struct pmckern_procexec { int pm_credentialschanged; diff --git a/sys/sys/racct.h b/sys/sys/racct.h index ec3322bdfdf9..84de705f24af 100644 --- a/sys/sys/racct.h +++ b/sys/sys/racct.h @@ -164,6 +164,15 @@ extern struct mtx racct_lock; #define RACCT_UNLOCK() mtx_unlock(&racct_lock) #define RACCT_LOCK_ASSERT() mtx_assert(&racct_lock, MA_OWNED) +#define RACCT_PROC_LOCK(p) do { \ + if (__predict_false(racct_enable)) \ + PROC_LOCK(p); \ +} while (0) +#define RACCT_PROC_UNLOCK(p) do { \ + if (__predict_false(racct_enable)) \ + PROC_UNLOCK(p); \ +} while (0) + int racct_add(struct proc *p, int resource, uint64_t amount); void racct_add_cred(struct ucred *cred, int resource, uint64_t amount); void racct_add_force(struct proc *p, int resource, uint64_t amount); @@ -189,6 +198,9 @@ void racct_proc_throttle(struct proc *p, int timeout); #else +#define RACCT_PROC_LOCK(p) do { } while (0) +#define RACCT_PROC_UNLOCK(p) do { } while (0) + static inline int racct_add(struct proc *p, int resource, uint64_t amount) { diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h index c61ac3e1a6cf..a60eb3b6dbb3 100644 --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -93,12 +93,10 @@ struct racct; * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock - * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ - struct mtx ui_vmsize_mtx; - vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ + u_long ui_vmsize; /* (b) pages of swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index 3b5f750db5c9..c6c7d083cefb 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -349,7 +349,7 @@ static inline int sigdeferstop(int mode) { - if (mode == SIGDEFERSTOP_NOP) + if (__predict_true(mode == SIGDEFERSTOP_NOP)) return (SIGDEFERSTOP_VAL_NCHG); return (sigdeferstop_impl(mode)); } @@ -358,7 +358,7 @@ static inline void sigallowstop(int prev) { - if (prev == SIGDEFERSTOP_VAL_NCHG) + if (__predict_true(prev == SIGDEFERSTOP_VAL_NCHG)) return; sigallowstop_impl(prev); } diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index 6e880f04950c..436186e90d29 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -175,6 +175,10 @@ #define SYS_setgid 181 #define SYS_setegid 182 #define SYS_seteuid 183 + /* 184 is obsolete lfs_bmapv */ + /* 185 is obsolete lfs_markv */ + /* 186 is obsolete lfs_segclean */ + /* 187 is obsolete lfs_segwait */ #define SYS_freebsd11_stat 188 #define SYS_freebsd11_fstat 189 #define SYS_freebsd11_lstat 190 @@ -198,6 +202,7 @@ #define SYS_freebsd7___semctl 220 #define SYS_semget 221 #define SYS_semop 222 + /* 223 is obsolete semconfig */ #define SYS_freebsd7_msgctl 224 #define SYS_msgget 225 #define SYS_msgsnd 226 @@ -306,13 +311,25 @@ #define SYS_getresgid 361 #define SYS_kqueue 362 #define SYS_freebsd11_kevent 363 + /* 364 is obsolete __cap_get_proc */ + /* 365 is obsolete __cap_set_proc */ + /* 366 is obsolete __cap_get_fd */ + /* 367 is obsolete __cap_get_file */ + /* 368 is obsolete __cap_set_fd */ + /* 369 is obsolete __cap_set_file */ #define SYS_extattr_set_fd 371 #define SYS_extattr_get_fd 372 #define SYS_extattr_delete_fd 373 #define SYS___setugid 374 + /* 375 is obsolete nfsclnt */ #define SYS_eaccess 376 #define SYS_afs3_syscall 377 #define SYS_nmount 378 + /* 379 is obsolete kse_exit */ + /* 380 is obsolete kse_wakeup */ + /* 381 is obsolete kse_create */ + /* 382 is obsolete kse_thr_interrupt */ + /* 383 is obsolete kse_release */ #define SYS___mac_get_proc 384 #define SYS___mac_set_proc 385 #define SYS___mac_get_fd 386 @@ -363,6 +380,7 @@ #define SYS_extattr_list_fd 437 #define SYS_extattr_list_file 438 #define SYS_extattr_list_link 439 + /* 440 is obsolete kse_switchin */ #define SYS_ksem_timedwait 441 #define SYS_thr_suspend 442 #define SYS_thr_wake 443 @@ -465,6 +483,8 @@ #define SYS_ppoll 545 #define SYS_futimens 546 #define SYS_utimensat 547 + /* 548 is obsolete numa_getaffinity */ + /* 549 is obsolete numa_setaffinity */ #define SYS_fdatasync 550 #define SYS_fstat 551 #define SYS_fstatat 552 diff --git a/sys/sys/user.h b/sys/sys/user.h index f5353f15b0fc..67acf2d0740d 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -346,85 +346,96 @@ struct kinfo_file { int64_t kf_offset; /* Seek location. */ union { struct { - /* Sendq size */ - uint32_t kf_sock_sendq; - /* Socket domain. */ - int kf_sock_domain0; - /* Socket type. */ - int kf_sock_type0; - /* Socket protocol. */ - int kf_sock_protocol0; - /* Socket address. */ + /* API compatiblity with FreeBSD < 12. */ + int kf_vnode_type; + int kf_sock_domain; + int kf_sock_type; + int kf_sock_protocol; struct sockaddr_storage kf_sa_local; - /* Peer address. */ struct sockaddr_storage kf_sa_peer; - /* Address of so_pcb. */ - uint64_t kf_sock_pcb; - /* Address of inp_ppcb. */ - uint64_t kf_sock_inpcb; - /* Address of unp_conn. */ - uint64_t kf_sock_unpconn; - /* Send buffer state. */ - uint16_t kf_sock_snd_sb_state; - /* Receive buffer state. */ - uint16_t kf_sock_rcv_sb_state; - /* Recvq size. */ - uint32_t kf_sock_recvq; - } kf_sock; - struct { - /* Vnode type. */ - int kf_file_type; - /* Space for future use */ - int kf_spareint[3]; - uint64_t kf_spareint64[30]; - /* Vnode filesystem id. */ - uint64_t kf_file_fsid; - /* File device. */ - uint64_t kf_file_rdev; - /* Global file id. */ - uint64_t kf_file_fileid; - /* File size. */ - uint64_t kf_file_size; - /* Vnode filesystem id, FreeBSD 11 compat. */ - uint32_t kf_file_fsid_freebsd11; - /* File device, FreeBSD 11 compat. */ - uint32_t kf_file_rdev_freebsd11; - /* File mode. */ - uint16_t kf_file_mode; - /* Round to 64 bit alignment. */ - uint16_t kf_file_pad0; - uint32_t kf_file_pad1; - } kf_file; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint32_t kf_sem_value; - uint16_t kf_sem_mode; - } kf_sem; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint64_t kf_pipe_addr; - uint64_t kf_pipe_peer; - uint32_t kf_pipe_buffer_cnt; - /* Round to 64 bit alignment. */ - uint32_t kf_pipe_pad0[3]; - } kf_pipe; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint32_t kf_pts_dev_freebsd11; - uint32_t kf_pts_pad0; - uint64_t kf_pts_dev; - /* Round to 64 bit alignment. */ - uint32_t kf_pts_pad1[4]; - } kf_pts; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - pid_t kf_pid; - } kf_proc; - } kf_un; + }; + union { + struct { + /* Sendq size */ + uint32_t kf_sock_sendq; + /* Socket domain. */ + int kf_sock_domain0; + /* Socket type. */ + int kf_sock_type0; + /* Socket protocol. */ + int kf_sock_protocol0; + /* Socket address. */ + struct sockaddr_storage kf_sa_local; + /* Peer address. */ + struct sockaddr_storage kf_sa_peer; + /* Address of so_pcb. */ + uint64_t kf_sock_pcb; + /* Address of inp_ppcb. */ + uint64_t kf_sock_inpcb; + /* Address of unp_conn. */ + uint64_t kf_sock_unpconn; + /* Send buffer state. */ + uint16_t kf_sock_snd_sb_state; + /* Receive buffer state. */ + uint16_t kf_sock_rcv_sb_state; + /* Recvq size. */ + uint32_t kf_sock_recvq; + } kf_sock; + struct { + /* Vnode type. */ + int kf_file_type; + /* Space for future use */ + int kf_spareint[3]; + uint64_t kf_spareint64[30]; + /* Vnode filesystem id. */ + uint64_t kf_file_fsid; + /* File device. */ + uint64_t kf_file_rdev; + /* Global file id. */ + uint64_t kf_file_fileid; + /* File size. */ + uint64_t kf_file_size; + /* Vnode filesystem id, FreeBSD 11 compat. */ + uint32_t kf_file_fsid_freebsd11; + /* File device, FreeBSD 11 compat. */ + uint32_t kf_file_rdev_freebsd11; + /* File mode. */ + uint16_t kf_file_mode; + /* Round to 64 bit alignment. */ + uint16_t kf_file_pad0; + uint32_t kf_file_pad1; + } kf_file; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint32_t kf_sem_value; + uint16_t kf_sem_mode; + } kf_sem; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint64_t kf_pipe_addr; + uint64_t kf_pipe_peer; + uint32_t kf_pipe_buffer_cnt; + /* Round to 64 bit alignment. */ + uint32_t kf_pipe_pad0[3]; + } kf_pipe; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint32_t kf_pts_dev_freebsd11; + uint32_t kf_pts_pad0; + uint64_t kf_pts_dev; + /* Round to 64 bit alignment. */ + uint32_t kf_pts_pad1[4]; + } kf_pts; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + pid_t kf_pid; + } kf_proc; + } kf_un; + }; uint16_t kf_status; /* Status flags. */ uint16_t kf_pad1; /* Round to 32 bit alignment. */ int _kf_ispare0; /* Space for more stuff. */ @@ -433,12 +444,6 @@ struct kinfo_file { /* Truncated before copyout in sysctl */ char kf_path[PATH_MAX]; /* Path to file, if any. */ }; -#ifndef _KERNEL -#define kf_vnode_type kf_un.kf_file.kf_file_type -#define kf_sock_domain kf_un.kf_sock.kf_sock_domain0 -#define kf_sock_type kf_un.kf_sock.kf_sock_type0 -#define kf_sock_protocol kf_un.kf_sock.kf_sock_protocol0 -#endif /* * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index c41b151fa502..579d16756e99 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -145,6 +145,7 @@ struct vmmeter { #include <sys/domainset.h> extern struct vmmeter vm_cnt; +extern domainset_t all_domains; extern domainset_t vm_min_domains; extern domainset_t vm_severe_domains; @@ -177,7 +178,7 @@ vm_wire_count(void) /* * Return TRUE if we are under our severe low-free-pages threshold * - * This routine is typically used at the user<->system interface to determine + * These routines are typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static inline int @@ -188,7 +189,14 @@ vm_page_count_severe(void) } static inline int -vm_page_count_severe_set(domainset_t *mask) +vm_page_count_severe_domain(int domain) +{ + + return (DOMAINSET_ISSET(domain, &vm_severe_domains)); +} + +static inline int +vm_page_count_severe_set(const domainset_t *mask) { return (DOMAINSET_SUBSET(&vm_severe_domains, mask)); @@ -197,7 +205,7 @@ vm_page_count_severe_set(domainset_t *mask) /* * Return TRUE if we are under our minimum low-free-pages threshold. * - * This routine is typically used within the system to determine whether + * These routines are typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more @@ -210,5 +218,19 @@ vm_page_count_min(void) return (!DOMAINSET_EMPTY(&vm_min_domains)); } +static inline int +vm_page_count_min_domain(int domain) +{ + + return (DOMAINSET_ISSET(domain, &vm_min_domains)); +} + +static inline int +vm_page_count_min_set(const domainset_t *mask) +{ + + return (DOMAINSET_SUBSET(&vm_min_domains, mask)); +} + #endif /* _KERNEL */ #endif /* _SYS_VMMETER_H_ */ diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 89d0b7382e4d..dcc72b2fdfd3 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -10261,22 +10261,22 @@ initiate_write_inodeblock_ufs1(inodedep, bp) prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "direct pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) - panic("%s: indirect pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "indirect pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs1: " + "Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10299,7 +10299,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10307,7 +10308,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } @@ -10429,18 +10431,18 @@ initiate_write_inodeblock_ufs2(inodedep, bp) adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) - panic("softdep_write_inodeblock: lbn order"); + panic("initiate_write_inodeblock_ufs2: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "ext pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10461,7 +10463,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } @@ -10494,22 +10497,22 @@ initiate_write_inodeblock_ufs2(inodedep, bp) prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "direct pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) - panic("%s indirect pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock:", + panic("initiate_write_inodeblock_ufs2: " + "indirect pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10532,7 +10535,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10540,7 +10544,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep3"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c index 25133b834861..31fc49b7b4cf 100644 --- a/sys/ufs/ufs/ufs_quota.c +++ b/sys/ufs/ufs/ufs_quota.c @@ -712,6 +712,34 @@ again: return (error); } +static int +quotaoff_inchange1(struct thread *td, struct mount *mp, int type) +{ + int error; + bool need_resume; + + /* + * mp is already suspended on unmount. If not, suspend it, to + * avoid the situation where quotaoff operation eventually + * failing due to SU structures still keeping references on + * dquots, but vnode's references are already clean. This + * would cause quota accounting leak and asserts otherwise. + * Note that the thread has already called vn_start_write(). + */ + if (mp->mnt_susp_owner == td) { + need_resume = false; + } else { + error = vfs_write_suspend_umnt(mp); + if (error != 0) + return (error); + need_resume = true; + } + error = quotaoff1(td, mp, type); + if (need_resume) + vfs_write_resume(mp, VR_START_WRITE); + return (error); +} + /* * Turns off quotas, assumes that ump->um_qflags are already checked * and QTF_CLOSING is set to indicate operation in progress. Fixes @@ -721,10 +749,9 @@ int quotaoff_inchange(struct thread *td, struct mount *mp, int type) { struct ufsmount *ump; - int i; - int error; + int error, i; - error = quotaoff1(td, mp, type); + error = quotaoff_inchange1(td, mp, type); ump = VFSTOUFS(mp); UFS_LOCK(ump); diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c index cf5b1d792aef..f39f50cc9a61 100644 --- a/sys/ufs/ufs/ufs_vfsops.c +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -94,7 +94,8 @@ ufs_quotactl(mp, cmds, id, arg) void *arg; { #ifndef QUOTA - if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON) + if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON || + (cmds >> SUBCMDSHIFT) == Q_QUOTAOFF) vfs_unbusy(mp); return (EOPNOTSUPP); @@ -117,13 +118,13 @@ ufs_quotactl(mp, cmds, id, arg) break; default: - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } } if ((u_int)type >= MAXQUOTAS) { - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } @@ -134,7 +135,11 @@ ufs_quotactl(mp, cmds, id, arg) break; case Q_QUOTAOFF: + vfs_ref(mp); + vfs_unbusy(mp); + vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); error = quotaoff(td, mp, type); + vn_finished_write(mp); break; case Q_SETQUOTA32: diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 9e6e24c32db9..9e8896866660 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -325,9 +325,6 @@ ufs_accessx(ap) struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; -#ifdef QUOTA - int relocked; -#endif #ifdef UFS_ACL struct acl *acl; acl_type_t type; @@ -350,32 +347,14 @@ ufs_accessx(ap) * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient - * point to call getinoquota(). + * point to call getinoquota(). The lock mode is + * exclusive when the file is opening for write. */ - if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { - - /* - * Upgrade vnode lock, since getinoquota() - * requires exclusive lock to modify inode. - */ - relocked = 1; - vhold(vp); - vn_lock(vp, LK_UPGRADE | LK_RETRY); - VI_LOCK(vp); - if (vp->v_iflag & VI_DOOMED) { - vdropl(vp); - error = ENOENT; - goto relock; - } - vdropl(vp); - } else - relocked = 0; - error = getinoquota(ip); -relock: - if (relocked) - vn_lock(vp, LK_DOWNGRADE | LK_RETRY); - if (error != 0) - return (error); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + error = getinoquota(ip); + if (error != 0) + return (error); + } #endif break; default: diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 858a9e884626..76a574883619 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -151,12 +151,16 @@ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ -static vm_ooffset_t swap_total; -SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, - "Total amount of available swap storage."); -static vm_ooffset_t swap_reserved; -SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, +static u_long swap_reserved; +static u_long swap_total; +static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); +SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &swap_total, 0, sysctl_page_shift, "A", + "Total amount of available swap storage."); + static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " @@ -173,6 +177,16 @@ SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) +static int +sysctl_page_shift(SYSCTL_HANDLER_ARGS) +{ + uint64_t newval; + u_long value = *(u_long *)arg1; + + newval = ((uint64_t)value) << PAGE_SHIFT; + return (sysctl_handle_64(oidp, &newval, 0, req)); +} + int swap_reserve(vm_ooffset_t incr) { @@ -183,7 +197,7 @@ swap_reserve(vm_ooffset_t incr) int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { - vm_ooffset_t r, s; + u_long r, s, prev, pincr; int res, error; static int curfail; static struct timeval lastfail; @@ -191,8 +205,8 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) uip = cred->cr_ruidinfo; - if (incr & PAGE_MASK) - panic("swap_reserve: & PAGE_MASK"); + KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, + (uintmax_t)incr)); #ifdef RACCT if (racct_enable) { @@ -204,36 +218,33 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) } #endif + pincr = atop(incr); res = 0; - mtx_lock(&sw_dev_mtx); - r = swap_reserved + incr; + prev = atomic_fetchadd_long(&swap_reserved, pincr); + r = prev + pincr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); - s *= PAGE_SIZE; } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; - swap_reserved = r; + } else { + prev = atomic_fetchadd_long(&swap_reserved, -pincr); + if (prev < pincr) + panic("swap_reserved < incr on overcommit fail"); } - mtx_unlock(&sw_dev_mtx); - if (res) { - UIDINFO_VMSIZE_LOCK(uip); + prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && - uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && - priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) + prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && + priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) { res = 0; - else - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); - if (!res) { - mtx_lock(&sw_dev_mtx); - swap_reserved -= incr; - mtx_unlock(&sw_dev_mtx); + prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); + if (prev < pincr) + panic("uip->ui_vmsize < incr on overcommit fail"); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { @@ -242,7 +253,7 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) } #ifdef RACCT - if (!res) { + if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); @@ -256,22 +267,20 @@ void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; + u_long pincr; - mtx_lock(&sw_dev_mtx); - swap_reserved += incr; - mtx_unlock(&sw_dev_mtx); + KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, + (uintmax_t)incr)); -#ifdef RACCT PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_SWAP, incr); - PROC_UNLOCK(curproc); +#ifdef RACCT + if (racct_enable) + racct_add_force(curproc, RACCT_SWAP, incr); #endif - - uip = curthread->td_ucred->cr_ruidinfo; - PROC_LOCK(curproc); - UIDINFO_VMSIZE_LOCK(uip); - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); + pincr = atop(incr); + atomic_add_long(&swap_reserved, pincr); + uip = curproc->p_ucred->cr_ruidinfo; + atomic_add_long(&uip->ui_vmsize, pincr); PROC_UNLOCK(curproc); } @@ -281,7 +290,7 @@ swap_release(vm_ooffset_t decr) struct ucred *cred; PROC_LOCK(curproc); - cred = curthread->td_ucred; + cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } @@ -289,26 +298,26 @@ swap_release(vm_ooffset_t decr) void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { + u_long prev, pdecr; struct uidinfo *uip; uip = cred->cr_ruidinfo; - if (decr & PAGE_MASK) - panic("swap_release: & PAGE_MASK"); + KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, + (uintmax_t)decr)); - mtx_lock(&sw_dev_mtx); - if (swap_reserved < decr) + pdecr = atop(decr); + prev = atomic_fetchadd_long(&swap_reserved, -pdecr); + if (prev < pdecr) panic("swap_reserved < decr"); - swap_reserved -= decr; - mtx_unlock(&sw_dev_mtx); - UIDINFO_VMSIZE_LOCK(uip); - if (uip->ui_vmsize < decr) + prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); + if (prev < pdecr) printf("negative vmsize for uid = %d\n", uip->ui_uid); - uip->ui_vmsize -= decr; - UIDINFO_VMSIZE_UNLOCK(uip); - - racct_sub_cred(cred, RACCT_SWAP, decr); +#ifdef RACCT + if (racct_enable) + racct_sub_cred(cred, RACCT_SWAP, decr); +#endif } #define SWM_POP 0x01 /* pop out */ @@ -545,13 +554,11 @@ swap_pager_swap_init(void) if (maxswzone && n > maxswzone / sizeof(struct swblk)) n = maxswzone / sizeof(struct swblk); swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, - pctrie_zone_init, NULL, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE | UMA_ZONE_VM); + pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, - NULL, NULL, _Alignof(struct swblk) - 1, - UMA_ZONE_NOFREE | UMA_ZONE_VM); + NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; @@ -2178,7 +2185,7 @@ swapon_check_swzone(void) { unsigned long maxpages, npages; - npages = swap_total / PAGE_SIZE; + npages = swap_total; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES; @@ -2256,7 +2263,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - 2; - swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; + swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); @@ -2353,7 +2360,7 @@ swapoff_one(struct swdevt *sp, struct ucred *cred) mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); - swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; + swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 908349379763..837eb0787915 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pageout.h> #include <vm/vm_param.h> #include <vm/vm_phys.h> +#include <vm/vm_pagequeue.h> #include <vm/vm_map.h> #include <vm/vm_kern.h> #include <vm/vm_extern.h> @@ -2469,9 +2470,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); - if (zone->uz_flags & UMA_ZONE_NUMA) + if (zone->uz_flags & UMA_ZONE_NUMA) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = UMA_ANYDOMAIN; /* Short-circuit for zones without buckets and low memory. */ @@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags) rdomain = 0; rr = rdomain == UMA_ANYDOMAIN; if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + start = keg->uk_cursor; + do { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); domain = start = keg->uk_cursor; /* Only block on the second pass. */ if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) @@ -2699,8 +2706,9 @@ again: return (slab); } if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; - domain = keg->uk_cursor; + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); } } while (domain != start); @@ -2905,6 +2913,8 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) uma_bucket_t bucket; int max; + CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); + /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) @@ -2972,6 +2982,11 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) item = NULL; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); @@ -3141,9 +3156,11 @@ zfree_start: /* We are no longer associated with this CPU. */ critical_exit(); - if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = 0; zdom = &zone->uz_domain[0]; @@ -3590,7 +3607,9 @@ uma_prealloc(uma_zone_t zone, int items) dom = &keg->uk_domain[slab->us_domain]; LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; - domain = (domain + 1) % vm_ndomains; + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain)); } KEG_UNLOCK(keg); } @@ -3680,6 +3699,11 @@ uma_large_malloc_domain(vm_size_t size, int domain, int wait) vm_offset_t addr; uma_slab_t slab; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 16b078e7dabd..b9348d6c632b 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -66,6 +66,7 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj, vm_pindex_t pindex) { struct domainset *domain; + struct thread *td; /* * object policy takes precedence over thread policy. The policies @@ -76,8 +77,9 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj, di->di_domain = domain; di->di_iter = &obj->domain.dr_iterator; } else { - di->di_domain = curthread->td_domain.dr_policy; - di->di_iter = &curthread->td_domain.dr_iterator; + td = curthread; + di->di_domain = td->td_domain.dr_policy; + di->di_iter = &td->td_domain.dr_iterator; } di->di_policy = di->di_domain->ds_policy; if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) { @@ -215,7 +217,7 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) | VM_ALLOC_NOWAIT; vm_domainset_iter_first(di, domain); - if (DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (vm_page_count_min_domain(*domain)) vm_domainset_iter_page(di, domain, req); } @@ -233,8 +235,7 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, int *domain, int *req) /* If there are more domains to visit we run the iterator. */ while (--di->di_n != 0) { vm_domainset_iter_next(di, domain); - if (!di->di_minskip || - !DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (!di->di_minskip || !vm_page_count_min_domain(*domain)) return (0); } if (di->di_minskip) { @@ -269,7 +270,7 @@ vm_domainset_iter_malloc_init(struct vm_domainset_iter *di, di->di_flags = *flags; *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_first(di, domain); - if (DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (vm_page_count_min_domain(*domain)) vm_domainset_iter_malloc(di, domain, flags); } @@ -280,8 +281,7 @@ vm_domainset_iter_malloc(struct vm_domainset_iter *di, int *domain, int *flags) /* If there are more domains to visit we run the iterator. */ while (--di->di_n != 0) { vm_domainset_iter_next(di, domain); - if (!di->di_minskip || - !DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (!di->di_minskip || !vm_page_count_min_domain(*domain)) return (0); } diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h index 10da5caa0ea7..b1c5766c1c67 100644 --- a/sys/vm/vm_domainset.h +++ b/sys/vm/vm_domainset.h @@ -32,7 +32,7 @@ struct vm_domainset_iter { struct domainset *di_domain; - int *di_iter; + unsigned int *di_iter; vm_pindex_t di_offset; int di_flags; uint16_t di_policy; diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index d5a6b57f47e3..c56e51f3dbfe 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1633,16 +1633,16 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif + dst_object->domain = src_object->domain; + dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { - dst_object->domain = src_object->domain; dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; - dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, @@ -1650,7 +1650,9 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; - } else if (dst_object->cred == NULL) { + } else if ((dst_object->type == OBJT_DEFAULT || + dst_object->type == OBJT_SWAP) && + dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; @@ -1737,6 +1739,13 @@ again: dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; + if (dst_m->pindex >= dst_object->size) + /* + * We are upgrading. Index can occur + * out of bounds if the object type is + * vnode and the file was truncated. + */ + break; vm_page_xbusy(dst_m); KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, ("invalid dst page %p", dst_m)); diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 832dbce324ef..beccb31f0b64 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -327,7 +327,7 @@ vm_thread_new(struct thread *td, int pages) else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; - if (pages == kstack_pages) { + if (pages == kstack_pages && kstack_cache != NULL) { mtx_lock(&kstack_cache_mtx); if (kstack_cache != NULL) { ks_ce = kstack_cache; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index 09e87ed231ed..27ecb960201e 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -98,12 +98,6 @@ extern void uma_startup1(void); extern void uma_startup2(void); extern void vm_radix_reserve_kva(void); -#if VM_NRESERVLEVEL > 0 -#define KVA_QUANTUM (1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT)) -#else - /* On non-superpage architectures want large import sizes. */ -#define KVA_QUANTUM (PAGE_SIZE * 1024) -#endif long physmem; /* @@ -113,57 +107,14 @@ static void vm_mem_init(void *); SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL); /* - * Import kva into the kernel arena. - */ -static int -kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) -{ - vm_offset_t addr; - int result; - - KASSERT((size % KVA_QUANTUM) == 0, - ("kva_import: Size %jd is not a multiple of %d", - (intmax_t)size, (int)KVA_QUANTUM)); - addr = vm_map_min(kernel_map); - result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, - VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); - if (result != KERN_SUCCESS) - return (ENOMEM); - - *addrp = addr; - - return (0); -} - -#if VM_NRESERVLEVEL > 0 -/* - * Import a superpage from the normal kernel arena into the special - * arena for allocations with different permissions. - */ -static int -kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) -{ - - KASSERT((size % KVA_QUANTUM) == 0, - ("kernel_rwx_alloc: Size %jd is not a multiple of %d", - (intmax_t)size, (int)KVA_QUANTUM)); - return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, - VMEM_ADDR_MAX, flags, addrp)); -} -#endif - -/* * vm_init initializes the virtual memory system. * This is done only by the first cpu up. * * The start and end address of physical memory is passed in. */ -/* ARGSUSED*/ static void -vm_mem_init(dummy) - void *dummy; +vm_mem_init(void *dummy) { - int domain; /* * Initializes resident memory structures. From here on, all physical @@ -184,39 +135,6 @@ vm_mem_init(dummy) vm_map_startup(); kmem_init(virtual_avail, virtual_end); - /* - * Initialize the kernel_arena. This can grow on demand. - */ - vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); - vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); - -#if VM_NRESERVLEVEL > 0 - /* - * In an architecture with superpages, maintain a separate arena - * for allocations with permissions that differ from the "standard" - * read/write permissions used for memory in the kernel_arena. - */ - kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE, - 0, M_WAITOK); - vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc, - (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); -#endif - - for (domain = 0; domain < vm_ndomains; domain++) { - vm_dom[domain].vmd_kernel_arena = vmem_create( - "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); - vmem_set_import(vm_dom[domain].vmd_kernel_arena, - (vmem_import_t *)vmem_alloc, NULL, kernel_arena, - KVA_QUANTUM); -#if VM_NRESERVLEVEL > 0 - vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( - "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); - vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, - kernel_rwx_alloc, (vmem_release_t *)vmem_xfree, - vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM); -#endif - } - #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 5dee4d758dd0..88fbc74848df 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -121,6 +121,14 @@ SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #endif "Max kernel address"); +#if VM_NRESERVLEVEL > 0 +#define KVA_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) +#else +/* On non-superpage architectures want large import sizes. */ +#define KVA_QUANTUM_SHIFT (10 + PAGE_SHIFT) +#endif +#define KVA_QUANTUM (1 << KVA_QUANTUM_SHIFT) + /* * kva_alloc: * @@ -409,9 +417,10 @@ kmem_malloc(vm_size_t size, int flags) } /* - * kmem_back: + * kmem_back_domain: * - * Allocate physical pages for the specified virtual address range. + * Allocate physical pages from the specified domain for the specified + * virtual address range. */ int kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr, @@ -472,24 +481,43 @@ retry: return (KERN_SUCCESS); } +/* + * kmem_back: + * + * Allocate physical pages for the specified virtual address range. + */ int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { - struct vm_domainset_iter di; - int domain; - int ret; + vm_offset_t end, next, start; + int domain, rv; KASSERT(object == kernel_object, ("kmem_back: only supports kernel object.")); - vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); - do { - ret = kmem_back_domain(domain, object, addr, size, flags); - if (ret == KERN_SUCCESS) + for (start = addr, end = addr + size; addr < end; addr = next) { + /* + * We must ensure that pages backing a given large virtual page + * all come from the same physical domain. + */ + if (vm_ndomains > 1) { + domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; + while (VM_DOMAIN_EMPTY(domain)) + domain++; + next = roundup2(addr + 1, KVA_QUANTUM); + if (next > end || next < start) + next = end; + } else { + domain = 0; + next = end; + } + rv = kmem_back_domain(domain, object, addr, next - addr, flags); + if (rv != KERN_SUCCESS) { + kmem_unback(object, start, addr - start); break; - } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); - - return (ret); + } + } + return (rv); } /* @@ -645,17 +673,57 @@ kmem_init_zero_region(void) } /* + * Import KVA from the kernel map into the kernel arena. + */ +static int +kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + vm_offset_t addr; + int result; + + KASSERT((size % KVA_QUANTUM) == 0, + ("kva_import: Size %jd is not a multiple of %d", + (intmax_t)size, (int)KVA_QUANTUM)); + addr = vm_map_min(kernel_map); + result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, + VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + if (result != KERN_SUCCESS) + return (ENOMEM); + + *addrp = addr; + + return (0); +} + +/* + * Import KVA from a parent arena into a per-domain arena. Imports must be + * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size. + */ +static int +kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + + KASSERT((size % KVA_QUANTUM) == 0, + ("kva_import_domain: Size %jd is not a multiple of %d", + (intmax_t)size, (int)KVA_QUANTUM)); + return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, + VMEM_ADDR_MAX, flags, addrp)); +} + +/* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. + * Create the kernel vmem arena and its per-domain children. */ void kmem_init(vm_offset_t start, vm_offset_t end) { vm_map_t m; + int domain; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; @@ -671,6 +739,39 @@ kmem_init(vm_offset_t start, vm_offset_t end) start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); + + /* + * Initialize the kernel_arena. This can grow on demand. + */ + vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); + vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); + + for (domain = 0; domain < vm_ndomains; domain++) { + /* + * Initialize the per-domain arenas. These are used to color + * the KVA space in a way that ensures that virtual large pages + * are backed by memory from the same physical domain, + * maximizing the potential for superpage promotion. + */ + vm_dom[domain].vmd_kernel_arena = vmem_create( + "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); + vmem_set_import(vm_dom[domain].vmd_kernel_arena, + kva_import_domain, NULL, kernel_arena, KVA_QUANTUM); + + /* + * In architectures with superpages, maintain separate arenas + * for allocations with permissions that differ from the + * "standard" read/write permissions used for kernel memory, + * so as not to inhibit superpage promotion. + */ +#if VM_NRESERVLEVEL > 0 + vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( + "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); + vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, + kva_import_domain, (vmem_release_t *)vmem_xfree, + kernel_arena, KVA_QUANTUM); +#endif + } } /* diff --git a/sys/vm/vm_kern.h b/sys/vm/vm_kern.h index 8d49a598f26d..20e847f5e5af 100644 --- a/sys/vm/vm_kern.h +++ b/sys/vm/vm_kern.h @@ -70,7 +70,6 @@ extern vm_map_t kernel_map; extern vm_map_t exec_map; extern vm_map_t pipe_map; extern struct vmem *kernel_arena; -extern struct vmem *kernel_rwx_arena; extern struct vmem *kmem_arena; extern struct vmem *buffer_arena; extern struct vmem *transient_arena; diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index e184b8ff5c1f..e8772a1eadde 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1055,12 +1055,8 @@ sys_mlockall(struct thread *td, struct mlockall_args *uap) * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { - PROC_LOCK(td->td_proc); - if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) { - PROC_UNLOCK(td->td_proc); + if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) return (ENOMEM); - } - PROC_UNLOCK(td->td_proc); } #ifdef RACCT if (racct_enable) { @@ -1445,21 +1441,21 @@ vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { - PROC_LOCK(td->td_proc); - if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) { - PROC_UNLOCK(td->td_proc); + RACCT_PROC_LOCK(td->td_proc); + if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > - lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) { + lim_cur(td, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, @@ -1467,11 +1463,11 @@ vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (error); } } - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); } /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index d022435704f1..47628cba2ac1 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2959,7 +2959,7 @@ vm_wait_doms(const domainset_t *wdoms) * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); - if (DOMAINSET_SUBSET(&vm_min_domains, wdoms)) { + if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP, "vmwait", 0); @@ -3078,7 +3078,7 @@ vm_waitpfault(struct domainset *dset) * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); - if (DOMAINSET_SUBSET(&vm_min_domains, &dset->ds_mask)) { + if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", 0); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index c084de7aca44..5c309a04a402 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -2082,6 +2082,13 @@ vm_pageout(void) if (error != 0) panic("starting laundry for domain 0, error %d", error); for (i = 1; i < vm_ndomains; i++) { + if (VM_DOMAIN_EMPTY(i)) { + if (bootverbose) + printf("domain %d empty; skipping pageout\n", + i); + continue; + } + error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index ac04f2b3ea61..758e51d8ef6e 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -151,7 +151,8 @@ struct vm_domain { extern struct vm_domain vm_dom[MAXMEMDOM]; -#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN_EMPTY(n) (vm_dom[(n)].vmd_page_count == 0) #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 890f5dad9213..5206ba6e658f 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -78,6 +78,7 @@ int __read_mostly *mem_locality; #endif int __read_mostly vm_ndomains = 1; +domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; int __read_mostly vm_phys_nsegs; diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 0ea747ccc30b..42b506c4447b 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -311,8 +311,20 @@ check_domains(void) } for (i = 0; i <= max_apic_id; i++) if (cpus[i].enabled && !cpus[i].has_memory) { - printf("SRAT: No memory found for CPU %d\n", i); - return (ENXIO); + found = 0; + for (j = 0; j < num_mem && !found; j++) { + if (mem_info[j].domain == cpus[i].domain) + found = 1; + } + if (!found) { + if (bootverbose) + printf("SRAT: mem dom %d is empty\n", + cpus[i].domain); + mem_info[num_mem].start = 0; + mem_info[num_mem].end = 0; + mem_info[num_mem].domain = cpus[i].domain; + num_mem++; + } } return (0); } @@ -470,8 +482,9 @@ parse_srat(void) } #ifdef NUMA - /* Point vm_phys at our memory affinity table. */ vm_ndomains = ndomain; + for (int i = 0; i < vm_ndomains; i++) + DOMAINSET_SET(i, &all_domains); mem_affinity = mem_info; #endif diff --git a/sys/x86/include/ifunc.h b/sys/x86/include/ifunc.h index 13f4b886430e..38c82570770e 100644 --- a/sys/x86/include/ifunc.h +++ b/sys/x86/include/ifunc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2015, 2017 The FreeBSD Foundation + * Copyright (c) 2015-2018 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov <kib@FreeBSD.org> @@ -32,27 +32,19 @@ #ifndef __X86_IFUNC_H #define __X86_IFUNC_H -#define DECLARE_LIFUNC(ret_type, name, args) \ -ret_type name args - -#define DEFINE_LIFUNC(scope, selector_qual, ret_type, name, args) \ -__asm__ (scope "\t" #name "\n" \ - "\t.type\t" #name ",@function\n" \ - #name ":\n" \ - "\tjmp *" #name "_selector\n" \ - "\t.size\t" #name ",\t. - "#name); \ -selector_qual ret_type (*name##_selector)args __used; \ -DECLARE_LIFUNC(ret_type, name, args) - -#define DEFINE_STATIC_LIFUNC(ret_type, name, args) \ - DEFINE_LIFUNC(".local", static, ret_type, name, args) - -#define DEFINE_GLOBAL_LIFUNC(ret_type, name, args) \ - DEFINE_LIFUNC(".globl", , ret_type, name, args) - -#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ +#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ resolver_qual ret_type (*name##_resolver(void))args __used; \ qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ resolver_qual ret_type (*name##_resolver(void))args +#define DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(uint32_t, uint32_t, \ + uint32_t, uint32_t))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver( \ + uint32_t cpu_feature __unused, \ + uint32_t cpu_feature2 __unused, \ + uint32_t cpu_stdext_feature __unused, \ + uint32_t cpu_stdext_feature2 __unused))args + #endif diff --git a/sys/x86/include/ucode.h b/sys/x86/include/ucode.h index d9c860d20ca8..5c8868108930 100644 --- a/sys/x86/include/ucode.h +++ b/sys/x86/include/ucode.h @@ -58,7 +58,8 @@ struct ucode_intel_extsig_table { } entries[0]; }; -int ucode_intel_load(void *data, bool unsafe); +int ucode_intel_load(void *data, bool unsafe, + uint64_t *nrevp, uint64_t *orevp); size_t ucode_load_bsp(uintptr_t free); void ucode_load_ap(int cpu); void ucode_reload(void); diff --git a/sys/x86/iommu/intel_utils.c b/sys/x86/iommu/intel_utils.c index 5b145c5aeae9..944aa214baa6 100644 --- a/sys/x86/iommu/intel_utils.c +++ b/sys/x86/iommu/intel_utils.c @@ -368,8 +368,7 @@ dmar_flush_transl_to_ram(struct dmar_unit *unit, void *dst, size_t sz) * If DMAR does not snoop paging structures accesses, flush * CPU cache to memory. */ - pmap_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz, - TRUE); + pmap_force_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz); } void diff --git a/sys/x86/isa/atpic.c b/sys/x86/isa/atpic.c index 8560793df503..e7e0cc79b54c 100644 --- a/sys/x86/isa/atpic.c +++ b/sys/x86/isa/atpic.c @@ -221,14 +221,20 @@ atpic_register_sources(struct pic *pic) * that APIC ISA routing and allowing the ATPIC source for that IRQ * to leak through. We used to depend on this feature for routing * IRQ0 via mixed mode, but now we don't use mixed mode at all. + * + * To avoid the slave not register sources after the master + * registers its sources, register all IRQs when this function is + * called on the master. */ + if (ap != &atpics[MASTER]) + return; for (i = 0; i < NUM_ISA_IRQS; i++) if (intr_lookup_source(i) != NULL) return; /* Loop through all interrupt sources and add them. */ - for (i = 0, ai = atintrs + ap->at_irqbase; i < 8; i++, ai++) { - if (ap->at_irqbase + i == ICU_SLAVEID) + for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) { + if (i == ICU_SLAVEID) continue; intr_register_source(&ai->at_intsrc); } diff --git a/sys/x86/x86/ucode.c b/sys/x86/x86/ucode.c index e1229a8401ec..5b039491345a 100644 --- a/sys/x86/x86/ucode.c +++ b/sys/x86/x86/ucode.c @@ -59,7 +59,7 @@ static int ucode_intel_verify(struct ucode_intel_header *hdr, static struct ucode_ops { const char *vendor; - int (*load)(void *, bool); + int (*load)(void *, bool, uint64_t *, uint64_t *); void *(*match)(uint8_t *, size_t *); } loaders[] = { { @@ -72,35 +72,46 @@ static struct ucode_ops { /* Selected microcode update data. */ static void *early_ucode_data; static void *ucode_data; +static struct ucode_ops *ucode_loader; -static char errbuf[128]; - -static void __printflike(1, 2) -log_err(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - vsnprintf(errbuf, sizeof(errbuf), fmt, ap); - va_end(ap); -} +/* Variables used for reporting success or failure. */ +enum { + NO_ERROR, + NO_MATCH, + VERIFICATION_FAILED, +} ucode_error = NO_ERROR; +static uint64_t ucode_nrev, ucode_orev; static void -print_err(void *arg __unused) +log_msg(void *arg __unused) { - if (errbuf[0] != '\0') - printf("microcode load error: %s\n", errbuf); + if (ucode_nrev != 0) { + printf("CPU microcode: updated from %#jx to %#jx\n", + (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev); + return; + } + + switch (ucode_error) { + case NO_MATCH: + printf("CPU microcode: no matching update found\n"); + break; + case VERIFICATION_FAILED: + printf("CPU microcode: microcode verification failed\n"); + break; + default: + break; + } } -SYSINIT(ucode_print_err, SI_SUB_CPU, SI_ORDER_FIRST, print_err, NULL); +SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL); int -ucode_intel_load(void *data, bool unsafe) +ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp) { - uint64_t rev0, rev1; + uint64_t nrev, orev; uint32_t cpuid[4]; - rev0 = rdmsr(MSR_BIOS_SIGN); + orev = rdmsr(MSR_BIOS_SIGN) >> 32; /* * Perform update. Flush caches first to work around seemingly @@ -118,8 +129,15 @@ ucode_intel_load(void *data, bool unsafe) */ do_cpuid(0, cpuid); - rev1 = rdmsr(MSR_BIOS_SIGN); - if (rev1 <= rev0) + /* + * Verify that the microcode revision changed. + */ + nrev = rdmsr(MSR_BIOS_SIGN) >> 32; + if (nrevp != NULL) + *nrevp = nrev; + if (orevp != NULL) + *orevp = orev; + if (nrev <= orev) return (EEXIST); return (0); } @@ -130,36 +148,26 @@ ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid) uint32_t cksum, *data, size; int i; - if (resid < sizeof(struct ucode_intel_header)) { - log_err("truncated update header"); + if (resid < sizeof(struct ucode_intel_header)) return (1); - } size = hdr->total_size; if (size == 0) size = UCODE_INTEL_DEFAULT_DATA_SIZE + sizeof(struct ucode_intel_header); - if (hdr->header_version != 1) { - log_err("unexpected header version %u", hdr->header_version); + if (hdr->header_version != 1) return (1); - } - if (size % 16 != 0) { - log_err("unexpected update size %u", hdr->total_size); + if (size % 16 != 0) return (1); - } - if (resid < size) { - log_err("truncated update"); + if (resid < size) return (1); - } cksum = 0; data = (uint32_t *)hdr; for (i = 0; i < size / sizeof(uint32_t); i++) cksum += data[i]; - if (cksum != 0) { - log_err("checksum failed"); + if (cksum != 0) return (1); - } return (0); } @@ -182,8 +190,10 @@ ucode_intel_match(uint8_t *data, size_t *len) for (resid = *len; resid > 0; data += total_size, resid -= total_size) { hdr = (struct ucode_intel_header *)data; - if (ucode_intel_verify(hdr, resid) != 0) + if (ucode_intel_verify(hdr, resid) != 0) { + ucode_error = VERIFICATION_FAILED; break; + } data_size = hdr->data_size; total_size = hdr->total_size; @@ -259,12 +269,12 @@ ucode_load_ap(int cpu) KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present, ("cpu %d not present", cpu)); - if (!cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread) + if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread) return; #endif if (ucode_data != NULL) - (void)ucode_intel_load(ucode_data, false); + (void)ucode_loader->load(ucode_data, false, NULL, NULL); } static void * @@ -308,11 +318,12 @@ ucode_load_bsp(uintptr_t free) uint32_t regs[4]; char vendor[13]; } cpuid; - struct ucode_ops *loader; uint8_t *addr, *fileaddr, *match; char *type; + uint64_t nrev, orev; caddr_t file; - size_t i, len, ucode_len; + size_t i, len; + int error; KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free)); @@ -320,17 +331,16 @@ ucode_load_bsp(uintptr_t free) cpuid.regs[0] = cpuid.regs[1]; cpuid.regs[1] = cpuid.regs[3]; cpuid.vendor[12] = '\0'; - for (i = 0, loader = NULL; i < nitems(loaders); i++) + for (i = 0; i < nitems(loaders); i++) if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) { - loader = &loaders[i]; + ucode_loader = &loaders[i]; break; } - if (loader == NULL) + if (ucode_loader == NULL) return (0); file = 0; fileaddr = match = NULL; - ucode_len = 0; for (;;) { file = preload_search_next_name(file); if (file == 0) @@ -341,7 +351,7 @@ ucode_load_bsp(uintptr_t free) fileaddr = preload_fetch_addr(file); len = preload_fetch_size(file); - match = loader->match(fileaddr, &len); + match = ucode_loader->match(fileaddr, &len); if (match != NULL) { addr = map_ucode(free, len); /* We can't use memcpy() before ifunc resolution. */ @@ -349,18 +359,19 @@ ucode_load_bsp(uintptr_t free) addr[i] = ((volatile uint8_t *)match)[i]; match = addr; - if (loader->load(match, false) == 0) { - ucode_data = match; - ucode_len = len; - early_ucode_data = ucode_data; - break; + error = ucode_loader->load(match, false, &nrev, &orev); + if (error == 0) { + ucode_data = early_ucode_data = match; + ucode_nrev = nrev; + ucode_orev = orev; + return (len); } unmap_ucode(free, len); } } - if (fileaddr != NULL && ucode_data == NULL) - log_err("no matching update found"); - return (ucode_len); + if (fileaddr != NULL && ucode_error == NO_ERROR) + ucode_error = NO_MATCH; + return (0); } /* |
