From f2990e6c19d33bf5284d45211d5f8184d021afdb Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 13 Sep 2018 21:00:17 +0000 Subject: Enable Capsicum on armv6/armv7 We ought to be consistent across our Tier-1 and nearly-Tier-1 architectures, so enable Capsicum for 32-bit armv6/armv7 by default. PR: 204008 Reviewed by: ian, oshogbo Approved by: re (gjb) Relnotes: Yes Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17023 --- sys/arm/conf/std.armv6 | 2 ++ sys/arm/conf/std.armv7 | 2 ++ 2 files changed, 4 insertions(+) (limited to 'sys') diff --git a/sys/arm/conf/std.armv6 b/sys/arm/conf/std.armv6 index 2f6f9c93af4f..52685a9b13bf 100644 --- a/sys/arm/conf/std.armv6 +++ b/sys/arm/conf/std.armv6 @@ -41,6 +41,8 @@ options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options CAPABILITY_MODE # Capsicum capability mode +options CAPABILITIES # Capsicum capabilites options FREEBSD_BOOT_LOADER # Process metadata passed from loader(8) options VFP # Enable floating point hardware support options MAC # Support for Mandatory Access Control (MAC) diff --git a/sys/arm/conf/std.armv7 b/sys/arm/conf/std.armv7 index 5754f4780fea..c3ab6852e615 100644 --- a/sys/arm/conf/std.armv7 +++ b/sys/arm/conf/std.armv7 @@ -41,6 +41,8 @@ options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options CAPABILITY_MODE # Capsicum capability mode +options CAPABILITIES # Capsicum capabilites options FREEBSD_BOOT_LOADER # Process metadata passed from loader(8) options VFP # Enable floating point hardware support options MAC # Support for Mandatory Access Control (MAC) -- cgit v1.3 From 4bb64e96e4d50b852d9947d3bd90c81b05e5b654 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Thu, 13 Sep 2018 22:58:13 +0000 Subject: cxgbe(4): Use the correct number of parameters when querying the tid range for hashfilters. Approved by: re@ (gjb@) --- sys/dev/cxgbe/t4_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index d6bf50121a15..60bdff978b69 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -3954,7 +3954,7 @@ get_params__post_init(struct adapter *sc) sc->toecaps = 0; param[0] = FW_PARAM_DEV(NTID); - rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); + rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query HASHFILTER parameters: %d.\n", rc); -- cgit v1.3 From b79672bb0899e90b57371592762da859f2002cf1 Mon Sep 17 00:00:00 2001 From: Glen Barber Date: Thu, 13 Sep 2018 23:59:59 +0000 Subject: Update head from ALPHA5 to ALPHA6 as part of the 12.0-RELEASE cycle. Approved by: re (implicit) Sponsored by: The FreeBSD Foundation --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index 9e83b6379f51..ce32cf2950af 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -46,7 +46,7 @@ TYPE="FreeBSD" REVISION="12.0" -BRANCH="ALPHA5" +BRANCH="ALPHA6" if [ -n "${BRANCH_OVERRIDE}" ]; then BRANCH=${BRANCH_OVERRIDE} fi -- cgit v1.3 From 0204d85a62dd18966eae15cc71e9be117b2433c2 Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 14 Sep 2018 01:30:05 +0000 Subject: hwpmc: set default rate if event description lacks one / filter rate against misuse Not all event descriptions have a sample rate (such as inst_retired.any) this will restore the legacy behavior of using 65536 in that case. It also prevents accidental API misuse that could lead to panic. PR: 230985 Reported by: markj Reviewed by: markj Approved by: re (gjb) Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D16958 --- lib/libpmc/libpmc_pmu_util.c | 1 + sys/dev/hwpmc/hwpmc_mod.c | 27 +++++++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'sys') diff --git a/lib/libpmc/libpmc_pmu_util.c b/lib/libpmc/libpmc_pmu_util.c index 7aaf6f2a5b18..583a2d36bc74 100644 --- a/lib/libpmc/libpmc_pmu_util.c +++ b/lib/libpmc/libpmc_pmu_util.c @@ -237,6 +237,7 @@ pmu_parse_event(struct pmu_event_desc *ped, const char *eventin) return (ENOMEM); r = event; bzero(ped, sizeof(*ped)); + ped->ped_period = DEFAULT_SAMPLE_COUNT; ped->ped_umask = -1; while ((kvp = strsep(&event, ",")) != NULL) { key = strsep(&kvp, "="); diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c index 4d7a4535d27b..20df37d24bd6 100644 --- a/sys/dev/hwpmc/hwpmc_mod.c +++ b/sys/dev/hwpmc/hwpmc_mod.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -3942,9 +3943,16 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) pmc->pm_flags = pa.pm_flags; /* XXX set lower bound on sampling for process counters */ - if (PMC_IS_SAMPLING_MODE(mode)) - pmc->pm_sc.pm_reloadcount = pa.pm_count; - else + if (PMC_IS_SAMPLING_MODE(mode)) { + /* + * Don't permit requested sample rate to be less than 1000 + */ + if (pa.pm_count < 1000) + log(LOG_WARNING, + "pmcallocate: passed sample rate %ju - setting to 1000\n", + (uintmax_t)pa.pm_count); + pmc->pm_sc.pm_reloadcount = MAX(1000, pa.pm_count); + } else pmc->pm_sc.pm_initial = pa.pm_count; /* switch thread to CPU 'cpu' */ @@ -4460,9 +4468,16 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) break; } - if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) - pm->pm_sc.pm_reloadcount = sc.pm_count; - else + if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { + /* + * Don't permit requested sample rate to be less than 1000 + */ + if (sc.pm_count < 1000) + log(LOG_WARNING, + "pmcsetcount: passed sample rate %ju - setting to 1000\n", + (uintmax_t)sc.pm_count); + pm->pm_sc.pm_reloadcount = MAX(1000, sc.pm_count); + } else pm->pm_sc.pm_initial = sc.pm_count; } break; -- cgit v1.3 From c51b7ab9e3fb35cfa179ebf532421e94bad97c41 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 14 Sep 2018 15:29:35 +0000 Subject: amd64: implement pagezero_erms Intel docs claim such a memset (rep stosb + 4096 bytes) is special-cased by microarchs. They also switched Linux to use it for this purpose. Approved by: re (gjb) --- sys/amd64/amd64/machdep.c | 9 +++++++++ sys/amd64/amd64/support.S | 14 ++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 092dc095bf65..d9b6e0a04d5a 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -2693,3 +2693,12 @@ DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t), return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } + +void pagezero_std(void *addr); +void pagezero_erms(void *addr); +DEFINE_IFUNC(, void , pagezero, (void *), static) +{ + + return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? + pagezero_erms : pagezero_std); +} diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index db89a6fadd3c..e683b24162ec 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -41,7 +41,7 @@ .text /* Address: %rdi */ -ENTRY(pagezero) +ENTRY(pagezero_std) PUSH_FRAME_POINTER movq $PAGE_SIZE/8,%rcx xorl %eax,%eax @@ -49,7 +49,17 @@ ENTRY(pagezero) stosq POP_FRAME_POINTER ret -END(pagezero) +END(pagezero_std) + +ENTRY(pagezero_erms) + PUSH_FRAME_POINTER + movq $PAGE_SIZE,%rcx + xorl %eax,%eax + rep + stosb + POP_FRAME_POINTER + ret +END(pagezero_erms) /* * pagecopy(%rdi=from, %rsi=to) -- cgit v1.3 From 6bfb487b6e7128476241acb775b35f436e6997c5 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Fri, 14 Sep 2018 15:46:31 +0000 Subject: Set ident for GENERIC-MMCCAM to not announce itself as GENERIC anymore. Reviewed by: andrew Approved by: re (gjb) --- sys/arm64/conf/GENERIC-MMCCAM | 1 + 1 file changed, 1 insertion(+) (limited to 'sys') diff --git a/sys/arm64/conf/GENERIC-MMCCAM b/sys/arm64/conf/GENERIC-MMCCAM index 0d1e91cd58c8..ab45fcb8168d 100644 --- a/sys/arm64/conf/GENERIC-MMCCAM +++ b/sys/arm64/conf/GENERIC-MMCCAM @@ -9,6 +9,7 @@ #NO_UNIVERSE include GENERIC +ident GENERIC-MMCCAM # Add CAMDEBUG stuff options CAMDEBUG -- cgit v1.3 From d5089b3aed7fab659a38ea1d0a6a281f04a66f7c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 14 Sep 2018 17:04:36 +0000 Subject: Log a message after a successful boot-time microcode update. Reviewed by: kib Approved by: re (delphij) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17135 --- sys/dev/cpuctl/cpuctl.c | 2 +- sys/x86/include/ucode.h | 3 +- sys/x86/x86/ucode.c | 117 ++++++++++++++++++++++++++---------------------- 3 files changed, 67 insertions(+), 55 deletions(-) (limited to 'sys') diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c index 33ac83eb731a..4f91b4fbbf28 100644 --- a/sys/dev/cpuctl/cpuctl.c +++ b/sys/dev/cpuctl/cpuctl.c @@ -362,7 +362,7 @@ update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td) set_cpu(cpu, td); critical_enter(); - ret = ucode_intel_load(ptr, true); + ret = ucode_intel_load(ptr, true, NULL, NULL); critical_exit(); restore_cpu(oldcpu, is_bound, td); diff --git a/sys/x86/include/ucode.h b/sys/x86/include/ucode.h index d9c860d20ca8..5c8868108930 100644 --- a/sys/x86/include/ucode.h +++ b/sys/x86/include/ucode.h @@ -58,7 +58,8 @@ struct ucode_intel_extsig_table { } entries[0]; }; -int ucode_intel_load(void *data, bool unsafe); +int ucode_intel_load(void *data, bool unsafe, + uint64_t *nrevp, uint64_t *orevp); size_t ucode_load_bsp(uintptr_t free); void ucode_load_ap(int cpu); void ucode_reload(void); diff --git a/sys/x86/x86/ucode.c b/sys/x86/x86/ucode.c index e1229a8401ec..50a5cbe4462b 100644 --- a/sys/x86/x86/ucode.c +++ b/sys/x86/x86/ucode.c @@ -59,7 +59,7 @@ static int ucode_intel_verify(struct ucode_intel_header *hdr, static struct ucode_ops { const char *vendor; - int (*load)(void *, bool); + int (*load)(void *, bool, uint64_t *, uint64_t *); void *(*match)(uint8_t *, size_t *); } loaders[] = { { @@ -72,35 +72,46 @@ static struct ucode_ops { /* Selected microcode update data. */ static void *early_ucode_data; static void *ucode_data; +static struct ucode_ops *ucode_loader; -static char errbuf[128]; - -static void __printflike(1, 2) -log_err(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - vsnprintf(errbuf, sizeof(errbuf), fmt, ap); - va_end(ap); -} +/* Variables used for reporting success or failure. */ +enum { + NO_ERROR, + NO_MATCH, + VERIFICATION_FAILED, +} ucode_error = NO_ERROR; +static uint64_t ucode_nrev, ucode_orev; static void -print_err(void *arg __unused) +log_msg(void *arg __unused) { - if (errbuf[0] != '\0') - printf("microcode load error: %s\n", errbuf); + if (ucode_nrev != 0) { + printf("CPU microcode: updated from %#jx to %#jx\n", + (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev); + return; + } + + switch (ucode_error) { + case NO_MATCH: + printf("CPU microcode: no matching update found\n"); + break; + case VERIFICATION_FAILED: + printf("CPU microcode: microcode verification failed\n"); + break; + default: + break; + } } -SYSINIT(ucode_print_err, SI_SUB_CPU, SI_ORDER_FIRST, print_err, NULL); +SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL); int -ucode_intel_load(void *data, bool unsafe) +ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp) { - uint64_t rev0, rev1; + uint64_t nrev, orev; uint32_t cpuid[4]; - rev0 = rdmsr(MSR_BIOS_SIGN); + orev = rdmsr(MSR_BIOS_SIGN) >> 32; /* * Perform update. Flush caches first to work around seemingly @@ -118,8 +129,15 @@ ucode_intel_load(void *data, bool unsafe) */ do_cpuid(0, cpuid); - rev1 = rdmsr(MSR_BIOS_SIGN); - if (rev1 <= rev0) + /* + * Verify that the microcode revision changed. + */ + nrev = rdmsr(MSR_BIOS_SIGN) >> 32; + if (nrevp != NULL) + *nrevp = nrev; + if (orevp != NULL) + *orevp = orev; + if (nrev <= orev) return (EEXIST); return (0); } @@ -130,36 +148,26 @@ ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid) uint32_t cksum, *data, size; int i; - if (resid < sizeof(struct ucode_intel_header)) { - log_err("truncated update header"); + if (resid < sizeof(struct ucode_intel_header)) return (1); - } size = hdr->total_size; if (size == 0) size = UCODE_INTEL_DEFAULT_DATA_SIZE + sizeof(struct ucode_intel_header); - if (hdr->header_version != 1) { - log_err("unexpected header version %u", hdr->header_version); + if (hdr->header_version != 1) return (1); - } - if (size % 16 != 0) { - log_err("unexpected update size %u", hdr->total_size); + if (size % 16 != 0) return (1); - } - if (resid < size) { - log_err("truncated update"); + if (resid < size) return (1); - } cksum = 0; data = (uint32_t *)hdr; for (i = 0; i < size / sizeof(uint32_t); i++) cksum += data[i]; - if (cksum != 0) { - log_err("checksum failed"); + if (cksum != 0) return (1); - } return (0); } @@ -182,8 +190,10 @@ ucode_intel_match(uint8_t *data, size_t *len) for (resid = *len; resid > 0; data += total_size, resid -= total_size) { hdr = (struct ucode_intel_header *)data; - if (ucode_intel_verify(hdr, resid) != 0) + if (ucode_intel_verify(hdr, resid) != 0) { + ucode_error = VERIFICATION_FAILED; break; + } data_size = hdr->data_size; total_size = hdr->total_size; @@ -264,7 +274,7 @@ ucode_load_ap(int cpu) #endif if (ucode_data != NULL) - (void)ucode_intel_load(ucode_data, false); + (void)ucode_loader->load(ucode_data, false, NULL, NULL); } static void * @@ -308,11 +318,12 @@ ucode_load_bsp(uintptr_t free) uint32_t regs[4]; char vendor[13]; } cpuid; - struct ucode_ops *loader; uint8_t *addr, *fileaddr, *match; char *type; + uint64_t nrev, orev; caddr_t file; - size_t i, len, ucode_len; + size_t i, len; + int error; KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free)); @@ -320,17 +331,16 @@ ucode_load_bsp(uintptr_t free) cpuid.regs[0] = cpuid.regs[1]; cpuid.regs[1] = cpuid.regs[3]; cpuid.vendor[12] = '\0'; - for (i = 0, loader = NULL; i < nitems(loaders); i++) + for (i = 0; i < nitems(loaders); i++) if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) { - loader = &loaders[i]; + ucode_loader = &loaders[i]; break; } - if (loader == NULL) + if (ucode_loader == NULL) return (0); file = 0; fileaddr = match = NULL; - ucode_len = 0; for (;;) { file = preload_search_next_name(file); if (file == 0) @@ -341,7 +351,7 @@ ucode_load_bsp(uintptr_t free) fileaddr = preload_fetch_addr(file); len = preload_fetch_size(file); - match = loader->match(fileaddr, &len); + match = ucode_loader->match(fileaddr, &len); if (match != NULL) { addr = map_ucode(free, len); /* We can't use memcpy() before ifunc resolution. */ @@ -349,18 +359,19 @@ ucode_load_bsp(uintptr_t free) addr[i] = ((volatile uint8_t *)match)[i]; match = addr; - if (loader->load(match, false) == 0) { - ucode_data = match; - ucode_len = len; - early_ucode_data = ucode_data; - break; + error = ucode_loader->load(match, false, &nrev, &orev); + if (error == 0) { + ucode_data = early_ucode_data = match; + ucode_nrev = nrev; + ucode_orev = orev; + return (len); } unmap_ucode(free, len); } } - if (fileaddr != NULL && ucode_data == NULL) - log_err("no matching update found"); - return (ucode_len); + if (fileaddr != NULL && ucode_error == NO_ERROR) + ucode_error = NO_MATCH; + return (0); } /* -- cgit v1.3 From 032d3aaa9607e1010d5ae766ea55d3474c434395 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Sun, 16 Sep 2018 00:44:23 +0000 Subject: Significantly improve pf purge cpu usage by only taking locks when there is work to do. This reduces CPU consumption to one third on systems. This will help keep the thread CPU usage under control now that the default hash size has increased. Reviewed by: kp Approved by: re (kib) Differential Revision: https://reviews.freebsd.org/D17097 --- sys/netpfil/pf/pf.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'sys') diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 08eea294b4e2..13c3c6463c0c 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -1719,24 +1719,28 @@ pf_purge_expired_states(u_int i, int maxcheck) while (maxcheck > 0) { ih = &V_pf_idhash[i]; + + /* only take the lock if we expect to do work */ + if (!LIST_EMPTY(&ih->states)) { relock: - PF_HASHROW_LOCK(ih); - LIST_FOREACH(s, &ih->states, entry) { - if (pf_state_expires(s) <= time_uptime) { - V_pf_status.states -= - pf_unlink_state(s, PF_ENTER_LOCKED); - goto relock; + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + if (pf_state_expires(s) <= time_uptime) { + V_pf_status.states -= + pf_unlink_state(s, PF_ENTER_LOCKED); + goto relock; + } + s->rule.ptr->rule_flag |= PFRULE_REFS; + if (s->nat_rule.ptr != NULL) + s->nat_rule.ptr->rule_flag |= PFRULE_REFS; + if (s->anchor.ptr != NULL) + s->anchor.ptr->rule_flag |= PFRULE_REFS; + s->kif->pfik_flags |= PFI_IFLAG_REFS; + if (s->rt_kif) + s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } - s->rule.ptr->rule_flag |= PFRULE_REFS; - if (s->nat_rule.ptr != NULL) - s->nat_rule.ptr->rule_flag |= PFRULE_REFS; - if (s->anchor.ptr != NULL) - s->anchor.ptr->rule_flag |= PFRULE_REFS; - s->kif->pfik_flags |= PFI_IFLAG_REFS; - if (s->rt_kif) - s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; + PF_HASHROW_UNLOCK(ih); } - PF_HASHROW_UNLOCK(ih); /* Return when we hit end of hash. */ if (++i > pf_hashmask) { -- cgit v1.3 From bd6c14afa7068172ad48b5081905009f21ca6b52 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 16 Sep 2018 18:36:42 +0000 Subject: Remove unneeded new line from the panic string. Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Approved by: re (rgrimes) Differential revision: https://reviews.freebsd.org/D17181 --- sys/amd64/amd64/trap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 4d03da234f19..e9cb22960778 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -811,7 +811,7 @@ trap_pfault(struct trapframe *frame, int usermode) PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)== (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)) - panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid, + panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); /* -- cgit v1.3 From 17f67f63b991cbf54cb31c9cd66acff009666529 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 16 Sep 2018 19:28:27 +0000 Subject: amd64: tidy up kernel memmove There is no need to use %rax for temporary values and avoiding doing so shortens the func. Handle the explicit 'check for tail' depessimisization for backwards copying. This reduces the diff against userspace. Approved by: re (kib) --- sys/amd64/amd64/support.S | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index e683b24162ec..f86d3d11506c 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -108,12 +108,12 @@ END(sse2_pagezero) */ ENTRY(memmove_std) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 1f shrq $3,%rcx /* copy by 64-bit words */ @@ -128,7 +128,6 @@ ENTRY(memmove_std) 2: rep movsb - movq %r9,%rax POP_FRAME_POINTER ret @@ -140,8 +139,10 @@ ENTRY(memmove_std) decq %rsi andq $7,%rcx /* any fractional bytes? */ std + jne 3f rep movsb +3: movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi @@ -149,24 +150,22 @@ ENTRY(memmove_std) rep movsq cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_std) ENTRY(memmove_erms) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 1f rep movsb - movq %r9,%rax POP_FRAME_POINTER ret @@ -179,7 +178,6 @@ ENTRY(memmove_erms) rep movsb cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_erms) -- cgit v1.3 From 234afdb9cccb60d9efa7ad9cddebbd1dbe364d18 Mon Sep 17 00:00:00 2001 From: Oleksandr Tymoshenko Date: Sun, 16 Sep 2018 21:44:36 +0000 Subject: [ig4] Fix device description for Kaby Lake systems Kaby Lake I2C controller is Intel Sunrise Point-H not Intel Sunrise Point-LP. Submitted by: Dmitry Luhtionov Approved by: re (kib) --- sys/dev/ichiic/ig4_pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/dev/ichiic/ig4_pci.c b/sys/dev/ichiic/ig4_pci.c index eed7d651bfde..ae73f3b2abca 100644 --- a/sys/dev/ichiic/ig4_pci.c +++ b/sys/dev/ichiic/ig4_pci.c @@ -112,8 +112,8 @@ static struct ig4iic_pci_device ig4iic_pci_devices[] = { { PCI_CHIP_SKYLAKE_I2C_3, "Intel Sunrise Point-LP I2C Controller-3", IG4_SKYLAKE}, { PCI_CHIP_SKYLAKE_I2C_4, "Intel Sunrise Point-LP I2C Controller-4", IG4_SKYLAKE}, { PCI_CHIP_SKYLAKE_I2C_5, "Intel Sunrise Point-LP I2C Controller-5", IG4_SKYLAKE}, - { PCI_CHIP_KABYLAKE_I2C_0, "Intel Sunrise Point-LP I2C Controller-0", IG4_SKYLAKE}, - { PCI_CHIP_KABYLAKE_I2C_1, "Intel Sunrise Point-LP I2C Controller-1", IG4_SKYLAKE}, + { PCI_CHIP_KABYLAKE_I2C_0, "Intel Sunrise Point-H I2C Controller-0", IG4_SKYLAKE}, + { PCI_CHIP_KABYLAKE_I2C_1, "Intel Sunrise Point-H I2C Controller-1", IG4_SKYLAKE}, { PCI_CHIP_APL_I2C_0, "Intel Apollo Lake I2C Controller-0", IG4_APL}, { PCI_CHIP_APL_I2C_1, "Intel Apollo Lake I2C Controller-1", IG4_APL}, { PCI_CHIP_APL_I2C_2, "Intel Apollo Lake I2C Controller-2", IG4_APL}, -- cgit v1.3 From 9d1b868da0b2e9ae971dfa6c4ba3d7396b72377a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 16 Sep 2018 21:46:27 +0000 Subject: Revert amd64: tidy up kernel memmove There is a braino in the non-erms variant which breaks the functionality. Will be fixed at a later time with a different patch. Reported by: Manfred Antar Approved by: re (implicit) --- sys/amd64/amd64/support.S | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index f86d3d11506c..e683b24162ec 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -108,12 +108,12 @@ END(sse2_pagezero) */ ENTRY(memmove_std) PUSH_FRAME_POINTER - movq %rdi,%rax + movq %rdi,%r9 movq %rdx,%rcx - movq %rdi,%r8 - subq %rsi,%r8 - cmpq %rcx,%r8 /* overlapping && src < dst? */ + movq %rdi,%rax + subq %rsi,%rax + cmpq %rcx,%rax /* overlapping && src < dst? */ jb 1f shrq $3,%rcx /* copy by 64-bit words */ @@ -128,6 +128,7 @@ ENTRY(memmove_std) 2: rep movsb + movq %r9,%rax POP_FRAME_POINTER ret @@ -139,10 +140,8 @@ ENTRY(memmove_std) decq %rsi andq $7,%rcx /* any fractional bytes? */ std - jne 3f rep movsb -3: movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi @@ -150,22 +149,24 @@ ENTRY(memmove_std) rep movsq cld + movq %r9,%rax POP_FRAME_POINTER ret END(memmove_std) ENTRY(memmove_erms) PUSH_FRAME_POINTER - movq %rdi,%rax + movq %rdi,%r9 movq %rdx,%rcx - movq %rdi,%r8 - subq %rsi,%r8 - cmpq %rcx,%r8 /* overlapping && src < dst? */ + movq %rdi,%rax + subq %rsi,%rax + cmpq %rcx,%rax /* overlapping && src < dst? */ jb 1f rep movsb + movq %r9,%rax POP_FRAME_POINTER ret @@ -178,6 +179,7 @@ ENTRY(memmove_erms) rep movsb cld + movq %r9,%rax POP_FRAME_POINTER ret END(memmove_erms) -- cgit v1.3 From c6851ad04cf6be2ead6a46f9c5523c59ddf8301f Mon Sep 17 00:00:00 2001 From: "Andrey V. Elsukov" Date: Mon, 17 Sep 2018 10:10:14 +0000 Subject: Restore outbound packets capturing for if_gre(4). It was missed in r335048. Also clear M_MCAST and M_BCAST flags for encapsulated datagram, since it will have new IP header. Approved by: re (kib) --- sys/net/if_gre.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys') diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c index 591396c588dd..a2b923ee04e9 100644 --- a/sys/net/if_gre.c +++ b/sys/net/if_gre.c @@ -569,6 +569,8 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) goto drop; } af = m->m_pkthdr.csum_data; + BPF_MTAP2(ifp, &af, sizeof(af), m); + m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { -- cgit v1.3 From 76ed0c542fc6249b9cf649f92dd8da4b6f2a8f7d Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 17 Sep 2018 14:59:05 +0000 Subject: Make the PTI violation check to follow style of the SMAP check. No functional changes. Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Approved by: re (rgrimes) Differential revision: https://reviews.freebsd.org/D17181 --- sys/amd64/amd64/trap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index e9cb22960778..72b9b3e78d96 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -705,6 +705,17 @@ trap_is_smap(struct trapframe *frame) PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } +static bool +trap_is_pti(struct trapframe *frame) +{ + + return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && + pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | + PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && + (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) == + (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)); +} + static int trap_pfault(struct trapframe *frame, int usermode) { @@ -806,11 +817,7 @@ trap_pfault(struct trapframe *frame, int usermode) * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ - if (usermode && PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && - pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | - PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && - (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)== - (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)) + if (usermode && trap_is_pti(frame)) panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); -- cgit v1.3 From 09a6ada9919eac191b28e72fedfa9fa06e50b176 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 17 Sep 2018 15:34:19 +0000 Subject: Calculate PTI, PCID and INVPCID modes earlier, before ifuncs are resolved. This will be used in following conversion of pmap_activate_sw(). Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D17181 --- sys/amd64/amd64/machdep.c | 18 +++++++++++++++--- sys/amd64/amd64/pmap.c | 8 +------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index d9b6e0a04d5a..dc174e85a61f 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1581,6 +1581,21 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) */ identify_cpu2(); + /* + * Check for pti, pcid, and invpcid before ifuncs are + * resolved, to correctly select the implementation for + * pmap_activate_sw_mode(). + */ + pti = pti_get_default(); + TUNABLE_INT_FETCH("vm.pmap.pti", &pti); + TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); + if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { + invpcid_works = (cpu_stdext_feature & + CPUID_STDEXT_INVPCID) != 0; + } else { + pmap_pcid_enabled = 0; + } + link_elf_ireloc(kmdp); /* @@ -1645,9 +1660,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ - pti = pti_get_default(); - TUNABLE_INT_FETCH("vm.pmap.pti", &pti); - for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 8cb09df626be..479a29ee737b 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1179,11 +1179,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) pmap_init_pat(); /* Initialize TLB Context Id. */ - TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); - if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { - /* Check for INVPCID support */ - invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) - != 0; + if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; @@ -1204,8 +1200,6 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); - } else { - pmap_pcid_enabled = 0; } } -- cgit v1.3 From d6943c58045fa20ef31224c8c2b64ab7cf2865d6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 17 Sep 2018 15:51:49 +0000 Subject: amd64: tidy up kernel memmove, take 2 There is no need to use %rax for temporary values and avoiding doing so shortens the func. Handle the explicit 'check for tail' depessimisization for backwards copying. This reduces the diff against userspace. Tested with the glibc test suite. Approved by: re (kib) --- sys/amd64/amd64/support.S | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index e683b24162ec..bc849b21d965 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -108,40 +108,40 @@ END(sse2_pagezero) */ ENTRY(memmove_std) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ - jb 1f + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ + jb 2f shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ - jne 2f - movq %r9,%rax + jne 1f POP_FRAME_POINTER ret -2: +1: rep movsb - movq %r9,%rax POP_FRAME_POINTER ret /* ALIGN_TEXT */ -1: +2: addq %rcx,%rdi /* copy backwards */ addq %rcx,%rsi decq %rdi decq %rsi - andq $7,%rcx /* any fractional bytes? */ std + andq $7,%rcx /* any fractional bytes? */ + je 3f rep movsb +3: movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi @@ -149,24 +149,22 @@ ENTRY(memmove_std) rep movsq cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_std) ENTRY(memmove_erms) PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx - movq %rdi,%rax - subq %rsi,%rax - cmpq %rcx,%rax /* overlapping && src < dst? */ + movq %rdi,%r8 + subq %rsi,%r8 + cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 1f rep movsb - movq %r9,%rax POP_FRAME_POINTER ret @@ -179,7 +177,6 @@ ENTRY(memmove_erms) rep movsb cld - movq %r9,%rax POP_FRAME_POINTER ret END(memmove_erms) -- cgit v1.3 From 3c022be2ca64308c1eba8e686afccfbd06bfe1b5 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 17 Sep 2018 15:52:19 +0000 Subject: Use ifunc to resolve context switching mode on amd64. Patch removes all checks for pti/pcid/invpcid from the context switch path. I verified this by looking at the generated code, compiling with the in-tree clang. The invpcid_works1 trick required inline attribute for pmap_activate_sw_pcid_pti() to work. Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D17181 --- sys/amd64/amd64/pmap.c | 257 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 167 insertions(+), 90 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 479a29ee737b..9f43f3cfadc7 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -146,6 +146,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -7435,17 +7436,177 @@ pmap_pcid_alloc(pmap_t pmap, u_int cpuid) return (0); } +static uint64_t +pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) +{ + uint64_t cached; + + cached = pmap_pcid_alloc(pmap, cpuid); + KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && + pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, + ("pmap %p cpu %d pcid %#x", pmap, cpuid, + pmap->pm_pcids[cpuid].pm_pcid)); + KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || + pmap == kernel_pmap, + ("non-kernel pmap pmap %p cpu %d pcid %#x", + pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); + return (cached); +} + +static void +pmap_activate_sw_pti_post(pmap_t pmap) +{ + + if (pmap->pm_ucr3 != PMAP_NO_CR3) + PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful; +} + +static void inline +pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) +{ + struct invpcid_descr d; + uint64_t cached, cr3, kcr3, ucr3; + + cached = pmap_pcid_alloc_checked(pmap, cpuid); + cr3 = rcr3(); + if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) + load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); + PCPU_SET(curpmap, pmap); + kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; + ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | + PMAP_PCID_USER_PT; + + if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { + /* + * Explicitly invalidate translations cached from the + * user page table. They are not automatically + * flushed by reload of cr3 with the kernel page table + * pointer above. + * + * Note that the if() condition is resolved statically + * by using the function argument instead of + * runtime-evaluated invpcid_works value. + */ + if (invpcid_works1) { + d.pcid = PMAP_PCID_USER_PT | + pmap->pm_pcids[cpuid].pm_pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + } else { + pmap_pti_pcid_invalidate(ucr3, kcr3); + } + } + + PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); + PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); + if (cached) + PCPU_INC(pm_save_cnt); +} + +static void +pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid) +{ + + pmap_activate_sw_pcid_pti(pmap, cpuid, true); + pmap_activate_sw_pti_post(pmap); +} + +static void +pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid) +{ + register_t rflags; + + /* + * If the INVPCID instruction is not available, + * invltlb_pcid_handler() is used to handle an invalidate_all + * IPI, which checks for curpmap == smp_tlb_pmap. The below + * sequence of operations has a window where %CR3 is loaded + * with the new pmap's PML4 address, but the curpmap value has + * not yet been updated. This causes the invltlb IPI handler, + * which is called between the updates, to execute as a NOP, + * which leaves stale TLB entries. + * + * Note that the most typical use of pmap_activate_sw(), from + * the context switch, is immune to this race, because + * interrupts are disabled (while the thread lock is owned), + * and the IPI happens after curpmap is updated. Protect + * other callers in a similar way, by disabling interrupts + * around the %cr3 register reload and curpmap assignment. + */ + rflags = intr_disable(); + pmap_activate_sw_pcid_pti(pmap, cpuid, false); + intr_restore(rflags); + pmap_activate_sw_pti_post(pmap); +} + +static void +pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid) +{ + uint64_t cached, cr3; + + cached = pmap_pcid_alloc_checked(pmap, cpuid); + cr3 = rcr3(); + if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) + load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | + cached); + PCPU_SET(curpmap, pmap); + if (cached) + PCPU_INC(pm_save_cnt); +} + +static void +pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid) +{ + register_t rflags; + + rflags = intr_disable(); + pmap_activate_sw_pcid_nopti(pmap, cpuid); + intr_restore(rflags); +} + +static void +pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused) +{ + + load_cr3(pmap->pm_cr3); + PCPU_SET(curpmap, pmap); +} + +static void +pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused) +{ + + pmap_activate_sw_nopcid_nopti(pmap, cpuid); + PCPU_SET(kcr3, pmap->pm_cr3); + PCPU_SET(ucr3, pmap->pm_ucr3); + pmap_activate_sw_pti_post(pmap); +} + +DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static) +{ + + if (pmap_pcid_enabled && pti && invpcid_works) + return (pmap_activate_sw_pcid_invpcid_pti); + else if (pmap_pcid_enabled && pti && !invpcid_works) + return (pmap_activate_sw_pcid_noinvpcid_pti); + else if (pmap_pcid_enabled && !pti && invpcid_works) + return (pmap_activate_sw_pcid_nopti); + else if (pmap_pcid_enabled && !pti && !invpcid_works) + return (pmap_activate_sw_pcid_noinvpcid_nopti); + else if (!pmap_pcid_enabled && pti) + return (pmap_activate_sw_nopcid_pti); + else /* if (!pmap_pcid_enabled && !pti) */ + return (pmap_activate_sw_nopcid_nopti); +} + void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; - struct invpcid_descr d; - uint64_t cached, cr3, kcr3, kern_pti_cached, rsp0, ucr3; - register_t rflags; u_int cpuid; - struct amd64tss *tssp; - rflags = 0; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) @@ -7456,91 +7617,7 @@ pmap_activate_sw(struct thread *td) #else CPU_SET(cpuid, &pmap->pm_active); #endif - cr3 = rcr3(); - if (pmap_pcid_enabled) { - cached = pmap_pcid_alloc(pmap, cpuid); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && - pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, - ("pmap %p cpu %d pcid %#x", pmap, cpuid, - pmap->pm_pcids[cpuid].pm_pcid)); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || - pmap == kernel_pmap, - ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", - td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); - - /* - * If the INVPCID instruction is not available, - * invltlb_pcid_handler() is used for handle - * invalidate_all IPI, which checks for curpmap == - * smp_tlb_pmap. Below operations sequence has a - * window where %CR3 is loaded with the new pmap's - * PML4 address, but curpmap value is not yet updated. - * This causes invltlb IPI handler, called between the - * updates, to execute as NOP, which leaves stale TLB - * entries. - * - * Note that the most typical use of - * pmap_activate_sw(), from the context switch, is - * immune to this race, because interrupts are - * disabled (while the thread lock is owned), and IPI - * happens after curpmap is updated. Protect other - * callers in a similar way, by disabling interrupts - * around the %cr3 register reload and curpmap - * assignment. - */ - if (!invpcid_works) - rflags = intr_disable(); - - kern_pti_cached = pti ? 0 : cached; - if (!kern_pti_cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { - load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | - kern_pti_cached); - } - PCPU_SET(curpmap, pmap); - if (pti) { - kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; - ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | - PMAP_PCID_USER_PT; - - if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { - /* - * Manually invalidate translations cached - * from the user page table. They are not - * flushed by reload of cr3 with the kernel - * page table pointer above. - */ - if (invpcid_works) { - d.pcid = PMAP_PCID_USER_PT | - pmap->pm_pcids[cpuid].pm_pcid; - d.pad = 0; - d.addr = 0; - invpcid(&d, INVPCID_CTX); - } else { - pmap_pti_pcid_invalidate(ucr3, kcr3); - } - } - - PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); - PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); - } - if (!invpcid_works) - intr_restore(rflags); - if (cached) - PCPU_INC(pm_save_cnt); - } else { - load_cr3(pmap->pm_cr3); - PCPU_SET(curpmap, pmap); - if (pti) { - PCPU_SET(kcr3, pmap->pm_cr3); - PCPU_SET(ucr3, pmap->pm_ucr3); - } - } - if (pmap->pm_ucr3 != PMAP_NO_CR3) { - rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) + - PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful; - tssp = PCPU_GET(tssp); - tssp->tss_rsp0 = rsp0; - } + pmap_activate_sw_mode(pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else -- cgit v1.3 From 6368b4e47128986d8376c8a79a2b33bdb3dddb92 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 17 Sep 2018 16:16:57 +0000 Subject: Fix an nvpair leak in vdev_geom_read_config(). Also change the behaviour slightly: instead of freeing "config" if the last nvlist doesn't pass the tests, return the last config that did pass those tests. This matches the comment at the beginning of the function. PR: 230704 Diagnosed by: avg Reviewed by: asomers, avg Tested by: Mark Martinec Approved by: re (gjb) MFC after: 1 week Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D17202 --- .../opensolaris/uts/common/fs/zfs/vdev_geom.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'sys') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index aa8a400f2d78..7794bd505525 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -415,9 +415,10 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, * least one valid label was found. */ static int -vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) { struct g_provider *pp; + nvlist_t *config; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *buf; size_t buflen; @@ -442,7 +443,6 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) buflen = sizeof(vdev_lists[0]->vp_nvlist); - *config = NULL; /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; @@ -458,6 +458,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) VDEV_LABELS); /* Parse the labels */ + config = *configp = NULL; nlabels = 0; for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) @@ -465,25 +466,27 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) buf = vdev_lists[l]->vp_nvlist; - if (nvlist_unpack(buf, buflen, config, 0) != 0) + if (nvlist_unpack(buf, buflen, &config, 0) != 0) continue; - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(*config); - *config = NULL; + nvlist_free(config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { - nvlist_free(*config); - *config = NULL; + nvlist_free(config); continue; } + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; } -- cgit v1.3 From 87bdca8290e725e557d5c71c414e9bf5f68f5a5c Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Mon, 17 Sep 2018 17:18:54 +0000 Subject: Fix a regression in r338360 when booting an x86 machine without APIC. The atpic_register_sources callback tries to avoid registering interrupt sources that would collide with an I/O APIC. However, the previous implementation was failing to register IRQs 8-15 since the slave PIC saw valid IRQs from the master and assumed an I/O APIC was present. To fix, go back to registering all 8259A interrupt sources in one loop when the master's register_sources method is invoked. PR: 231291 Approved by: re (kib) MFC after: 1 month --- sys/x86/isa/atpic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/x86/isa/atpic.c b/sys/x86/isa/atpic.c index 8560793df503..e7e0cc79b54c 100644 --- a/sys/x86/isa/atpic.c +++ b/sys/x86/isa/atpic.c @@ -221,14 +221,20 @@ atpic_register_sources(struct pic *pic) * that APIC ISA routing and allowing the ATPIC source for that IRQ * to leak through. We used to depend on this feature for routing * IRQ0 via mixed mode, but now we don't use mixed mode at all. + * + * To avoid the slave not register sources after the master + * registers its sources, register all IRQs when this function is + * called on the master. */ + if (ap != &atpics[MASTER]) + return; for (i = 0; i < NUM_ISA_IRQS; i++) if (intr_lookup_source(i) != NULL) return; /* Loop through all interrupt sources and add them. */ - for (i = 0, ai = atintrs + ap->at_irqbase; i < 8; i++, ai++) { - if (ap->at_irqbase + i == ICU_SLAVEID) + for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) { + if (i == ICU_SLAVEID) continue; intr_register_source(&ai->at_intsrc); } -- cgit v1.3 From a4088415936d863c9f4f61d90b9306ad958ca14d Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 17 Sep 2018 19:38:43 +0000 Subject: Do not upgrade the vnode lock to call getinoquota(). Doing so can deadlock when the thread already owns another vnode lock, e.g. during a rename, as was demonstrated by the reporter. In fact, there seems to be no need to force the call to getinoquota() always, because vn_open() locks vnode exclusively, and this is the most important case. To add to the point, directories where the dirent is added or removed, are locked exclusively as well. Reported by: bwidawsk Tested by: bwidawsk, pho (as part of the larger patch) Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week --- sys/ufs/ufs/ufs_vnops.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) (limited to 'sys') diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 9e6e24c32db9..9e8896866660 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -325,9 +325,6 @@ ufs_accessx(ap) struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; -#ifdef QUOTA - int relocked; -#endif #ifdef UFS_ACL struct acl *acl; acl_type_t type; @@ -350,32 +347,14 @@ ufs_accessx(ap) * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient - * point to call getinoquota(). + * point to call getinoquota(). The lock mode is + * exclusive when the file is opening for write. */ - if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { - - /* - * Upgrade vnode lock, since getinoquota() - * requires exclusive lock to modify inode. - */ - relocked = 1; - vhold(vp); - vn_lock(vp, LK_UPGRADE | LK_RETRY); - VI_LOCK(vp); - if (vp->v_iflag & VI_DOOMED) { - vdropl(vp); - error = ENOENT; - goto relock; - } - vdropl(vp); - } else - relocked = 0; - error = getinoquota(ip); -relock: - if (relocked) - vn_lock(vp, LK_DOWNGRADE | LK_RETRY); - if (error != 0) - return (error); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + error = getinoquota(ip); + if (error != 0) + return (error); + } #endif break; default: -- cgit v1.3 From 9d50798c61e21d9b8d52810c7da805dad348a876 Mon Sep 17 00:00:00 2001 From: David C Somayajulu Date: Mon, 17 Sep 2018 20:15:18 +0000 Subject: Fixed isses: State check before enqueuing transmit task in bxe_link_attn() routine. State check before invoking bxe_nic_unload in bxe_shutdown(). Submitted by:Vaishali.Kulkarni@cavium.com Approved by:re(gjb) --- sys/dev/bxe/bxe.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'sys') diff --git a/sys/dev/bxe/bxe.c b/sys/dev/bxe/bxe.c index 393b78f540ff..4ef92e39d1f9 100644 --- a/sys/dev/bxe/bxe.c +++ b/sys/dev/bxe/bxe.c @@ -7076,13 +7076,13 @@ bxe_link_attn(struct bxe_softc *sc) if (sc->state == BXE_STATE_OPEN) { bxe_stats_handle(sc, STATS_EVENT_LINK_UP); + /* Restart tx when the link comes back. */ + FOR_EACH_ETH_QUEUE(sc, i) { + fp = &sc->fp[i]; + taskqueue_enqueue(fp->tq, &fp->tx_task); + } } - /* Restart tx when the link comes back. */ - FOR_EACH_ETH_QUEUE(sc, i) { - fp = &sc->fp[i]; - taskqueue_enqueue(fp->tq, &fp->tx_task); - } } if (sc->link_vars.link_up && sc->link_vars.line_speed) { @@ -16279,9 +16279,11 @@ bxe_shutdown(device_t dev) /* stop the periodic callout */ bxe_periodic_stop(sc); - BXE_CORE_LOCK(sc); - bxe_nic_unload(sc, UNLOAD_NORMAL, FALSE); - BXE_CORE_UNLOCK(sc); + if (sc->state != BXE_STATE_CLOSED) { + BXE_CORE_LOCK(sc); + bxe_nic_unload(sc, UNLOAD_NORMAL, FALSE); + BXE_CORE_UNLOCK(sc); + } return (0); } -- cgit v1.3 From 2554f86a8d426f6280857d184019506827e328d1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 18 Sep 2018 01:24:30 +0000 Subject: vm: stop taking proc lock in mmap to satisfy racct if it is disabled Limits can be safely obtained with lim_cur from the thread. racct is compiled in but disabled by default. Note that racct enablement is a boot-only tunable. This eliminates second most common place of taking the lock while pkg building. While here don't take the lock in mlockall either. Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17210 --- sys/sys/racct.h | 12 ++++++++++++ sys/vm/vm_mmap.c | 22 +++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'sys') diff --git a/sys/sys/racct.h b/sys/sys/racct.h index ec3322bdfdf9..84de705f24af 100644 --- a/sys/sys/racct.h +++ b/sys/sys/racct.h @@ -164,6 +164,15 @@ extern struct mtx racct_lock; #define RACCT_UNLOCK() mtx_unlock(&racct_lock) #define RACCT_LOCK_ASSERT() mtx_assert(&racct_lock, MA_OWNED) +#define RACCT_PROC_LOCK(p) do { \ + if (__predict_false(racct_enable)) \ + PROC_LOCK(p); \ +} while (0) +#define RACCT_PROC_UNLOCK(p) do { \ + if (__predict_false(racct_enable)) \ + PROC_UNLOCK(p); \ +} while (0) + int racct_add(struct proc *p, int resource, uint64_t amount); void racct_add_cred(struct ucred *cred, int resource, uint64_t amount); void racct_add_force(struct proc *p, int resource, uint64_t amount); @@ -189,6 +198,9 @@ void racct_proc_throttle(struct proc *p, int timeout); #else +#define RACCT_PROC_LOCK(p) do { } while (0) +#define RACCT_PROC_UNLOCK(p) do { } while (0) + static inline int racct_add(struct proc *p, int resource, uint64_t amount) { diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index e184b8ff5c1f..e8772a1eadde 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1055,12 +1055,8 @@ sys_mlockall(struct thread *td, struct mlockall_args *uap) * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { - PROC_LOCK(td->td_proc); - if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) { - PROC_UNLOCK(td->td_proc); + if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) return (ENOMEM); - } - PROC_UNLOCK(td->td_proc); } #ifdef RACCT if (racct_enable) { @@ -1445,21 +1441,21 @@ vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { - PROC_LOCK(td->td_proc); - if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) { - PROC_UNLOCK(td->td_proc); + RACCT_PROC_LOCK(td->td_proc); + if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > - lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) { + lim_cur(td, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, @@ -1467,11 +1463,11 @@ vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); return (error); } } - PROC_UNLOCK(td->td_proc); + RACCT_PROC_UNLOCK(td->td_proc); } /* -- cgit v1.3 From ba4704a278264b679425b1c70b38622d76db0041 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Tue, 18 Sep 2018 10:53:07 +0000 Subject: Remove unused code. Approved by: re (kib@) MFC after: 1 week --- sys/netinet/sctp_auth.c | 34 ---------------------------------- sys/netinet/sctp_auth.h | 3 --- 2 files changed, 37 deletions(-) (limited to 'sys') diff --git a/sys/netinet/sctp_auth.c b/sys/netinet/sctp_auth.c index 3150306356dc..5a5b7880fd30 100644 --- a/sys/netinet/sctp_auth.c +++ b/sys/netinet/sctp_auth.c @@ -1060,40 +1060,6 @@ sctp_hmac_m(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, return (digestlen); } -/*- - * verify the HMAC digest using the desired hash key, text, and HMAC - * algorithm. - * Returns -1 on error, 0 on success. - */ -int -sctp_verify_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, - uint8_t *text, uint32_t textlen, - uint8_t *digest, uint32_t digestlen) -{ - uint32_t len; - uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; - - /* sanity check the material and length */ - if ((key == NULL) || (keylen == 0) || - (text == NULL) || (textlen == 0) || (digest == NULL)) { - /* can't do HMAC with empty key or text or digest */ - return (-1); - } - len = sctp_get_hmac_digest_len(hmac_algo); - if ((len == 0) || (digestlen != len)) - return (-1); - - /* compute the expected hash */ - if (sctp_hmac(hmac_algo, key, keylen, text, textlen, temp) != len) - return (-1); - - if (memcmp(digest, temp, digestlen) != 0) - return (-1); - else - return (0); -} - - /* * computes the requested HMAC using a key struct (which may be modified if * the keylen exceeds the HMAC block len). diff --git a/sys/netinet/sctp_auth.h b/sys/netinet/sctp_auth.h index 44126e3e590f..5c22cc749c65 100644 --- a/sys/netinet/sctp_auth.h +++ b/sys/netinet/sctp_auth.h @@ -178,9 +178,6 @@ extern uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo); extern uint32_t sctp_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, uint8_t *text, uint32_t textlen, uint8_t *digest); -extern int -sctp_verify_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, - uint8_t *text, uint32_t textlen, uint8_t *digest, uint32_t digestlen); extern uint32_t sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t *key, uint8_t *text, uint32_t textlen, uint8_t *digest); -- cgit v1.3 From 9ed6559e3e70bef18b7e18b87b9c33200b6b2849 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 18 Sep 2018 15:01:21 +0000 Subject: Require ifunc-capable linker for i386 The amd64 kernel started using ifunc for a variety of functions with arch-specific implementations, and we would like to make use of the same functionality on i386 and as much as possible avoid divergence between i386 and amd64. In particular, future changes for security improvements and mitigations may rely on ifunc support. Approved by: re (kib) Sponsored by: The FreeBSD Foundation --- sys/conf/kern.pre.mk | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index a663deb59d1f..523cea605afd 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -121,10 +121,11 @@ CFLAGS+= ${CONF_CFLAGS} LDFLAGS+= -Wl,--build-id=sha1 .endif -.if ${MACHINE_CPUARCH} == "amd64" -.if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" -.error amd64 kernel requires linker ifunc support +.if (${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386") && \ + defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" +.error amd64/i386 kernel requires linker ifunc support .endif +.if ${MACHINE_CPUARCH} == "amd64" LDFLAGS+= -Wl,-z max-page-size=2097152 -Wl,-z common-page-size=4096 -Wl,-z -Wl,ifunc-noplt .endif -- cgit v1.3 From 26fe2217bfbd6aa6ec67f5c57a2e7344bcf2fdff Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 18 Sep 2018 17:51:45 +0000 Subject: Only update the domain cursor once in keg_fetch_slab(). We drop the keg lock when we go to actually allocate the slab, allowing other threads to advance the cursor. This can cause us to exit the round-robin loop before having attempted allocations from all domains, resulting in a hang during a subsequent blocking allocation attempt from a depleted domain. Reported and tested by: Jan Bramkamp Reviewed by: alc, cem Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17209 --- sys/vm/uma_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'sys') diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 908349379763..937e78452021 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -2698,10 +2698,8 @@ again: LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } - if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; - domain = keg->uk_cursor; - } + if (rr) + domain = (domain + 1) % vm_ndomains; } while (domain != start); /* Retry domain scan with blocking. */ -- cgit v1.3 From 5f65b4cab2afc98ffa7ec270fa1f06883191c777 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Tue, 18 Sep 2018 21:34:37 +0000 Subject: cxgbe(4): Enable TXRTLMT by default when the feature is available in the kernel (options RATELIMIT) and provisioned in the driver's configuration file (nethofld > 0). Submitted by: gallatin@ Approved by: re@ (kib@) --- sys/dev/cxgbe/t4_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 60bdff978b69..d6ad6048973a 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -1500,6 +1500,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) #endif ifp->if_capabilities = T4_CAP; + ifp->if_capenable = T4_CAP_ENABLE; #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; @@ -1509,10 +1510,11 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) ifp->if_capabilities |= IFCAP_NETMAP; #endif #ifdef RATELIMIT - if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) + if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) { ifp->if_capabilities |= IFCAP_TXRTLMT; + ifp->if_capenable |= IFCAP_TXRTLMT; + } #endif - ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; -- cgit v1.3 From bf94d6c78ba20f70a5726c5c85c7cf33c10e35da Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 19 Sep 2018 14:36:57 +0000 Subject: Fix state of dquot-less vnodes after failed quotaoff. UFS quotaoff iterates over all mp vnodes, and derefences and clears the pointers to corresponding dquots. If SU work items transiently reference some of dquots,quotaoff() would eventually fail, but all processed vnodes are already stripped from dquots. The state is problematic, since quotas are left enabled, but there is no dquots where blocks and inodes can be accounted. The result is assertion failures and NULL pointer dereferences. Fix it by suspending writes around quotaoff() call. Since the filesystem is synced, no dandling references to dquots from SU workitems can left behind, which means that quotaoff succeeds. The complication there is that quotaoff VFS op is performed with the mount point busied, while to suspend, we need to start write on the mp. If vn_start_write() is called on busied mp, system might deadlock against parallel unmount request. Handle this by unbusy-ing mp before starting write, which in turn requires changing the quotaoff() interface to return with the mount point not busied, same as was done for quotaon(). Reviewed by: mckusick Reported and tested by: pho Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17208 --- sys/kern/vfs_syscalls.c | 3 ++- sys/ufs/ufs/ufs_quota.c | 33 ++++++++++++++++++++++++++++++--- sys/ufs/ufs/ufs_vfsops.c | 11 ++++++++--- 3 files changed, 40 insertions(+), 7 deletions(-) (limited to 'sys') diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 91d93bb89207..a0ddefaa15df 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -190,7 +190,8 @@ sys_quotactl(struct thread *td, struct quotactl_args *uap) * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ - if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON) + if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && + (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) vfs_unbusy(mp); return (error); } diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c index 25133b834861..31fc49b7b4cf 100644 --- a/sys/ufs/ufs/ufs_quota.c +++ b/sys/ufs/ufs/ufs_quota.c @@ -712,6 +712,34 @@ again: return (error); } +static int +quotaoff_inchange1(struct thread *td, struct mount *mp, int type) +{ + int error; + bool need_resume; + + /* + * mp is already suspended on unmount. If not, suspend it, to + * avoid the situation where quotaoff operation eventually + * failing due to SU structures still keeping references on + * dquots, but vnode's references are already clean. This + * would cause quota accounting leak and asserts otherwise. + * Note that the thread has already called vn_start_write(). + */ + if (mp->mnt_susp_owner == td) { + need_resume = false; + } else { + error = vfs_write_suspend_umnt(mp); + if (error != 0) + return (error); + need_resume = true; + } + error = quotaoff1(td, mp, type); + if (need_resume) + vfs_write_resume(mp, VR_START_WRITE); + return (error); +} + /* * Turns off quotas, assumes that ump->um_qflags are already checked * and QTF_CLOSING is set to indicate operation in progress. Fixes @@ -721,10 +749,9 @@ int quotaoff_inchange(struct thread *td, struct mount *mp, int type) { struct ufsmount *ump; - int i; - int error; + int error, i; - error = quotaoff1(td, mp, type); + error = quotaoff_inchange1(td, mp, type); ump = VFSTOUFS(mp); UFS_LOCK(ump); diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c index cf5b1d792aef..f39f50cc9a61 100644 --- a/sys/ufs/ufs/ufs_vfsops.c +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -94,7 +94,8 @@ ufs_quotactl(mp, cmds, id, arg) void *arg; { #ifndef QUOTA - if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON) + if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON || + (cmds >> SUBCMDSHIFT) == Q_QUOTAOFF) vfs_unbusy(mp); return (EOPNOTSUPP); @@ -117,13 +118,13 @@ ufs_quotactl(mp, cmds, id, arg) break; default: - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } } if ((u_int)type >= MAXQUOTAS) { - if (cmd == Q_QUOTAON) + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(mp); return (EINVAL); } @@ -134,7 +135,11 @@ ufs_quotactl(mp, cmds, id, arg) break; case Q_QUOTAOFF: + vfs_ref(mp); + vfs_unbusy(mp); + vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); error = quotaoff(td, mp, type); + vn_finished_write(mp); break; case Q_SETQUOTA32: -- cgit v1.3 From a6ade1a07b805f559ec1ebe90b97477b3c41824b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 19 Sep 2018 14:38:01 +0000 Subject: Fix ZFS VFS op quotactl to follow busy protocol. Reviewed by: avg, mckusick Tested by: pho Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17208 --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sys') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 92f51420e321..8e421132b10e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -198,6 +198,8 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) break; default: error = EINVAL; + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); goto done; } } @@ -255,9 +257,11 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; + vfs_unbusy(vfsp); break; case Q_QUOTAOFF: error = ENOTSUP; + vfs_unbusy(vfsp); break; case Q_SETQUOTA: error = copyin(&dqblk, arg, sizeof(dqblk)); -- cgit v1.3 From 215aa93033a2e7b9db61d989c51062ff82175bb2 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 19 Sep 2018 15:39:16 +0000 Subject: amd64 pmap: remove tautological assert. pm_pcid is unsigned. Reviewed by: cem, markj CID: 1395727 Noted by: cem Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 3 days Differential revision: https://reviews.freebsd.org/D17235 --- sys/amd64/amd64/pmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 9f43f3cfadc7..21ec1838d1d1 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -7442,8 +7442,7 @@ pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); - KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && - pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, + KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || -- cgit v1.3 From c035292545cf7b7ce74236d87ef1b2f985bd0cb7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Sep 2018 16:02:33 +0000 Subject: vm: check for empty kstack cache before locking The current cache logic checks the total number of stacks in the kernel, which even on small boxes significantly exceeds the 128 limit (e.g. an 8-way box with zfs has almost 800 stacks allocated). Stacks are cached earlier for each main thread. As a result the code is rarely executed, but when it is then (on boxes like the above) it always fails. Since there are no provisions made for NUMA and release time is approaching, just do a quick check to avoid acquiring the lock. Approved by: re (kib) --- sys/vm/vm_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 832dbce324ef..beccb31f0b64 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -327,7 +327,7 @@ vm_thread_new(struct thread *td, int pages) else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; - if (pages == kstack_pages) { + if (pages == kstack_pages && kstack_cache != NULL) { mtx_lock(&kstack_cache_mtx); if (kstack_cache != NULL) { ks_ce = kstack_cache; -- cgit v1.3 From ad8edd79ccc1132b0d7c77b49c51785a619e725d Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 19 Sep 2018 16:37:43 +0000 Subject: Convert i386 NPX hardware context save methods to ifuncs. Since ifunc-capable linker is now required on i386, bring this code in line with the amd64 counterpart. Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D16736 --- sys/i386/i386/npx.c | 98 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 32 deletions(-) (limited to 'sys') diff --git a/sys/i386/i386/npx.c b/sys/i386/i386/npx.c index a2edacec987e..665b55b1c35b 100644 --- a/sys/i386/i386/npx.c +++ b/sys/i386/i386/npx.c @@ -67,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include @@ -183,7 +184,6 @@ CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savexmm, sv_pad) && static void fpu_clean_state(void); -static void fpusave(union savefpu *); static void fpurstor(union savefpu *); int hw_float; @@ -206,8 +206,6 @@ struct xsave_area_elm_descr { u_int size; } *xsave_area_desc; -static int use_xsaveopt; - static volatile u_int npx_traps_while_probing; alias_for_inthand_t probetrap; @@ -314,6 +312,69 @@ cleanup: return (hw_float); } +static void +npxsave_xsaveopt(union savefpu *addr) +{ + + xsaveopt((char *)addr, xsave_mask); +} + +static void +fpusave_xsave(union savefpu *addr) +{ + + xsave((char *)addr, xsave_mask); +} + +static void +fpusave_fxsave(union savefpu *addr) +{ + + fxsave((char *)addr); +} + +static void +fpusave_fnsave(union savefpu *addr) +{ + + fnsave((char *)addr); +} + +static void +init_xsave(void) +{ + + if (use_xsave) + return; + if (!cpu_fxsr || (cpu_feature2 & CPUID2_XSAVE) == 0) + return; + use_xsave = 1; + TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); +} + +DEFINE_IFUNC(, void, npxsave_core, (union savefpu *), static) +{ + + init_xsave(); + if (use_xsave) + return ((cpu_stdext_feature & CPUID_EXTSTATE_XSAVEOPT) != 0 ? + npxsave_xsaveopt : fpusave_xsave); + if (cpu_fxsr) + return (fpusave_fxsave); + return (fpusave_fnsave); +} + +DEFINE_IFUNC(, void, fpusave, (union savefpu *), static) +{ + + init_xsave(); + if (use_xsave) + return (fpusave_xsave); + if (cpu_fxsr) + return (fpusave_fxsave); + return (fpusave_fnsave); +} + /* * Enable XSAVE if supported and allowed by user. * Calculate the xsave_mask. @@ -325,13 +386,8 @@ npxinit_bsp1(void) uint64_t xsave_mask_user; TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch); - if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) { - use_xsave = 1; - TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); - } if (!use_xsave) return; - cpuid_count(0xd, 0x0, cp); xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; if ((cp[0] & xsave_mask) != xsave_mask) @@ -345,14 +401,9 @@ npxinit_bsp1(void) xsave_mask &= ~XFEATURE_AVX512; if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX) xsave_mask &= ~XFEATURE_MPX; - - cpuid_count(0xd, 0x1, cp); - if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) - use_xsaveopt = 1; } /* - * Calculate the fpu save area size. */ static void @@ -867,15 +918,11 @@ npxdna(void) * npxsave() atomically with checking fpcurthread. */ void -npxsave(addr) - union savefpu *addr; +npxsave(union savefpu *addr) { stop_emulating(); - if (use_xsaveopt) - xsaveopt((char *)addr, xsave_mask); - else - fpusave(addr); + npxsave_core(addr); } void npxswitch(struct thread *td, struct pcb *pcb); @@ -1099,19 +1146,6 @@ npxsetregs(struct thread *td, union savefpu *addr, char *xfpustate, return (error); } -static void -fpusave(addr) - union savefpu *addr; -{ - - if (use_xsave) - xsave((char *)addr, xsave_mask); - else if (cpu_fxsr) - fxsave(addr); - else - fnsave(addr); -} - static void npx_fill_fpregs_xmm1(struct savexmm *sv_xmm, struct save87 *sv_87) { -- cgit v1.3 From 997fecb5c2110c99002b97b647b76b4e3cf0d849 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Wed, 19 Sep 2018 18:49:37 +0000 Subject: Update udp6_output() inp locking to avoid concurrency issues with route cache updates. Bring over locking changes applied to udp_output() for the route cache in r297225 and fixed in r306559 which achieve multiple things: (1) acquire an exclusive inp lock earlier depending on the expected conditions; we add a comment explaining this in udp6, (2) having acquired the exclusive lock earlier eliminates a slight possible chance for a race condition which was present in v4 for multiple years as well and is now gone, and (3) only pass the inp_route6 to ip6_output() if we are holding an exclusive inp lock, so that possible route cache updates in case of routing table generation number changes can happen safely. In addition this change (as the legacy IP counterpart) decomposes the tracking of inp and pcbinfo lock and adds extra assertions, that the two together are acquired correctly. PR: 230950 Reviewed by: karels, markj Approved by: re (gjb) Pointyhat to: bz (for completely missing this bit) Differential Revision: https://reviews.freebsd.org/D17230 --- sys/netinet6/udp6_usrreq.c | 50 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'sys') diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index b32e88663172..d5ab934c9d41 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -698,7 +698,7 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, u_int32_t ulen, plen; uint16_t cscov; u_short fport; - uint8_t nxt, unlock_udbinfo; + uint8_t nxt, unlock_inp, unlock_udbinfo; /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; @@ -734,7 +734,22 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); - INP_RLOCK(inp); + /* + * In the following cases we want a write lock on the inp for either + * local operations or for possible route cache updates in the IPv6 + * output path: + * - on connected sockets (sin6 is NULL) for route cache updates, + * - when we are not bound to an address and source port (it is + * in6_pcbsetport() which will require the write lock). + */ + if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + inp->inp_lport == 0)) { + INP_WLOCK(inp); + unlock_inp = UH_WLOCKED; + } else { + INP_RLOCK(inp); + unlock_inp = UH_RLOCKED; + } nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; @@ -758,7 +773,10 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); pru = inetsw[ip_protox[nxt]].pr_usrreqs; @@ -772,7 +790,10 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); ip6_clearpktopts(&opt, -1); if (control) m_freem(control); @@ -786,12 +807,6 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); if (sin6 != NULL && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) { - INP_RUNLOCK(inp); - /* - * XXX there is a short window here which could lead to a race; - * should we re-check that what got us here is still valid? - */ - INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if (sin6 != NULL && @@ -972,9 +987,10 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); - error = ip6_output(m, optp, &inp->inp_route6, flags, + error = ip6_output(m, optp, + (unlock_inp == UH_WLOCKED) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); - if (unlock_udbinfo == UH_WLOCKED) + if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); @@ -987,12 +1003,20 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, release: if (unlock_udbinfo == UH_WLOCKED) { + KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, " + "non-excl inp lock: pcbinfo %p %#x inp %p %#x", + __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { + KASSERT(unlock_inp == UH_RLOCKED, ("%s: non-excl udbinfo lock, " + "excl inp lock: pcbinfo %p %#x inp %p %#x", + __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); - } else + } else if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else INP_RUNLOCK(inp); if (control) { ip6_clearpktopts(&opt, -1); -- cgit v1.3 From 1aed6d48a8b08d7ce7a981085d3b7379fa77d470 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 19 Sep 2018 19:13:43 +0000 Subject: Move kernel vmem arena initialization to vm_kern.c. This keeps the initialization coupled together with the kmem_* KPI implementation, which is the main user of these arenas. No functional change intended. Reviewed by: alc Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17247 --- sys/vm/vm_init.c | 84 +------------------------------------------------------- sys/vm/vm_kern.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 83 deletions(-) (limited to 'sys') diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index 09e87ed231ed..27ecb960201e 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -98,12 +98,6 @@ extern void uma_startup1(void); extern void uma_startup2(void); extern void vm_radix_reserve_kva(void); -#if VM_NRESERVLEVEL > 0 -#define KVA_QUANTUM (1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT)) -#else - /* On non-superpage architectures want large import sizes. */ -#define KVA_QUANTUM (PAGE_SIZE * 1024) -#endif long physmem; /* @@ -112,58 +106,15 @@ long physmem; static void vm_mem_init(void *); SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL); -/* - * Import kva into the kernel arena. - */ -static int -kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) -{ - vm_offset_t addr; - int result; - - KASSERT((size % KVA_QUANTUM) == 0, - ("kva_import: Size %jd is not a multiple of %d", - (intmax_t)size, (int)KVA_QUANTUM)); - addr = vm_map_min(kernel_map); - result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, - VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); - if (result != KERN_SUCCESS) - return (ENOMEM); - - *addrp = addr; - - return (0); -} - -#if VM_NRESERVLEVEL > 0 -/* - * Import a superpage from the normal kernel arena into the special - * arena for allocations with different permissions. - */ -static int -kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) -{ - - KASSERT((size % KVA_QUANTUM) == 0, - ("kernel_rwx_alloc: Size %jd is not a multiple of %d", - (intmax_t)size, (int)KVA_QUANTUM)); - return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, - VMEM_ADDR_MAX, flags, addrp)); -} -#endif - /* * vm_init initializes the virtual memory system. * This is done only by the first cpu up. * * The start and end address of physical memory is passed in. */ -/* ARGSUSED*/ static void -vm_mem_init(dummy) - void *dummy; +vm_mem_init(void *dummy) { - int domain; /* * Initializes resident memory structures. From here on, all physical @@ -184,39 +135,6 @@ vm_mem_init(dummy) vm_map_startup(); kmem_init(virtual_avail, virtual_end); - /* - * Initialize the kernel_arena. This can grow on demand. - */ - vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); - vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); - -#if VM_NRESERVLEVEL > 0 - /* - * In an architecture with superpages, maintain a separate arena - * for allocations with permissions that differ from the "standard" - * read/write permissions used for memory in the kernel_arena. - */ - kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE, - 0, M_WAITOK); - vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc, - (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); -#endif - - for (domain = 0; domain < vm_ndomains; domain++) { - vm_dom[domain].vmd_kernel_arena = vmem_create( - "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); - vmem_set_import(vm_dom[domain].vmd_kernel_arena, - (vmem_import_t *)vmem_alloc, NULL, kernel_arena, - KVA_QUANTUM); -#if VM_NRESERVLEVEL > 0 - vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( - "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); - vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, - kernel_rwx_alloc, (vmem_release_t *)vmem_xfree, - vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM); -#endif - } - #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 5dee4d758dd0..31b85fd81854 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -121,6 +121,13 @@ SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #endif "Max kernel address"); +#if VM_NRESERVLEVEL > 0 +#define KVA_QUANTUM (1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT)) +#else +/* On non-superpage architectures want large import sizes. */ +#define KVA_QUANTUM (PAGE_SIZE * 1024) +#endif + /* * kva_alloc: * @@ -644,6 +651,46 @@ kmem_init_zero_region(void) zero_region = (const void *)addr; } +/* + * Import kva into the kernel arena. + */ +static int +kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + vm_offset_t addr; + int result; + + KASSERT((size % KVA_QUANTUM) == 0, + ("kva_import: Size %jd is not a multiple of %d", + (intmax_t)size, (int)KVA_QUANTUM)); + addr = vm_map_min(kernel_map); + result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, + VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + if (result != KERN_SUCCESS) + return (ENOMEM); + + *addrp = addr; + + return (0); +} + +#if VM_NRESERVLEVEL > 0 +/* + * Import a superpage from the normal kernel arena into the special + * arena for allocations with different permissions. + */ +static int +kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + + KASSERT((size % KVA_QUANTUM) == 0, + ("kernel_rwx_alloc: Size %jd is not a multiple of %d", + (intmax_t)size, (int)KVA_QUANTUM)); + return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, + VMEM_ADDR_MAX, flags, addrp)); +} +#endif + /* * kmem_init: * @@ -651,11 +698,13 @@ kmem_init_zero_region(void) * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. + * Create the kernel vmem arena and its per-domain children. */ void kmem_init(vm_offset_t start, vm_offset_t end) { vm_map_t m; + int domain; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; @@ -671,6 +720,39 @@ kmem_init(vm_offset_t start, vm_offset_t end) start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); + + /* + * Initialize the kernel_arena. This can grow on demand. + */ + vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); + vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); + +#if VM_NRESERVLEVEL > 0 + /* + * In an architecture with superpages, maintain a separate arena + * for allocations with permissions that differ from the "standard" + * read/write permissions used for memory in the kernel_arena. + */ + kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE, + 0, M_WAITOK); + vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc, + (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); +#endif + + for (domain = 0; domain < vm_ndomains; domain++) { + vm_dom[domain].vmd_kernel_arena = vmem_create( + "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); + vmem_set_import(vm_dom[domain].vmd_kernel_arena, + (vmem_import_t *)vmem_alloc, NULL, kernel_arena, + KVA_QUANTUM); +#if VM_NRESERVLEVEL > 0 + vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( + "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); + vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, + kernel_rwx_alloc, (vmem_release_t *)vmem_xfree, + vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM); +#endif + } } /* -- cgit v1.3 From d12c44655065633dd8b8c249ec271a1d8ba63ba4 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 19 Sep 2018 19:35:02 +0000 Subject: Convert x86 cache invalidation functions to ifuncs. This simplifies the runtime logic and reduces the number of runtime-constant branches. Reviewed by: alc, markj Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D16736 --- sys/amd64/amd64/pmap.c | 97 +++++++++++++++++++++------------ sys/amd64/include/pmap.h | 4 +- sys/dev/drm2/drm_os_freebsd.c | 4 +- sys/dev/drm2/i915/intel_ringbuffer.c | 9 +-- sys/i386/i386/pmap.c | 103 ++++++++++++++++++++++------------- sys/i386/i386/vm_machdep.c | 2 +- sys/i386/include/pmap.h | 4 +- sys/x86/iommu/intel_utils.c | 3 +- 8 files changed, 139 insertions(+), 87 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 21ec1838d1d1..d4d91e7c23a9 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -648,6 +648,10 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, + vm_offset_t eva); +static void pmap_invalidate_cache_range_all(vm_offset_t sva, + vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); @@ -2171,36 +2175,62 @@ pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) pmap_invalidate_page(pmap, va); } +DEFINE_IFUNC(, void, pmap_invalidate_cache_range, + (vm_offset_t sva, vm_offset_t eva), static) +{ + + if ((cpu_feature & CPUID_SS) != 0) + return (pmap_invalidate_cache_range_selfsnoop); + if ((cpu_feature & CPUID_CLFSH) != 0) + return (pmap_force_invalidate_cache_range); + return (pmap_invalidate_cache_range_all); +} + #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) -void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) +static void +pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { - if (force) { - sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); - } + KASSERT((sva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: sva not page-aligned")); + KASSERT((eva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: eva not page-aligned")); +} + +static void +pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); +} + +void +pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported and allowed, do nothing. */ - else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { + sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); + if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) { /* - * XXX: Some CPUs fault, hang, or trash the local APIC - * registers if we use CLFLUSH on the local APIC - * range. The local APIC is always uncached, so we - * don't need to flush for that range anyway. + * The supplied range is bigger than 2MB. + * Globally invalidate cache. */ - if (pmap_kextract(sva) == lapic_paddr) - return; + pmap_invalidate_cache(); + return; + } + + /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC range. The + * local APIC is always uncached, so we don't need to flush + * for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; + if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* - * Otherwise, do per-cache line flush. Use the sfence + * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache @@ -2210,10 +2240,7 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); - } else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { - if (pmap_kextract(sva) == lapic_paddr) - return; + } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ @@ -2223,17 +2250,17 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); } } +static void +pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); + pmap_invalidate_cache(); +} + /* * Remove the specified set of pages from the data and instruction caches. * @@ -6938,7 +6965,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + tmpsize, FALSE); + pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } @@ -7297,7 +7324,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); + pmap_invalidate_cache_range(base, tmpva); } return (error); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 34c3fb868ce3..5b83307bbb68 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -431,8 +431,8 @@ void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); -void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, - boolean_t force); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); +void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); diff --git a/sys/dev/drm2/drm_os_freebsd.c b/sys/dev/drm2/drm_os_freebsd.c index 8489ca848027..c7b0bce2c7ab 100644 --- a/sys/dev/drm2/drm_os_freebsd.c +++ b/sys/dev/drm2/drm_os_freebsd.c @@ -395,8 +395,8 @@ drm_clflush_virt_range(char *addr, unsigned long length) { #if defined(__i386__) || defined(__amd64__) - pmap_invalidate_cache_range((vm_offset_t)addr, - (vm_offset_t)addr + length, TRUE); + pmap_force_invalidate_cache_range((vm_offset_t)addr, + (vm_offset_t)addr + length); #else DRM_ERROR("drm_clflush_virt_range not implemented on this architecture"); #endif diff --git a/sys/dev/drm2/i915/intel_ringbuffer.c b/sys/dev/drm2/i915/intel_ringbuffer.c index 92a792746b1b..c6c242c875c3 100644 --- a/sys/dev/drm2/i915/intel_ringbuffer.c +++ b/sys/dev/drm2/i915/intel_ringbuffer.c @@ -471,8 +471,8 @@ init_pipe_control(struct intel_ring_buffer *ring) if (pc->cpu_page == NULL) goto err_unpin; pmap_qenter((uintptr_t)pc->cpu_page, &obj->pages[0], 1); - pmap_invalidate_cache_range((vm_offset_t)pc->cpu_page, - (vm_offset_t)pc->cpu_page + PAGE_SIZE, FALSE); + pmap_force_invalidate_cache_range((vm_offset_t)pc->cpu_page, + (vm_offset_t)pc->cpu_page + PAGE_SIZE); pc->obj = obj; ring->private = pc; @@ -1102,8 +1102,9 @@ static int init_status_page(struct intel_ring_buffer *ring) } pmap_qenter((vm_offset_t)ring->status_page.page_addr, &obj->pages[0], 1); - pmap_invalidate_cache_range((vm_offset_t)ring->status_page.page_addr, - (vm_offset_t)ring->status_page.page_addr + PAGE_SIZE, FALSE); + pmap_force_invalidate_cache_range( + (vm_offset_t)ring->status_page.page_addr, + (vm_offset_t)ring->status_page.page_addr + PAGE_SIZE); ring->status_page.obj = obj; memset(ring->status_page.page_addr, 0, PAGE_SIZE); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 0c1437df5187..a969eef28810 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -148,6 +148,7 @@ __FBSDID("$FreeBSD$"); #include #include #endif +#include #include #include #include @@ -314,6 +315,10 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, + vm_offset_t eva); +static void pmap_invalidate_cache_range_all(vm_offset_t sva, + vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); @@ -1407,37 +1412,62 @@ pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) pmap_invalidate_page(pmap, va); } +DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t, vm_offset_t), + static) +{ + + if ((cpu_feature & CPUID_SS) != 0) + return (pmap_invalidate_cache_range_selfsnoop); + if ((cpu_feature & CPUID_CLFSH) != 0) + return (pmap_force_invalidate_cache_range); + return (pmap_invalidate_cache_range_all); +} + #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) -void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) +static void +pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { - if (force) { - sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); - } + KASSERT((sva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: sva not page-aligned")); + KASSERT((eva & PAGE_MASK) == 0, + ("pmap_invalidate_cache_range: eva not page-aligned")); +} - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported and allowed, do nothing. */ - else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { -#ifdef DEV_APIC +static void +pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); +} + +void +pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + + sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); + if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) { /* - * XXX: Some CPUs fault, hang, or trash the local APIC - * registers if we use CLFLUSH on the local APIC - * range. The local APIC is always uncached, so we - * don't need to flush for that range anyway. + * The supplied range is bigger than 2MB. + * Globally invalidate cache. */ - if (pmap_kextract(sva) == lapic_paddr) - return; -#endif + pmap_invalidate_cache(); + return; + } + + /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC + * range. The local APIC is always uncached, so we + * don't need to flush for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; + + if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* - * Otherwise, do per-cache line flush. Use the sfence + * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache @@ -1447,12 +1477,7 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); - } else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { -#ifdef DEV_APIC - if (pmap_kextract(sva) == lapic_paddr) - return; -#endif + } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ @@ -1462,17 +1487,17 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); } } +static void +pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) +{ + + pmap_invalidate_cache_range_check_align(sva, eva); + pmap_invalidate_cache(); +} + void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { @@ -5479,7 +5504,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + size, FALSE); + pmap_invalidate_cache_range(va, va + size); return ((void *)(va + offset)); } @@ -5718,7 +5743,7 @@ pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); + pmap_invalidate_cache_range(base, tmpva); } return (0); } diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index c3ed95dd4d02..250689459643 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -650,7 +650,7 @@ sf_buf_invalidate(struct sf_buf *sf) * settings are recalculated. */ pmap_qenter(sf->kva, &m, 1); - pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE, FALSE); + pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE); } /* diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index da9ae6588189..e48c0d3c6af5 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -394,8 +394,8 @@ void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); -void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, - boolean_t force); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); +void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void *pmap_trm_alloc(size_t size, int flags); void pmap_trm_free(void *addr, size_t size); diff --git a/sys/x86/iommu/intel_utils.c b/sys/x86/iommu/intel_utils.c index 5b145c5aeae9..944aa214baa6 100644 --- a/sys/x86/iommu/intel_utils.c +++ b/sys/x86/iommu/intel_utils.c @@ -368,8 +368,7 @@ dmar_flush_transl_to_ram(struct dmar_unit *unit, void *dst, size_t sz) * If DMAR does not snoop paging structures accesses, flush * CPU cache to memory. */ - pmap_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz, - TRUE); + pmap_force_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz); } void -- cgit v1.3 From 31ce875385b8aa059add44636966821d5035a60e Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 19 Sep 2018 22:53:52 +0000 Subject: Clear all of the VFP state in fill_fpregs(). Zero the entire FP register set structure returned for ptrace() if a thread hasn't used FP registers rather than leaking garbage in the fp_sr and fp_cr fields. Reviewed by: emaste, andrew Approved by: re (rgrimes) MFC after: 2 weeks Sponsored by: DARPA, AFRL Differential Revision: https://reviews.freebsd.org/D17140 --- sys/arm64/arm64/machdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index dc243c958a63..e82b9e4ab0bf 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -236,7 +236,7 @@ fill_fpregs(struct thread *td, struct fpreg *regs) regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr; } else #endif - memset(regs->fp_q, 0, sizeof(regs->fp_q)); + memset(regs, 0, sizeof(*regs)); return (0); } -- cgit v1.3 From 232d0b87e0047b095db8103bd06ebfdf9b431d47 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 19 Sep 2018 23:45:18 +0000 Subject: Various fixes for floating point on RISC-V. - Explicitly load an empty initial state into FP registers when taking the fault on the first FP instruction in a thread. Setting SSTATE.FS to INITIAL is just a marker to let context switch restore code know that it can load FP registers with zeroes instead of memory loads. It does not imply that the hardware will reset all registers to zero on first access. In addition, set the state to CLEAN instead of INITIAL after the first FP instruction. cpu_switch() doesn't do anything for INITIAL and only restores from the pcb if the state is CLEAN. We could perhaps change cpu_switch to call fpe_state_clear if the state was INITIAL and leave SSTATE.FS set to INITIAL instead of CLEAN after the first FP instruction. However, adding this complexity to cpu_switch() doesn't seem worth the supposed gain. - Only save the current FPU registers in fill_fpregs() if the request is made to save the current thread's registers. Previously if a debugger requested FP registers via ptrace() it was getting a copy of the debugger's FP registers rather than the debugee's. - Zero the entire FP register set structure returned for ptrace() if a thread hasn't used FP registers rather than leaking garbage in the fp_fcsr field. - If a debugger writes FP registers via ptrace(), always mark the pcb as having valid FP registers and set SSTATUS.FS_MASK to CLEAN so that the registers will be restored when the debugged thread resumes. - Be more explicit about clearing the SSTATUS.FS field before setting it to CLEAN on the first FP instruction trap. Submitted by: br, markj Approved by: re (rgrimes) Sponsored by: DARPA Differential Revision: https://reviews.freebsd.org/D17141 --- sys/riscv/include/fpe.h | 1 + sys/riscv/riscv/machdep.c | 10 +++++++-- sys/riscv/riscv/swtch.S | 53 +++++++++++++++++++++++++++++++++++++++++++++++ sys/riscv/riscv/trap.c | 7 ++++++- 4 files changed, 68 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/riscv/include/fpe.h b/sys/riscv/include/fpe.h index 1294ab0de8fa..a519094d4272 100644 --- a/sys/riscv/include/fpe.h +++ b/sys/riscv/include/fpe.h @@ -34,5 +34,6 @@ #define _MACHINE_FPE_H_ void fpe_state_save(struct thread *td); +void fpe_state_clear(void); #endif /* !_MACHINE_FPE_H_ */ diff --git a/sys/riscv/riscv/machdep.c b/sys/riscv/riscv/machdep.c index 64d20126f3ad..430336408368 100644 --- a/sys/riscv/riscv/machdep.c +++ b/sys/riscv/riscv/machdep.c @@ -204,13 +204,14 @@ fill_fpregs(struct thread *td, struct fpreg *regs) * If we have just been running FPE instructions we will * need to save the state to memcpy it below. */ - fpe_state_save(td); + if (td == curthread) + fpe_state_save(td); memcpy(regs->fp_x, pcb->pcb_x, sizeof(regs->fp_x)); regs->fp_fcsr = pcb->pcb_fcsr; } else #endif - memset(regs->fp_x, 0, sizeof(regs->fp_x)); + memset(regs, 0, sizeof(*regs)); return (0); } @@ -219,12 +220,17 @@ int set_fpregs(struct thread *td, struct fpreg *regs) { #ifdef FPE + struct trapframe *frame; struct pcb *pcb; + frame = td->td_frame; pcb = td->td_pcb; memcpy(pcb->pcb_x, regs->fp_x, sizeof(regs->fp_x)); pcb->pcb_fcsr = regs->fp_fcsr; + pcb->pcb_fpflags |= PCB_FP_STARTED; + frame->tf_sstatus &= ~SSTATUS_FS_MASK; + frame->tf_sstatus |= SSTATUS_FS_CLEAN; #endif return (0); diff --git a/sys/riscv/riscv/swtch.S b/sys/riscv/riscv/swtch.S index b17de23ad395..8192abaf9869 100644 --- a/sys/riscv/riscv/swtch.S +++ b/sys/riscv/riscv/swtch.S @@ -153,6 +153,59 @@ ENTRY(fpe_state_save) END(fpe_state_save) #endif /* FPE */ +/* + * void + * fpe_state_clear(void) + */ +ENTRY(fpe_state_clear) + /* + * Enable FPE usage in supervisor mode, + * so we can access registers. + */ + li t0, SSTATUS_FS_INITIAL + csrs sstatus, t0 + + fscsr zero + fcvt.d.l f0, zero + fcvt.d.l f1, zero + fcvt.d.l f2, zero + fcvt.d.l f3, zero + fcvt.d.l f4, zero + fcvt.d.l f5, zero + fcvt.d.l f6, zero + fcvt.d.l f7, zero + fcvt.d.l f8, zero + fcvt.d.l f9, zero + fcvt.d.l f10, zero + fcvt.d.l f11, zero + fcvt.d.l f12, zero + fcvt.d.l f13, zero + fcvt.d.l f14, zero + fcvt.d.l f15, zero + fcvt.d.l f16, zero + fcvt.d.l f17, zero + fcvt.d.l f18, zero + fcvt.d.l f19, zero + fcvt.d.l f20, zero + fcvt.d.l f21, zero + fcvt.d.l f22, zero + fcvt.d.l f23, zero + fcvt.d.l f24, zero + fcvt.d.l f25, zero + fcvt.d.l f26, zero + fcvt.d.l f27, zero + fcvt.d.l f28, zero + fcvt.d.l f29, zero + fcvt.d.l f30, zero + fcvt.d.l f31, zero + + /* Disable FPE usage in supervisor mode. */ + li t0, SSTATUS_FS_MASK + csrc sstatus, t0 + + ret +END(fpe_state_clear) + /* * void cpu_throw(struct thread *old, struct thread *new) */ diff --git a/sys/riscv/riscv/trap.c b/sys/riscv/riscv/trap.c index ced05c52588f..57f5c558d286 100644 --- a/sys/riscv/riscv/trap.c +++ b/sys/riscv/riscv/trap.c @@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$"); #include #include +#ifdef FPE +#include +#endif #include #include #include @@ -363,7 +366,9 @@ do_trap_user(struct trapframe *frame) * May be a FPE trap. Enable FPE usage * for this thread and try again. */ - frame->tf_sstatus |= SSTATUS_FS_INITIAL; + fpe_state_clear(); + frame->tf_sstatus &= ~SSTATUS_FS_MASK; + frame->tf_sstatus |= SSTATUS_FS_CLEAN; pcb->pcb_fpflags |= PCB_FP_STARTED; break; } -- cgit v1.3 From a286a3099c87bf015063bac0f10cec70b261fac7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Sep 2018 13:29:43 +0000 Subject: amd64: move fusufault after all users A lot of function have the following check: cmpq %rax,%rdi /* verify address is valid */ ja fusufault The label is present earlier in kernel .text, which means this is a jump backwards. Absent any information in branch predictor, the cpu predicts it as taken. Since it is almost never taken in practice, this results in a completely avoidable misprediction. Move it past all consumers, so that it is predicted as not taken. Approved by: re (kib) --- sys/amd64/amd64/support.S | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index bc849b21d965..33ffec1ff5f9 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -755,16 +755,6 @@ ENTRY(fubyte_smap) ret END(fubyte_smap) - ALIGN_TEXT - /* Fault entry clears PSL.AC */ -fusufault: - movq PCPU(CURPCB),%rcx - xorl %eax,%eax - movq %rax,PCB_ONFAULT(%rcx) - decq %rax - POP_FRAME_POINTER - ret - /* * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to * user memory. @@ -916,6 +906,16 @@ ENTRY(subyte_smap) ret END(subyte_smap) + ALIGN_TEXT + /* Fault entry clears PSL.AC */ +fusufault: + movq PCPU(CURPCB),%rcx + xorl %eax,%eax + movq %rax,PCB_ONFAULT(%rcx) + decq %rax + POP_FRAME_POINTER + ret + /* * copyinstr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx -- cgit v1.3 From 51e13c93b6ef3be997aa3b0f5247dbaddbbd45f5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Sep 2018 13:32:40 +0000 Subject: fd: prevent inlining of _fdrop thorough kern_descrip.c fdrop is used in several places in the file and almost never has to call _fdrop. Thus inlining it is a pure waste of space. Approved by: re (kib) --- sys/kern/kern_descrip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index ca1f7326fe3f..08a704d904b1 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2936,8 +2936,11 @@ fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, /* * Handle the last reference to a file being closed. + * + * Without the noinline attribute clang keeps inlining the func thorough this + * file when fdrop is used. */ -int +int __noinline _fdrop(struct file *fp, struct thread *td) { int error; -- cgit v1.3 From 25ed23cfbb9200f1dd96a70e74a8e3e282635557 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 20 Sep 2018 15:45:12 +0000 Subject: Change the domain selection policy in kmem_back(). Ensure that pages backing the same virtual large page come from the same physical domain, as kmem_malloc_domain() does. PR: 231038 Reviewed by: alc, kib Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17248 --- sys/vm/vm_kern.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'sys') diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 31b85fd81854..5f7c4033c218 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -122,11 +122,12 @@ SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, "Max kernel address"); #if VM_NRESERVLEVEL > 0 -#define KVA_QUANTUM (1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT)) +#define KVA_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) #else /* On non-superpage architectures want large import sizes. */ -#define KVA_QUANTUM (PAGE_SIZE * 1024) +#define KVA_QUANTUM_SHIFT (10 + PAGE_SHIFT) #endif +#define KVA_QUANTUM (1 << KVA_QUANTUM_SHIFT) /* * kva_alloc: @@ -416,9 +417,10 @@ kmem_malloc(vm_size_t size, int flags) } /* - * kmem_back: + * kmem_back_domain: * - * Allocate physical pages for the specified virtual address range. + * Allocate physical pages from the specified domain for the specified + * virtual address range. */ int kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr, @@ -479,24 +481,39 @@ retry: return (KERN_SUCCESS); } +/* + * kmem_back: + * + * Allocate physical pages for the specified virtual address range. + */ int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { - struct vm_domainset_iter di; - int domain; - int ret; + vm_offset_t end, next, start; + int domain, rv; KASSERT(object == kernel_object, ("kmem_back: only supports kernel object.")); - vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); - do { - ret = kmem_back_domain(domain, object, addr, size, flags); - if (ret == KERN_SUCCESS) + for (start = addr, end = addr + size; addr < end; addr = next) { + /* + * We must ensure that pages backing a given large virtual page + * all come from the same physical domain. + */ + if (vm_ndomains > 1) { + domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; + next = roundup2(addr + 1, KVA_QUANTUM); + if (next > end || next < start) + next = end; + } else + next = end; + rv = kmem_back_domain(domain, object, addr, next - addr, flags); + if (rv != KERN_SUCCESS) { + kmem_unback(object, start, addr - start); break; - } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); - - return (ret); + } + } + return (rv); } /* -- cgit v1.3 From 6675bee81a93be9833b47c3f4ceda9a643636579 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Thu, 20 Sep 2018 15:45:53 +0000 Subject: In icmp6_rip6_input(), once we have a lock, make sure the inp is not freed. This can happen since the list traversal and locking was converted to epoch(9). If the inp is marked "freed", skip it. This prevents a NULL pointer deref panic in ip6_savecontrol_v4() trying to access the socket hanging off the inp, which was gone by the time we got there. Reported by: andrew Tested by: andrew Approved by: re (gjb) --- sys/netinet6/icmp6.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sys') diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index 0bb5636602ff..93cbfe66140b 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -1936,6 +1936,10 @@ icmp6_rip6_input(struct mbuf **mp, int off) !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); + if (__predict_false(in6p->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(in6p); + continue; + } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); -- cgit v1.3 From c396945b74fc298fd6351bd9be73541edfa89176 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Sep 2018 18:25:26 +0000 Subject: vfs: remove lookup_shared tunable Reviewed by: kib, jhb Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17253 --- sys/kern/vfs_lookup.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'sys') diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index cdfb84e42447..675c0aa02ac3 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -160,10 +160,6 @@ nameiinit(void *dummy __unused) } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); -static int lookup_shared = 1; -SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0, - "enables shared locks for path name translation"); - static int lookup_cap_dotdot = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, &lookup_cap_dotdot, 0, @@ -307,8 +303,6 @@ namei(struct nameidata *ndp) ("namei: flags contaminated with nameiops")); MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || ndp->ni_startdir->v_type == VBAD); - if (!lookup_shared) - cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; TAILQ_INIT(&ndp->ni_cap_tracker); ndp->ni_lcf = 0; @@ -660,10 +654,7 @@ lookup(struct nameidata *ndp) * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ - if (lookup_shared) - cnp->cn_lkflags = LK_SHARED; - else - cnp->cn_lkflags = LK_EXCLUSIVE; + cnp->cn_lkflags = LK_SHARED; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, @@ -1087,7 +1078,7 @@ nextname: VOP_UNLOCK(dp, 0); success: /* - * Because of lookup_shared we may have the vnode shared locked, but + * Because of shared lookup we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && -- cgit v1.3 From 969e147aff813235dada3b6e887df076616bdcce Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 20 Sep 2018 18:29:55 +0000 Subject: Ensure that imports into per-domain kmem arenas are KVA_QUANTUM-aligned. The old code appears to assume that vmem_alloc() would import size-aligned KVA chunks from the parent kernel_arena, but vmem doesn't provide this guarantee. Also remove the unused global RWX arena and add comments explaining why we have per-domain arenas. Reported by: alc Reviewed by: alc, kib (previous version) Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17249 --- sys/kern/subr_vmem.c | 3 --- sys/vm/vm_kern.c | 44 +++++++++++++++++++++----------------------- sys/vm/vm_kern.h | 1 - 3 files changed, 21 insertions(+), 27 deletions(-) (limited to 'sys') diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 3c83c777b6a9..8529266f381c 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -241,9 +241,6 @@ static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; /* kernel and kmem arenas are aliased for backwards KPI compat. */ vmem_t *kernel_arena = &kernel_arena_storage; -#if VM_NRESERVLEVEL > 0 -vmem_t *kernel_rwx_arena = NULL; -#endif vmem_t *kmem_arena = &kernel_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 5f7c4033c218..b5915c074ab6 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -669,7 +669,7 @@ kmem_init_zero_region(void) } /* - * Import kva into the kernel arena. + * Import KVA from the kernel map into the kernel arena. */ static int kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) @@ -691,22 +691,20 @@ kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) return (0); } -#if VM_NRESERVLEVEL > 0 /* - * Import a superpage from the normal kernel arena into the special - * arena for allocations with different permissions. + * Import KVA from a parent arena into a per-domain arena. Imports must be + * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size. */ static int -kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) +kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) { KASSERT((size % KVA_QUANTUM) == 0, - ("kernel_rwx_alloc: Size %jd is not a multiple of %d", + ("kva_import_domain: Size %jd is not a multiple of %d", (intmax_t)size, (int)KVA_QUANTUM)); return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp)); } -#endif /* * kmem_init: @@ -744,30 +742,30 @@ kmem_init(vm_offset_t start, vm_offset_t end) vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); -#if VM_NRESERVLEVEL > 0 - /* - * In an architecture with superpages, maintain a separate arena - * for allocations with permissions that differ from the "standard" - * read/write permissions used for memory in the kernel_arena. - */ - kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE, - 0, M_WAITOK); - vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc, - (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); -#endif - for (domain = 0; domain < vm_ndomains; domain++) { + /* + * Initialize the per-domain arenas. These are used to color + * the KVA space in a way that ensures that virtual large pages + * are backed by memory from the same physical domain, + * maximizing the potential for superpage promotion. + */ vm_dom[domain].vmd_kernel_arena = vmem_create( "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_arena, - (vmem_import_t *)vmem_alloc, NULL, kernel_arena, - KVA_QUANTUM); + kva_import_domain, NULL, kernel_arena, KVA_QUANTUM); + + /* + * In architectures with superpages, maintain separate arenas + * for allocations with permissions that differ from the + * "standard" read/write permissions used for kernel memory, + * so as not to inhibit superpage promotion. + */ #if VM_NRESERVLEVEL > 0 vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, - kernel_rwx_alloc, (vmem_release_t *)vmem_xfree, - vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM); + kva_import_domain, (vmem_release_t *)vmem_xfree, + kernel_arena, KVA_QUANTUM); #endif } } diff --git a/sys/vm/vm_kern.h b/sys/vm/vm_kern.h index 8d49a598f26d..20e847f5e5af 100644 --- a/sys/vm/vm_kern.h +++ b/sys/vm/vm_kern.h @@ -70,7 +70,6 @@ extern vm_map_t kernel_map; extern vm_map_t exec_map; extern vm_map_t pipe_map; extern struct vmem *kernel_arena; -extern struct vmem *kernel_rwx_arena; extern struct vmem *kmem_arena; extern struct vmem *buffer_arena; extern struct vmem *transient_arena; -- cgit v1.3 From 4bf0035fd1434190fc67a647879437a7919a4cf7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Sep 2018 18:30:17 +0000 Subject: amd64: macroify copyin/copyout and provide erms variants Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17257 --- sys/amd64/amd64/copyout.c | 37 ++++++++--- sys/amd64/amd64/support.S | 154 +++++++++++++++++++--------------------------- 2 files changed, 93 insertions(+), 98 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/copyout.c b/sys/amd64/amd64/copyout.c index dce20fb27769..59b7ad5c1307 100644 --- a/sys/amd64/amd64/copyout.c +++ b/sys/amd64/amd64/copyout.c @@ -159,20 +159,41 @@ DEFINE_IFUNC(, int, copyinstr, (const void *, void *, size_t, size_t *), copyinstr_smap : copyinstr_nosmap); } -int copyin_nosmap(const void *udaddr, void *kaddr, size_t len); -int copyin_smap(const void *udaddr, void *kaddr, size_t len); +int copyin_nosmap_std(const void *udaddr, void *kaddr, size_t len); +int copyin_smap_std(const void *udaddr, void *kaddr, size_t len); +int copyin_nosmap_erms(const void *udaddr, void *kaddr, size_t len); +int copyin_smap_erms(const void *udaddr, void *kaddr, size_t len); DEFINE_IFUNC(, int, copyin, (const void *, void *, size_t), static) { - return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? - copyin_smap : copyin_nosmap); + switch (cpu_stdext_feature & (CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS)) { + case CPUID_STDEXT_SMAP: + return (copyin_smap_std); + case CPUID_STDEXT_ERMS: + return (copyin_nosmap_erms); + case CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS: + return (copyin_smap_erms); + default: + return (copyin_nosmap_std); + + } } -int copyout_nosmap(const void *kaddr, void *udaddr, size_t len); -int copyout_smap(const void *kaddr, void *udaddr, size_t len); +int copyout_nosmap_std(const void *kaddr, void *udaddr, size_t len); +int copyout_smap_std(const void *kaddr, void *udaddr, size_t len); +int copyout_nosmap_erms(const void *kaddr, void *udaddr, size_t len); +int copyout_smap_erms(const void *kaddr, void *udaddr, size_t len); DEFINE_IFUNC(, int, copyout, (const void *, void *, size_t), static) { - return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? - copyout_smap : copyout_nosmap); + switch (cpu_stdext_feature & (CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS)) { + case CPUID_STDEXT_SMAP: + return (copyout_smap_std); + case CPUID_STDEXT_ERMS: + return (copyout_nosmap_erms); + case CPUID_STDEXT_SMAP | CPUID_STDEXT_ERMS: + return (copyout_smap_erms); + default: + return (copyout_nosmap_std); + } } diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 33ffec1ff5f9..168ea5b32bcf 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -281,62 +281,30 @@ END(fillw) * returns to *curpcb->pcb_onfault instead of the function. */ +.macro SMAP_DISABLE smap +.if \smap + stac +.endif +.endmacro + + +.macro SMAP_ENABLE smap +.if \smap + clac +.endif +.endmacro + /* * copyout(from_kernel, to_user, len) * %rdi, %rsi, %rdx */ -ENTRY(copyout_nosmap) - PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyout_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. This check is essential - * because it prevents usermode from writing into the kernel. We do - * not verify anywhere else that the user did not specify a rogue - * address. - */ - /* - * First, prevent address wrapping. - */ - movq %rsi,%rax - addq %rdx,%rax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. - */ - movq $VM_MAXUSER_ADDRESS,%rcx - cmpq %rcx,%rax - ja copyout_fault - - xchgq %rdi,%rsi - /* bcopy(%rsi, %rdi, %rdx) */ - movq %rdx,%rcx - - shrq $3,%rcx - rep - movsq - movb %dl,%cl - andb $7,%cl - je done_copyout - rep - movsb - - jmp done_copyout -END(copyout_nosmap) - -ENTRY(copyout_smap) +.macro COPYOUT smap erms PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax /* Trap entry clears PSL.AC */ movq $copyout_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ - jz done_copyout + jz 2f /* * Check explicitly for non-user addresses. If 486 write protection @@ -365,23 +333,42 @@ ENTRY(copyout_smap) /* bcopy(%rsi, %rdi, %rdx) */ movq %rdx,%rcx + SMAP_DISABLE \smap +.if \erms == 0 shrq $3,%rcx - stac rep movsq movb %dl,%cl andb $7,%cl je 1f +.endif rep movsb -1: clac - -done_copyout: +1: + SMAP_ENABLE \smap +2: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret +.endmacro + +ENTRY(copyout_nosmap_std) + COPYOUT smap=0 erms=0 +END(copyout_nosmap_std) + +ENTRY(copyout_smap_std) + COPYOUT smap=1 erms=0 +END(copyout_smap_std) + +ENTRY(copyout_nosmap_erms) + COPYOUT smap=0 erms=1 +END(copyout_nosmap_erms) + +ENTRY(copyout_smap_erms) + COPYOUT smap=1 erms=1 +END(copyout_smap_erms) ALIGN_TEXT copyout_fault: @@ -390,18 +377,17 @@ copyout_fault: movq $EFAULT,%rax POP_FRAME_POINTER ret -END(copyout_smap) /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ -ENTRY(copyin_nosmap) +.macro COPYIN smap erms PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyin_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ - jz done_copyin + jz 2f /* * make sure address is valid @@ -416,56 +402,44 @@ ENTRY(copyin_nosmap) xchgq %rdi,%rsi movq %rdx,%rcx movb %cl,%al - shrq $3,%rcx /* copy longword-wise */ - rep - movsq - movb %al,%cl - andb $7,%cl /* copy remaining bytes */ - je done_copyin - rep - movsb - - jmp done_copyin -END(copyin_nosmap) - -ENTRY(copyin_smap) - PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyin_fault,PCB_ONFAULT(%rax) - testq %rdx,%rdx /* anything to do? */ - jz done_copyin - /* - * make sure address is valid - */ - movq %rdi,%rax - addq %rdx,%rax - jc copyin_fault - movq $VM_MAXUSER_ADDRESS,%rcx - cmpq %rcx,%rax - ja copyin_fault - - xchgq %rdi,%rsi - movq %rdx,%rcx - movb %cl,%al + SMAP_DISABLE \smap +.if \erms == 0 shrq $3,%rcx /* copy longword-wise */ - stac rep movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ - je 1f + je 1 +.endif rep movsb -1: clac -done_copyin: +1: + SMAP_ENABLE \smap +2: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret -END(copyin_smap) +.endmacro + +ENTRY(copyin_nosmap_std) + COPYIN smap=0 erms=0 +END(copyin_nosmap_std) + +ENTRY(copyin_smap_std) + COPYIN smap=1 erms=0 +END(copyin_smap_std) + +ENTRY(copyin_nosmap_erms) + COPYIN smap=0 erms=1 +END(copyin_nosmap_erms) + +ENTRY(copyin_smap_erms) + COPYIN smap=1 erms=1 +END(copyin_smap_erms) ALIGN_TEXT copyin_fault: -- cgit v1.3 From 0c919c23706c7734a9ecb6b0a95717c37c4b3d56 Mon Sep 17 00:00:00 2001 From: Stephen Hurd Date: Thu, 20 Sep 2018 19:35:35 +0000 Subject: Fix capabilities handling for iflib drivers Various capabilities were not being handled correctly in the SIOCSIFCAP handler. Specifically: IFCAP_RXCSUM and IFCAP_RXCSUM_IPV6 could be set even if not supported It was impossible to disable IFCAP_RXCSUM and/or IFCAP_RXCSUM_IPV6 via ifconfig since it does ioctl() per command-line flag rather than combine them into a single call. IFCAP_VLAN_HWCSUM could not be modified via the ioctl() Setting any combination of the three IFCAP_WOL flags would set only IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC. For example, setting only IFCAP_WOL_UCAST would result in both IFCAP_WOL_MCAST and IFCAP_WOL_MAGIC being enabled, but IFCAP_WOL_UCAST would not be enabled. Because if_vlancap() was called before if_togglecapenable(), vlan flags were sometimes not applied correctly. Interfaces were being unnecessarily stopped and restarted for WoL PR: 231151 Submitted by: Kaho Toshikazu Reported by: Shirkdog Reviewed by: galladin Approved by: re (gjb) Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D17158 --- sys/net/iflib.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'sys') diff --git a/sys/net/iflib.c b/sys/net/iflib.c index 65bf07a0b2c1..d9a6e3b962d7 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -4100,9 +4100,10 @@ iflib_if_qflush(if_t ifp) } -#define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ - IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ - IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) +#define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ + IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ + IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \ + IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) @@ -4223,39 +4224,51 @@ iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) } case SIOCSIFCAP: { - int mask, setmask; + int mask, setmask, oldmask; - mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); + oldmask = if_getcapenable(ifp); + mask = ifr->ifr_reqcap ^ oldmask; + mask &= ctx->ifc_softc_ctx.isc_capabilities; setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); + setmask |= (mask & IFCAP_WOL); + + /* + * If we're disabling any RX csum, disable all the ones + * the driver supports. This assumes all supported are + * enabled. + * + * Otherwise, if they've changed, enable all of them. + */ + if ((setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) < + (oldmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))) + setmask &= ~(IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); + else if ((setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) != + (oldmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))) + setmask |= (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)); - if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) - setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); - if ((mask & IFCAP_WOL) && - (if_getcapabilities(ifp) & IFCAP_WOL) != 0) - setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); - if_vlancap(ifp); /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); - if (bits & IFF_DRV_RUNNING) + if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_stop(ctx); STATE_LOCK(ctx); if_togglecapenable(ifp, setmask); STATE_UNLOCK(ctx); - if (bits & IFF_DRV_RUNNING) + if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL) iflib_init_locked(ctx); STATE_LOCK(ctx); if_setdrvflags(ifp, bits); STATE_UNLOCK(ctx); CTX_UNLOCK(ctx); } + if_vlancap(ifp); break; } case SIOCGPRIVATE_0: -- cgit v1.3 From 76b09d1823ee18fe4a0b8abe17d73306263765b8 Mon Sep 17 00:00:00 2001 From: "Andrey V. Elsukov" Date: Thu, 20 Sep 2018 19:45:27 +0000 Subject: Add new field max_hdrsize to struct encap_config. It is currently unused and reserved for future use to keep KBI/KPI. Also add several spare pointers to be able extend structure if it will be needed. Approved by: re (gjb) --- sys/netinet/ip_encap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys') diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index f3d1d3afcab8..65ac922fc1bb 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -48,12 +48,15 @@ typedef int (*encap_input_t)(struct mbuf *, int , int, void *); struct encap_config { int proto; /* protocol */ int min_length; /* minimum packet length */ + int max_hdrsize; /* maximum header size */ int exact_match; /* a packet is exactly matched */ #define ENCAP_DRV_LOOKUP 0x7fffffff encap_lookup_t lookup; encap_check_t check; encap_input_t input; + + void *pad[3]; }; struct encaptab; -- cgit v1.3 From 861437f83ac101939a96c98ca9266a11de733c2a Mon Sep 17 00:00:00 2001 From: Stephen Hurd Date: Thu, 20 Sep 2018 20:06:44 +0000 Subject: Add IFCAP_TSO6 for igb It seems igb supports TSO6, but the capability got lost in the iflib update. Restore this capability. PR: 231476 Reported by: lev Reviewed by: erj Approved by: re (gjb) Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D17242 --- sys/dev/e1000/if_em.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index a95d606ac34f..93714cd10710 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -709,7 +709,8 @@ em_set_num_queues(if_ctx_t ctx) #define IGB_CAPS \ IFCAP_HWCSUM | IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | \ IFCAP_VLAN_HWCSUM | IFCAP_WOL | IFCAP_VLAN_HWFILTER | IFCAP_TSO4 | \ - IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_JUMBO_MTU | IFCAP_HWCSUM_IPV6; + IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_JUMBO_MTU | IFCAP_HWCSUM_IPV6 |\ + IFCAP_TSO6 /********************************************************************* * Device initialization routine -- cgit v1.3 From 5254f065abb41b7c9e9723323aae292d4e3f4739 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Sep 2018 20:32:08 +0000 Subject: amd64: macroify copyin/copyout and provide erms variants, follow up Fix a fat-fingered typo with a "funny" side-effect: when doing copyin on a cpu without ERMS and with size being a multiply of 8 a page fault would be triggered resulting in EFAULT. Pointy hat: mjg Approved by: re (implicit) --- sys/amd64/amd64/support.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 168ea5b32bcf..043110226e22 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -410,7 +410,7 @@ copyout_fault: movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ - je 1 + je 1f .endif rep movsb -- cgit v1.3 From a128aaea9509f06e8de4e522a5521faee112e8a9 Mon Sep 17 00:00:00 2001 From: Glen Barber Date: Thu, 20 Sep 2018 23:59:42 +0000 Subject: Update head from ALPHA6 to ALPHA7 as part of the 12.0-RELEASE cycle. Approved by: re (implicit) Sponsored by: The FreeBSD Foundation --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index ce32cf2950af..e784b373150e 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -46,7 +46,7 @@ TYPE="FreeBSD" REVISION="12.0" -BRANCH="ALPHA6" +BRANCH="ALPHA7" if [ -n "${BRANCH_OVERRIDE}" ]; then BRANCH=${BRANCH_OVERRIDE} fi -- cgit v1.3 From b08d611de8351aaae68a204fa94711e0ed03945f Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 21 Sep 2018 01:37:08 +0000 Subject: fix vlan locking to permit sx acquisition in ioctl calls - update vlan(9) to handle changes earlier this year in multicast locking Tested by: np@, darkfiberu at gmail.com PR: 230510 Reviewed by: mjoras@, shurd@, sbruno@ Approved by: re (gjb@) Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D16808 --- sys/net/if_var.h | 1 + sys/net/if_vlan.c | 219 +++++++++++++++++++----------------------------------- 2 files changed, 77 insertions(+), 143 deletions(-) (limited to 'sys') diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 480df7433a6d..68341ef75025 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -411,6 +411,7 @@ struct ifnet { #define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) #define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) /* diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 11baf8a018b4..5170ee6ae35c 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -87,11 +87,11 @@ __FBSDID("$FreeBSD$"); #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) -LIST_HEAD(ifvlanhead, ifvlan); +CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ - struct rmlock lock; + struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ @@ -117,7 +117,7 @@ struct ifvlantrunk { struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ - LIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) + CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* @@ -146,13 +146,13 @@ struct ifvlantrunk { for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ - if (((_ifv) = LIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ + if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; - SLIST_ENTRY(vlan_mc_entry) mc_entries; + CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; }; struct ifvlan { @@ -173,9 +173,9 @@ struct ifvlan { uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ } ifv_mib; struct task lladdr_task; - SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; + CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY - LIST_ENTRY(ifvlan) ifv_list; + CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; #define ifv_proto ifv_mib.ifvm_proto @@ -205,55 +205,36 @@ static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* - * if_vlan uses two module-level locks to allow concurrent modification of vlan - * interfaces and (mostly) allow for vlans to be destroyed while they are being - * used for tx/rx. To accomplish this in a way that has acceptable performance - * and cooperation with other parts of the network stack there is a - * non-sleepable rmlock(9) and an sx(9). Both locks are exclusively acquired - * when destroying a vlan interface, i.e. when the if_vlantrunk field of struct - * ifnet is de-allocated and NULL'd. Thus a reader holding either lock has a - * guarantee that the struct ifvlantrunk references a valid vlan trunk. + * if_vlan uses two module-level synchronizations primitives to allow concurrent + * modification of vlan interfaces and (mostly) allow for vlans to be destroyed + * while they are being used for tx/rx. To accomplish this in a way that has + * acceptable performance and cooperation with other parts of the network stack + * there is a non-sleepable epoch(9) and an sx(9). * - * The performance-sensitive paths that warrant using the rmlock(9) are + * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use - * of an rmlock(9) gives a measureable improvement in performance. + * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * - * Note that despite these protections, there is still an inherent race in the - * destruction of vlans since there's no guarantee that the ifnet hasn't been - * freed/reused when the tx/rx functions are called by the stack. This can only - * be fixed by addressing ifnet's lifetime issues. */ -#define _VLAN_RM_ID ifv_rm_lock #define _VLAN_SX_ID ifv_sx -static struct rmlock _VLAN_RM_ID; static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ - rm_init(&_VLAN_RM_ID, "vlan_rm"); \ sx_init(&_VLAN_SX_ID, "vlan_sx") #define VLAN_LOCKING_DESTROY() \ - rm_destroy(&_VLAN_RM_ID); \ sx_destroy(&_VLAN_SX_ID) -#define _VLAN_RM_TRACKER _vlan_rm_tracker -#define VLAN_RLOCK() rm_rlock(&_VLAN_RM_ID, \ - &_VLAN_RM_TRACKER) -#define VLAN_RUNLOCK() rm_runlock(&_VLAN_RM_ID, \ - &_VLAN_RM_TRACKER) -#define VLAN_WLOCK() rm_wlock(&_VLAN_RM_ID) -#define VLAN_WUNLOCK() rm_wunlock(&_VLAN_RM_ID) -#define VLAN_RLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_RLOCKED) -#define VLAN_WLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_WLOCKED) -#define VLAN_RWLOCK_ASSERT() rm_assert(&_VLAN_RM_ID, RA_LOCKED) -#define VLAN_LOCK_READER struct rm_priotracker _VLAN_RM_TRACKER +#define VLAN_RLOCK() NET_EPOCH_ENTER(); +#define VLAN_RUNLOCK() NET_EPOCH_EXIT(); +#define VLAN_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) @@ -265,25 +246,18 @@ static struct sx _VLAN_SX_ID; /* - * We also have a per-trunk rmlock(9), that is locked shared on packet - * processing and exclusive when configuration is changed. Note: This should - * only be acquired while there is a shared lock on either of the global locks - * via VLAN_SLOCK or VLAN_RLOCK. Thus, an exclusive lock on the global locks - * makes a call to TRUNK_RLOCK/TRUNK_WLOCK technically superfluous. + * We also have a per-trunk mutex that should be acquired when changing + * its state. */ -#define _TRUNK_RM_TRACKER _trunk_rm_tracker -#define TRUNK_LOCK_INIT(trunk) rm_init(&(trunk)->lock, vlanname) -#define TRUNK_LOCK_DESTROY(trunk) rm_destroy(&(trunk)->lock) -#define TRUNK_RLOCK(trunk) rm_rlock(&(trunk)->lock, \ - &_TRUNK_RM_TRACKER) -#define TRUNK_WLOCK(trunk) rm_wlock(&(trunk)->lock) -#define TRUNK_RUNLOCK(trunk) rm_runlock(&(trunk)->lock, \ - &_TRUNK_RM_TRACKER) -#define TRUNK_WUNLOCK(trunk) rm_wunlock(&(trunk)->lock) -#define TRUNK_RLOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_RLOCKED) -#define TRUNK_LOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_LOCKED) -#define TRUNK_WLOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_WLOCKED) -#define TRUNK_LOCK_READER struct rm_priotracker _TRUNK_RM_TRACKER +#define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) +#define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) +#define TRUNK_RLOCK(trunk) NET_EPOCH_ENTER() +#define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) +#define TRUNK_RUNLOCK(trunk) NET_EPOCH_EXIT(); +#define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) +#define TRUNK_RLOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt)) +#define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) +#define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array @@ -361,7 +335,7 @@ vlan_inithash(struct ifvlantrunk *trunk) trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) - LIST_INIT(&trunk->hash[i]); + CK_SLIST_INIT(&trunk->hash[i]); } static void @@ -372,7 +346,7 @@ vlan_freehash(struct ifvlantrunk *trunk) KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) - KASSERT(LIST_EMPTY(&trunk->hash[i]), + KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); @@ -386,12 +360,12 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) int i, b; struct ifvlan *ifv2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); - LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); @@ -404,7 +378,7 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } - LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); + CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); @@ -416,15 +390,15 @@ vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) int i, b; struct ifvlan *ifv2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); - LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; - LIST_REMOVE(ifv2, ifv_list); + CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); @@ -444,7 +418,7 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; - TRUNK_WLOCK_ASSERT(trunk); + VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { @@ -460,21 +434,21 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) if (hwidth2 < VLAN_DEF_HWIDTH) return; - /* M_NOWAIT because we're called with trunk mutex held */ - hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_NOWAIT); + hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) - LIST_INIT(&hash2[j]); + CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) - while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) { - LIST_REMOVE(ifv, ifv_list); + while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { + CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); - LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); + CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } + NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; @@ -492,7 +466,7 @@ vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) TRUNK_RLOCK_ASSERT(trunk); - LIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) + CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); @@ -508,7 +482,7 @@ vlan_dumphash(struct ifvlantrunk *trunk) for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); - LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) + CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } @@ -561,7 +535,6 @@ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); - VLAN_WLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; @@ -587,23 +560,19 @@ vlan_setmulti(struct ifnet *ifp) struct vlan_mc_entry *mc; int error; - /* - * XXX This stupidly needs the rmlock to avoid sleeping while holding - * the in6_multi_mtx (see in6_mc_join_locked). - */ - VLAN_RWLOCK_ASSERT(); + VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; - TRUNK_WLOCK_ASSERT(TRUNK(sc)); ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ - while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { - SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); + while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { + CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); + NET_EPOCH_WAIT(); free(mc, M_VLAN); } @@ -619,10 +588,10 @@ vlan_setmulti(struct ifnet *ifp) } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; - SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); + CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); - SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { + CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) @@ -645,7 +614,6 @@ vlan_iflladdr(void *arg __unused, struct ifnet *ifp) struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; - VLAN_LOCK_READER; /* Need the rmlock since this is run on taskqueue_swi. */ VLAN_RLOCK(); @@ -724,12 +692,10 @@ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; - VLAN_LOCK_READER; if (ifp->if_type != IFT_L2VLAN) return (NULL); - /* Not clear if callers are sleepable, so acquire the rmlock. */ VLAN_RLOCK(); ifv = ifp->if_softc; ifp = NULL; @@ -809,10 +775,7 @@ vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; - TRUNK_LOCK_READER; - /* Not clear if callers are sleepable, so acquire the rmlock. */ VLAN_RLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { @@ -820,11 +783,9 @@ vlan_devat(struct ifnet *ifp, uint16_t vid) return (NULL); } ifp = NULL; - TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; - TRUNK_RUNLOCK(trunk); VLAN_RUNLOCK(); return (ifp); } @@ -1076,7 +1037,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) if_rele(p); return (ENOSPC); } - SLIST_INIT(&ifv->vlan_mc_listhead); + CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because @@ -1143,6 +1104,7 @@ vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); + NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); @@ -1167,7 +1129,6 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; - VLAN_LOCK_READER; VLAN_RLOCK(); ifv = ifp->if_softc; @@ -1227,8 +1188,6 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; - TRUNK_LOCK_READER; struct m_tag *mtag; uint16_t vid, tag; @@ -1289,16 +1248,13 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) vid = EVL_VLANOFTAG(tag); - TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { - TRUNK_RUNLOCK(trunk); - if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); VLAN_RUNLOCK(); + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } - TRUNK_RUNLOCK(trunk); if (vlan_mtag_pcp) { /* @@ -1369,22 +1325,19 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) if (ifv->ifv_trunk) return (EBUSY); - /* Acquire rmlock after the branch so we can M_WAITOK. */ VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); - VLAN_WLOCK(); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); + TRUNK_WUNLOCK(trunk); } else { - VLAN_WLOCK(); trunk = p->if_vlantrunk; - TRUNK_WLOCK(trunk); } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ @@ -1448,7 +1401,9 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_link_state = p->if_link_state; + TRUNK_RLOCK(TRUNK(ifv)); vlan_capabilities(ifv); + TRUNK_RUNLOCK(TRUNK(ifv)); /* * Set up our interface address to reflect the underlying @@ -1458,12 +1413,6 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; - /* - * Configure multicast addresses that may already be - * joined on the vlan device. - */ - (void)vlan_setmulti(ifp); - TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); /* We are ready for operation now. */ @@ -1471,13 +1420,14 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); -done: + /* - * We need to drop the non-sleepable rmlock so that the underlying - * devices can sleep in their vlan_config hooks. + * Configure multicast addresses that may already be + * joined on the vlan device. */ - TRUNK_WUNLOCK(trunk); - VLAN_WUNLOCK(); + (void)vlan_setmulti(ifp); + +done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); @@ -1510,13 +1460,6 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) parent = NULL; if (trunk != NULL) { - /* - * Both vlan_transmit and vlan_input rely on the trunk fields - * being NULL to determine whether to bail, so we need to get - * an exclusive lock here to prevent them from using bad - * ifvlans. - */ - VLAN_WLOCK(); parent = trunk->parent; /* @@ -1524,7 +1467,7 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ - while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { + while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already @@ -1541,19 +1484,14 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) "Failed to delete multicast address from parent: %d\n", error); } - SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); + CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); + NET_EPOCH_WAIT(); free(mc, M_VLAN); } vlan_setflags(ifp, 0); /* clear special flags on parent */ - /* - * The trunk lock isn't actually required here, but - * vlan_remhash expects it. - */ - TRUNK_WLOCK(trunk); vlan_remhash(trunk, ifv); - TRUNK_WUNLOCK(trunk); ifv->ifv_trunk = NULL; /* @@ -1561,9 +1499,9 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; + NET_EPOCH_WAIT(); trunk_destroy(trunk); } - VLAN_WUNLOCK(); } /* Disconnect from parent. */ @@ -1640,7 +1578,6 @@ vlan_link_state(struct ifnet *ifp) { struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_LOCK_READER; /* Called from a taskqueue_swi task, so we cannot sleep. */ VLAN_RLOCK(); @@ -1670,7 +1607,7 @@ vlan_capabilities(struct ifvlan *ifv) u_long hwa = 0; VLAN_SXLOCK_ASSERT(); - TRUNK_WLOCK_ASSERT(TRUNK(ifv)); + TRUNK_RLOCK_ASSERT(TRUNK(ifv)); p = PARENT(ifv); ifp = ifv->ifv_ifp; @@ -1771,11 +1708,11 @@ vlan_trunk_capabilities(struct ifnet *ifp) VLAN_SUNLOCK(); return; } - TRUNK_WLOCK(trunk); + TRUNK_RLOCK(trunk); VLAN_FOREACH(ifv, trunk) { vlan_capabilities(ifv); } - TRUNK_WUNLOCK(trunk); + TRUNK_RUNLOCK(trunk); VLAN_SUNLOCK(); } @@ -1789,7 +1726,6 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; - VLAN_LOCK_READER; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; @@ -1925,16 +1861,13 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ - VLAN_RLOCK(); + VLAN_XLOCK(); trunk = TRUNK(ifv); - if (trunk != NULL) { - TRUNK_WLOCK(trunk); + if (trunk != NULL) error = vlan_setmulti(ifp); - TRUNK_WUNLOCK(trunk); - } - VLAN_RUNLOCK(); - break; + VLAN_XUNLOCK(); + break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { @@ -1971,9 +1904,9 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { - TRUNK_WLOCK(trunk); + TRUNK_RLOCK(trunk); vlan_capabilities(ifv); - TRUNK_WUNLOCK(trunk); + TRUNK_RUNLOCK(trunk); } VLAN_SUNLOCK(); break; -- cgit v1.3 From bb322680862c8eef7a6dae9e19f4e8dd52c52eba Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 21 Sep 2018 12:27:36 +0000 Subject: amd64: check for small size in memmove, memcpy and memset If the size is 15 bytes or less avoid spinning up rep just to copy the 8 bytes. In my tests on EPYC and old Intel microarchs without ERMS (like Westmere) it provided a nice win over the current version (e.g. for EPYC memset with 15 bytes of size goes from 59712651 ops/s to 70600095) all while almost not pessimizing the other cases. Data collected during package building shows that < 16 sizes are pretty common. Verified with the glibc test suite. Approved by: re (kib) --- sys/amd64/amd64/support.S | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 043110226e22..c7f0d9472b66 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -116,6 +116,8 @@ ENTRY(memmove_std) cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 2f + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq @@ -124,6 +126,7 @@ ENTRY(memmove_std) jne 1f POP_FRAME_POINTER ret + ALIGN_TEXT 1: rep movsb @@ -191,6 +194,8 @@ ENTRY(memcpy_std) PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq @@ -199,6 +204,7 @@ ENTRY(memcpy_std) jne 1f POP_FRAME_POINTER ret + ALIGN_TEXT 1: rep movsb @@ -227,6 +233,8 @@ ENTRY(memset_std) movzbq %sil,%r8 movabs $0x0101010101010101,%rax imulq %r8,%rax + cmpq $15,%rcx + jbe 1f shrq $3,%rcx rep stosq @@ -236,6 +244,7 @@ ENTRY(memset_std) movq %r9,%rax POP_FRAME_POINTER ret + ALIGN_TEXT 1: rep stosb -- cgit v1.3 From 18c00da8065d999152abd3ddd2dfd0bf156df113 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Fri, 21 Sep 2018 13:02:25 +0000 Subject: remove double space between branch and version in kernel ident Reported by: dim Approved by: re (kib) Sponsored by: The FreeBSD Foundation --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index e784b373150e..9603de313ecf 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -293,7 +293,7 @@ done shift $((OPTIND - 1)) if [ -z "${include_metadata}" ]; then - VERINFO="${VERSION} ${svn}${git}${hg}${p4version}" + VERINFO="${VERSION}${svn}${git}${hg}${p4version}" VERSTR="${VERINFO}\\n" else VERINFO="${VERSION} #${v}${svn}${git}${hg}${p4version}: ${t}" -- cgit v1.3 From d6fda03a64403a2e10340a3b29f898461962fe5b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 21 Sep 2018 13:20:41 +0000 Subject: select: stop doing zero-sized memsets Approved by: re (kib) --- sys/kern/sys_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 649cfb191b7f..455220c36752 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1031,8 +1031,9 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ - bzero((char *)ibits[x] + ncpubytes, \ - ncpbytes - ncpubytes); \ + if (ncpbytes != ncpubytes) \ + bzero((char *)ibits[x] + ncpubytes, \ + ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); -- cgit v1.3 From f789d9839a1e38854616c1bc789f715196cecbf8 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Fri, 21 Sep 2018 13:43:06 +0000 Subject: Include kernel ident in uname In non-reproducible mode we have the kernel ident as a side effect of including the build directory. Explicitly add it to the ident string in reproducible mode. Reported by: mjg Approved by: re (gjb) Sponsored by: The FreeBSD Foundation --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index 9603de313ecf..db6f66d7c52c 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -293,7 +293,7 @@ done shift $((OPTIND - 1)) if [ -z "${include_metadata}" ]; then - VERINFO="${VERSION}${svn}${git}${hg}${p4version}" + VERINFO="${VERSION}${svn}${git}${hg}${p4version} ${i}" VERSTR="${VERINFO}\\n" else VERINFO="${VERSION} #${v}${svn}${git}${hg}${p4version}: ${t}" -- cgit v1.3 From 50f5c94edb1746add6a9fdbf2d3c323430836c11 Mon Sep 17 00:00:00 2001 From: "Andrey V. Elsukov" Date: Fri, 21 Sep 2018 13:44:05 +0000 Subject: Fix possible NULL pointer dereference in ffec_alloc_mbufcl(). PR: 231514 Approved by: re (kib) MFC after: 1 week --- sys/dev/ffec/if_ffec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/dev/ffec/if_ffec.c b/sys/dev/ffec/if_ffec.c index ea291ab8df2d..d52bf9a4e3d5 100644 --- a/sys/dev/ffec/if_ffec.c +++ b/sys/dev/ffec/if_ffec.c @@ -801,7 +801,8 @@ ffec_alloc_mbufcl(struct ffec_softc *sc) struct mbuf *m; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); - m->m_pkthdr.len = m->m_len = m->m_ext.ext_size; + if (m != NULL) + m->m_pkthdr.len = m->m_len = m->m_ext.ext_size; return (m); } -- cgit v1.3 From 68c7542edfe81f74ac57cf5b09c5bc872754c462 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 21 Sep 2018 15:00:46 +0000 Subject: amd64: even up copyin/copyout with memcpy + other cleanup - _fault handlers for both primitives are identical, provide just one - change the copying scheme to match memcpy (in particular jump avoidance for the most common case of multiply of 8) - stop re-reading pcb address on exit, just store it locally (in r9) Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17265 --- sys/amd64/amd64/support.S | 56 +++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index c7f0d9472b66..ee4c6ee3dd5d 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -309,9 +309,9 @@ END(fillw) */ .macro COPYOUT smap erms PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax + movq PCPU(CURPCB),%r9 /* Trap entry clears PSL.AC */ - movq $copyout_fault,PCB_ONFAULT(%rax) + movq $copy_fault,PCB_ONFAULT(%r9) testq %rdx,%rdx /* anything to do? */ jz 2f @@ -327,7 +327,7 @@ END(fillw) */ movq %rsi,%rax addq %rdx,%rax - jc copyout_fault + jc copy_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it @@ -336,7 +336,7 @@ END(fillw) */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax - ja copyout_fault + ja copy_fault xchgq %rdi,%rsi /* bcopy(%rsi, %rdi, %rdx) */ @@ -344,21 +344,27 @@ END(fillw) SMAP_DISABLE \smap .if \erms == 0 + cmpq $15,%rcx + jbe 1f shrq $3,%rcx rep movsq movb %dl,%cl andb $7,%cl - je 1f + jne 1f + SMAP_ENABLE \smap + xorl %eax,%eax + movq %rax,PCB_ONFAULT(%r9) + POP_FRAME_POINTER + ret .endif +1: rep movsb -1: SMAP_ENABLE \smap 2: xorl %eax,%eax - movq PCPU(CURPCB),%rdx - movq %rax,PCB_ONFAULT(%rdx) + movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret .endmacro @@ -379,22 +385,14 @@ ENTRY(copyout_smap_erms) COPYOUT smap=1 erms=1 END(copyout_smap_erms) - ALIGN_TEXT -copyout_fault: - movq PCPU(CURPCB),%rdx - movq $0,PCB_ONFAULT(%rdx) - movq $EFAULT,%rax - POP_FRAME_POINTER - ret - /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ .macro COPYIN smap erms PUSH_FRAME_POINTER - movq PCPU(CURPCB),%rax - movq $copyin_fault,PCB_ONFAULT(%rax) + movq PCPU(CURPCB),%r9 + movq $copy_fault,PCB_ONFAULT(%r9) testq %rdx,%rdx /* anything to do? */ jz 2f @@ -403,10 +401,10 @@ copyout_fault: */ movq %rdi,%rax addq %rdx,%rax - jc copyin_fault + jc copy_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax - ja copyin_fault + ja copy_fault xchgq %rdi,%rsi movq %rdx,%rcx @@ -414,22 +412,28 @@ copyout_fault: SMAP_DISABLE \smap .if \erms == 0 + cmpq $15,%rcx + jbe 1f shrq $3,%rcx /* copy longword-wise */ rep movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ - je 1f + jne 1f + SMAP_ENABLE \smap + xorl %eax,%eax + movq %rax,PCB_ONFAULT(%r9) + POP_FRAME_POINTER + ret .endif +1: rep movsb -1: SMAP_ENABLE \smap 2: xorl %eax,%eax - movq PCPU(CURPCB),%rdx - movq %rax,PCB_ONFAULT(%rdx) + movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret .endmacro @@ -451,10 +455,10 @@ ENTRY(copyin_smap_erms) END(copyin_smap_erms) ALIGN_TEXT -copyin_fault: +copy_fault: movq PCPU(CURPCB),%rdx movq $0,PCB_ONFAULT(%rdx) - movq $EFAULT,%rax + movl $EFAULT,%eax POP_FRAME_POINTER ret -- cgit v1.3 From d995b5b1ea85a0a584a258a75aae47f220d6a4cd Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 21 Sep 2018 17:53:06 +0000 Subject: Convert x86 TLB top-level invalidation functions to ifuncs. Note that shootdown IPI handlers are already per-mode. Suggested by: alc Reviewed by: alc, markj Tested by: pho Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D17184 --- sys/amd64/amd64/pmap.c | 365 +++++++++++++++++++++++++++++++------------------ 1 file changed, 234 insertions(+), 131 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d4d91e7c23a9..cb3625138a6e 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1704,15 +1704,94 @@ pmap_invalidate_ept(pmap_t pmap) sched_unpin(); } -void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +static inline void +pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, + const bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; + cpuid = PCPU_GET(cpuid); + if (pmap == PCPU_GET(curpmap)) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + /* + * Because pm_pcid is recalculated on a + * context switch, we must disable switching. + * Otherwise, we might use a stale value + * below. + */ + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, va); + } + critical_exit(); + } + } else + pmap->pm_pcids[cpuid].pm_gen = 0; + + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } + + /* + * The fence is between stores to pm_gen and the read of the + * pm_active mask. We need to ensure that it is impossible + * for us to miss the bit update in pm_active and + * simultaneously observe a non-zero pm_gen in + * pmap_activate_sw(), otherwise TLB update is missed. + * Without the fence, IA32 allows such an outcome. Note that + * pm_active is updated by a locked operation, which provides + * the reciprocal fence. + */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) +{ + + pmap_invalidate_page_pcid(pmap, va, true); +} + +static void +pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) +{ + + pmap_invalidate_page_pcid(pmap, va, false); +} + +static void +pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) +{ +} + +DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t), + static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : + pmap_invalidate_page_pcid_noinvpcid); + return (pmap_invalidate_page_nopcid); +} + +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + cpuset_t *mask; + if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; @@ -1726,52 +1805,9 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) invlpg(va); mask = &all_cpus; } else { - cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) { + if (pmap == PCPU_GET(curpmap)) invlpg(va); - if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { - /* - * Disable context switching. pm_pcid - * is recalculated on switch, which - * might make us use wrong pcid below. - */ - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - - if (invpcid_works) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = va; - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | - CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlpg(ucr3, kcr3, va); - } - critical_exit(); - } - } else if (pmap_pcid_enabled) - pmap->pm_pcids[cpuid].pm_gen = 0; - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - - /* - * The fence is between stores to pm_gen and the read of - * the pm_active mask. We need to ensure that it is - * impossible for us to miss the bit update in pm_active - * and simultaneously observe a non-zero pm_gen in - * pmap_activate_sw(), otherwise TLB update is missed. - * Without the fence, IA32 allows such an outcome. - * Note that pm_active is updated by a locked operation, - * which provides the reciprocal fence. - */ - atomic_thread_fence_seq_cst(); - } + pmap_invalidate_page_mode(pmap, va); mask = &pmap->pm_active; } smp_masked_invlpg(*mask, va, pmap); @@ -1781,16 +1817,82 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) -void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +static void +pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + const bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; - vm_offset_t addr; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; + cpuid = PCPU_GET(cpuid); + if (pmap == PCPU_GET(curpmap)) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = sva; + for (; d.addr < eva; d.addr += PAGE_SIZE) + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); + } + critical_exit(); + } + } else + pmap->pm_pcids[cpuid].pm_gen = 0; + + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } + /* See the comment in pmap_invalidate_page_pcid(). */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, + vm_offset_t eva) +{ + + pmap_invalidate_range_pcid(pmap, sva, eva, true); +} + +static void +pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, + vm_offset_t eva) +{ + + pmap_invalidate_range_pcid(pmap, sva, eva, false); +} + +static void +pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ +} + +DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, + vm_offset_t), static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : + pmap_invalidate_range_pcid_noinvpcid); + return (pmap_invalidate_range_nopcid); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + cpuset_t *mask; + vm_offset_t addr; + if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; @@ -1805,7 +1907,6 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); - cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); @@ -1814,112 +1915,114 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = sva; - for (; d.addr < eva; d.addr += - PAGE_SIZE) - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | - CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlrng(ucr3, kcr3, sva, - eva); - } - critical_exit(); - } - } else if (pmap_pcid_enabled) { - pmap->pm_pcids[cpuid].pm_gen = 0; - } - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - /* See the comment in pmap_invalidate_page(). */ - atomic_thread_fence_seq_cst(); } + pmap_invalidate_range_mode(pmap, sva, eva); mask = &pmap->pm_active; } smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } -void -pmap_invalidate_all(pmap_t pmap) +static inline void +pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { - cpuset_t *mask; struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; - if (pmap_type_guest(pmap)) { - pmap_invalidate_ept(pmap); - return; - } - - KASSERT(pmap->pm_type == PT_X86, - ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); - - sched_pin(); if (pmap == kernel_pmap) { - if (pmap_pcid_enabled && invpcid_works) { + if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } - mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { - if (pmap_pcid_enabled) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works) { - d.pcid = pcid; - d.pad = 0; - d.addr = 0; + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); - if (pmap->pm_ucr3 != PMAP_NO_CR3) { - d.pcid |= PMAP_PCID_USER_PT; - invpcid(&d, INVPCID_CTX); - } - } else { - kcr3 = pmap->pm_cr3 | pcid; - ucr3 = pmap->pm_ucr3; - if (ucr3 != PMAP_NO_CR3) { - ucr3 |= pcid | PMAP_PCID_USER_PT; - pmap_pti_pcid_invalidate(ucr3, - kcr3); - } else { - load_cr3(kcr3); - } } - critical_exit(); } else { - invltlb(); + kcr3 = pmap->pm_cr3 | pcid; + ucr3 = pmap->pm_ucr3; + if (ucr3 != PMAP_NO_CR3) { + ucr3 |= pcid | PMAP_PCID_USER_PT; + pmap_pti_pcid_invalidate(ucr3, kcr3); + } else { + load_cr3(kcr3); + } } - } else if (pmap_pcid_enabled) { + critical_exit(); + } else pmap->pm_pcids[cpuid].pm_gen = 0; - } - if (pmap_pcid_enabled) { - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; - } - /* See the comment in pmap_invalidate_page(). */ - atomic_thread_fence_seq_cst(); - } - mask = &pmap->pm_active; } + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } + /* See the comment in pmap_invalidate_page_pcid(). */ + atomic_thread_fence_seq_cst(); +} + +static void +pmap_invalidate_all_pcid_invpcid(pmap_t pmap) +{ + + pmap_invalidate_all_pcid(pmap, true); +} + +static void +pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) +{ + + pmap_invalidate_all_pcid(pmap, false); +} + +static void +pmap_invalidate_all_nopcid(pmap_t pmap) +{ + + if (pmap == kernel_pmap) + invltlb_glob(); + else if (pmap == PCPU_GET(curpmap)) + invltlb(); +} + +DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static) +{ + + if (pmap_pcid_enabled) + return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : + pmap_invalidate_all_pcid_noinvpcid); + return (pmap_invalidate_all_nopcid); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + cpuset_t *mask; + + if (pmap_type_guest(pmap)) { + pmap_invalidate_ept(pmap); + return; + } + + KASSERT(pmap->pm_type == PT_X86, + ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); + + sched_pin(); + mask = pmap == kernel_pmap ? &all_cpus : &pmap->pm_active; + pmap_invalidate_all_mode(pmap); smp_masked_invltlb(*mask, pmap); sched_unpin(); } -- cgit v1.3 From 6cdde6fda2805b6003dd4d2b79ffb92151421455 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 21 Sep 2018 20:20:03 +0000 Subject: Use the GNU as-compatible .endm instead of .endmacro. Approved by: re (gjb) --- sys/amd64/amd64/support.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index ee4c6ee3dd5d..7d580b67656a 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -294,14 +294,14 @@ END(fillw) .if \smap stac .endif -.endmacro +.endm .macro SMAP_ENABLE smap .if \smap clac .endif -.endmacro +.endm /* * copyout(from_kernel, to_user, len) @@ -367,7 +367,7 @@ END(fillw) movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret -.endmacro +.endm ENTRY(copyout_nosmap_std) COPYOUT smap=0 erms=0 @@ -436,7 +436,7 @@ END(copyout_smap_erms) movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret -.endmacro +.endm ENTRY(copyin_nosmap_std) COPYIN smap=0 erms=0 -- cgit v1.3 From 061bbaf7e7fe46b3229bab050b604c27bde6becd Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Sat, 22 Sep 2018 01:24:30 +0000 Subject: cxgbe(4): Reuse existing "switching" L2T entries when possible. Approved by: re@ (rgrimes@) Sponsored by: Chelsio Communications --- sys/dev/cxgbe/t4_filter.c | 61 +++++++++++-------------------- sys/dev/cxgbe/t4_l2t.c | 91 ++++++++++++++++++++++++++++++++--------------- sys/dev/cxgbe/t4_l2t.h | 3 +- 3 files changed, 86 insertions(+), 69 deletions(-) (limited to 'sys') diff --git a/sys/dev/cxgbe/t4_filter.c b/sys/dev/cxgbe/t4_filter.c index fe26b47566e1..4f325bdd235d 100644 --- a/sys/dev/cxgbe/t4_filter.c +++ b/sys/dev/cxgbe/t4_filter.c @@ -593,13 +593,8 @@ set_tcamfilter(struct adapter *sc, struct t4_filter *t, struct l2t_entry *l2te, } } mtx_unlock(&sc->tids.ftid_lock); - if (rc != 0) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); + if (rc != 0) return (rc); - } /* * Can't fail now. A set-filter WR will definitely be sent. @@ -817,8 +812,8 @@ int set_filter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; - struct l2t_entry *l2te; - struct smt_entry *smt; + struct l2t_entry *l2te = NULL; + struct smt_entry *smt = NULL; uint64_t ftuple; int rc; @@ -942,43 +937,41 @@ done: * Allocate L2T entry, SMT entry, etc. */ - l2te = NULL; if (t->fs.newdmac || t->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ - l2te = t4_l2t_alloc_switching(sc->l2t); - if (__predict_false(l2te == NULL)) - return (EAGAIN); - rc = t4_l2t_set_switching(sc, l2te, t->fs.vlan, t->fs.eport, + l2te = t4_l2t_alloc_switching(sc, t->fs.vlan, t->fs.eport, t->fs.dmac); - if (rc) { - t4_l2t_release(l2te); - return (ENOMEM); + if (__predict_false(l2te == NULL)) { + rc = EAGAIN; + goto error; } } - smt = NULL; if (t->fs.newsmac) { /* This filter needs an SMT entry; allocate one. */ smt = t4_smt_alloc_switching(sc->smt, t->fs.smac); if (__predict_false(smt == NULL)) { - if (l2te != NULL) - t4_l2t_release(l2te); - return (EAGAIN); + rc = EAGAIN; + goto error; } rc = t4_smt_set_switching(sc, smt, 0x0, t->fs.smac); - if (rc) { - t4_smt_release(smt); - if (l2te != NULL) - t4_l2t_release(l2te); - return (rc); - } + if (rc) + goto error; } if (t->fs.hash) - return (set_hashfilter(sc, t, ftuple, l2te, smt)); + rc = set_hashfilter(sc, t, ftuple, l2te, smt); else - return (set_tcamfilter(sc, t, l2te, smt)); + rc = set_tcamfilter(sc, t, l2te, smt); + if (rc != 0 && rc != EINPROGRESS) { +error: + if (l2te) + t4_l2t_release(l2te); + if (smt) + t4_smt_release(smt); + } + return (rc); } static int @@ -1552,10 +1545,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, f = malloc(sizeof(*f), M_CXGBE, M_ZERO | M_NOWAIT); if (__predict_false(f == NULL)) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); rc = ENOMEM; goto done; } @@ -1565,10 +1554,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, atid = alloc_atid(sc, f); if (__predict_false(atid) == -1) { - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); free(f, M_CXGBE); rc = EAGAIN; goto done; @@ -1579,10 +1564,6 @@ set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, &cookie); if (wr == NULL) { free_atid(sc, atid); - if (l2te) - t4_l2t_release(l2te); - if (smt) - t4_smt_release(smt); free(f, M_CXGBE); rc = ENOMEM; goto done; diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 22e84d320aa2..5e96579f1717 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -108,6 +108,44 @@ found: return (e); } +static struct l2t_entry * +find_or_alloc_l2e(struct l2t_data *d, uint16_t vlan, uint8_t port, uint8_t *dmac) +{ + struct l2t_entry *end, *e, **p; + struct l2t_entry *first_free = NULL; + + for (e = &d->l2tab[0], end = &d->l2tab[d->l2t_size]; e != end; ++e) { + if (atomic_load_acq_int(&e->refcnt) == 0) { + if (!first_free) + first_free = e; + } else if (e->state == L2T_STATE_SWITCHING && + memcmp(e->dmac, dmac, ETHER_ADDR_LEN) == 0 && + e->vlan == vlan && e->lport == port) + return (e); /* Found existing entry that matches. */ + } + + if (first_free == NULL) + return (NULL); /* No match and no room for a new entry. */ + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + e = first_free; + if (e->state < L2T_STATE_SWITCHING) { + for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) { + if (*p == e) { + *p = e->next; + e->next = NULL; + break; + } + } + } + e->state = L2T_STATE_UNUSED; + return (e); +} + + /* * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. @@ -154,41 +192,38 @@ t4_write_l2e(struct l2t_entry *e, int sync) * address resolution updates do not see them. */ struct l2t_entry * -t4_l2t_alloc_switching(struct l2t_data *d) +t4_l2t_alloc_switching(struct adapter *sc, uint16_t vlan, uint8_t port, + uint8_t *eth_addr) { + struct l2t_data *d = sc->l2t; struct l2t_entry *e; + int rc; rw_wlock(&d->lock); - e = t4_alloc_l2e(d); + e = find_or_alloc_l2e(d, vlan, port, eth_addr); if (e) { - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - e->state = L2T_STATE_SWITCHING; - atomic_store_rel_int(&e->refcnt, 1); - mtx_unlock(&e->lock); + if (atomic_load_acq_int(&e->refcnt) == 0) { + mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->wrq = &sc->sge.ctrlq[0]; + e->iqid = sc->sge.fwq.abs_id; + e->state = L2T_STATE_SWITCHING; + e->vlan = vlan; + e->lport = port; + memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); + atomic_store_rel_int(&e->refcnt, 1); + atomic_subtract_int(&d->nfree, 1); + rc = t4_write_l2e(e, 0); + mtx_unlock(&e->lock); + if (rc != 0) + e = NULL; + } else { + MPASS(e->vlan == vlan); + MPASS(e->lport == port); + atomic_add_int(&e->refcnt, 1); + } } rw_wunlock(&d->lock); - return e; -} - -/* - * Sets/updates the contents of a switching L2T entry that has been allocated - * with an earlier call to @t4_l2t_alloc_switching. - */ -int -t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan, - uint8_t port, uint8_t *eth_addr) -{ - int rc; - - e->vlan = vlan; - e->lport = port; - e->wrq = &sc->sge.ctrlq[0]; - e->iqid = sc->sge.fwq.abs_id; - memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); - mtx_lock(&e->lock); - rc = t4_write_l2e(e, 0); - mtx_unlock(&e->lock); - return (rc); + return (e); } int diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 21c392018fe8..188669e80261 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -91,7 +91,8 @@ struct l2t_data { int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); struct l2t_entry *t4_alloc_l2e(struct l2t_data *); -struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *); +struct l2t_entry *t4_l2t_alloc_switching(struct adapter *, uint16_t, uint8_t, + uint8_t *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); int t4_write_l2e(struct l2t_entry *, int); -- cgit v1.3 From 334107b91b349741a2ffc5d9788a51fd8ab43156 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 22 Sep 2018 11:39:30 +0000 Subject: vfs: __predict common case in VFS_EPILOGUE/PROLOGUE NFS is the only in-tree filesystem using the feature, but all ops test for it. Currently the resulting sigdefer calls have to be jumped over in the common case. This is a bandaid, longer term fix will move this feature away. Approved by: re (kib) --- sys/sys/signalvar.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index 3b5f750db5c9..c6c7d083cefb 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -349,7 +349,7 @@ static inline int sigdeferstop(int mode) { - if (mode == SIGDEFERSTOP_NOP) + if (__predict_true(mode == SIGDEFERSTOP_NOP)) return (SIGDEFERSTOP_VAL_NCHG); return (sigdeferstop_impl(mode)); } @@ -358,7 +358,7 @@ static inline void sigallowstop(int prev) { - if (prev == SIGDEFERSTOP_VAL_NCHG) + if (__predict_true(prev == SIGDEFERSTOP_VAL_NCHG)) return; sigallowstop_impl(prev); } -- cgit v1.3 From 108ff63e8a83757adff13ebdc8c3360a596b70b2 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 22 Sep 2018 17:04:39 +0000 Subject: Further reorganize pmap_invalidate TLB code. Split calculation of mask for shootdown IPI and local invalidation. Reorder IPI before local. Suggested by: alc Reviewed by: alc, markj Tested by: pho Sponsored by: The FreeBSD Foundation Approved by: re (rgrimes) Differential revision: https://reviews.freebsd.org/D17277 --- sys/amd64/amd64/pmap.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index cb3625138a6e..ab8cc7d1b6f8 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1704,6 +1704,13 @@ pmap_invalidate_ept(pmap_t pmap) sched_unpin(); } +static cpuset_t +pmap_invalidate_cpu_mask(pmap_t pmap) +{ + + return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); +} + static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) @@ -1790,7 +1797,6 @@ DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t), void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - cpuset_t *mask; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); @@ -1801,16 +1807,14 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); + smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); if (pmap == kernel_pmap) { invlpg(va); - mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); - mask = &pmap->pm_active; } - smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } @@ -1890,7 +1894,6 @@ DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - cpuset_t *mask; vm_offset_t addr; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { @@ -1907,19 +1910,17 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); + smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); - mask = &pmap->pm_active; } - smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } @@ -2010,7 +2011,6 @@ DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static) void pmap_invalidate_all(pmap_t pmap) { - cpuset_t *mask; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); @@ -2021,9 +2021,8 @@ pmap_invalidate_all(pmap_t pmap) ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); - mask = pmap == kernel_pmap ? &all_cpus : &pmap->pm_active; + smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); pmap_invalidate_all_mode(pmap); - smp_masked_invltlb(*mask, pmap); sched_unpin(); } -- cgit v1.3 From b7befdf50915fe8dda0cbf33e2ffb907c04a2d03 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 22 Sep 2018 17:05:49 +0000 Subject: Correct panic messages. Reviewed by: mckusick Sponsored by: The FreeBSD Foundation Approved by: re (rgrimes) MFC after: 1 week --- sys/ufs/ffs/ffs_softdep.c | 49 ++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'sys') diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 89d0b7382e4d..dcc72b2fdfd3 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -10261,22 +10261,22 @@ initiate_write_inodeblock_ufs1(inodedep, bp) prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "direct pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) - panic("%s: indirect pointer #%jd mismatch %d != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs1: " + "indirect pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs1: " + "Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10299,7 +10299,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10307,7 +10308,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp) #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs1: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } @@ -10429,18 +10431,18 @@ initiate_write_inodeblock_ufs2(inodedep, bp) adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) - panic("softdep_write_inodeblock: lbn order"); + panic("initiate_write_inodeblock_ufs2: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "ext pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10461,7 +10463,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep1"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } @@ -10494,22 +10497,22 @@ initiate_write_inodeblock_ufs2(inodedep, bp) prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) - panic("%s: direct pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock", + panic("initiate_write_inodeblock_ufs2: " + "direct pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) - panic("%s indirect pointer #%jd mismatch %jd != %jd", - "softdep_write_inodeblock:", + panic("initiate_write_inodeblock_ufs2: " + "indirect pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) - panic("softdep_write_inodeblock: Unknown state 0x%x", - adp->ad_state); + panic("initiate_write_inodeblock_ufs2: Unknown " + "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; @@ -10532,7 +10535,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) - panic("softdep_write_inodeblock: lost dep2"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } @@ -10540,7 +10544,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp) #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) - panic("softdep_write_inodeblock: lost dep3"); + panic("initiate_write_inodeblock_ufs2: " + "lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } -- cgit v1.3 From 078a49a077790d2b57ebf8cbccc2148f5ce3358f Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 23 Sep 2018 16:37:32 +0000 Subject: Remove the unused parameter 'locked' from the function syncache_respond(). There is no functional change. The parameter became unused in r313330, but wasn't removed. Approved by: re (kib@) MFC after: 1 month Sponsored by: Netflix, Inc. --- sys/netinet/tcp_syncache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'sys') diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index e5a184fb713e..9eefd0d948d7 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -130,7 +130,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -static int syncache_respond(struct syncache *, struct syncache_head *, int, +static int syncache_respond(struct syncache *, struct syncache_head *, const struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); @@ -489,7 +489,7 @@ syncache_timer(void *xsch) free(s, M_TCPLOG); } - syncache_respond(sc, sch, 1, NULL); + syncache_respond(sc, sch, NULL); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } @@ -1413,7 +1413,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (syncache_respond(sc, sch, 1, m) == 0) { + if (syncache_respond(sc, sch, m) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1577,7 +1577,7 @@ skip_alloc: /* * Do a standard 3-way handshake. */ - if (syncache_respond(sc, sch, 0, m) == 0) { + if (syncache_respond(sc, sch, m) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1622,7 +1622,7 @@ tfo_expanded: * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int -syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, +syncache_respond(struct syncache *sc, struct syncache_head *sch, const struct mbuf *m0) { struct ip *ip = NULL; -- cgit v1.3 From 9afff6b1c001084692b8b76fb27b4c07c2edc712 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 23 Sep 2018 19:00:06 +0000 Subject: Eliminate false sharing in malloc due to statistic collection Currently stats are collected in a MAXCPU-sized array which is not aligned and suffers enormous false-sharing. Fix the problem by utilizing per-cpu allocation. The counter(9) API is not used here as it is too incomplete and does not provide a win over per-cpu zone sized for malloc stats struct. In particular stats are being reported for each cpu separately by just copying what is supposed to be an array element for given cpu. This eliminates significant false-sharing during malloc-heavy tests e.g. on Skylake. See the review for details. Reviewed by: markj Approved by: re (kib) Differential Revision: https://reviews.freebsd.org/D17289 --- sys/kern/kern_malloc.c | 35 +++++++++++++++++++++++++++-------- sys/sys/malloc.h | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'sys') diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index c102eb52e5bd..a8d42a85d949 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -173,6 +174,7 @@ struct { * declare malloc types. */ static uma_zone_t mt_zone; +static uma_zone_t mt_stats_zone; u_long vm_kmem_size; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0, @@ -368,7 +370,7 @@ malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size, critical_enter(); mtip = mtp->ks_handle; - mtsp = &mtip->mti_stats[curcpu]; + mtsp = zpcpu_get(mtip->mti_stats); if (size > 0) { mtsp->mts_memalloced += size; mtsp->mts_numallocs++; @@ -411,7 +413,7 @@ malloc_type_freed(struct malloc_type *mtp, unsigned long size) critical_enter(); mtip = mtp->ks_handle; - mtsp = &mtip->mti_stats[curcpu]; + mtsp = zpcpu_get(mtip->mti_stats); mtsp->mts_memfreed += size; mtsp->mts_numfrees++; @@ -953,6 +955,9 @@ mallocinit(void *dummy) if (kmem_zmax < PAGE_SIZE || kmem_zmax > KMEM_ZMAX) kmem_zmax = KMEM_ZMAX; + mt_stats_zone = uma_zcreate("mt_stats_zone", + sizeof(struct malloc_type_stats), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_PCPU); mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal), #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, @@ -995,6 +1000,7 @@ malloc_init(void *data) panic("malloc_init: bad malloc type magic"); mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO); + mtip->mti_stats = uma_zalloc_pcpu(mt_stats_zone, M_WAITOK | M_ZERO); mtp->ks_handle = mtip; mtp_set_subzone(mtp); @@ -1042,8 +1048,8 @@ malloc_uninit(void *data) * Look for memory leaks. */ temp_allocs = temp_bytes = 0; - for (i = 0; i < MAXCPU; i++) { - mtsp = &mtip->mti_stats[i]; + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); temp_allocs += mtsp->mts_numallocs; temp_allocs -= mtsp->mts_numfrees; temp_bytes += mtsp->mts_memalloced; @@ -1056,6 +1062,7 @@ malloc_uninit(void *data) } slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK)); + uma_zfree_pcpu(mt_stats_zone, mtip->mti_stats); uma_zfree_arg(mt_zone, mtip, slab); } @@ -1077,6 +1084,7 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) { struct malloc_type_stream_header mtsh; struct malloc_type_internal *mtip; + struct malloc_type_stats *mtsp, zeromts; struct malloc_type_header mth; struct malloc_type *mtp; int error, i; @@ -1089,6 +1097,8 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); mtx_lock(&malloc_mtx); + bzero(&zeromts, sizeof(zeromts)); + /* * Insert stream header. */ @@ -1114,10 +1124,17 @@ sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) /* * Insert type statistics for each CPU. */ - for (i = 0; i < MAXCPU; i++) { - (void)sbuf_bcat(&sbuf, &mtip->mti_stats[i], - sizeof(mtip->mti_stats[i])); + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); + (void)sbuf_bcat(&sbuf, mtsp, sizeof(*mtsp)); } + /* + * Fill in the missing CPUs. + */ + for (; i < MAXCPU; i++) { + (void)sbuf_bcat(&sbuf, &zeromts, sizeof(zeromts)); + } + } mtx_unlock(&malloc_mtx); error = sbuf_finish(&sbuf); @@ -1170,6 +1187,7 @@ restart: DB_SHOW_COMMAND(malloc, db_show_malloc) { struct malloc_type_internal *mtip; + struct malloc_type_internal *mtsp; struct malloc_type *mtp; uint64_t allocs, frees; uint64_t alloced, freed; @@ -1183,7 +1201,8 @@ DB_SHOW_COMMAND(malloc, db_show_malloc) frees = 0; alloced = 0; freed = 0; - for (i = 0; i < MAXCPU; i++) { + for (i = 0; i <= mp_maxid; i++) { + mtsp = zpcpu_get_cpu(mtip->mti_stats, i); allocs += mtip->mti_stats[i].mts_numallocs; frees += mtip->mti_stats[i].mts_numfrees; alloced += mtip->mti_stats[i].mts_memalloced; diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index 68595091a580..520f18c5a969 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -101,7 +101,7 @@ struct malloc_type_internal { uint32_t mti_probes[DTMALLOC_PROBE_MAX]; /* DTrace probe ID array. */ u_char mti_zone; - struct malloc_type_stats mti_stats[MAXCPU]; + struct malloc_type_stats *mti_stats; }; /* -- cgit v1.3 From 3d14a7bb43d3bff2dad31f0bc928dda275ad86b1 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 24 Sep 2018 15:32:46 +0000 Subject: Ensure that "domain" is initialized when vm_ndomains == 1. Reported by: alc Approved by: re (gjb) --- sys/vm/vm_kern.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index b5915c074ab6..9774e0b4f22d 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -505,8 +505,10 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) next = roundup2(addr + 1, KVA_QUANTUM); if (next > end || next < start) next = end; - } else + } else { + domain = 0; next = end; + } rv = kmem_back_domain(domain, object, addr, next - addr, flags); if (rv != KERN_SUCCESS) { kmem_unback(object, start, addr - start); -- cgit v1.3 From f5fbe90de489d79b8d607d831451b376855f1410 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 24 Sep 2018 16:49:02 +0000 Subject: Passing UMA_ZONE_NOFREE to uma_zcreate() for swpctrie_zone and swblk_zone is redundant, because uma_zone_reserve_kva() is performed on both zones and it sets this same flag on the zone. (Moreover, the implementation of the swap pager does not itself require these zones to be UMA_ZONE_NOFREE.) Reviewed by: kib, markj Approved by: re (gjb) MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D17296 --- sys/vm/swap_pager.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'sys') diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 858a9e884626..0bd4bde8b57e 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -545,13 +545,11 @@ swap_pager_swap_init(void) if (maxswzone && n > maxswzone / sizeof(struct swblk)) n = maxswzone / sizeof(struct swblk); swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, - pctrie_zone_init, NULL, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE | UMA_ZONE_VM); + pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, - NULL, NULL, _Alignof(struct swblk) - 1, - UMA_ZONE_NOFREE | UMA_ZONE_VM); + NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; -- cgit v1.3 From cf4a52cf6745dd7546f0aaec1485c522a0424319 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 24 Sep 2018 16:58:55 +0000 Subject: Fix use-after-free in RAID0 error reporting of GEOM_RAID. PR: 231510 Submitted by: yangx92@hotmail.com Approved by: re (gjb) MFC after: 1 week --- sys/geom/raid/tr_raid0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/geom/raid/tr_raid0.c b/sys/geom/raid/tr_raid0.c index 78fa1b920f8a..33a802103f0c 100644 --- a/sys/geom/raid/tr_raid0.c +++ b/sys/geom/raid/tr_raid0.c @@ -323,7 +323,7 @@ g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr, pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; - g_raid_iodone(pbp, bp->bio_error); + g_raid_iodone(pbp, pbp->bio_error); } } -- cgit v1.3 From 7a102e0463121db1249f4bb06aba7e120e7e6b84 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Mon, 24 Sep 2018 17:41:29 +0000 Subject: Implement pmap_sync_icache(). This invokes "fence" on the hart performing the write followed by an IPI to execute "fence.i" on all harts. This is required to support userland debuggers setting breakpoints in user processes. Reviewed by: br (earlier version), markj Approved by: re (gjb) Sponsored by: DARPA Differential Revision: https://reviews.freebsd.org/D17139 --- sys/riscv/riscv/pmap.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 8582667b27b9..b6a58ed796b6 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -3244,11 +3244,27 @@ pmap_activate(struct thread *td) critical_exit(); } +static void +pmap_sync_icache_one(void *arg __unused) +{ + + __asm __volatile("fence.i"); +} + void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { - panic("RISCVTODO: pmap_sync_icache"); + /* + * From the RISC-V User-Level ISA V2.2: + * + * "To make a store to instruction memory visible to all + * RISC-V harts, the writing hart has to execute a data FENCE + * before requesting that all remote RISC-V harts execute a + * FENCE.I." + */ + __asm __volatile("fence"); + smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* -- cgit v1.3 From b65a9568f89c35b4f6a292768fea4382245bd0bd Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Mon, 24 Sep 2018 18:20:38 +0000 Subject: Restore the API of the kf_sa_local and kf_sa_peer members. In 11.x and earlier these were accessible as direct members of 'struct kinfo_file'. Existing code already knows about the new location of these members as well, so wrapper macros did not work for these fields. Instead, define an anonymous struct containing the fields from 'struct kinfo_file' in FreeBSD 11 that were not part of the 'kf_un' union. This anonymous struct is then placed in an anonymous union along with the new 'kf_un' union. This preserves the API of both structure layouts without requiring any wrapper macros. PR: 231525 Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17262 --- sys/sys/user.h | 171 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 88 insertions(+), 83 deletions(-) (limited to 'sys') diff --git a/sys/sys/user.h b/sys/sys/user.h index f5353f15b0fc..67acf2d0740d 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -346,85 +346,96 @@ struct kinfo_file { int64_t kf_offset; /* Seek location. */ union { struct { - /* Sendq size */ - uint32_t kf_sock_sendq; - /* Socket domain. */ - int kf_sock_domain0; - /* Socket type. */ - int kf_sock_type0; - /* Socket protocol. */ - int kf_sock_protocol0; - /* Socket address. */ + /* API compatiblity with FreeBSD < 12. */ + int kf_vnode_type; + int kf_sock_domain; + int kf_sock_type; + int kf_sock_protocol; struct sockaddr_storage kf_sa_local; - /* Peer address. */ struct sockaddr_storage kf_sa_peer; - /* Address of so_pcb. */ - uint64_t kf_sock_pcb; - /* Address of inp_ppcb. */ - uint64_t kf_sock_inpcb; - /* Address of unp_conn. */ - uint64_t kf_sock_unpconn; - /* Send buffer state. */ - uint16_t kf_sock_snd_sb_state; - /* Receive buffer state. */ - uint16_t kf_sock_rcv_sb_state; - /* Recvq size. */ - uint32_t kf_sock_recvq; - } kf_sock; - struct { - /* Vnode type. */ - int kf_file_type; - /* Space for future use */ - int kf_spareint[3]; - uint64_t kf_spareint64[30]; - /* Vnode filesystem id. */ - uint64_t kf_file_fsid; - /* File device. */ - uint64_t kf_file_rdev; - /* Global file id. */ - uint64_t kf_file_fileid; - /* File size. */ - uint64_t kf_file_size; - /* Vnode filesystem id, FreeBSD 11 compat. */ - uint32_t kf_file_fsid_freebsd11; - /* File device, FreeBSD 11 compat. */ - uint32_t kf_file_rdev_freebsd11; - /* File mode. */ - uint16_t kf_file_mode; - /* Round to 64 bit alignment. */ - uint16_t kf_file_pad0; - uint32_t kf_file_pad1; - } kf_file; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint32_t kf_sem_value; - uint16_t kf_sem_mode; - } kf_sem; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint64_t kf_pipe_addr; - uint64_t kf_pipe_peer; - uint32_t kf_pipe_buffer_cnt; - /* Round to 64 bit alignment. */ - uint32_t kf_pipe_pad0[3]; - } kf_pipe; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - uint32_t kf_pts_dev_freebsd11; - uint32_t kf_pts_pad0; - uint64_t kf_pts_dev; - /* Round to 64 bit alignment. */ - uint32_t kf_pts_pad1[4]; - } kf_pts; - struct { - uint32_t kf_spareint[4]; - uint64_t kf_spareint64[32]; - pid_t kf_pid; - } kf_proc; - } kf_un; + }; + union { + struct { + /* Sendq size */ + uint32_t kf_sock_sendq; + /* Socket domain. */ + int kf_sock_domain0; + /* Socket type. */ + int kf_sock_type0; + /* Socket protocol. */ + int kf_sock_protocol0; + /* Socket address. */ + struct sockaddr_storage kf_sa_local; + /* Peer address. */ + struct sockaddr_storage kf_sa_peer; + /* Address of so_pcb. */ + uint64_t kf_sock_pcb; + /* Address of inp_ppcb. */ + uint64_t kf_sock_inpcb; + /* Address of unp_conn. */ + uint64_t kf_sock_unpconn; + /* Send buffer state. */ + uint16_t kf_sock_snd_sb_state; + /* Receive buffer state. */ + uint16_t kf_sock_rcv_sb_state; + /* Recvq size. */ + uint32_t kf_sock_recvq; + } kf_sock; + struct { + /* Vnode type. */ + int kf_file_type; + /* Space for future use */ + int kf_spareint[3]; + uint64_t kf_spareint64[30]; + /* Vnode filesystem id. */ + uint64_t kf_file_fsid; + /* File device. */ + uint64_t kf_file_rdev; + /* Global file id. */ + uint64_t kf_file_fileid; + /* File size. */ + uint64_t kf_file_size; + /* Vnode filesystem id, FreeBSD 11 compat. */ + uint32_t kf_file_fsid_freebsd11; + /* File device, FreeBSD 11 compat. */ + uint32_t kf_file_rdev_freebsd11; + /* File mode. */ + uint16_t kf_file_mode; + /* Round to 64 bit alignment. */ + uint16_t kf_file_pad0; + uint32_t kf_file_pad1; + } kf_file; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint32_t kf_sem_value; + uint16_t kf_sem_mode; + } kf_sem; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint64_t kf_pipe_addr; + uint64_t kf_pipe_peer; + uint32_t kf_pipe_buffer_cnt; + /* Round to 64 bit alignment. */ + uint32_t kf_pipe_pad0[3]; + } kf_pipe; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + uint32_t kf_pts_dev_freebsd11; + uint32_t kf_pts_pad0; + uint64_t kf_pts_dev; + /* Round to 64 bit alignment. */ + uint32_t kf_pts_pad1[4]; + } kf_pts; + struct { + uint32_t kf_spareint[4]; + uint64_t kf_spareint64[32]; + pid_t kf_pid; + } kf_proc; + } kf_un; + }; uint16_t kf_status; /* Status flags. */ uint16_t kf_pad1; /* Round to 32 bit alignment. */ int _kf_ispare0; /* Space for more stuff. */ @@ -433,12 +444,6 @@ struct kinfo_file { /* Truncated before copyout in sysctl */ char kf_path[PATH_MAX]; /* Path to file, if any. */ }; -#ifndef _KERNEL -#define kf_vnode_type kf_un.kf_file.kf_file_type -#define kf_sock_domain kf_un.kf_sock.kf_sock_domain0 -#define kf_sock_type kf_un.kf_sock.kf_sock_type0 -#define kf_sock_protocol kf_un.kf_sock.kf_sock_protocol0 -#endif /* * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of -- cgit v1.3 From 463406ac4a531b8f4c6102715c17da6183d10be3 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 24 Sep 2018 19:24:17 +0000 Subject: Add more NUMA-specific low memory predicates. Use these predicates instead of inline references to vm_min_domains. Also add a global all_domains set, akin to all_cpus. Reviewed by: alc, jeff, kib Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17278 --- sys/sys/vmmeter.h | 28 +++++++++++++++++++++++++--- sys/vm/vm_domainset.c | 16 ++++++++-------- sys/vm/vm_page.c | 4 ++-- sys/vm/vm_phys.c | 1 + sys/x86/acpica/srat.c | 3 ++- 5 files changed, 38 insertions(+), 14 deletions(-) (limited to 'sys') diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index c41b151fa502..579d16756e99 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -145,6 +145,7 @@ struct vmmeter { #include extern struct vmmeter vm_cnt; +extern domainset_t all_domains; extern domainset_t vm_min_domains; extern domainset_t vm_severe_domains; @@ -177,7 +178,7 @@ vm_wire_count(void) /* * Return TRUE if we are under our severe low-free-pages threshold * - * This routine is typically used at the user<->system interface to determine + * These routines are typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static inline int @@ -188,7 +189,14 @@ vm_page_count_severe(void) } static inline int -vm_page_count_severe_set(domainset_t *mask) +vm_page_count_severe_domain(int domain) +{ + + return (DOMAINSET_ISSET(domain, &vm_severe_domains)); +} + +static inline int +vm_page_count_severe_set(const domainset_t *mask) { return (DOMAINSET_SUBSET(&vm_severe_domains, mask)); @@ -197,7 +205,7 @@ vm_page_count_severe_set(domainset_t *mask) /* * Return TRUE if we are under our minimum low-free-pages threshold. * - * This routine is typically used within the system to determine whether + * These routines are typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more @@ -210,5 +218,19 @@ vm_page_count_min(void) return (!DOMAINSET_EMPTY(&vm_min_domains)); } +static inline int +vm_page_count_min_domain(int domain) +{ + + return (DOMAINSET_ISSET(domain, &vm_min_domains)); +} + +static inline int +vm_page_count_min_set(const domainset_t *mask) +{ + + return (DOMAINSET_SUBSET(&vm_min_domains, mask)); +} + #endif /* _KERNEL */ #endif /* _SYS_VMMETER_H_ */ diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 16b078e7dabd..b9348d6c632b 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -66,6 +66,7 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj, vm_pindex_t pindex) { struct domainset *domain; + struct thread *td; /* * object policy takes precedence over thread policy. The policies @@ -76,8 +77,9 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj, di->di_domain = domain; di->di_iter = &obj->domain.dr_iterator; } else { - di->di_domain = curthread->td_domain.dr_policy; - di->di_iter = &curthread->td_domain.dr_iterator; + td = curthread; + di->di_domain = td->td_domain.dr_policy; + di->di_iter = &td->td_domain.dr_iterator; } di->di_policy = di->di_domain->ds_policy; if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) { @@ -215,7 +217,7 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) | VM_ALLOC_NOWAIT; vm_domainset_iter_first(di, domain); - if (DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (vm_page_count_min_domain(*domain)) vm_domainset_iter_page(di, domain, req); } @@ -233,8 +235,7 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, int *domain, int *req) /* If there are more domains to visit we run the iterator. */ while (--di->di_n != 0) { vm_domainset_iter_next(di, domain); - if (!di->di_minskip || - !DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (!di->di_minskip || !vm_page_count_min_domain(*domain)) return (0); } if (di->di_minskip) { @@ -269,7 +270,7 @@ vm_domainset_iter_malloc_init(struct vm_domainset_iter *di, di->di_flags = *flags; *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_first(di, domain); - if (DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (vm_page_count_min_domain(*domain)) vm_domainset_iter_malloc(di, domain, flags); } @@ -280,8 +281,7 @@ vm_domainset_iter_malloc(struct vm_domainset_iter *di, int *domain, int *flags) /* If there are more domains to visit we run the iterator. */ while (--di->di_n != 0) { vm_domainset_iter_next(di, domain); - if (!di->di_minskip || - !DOMAINSET_ISSET(*domain, &vm_min_domains)) + if (!di->di_minskip || !vm_page_count_min_domain(*domain)) return (0); } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index d022435704f1..47628cba2ac1 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2959,7 +2959,7 @@ vm_wait_doms(const domainset_t *wdoms) * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); - if (DOMAINSET_SUBSET(&vm_min_domains, wdoms)) { + if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP, "vmwait", 0); @@ -3078,7 +3078,7 @@ vm_waitpfault(struct domainset *dset) * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); - if (DOMAINSET_SUBSET(&vm_min_domains, &dset->ds_mask)) { + if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", 0); diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 890f5dad9213..5206ba6e658f 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -78,6 +78,7 @@ int __read_mostly *mem_locality; #endif int __read_mostly vm_ndomains = 1; +domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; int __read_mostly vm_phys_nsegs; diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 0ea747ccc30b..08a98429a03d 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -470,8 +470,9 @@ parse_srat(void) } #ifdef NUMA - /* Point vm_phys at our memory affinity table. */ vm_ndomains = ndomain; + for (int i = 0; i < vm_ndomains; i++) + DOMAINSET_SET(i, &all_domains); mem_affinity = mem_info; #endif -- cgit v1.3 From 189bd7a978a0f0a9f13cdf6175c1f6c6fa72f962 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Mon, 24 Sep 2018 22:15:04 +0000 Subject: Recognize the Amazon PCI serial device found in i3.metal EC2 instances as an NS8250 UART. Reviewed by: sbruno, imp Approved by: re (delphij) Sponsored by: https://www.patreon.com/cperciva Differential Revision: https://reviews.freebsd.org/D17250 --- sys/dev/uart/uart_bus_pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys') diff --git a/sys/dev/uart/uart_bus_pci.c b/sys/dev/uart/uart_bus_pci.c index f4be108ab7b2..15b4472a37b2 100644 --- a/sys/dev/uart/uart_bus_pci.c +++ b/sys/dev/uart/uart_bus_pci.c @@ -125,6 +125,7 @@ static const struct pci_id pci_ns8250_ids[] = { 128 * DEFAULT_RCLK, 2}, { 0x14e4, 0x4344, 0xffff, 0, "Sony Ericsson GC89 PC Card", 0x10}, { 0x151f, 0x0000, 0xffff, 0, "TOPIC Semiconductor TP560 56k modem", 0x10 }, +{ 0x1d0f, 0x8250, 0x1d0f, 0, "Amazon PCI serial device", 0x10 }, { 0x1fd4, 0x1999, 0x1fd4, 0x0001, "Sunix SER5xxxx Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x8086, 0x0f0a, 0xffff, 0, "Intel ValleyView LPIO1 HSUART#1", 0x10, -- cgit v1.3 From fd8cf3be5f3a1bfbe6b9b07bc610f3b1ce9361f4 Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Tue, 25 Sep 2018 02:34:28 +0000 Subject: powerpc: Blacklist the top 64kB range of the lower 4GB PA space The PHB4 host bridge used by the POWER9 uses a 64kB range in 32-bit space at the address 0xffff0000-0xffffffff. Reserve this range so that DMA memory cannot be allocated within this range. This fixes seemingly random crashes on a POWER9 system. Ideally this range will have been reserved by the firmware, but as of now this is not the case. Submitted by: git_bdragon.rtk0.net Reviewed by: nwhitehorn Approved by: re(kib) Differential Revision: https://reviews.freebsd.org/D17183 --- sys/powerpc/ofw/ofw_machdep.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'sys') diff --git a/sys/powerpc/ofw/ofw_machdep.c b/sys/powerpc/ofw/ofw_machdep.c index 7e231373d829..c647ce817905 100644 --- a/sys/powerpc/ofw/ofw_machdep.c +++ b/sys/powerpc/ofw/ofw_machdep.c @@ -69,6 +69,10 @@ __FBSDID("$FreeBSD$"); #include +#ifdef POWERNV +#include +#endif + static void *fdt; int ofw_real_mode; @@ -338,6 +342,34 @@ excise_initrd_region(struct mem_region *avail, int asz) return (asz); } +#ifdef POWERNV +static int +excise_msi_region(struct mem_region *avail, int asz) +{ + uint64_t start, end; + struct mem_region initrdmap[1]; + + /* + * This range of physical addresses is used to implement optimized + * 32 bit MSI interrupts on POWER9. Exclude it to avoid accidentally + * using it for DMA, as this will cause an immediate PHB fence. + * While we could theoretically turn off this behavior in the ETU, + * doing so would break 32-bit MSI, so just reserve the range in + * the physical map instead. + * See section 4.4.2.8 of the PHB4 specification. + */ + start = 0x00000000ffff0000ul; + end = 0x00000000fffffffful; + + initrdmap[0].mr_start = start; + initrdmap[0].mr_size = end - start; + + asz = excise_reserved_regions(avail, asz, initrdmap, 1); + + return (asz); +} +#endif + static int excise_fdt_reserved(struct mem_region *avail, int asz) { @@ -430,6 +462,11 @@ ofw_mem_regions(struct mem_region *memp, int *memsz, asz = excise_initrd_region(availp, asz); #endif +#ifdef POWERNV + if (opal_check() == 0) + asz = excise_msi_region(availp, asz); +#endif + *memsz = msz; *availsz = asz; } -- cgit v1.3 From ea710848dca328f3d617ce52c5553e20d7de671c Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Tue, 25 Sep 2018 05:52:42 +0000 Subject: cxgbe(4): Link related changes. - Switch to using 32b port/link capabilities in the driver. The 32b format is used internally by firmwares > 1.16.45.0 and the driver will now interact with the firmware in its native format, whether it's 16b or 32b. Note that the 16b format doesn't have room for 50G, 200G, or 400G speeds. - Add a bit in the pause_settings knobs to allow negotiated PAUSE settings to override manual settings. - Ensure that manual link settings persist across an administrative down/up as well as transceiver unplug/replug. - Remove unused is_*G_port() functions. Approved by: re@ (gjb@) MFC after: 1 month Sponsored by: Chelsio Communications --- share/man/man4/cxgbe.4 | 10 +- sys/dev/cxgbe/adapter.h | 47 ---- sys/dev/cxgbe/common/common.h | 44 ++-- sys/dev/cxgbe/common/t4_hw.c | 453 ++++++++++++++++++++++++++++--------- sys/dev/cxgbe/osdep.h | 1 + sys/dev/cxgbe/t4_main.c | 515 ++++++++++++++++++++++++++---------------- sys/dev/cxgbe/tom/t4_cpl_io.c | 2 +- 7 files changed, 700 insertions(+), 372 deletions(-) (limited to 'sys') diff --git a/share/man/man4/cxgbe.4 b/share/man/man4/cxgbe.4 index c98f96ad3678..970472efca21 100644 --- a/share/man/man4/cxgbe.4 +++ b/share/man/man4/cxgbe.4 @@ -31,7 +31,7 @@ .\" .\" $FreeBSD$ .\" -.Dd Aug 9, 2018 +.Dd Sep 24, 2018 .Dt CXGBE 4 .Os .Sh NAME @@ -279,19 +279,21 @@ This usually results in the port emitting PAUSE frames. 1 instructs the hardware to drop frames destined for congested queues. .It Va hw.cxgbe.pause_settings PAUSE frame settings. -Bit 0 is rx_pause, bit 1 is tx_pause. +Bit 0 is rx_pause, bit 1 is tx_pause, bit 2 is pause_autoneg. rx_pause = 1 instructs the hardware to heed incoming PAUSE frames, 0 instructs it to ignore them. tx_pause = 1 allows the hardware to emit PAUSE frames when its receive FIFO reaches a high threshold, 0 prohibits the hardware from emitting PAUSE frames. -The default is 3 (both rx_pause and tx_pause = 1). +pause_autoneg = 1 overrides the rx_pause and tx_pause bits and instructs the +hardware to negotiate PAUSE settings with the link peer. +The default is 7 (all three = 1). This tunable establishes the default PAUSE settings for all ports. Settings can be displayed and controlled on a per-port basis via the dev..X.pause_settings sysctl. .It Va hw.cxgbe.fec FEC (Forward Error Correction) settings. 0 diables FEC. -Bit 0 enables RS FEC, bit 1 enables BASE-R RS, bit 3 is reserved. +Bit 0 enables RS FEC, bit 1 enables BASE-R FEC (aka Firecode FEC). The default is -1 which lets the driver pick a value. This tunable establishes the default FEC settings for all ports. Settings can be displayed and controlled on a per-port basis via the diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 866bace84c36..f23d16479ee8 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -289,7 +289,6 @@ struct port_info { uint8_t rx_e_chan_map; /* rx TP e-channel bitmap */ struct link_config link_cfg; - struct link_config old_link_cfg; struct ifmedia media; struct timeval last_refreshed; @@ -1073,52 +1072,6 @@ t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[]) bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN); } -static inline bool -is_10G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0); -} - -static inline bool -is_25G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_25G) != 0); -} - -static inline bool -is_40G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) != 0); -} - -static inline bool -is_100G_port(const struct port_info *pi) -{ - - return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) != 0); -} - -static inline int -port_top_speed(const struct port_info *pi) -{ - - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) - return (100); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) - return (40); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_25G) - return (25); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) - return (10); - if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G) - return (1); - - return (0); -} - static inline int tx_resume_threshold(struct sge_eq *eq) { diff --git a/sys/dev/cxgbe/common/common.h b/sys/dev/cxgbe/common/common.h index 044582db6ede..ada4fa4fa775 100644 --- a/sys/dev/cxgbe/common/common.h +++ b/sys/dev/cxgbe/common/common.h @@ -66,9 +66,10 @@ enum { }; enum { + FEC_NONE = 0, FEC_RS = 1 << 0, FEC_BASER_RS = 1 << 1, - FEC_RESERVED = 1 << 2, + FEC_AUTO = 1 << 5, /* M_FW_PORT_CAP32_FEC + 1 */ }; enum t4_bar2_qtype { T4_BAR2_QTYPE_EGRESS, T4_BAR2_QTYPE_INGRESS }; @@ -368,6 +369,7 @@ struct adapter_params { unsigned int ethoffload:1; unsigned int hash_filter:1; unsigned int filter2_wr_support:1; + unsigned int port_caps32:1; unsigned int ofldq_wr_cred; unsigned int eo_wr_cred; @@ -409,20 +411,21 @@ struct trace_params { }; struct link_config { - /* OS-specific code owns all the requested_* fields */ - unsigned char requested_aneg; /* link aneg user has requested */ - unsigned char requested_fc; /* flow control user has requested */ - unsigned char requested_fec; /* FEC user has requested */ - unsigned int requested_speed; /* speed user has requested (Mbps) */ - - unsigned short supported; /* link capabilities */ - unsigned short advertising; /* advertised capabilities */ - unsigned short lp_advertising; /* peer advertised capabilities */ - unsigned int speed; /* actual link speed (Mbps) */ - unsigned char fc; /* actual link flow control */ - unsigned char fec; /* actual FEC */ - unsigned char link_ok; /* link up? */ - unsigned char link_down_rc; /* link down reason */ + /* OS-specific code owns all the requested_* fields. */ + int8_t requested_aneg; /* link autonegotiation */ + int8_t requested_fc; /* flow control */ + int8_t requested_fec; /* FEC */ + u_int requested_speed; /* speed (Mbps) */ + + uint32_t supported; /* link capabilities */ + uint32_t advertising; /* advertised capabilities */ + uint32_t lp_advertising; /* peer advertised capabilities */ + uint32_t fec_hint; /* use this fec */ + u_int speed; /* actual link speed (Mbps) */ + int8_t fc; /* actual link flow control */ + int8_t fec; /* actual FEC */ + bool link_ok; /* link up? */ + uint8_t link_down_rc; /* link down reason */ }; #include "adapter.h" @@ -874,5 +877,16 @@ int t4vf_prep_adapter(struct adapter *adapter); int t4_bar2_sge_qregs(struct adapter *adapter, unsigned int qid, enum t4_bar2_qtype qtype, int user, u64 *pbar2_qoffset, unsigned int *pbar2_qid); +unsigned int fwcap_to_speed(uint32_t caps); +uint32_t speed_to_fwcap(unsigned int speed); +uint32_t fwcap_top_speed(uint32_t caps); + +static inline int +port_top_speed(const struct port_info *pi) +{ + + /* Mbps -> Gbps */ + return (fwcap_to_speed(pi->link_cfg.supported) / 1000); +} #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index 2312b66cf7a8..cec8ebbf00e7 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -3755,6 +3755,93 @@ void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf) } } +/** + * fwcaps16_to_caps32 - convert 16-bit Port Capabilities to 32-bits + * @caps16: a 16-bit Port Capabilities value + * + * Returns the equivalent 32-bit Port Capabilities value. + */ +static uint32_t fwcaps16_to_caps32(uint16_t caps16) +{ + uint32_t caps32 = 0; + + #define CAP16_TO_CAP32(__cap) \ + do { \ + if (caps16 & FW_PORT_CAP_##__cap) \ + caps32 |= FW_PORT_CAP32_##__cap; \ + } while (0) + + CAP16_TO_CAP32(SPEED_100M); + CAP16_TO_CAP32(SPEED_1G); + CAP16_TO_CAP32(SPEED_25G); + CAP16_TO_CAP32(SPEED_10G); + CAP16_TO_CAP32(SPEED_40G); + CAP16_TO_CAP32(SPEED_100G); + CAP16_TO_CAP32(FC_RX); + CAP16_TO_CAP32(FC_TX); + CAP16_TO_CAP32(ANEG); + CAP16_TO_CAP32(FORCE_PAUSE); + CAP16_TO_CAP32(MDIAUTO); + CAP16_TO_CAP32(MDISTRAIGHT); + CAP16_TO_CAP32(FEC_RS); + CAP16_TO_CAP32(FEC_BASER_RS); + CAP16_TO_CAP32(802_3_PAUSE); + CAP16_TO_CAP32(802_3_ASM_DIR); + + #undef CAP16_TO_CAP32 + + return caps32; +} + +/** + * fwcaps32_to_caps16 - convert 32-bit Port Capabilities to 16-bits + * @caps32: a 32-bit Port Capabilities value + * + * Returns the equivalent 16-bit Port Capabilities value. Note that + * not all 32-bit Port Capabilities can be represented in the 16-bit + * Port Capabilities and some fields/values may not make it. + */ +static uint16_t fwcaps32_to_caps16(uint32_t caps32) +{ + uint16_t caps16 = 0; + + #define CAP32_TO_CAP16(__cap) \ + do { \ + if (caps32 & FW_PORT_CAP32_##__cap) \ + caps16 |= FW_PORT_CAP_##__cap; \ + } while (0) + + CAP32_TO_CAP16(SPEED_100M); + CAP32_TO_CAP16(SPEED_1G); + CAP32_TO_CAP16(SPEED_10G); + CAP32_TO_CAP16(SPEED_25G); + CAP32_TO_CAP16(SPEED_40G); + CAP32_TO_CAP16(SPEED_100G); + CAP32_TO_CAP16(FC_RX); + CAP32_TO_CAP16(FC_TX); + CAP32_TO_CAP16(802_3_PAUSE); + CAP32_TO_CAP16(802_3_ASM_DIR); + CAP32_TO_CAP16(ANEG); + CAP32_TO_CAP16(FORCE_PAUSE); + CAP32_TO_CAP16(MDIAUTO); + CAP32_TO_CAP16(MDISTRAIGHT); + CAP32_TO_CAP16(FEC_RS); + CAP32_TO_CAP16(FEC_BASER_RS); + + #undef CAP32_TO_CAP16 + + return caps16; +} + +static bool +is_bt(struct port_info *pi) +{ + + return (pi->port_type == FW_PORT_TYPE_BT_SGMII || + pi->port_type == FW_PORT_TYPE_BT_XFI || + pi->port_type == FW_PORT_TYPE_BT_XAUI); +} + /** * t4_link_l1cfg - apply link configuration to MAC/PHY * @phy: the PHY to setup @@ -3772,52 +3859,44 @@ int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, struct link_config *lc) { struct fw_port_cmd c; - unsigned int mdi = V_FW_PORT_CAP_MDI(FW_PORT_CAP_MDI_AUTO); + unsigned int mdi = V_FW_PORT_CAP32_MDI(FW_PORT_CAP32_MDI_AUTO); unsigned int aneg, fc, fec, speed, rcap; fc = 0; if (lc->requested_fc & PAUSE_RX) - fc |= FW_PORT_CAP_FC_RX; + fc |= FW_PORT_CAP32_FC_RX; if (lc->requested_fc & PAUSE_TX) - fc |= FW_PORT_CAP_FC_TX; + fc |= FW_PORT_CAP32_FC_TX; + if (!(lc->requested_fc & PAUSE_AUTONEG)) + fc |= FW_PORT_CAP32_FORCE_PAUSE; fec = 0; - if (lc->requested_fec & FEC_RS) - fec = FW_PORT_CAP_FEC_RS; - else if (lc->requested_fec & FEC_BASER_RS) - fec = FW_PORT_CAP_FEC_BASER_RS; + if (lc->requested_fec == FEC_AUTO) + fec = lc->fec_hint; + else { + if (lc->requested_fec & FEC_RS) + fec |= FW_PORT_CAP32_FEC_RS; + if (lc->requested_fec & FEC_BASER_RS) + fec |= FW_PORT_CAP32_FEC_BASER_RS; + } - if (!(lc->supported & FW_PORT_CAP_ANEG) || - lc->requested_aneg == AUTONEG_DISABLE) { + if (lc->requested_aneg == AUTONEG_DISABLE) aneg = 0; - switch (lc->requested_speed) { - case 100000: - speed = FW_PORT_CAP_SPEED_100G; - break; - case 40000: - speed = FW_PORT_CAP_SPEED_40G; - break; - case 25000: - speed = FW_PORT_CAP_SPEED_25G; - break; - case 10000: - speed = FW_PORT_CAP_SPEED_10G; - break; - case 1000: - speed = FW_PORT_CAP_SPEED_1G; - break; - case 100: - speed = FW_PORT_CAP_SPEED_100M; - break; - default: - return -EINVAL; - break; - } - } else { - aneg = FW_PORT_CAP_ANEG; - speed = lc->supported & - V_FW_PORT_CAP_SPEED(M_FW_PORT_CAP_SPEED); - } + else if (lc->requested_aneg == AUTONEG_ENABLE) + aneg = FW_PORT_CAP32_ANEG; + else + aneg = lc->supported & FW_PORT_CAP32_ANEG; + + if (aneg) { + speed = lc->supported & V_FW_PORT_CAP32_SPEED(M_FW_PORT_CAP32_SPEED); + } else if (lc->requested_speed != 0) + speed = speed_to_fwcap(lc->requested_speed); + else + speed = fwcap_top_speed(lc->supported); + + /* Force AN on for BT cards. */ + if (is_bt(adap->port[port])) + aneg = lc->supported & FW_PORT_CAP32_ANEG; rcap = aneg | speed | fc | fec; if ((rcap | lc->supported) != lc->supported) { @@ -3833,10 +3912,17 @@ int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, c.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_EXEC | V_FW_PORT_CMD_PORTID(port)); - c.action_to_len16 = - cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | + if (adap->params.port_caps32) { + c.action_to_len16 = + cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG32) | + FW_LEN16(c)); + c.u.l1cfg32.rcap32 = cpu_to_be32(rcap); + } else { + c.action_to_len16 = + cpu_to_be32(V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | FW_LEN16(c)); - c.u.l1cfg.rcap = cpu_to_be32(rcap); + c.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap)); + } return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL); } @@ -7735,57 +7821,206 @@ const char *t4_link_down_rc_str(unsigned char link_down_rc) return reason[link_down_rc]; } +/* + * Return the highest speed set in the port capabilities, in Mb/s. + */ +unsigned int fwcap_to_speed(uint32_t caps) +{ + #define TEST_SPEED_RETURN(__caps_speed, __speed) \ + do { \ + if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return __speed; \ + } while (0) + + TEST_SPEED_RETURN(400G, 400000); + TEST_SPEED_RETURN(200G, 200000); + TEST_SPEED_RETURN(100G, 100000); + TEST_SPEED_RETURN(50G, 50000); + TEST_SPEED_RETURN(40G, 40000); + TEST_SPEED_RETURN(25G, 25000); + TEST_SPEED_RETURN(10G, 10000); + TEST_SPEED_RETURN(1G, 1000); + TEST_SPEED_RETURN(100M, 100); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/* + * Return the port capabilities bit for the given speed, which is in Mb/s. + */ +uint32_t speed_to_fwcap(unsigned int speed) +{ + #define TEST_SPEED_RETURN(__caps_speed, __speed) \ + do { \ + if (speed == __speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G, 400000); + TEST_SPEED_RETURN(200G, 200000); + TEST_SPEED_RETURN(100G, 100000); + TEST_SPEED_RETURN(50G, 50000); + TEST_SPEED_RETURN(40G, 40000); + TEST_SPEED_RETURN(25G, 25000); + TEST_SPEED_RETURN(10G, 10000); + TEST_SPEED_RETURN(1G, 1000); + TEST_SPEED_RETURN(100M, 100); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/* + * Return the port capabilities bit for the highest speed in the capabilities. + */ +uint32_t fwcap_top_speed(uint32_t caps) +{ + #define TEST_SPEED_RETURN(__caps_speed) \ + do { \ + if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G); + TEST_SPEED_RETURN(200G); + TEST_SPEED_RETURN(100G); + TEST_SPEED_RETURN(50G); + TEST_SPEED_RETURN(40G); + TEST_SPEED_RETURN(25G); + TEST_SPEED_RETURN(10G); + TEST_SPEED_RETURN(1G); + TEST_SPEED_RETURN(100M); + + #undef TEST_SPEED_RETURN + + return 0; +} + + +/** + * lstatus_to_fwcap - translate old lstatus to 32-bit Port Capabilities + * @lstatus: old FW_PORT_ACTION_GET_PORT_INFO lstatus value + * + * Translates old FW_PORT_ACTION_GET_PORT_INFO lstatus field into new + * 32-bit Port Capabilities value. + */ +static uint32_t lstatus_to_fwcap(u32 lstatus) +{ + uint32_t linkattr = 0; + + /* + * Unfortunately the format of the Link Status in the old + * 16-bit Port Information message isn't the same as the + * 16-bit Port Capabilities bitfield used everywhere else ... + */ + if (lstatus & F_FW_PORT_CMD_RXPAUSE) + linkattr |= FW_PORT_CAP32_FC_RX; + if (lstatus & F_FW_PORT_CMD_TXPAUSE) + linkattr |= FW_PORT_CAP32_FC_TX; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M)) + linkattr |= FW_PORT_CAP32_SPEED_100M; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G)) + linkattr |= FW_PORT_CAP32_SPEED_1G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G)) + linkattr |= FW_PORT_CAP32_SPEED_10G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_25G)) + linkattr |= FW_PORT_CAP32_SPEED_25G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G)) + linkattr |= FW_PORT_CAP32_SPEED_40G; + if (lstatus & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100G)) + linkattr |= FW_PORT_CAP32_SPEED_100G; + + return linkattr; +} + /* * Updates all fields owned by the common code in port_info and link_config * based on information provided by the firmware. Does not touch any * requested_* field. */ -static void handle_port_info(struct port_info *pi, const struct fw_port_info *p) +static void handle_port_info(struct port_info *pi, const struct fw_port_cmd *p, + enum fw_port_action action, bool *mod_changed, bool *link_changed) { - struct link_config *lc = &pi->link_cfg; - int speed; + struct link_config old_lc, *lc = &pi->link_cfg; unsigned char fc, fec; - u32 stat = be32_to_cpu(p->lstatus_to_modtype); - - pi->port_type = G_FW_PORT_CMD_PTYPE(stat); - pi->mod_type = G_FW_PORT_CMD_MODTYPE(stat); - pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP ? - G_FW_PORT_CMD_MDIOADDR(stat) : -1; - - lc->supported = be16_to_cpu(p->pcap); - lc->advertising = be16_to_cpu(p->acap); - lc->lp_advertising = be16_to_cpu(p->lpacap); - lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS) != 0; - lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC(stat); - - speed = 0; - if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M)) - speed = 100; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G)) - speed = 1000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G)) - speed = 10000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_25G)) - speed = 25000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G)) - speed = 40000; - else if (stat & V_FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100G)) - speed = 100000; - lc->speed = speed; + u32 stat, linkattr; + int old_ptype, old_mtype; + + old_ptype = pi->port_type; + old_mtype = pi->mod_type; + old_lc = *lc; + if (action == FW_PORT_ACTION_GET_PORT_INFO) { + stat = be32_to_cpu(p->u.info.lstatus_to_modtype); + + pi->port_type = G_FW_PORT_CMD_PTYPE(stat); + pi->mod_type = G_FW_PORT_CMD_MODTYPE(stat); + pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP ? + G_FW_PORT_CMD_MDIOADDR(stat) : -1; + + lc->supported = fwcaps16_to_caps32(be16_to_cpu(p->u.info.pcap)); + lc->advertising = fwcaps16_to_caps32(be16_to_cpu(p->u.info.acap)); + lc->lp_advertising = fwcaps16_to_caps32(be16_to_cpu(p->u.info.lpacap)); + lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS) != 0; + lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC(stat); + + linkattr = lstatus_to_fwcap(stat); + } else if (action == FW_PORT_ACTION_GET_PORT_INFO32) { + stat = be32_to_cpu(p->u.info32.lstatus32_to_cbllen32); + + pi->port_type = G_FW_PORT_CMD_PORTTYPE32(stat); + pi->mod_type = G_FW_PORT_CMD_MODTYPE32(stat); + pi->mdio_addr = stat & F_FW_PORT_CMD_MDIOCAP32 ? + G_FW_PORT_CMD_MDIOADDR32(stat) : -1; + + lc->supported = be32_to_cpu(p->u.info32.pcaps32); + lc->advertising = be32_to_cpu(p->u.info32.acaps32); + lc->lp_advertising = be16_to_cpu(p->u.info32.lpacaps32); + lc->link_ok = (stat & F_FW_PORT_CMD_LSTATUS32) != 0; + lc->link_down_rc = G_FW_PORT_CMD_LINKDNRC32(stat); + + linkattr = be32_to_cpu(p->u.info32.linkattr32); + } else { + CH_ERR(pi->adapter, "bad port_info action 0x%x\n", action); + return; + } + + lc->speed = fwcap_to_speed(linkattr); fc = 0; - if (stat & F_FW_PORT_CMD_RXPAUSE) + if (linkattr & FW_PORT_CAP32_FC_RX) fc |= PAUSE_RX; - if (stat & F_FW_PORT_CMD_TXPAUSE) + if (linkattr & FW_PORT_CAP32_FC_TX) fc |= PAUSE_TX; lc->fc = fc; - fec = 0; - if (lc->advertising & FW_PORT_CAP_FEC_RS) - fec = FEC_RS; - else if (lc->advertising & FW_PORT_CAP_FEC_BASER_RS) - fec = FEC_BASER_RS; + fec = FEC_NONE; + if (linkattr & FW_PORT_CAP32_FEC_RS) + fec |= FEC_RS; + if (linkattr & FW_PORT_CAP32_FEC_BASER_RS) + fec |= FEC_BASER_RS; lc->fec = fec; + + if (mod_changed != NULL) + *mod_changed = false; + if (link_changed != NULL) + *link_changed = false; + if (old_ptype != pi->port_type || old_mtype != pi->mod_type || + old_lc.supported != lc->supported) { + if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { + lc->fec_hint = lc->advertising & + V_FW_PORT_CAP32_FEC(M_FW_PORT_CAP32_FEC); + } + if (mod_changed != NULL) + *mod_changed = true; + } + if (old_lc.link_ok != lc->link_ok || old_lc.speed != lc->speed || + old_lc.fec != lc->fec || old_lc.fc != lc->fc) { + if (link_changed != NULL) + *link_changed = true; + } } /** @@ -7798,22 +8033,24 @@ static void handle_port_info(struct port_info *pi, const struct fw_port_info *p) */ int t4_update_port_info(struct port_info *pi) { - struct fw_port_cmd port_cmd; + struct adapter *sc = pi->adapter; + struct fw_port_cmd cmd; + enum fw_port_action action; int ret; - memset(&port_cmd, 0, sizeof port_cmd); - port_cmd.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | - F_FW_CMD_REQUEST | F_FW_CMD_READ | - V_FW_PORT_CMD_PORTID(pi->tx_chan)); - port_cmd.action_to_len16 = cpu_to_be32( - V_FW_PORT_CMD_ACTION(FW_PORT_ACTION_GET_PORT_INFO) | - FW_LEN16(port_cmd)); - ret = t4_wr_mbox_ns(pi->adapter, pi->adapter->mbox, - &port_cmd, sizeof(port_cmd), &port_cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_portid = cpu_to_be32(V_FW_CMD_OP(FW_PORT_CMD) | + F_FW_CMD_REQUEST | F_FW_CMD_READ | + V_FW_PORT_CMD_PORTID(pi->tx_chan)); + action = sc->params.port_caps32 ? FW_PORT_ACTION_GET_PORT_INFO32 : + FW_PORT_ACTION_GET_PORT_INFO; + cmd.action_to_len16 = cpu_to_be32(V_FW_PORT_CMD_ACTION(action) | + FW_LEN16(cmd)); + ret = t4_wr_mbox_ns(sc, sc->mbox, &cmd, sizeof(cmd), &cmd); if (ret) return ret; - handle_port_info(pi, &port_cmd.u.info); + handle_port_info(pi, &cmd, action, NULL, NULL); return 0; } @@ -7828,15 +8065,18 @@ int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_port_cmd *p = (const void *)rpl; - unsigned int action = - G_FW_PORT_CMD_ACTION(be32_to_cpu(p->action_to_len16)); + enum fw_port_action action = + G_FW_PORT_CMD_ACTION(be32_to_cpu(p->action_to_len16)); + bool mod_changed, link_changed; - if (opcode == FW_PORT_CMD && action == FW_PORT_ACTION_GET_PORT_INFO) { + if (opcode == FW_PORT_CMD && + (action == FW_PORT_ACTION_GET_PORT_INFO || + action == FW_PORT_ACTION_GET_PORT_INFO32)) { /* link/module state change message */ - int i, old_ptype, old_mtype; + int i; int chan = G_FW_PORT_CMD_PORTID(be32_to_cpu(p->op_to_portid)); struct port_info *pi = NULL; - struct link_config *lc, *old_lc; + struct link_config *lc; for_each_port(adap, i) { pi = adap2pinfo(adap, i); @@ -7846,23 +8086,15 @@ int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) lc = &pi->link_cfg; PORT_LOCK(pi); - old_lc = &pi->old_link_cfg; - old_ptype = pi->port_type; - old_mtype = pi->mod_type; - handle_port_info(pi, &p->u.info); + handle_port_info(pi, p, action, &mod_changed, &link_changed); PORT_UNLOCK(pi); - if (old_ptype != pi->port_type || old_mtype != pi->mod_type) { + if (mod_changed) t4_os_portmod_changed(pi); - } - PORT_LOCK(pi); - if (old_lc->link_ok != lc->link_ok || - old_lc->speed != lc->speed || - old_lc->fec != lc->fec || - old_lc->fc != lc->fc) { + if (link_changed) { + PORT_LOCK(pi); t4_os_link_changed(pi); - *old_lc = *lc; + PORT_UNLOCK(pi); } - PORT_UNLOCK(pi); } else { CH_WARN_RATELIMIT(adap, "Unknown firmware reply %d\n", opcode); return -EINVAL; @@ -8595,6 +8827,11 @@ int t4_port_init(struct adapter *adap, int mbox, int pf, int vf, int port_id) } while ((adap->params.portvec & (1 << j)) == 0); } + p->tx_chan = j; + p->mps_bg_map = t4_get_mps_bg_map(adap, j); + p->rx_e_chan_map = t4_get_rx_e_chan_map(adap, j); + p->lport = j; + if (!(adap->flags & IS_VF) || adap->params.vfres.r_caps & FW_CMD_CAP_PORT) { t4_update_port_info(p); @@ -8609,10 +8846,6 @@ int t4_port_init(struct adapter *adap, int mbox, int pf, int vf, int port_id) p->vi[0].smt_idx = (ret & 0x7f) << 1; else p->vi[0].smt_idx = (ret & 0x7f); - p->tx_chan = j; - p->mps_bg_map = t4_get_mps_bg_map(adap, j); - p->rx_e_chan_map = t4_get_rx_e_chan_map(adap, j); - p->lport = j; p->vi[0].rss_size = rss_size; t4_os_set_hw_addr(p, addr); diff --git a/sys/dev/cxgbe/osdep.h b/sys/dev/cxgbe/osdep.h index 5cd3e88803ad..6eec287db1a7 100644 --- a/sys/dev/cxgbe/osdep.h +++ b/sys/dev/cxgbe/osdep.h @@ -108,6 +108,7 @@ typedef boolean_t bool; #define DUPLEX_HALF 0 #define DUPLEX_FULL 1 +#define AUTONEG_AUTO (-1) #define AUTONEG_DISABLE 0 #define AUTONEG_ENABLE 1 diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index d6ad6048973a..3ae560585897 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -390,18 +390,20 @@ static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* - * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). + * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. + * pause_autoneg = 1 means PAUSE will be negotiated if possible and the + * negotiated settings will override rx_pause/tx_pause. + * Otherwise rx_pause/tx_pause are applied forcibly. */ -static int t4_pause_settings = PAUSE_TX | PAUSE_RX; +static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG; TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); /* - * Forward Error Correction settings (bit 0, 1, 2 = FEC_RS, FEC_BASER_RS, - * FEC_RESERVED respectively). - * -1 to run with the firmware default. + * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively). + * -1 to run with the firmware default. Same as FEC_AUTO (bit 5) * 0 to disable FEC. */ static int t4_fec = -1; @@ -526,9 +528,11 @@ static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); -static void build_medialist(struct port_info *, struct ifmedia *); -static void init_l1cfg(struct port_info *); -static int apply_l1cfg(struct port_info *); +static bool fixed_ifmedia(struct port_info *); +static void build_medialist(struct port_info *); +static void init_link_config(struct port_info *); +static int fixup_link_config(struct port_info *); +static int apply_link_config(struct port_info *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static void quiesce_txq(struct adapter *, struct sge_txq *); @@ -1018,6 +1022,14 @@ t4_attach(device_t dev) ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); + PORT_LOCK(pi); + init_link_config(pi); + fixup_link_config(pi); + build_medialist(pi); + if (fixed_ifmedia(pi)) + pi->flags |= FIXED_IFMEDIA; + PORT_UNLOCK(pi); + pi->dev = device_add_child(dev, sc->names->ifnet_name, -1); if (pi->dev == NULL) { device_printf(dev, @@ -1885,7 +1897,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ - if (__predict_false(pi->link_cfg.link_ok == 0)) { + if (__predict_false(pi->link_cfg.link_ok == false)) { m_freem(m); return (ENETDOWN); } @@ -2063,8 +2075,8 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c) } /* - * The kernel picks a media from the list we had provided so we do not have to - * validate the request. + * The kernel picks a media from the list we had provided but we still validate + * the requeste. */ int cxgbe_media_change(struct ifnet *ifp) @@ -2081,8 +2093,14 @@ cxgbe_media_change(struct ifnet *ifp) return (rc); PORT_LOCK(pi); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) { - MPASS(lc->supported & FW_PORT_CAP_ANEG); + /* ifconfig .. media autoselect */ + if (!(lc->supported & FW_PORT_CAP32_ANEG)) { + rc = ENOTSUP; /* AN not supported by transceiver */ + goto done; + } lc->requested_aneg = AUTONEG_ENABLE; + lc->requested_speed = 0; + lc->requested_fc |= PAUSE_AUTONEG; } else { lc->requested_aneg = AUTONEG_DISABLE; lc->requested_speed = @@ -2093,47 +2111,25 @@ cxgbe_media_change(struct ifnet *ifp) if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) lc->requested_fc |= PAUSE_TX; } - if (pi->up_vis > 0) - rc = apply_l1cfg(pi); + if (pi->up_vis > 0) { + fixup_link_config(pi); + rc = apply_link_config(pi); + } +done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } -/* - * Mbps to FW_PORT_CAP_SPEED_* bit. - */ -static uint16_t -speed_to_fwspeed(int speed) -{ - - switch (speed) { - case 100000: - return (FW_PORT_CAP_SPEED_100G); - case 40000: - return (FW_PORT_CAP_SPEED_40G); - case 25000: - return (FW_PORT_CAP_SPEED_25G); - case 10000: - return (FW_PORT_CAP_SPEED_10G); - case 1000: - return (FW_PORT_CAP_SPEED_1G); - case 100: - return (FW_PORT_CAP_SPEED_100M); - } - - return (0); -} - /* * Base media word (without ETHER, pause, link active, etc.) for the port at the * given speed. */ static int -port_mword(struct port_info *pi, uint16_t speed) +port_mword(struct port_info *pi, uint32_t speed) { - MPASS(speed & M_FW_PORT_CAP_SPEED); + MPASS(speed & M_FW_PORT_CAP32_SPEED); MPASS(powerof2(speed)); switch(pi->port_type) { @@ -2142,24 +2138,24 @@ port_mword(struct port_info *pi, uint16_t speed) case FW_PORT_TYPE_BT_XAUI: /* BaseT */ switch (speed) { - case FW_PORT_CAP_SPEED_100M: + case FW_PORT_CAP32_SPEED_100M: return (IFM_100_T); - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_T); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_T); } break; case FW_PORT_TYPE_KX4: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_KX4); break; case FW_PORT_TYPE_CX4: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_CX4); break; case FW_PORT_TYPE_KX: - if (speed == FW_PORT_CAP_SPEED_1G) + if (speed == FW_PORT_CAP32_SPEED_1G) return (IFM_1000_KX); break; case FW_PORT_TYPE_KR: @@ -2170,15 +2166,17 @@ port_mword(struct port_info *pi, uint16_t speed) case FW_PORT_TYPE_KR_SFP28: case FW_PORT_TYPE_KR_XLAUI: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_KX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_KR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_KR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_KR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_KR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_KR4); } break; @@ -2196,53 +2194,59 @@ port_mword(struct port_info *pi, uint16_t speed) switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_LX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_LR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_LR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_LR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_LR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_LR4); } break; case FW_PORT_MOD_TYPE_SR: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_SX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_SR); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_SR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_SR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_SR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_SR4); } break; case FW_PORT_MOD_TYPE_ER: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_ER); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: switch (speed) { - case FW_PORT_CAP_SPEED_1G: + case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_CX); - case FW_PORT_CAP_SPEED_10G: + case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_TWINAX); - case FW_PORT_CAP_SPEED_25G: + case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_CR); - case FW_PORT_CAP_SPEED_40G: + case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_CR4); - case FW_PORT_CAP_SPEED_100G: + case FW_PORT_CAP32_SPEED_50G: + return (IFM_50G_CR2); + case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_CR4); } break; case FW_PORT_MOD_TYPE_LRM: - if (speed == FW_PORT_CAP_SPEED_10G) + if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_NA: @@ -2284,12 +2288,12 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); - build_medialist(pi, &pi->media); + build_medialist(pi); } /* ifm_status */ ifmr->ifm_status = IFM_AVALID; - if (lc->link_ok == 0) + if (lc->link_ok == false) goto done; ifmr->ifm_status |= IFM_ACTIVE; @@ -2300,7 +2304,7 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (lc->fc & PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; - ifmr->ifm_active |= port_mword(pi, speed_to_fwspeed(lc->speed)); + ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed)); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); @@ -4135,6 +4139,12 @@ set_params__post_init(struct adapter *sc) val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); + /* Enable 32b port caps if the firmware supports it. */ + param = FW_PARAM_PFVF(PORT_CAPS32); + val = 1; + if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val) == 0) + sc->params.port_caps32 = 1; + #ifdef TCP_OFFLOAD /* * Override the TOE timers with user provided tunables. This is not the @@ -4217,22 +4227,30 @@ ifmedia_add4(struct ifmedia *ifm, int m) ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL); } +/* + * This is the selected media, which is not quite the same as the active media. + * The media line in ifconfig is "media: Ethernet selected (active)" if selected + * and active are not the same, and "media: Ethernet selected" otherwise. + */ static void -set_current_media(struct port_info *pi, struct ifmedia *ifm) +set_current_media(struct port_info *pi) { struct link_config *lc; + struct ifmedia *ifm; int mword; + u_int speed; PORT_LOCK_ASSERT_OWNED(pi); /* Leave current media alone if it's already set to IFM_NONE. */ + ifm = &pi->media; if (ifm->ifm_cur != NULL && IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE) return; lc = &pi->link_cfg; - if (lc->requested_aneg == AUTONEG_ENABLE && - lc->supported & FW_PORT_CAP_ANEG) { + if (lc->requested_aneg != AUTONEG_DISABLE && + lc->supported & FW_PORT_CAP32_ANEG) { ifmedia_set(ifm, IFM_ETHER | IFM_AUTO); return; } @@ -4241,16 +4259,42 @@ set_current_media(struct port_info *pi, struct ifmedia *ifm) mword |= IFM_ETH_TXPAUSE; if (lc->requested_fc & PAUSE_RX) mword |= IFM_ETH_RXPAUSE; - mword |= port_mword(pi, speed_to_fwspeed(lc->requested_speed)); + if (lc->requested_speed == 0) + speed = port_top_speed(pi) * 1000; /* Gbps -> Mbps */ + else + speed = lc->requested_speed; + mword |= port_mword(pi, speed_to_fwcap(speed)); ifmedia_set(ifm, mword); } +/* + * Returns true if the ifmedia list for the port cannot change. + */ +static bool +fixed_ifmedia(struct port_info *pi) +{ + + return (pi->port_type == FW_PORT_TYPE_BT_SGMII || + pi->port_type == FW_PORT_TYPE_BT_XFI || + pi->port_type == FW_PORT_TYPE_BT_XAUI || + pi->port_type == FW_PORT_TYPE_KX4 || + pi->port_type == FW_PORT_TYPE_KX || + pi->port_type == FW_PORT_TYPE_KR || + pi->port_type == FW_PORT_TYPE_BP_AP || + pi->port_type == FW_PORT_TYPE_BP4_AP || + pi->port_type == FW_PORT_TYPE_BP40_BA || + pi->port_type == FW_PORT_TYPE_KR4_100G || + pi->port_type == FW_PORT_TYPE_KR_SFP28 || + pi->port_type == FW_PORT_TYPE_KR_XLAUI); +} + static void -build_medialist(struct port_info *pi, struct ifmedia *ifm) +build_medialist(struct port_info *pi) { - uint16_t ss, speed; + uint32_t ss, speed; int unknown, mword, bit; struct link_config *lc; + struct ifmedia *ifm; PORT_LOCK_ASSERT_OWNED(pi); @@ -4258,18 +4302,12 @@ build_medialist(struct port_info *pi, struct ifmedia *ifm) return; /* - * First setup all the requested_ fields so that they comply with what's - * supported by the port + transceiver. Note that this clobbers any - * user preferences set via sysctl_pause_settings or sysctl_autoneg. - */ - init_l1cfg(pi); - - /* - * Now (re)build the ifmedia list. + * Rebuild the ifmedia list. */ + ifm = &pi->media; ifmedia_removeall(ifm); lc = &pi->link_cfg; - ss = G_FW_PORT_CAP_SPEED(lc->supported); /* Supported Speeds */ + ss = G_FW_PORT_CAP32_SPEED(lc->supported); /* Supported Speeds */ if (__predict_false(ss == 0)) { /* not supposed to happen. */ MPASS(ss != 0); no_media: @@ -4280,9 +4318,9 @@ no_media: } unknown = 0; - for (bit = 0; bit < fls(ss); bit++) { + for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) { speed = 1 << bit; - MPASS(speed & M_FW_PORT_CAP_SPEED); + MPASS(speed & M_FW_PORT_CAP32_SPEED); if (ss & speed) { mword = port_mword(pi, speed); if (mword == IFM_NONE) { @@ -4295,86 +4333,134 @@ no_media: } if (unknown > 0) /* Add one unknown for all unknown media types. */ ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN); - if (lc->supported & FW_PORT_CAP_ANEG) + if (lc->supported & FW_PORT_CAP32_ANEG) ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL); - set_current_media(pi, ifm); + set_current_media(pi); } /* - * Update all the requested_* fields in the link config to something valid (and - * reasonable). + * Initialize the requested fields in the link config based on driver tunables. */ static void -init_l1cfg(struct port_info *pi) +init_link_config(struct port_info *pi) { struct link_config *lc = &pi->link_cfg; PORT_LOCK_ASSERT_OWNED(pi); - /* Gbps -> Mbps */ - lc->requested_speed = port_top_speed(pi) * 1000; + lc->requested_speed = 0; - if (t4_autoneg != 0 && lc->supported & FW_PORT_CAP_ANEG) { - lc->requested_aneg = AUTONEG_ENABLE; - } else { + if (t4_autoneg == 0) lc->requested_aneg = AUTONEG_DISABLE; + else if (t4_autoneg == 1) + lc->requested_aneg = AUTONEG_ENABLE; + else + lc->requested_aneg = AUTONEG_AUTO; + + lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX | + PAUSE_AUTONEG); + + if (t4_fec == -1 || t4_fec & FEC_AUTO) + lc->requested_fec = FEC_AUTO; + else { + lc->requested_fec = FEC_NONE; + if (t4_fec & FEC_RS) + lc->requested_fec |= FEC_RS; + if (t4_fec & FEC_BASER_RS) + lc->requested_fec |= FEC_BASER_RS; } +} - lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX); +/* + * Makes sure that all requested settings comply with what's supported by the + * port. Returns the number of settings that were invalid and had to be fixed. + */ +static int +fixup_link_config(struct port_info *pi) +{ + int n = 0; + struct link_config *lc = &pi->link_cfg; + uint32_t fwspeed; - if (t4_fec != -1) { - if (t4_fec & FEC_RS && lc->supported & FW_PORT_CAP_FEC_RS) { - lc->requested_fec = FEC_RS; - } else if (t4_fec & FEC_BASER_RS && - lc->supported & FW_PORT_CAP_FEC_BASER_RS) { - lc->requested_fec = FEC_BASER_RS; - } else { - lc->requested_fec = 0; - } - } else { - /* Use the suggested value provided by the firmware in acaps */ - if (lc->advertising & FW_PORT_CAP_FEC_RS && - lc->supported & FW_PORT_CAP_FEC_RS) { - lc->requested_fec = FEC_RS; - } else if (lc->advertising & FW_PORT_CAP_FEC_BASER_RS && - lc->supported & FW_PORT_CAP_FEC_BASER_RS) { - lc->requested_fec = FEC_BASER_RS; - } else { - lc->requested_fec = 0; + PORT_LOCK_ASSERT_OWNED(pi); + + /* Speed (when not autonegotiating) */ + if (lc->requested_speed != 0) { + fwspeed = speed_to_fwcap(lc->requested_speed); + if ((fwspeed & lc->supported) == 0) { + n++; + lc->requested_speed = 0; } } + + /* Link autonegotiation */ + MPASS(lc->requested_aneg == AUTONEG_ENABLE || + lc->requested_aneg == AUTONEG_DISABLE || + lc->requested_aneg == AUTONEG_AUTO); + if (lc->requested_aneg == AUTONEG_ENABLE && + !(lc->supported & FW_PORT_CAP32_ANEG)) { + n++; + lc->requested_aneg = AUTONEG_AUTO; + } + + /* Flow control */ + MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0); + if (lc->requested_fc & PAUSE_TX && + !(lc->supported & FW_PORT_CAP32_FC_TX)) { + n++; + lc->requested_fc &= ~PAUSE_TX; + } + if (lc->requested_fc & PAUSE_RX && + !(lc->supported & FW_PORT_CAP32_FC_RX)) { + n++; + lc->requested_fc &= ~PAUSE_RX; + } + if (!(lc->requested_fc & PAUSE_AUTONEG) && + !(lc->supported & FW_PORT_CAP32_FORCE_PAUSE)) { + n++; + lc->requested_fc |= PAUSE_AUTONEG; + } + + /* FEC */ + if ((lc->requested_fec & FEC_RS && + !(lc->supported & FW_PORT_CAP32_FEC_RS)) || + (lc->requested_fec & FEC_BASER_RS && + !(lc->supported & FW_PORT_CAP32_FEC_BASER_RS))) { + n++; + lc->requested_fec = FEC_AUTO; + } + + return (n); } /* - * Apply the settings in requested_* to the hardware. The parameters are - * expected to be sane. + * Apply the requested L1 settings, which are expected to be valid, to the + * hardware. */ static int -apply_l1cfg(struct port_info *pi) +apply_link_config(struct port_info *pi) { struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; -#ifdef INVARIANTS - uint16_t fwspeed; +#ifdef INVARIANTS ASSERT_SYNCHRONIZED_OP(sc); PORT_LOCK_ASSERT_OWNED(pi); if (lc->requested_aneg == AUTONEG_ENABLE) - MPASS(lc->supported & FW_PORT_CAP_ANEG); + MPASS(lc->supported & FW_PORT_CAP32_ANEG); + if (!(lc->requested_fc & PAUSE_AUTONEG)) + MPASS(lc->supported & FW_PORT_CAP32_FORCE_PAUSE); if (lc->requested_fc & PAUSE_TX) - MPASS(lc->supported & FW_PORT_CAP_FC_TX); + MPASS(lc->supported & FW_PORT_CAP32_FC_TX); if (lc->requested_fc & PAUSE_RX) - MPASS(lc->supported & FW_PORT_CAP_FC_RX); - if (lc->requested_fec == FEC_RS) - MPASS(lc->supported & FW_PORT_CAP_FEC_RS); - if (lc->requested_fec == FEC_BASER_RS) - MPASS(lc->supported & FW_PORT_CAP_FEC_BASER_RS); - fwspeed = speed_to_fwspeed(lc->requested_speed); - MPASS(fwspeed != 0); - MPASS(lc->supported & fwspeed); + MPASS(lc->supported & FW_PORT_CAP32_FC_RX); + if (lc->requested_fec & FEC_RS) + MPASS(lc->supported & FW_PORT_CAP32_FEC_RS); + if (lc->requested_fec & FEC_BASER_RS) + MPASS(lc->supported & FW_PORT_CAP32_FEC_BASER_RS); #endif rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); if (rc != 0) { @@ -4382,8 +4468,17 @@ apply_l1cfg(struct port_info *pi) if (!(sc->flags & IS_VF) || rc != FW_EPERM) device_printf(pi->dev, "l1cfg failed: %d\n", rc); } else { - lc->fc = lc->requested_fc; - lc->fec = lc->requested_fec; + /* + * An L1_CFG will almost always result in a link-change event if + * the link is up, and the driver will refresh the actual + * fec/fc/etc. when the notification is processed. If the link + * is down then the actual settings are meaningless. + * + * This takes care of the case where a change in the L1 settings + * may not result in a notification. + */ + if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG)) + lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX); } return (rc); } @@ -4637,9 +4732,18 @@ cxgbe_init_synchronized(struct vi_info *vi) if (rc) goto done; /* error message displayed already */ + PORT_LOCK(pi); + if (pi->up_vis == 0) { + t4_update_port_info(pi); + fixup_link_config(pi); + build_medialist(pi); + apply_link_config(pi); + } + rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); + PORT_UNLOCK(pi); goto done; } @@ -4666,12 +4770,7 @@ cxgbe_init_synchronized(struct vi_info *vi) } /* all ok */ - PORT_LOCK(pi); - if (pi->up_vis++ == 0) { - t4_update_port_info(pi); - build_medialist(pi, &pi->media); - apply_l1cfg(pi); - } + pi->up_vis++; ifp->if_drv_flags |= IFF_DRV_RUNNING; if (pi->nvi > 1 || sc->flags & IS_VF) @@ -4746,11 +4845,10 @@ cxgbe_uninit_synchronized(struct vi_info *vi) return (0); } - pi->link_cfg.link_ok = 0; + pi->link_cfg.link_ok = false; pi->link_cfg.speed = 0; pi->link_cfg.link_down_rc = 255; t4_os_link_changed(pi); - pi->old_link_cfg = pi->link_cfg; PORT_UNLOCK(pi); return (0); @@ -6514,7 +6612,7 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (req->newptr == NULL) { struct sbuf *sb; - static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; + static char *bits = "\20\1RX\2TX\3AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) @@ -6524,14 +6622,21 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); + if (lc->link_ok) { + sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) | + (lc->requested_fc & PAUSE_AUTONEG), bits); + } else { + sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX | + PAUSE_RX | PAUSE_AUTONEG), bits); + } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; - s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); + s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX | + PAUSE_AUTONEG)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); @@ -6543,7 +6648,7 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; - if (n & ~(PAUSE_TX | PAUSE_RX)) + if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, @@ -6551,15 +6656,11 @@ sysctl_pause_settings(SYSCTL_HANDLER_ARGS) if (rc) return (rc); PORT_LOCK(pi); - if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { - lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); - lc->requested_fc |= n; - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc == 0) { - lc->fc = lc->requested_fc; - set_current_media(pi, &pi->media); - } - } + lc->requested_fc = n; + fixup_link_config(pi); + if (pi->up_vis > 0) + rc = apply_link_config(pi); + set_current_media(pi); PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } @@ -6574,10 +6675,11 @@ sysctl_fec(SYSCTL_HANDLER_ARGS) struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; + int8_t old; if (req->newptr == NULL) { struct sbuf *sb; - static char *bits = "\20\1RS\2BASER_RS\3RESERVED"; + static char *bits = "\20\1RS\2BASE-R\3RSVD1\4RSVD2\5RSVD3\6AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) @@ -6587,43 +6689,68 @@ sysctl_fec(SYSCTL_HANDLER_ARGS) if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "%b", lc->fec & M_FW_PORT_CAP_FEC, bits); + /* + * Display the requested_fec when the link is down -- the actual + * FEC makes sense only when the link is up. + */ + if (lc->link_ok) { + sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) | + (lc->requested_fec & FEC_AUTO), bits); + } else { + sbuf_printf(sb, "%b", lc->requested_fec, bits); + } rc = sbuf_finish(sb); sbuf_delete(sb); } else { - char s[2]; + char s[3]; int n; - s[0] = '0' + (lc->requested_fec & M_FW_PORT_CAP_FEC); - s[1] = 0; + snprintf(s, sizeof(s), "%d", + lc->requested_fec == FEC_AUTO ? -1 : + lc->requested_fec & M_FW_PORT_CAP32_FEC); rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); - if (s[1] != 0) - return (EINVAL); - if (s[0] < '0' || s[0] > '9') - return (EINVAL); /* not a number */ - n = s[0] - '0'; - if (n & ~M_FW_PORT_CAP_FEC) - return (EINVAL); /* some other bit is set too */ - if (!powerof2(n)) - return (EINVAL); /* one bit can be set at most */ + n = strtol(&s[0], NULL, 0); + if (n < 0 || n & FEC_AUTO) + n = FEC_AUTO; + else { + if (n & ~M_FW_PORT_CAP32_FEC) + return (EINVAL);/* some other bit is set too */ + if (!powerof2(n)) + return (EINVAL);/* one bit can be set at most */ + } rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4fec"); if (rc) return (rc); PORT_LOCK(pi); - if ((lc->requested_fec & M_FW_PORT_CAP_FEC) != n) { - lc->requested_fec = n & - G_FW_PORT_CAP_FEC(lc->supported); - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc == 0) { - lc->fec = lc->requested_fec; + old = lc->requested_fec; + if (n == FEC_AUTO) + lc->requested_fec = FEC_AUTO; + else if (n == 0) + lc->requested_fec = FEC_NONE; + else { + if ((lc->supported | V_FW_PORT_CAP32_FEC(n)) != + lc->supported) { + rc = ENOTSUP; + goto done; + } + lc->requested_fec = n; + } + fixup_link_config(pi); + if (pi->up_vis > 0) { + rc = apply_link_config(pi); + if (rc != 0) { + lc->requested_fec = old; + if (rc == FW_EPROTO) + rc = ENOTSUP; } } +done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } @@ -6637,10 +6764,10 @@ sysctl_autoneg(SYSCTL_HANDLER_ARGS) struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; - int rc, val, old; + int rc, val; - if (lc->supported & FW_PORT_CAP_ANEG) - val = lc->requested_aneg == AUTONEG_ENABLE ? 1 : 0; + if (lc->supported & FW_PORT_CAP32_ANEG) + val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1; else val = -1; rc = sysctl_handle_int(oidp, &val, 0, req); @@ -6651,28 +6778,22 @@ sysctl_autoneg(SYSCTL_HANDLER_ARGS) else if (val == 1) val = AUTONEG_ENABLE; else - return (EINVAL); + val = AUTONEG_AUTO; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4aneg"); if (rc) return (rc); PORT_LOCK(pi); - if ((lc->supported & FW_PORT_CAP_ANEG) == 0) { + if (val == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; goto done; } - if (lc->requested_aneg == val) { - rc = 0; /* no change, do nothing. */ - goto done; - } - old = lc->requested_aneg; lc->requested_aneg = val; - rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); - if (rc != 0) - lc->requested_aneg = old; - else - set_current_media(pi, &pi->media); + fixup_link_config(pi); + if (pi->up_vis > 0) + rc = apply_link_config(pi); + set_current_media(pi); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); @@ -9409,13 +9530,17 @@ t4_os_portmod_changed(struct port_info *pi) NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; - MPASS((pi->flags & FIXED_IFMEDIA) == 0); + KASSERT((pi->flags & FIXED_IFMEDIA) == 0, + ("%s: port_type %u", __func__, pi->port_type)); vi = &pi->vi[0]; if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) { PORT_LOCK(pi); - build_medialist(pi, &pi->media); - apply_l1cfg(pi); + build_medialist(pi); + if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { + fixup_link_config(pi); + apply_link_config(pi); + } PORT_UNLOCK(pi); end_synchronized_op(sc, LOCK_HELD); } diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 4a6ac900962b..684606a8d5b5 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -634,7 +634,7 @@ write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, if (txalign > 0) { struct tcpcb *tp = intotcpcb(toep->inp); - if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi)) + if (plen < 2 * tp->t_maxseg) txwr->lsodisable_to_flags |= htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); else -- cgit v1.3 From af534f8d99b2d1b2cef5804bd4947c33646bf417 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 25 Sep 2018 17:58:06 +0000 Subject: zfs: depessimize zfs_root with rmlocks Currently vfs calls the root method on each absolute lookup and when crossing mount points. zfs_root ends up looking up the inode internally as if it was not instantianted which results in significant lock contention on systems like EPYC. Store the vnode in the mount point and protect the access with rmlocks. This is a temporary hack for 12.0. Sample result: before: make -s -j 128 buildkernel 2778.09s user 3319.45s system 8370% cpu 1:12.85 total after: make -s -j 128 buildkernel 3199.57s user 1772.78s system 8232% cpu 1:00.40 total Tested by: pho (zfs mount/unmount tests) Reviewed by: kib, mav, sef (different parts) Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17233 --- .../opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h | 2 + .../opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 66 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index 8cd6360e4cf5..bbfb79ce24d3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -46,6 +46,8 @@ struct zfsvfs { zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_root; /* id of root znode */ + struct vnode *z_rootvnode; /* root vnode */ + struct rmlock z_rootvnodelock;/* protection for root vnode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 8e421132b10e..1ab51ba77a1f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -65,6 +65,7 @@ #include #include #include +#include #include "zfs_comutil.h" @@ -92,6 +93,9 @@ static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); +static int zfs_root_setvnode(zfsvfs_t *zfsvfs); +static void zfs_root_dropvnode(zfsvfs_t *zfsvfs); + static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); @@ -1209,6 +1213,8 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + rm_init(&zfsvfs->z_rootvnodelock, "zfs root vnode lock"); + error = zfsvfs_init(zfsvfs, os); if (error != 0) { *zfvp = NULL; @@ -1315,6 +1321,8 @@ zfsvfs_free(zfsvfs_t *zfsvfs) rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); + rm_destroy(&zfsvfs->z_rootvnodelock); + zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); @@ -1921,6 +1929,8 @@ zfs_mount(vfs_t *vfsp) error = zfs_domount(vfsp, osname); PICKUP_GIANT(); + zfs_root_setvnode((zfsvfs_t *)vfsp->vfs_data); + #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't @@ -1992,15 +2002,66 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) return (0); } +static int +zfs_root_setvnode(zfsvfs_t *zfsvfs) +{ + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error != 0) + panic("could not zfs_zget for root vnode"); + ZFS_EXIT(zfsvfs); + + rm_wlock(&zfsvfs->z_rootvnodelock); + if (zfsvfs->z_rootvnode != NULL) + panic("zfs mount point already has a root vnode: %p\n", + zfsvfs->z_rootvnode); + zfsvfs->z_rootvnode = ZTOV(rootzp); + rm_wunlock(&zfsvfs->z_rootvnodelock); + return (0); +} + +static void +zfs_root_putvnode(zfsvfs_t *zfsvfs) +{ + struct vnode *vp; + + rm_wlock(&zfsvfs->z_rootvnodelock); + vp = zfsvfs->z_rootvnode; + zfsvfs->z_rootvnode = NULL; + rm_wunlock(&zfsvfs->z_rootvnodelock); + if (vp != NULL) + vrele(vp); +} + static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { + struct rm_priotracker tracker; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + rm_rlock(&zfsvfs->z_rootvnodelock, &tracker); + *vpp = zfsvfs->z_rootvnode; + if (*vpp != NULL && (((*vpp)->v_iflag & VI_DOOMED) == 0)) { + vrefact(*vpp); + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + goto lock; + } + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + /* + * We found the vnode but did not like it. + */ + if (*vpp != NULL) { + *vpp = NULL; + zfs_root_putvnode(zfsvfs); + } + + ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); @@ -2008,6 +2069,7 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) ZFS_EXIT(zfsvfs); if (error == 0) { +lock: error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); @@ -2126,6 +2188,8 @@ zfs_umount(vfs_t *vfsp, int fflag) cred_t *cr = td->td_ucred; int ret; + zfs_root_putvnode(zfsvfs); + ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), -- cgit v1.3 From cbe100dfca7b4d8412aaa90845980853771a8a24 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 25 Sep 2018 18:24:25 +0000 Subject: Fix an issue in r338862. For pmap_invalidate_all_pcid(), only reset pm_gen for non-kernel pmaps, as it was done before the conversion to ifuncs. The reset is useless but innocent for kernel_pmap. Coverity reported that cpuid is used uninitialized in this case. Reported by: cem Reviewed by: alc, cem, markj CID: 1395807 Sponsored by: The FreeBSD Foundation Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D17314 --- sys/amd64/amd64/pmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index ab8cc7d1b6f8..d91906a10508 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1966,10 +1966,10 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; - } - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; + CPU_FOREACH(i) { + if (cpuid != i) + pmap->pm_pcids[i].pm_gen = 0; + } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); -- cgit v1.3 From 05e1cca97a180df482a89e3bb382cf695cac31da Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 25 Sep 2018 20:07:58 +0000 Subject: Fix some uses of dmaplimit. dmaplimit is the first byte after the end of DMAP. Reported by: "Johnson, Archna" Reviewed by: alc, markj Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17318 --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d91906a10508..61dfd2607b40 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1422,7 +1422,7 @@ pmap_init(void) if (ppim->va == 0) continue; /* Make the direct map consistent */ - if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { + if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } @@ -7055,7 +7055,7 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ - if (pa < dmaplimit && pa + size < dmaplimit) { + if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if (!pmap_change_attr(va, size, mode)) return ((void *)(va + offset)); -- cgit v1.3 From cb8dedc76ef2290559c0e9e95621bf3729b990a3 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Wed, 26 Sep 2018 02:27:37 +0000 Subject: cxgbe(4): Treat base/end of firmware parameters as signed integers when figuring out whether the range is valid or not. Approved by: re@ (rgrimes@) MFC after: 1 week Sponsored by: Chelsio Communications --- sys/dev/cxgbe/t4_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 3ae560585897..dc491fd1d7d3 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -3851,7 +3851,7 @@ get_params__post_init(struct adapter *sc) sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; - if (val[3] > val[2]) { + if ((int)val[3] > (int)val[2]) { sc->tids.ftid_base = val[2]; sc->tids.ftid_end = val[3]; sc->tids.nftids = val[3] - val[2] + 1; @@ -3986,7 +3986,7 @@ get_params__post_init(struct adapter *sc) "failed to query NIC parameters: %d.\n", rc); return (rc); } - if (val[1] > val[0]) { + if ((int)val[1] > (int)val[0]) { sc->tids.etid_base = val[0]; sc->tids.etid_end = val[1]; sc->tids.netids = val[1] - val[0] + 1; @@ -4016,7 +4016,7 @@ get_params__post_init(struct adapter *sc) sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); - if (val[2] > val[1]) { + if ((int)val[2] > (int)val[1]) { sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; } -- cgit v1.3 From 0277ec9c43aef2eeb5af9f1f8814001bb512ea0e Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Wed, 26 Sep 2018 10:24:50 +0000 Subject: Whitespace changes and fixing a typo. No functional change. Approved by: re (kib@) MFC after: 1 week --- sys/netinet/sctp_input.c | 1 - sys/netinet/sctp_output.c | 2 +- sys/netinet/sctputil.c | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'sys') diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index b77abf4768ce..3a7d93833a0a 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -5669,7 +5669,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt vrf_id, port); goto out; } - } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index e7807b331629..a0b09ddd3336 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -7210,7 +7210,7 @@ one_more_time: if ((sp->msg_is_complete) && (sp->length == 0)) { if (sp->sender_all_done) { /* - * We are doing differed cleanup. Last time through + * We are doing defered cleanup. Last time through * when we took all the data the sender_all_done was * not set. */ diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index 12047ad91293..5fc57fe139ea 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -3700,7 +3700,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, return; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || (notification == SCTP_NOTIFY_INTERFACE_UP) || (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { @@ -7391,8 +7391,8 @@ sctp_set_state(struct sctp_tcb *stcb, int new_state) #endif KASSERT((new_state & ~SCTP_STATE_MASK) == 0, - ("sctp_set_state: Can't set substate (new_state = %x)", - new_state)); + ("sctp_set_state: Can't set substate (new_state = %x)", + new_state)); stcb->asoc.state = (stcb->asoc.state & ~SCTP_STATE_MASK) | new_state; if ((new_state == SCTP_STATE_SHUTDOWN_RECEIVED) || (new_state == SCTP_STATE_SHUTDOWN_SENT) || @@ -7402,7 +7402,7 @@ sctp_set_state(struct sctp_tcb *stcb, int new_state) #if defined(KDTRACE_HOOKS) if (((old_state & SCTP_STATE_MASK) != new_state) && !(((old_state & SCTP_STATE_MASK) == SCTP_STATE_EMPTY) && - (new_state == SCTP_STATE_INUSE))) { + (new_state == SCTP_STATE_INUSE))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif @@ -7416,14 +7416,14 @@ sctp_add_substate(struct sctp_tcb *stcb, int substate) #endif KASSERT((substate & SCTP_STATE_MASK) == 0, - ("sctp_add_substate: Can't set state (substate = %x)", - substate)); + ("sctp_add_substate: Can't set state (substate = %x)", + substate)); stcb->asoc.state |= substate; #if defined(KDTRACE_HOOKS) if (((substate & SCTP_STATE_ABOUT_TO_BE_FREED) && - ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || + ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || ((substate & SCTP_STATE_SHUTDOWN_PENDING) && - ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { + ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif -- cgit v1.3 From a4ea412d4ff4de61f72d3b4b8b4cefa62b991f53 Mon Sep 17 00:00:00 2001 From: Slava Shwartsman Date: Wed, 26 Sep 2018 13:16:55 +0000 Subject: Add PCIV_INVALID definition From PCI Spec rev 2.2, 6.2.1. Device Identification: Vendor ID This field identifies the manufacturer of the device. Valid vendor identifiers are allocated by the PCI SIG to ensure uniqueness. 0FFFFh is an invalid value for Vendor ID. MFC after: 3 days Approved by: re (Glen), hselasky (mentor), kib (mentor) Sponsored by: Mellanox Technologies --- sys/dev/pci/pcireg.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys') diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h index 00589c4b83da..edec95c8e67f 100644 --- a/sys/dev/pci/pcireg.h +++ b/sys/dev/pci/pcireg.h @@ -122,6 +122,9 @@ #define PCIM_MFDEV 0x80 #define PCIR_BIST 0x0f +/* PCI Spec rev 2.2: 0FFFFh is an invalid value for Vendor ID. */ +#define PCIV_INVALID 0xffff + /* Capability Register Offsets */ #define PCICAP_ID 0x0 -- cgit v1.3 From 0ddfd867ed9a3818266e786b0aa453d8544b0669 Mon Sep 17 00:00:00 2001 From: "Andrey V. Elsukov" Date: Wed, 26 Sep 2018 14:47:51 +0000 Subject: Fix witness warning in xform_init(). Do not call crypto_newsession() while holding xforms_lock mutex. Release mutex before invoking crypto_newsession(), and use ipsec_kmod_enter()/ipsec_kmod_exit() functions to protect from doing access to unloaded kernel module memory. Move xform-releated functions into subr_ipsec.c to be able use ipsec_kmod_* functions. Also unconditionally build ipsec_kmod_* functions, since now they are always used by IPSec code. Add xf_cntr field to struct xformsw, it is used by ipsec_kmod_* functions. Also constify xf_name field, since it is not expected to be modified. Approved by: re (kib) Differential Revision: https://reviews.freebsd.org/D17302 --- sys/netipsec/key.c | 81 +------------------------------------------- sys/netipsec/key.h | 3 ++ sys/netipsec/subr_ipsec.c | 86 ++++++++++++++++++++++++++++++++++++++++++----- sys/netipsec/xform.h | 7 ++-- 4 files changed, 87 insertions(+), 90 deletions(-) (limited to 'sys') diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 8f5004a0613c..1e027bf6076e 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -534,14 +534,6 @@ MALLOC_DEFINE(M_IPSEC_SPDCACHE, "ipsec-spdcache", "ipsec SPD cache"); VNET_DEFINE_STATIC(uma_zone_t, key_lft_zone); #define V_key_lft_zone VNET(key_lft_zone) -static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER(); -static struct mtx xforms_lock; -#define XFORMS_LOCK_INIT() \ - mtx_init(&xforms_lock, "xforms_list", "IPsec transforms list", MTX_DEF) -#define XFORMS_LOCK_DESTROY() mtx_destroy(&xforms_lock) -#define XFORMS_LOCK() mtx_lock(&xforms_lock) -#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock) - /* * set parameters into secpolicyindex buffer. * Must allocate secpolicyindex buffer passed to this function. @@ -717,7 +709,6 @@ static int key_delete(struct socket *, struct mbuf *, const struct sadb_msghdr *); static int key_delete_all(struct socket *, struct mbuf *, const struct sadb_msghdr *, struct secasindex *); -static void key_delete_xform(const struct xformsw *); static int key_get(struct socket *, struct mbuf *, const struct sadb_msghdr *); @@ -750,7 +741,6 @@ static int key_validate_ext(const struct sadb_ext *, int); static int key_align(struct mbuf *, struct sadb_msghdr *); static struct mbuf *key_setlifetime(struct seclifetime *, uint16_t); static struct mbuf *key_setkey(struct seckey *, uint16_t); -static int xform_init(struct secasvar *, u_short); static void spdcache_init(void); static void spdcache_clear(void); @@ -6167,7 +6157,7 @@ key_delete_all(struct socket *so, struct mbuf *m, * Larval SAs have not initialized tdb_xform, so it is safe to leave them * here when xform disappears. */ -static void +void key_delete_xform(const struct xformsw *xsp) { struct secasvar_queue drainq; @@ -8335,7 +8325,6 @@ key_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; - XFORMS_LOCK_INIT(); SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); @@ -8458,7 +8447,6 @@ key_destroy(void) #ifndef IPSEC_DEBUG2 callout_drain(&key_timer); #endif - XFORMS_LOCK_DESTROY(); SPTREE_LOCK_DESTROY(); REGTREE_LOCK_DESTROY(); SAHTREE_LOCK_DESTROY(); @@ -8617,70 +8605,3 @@ comp_algorithm_lookup(int alg) return (NULL); } -/* - * Register a transform. - */ -static int -xform_register(struct xformsw* xsp) -{ - struct xformsw *entry; - - XFORMS_LOCK(); - LIST_FOREACH(entry, &xforms, chain) { - if (entry->xf_type == xsp->xf_type) { - XFORMS_UNLOCK(); - return (EEXIST); - } - } - LIST_INSERT_HEAD(&xforms, xsp, chain); - XFORMS_UNLOCK(); - return (0); -} - -void -xform_attach(void *data) -{ - struct xformsw *xsp = (struct xformsw *)data; - - if (xform_register(xsp) != 0) - printf("%s: failed to register %s xform\n", __func__, - xsp->xf_name); -} - -void -xform_detach(void *data) -{ - struct xformsw *xsp = (struct xformsw *)data; - - XFORMS_LOCK(); - LIST_REMOVE(xsp, chain); - XFORMS_UNLOCK(); - - /* Delete all SAs related to this xform. */ - key_delete_xform(xsp); -} - -/* - * Initialize transform support in an sav. - */ -static int -xform_init(struct secasvar *sav, u_short xftype) -{ - struct xformsw *entry; - int ret; - - IPSEC_ASSERT(sav->tdb_xform == NULL, - ("tdb_xform is already initialized")); - - ret = EINVAL; - XFORMS_LOCK(); - LIST_FOREACH(entry, &xforms, chain) { - if (entry->xf_type == xftype) { - ret = (*entry->xf_init)(sav, entry); - break; - } - } - XFORMS_UNLOCK(); - return (ret); -} - diff --git a/sys/netipsec/key.h b/sys/netipsec/key.h index 6c3e05c039e8..7d7ae69f379d 100644 --- a/sys/netipsec/key.h +++ b/sys/netipsec/key.h @@ -46,6 +46,7 @@ struct sadb_msg; struct sadb_x_policy; struct secasindex; union sockaddr_union; +struct xformsw; struct secpolicy *key_newsp(void); struct secpolicy *key_allocsp(struct secpolicyindex *, u_int); @@ -74,6 +75,8 @@ int key_sockaddrcmp_withmask(const struct sockaddr *, const struct sockaddr *, int key_register_ifnet(struct secpolicy **, u_int); void key_unregister_ifnet(struct secpolicy **, u_int); +void key_delete_xform(const struct xformsw *); + extern u_long key_random(void); extern void key_randomfill(void *, size_t); extern void key_freereg(struct socket *); diff --git a/sys/netipsec/subr_ipsec.c b/sys/netipsec/subr_ipsec.c index acfe66acf458..37b686a7e91e 100644 --- a/sys/netipsec/subr_ipsec.c +++ b/sys/netipsec/subr_ipsec.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include /* @@ -124,14 +125,6 @@ ipsec6_setsockaddrs(const struct mbuf *m, union sockaddr_union *src, } #endif -#ifdef IPSEC_SUPPORT -/* - * IPSEC_SUPPORT - loading of ipsec.ko and tcpmd5.ko is supported. - * IPSEC + IPSEC_SUPPORT - loading tcpmd5.ko is supported. - * IPSEC + TCP_SIGNATURE - all is build in the kernel, do not build - * IPSEC_SUPPORT. - */ -#if !defined(IPSEC) || !defined(TCP_SIGNATURE) #define IPSEC_MODULE_INCR 2 static int ipsec_kmod_enter(volatile u_int *cntr) @@ -171,6 +164,83 @@ ipsec_kmod_drain(volatile u_int *cntr) pause("ipsecd", hz/2); } +static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER(); +static struct mtx xforms_lock; +MTX_SYSINIT(xfroms_list, &xforms_lock, "IPsec transforms list", MTX_DEF); +#define XFORMS_LOCK() mtx_lock(&xforms_lock) +#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock) + +void +xform_attach(void *data) +{ + struct xformsw *xsp, *entry; + + xsp = (struct xformsw *)data; + XFORMS_LOCK(); + LIST_FOREACH(entry, &xforms, chain) { + if (entry->xf_type == xsp->xf_type) { + XFORMS_UNLOCK(); + printf("%s: failed to register %s xform\n", + __func__, xsp->xf_name); + return; + } + } + LIST_INSERT_HEAD(&xforms, xsp, chain); + xsp->xf_cntr = IPSEC_MODULE_ENABLED; + XFORMS_UNLOCK(); +} + +void +xform_detach(void *data) +{ + struct xformsw *xsp = (struct xformsw *)data; + + XFORMS_LOCK(); + LIST_REMOVE(xsp, chain); + XFORMS_UNLOCK(); + + /* Delete all SAs related to this xform. */ + key_delete_xform(xsp); + if (xsp->xf_cntr & IPSEC_MODULE_ENABLED) + ipsec_kmod_drain(&xsp->xf_cntr); +} + +/* + * Initialize transform support in an sav. + */ +int +xform_init(struct secasvar *sav, u_short xftype) +{ + struct xformsw *entry; + int ret; + + IPSEC_ASSERT(sav->tdb_xform == NULL, + ("tdb_xform is already initialized")); + + XFORMS_LOCK(); + LIST_FOREACH(entry, &xforms, chain) { + if (entry->xf_type == xftype) { + ret = ipsec_kmod_enter(&entry->xf_cntr); + XFORMS_UNLOCK(); + if (ret != 0) + return (ret); + ret = (*entry->xf_init)(sav, entry); + ipsec_kmod_exit(&entry->xf_cntr); + return (ret); + } + } + XFORMS_UNLOCK(); + return (EINVAL); +} + +#ifdef IPSEC_SUPPORT +/* + * IPSEC_SUPPORT - loading of ipsec.ko and tcpmd5.ko is supported. + * IPSEC + IPSEC_SUPPORT - loading tcpmd5.ko is supported. + * IPSEC + TCP_SIGNATURE - all is build in the kernel, do not build + * IPSEC_SUPPORT. + */ +#if !defined(IPSEC) || !defined(TCP_SIGNATURE) #define METHOD_DECL(...) __VA_ARGS__ #define METHOD_ARGS(...) __VA_ARGS__ #define IPSEC_KMOD_METHOD(type, name, sc, method, decl, args) \ diff --git a/sys/netipsec/xform.h b/sys/netipsec/xform.h index 389d0b66850b..910a88a706f3 100644 --- a/sys/netipsec/xform.h +++ b/sys/netipsec/xform.h @@ -86,14 +86,16 @@ struct xform_data { #define XF_IPCOMP 6 /* IPCOMP */ struct xformsw { - u_short xf_type; /* xform ID */ - char *xf_name; /* human-readable name */ + u_short xf_type; /* xform ID */ + const char *xf_name; /* human-readable name */ int (*xf_init)(struct secasvar*, struct xformsw*); /* setup */ int (*xf_zeroize)(struct secasvar*); /* cleanup */ int (*xf_input)(struct mbuf*, struct secasvar*, /* input */ int, int); int (*xf_output)(struct mbuf*, /* output */ struct secpolicy *, struct secasvar *, u_int, int, int); + + volatile u_int xf_cntr; LIST_ENTRY(xformsw) chain; }; @@ -103,6 +105,7 @@ const struct comp_algo * comp_algorithm_lookup(int); void xform_attach(void *); void xform_detach(void *); +int xform_init(struct secasvar *, u_short); struct cryptoini; /* XF_AH */ -- cgit v1.3 From 329e817fcc97aa847765c5171cc89a81a0b25527 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 26 Sep 2018 17:12:14 +0000 Subject: Reapply, with minor tweaks, r338025, from the original commit: Remove unused and easy to misuse PNP macro parameter Inspired by r338025, just remove the element size parameter to the MODULE_PNP_INFO macro entirely. The 'table' parameter is now required to have correct pointer (or array) type. Since all invocations of the macro already had this property and the emitted PNP data continues to include the element size, there is no functional change. Mostly done with the coccinelle 'spatch' tool: $ cat modpnpsize0.cocci @normaltables@ identifier b,c; expression a,d,e; declarer MODULE_PNP_INFO; @@ MODULE_PNP_INFO(a,b,c,d, -sizeof(d[0]), e); @singletons@ identifier b,c,d; expression a; declarer MODULE_PNP_INFO; @@ MODULE_PNP_INFO(a,b,c,&d, -sizeof(d), 1); $ rg -l MODULE_PNP_INFO -- sys | \ xargs spatch --in-place --sp-file modpnpsize0.cocci (Note that coccinelle invokes diff(1) via a PATH search and expects diff to tolerate the -B flag, which BSD diff does not. So I had to link gdiff into PATH as diff to use spatch.) Tinderbox'd (-DMAKE_JUST_KERNELS). Approved by: re (glen) --- sys/crypto/ccp/ccp.c | 2 +- sys/dev/aac/aac_pci.c | 2 +- sys/dev/aacraid/aacraid_pci.c | 2 +- sys/dev/adlink/adlink.c | 2 +- sys/dev/ae/if_ae.c | 2 +- sys/dev/age/if_age.c | 2 +- sys/dev/ahci/ahci_pci.c | 2 +- sys/dev/alc/if_alc.c | 2 +- sys/dev/ale/if_ale.c | 2 +- sys/dev/amdsmn/amdsmn.c | 2 +- sys/dev/amdtemp/amdtemp.c | 2 +- sys/dev/amr/amr_pci.c | 2 +- sys/dev/an/if_an_pci.c | 2 +- sys/dev/bce/if_bce.c | 2 +- sys/dev/bfe/if_bfe.c | 2 +- sys/dev/bge/if_bge.c | 2 +- sys/dev/bwi/if_bwi_pci.c | 2 +- sys/dev/bwn/if_bwn_pci.c | 4 ++-- sys/dev/cas/if_cas.c | 2 +- sys/dev/ciss/ciss.c | 2 +- sys/dev/dc/if_dc.c | 2 +- sys/dev/drm2/i915/i915_drv.c | 2 +- sys/dev/drm2/radeon/radeon_drv.c | 2 +- sys/dev/ed/if_ed_pci.c | 2 +- sys/dev/ena/ena.c | 2 +- sys/dev/et/if_et.c | 2 +- sys/dev/fxp/if_fxp.c | 2 +- sys/dev/gem/if_gem_pci.c | 2 +- sys/dev/intpm/intpm.c | 2 +- sys/dev/ioat/ioat.c | 2 +- sys/dev/ipw/if_ipw.c | 2 +- sys/dev/ixgbe/if_ix.c | 2 +- sys/dev/ixgbe/if_ixv.c | 2 +- sys/dev/ncr/ncr.c | 2 +- sys/dev/ntb/ntb_hw/ntb_hw_intel.c | 2 +- sys/dev/ofw/ofw_bus_subr.h | 2 +- sys/dev/pccard/pccardvar.h | 2 +- sys/dev/pci/pcivar.h | 2 +- sys/dev/puc/puc_pci.c | 2 +- sys/dev/spibus/spi.h | 2 +- sys/dev/uart/uart_bus_pccard.c | 2 +- sys/dev/usb/usbdi.h | 6 +++--- sys/dev/xl/if_xl.c | 2 +- sys/isa/isavar.h | 2 +- sys/net/iflib.h | 2 +- sys/sys/module.h | 4 ++-- 46 files changed, 50 insertions(+), 50 deletions(-) (limited to 'sys') diff --git a/sys/crypto/ccp/ccp.c b/sys/crypto/ccp/ccp.c index 163fa748b2ef..1983eac79444 100644 --- a/sys/crypto/ccp/ccp.c +++ b/sys/crypto/ccp/ccp.c @@ -735,7 +735,7 @@ MODULE_VERSION(ccp, 1); MODULE_DEPEND(ccp, crypto, 1, 1, 1); MODULE_DEPEND(ccp, random_device, 1, 1, 1); #if 0 /* There are enough known issues that we shouldn't load automatically */ -MODULE_PNP_INFO("W32:vendor/device", pci, ccp, ccp_ids, sizeof(ccp_ids[0]), +MODULE_PNP_INFO("W32:vendor/device", pci, ccp, ccp_ids, nitems(ccp_ids)); #endif diff --git a/sys/dev/aac/aac_pci.c b/sys/dev/aac/aac_pci.c index 399e55a9a47f..738bfbf79167 100644 --- a/sys/dev/aac/aac_pci.c +++ b/sys/dev/aac/aac_pci.c @@ -494,7 +494,7 @@ static driver_t aacch_driver = { static devclass_t aacch_devclass; DRIVER_MODULE(aacch, pci, aacch_driver, aacch_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;", pci, aac, - aac_identifiers, sizeof(aac_identifiers[0]), nitems(aac_identifiers) - 1); + aac_identifiers, nitems(aac_identifiers) - 1); static int aacch_probe(device_t dev) diff --git a/sys/dev/aacraid/aacraid_pci.c b/sys/dev/aacraid/aacraid_pci.c index 74ac0ee23312..f445a073b6d5 100644 --- a/sys/dev/aacraid/aacraid_pci.c +++ b/sys/dev/aacraid/aacraid_pci.c @@ -106,7 +106,7 @@ struct aac_ident DRIVER_MODULE(aacraid, pci, aacraid_pci_driver, aacraid_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, aacraid, - aacraid_family_identifiers, sizeof(aacraid_family_identifiers[0]), + aacraid_family_identifiers, nitems(aacraid_family_identifiers) - 1); MODULE_DEPEND(aacraid, pci, 1, 1, 1); diff --git a/sys/dev/adlink/adlink.c b/sys/dev/adlink/adlink.c index 2ae3d768c2fd..19b2f9388c6f 100644 --- a/sys/dev/adlink/adlink.c +++ b/sys/dev/adlink/adlink.c @@ -438,6 +438,6 @@ static driver_t adlink_driver = { }; DRIVER_MODULE(adlink, pci, adlink_driver, adlink_devclass, 0, 0); -MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, adlink, adlink_id, sizeof(adlink_id[0]), +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, adlink, adlink_id, nitems(adlink_id)); #endif /* _KERNEL */ diff --git a/sys/dev/ae/if_ae.c b/sys/dev/ae/if_ae.c index f70afb27fa56..cb9474db8553 100644 --- a/sys/dev/ae/if_ae.c +++ b/sys/dev/ae/if_ae.c @@ -178,7 +178,7 @@ static devclass_t ae_devclass; DRIVER_MODULE(ae, pci, ae_driver, ae_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ae, ae_devs, - sizeof(ae_devs[0]), nitems(ae_devs)); + nitems(ae_devs)); DRIVER_MODULE(miibus, ae, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(ae, pci, 1, 1, 1); MODULE_DEPEND(ae, ether, 1, 1, 1); diff --git a/sys/dev/age/if_age.c b/sys/dev/age/if_age.c index 764363f19856..ee52564d7cc8 100644 --- a/sys/dev/age/if_age.c +++ b/sys/dev/age/if_age.c @@ -184,7 +184,7 @@ static devclass_t age_devclass; DRIVER_MODULE(age, pci, age_driver, age_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, age, age_devs, - sizeof(age_devs[0]), nitems(age_devs)); + nitems(age_devs)); DRIVER_MODULE(miibus, age, miibus_driver, miibus_devclass, 0, 0); static struct resource_spec age_res_spec_mem[] = { diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c index a117a5720c86..2bc2a243837b 100644 --- a/sys/dev/ahci/ahci_pci.c +++ b/sys/dev/ahci/ahci_pci.c @@ -667,7 +667,7 @@ static driver_t ahci_driver = { DRIVER_MODULE(ahci, pci, ahci_driver, ahci_devclass, NULL, NULL); /* Also matches class / subclass / progid XXX need to add when we have masking support */ MODULE_PNP_INFO("W32:vendor/device", pci, ahci, ahci_ids, - sizeof(ahci_ids[0]), nitems(ahci_ids) - 1); + nitems(ahci_ids) - 1); static device_method_t ahci_ata_methods[] = { DEVMETHOD(device_probe, ahci_ata_probe), DEVMETHOD(device_attach, ahci_pci_attach), diff --git a/sys/dev/alc/if_alc.c b/sys/dev/alc/if_alc.c index ede7bd49c193..1bafdab235d0 100644 --- a/sys/dev/alc/if_alc.c +++ b/sys/dev/alc/if_alc.c @@ -244,7 +244,7 @@ static devclass_t alc_devclass; DRIVER_MODULE(alc, pci, alc_driver, alc_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, alc, alc_ident_table, - sizeof(alc_ident_table[0]), nitems(alc_ident_table) - 1); + nitems(alc_ident_table) - 1); DRIVER_MODULE(miibus, alc, miibus_driver, miibus_devclass, 0, 0); static struct resource_spec alc_res_spec_mem[] = { diff --git a/sys/dev/ale/if_ale.c b/sys/dev/ale/if_ale.c index 987bd2db7421..802fdafd63cf 100644 --- a/sys/dev/ale/if_ale.c +++ b/sys/dev/ale/if_ale.c @@ -179,7 +179,7 @@ static devclass_t ale_devclass; DRIVER_MODULE(ale, pci, ale_driver, ale_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ale, ale_devs, - sizeof(ale_devs[0]), nitems(ale_devs)); + nitems(ale_devs)); DRIVER_MODULE(miibus, ale, miibus_driver, miibus_devclass, NULL, NULL); static struct resource_spec ale_res_spec_mem[] = { diff --git a/sys/dev/amdsmn/amdsmn.c b/sys/dev/amdsmn/amdsmn.c index fb2c8b17328c..17792dd922cd 100644 --- a/sys/dev/amdsmn/amdsmn.c +++ b/sys/dev/amdsmn/amdsmn.c @@ -90,7 +90,7 @@ static devclass_t amdsmn_devclass; DRIVER_MODULE(amdsmn, hostb, amdsmn_driver, amdsmn_devclass, NULL, NULL); MODULE_VERSION(amdsmn, 1); MODULE_PNP_INFO("W32:vendor/device", pci, amdsmn, amdsmn_ids, - sizeof(amdsmn_ids[0]), nitems(amdsmn_ids)); + nitems(amdsmn_ids)); static bool amdsmn_match(device_t parent) diff --git a/sys/dev/amdtemp/amdtemp.c b/sys/dev/amdtemp/amdtemp.c index 45f8d6397ed0..2463212c25f5 100644 --- a/sys/dev/amdtemp/amdtemp.c +++ b/sys/dev/amdtemp/amdtemp.c @@ -167,7 +167,7 @@ DRIVER_MODULE(amdtemp, hostb, amdtemp_driver, amdtemp_devclass, NULL, NULL); MODULE_VERSION(amdtemp, 1); MODULE_DEPEND(amdtemp, amdsmn, 1, 1, 1); MODULE_PNP_INFO("U16:vendor;U16:device", pci, amdtemp, amdtemp_products, - sizeof(amdtemp_products[0]), nitems(amdtemp_products)); + nitems(amdtemp_products)); static int amdtemp_match(device_t dev) diff --git a/sys/dev/amr/amr_pci.c b/sys/dev/amr/amr_pci.c index 80cd58b5ccd3..25b37eda3895 100644 --- a/sys/dev/amr/amr_pci.c +++ b/sys/dev/amr/amr_pci.c @@ -142,7 +142,7 @@ static struct amr_ident static devclass_t amr_devclass; DRIVER_MODULE(amr, pci, amr_pci_driver, amr_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, amr, amr_device_ids, - sizeof(amr_device_ids[0]), nitems(amr_device_ids) - 1); + nitems(amr_device_ids) - 1); MODULE_DEPEND(amr, pci, 1, 1, 1); MODULE_DEPEND(amr, cam, 1, 1, 1); diff --git a/sys/dev/an/if_an_pci.c b/sys/dev/an/if_an_pci.c index 1105a963128e..15ae3e320e54 100644 --- a/sys/dev/an/if_an_pci.c +++ b/sys/dev/an/if_an_pci.c @@ -274,6 +274,6 @@ static devclass_t an_devclass; DRIVER_MODULE(an, pci, an_pci_driver, an_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, an, - an_devs, sizeof(an_devs[0]), nitems(an_devs) - 1); + an_devs, nitems(an_devs) - 1); MODULE_DEPEND(an, pci, 1, 1, 1); MODULE_DEPEND(an, wlan, 1, 1, 1); diff --git a/sys/dev/bce/if_bce.c b/sys/dev/bce/if_bce.c index 37eed4abcbc0..3d5c0742580c 100644 --- a/sys/dev/bce/if_bce.c +++ b/sys/dev/bce/if_bce.c @@ -530,7 +530,7 @@ MODULE_DEPEND(bce, miibus, 1, 1, 1); DRIVER_MODULE(bce, pci, bce_driver, bce_devclass, NULL, NULL); DRIVER_MODULE(miibus, bce, miibus_driver, miibus_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;U16:#;U16:#;D:#", pci, bce, - bce_devs, sizeof(bce_devs[0]), nitems(bce_devs) - 1); + bce_devs, nitems(bce_devs) - 1); /****************************************************************************/ /* Tunable device values */ diff --git a/sys/dev/bfe/if_bfe.c b/sys/dev/bfe/if_bfe.c index fb0c2949c82d..6a86f3488b4c 100644 --- a/sys/dev/bfe/if_bfe.c +++ b/sys/dev/bfe/if_bfe.c @@ -158,7 +158,7 @@ static devclass_t bfe_devclass; DRIVER_MODULE(bfe, pci, bfe_driver, bfe_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bfe, bfe_devs, - sizeof(bfe_devs[0]), nitems(bfe_devs) - 1); + nitems(bfe_devs) - 1); DRIVER_MODULE(miibus, bfe, miibus_driver, miibus_devclass, 0, 0); /* diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c index c7134b9a2abd..193116e10681 100644 --- a/sys/dev/bge/if_bge.c +++ b/sys/dev/bge/if_bge.c @@ -548,7 +548,7 @@ static devclass_t bge_devclass; DRIVER_MODULE(bge, pci, bge_driver, bge_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, bge, bge_devs, - sizeof(bge_devs[0]), nitems(bge_devs) - 1); + nitems(bge_devs) - 1); DRIVER_MODULE(miibus, bge, miibus_driver, miibus_devclass, 0, 0); static int bge_allow_asf = 1; diff --git a/sys/dev/bwi/if_bwi_pci.c b/sys/dev/bwi/if_bwi_pci.c index 63378be0452d..f95ef854ceaa 100644 --- a/sys/dev/bwi/if_bwi_pci.c +++ b/sys/dev/bwi/if_bwi_pci.c @@ -257,7 +257,7 @@ static driver_t bwi_driver = { static devclass_t bwi_devclass; DRIVER_MODULE(bwi, pci, bwi_driver, bwi_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwi, bwi_devices, - sizeof(bwi_devices[0]), nitems(bwi_devices) - 1); + nitems(bwi_devices) - 1); MODULE_DEPEND(bwi, wlan, 1, 1, 1); /* 802.11 media layer */ MODULE_DEPEND(bwi, firmware, 1, 1, 1); /* firmware support */ MODULE_DEPEND(bwi, wlan_amrr, 1, 1, 1); diff --git a/sys/dev/bwn/if_bwn_pci.c b/sys/dev/bwn/if_bwn_pci.c index dfe7b50e7bc0..610f8bdeb823 100644 --- a/sys/dev/bwn/if_bwn_pci.c +++ b/sys/dev/bwn/if_bwn_pci.c @@ -296,9 +296,9 @@ DEFINE_CLASS_0(bwn_pci, bwn_pci_driver, bwn_pci_methods, DRIVER_MODULE_ORDERED(bwn_pci, pci, bwn_pci_driver, bwn_pci_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwn_siba, - siba_devices, sizeof(siba_devices[0]), nitems(siba_devices) - 1); + siba_devices, nitems(siba_devices) - 1); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, bwn_bcma, - bcma_devices, sizeof(bcma_devices[0]), nitems(bcma_devices) - 1); + bcma_devices, nitems(bcma_devices) - 1); DRIVER_MODULE(bhndb, bwn_pci, bhndb_pci_driver, bhndb_devclass, NULL, NULL); MODULE_DEPEND(bwn_pci, bwn, 1, 1, 1); diff --git a/sys/dev/cas/if_cas.c b/sys/dev/cas/if_cas.c index 3e6dbfe7b463..d1b3761302a1 100644 --- a/sys/dev/cas/if_cas.c +++ b/sys/dev/cas/if_cas.c @@ -2617,7 +2617,7 @@ static const struct cas_pci_dev { DRIVER_MODULE(cas, pci, cas_pci_driver, cas_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device", pci, cas, cas_pci_devlist, - sizeof(cas_pci_devlist[0]), nitems(cas_pci_devlist) - 1); + nitems(cas_pci_devlist) - 1); DRIVER_MODULE(miibus, cas, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(cas, pci, 1, 1, 1); diff --git a/sys/dev/ciss/ciss.c b/sys/dev/ciss/ciss.c index 94bd9c69e1c5..e4b887a38e60 100644 --- a/sys/dev/ciss/ciss.c +++ b/sys/dev/ciss/ciss.c @@ -365,7 +365,7 @@ static struct static devclass_t ciss_devclass; DRIVER_MODULE(ciss, pci, ciss_pci_driver, ciss_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;", pci, ciss, ciss_vendor_data, - sizeof(ciss_vendor_data[0]), nitems(ciss_vendor_data) - 1); + nitems(ciss_vendor_data) - 1); MODULE_DEPEND(ciss, cam, 1, 1, 1); MODULE_DEPEND(ciss, pci, 1, 1, 1); diff --git a/sys/dev/dc/if_dc.c b/sys/dev/dc/if_dc.c index e1593b02db25..a4394de9c552 100644 --- a/sys/dev/dc/if_dc.c +++ b/sys/dev/dc/if_dc.c @@ -360,7 +360,7 @@ static devclass_t dc_devclass; DRIVER_MODULE_ORDERED(dc, pci, dc_driver, dc_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("W32:vendor/device;U8:revision;D:#", pci, dc, dc_devs, - sizeof(dc_devs[0]), nitems(dc_devs) - 1); + nitems(dc_devs) - 1); DRIVER_MODULE(miibus, dc, miibus_driver, miibus_devclass, NULL, NULL); #define DC_SETBIT(sc, reg, x) \ diff --git a/sys/dev/drm2/i915/i915_drv.c b/sys/dev/drm2/i915/i915_drv.c index f0c8867501b3..8a7168861aca 100644 --- a/sys/dev/drm2/i915/i915_drv.c +++ b/sys/dev/drm2/i915/i915_drv.c @@ -1237,7 +1237,7 @@ MODULE_DEPEND(i915kms, iicbus, 1, 1, 1); MODULE_DEPEND(i915kms, iic, 1, 1, 1); MODULE_DEPEND(i915kms, iicbb, 1, 1, 1); MODULE_PNP_INFO("U32:vendor;U32:device;P:#;D:#", vgapci, i915, pciidlist, - sizeof(pciidlist[0]), nitems(pciidlist) - 1); + nitems(pciidlist) - 1); /* We give fast paths for the really cool registers */ #define NEEDS_FORCE_WAKE(dev_priv, reg) \ diff --git a/sys/dev/drm2/radeon/radeon_drv.c b/sys/dev/drm2/radeon/radeon_drv.c index bf3dd063c178..73f83ccef0cb 100644 --- a/sys/dev/drm2/radeon/radeon_drv.c +++ b/sys/dev/drm2/radeon/radeon_drv.c @@ -402,4 +402,4 @@ MODULE_DEPEND(radeonkms, iic, 1, 1, 1); MODULE_DEPEND(radeonkms, iicbb, 1, 1, 1); MODULE_DEPEND(radeonkms, firmware, 1, 1, 1); MODULE_PNP_INFO("U32:vendor;U32:device;P:#;D:#", vgapci, radeonkms, - pciidlist, sizeof(pciidlist[0]), nitems(pciidlist) - 1); + pciidlist, nitems(pciidlist) - 1); diff --git a/sys/dev/ed/if_ed_pci.c b/sys/dev/ed/if_ed_pci.c index b487b97de820..8ada958abcbe 100644 --- a/sys/dev/ed/if_ed_pci.c +++ b/sys/dev/ed/if_ed_pci.c @@ -145,5 +145,5 @@ static driver_t ed_pci_driver = { DRIVER_MODULE(ed, pci, ed_pci_driver, ed_devclass, 0, 0); MODULE_DEPEND(ed, pci, 1, 1, 1); MODULE_DEPEND(ed, ether, 1, 1, 1); -MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ed, pci_ids, sizeof(pci_ids[0]), +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ed, pci_ids, nitems(pci_ids) - 1); diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c index cadc8bdbf677..a27de7db593f 100644 --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -3948,7 +3948,7 @@ static driver_t ena_driver = { devclass_t ena_devclass; DRIVER_MODULE(ena, pci, ena_driver, ena_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ena, ena_vendor_info_array, - sizeof(ena_vendor_info_array[0]), nitems(ena_vendor_info_array) - 1); + nitems(ena_vendor_info_array) - 1); MODULE_DEPEND(ena, pci, 1, 1, 1); MODULE_DEPEND(ena, ether, 1, 1, 1); diff --git a/sys/dev/et/if_et.c b/sys/dev/et/if_et.c index a5ce9e452872..294d9d5e55aa 100644 --- a/sys/dev/et/if_et.c +++ b/sys/dev/et/if_et.c @@ -189,7 +189,7 @@ static devclass_t et_devclass; DRIVER_MODULE(et, pci, et_driver, et_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, et, et_devices, - sizeof(et_devices[0]), nitems(et_devices) - 1); + nitems(et_devices) - 1); DRIVER_MODULE(miibus, et, miibus_driver, miibus_devclass, 0, 0); static int et_rx_intr_npkts = 32; diff --git a/sys/dev/fxp/if_fxp.c b/sys/dev/fxp/if_fxp.c index b89f8329a4f2..2c3297a05bea 100644 --- a/sys/dev/fxp/if_fxp.c +++ b/sys/dev/fxp/if_fxp.c @@ -308,7 +308,7 @@ static devclass_t fxp_devclass; DRIVER_MODULE_ORDERED(fxp, pci, fxp_driver, fxp_devclass, NULL, NULL, SI_ORDER_ANY); MODULE_PNP_INFO("U16:vendor;U16:device", pci, fxp, fxp_ident_table, - sizeof(fxp_ident_table[0]), nitems(fxp_ident_table) - 1); + nitems(fxp_ident_table) - 1); DRIVER_MODULE(miibus, fxp, miibus_driver, miibus_devclass, NULL, NULL); static struct resource_spec fxp_res_spec_mem[] = { diff --git a/sys/dev/gem/if_gem_pci.c b/sys/dev/gem/if_gem_pci.c index af9db3da9c3e..ce3027fb3441 100644 --- a/sys/dev/gem/if_gem_pci.c +++ b/sys/dev/gem/if_gem_pci.c @@ -116,7 +116,7 @@ static driver_t gem_pci_driver = { DRIVER_MODULE(gem, pci, gem_pci_driver, gem_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device", pci, gem, gem_pci_devlist, - sizeof(gem_pci_devlist[0]), nitems(gem_pci_devlist) - 1); + nitems(gem_pci_devlist) - 1); MODULE_DEPEND(gem, pci, 1, 1, 1); MODULE_DEPEND(gem, ether, 1, 1, 1); diff --git a/sys/dev/intpm/intpm.c b/sys/dev/intpm/intpm.c index 4b4cf9ab10c8..15da5e861a07 100644 --- a/sys/dev/intpm/intpm.c +++ b/sys/dev/intpm/intpm.c @@ -896,4 +896,4 @@ DRIVER_MODULE(smbus, intsmb, smbus_driver, smbus_devclass, 0, 0); MODULE_DEPEND(intsmb, smbus, SMBUS_MINVER, SMBUS_PREFVER, SMBUS_MAXVER); MODULE_VERSION(intsmb, 1); MODULE_PNP_INFO("W32:vendor/device;D:#", pci, intpm, intsmb_products, - sizeof(intsmb_products[0]), nitems(intsmb_products)); + nitems(intsmb_products)); diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c index 744c26dfafdc..b03de58fcb55 100644 --- a/sys/dev/ioat/ioat.c +++ b/sys/dev/ioat/ioat.c @@ -241,7 +241,7 @@ static struct _pcsid }; MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ioat, pci_ids, - sizeof(pci_ids[0]), nitems(pci_ids)); + nitems(pci_ids)); /* * OS <-> Driver linkage functions diff --git a/sys/dev/ipw/if_ipw.c b/sys/dev/ipw/if_ipw.c index 1da31934f738..e99d5aedc714 100644 --- a/sys/dev/ipw/if_ipw.c +++ b/sys/dev/ipw/if_ipw.c @@ -203,7 +203,7 @@ static devclass_t ipw_devclass; DRIVER_MODULE(ipw, pci, ipw_driver, ipw_devclass, NULL, NULL); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ipw, ipw_ident_table, - sizeof(ipw_ident_table[0]), nitems(ipw_ident_table) - 1); + nitems(ipw_ident_table) - 1); MODULE_VERSION(ipw, 1); diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index cca610664065..44843ff3fc98 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -238,7 +238,7 @@ static driver_t ix_driver = { devclass_t ix_devclass; DRIVER_MODULE(ix, pci, ix_driver, ix_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ix, ixgbe_vendor_info_array, - sizeof(ixgbe_vendor_info_array[0]), nitems(ixgbe_vendor_info_array) - 1); + nitems(ixgbe_vendor_info_array) - 1); MODULE_DEPEND(ix, pci, 1, 1, 1); MODULE_DEPEND(ix, ether, 1, 1, 1); diff --git a/sys/dev/ixgbe/if_ixv.c b/sys/dev/ixgbe/if_ixv.c index a6a3465b60d7..91f2f8ef755e 100644 --- a/sys/dev/ixgbe/if_ixv.c +++ b/sys/dev/ixgbe/if_ixv.c @@ -144,7 +144,7 @@ static driver_t ixv_driver = { devclass_t ixv_devclass; DRIVER_MODULE(ixv, pci, ixv_driver, ixv_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device", pci, ixv, ixv_vendor_info_array, - sizeof(ixv_vendor_info_array[0]), nitems(ixv_vendor_info_array) - 1); + nitems(ixv_vendor_info_array) - 1); MODULE_DEPEND(ixv, pci, 1, 1, 1); MODULE_DEPEND(ixv, ether, 1, 1, 1); #ifdef DEV_NETMAP diff --git a/sys/dev/ncr/ncr.c b/sys/dev/ncr/ncr.c index 5cfb413731c2..58bc2a6af01c 100644 --- a/sys/dev/ncr/ncr.c +++ b/sys/dev/ncr/ncr.c @@ -7109,7 +7109,7 @@ static devclass_t ncr_devclass; DRIVER_MODULE(ncr, pci, ncr_driver, ncr_devclass, 0, 0); MODULE_PNP_INFO("W32:vendor/device;U16:#;D:#", pci, ncr, ncr_chip_table, - sizeof(ncr_chip_table[0]), nitems(ncr_chip_table)); + nitems(ncr_chip_table)); MODULE_DEPEND(ncr, cam, 1, 1, 1); MODULE_DEPEND(ncr, pci, 1, 1, 1); diff --git a/sys/dev/ntb/ntb_hw/ntb_hw_intel.c b/sys/dev/ntb/ntb_hw/ntb_hw_intel.c index 3bb57fcb71d0..d61f664d8cff 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw_intel.c +++ b/sys/dev/ntb/ntb_hw/ntb_hw_intel.c @@ -3120,4 +3120,4 @@ DRIVER_MODULE(ntb_hw_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL); MODULE_DEPEND(ntb_hw_intel, ntb, 1, 1, 1); MODULE_VERSION(ntb_hw_intel, 1); MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ntb_hw_intel, pci_ids, - sizeof(pci_ids[0]), nitems(pci_ids)); + nitems(pci_ids)); diff --git a/sys/dev/ofw/ofw_bus_subr.h b/sys/dev/ofw/ofw_bus_subr.h index 537856b97ef7..edc90ed7bf3c 100644 --- a/sys/dev/ofw/ofw_bus_subr.h +++ b/sys/dev/ofw/ofw_bus_subr.h @@ -67,7 +67,7 @@ struct intr_map_data_fdt { #define SIMPLEBUS_PNP_DESCR "Z:compat;P:#;" #define SIMPLEBUS_PNP_INFO(t) \ - MODULE_PNP_INFO(SIMPLEBUS_PNP_DESCR, simplebus, t, t, sizeof(t[0]), sizeof(t) / sizeof(t[0])); + MODULE_PNP_INFO(SIMPLEBUS_PNP_DESCR, simplebus, t, t, sizeof(t) / sizeof(t[0])); /* Generic implementation of ofw_bus_if.m methods and helper routines */ int ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *, phandle_t); diff --git a/sys/dev/pccard/pccardvar.h b/sys/dev/pccard/pccardvar.h index 5138c243f5b7..87f41850ec5c 100644 --- a/sys/dev/pccard/pccardvar.h +++ b/sys/dev/pccard/pccardvar.h @@ -102,7 +102,7 @@ struct pccard_product { */ #define PCCARD_PNP_DESCR "D:#;V32:manufacturer;V32:product;Z:cisvendor;Z:cisproduct;" #define PCCARD_PNP_INFO(t) \ - MODULE_PNP_INFO(PCCARD_PNP_DESCR, pccard, t, t, sizeof(t[0]), nitems(t) - 1); \ + MODULE_PNP_INFO(PCCARD_PNP_DESCR, pccard, t, t, nitems(t) - 1) typedef int (*pccard_product_match_fn) (device_t dev, const struct pccard_product *ent, int vpfmatch); diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h index c5a8afb4ed23..376fb96594ce 100644 --- a/sys/dev/pci/pcivar.h +++ b/sys/dev/pci/pcivar.h @@ -311,7 +311,7 @@ struct pci_device_table { "M16:mask;U16:vendor;U16:device;U16:subvendor;U16:subdevice;" \ "U16:class;U16:subclass;U16:revid;" #define PCI_PNP_INFO(table) \ - MODULE_PNP_INFO(PCI_PNP_STR, pci, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(PCI_PNP_STR, pci, table, table, \ sizeof(table) / sizeof(table[0])) const struct pci_device_table *pci_match_device(device_t child, diff --git a/sys/dev/puc/puc_pci.c b/sys/dev/puc/puc_pci.c index f0c6aa81dde4..012a16dc9e9b 100644 --- a/sys/dev/puc/puc_pci.c +++ b/sys/dev/puc/puc_pci.c @@ -200,4 +200,4 @@ static driver_t puc_pci_driver = { DRIVER_MODULE(puc, pci, puc_pci_driver, puc_devclass, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;U16:#;U16:#;D:#", pci, puc, - puc_pci_devices, sizeof(puc_pci_devices[0]), nitems(puc_pci_devices) - 1); + puc_pci_devices, nitems(puc_pci_devices) - 1); diff --git a/sys/dev/spibus/spi.h b/sys/dev/spibus/spi.h index 1a9c1496a77d..0c12929bee15 100644 --- a/sys/dev/spibus/spi.h +++ b/sys/dev/spibus/spi.h @@ -43,4 +43,4 @@ struct spi_command { #define SPIBUS_PNP_DESCR "Z:compat;P:#;" #define SPIBUS_PNP_INFO(t) \ - MODULE_PNP_INFO(SPIBUS_PNP_DESCR, spibus, t, t, sizeof(t[0]), sizeof(t) / sizeof(t[0])); + MODULE_PNP_INFO(SPIBUS_PNP_DESCR, spibus, t, t, sizeof(t) / sizeof(t[0])); diff --git a/sys/dev/uart/uart_bus_pccard.c b/sys/dev/uart/uart_bus_pccard.c index 3a8446a871f6..8b672ebeadb0 100644 --- a/sys/dev/uart/uart_bus_pccard.c +++ b/sys/dev/uart/uart_bus_pccard.c @@ -103,4 +103,4 @@ uart_pccard_attach(device_t dev) DRIVER_MODULE(uart, pccard, uart_pccard_driver, uart_devclass, 0, 0); MODULE_PNP_INFO("U32:function_type;", pccard, uart, &uart_pccard_function, - sizeof(uart_pccard_function), 1); + 1); diff --git a/sys/dev/usb/usbdi.h b/sys/dev/usb/usbdi.h index 147b5d5e71b8..d5648c0301ea 100644 --- a/sys/dev/usb/usbdi.h +++ b/sys/dev/usb/usbdi.h @@ -342,13 +342,13 @@ struct usb_device_id { #define USB_STD_PNP_HOST_INFO USB_STD_PNP_INFO "T:mode=host;" #define USB_STD_PNP_DEVICE_INFO USB_STD_PNP_INFO "T:mode=device;" #define USB_PNP_HOST_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) #define USB_PNP_DEVICE_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) #define USB_PNP_DUAL_INFO(table) \ - MODULE_PNP_INFO(USB_STD_PNP_INFO, uhub, table, table, sizeof(table[0]), \ + MODULE_PNP_INFO(USB_STD_PNP_INFO, uhub, table, table, \ sizeof(table) / sizeof(table[0])) /* check that the size of the structure above is correct */ diff --git a/sys/dev/xl/if_xl.c b/sys/dev/xl/if_xl.c index 0afe62a522de..29d323399894 100644 --- a/sys/dev/xl/if_xl.c +++ b/sys/dev/xl/if_xl.c @@ -334,7 +334,7 @@ static devclass_t xl_devclass; DRIVER_MODULE_ORDERED(xl, pci, xl_driver, xl_devclass, NULL, NULL, SI_ORDER_ANY); DRIVER_MODULE(miibus, xl, miibus_driver, miibus_devclass, NULL, NULL); -MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, xl, xl_devs, sizeof(xl_devs[0]), +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, xl, xl_devs, nitems(xl_devs) - 1); static void diff --git a/sys/isa/isavar.h b/sys/isa/isavar.h index 1a3e661b67a4..d95a9c1ab3f2 100644 --- a/sys/isa/isavar.h +++ b/sys/isa/isavar.h @@ -142,7 +142,7 @@ enum isa_device_ivars { #define ISA_PNP_DESCR "E:pnpid;D:#" #define ISA_PNP_INFO(t) \ - MODULE_PNP_INFO(ISA_PNP_DESCR, isa, t, t, sizeof(t[0]), nitems(t) - 1); \ + MODULE_PNP_INFO(ISA_PNP_DESCR, isa, t, t, nitems(t) - 1); \ /* * Simplified accessors for isa devices diff --git a/sys/net/iflib.h b/sys/net/iflib.h index 6e1eee633a2c..bda5ad45dfc8 100644 --- a/sys/net/iflib.h +++ b/sys/net/iflib.h @@ -173,7 +173,7 @@ typedef struct pci_vendor_info { #define IFLIB_PNP_DESCR "U32:vendor;U32:device;U32:subvendor;U32:subdevice;" \ "U32:revision;U32:class;D:#" #define IFLIB_PNP_INFO(b, u, t) \ - MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, sizeof(t[0]), nitems(t) - 1) + MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, nitems(t) - 1) typedef struct if_txrx { int (*ift_txd_encap) (void *, if_pkt_info_t); diff --git a/sys/sys/module.h b/sys/sys/module.h index b40870d32941..89377df401a8 100644 --- a/sys/sys/module.h +++ b/sys/sys/module.h @@ -178,12 +178,12 @@ struct mod_pnp_match_info * to allow external tools to parse their internal device tables * to make an informed guess about what driver(s) to load. */ -#define MODULE_PNP_INFO(d, b, unique, t, l, n) \ +#define MODULE_PNP_INFO(d, b, unique, t, n) \ static const struct mod_pnp_match_info _module_pnp_##b##_##unique = { \ .descr = d, \ .bus = #b, \ .table = t, \ - .entry_len = l, \ + .entry_len = sizeof((t)[0]), \ .num_entry = n \ }; \ MODULE_METADATA(_md_##b##_pnpinfo_##unique, MDT_PNP_INFO, \ -- cgit v1.3 From 0dc34160f3241087ca7d0485f0ba3813406d04ca Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 26 Sep 2018 17:12:30 +0000 Subject: Add PNP info to PCI attachments of cbb, cxgb, ida, iwn, ixl, ixlv, mfi, mps, mpr, mvs, my, oce, pcn, ral, rl. This only labels existing pci device tables, and has no probe / attach code changes. Reviewed by: imp, chuck Submitted by: Lakhan Shiva Kamireddy Sponsored by: Google, Inc. (GSoC 2018) Approved by: re (glen) --- sys/dev/cxgb/cxgb_main.c | 49 +++++++++++++++++++++++++---------------------- sys/dev/ida/ida_pci.c | 2 ++ sys/dev/iwn/if_iwn.c | 3 ++- sys/dev/ixl/if_ixl.c | 1 + sys/dev/ixl/if_ixlv.c | 4 +++- sys/dev/mfi/mfi_pci.c | 7 +++++-- sys/dev/mpr/mpr_pci.c | 11 ++++++++--- sys/dev/mps/mps_pci.c | 8 ++++---- sys/dev/mvs/mvs_pci.c | 2 ++ sys/dev/my/if_my.c | 2 ++ sys/dev/oce/oce_if.c | 15 +++++++++------ sys/dev/pccbb/pccbb_pci.c | 2 ++ sys/dev/pcn/if_pcn.c | 2 ++ sys/dev/ral/if_ral_pci.c | 2 ++ sys/dev/rl/if_rl.c | 2 ++ 15 files changed, 72 insertions(+), 40 deletions(-) (limited to 'sys') diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index 127a61d61c21..a47aa909c237 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -133,6 +133,30 @@ static void cxgb_update_mac_settings(struct port_info *p); static int toe_capability(struct port_info *, int); #endif +/* Table for probing the cards. The desc field isn't actually used */ +struct cxgb_ident { + uint16_t vendor; + uint16_t device; + int index; + char *desc; +} cxgb_identifiers[] = { + {PCI_VENDOR_ID_CHELSIO, 0x0020, 0, "PE9000"}, + {PCI_VENDOR_ID_CHELSIO, 0x0021, 1, "T302E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0022, 2, "T310E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0023, 3, "T320X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0024, 1, "T302X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0025, 3, "T320E"}, + {PCI_VENDOR_ID_CHELSIO, 0x0026, 2, "T310X"}, + {PCI_VENDOR_ID_CHELSIO, 0x0030, 2, "T3B10"}, + {PCI_VENDOR_ID_CHELSIO, 0x0031, 3, "T3B20"}, + {PCI_VENDOR_ID_CHELSIO, 0x0032, 1, "T3B02"}, + {PCI_VENDOR_ID_CHELSIO, 0x0033, 4, "T3B04"}, + {PCI_VENDOR_ID_CHELSIO, 0x0035, 6, "T3C10"}, + {PCI_VENDOR_ID_CHELSIO, 0x0036, 3, "S320E-CR"}, + {PCI_VENDOR_ID_CHELSIO, 0x0037, 7, "N320E-G2"}, + {0, 0, 0, NULL} +}; + static device_method_t cxgb_controller_methods[] = { DEVMETHOD(device_probe, cxgb_controller_probe), DEVMETHOD(device_attach, cxgb_controller_attach), @@ -151,6 +175,8 @@ static int cxgbc_mod_event(module_t, int, void *); static devclass_t cxgb_controller_devclass; DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass, cxgbc_mod_event, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, cxgbc, cxgb_identifiers, + nitems(cxgb_identifiers) - 1); MODULE_VERSION(cxgbc, 1); MODULE_DEPEND(cxgbc, firmware, 1, 1, 1); @@ -280,29 +306,6 @@ enum { FILTER_NO_VLAN_PRI = 7 }; #define PORT_MASK ((1 << MAX_NPORTS) - 1) -/* Table for probing the cards. The desc field isn't actually used */ -struct cxgb_ident { - uint16_t vendor; - uint16_t device; - int index; - char *desc; -} cxgb_identifiers[] = { - {PCI_VENDOR_ID_CHELSIO, 0x0020, 0, "PE9000"}, - {PCI_VENDOR_ID_CHELSIO, 0x0021, 1, "T302E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0022, 2, "T310E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0023, 3, "T320X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0024, 1, "T302X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0025, 3, "T320E"}, - {PCI_VENDOR_ID_CHELSIO, 0x0026, 2, "T310X"}, - {PCI_VENDOR_ID_CHELSIO, 0x0030, 2, "T3B10"}, - {PCI_VENDOR_ID_CHELSIO, 0x0031, 3, "T3B20"}, - {PCI_VENDOR_ID_CHELSIO, 0x0032, 1, "T3B02"}, - {PCI_VENDOR_ID_CHELSIO, 0x0033, 4, "T3B04"}, - {PCI_VENDOR_ID_CHELSIO, 0x0035, 6, "T3C10"}, - {PCI_VENDOR_ID_CHELSIO, 0x0036, 3, "S320E-CR"}, - {PCI_VENDOR_ID_CHELSIO, 0x0037, 7, "N320E-G2"}, - {0, 0, 0, NULL} -}; static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset); diff --git a/sys/dev/ida/ida_pci.c b/sys/dev/ida/ida_pci.c index 63a18ecf0259..3911a70c0b46 100644 --- a/sys/dev/ida/ida_pci.c +++ b/sys/dev/ida/ida_pci.c @@ -306,3 +306,5 @@ ida_pci_attach(device_t dev) } DRIVER_MODULE(ida, pci, ida_pci_driver, ida_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ida, board_id, + nitems(board_id) - 1); diff --git a/sys/dev/iwn/if_iwn.c b/sys/dev/iwn/if_iwn.c index e46082c0b3b8..3e79f4b400c0 100644 --- a/sys/dev/iwn/if_iwn.c +++ b/sys/dev/iwn/if_iwn.c @@ -372,7 +372,8 @@ static driver_t iwn_driver = { static devclass_t iwn_devclass; DRIVER_MODULE(iwn, pci, iwn_driver, iwn_devclass, NULL, NULL); - +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, iwn, iwn_ident_table, + nitems(iwn_ident_table) - 1); MODULE_VERSION(iwn, 1); MODULE_DEPEND(iwn, firmware, 1, 1, 1); diff --git a/sys/dev/ixl/if_ixl.c b/sys/dev/ixl/if_ixl.c index 6d83545ae941..23c9d13c695d 100644 --- a/sys/dev/ixl/if_ixl.c +++ b/sys/dev/ixl/if_ixl.c @@ -150,6 +150,7 @@ static driver_t ixl_driver = { devclass_t ixl_devclass; DRIVER_MODULE(ixl, pci, ixl_driver, ixl_devclass, 0, 0); +IFLIB_PNP_INFO(pci, ixl, ixl_vendor_info_array); MODULE_VERSION(ixl, 3); MODULE_DEPEND(ixl, pci, 1, 1, 1); diff --git a/sys/dev/ixl/if_ixlv.c b/sys/dev/ixl/if_ixlv.c index f0a91b761f87..dd72d52ab187 100644 --- a/sys/dev/ixl/if_ixlv.c +++ b/sys/dev/ixl/if_ixlv.c @@ -149,7 +149,9 @@ static driver_t ixlv_driver = { devclass_t ixlv_devclass; DRIVER_MODULE(ixlv, pci, ixlv_driver, ixlv_devclass, 0, 0); - +MODULE_PNP_INFO("U32:vendor;U32:device;U32:subvendor;U32:subdevice;U32:revision", + pci, ixlv, ixlv_vendor_info_array, + nitems(ixlv_vendor_info_array) - 1); MODULE_DEPEND(ixlv, pci, 1, 1, 1); MODULE_DEPEND(ixlv, ether, 1, 1, 1); MODULE_DEPEND(ixlv, iflib, 1, 1, 1); diff --git a/sys/dev/mfi/mfi_pci.c b/sys/dev/mfi/mfi_pci.c index d63c5ae6435f..ee609b328990 100644 --- a/sys/dev/mfi/mfi_pci.c +++ b/sys/dev/mfi/mfi_pci.c @@ -106,8 +106,6 @@ static driver_t mfi_pci_driver = { }; static devclass_t mfi_devclass; -DRIVER_MODULE(mfi, pci, mfi_pci_driver, mfi_devclass, 0, 0); -MODULE_VERSION(mfi, 1); static int mfi_msi = 1; SYSCTL_INT(_hw_mfi, OID_AUTO, msi, CTLFLAG_RDTUN, &mfi_msi, 0, @@ -159,6 +157,11 @@ struct mfi_ident { {0, 0, 0, 0, 0, NULL} }; +DRIVER_MODULE(mfi, pci, mfi_pci_driver, mfi_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice", pci, mfi, + mfi_identifiers, nitems(mfi_identifiers) - 1); +MODULE_VERSION(mfi, 1); + static struct mfi_ident * mfi_find_ident(device_t dev) { diff --git a/sys/dev/mpr/mpr_pci.c b/sys/dev/mpr/mpr_pci.c index 03f3143040c2..e86bd9e102ba 100644 --- a/sys/dev/mpr/mpr_pci.c +++ b/sys/dev/mpr/mpr_pci.c @@ -88,9 +88,6 @@ static driver_t mpr_pci_driver = { sizeof(struct mpr_softc) }; -static devclass_t mpr_devclass; -DRIVER_MODULE(mpr, pci, mpr_pci_driver, mpr_devclass, 0, 0); -MODULE_DEPEND(mpr, cam, 1, 1, 1); struct mpr_ident { uint16_t vendor; @@ -154,6 +151,14 @@ struct mpr_ident { { 0, 0, 0, 0, 0, NULL } }; + +static devclass_t mpr_devclass; +DRIVER_MODULE(mpr, pci, mpr_pci_driver, mpr_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice;D:#", pci, + mpr, mpr_identifiers, nitems(mpr_identifiers) - 1); + +MODULE_DEPEND(mpr, cam, 1, 1, 1); + static struct mpr_ident * mpr_find_ident(device_t dev) { diff --git a/sys/dev/mps/mps_pci.c b/sys/dev/mps/mps_pci.c index a81fe8a7300f..1e12c9a6906a 100644 --- a/sys/dev/mps/mps_pci.c +++ b/sys/dev/mps/mps_pci.c @@ -88,10 +88,6 @@ static driver_t mps_pci_driver = { sizeof(struct mps_softc) }; -static devclass_t mps_devclass; -DRIVER_MODULE(mps, pci, mps_pci_driver, mps_devclass, 0, 0); -MODULE_DEPEND(mps, cam, 1, 1, 1); - struct mps_ident { uint16_t vendor; uint16_t device; @@ -147,6 +143,10 @@ struct mps_ident { { 0, 0, 0, 0, 0, NULL } }; +static devclass_t mps_devclass; +DRIVER_MODULE(mps, pci, mps_pci_driver, mps_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;U16:subvendor;U16:subdevice", pci, mps, + mps_identifiers, nitems(mps_identifiers) - 1); static struct mps_ident * mps_find_ident(device_t dev) { diff --git a/sys/dev/mvs/mvs_pci.c b/sys/dev/mvs/mvs_pci.c index cd6afce40f5c..4774390b272c 100644 --- a/sys/dev/mvs/mvs_pci.c +++ b/sys/dev/mvs/mvs_pci.c @@ -521,6 +521,8 @@ static driver_t mvs_driver = { sizeof(struct mvs_controller) }; DRIVER_MODULE(mvs, pci, mvs_driver, mvs_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device", pci, mvs, mvs_ids, + nitems(mvs_ids) - 1); MODULE_VERSION(mvs, 1); MODULE_DEPEND(mvs, cam, 1, 1, 1); diff --git a/sys/dev/my/if_my.c b/sys/dev/my/if_my.c index b1d5e3586c9d..ef5ef7613b99 100644 --- a/sys/dev/my/if_my.c +++ b/sys/dev/my/if_my.c @@ -163,6 +163,8 @@ static driver_t my_driver = { static devclass_t my_devclass; DRIVER_MODULE(my, pci, my_driver, my_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, my, my_devs, + nitems(my_devs) - 1); MODULE_DEPEND(my, pci, 1, 1, 1); MODULE_DEPEND(my, ether, 1, 1, 1); diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c index 23e32a59de49..0b7aa4da2372 100644 --- a/sys/dev/oce/oce_if.c +++ b/sys/dev/oce/oce_if.c @@ -214,12 +214,6 @@ static driver_t oce_driver = { static devclass_t oce_devclass; -DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); -MODULE_DEPEND(oce, pci, 1, 1, 1); -MODULE_DEPEND(oce, ether, 1, 1, 1); -MODULE_VERSION(oce, 1); - - /* global vars */ const char component_revision[32] = {"///" COMPONENT_REVISION "///"}; @@ -242,6 +236,15 @@ static uint32_t supportedDevices[] = { (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_SH }; + +DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device", pci, oce, supportedDevices, + nitems(supportedDevices)); +MODULE_DEPEND(oce, pci, 1, 1, 1); +MODULE_DEPEND(oce, ether, 1, 1, 1); +MODULE_VERSION(oce, 1); + + POCE_SOFTC softc_head = NULL; POCE_SOFTC softc_tail = NULL; diff --git a/sys/dev/pccbb/pccbb_pci.c b/sys/dev/pccbb/pccbb_pci.c index bbffb8b53200..f2f6387c8aaa 100644 --- a/sys/dev/pccbb/pccbb_pci.c +++ b/sys/dev/pccbb/pccbb_pci.c @@ -983,4 +983,6 @@ static driver_t cbb_driver = { }; DRIVER_MODULE(cbb, pci, cbb_driver, cbb_devclass, 0, 0); +MODULE_PNP_INFO("W32:vendor/device;D:#", pci, cbb, yc_chipsets, + nitems(yc_chipsets) - 1); MODULE_DEPEND(cbb, exca, 1, 1, 1); diff --git a/sys/dev/pcn/if_pcn.c b/sys/dev/pcn/if_pcn.c index 6073310b2ac9..81fb763c18b6 100644 --- a/sys/dev/pcn/if_pcn.c +++ b/sys/dev/pcn/if_pcn.c @@ -193,6 +193,8 @@ static driver_t pcn_driver = { static devclass_t pcn_devclass; DRIVER_MODULE(pcn, pci, pcn_driver, pcn_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor; U16:device", pci, pcn, pcn_devs, + nitems(pcn_devs) - 1); DRIVER_MODULE(miibus, pcn, miibus_driver, miibus_devclass, 0, 0); #define PCN_CSR_SETBIT(sc, reg, x) \ diff --git a/sys/dev/ral/if_ral_pci.c b/sys/dev/ral/if_ral_pci.c index 6d9de66e53c8..bcf9df033289 100644 --- a/sys/dev/ral/if_ral_pci.c +++ b/sys/dev/ral/if_ral_pci.c @@ -178,6 +178,8 @@ static driver_t ral_pci_driver = { static devclass_t ral_devclass; DRIVER_MODULE(ral, pci, ral_pci_driver, ral_devclass, NULL, NULL); +MODULE_PNP_INFO("U16:vendor; U16:device; D:#", pci, ral, ral_pci_ids, + nitems(ral_pci_ids) - 1); static int ral_pci_probe(device_t dev) diff --git a/sys/dev/rl/if_rl.c b/sys/dev/rl/if_rl.c index 7e8c63131607..622ad5b16813 100644 --- a/sys/dev/rl/if_rl.c +++ b/sys/dev/rl/if_rl.c @@ -259,6 +259,8 @@ static driver_t rl_driver = { static devclass_t rl_devclass; DRIVER_MODULE(rl, pci, rl_driver, rl_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor; U16:device", pci, rl, rl_devs, + nitems(rl_devs) - 1); DRIVER_MODULE(rl, cardbus, rl_driver, rl_devclass, 0, 0); DRIVER_MODULE(miibus, rl, miibus_driver, miibus_devclass, 0, 0); -- cgit v1.3 From b28589287b04bc1003c5ec70336a5d5f90736001 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 26 Sep 2018 19:41:00 +0000 Subject: Remove bogus spaces. Spaces aren't allowed in these strings. Approved by: re@ (glen) --- sys/dev/pcn/if_pcn.c | 2 +- sys/dev/ral/if_ral_pci.c | 2 +- sys/dev/rl/if_rl.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/dev/pcn/if_pcn.c b/sys/dev/pcn/if_pcn.c index 81fb763c18b6..0e91df03ea18 100644 --- a/sys/dev/pcn/if_pcn.c +++ b/sys/dev/pcn/if_pcn.c @@ -193,7 +193,7 @@ static driver_t pcn_driver = { static devclass_t pcn_devclass; DRIVER_MODULE(pcn, pci, pcn_driver, pcn_devclass, 0, 0); -MODULE_PNP_INFO("U16:vendor; U16:device", pci, pcn, pcn_devs, +MODULE_PNP_INFO("U16:vendor;U16:device", pci, pcn, pcn_devs, nitems(pcn_devs) - 1); DRIVER_MODULE(miibus, pcn, miibus_driver, miibus_devclass, 0, 0); diff --git a/sys/dev/ral/if_ral_pci.c b/sys/dev/ral/if_ral_pci.c index bcf9df033289..41e81d573536 100644 --- a/sys/dev/ral/if_ral_pci.c +++ b/sys/dev/ral/if_ral_pci.c @@ -178,7 +178,7 @@ static driver_t ral_pci_driver = { static devclass_t ral_devclass; DRIVER_MODULE(ral, pci, ral_pci_driver, ral_devclass, NULL, NULL); -MODULE_PNP_INFO("U16:vendor; U16:device; D:#", pci, ral, ral_pci_ids, +MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, ral, ral_pci_ids, nitems(ral_pci_ids) - 1); static int diff --git a/sys/dev/rl/if_rl.c b/sys/dev/rl/if_rl.c index 622ad5b16813..e5d4787ebbe9 100644 --- a/sys/dev/rl/if_rl.c +++ b/sys/dev/rl/if_rl.c @@ -259,7 +259,7 @@ static driver_t rl_driver = { static devclass_t rl_devclass; DRIVER_MODULE(rl, pci, rl_driver, rl_devclass, 0, 0); -MODULE_PNP_INFO("U16:vendor; U16:device", pci, rl, rl_devs, +MODULE_PNP_INFO("U16:vendor;U16:device", pci, rl, rl_devs, nitems(rl_devs) - 1); DRIVER_MODULE(rl, cardbus, rl_driver, rl_devclass, 0, 0); DRIVER_MODULE(miibus, rl, miibus_driver, miibus_devclass, 0, 0); -- cgit v1.3 From a7fcb1afcb6b798fc6c504449ac8ad4b16951dc8 Mon Sep 17 00:00:00 2001 From: Sean Eric Fagan Date: Wed, 26 Sep 2018 20:23:12 +0000 Subject: Add per-session locking to cryptosoft (swcr). As part of ZFS Crypto, I started getting a series of panics when I did not have AESNI loaded. Adding locking fixed it, and I concluded that the Reinit function altered the AES key schedule. This locking is not as fine-grained as it could be (AESNI uses per-cpu locking), but it's minimally invasive. Sponsored by: iXsystems Inc Reviewed by: cem, mav Approved by: re (gjb), mav (mentor) Differential Revision: https://reviews.freebsd.org/D17307 --- sys/opencrypto/cryptosoft.c | 8 +++++++- sys/opencrypto/cryptosoft.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/opencrypto/cryptosoft.c b/sys/opencrypto/cryptosoft.c index a4a719b5fbed..0a5a20640a66 100644 --- a/sys/opencrypto/cryptosoft.c +++ b/sys/opencrypto/cryptosoft.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -765,6 +766,7 @@ swcr_newsession(device_t dev, crypto_session_t cses, struct cryptoini *cri) return EINVAL; ses = crypto_get_driver_session(cses); + mtx_init(&ses->swcr_lock, "swcr session lock", NULL, MTX_DEF); for (i = 0; cri != NULL && i < nitems(ses->swcr_algorithms); i++) { swd = &ses->swcr_algorithms[i]; @@ -1022,6 +1024,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) ses = crypto_get_driver_session(cses); + mtx_destroy(&ses->swcr_lock); for (i = 0; i < nitems(ses->swcr_algorithms); i++) { swd = &ses->swcr_algorithms[i]; @@ -1109,7 +1112,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) static int swcr_process(device_t dev, struct cryptop *crp, int hint) { - struct swcr_session *ses; + struct swcr_session *ses = NULL; struct cryptodesc *crd; struct swcr_data *sw; size_t i; @@ -1124,6 +1127,7 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) } ses = crypto_get_driver_session(crp->crp_session); + mtx_lock(&ses->swcr_lock); /* Go through crypto descriptors, processing as we go */ for (crd = crp->crp_desc; crd; crd = crd->crd_next) { @@ -1213,6 +1217,8 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) } done: + if (ses) + mtx_unlock(&ses->swcr_lock); crypto_done(crp); return 0; } diff --git a/sys/opencrypto/cryptosoft.h b/sys/opencrypto/cryptosoft.h index d88b09d4e1c0..d787dc243ae6 100644 --- a/sys/opencrypto/cryptosoft.h +++ b/sys/opencrypto/cryptosoft.h @@ -58,6 +58,7 @@ struct swcr_data { }; struct swcr_session { + struct mtx swcr_lock; struct swcr_data swcr_algorithms[2]; unsigned swcr_nalgs; }; -- cgit v1.3 From ac4031d8056a1abd8f1cda7e9a6ddb50287eb83b Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Wed, 26 Sep 2018 21:16:07 +0000 Subject: cxgbe(4): Enable support for per-connection rate limiting in the default firmware configuration files. Approved by: re@ (gjb@) Sponsored by: Chelsio Communications --- sys/dev/cxgbe/firmware/t4fw_cfg.txt | 3 ++- sys/dev/cxgbe/firmware/t5fw_cfg.txt | 3 ++- sys/dev/cxgbe/firmware/t6fw_cfg.txt | 3 ++- sys/dev/cxgbe/t4_main.c | 5 +++++ 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/dev/cxgbe/firmware/t4fw_cfg.txt b/sys/dev/cxgbe/firmware/t4fw_cfg.txt index 43820a0b44cb..11721a1ed648 100644 --- a/sys/dev/cxgbe/firmware/t4fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t4fw_cfg.txt @@ -110,6 +110,7 @@ nexactf = 280 cmask = all pmask = all + nethofld = 2048 # driver will mask off features it won't use protocol = ofld, rddp, rdmac, iscsi_initiator_pdu, iscsi_target_pdu @@ -245,7 +246,7 @@ [fini] version = 0x1 - checksum = 0xbec0621 + checksum = 0x159b9295 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/firmware/t5fw_cfg.txt b/sys/dev/cxgbe/firmware/t5fw_cfg.txt index 721ff372ef80..f42966c62e49 100644 --- a/sys/dev/cxgbe/firmware/t5fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t5fw_cfg.txt @@ -155,6 +155,7 @@ nexactf = 456 cmask = all pmask = all + nethofld = 8192 # driver will mask off features it won't use protocol = ofld, rddp, rdmac, iscsi_initiator_pdu, iscsi_target_pdu, iscsi_t10dif @@ -290,7 +291,7 @@ [fini] version = 0x1 - checksum = 0x89c83d98 + checksum = 0x30b6a157 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/firmware/t6fw_cfg.txt b/sys/dev/cxgbe/firmware/t6fw_cfg.txt index 3b62ed83b344..c542cfcadf5e 100644 --- a/sys/dev/cxgbe/firmware/t6fw_cfg.txt +++ b/sys/dev/cxgbe/firmware/t6fw_cfg.txt @@ -155,6 +155,7 @@ pmask = all ncrypto_lookaside = 16 nclip = 320 + nethofld = 8192 # TCAM has 6K cells; each region must start at a multiple of 128 cell. # Each entry in these categories takes 2 cells each. nhash will use the @@ -275,7 +276,7 @@ [fini] version = 0x1 - checksum = 0x9e8952d2 + checksum = 0xf3e93001 # # $FreeBSD$ # diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index dc491fd1d7d3..898cbf4db465 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -439,8 +439,13 @@ static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; TUNABLE_INT("hw.cxgbe.switchcaps_allowed", &t4_switchcaps_allowed); +#ifdef RATELIMIT static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD; +#else +static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | + FW_CAPS_CONFIG_NIC_HASHFILTER; +#endif TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; -- cgit v1.3 From 27d2645787b9487e086a4f8a2a403cfa6c192253 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Thu, 27 Sep 2018 11:16:19 +0000 Subject: Handle a guest executing a vm instruction by trapping and raising an undefined instruction exception. Previously we would exit the guest, however an unprivileged user could execute these. Found with: syzkaller Reviewed by: araujo, tychon (previous version) Approved by: re (kib) MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D17192 --- sys/amd64/include/vmm.h | 1 + sys/amd64/vmm/intel/vmx.c | 16 ++++++++++++++++ sys/amd64/vmm/vmm.c | 1 + 3 files changed, 18 insertions(+) (limited to 'sys') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 6b8a14224874..b66eda0f4768 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -557,6 +557,7 @@ enum vm_exitcode { VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, VM_EXITCODE_MAX }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index b40846cae6e6..61871e9338eb 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -267,6 +267,9 @@ SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); @@ -2638,6 +2641,19 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6f0e0b2b9554..aecd21f13b4a 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1737,6 +1737,7 @@ restart: break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: vm_inject_ud(vm, vcpuid); break; default: -- cgit v1.3 From 8385e87c78f1115e198dd57b792f5f6e34d2048d Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Thu, 27 Sep 2018 12:15:31 +0000 Subject: newvers.sh: Unbreak building in Git repositories. Building the kernel in Git repositories when git-svn is not available and the "help.autocorrect" Git parameter is enabled results in Git trying to replace the "svn" command (it does not know) with "serve". As a result the output of the "git server" command is appended to the value of the environmental variable VERINFO, which causes the auto generated vers.c file to contain invalid C syntax (missing newline escapes): #define "@(#)FreeBSD 12.0-ALPHA7 r000eversion 2 0015agent=git/2.19.0 000cls-refs 0012fetch=shallow 0012server-option 0000=5e2272613fa(splash-vt)" #define VERSTR "FreeBSD 12.0-ALPHA7 r000eversion 2 0015agent=git/2.19.0 000cls-refs 0012fetch=shallow 0012server-option 0000=5e2272613fa(splash-vt)\n" Using `-c help.autocorrect=0` seems to be a good solution as it does not modify user's environment. I am not sure, however, if we should use programs (or Git commands), which we are not sure exist (we never check if git-svn is available on the host), as there may be more unexpected behaviors like this one. Reviewed by: eadler, emaste, krion Approved by: re (gjb), krion (mentor) Sponsored by: Bally Wulff Games & Entertainment GmbH Differential Revision: https://reviews.freebsd.org/D17271 --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index db6f66d7c52c..e0cf2cfa9c13 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -183,7 +183,7 @@ done if findvcs .git; then for dir in /usr/bin /usr/local/bin; do if [ -x "${dir}/git" ] ; then - git_cmd="${dir}/git --git-dir=${VCSDIR}" + git_cmd="${dir}/git -c help.autocorrect=0 --git-dir=${VCSDIR}" break fi done -- cgit v1.3 From c7637c4d19df9a2030fd919f1016303d15284623 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Thu, 27 Sep 2018 13:50:57 +0000 Subject: Move the undefined instruction handler to identcpu.c so we have access to the registers from boot. Approved by: re (kib) Sponsored by: ABT Systems Ltd Differential Revision: https://reviews.freebsd.org/D17301 --- sys/arm64/arm64/identcpu.c | 95 ++++++++++++++++++++++++++++++ sys/arm64/arm64/undefined.c | 130 ------------------------------------------ sys/arm64/include/undefined.h | 37 ++++++++++++ 3 files changed, 132 insertions(+), 130 deletions(-) (limited to 'sys') diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c index a79490441e91..5ba4fbc60605 100644 --- a/sys/arm64/arm64/identcpu.c +++ b/sys/arm64/arm64/identcpu.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include static int ident_lock; @@ -162,6 +163,98 @@ const struct cpu_implementers cpu_implementers[] = { CPU_IMPLEMENTER_NONE, }; +struct mrs_safe_value { + u_int CRm; + u_int Op2; + uint64_t value; +}; + +static struct mrs_safe_value safe_values[] = { + { /* id_aa64pfr0_el1 */ + .CRm = 4, + .Op2 = 0, + .value = ID_AA64PFR0_ADV_SIMD_NONE | ID_AA64PFR0_FP_NONE | + ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, + }, + { /* id_aa64dfr0_el1 */ + .CRm = 5, + .Op2 = 0, + .value = ID_AA64DFR0_DEBUG_VER_8, + }, +}; + +static int +user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, + uint32_t esr) +{ + uint64_t value; + int CRm, Op2, i, reg; + + if ((insn & MRS_MASK) != MRS_VALUE) + return (0); + + /* + * We only emulate Op0 == 3, Op1 == 0, CRn == 0, CRm == {0, 4-7}. + * These are in the EL1 CPU identification space. + * CRm == 0 holds MIDR_EL1, MPIDR_EL1, and REVID_EL1. + * CRm == {4-7} holds the ID_AA64 registers. + * + * For full details see the ARMv8 ARM (ARM DDI 0487C.a) + * Table D9-2 System instruction encodings for non-Debug System + * register accesses. + */ + if (mrs_Op0(insn) != 3 || mrs_Op1(insn) != 0 || mrs_CRn(insn) != 0) + return (0); + + CRm = mrs_CRm(insn); + if (CRm > 7 || (CRm < 4 && CRm != 0)) + return (0); + + Op2 = mrs_Op2(insn); + value = 0; + + for (i = 0; i < nitems(safe_values); i++) { + if (safe_values[i].CRm == CRm && safe_values[i].Op2 == Op2) { + value = safe_values[i].value; + break; + } + } + + if (CRm == 0) { + switch (Op2) { + case 0: + value = READ_SPECIALREG(midr_el1); + break; + case 5: + value = READ_SPECIALREG(mpidr_el1); + break; + case 6: + value = READ_SPECIALREG(revidr_el1); + break; + default: + return (0); + } + } + + /* + * We will handle this instruction, move to the next so we + * don't trap here again. + */ + frame->tf_elr += INSN_SIZE; + + reg = MRS_REGISTER(insn); + /* If reg is 31 then write to xzr, i.e. do nothing */ + if (reg == 31) + return (1); + + if (reg < nitems(frame->tf_x)) + frame->tf_x[reg] = value; + else if (reg == 30) + frame->tf_lr = value; + + return (1); +} + static void identify_cpu_sysinit(void *dummy __unused) { @@ -170,6 +263,8 @@ identify_cpu_sysinit(void *dummy __unused) CPU_FOREACH(cpu) { print_cpu_features(cpu); } + + install_undef_handler(true, user_mrs_handler); } SYSINIT(idenrity_cpu, SI_SUB_SMP, SI_ORDER_ANY, identify_cpu_sysinit, NULL); diff --git a/sys/arm64/arm64/undefined.c b/sys/arm64/arm64/undefined.c index a96d5f88ddd5..753558a085dd 100644 --- a/sys/arm64/arm64/undefined.c +++ b/sys/arm64/arm64/undefined.c @@ -53,135 +53,6 @@ struct undef_handler { */ LIST_HEAD(, undef_handler) undef_handlers[2]; -#define MRS_MASK 0xfff00000 -#define MRS_VALUE 0xd5300000 -#define MRS_SPECIAL(insn) ((insn) & 0x000fffe0) -#define MRS_REGISTER(insn) ((insn) & 0x0000001f) -#define MRS_Op0_SHIFT 19 -#define MRS_Op0_MASK 0x00080000 -#define MRS_Op1_SHIFT 16 -#define MRS_Op1_MASK 0x00070000 -#define MRS_CRn_SHIFT 12 -#define MRS_CRn_MASK 0x0000f000 -#define MRS_CRm_SHIFT 8 -#define MRS_CRm_MASK 0x00000f00 -#define MRS_Op2_SHIFT 5 -#define MRS_Op2_MASK 0x000000e0 -#define MRS_Rt_SHIFT 0 -#define MRS_Rt_MASK 0x0000001f - -static inline int -mrs_Op0(uint32_t insn) -{ - - /* op0 is encoded without the top bit in a mrs instruction */ - return (2 | ((insn & MRS_Op0_MASK) >> MRS_Op0_SHIFT)); -} - -#define MRS_GET(op) \ -static inline int \ -mrs_##op(uint32_t insn) \ -{ \ - \ - return ((insn & MRS_##op##_MASK) >> MRS_##op##_SHIFT); \ -} -MRS_GET(Op1) -MRS_GET(CRn) -MRS_GET(CRm) -MRS_GET(Op2) - -struct mrs_safe_value { - u_int CRm; - u_int Op2; - uint64_t value; -}; - -static struct mrs_safe_value safe_values[] = { - { /* id_aa64pfr0_el1 */ - .CRm = 4, - .Op2 = 0, - .value = ID_AA64PFR0_ADV_SIMD_NONE | ID_AA64PFR0_FP_NONE | - ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, - }, - { /* id_aa64dfr0_el1 */ - .CRm = 5, - .Op2 = 0, - .value = ID_AA64DFR0_DEBUG_VER_8, - }, -}; - -static int -user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, - uint32_t esr) -{ - uint64_t value; - int CRm, Op2, i, reg; - - if ((insn & MRS_MASK) != MRS_VALUE) - return (0); - - /* - * We only emulate Op0 == 3, Op1 == 0, CRn == 0, CRm == {0, 4-7}. - * These are in the EL1 CPU identification space. - * CRm == 0 holds MIDR_EL1, MPIDR_EL1, and REVID_EL1. - * CRm == {4-7} holds the ID_AA64 registers. - * - * For full details see the ARMv8 ARM (ARM DDI 0487C.a) - * Table D9-2 System instruction encodings for non-Debug System - * register accesses. - */ - if (mrs_Op0(insn) != 3 || mrs_Op1(insn) != 0 || mrs_CRn(insn) != 0) - return (0); - - CRm = mrs_CRm(insn); - if (CRm > 7 || (CRm < 4 && CRm != 0)) - return (0); - - Op2 = mrs_Op2(insn); - value = 0; - - for (i = 0; i < nitems(safe_values); i++) { - if (safe_values[i].CRm == CRm && safe_values[i].Op2 == Op2) { - value = safe_values[i].value; - break; - } - } - - if (CRm == 0) { - switch (Op2) { - case 0: - value = READ_SPECIALREG(midr_el1); - break; - case 5: - value = READ_SPECIALREG(mpidr_el1); - break; - case 6: - value = READ_SPECIALREG(revidr_el1); - break; - default: - return (0); - } - } - - /* - * We will handle this instruction, move to the next so we - * don't trap here again. - */ - frame->tf_elr += INSN_SIZE; - - reg = MRS_REGISTER(insn); - /* If reg is 31 then write to xzr, i.e. do nothing */ - if (reg == 31) - return (1); - - if (reg < nitems(frame->tf_x)) - frame->tf_x[reg] = value; - else if (reg == 30) - frame->tf_lr = value; - - return (1); -} - /* * Work around a bug in QEMU prior to 2.5.1 where reading unknown ID * registers would raise an exception when they should return 0. @@ -219,7 +90,6 @@ undef_init(void) LIST_INIT(&undef_handlers[0]); LIST_INIT(&undef_handlers[1]); - install_undef_handler(true, user_mrs_handler); install_undef_handler(false, id_aa64mmfr2_handler); } diff --git a/sys/arm64/include/undefined.h b/sys/arm64/include/undefined.h index 7fded28c31f2..ee25af6e3f9c 100644 --- a/sys/arm64/include/undefined.h +++ b/sys/arm64/include/undefined.h @@ -36,6 +36,43 @@ typedef int (*undef_handler_t)(vm_offset_t, uint32_t, struct trapframe *, uint32_t); +#define MRS_MASK 0xfff00000 +#define MRS_VALUE 0xd5300000 +#define MRS_SPECIAL(insn) ((insn) & 0x000fffe0) +#define MRS_REGISTER(insn) ((insn) & 0x0000001f) +#define MRS_Op0_SHIFT 19 +#define MRS_Op0_MASK 0x00080000 +#define MRS_Op1_SHIFT 16 +#define MRS_Op1_MASK 0x00070000 +#define MRS_CRn_SHIFT 12 +#define MRS_CRn_MASK 0x0000f000 +#define MRS_CRm_SHIFT 8 +#define MRS_CRm_MASK 0x00000f00 +#define MRS_Op2_SHIFT 5 +#define MRS_Op2_MASK 0x000000e0 +#define MRS_Rt_SHIFT 0 +#define MRS_Rt_MASK 0x0000001f + +static inline int +mrs_Op0(uint32_t insn) +{ + + /* op0 is encoded without the top bit in a mrs instruction */ + return (2 | ((insn & MRS_Op0_MASK) >> MRS_Op0_SHIFT)); +} + +#define MRS_GET(op) \ +static inline int \ +mrs_##op(uint32_t insn) \ +{ \ + \ + return ((insn & MRS_##op##_MASK) >> MRS_##op##_SHIFT); \ +} +MRS_GET(Op1) +MRS_GET(CRn) +MRS_GET(CRm) +MRS_GET(Op2) + void undef_init(void); void *install_undef_handler(bool, undef_handler_t); void remove_undef_handler(void *); -- cgit v1.3 From e3f284eee7af7bcebf4bc96f64a9bc9e8e9d692b Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Thu, 27 Sep 2018 13:54:09 +0000 Subject: Export ID_AA64PFR0_EL1 to userland Create a user view of the ID_AA64PFR0_EL1 register with values common across all CPUs. Approved by: re (kib) Sponsored by: ABT Systems Ltd Differential Revision: https://reviews.freebsd.org/D17301 --- sys/arm64/arm64/identcpu.c | 122 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 9 deletions(-) (limited to 'sys') diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c index 5ba4fbc60605..286b4358f4a5 100644 --- a/sys/arm64/arm64/identcpu.c +++ b/sys/arm64/arm64/identcpu.c @@ -88,6 +88,7 @@ struct cpu_desc { }; struct cpu_desc cpu_desc[MAXCPU]; +struct cpu_desc user_cpu_desc; static u_int cpu_print_regs; #define PRINT_ID_AA64_AFR0 0x00000001 #define PRINT_ID_AA64_AFR1 0x00000002 @@ -163,26 +164,77 @@ const struct cpu_implementers cpu_implementers[] = { CPU_IMPLEMENTER_NONE, }; -struct mrs_safe_value { +#define MRS_TYPE_MASK 0xf +#define MRS_INVALID 0 +#define MRS_EXACT 1 +#define MRS_EXACT_VAL(x) (MRS_EXACT | ((x) << 4)) +#define MRS_EXACT_FIELD(x) ((x) >> 4) +#define MRS_LOWER 2 + +struct mrs_field { + bool sign; + u_int type; + u_int shift; +}; + +#define MRS_FIELD(_sign, _type, _shift) \ + { \ + .sign = (_sign), \ + .type = (_type), \ + .shift = (_shift), \ + } + +#define MRS_FIELD_END { .type = MRS_INVALID, } + +static struct mrs_field id_aa64pfr0_fields[] = { + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_SVE_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_RAS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_GIC_SHIFT), + MRS_FIELD(true, MRS_LOWER, ID_AA64PFR0_ADV_SIMD_SHIFT), + MRS_FIELD(true, MRS_LOWER, ID_AA64PFR0_FP_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_EL3_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_EL2_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64PFR0_EL1_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64PFR0_EL0_SHIFT), + MRS_FIELD_END, +}; + +static struct mrs_field id_aa64dfr0_fields[] = { + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_PMS_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_CTX_CMPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_WRPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_BRPS_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_PMU_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT, ID_AA64DFR0_TRACE_VER_SHIFT), + MRS_FIELD(false, MRS_EXACT_VAL(0x6), ID_AA64DFR0_DEBUG_VER_SHIFT), + MRS_FIELD_END, +}; + +struct mrs_user_reg { u_int CRm; u_int Op2; - uint64_t value; + size_t offset; + struct mrs_field *fields; }; -static struct mrs_safe_value safe_values[] = { +static struct mrs_user_reg user_regs[] = { { /* id_aa64pfr0_el1 */ .CRm = 4, .Op2 = 0, - .value = ID_AA64PFR0_ADV_SIMD_NONE | ID_AA64PFR0_FP_NONE | - ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, + .offset = __offsetof(struct cpu_desc, id_aa64pfr0), + .fields = id_aa64pfr0_fields, }, { /* id_aa64dfr0_el1 */ .CRm = 5, .Op2 = 0, - .value = ID_AA64DFR0_DEBUG_VER_8, + .offset = __offsetof(struct cpu_desc, id_aa64dfr0), + .fields = id_aa64dfr0_fields, }, }; +#define CPU_DESC_FIELD(desc, idx) \ + *(uint64_t *)((char *)&(desc) + user_regs[(idx)].offset) + static int user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, uint32_t esr) @@ -213,9 +265,9 @@ user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, Op2 = mrs_Op2(insn); value = 0; - for (i = 0; i < nitems(safe_values); i++) { - if (safe_values[i].CRm == CRm && safe_values[i].Op2 == Op2) { - value = safe_values[i].value; + for (i = 0; i < nitems(user_regs); i++) { + if (user_regs[i].CRm == CRm && user_regs[i].Op2 == Op2) { + value = CPU_DESC_FIELD(user_cpu_desc, i); break; } } @@ -255,13 +307,65 @@ user_mrs_handler(vm_offset_t va, uint32_t insn, struct trapframe *frame, return (1); } +static void +update_user_regs(u_int cpu) +{ + struct mrs_field *fields; + uint64_t cur, value; + int i, j, cur_field, new_field; + + for (i = 0; i < nitems(user_regs); i++) { + value = CPU_DESC_FIELD(cpu_desc[cpu], i); + if (cpu == 0) + cur = value; + else + cur = CPU_DESC_FIELD(user_cpu_desc, i); + + fields = user_regs[i].fields; + for (j = 0; fields[j].type != 0; j++) { + switch (fields[j].type & MRS_TYPE_MASK) { + case MRS_EXACT: + cur &= ~(0xfu << fields[j].shift); + cur |= + (uint64_t)MRS_EXACT_FIELD(fields[j].type) << + fields[j].shift; + break; + case MRS_LOWER: + new_field = (value >> fields[j].shift) & 0xf; + cur_field = (cur >> fields[j].shift) & 0xf; + if ((fields[j].sign && + (int)new_field < (int)cur_field) || + (!fields[j].sign && + (u_int)new_field < (u_int)cur_field)) { + cur &= ~(0xfu << fields[j].shift); + cur |= new_field << fields[j].shift; + } + break; + default: + panic("Invalid field type: %d", fields[j].type); + } + } + + CPU_DESC_FIELD(user_cpu_desc, i) = cur; + } +} + static void identify_cpu_sysinit(void *dummy __unused) { int cpu; + /* Create a user visible cpu description with safe values */ + memset(&user_cpu_desc, 0, sizeof(user_cpu_desc)); + /* Safe values for these registers */ + user_cpu_desc.id_aa64pfr0 = ID_AA64PFR0_ADV_SIMD_NONE | + ID_AA64PFR0_FP_NONE | ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64; + user_cpu_desc.id_aa64dfr0 = ID_AA64DFR0_DEBUG_VER_8; + + CPU_FOREACH(cpu) { print_cpu_features(cpu); + update_user_regs(cpu); } install_undef_handler(true, user_mrs_handler); -- cgit v1.3 From a8e3f99ec190ee5c11413878b7fd75578e542808 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Sep 2018 14:05:44 +0000 Subject: amd64: implement memcmp in assembly Both the in-kernel C variant and libc asm variant have very poor performance. The former compiles to a single byte comparison loop, which breaks down even for small sizes. The latter uses rep cmpsq/b which turn out to have very poor throughput and are slower than a hand-coded 32-byte comparison loop. Depending on size this is about 3-4 times faster than the current routines. Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17328 --- sys/amd64/amd64/support.S | 94 +++++++++++++++++++++++++++++++++++++++++++++++ sys/conf/files | 1 - sys/conf/files.arm | 1 + sys/conf/files.arm64 | 1 + sys/conf/files.i386 | 1 + sys/conf/files.mips | 1 + sys/conf/files.powerpc | 1 + sys/conf/files.riscv | 1 + sys/conf/files.sparc64 | 1 + 9 files changed, 101 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 7d580b67656a..6bc8c208c751 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -100,6 +100,100 @@ ENTRY(sse2_pagezero) ret END(sse2_pagezero) +/* + * memcmpy(b1, b2, len) + * rdi,rsi,len + */ +ENTRY(memcmp) + PUSH_FRAME_POINTER + cmpq $16,%rdx + jae 5f +1: + testq %rdx,%rdx + je 3f + xorl %ecx,%ecx +2: + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jz 3f + movzbl (%rdi,%rcx,1),%eax + movzbl (%rsi,%rcx,1),%r8d + cmpb %r8b,%al + jne 4f + addq $1,%rcx + cmpq %rcx,%rdx + jne 2b +3: + xorl %eax,%eax + POP_FRAME_POINTER + ret +4: + subl %r8d,%eax + POP_FRAME_POINTER + ret +5: + cmpq $32,%rdx + jae 7f +6: + /* + * 8 bytes + */ + movq (%rdi),%r8 + movq (%rsi),%r9 + cmpq %r8,%r9 + jne 1b + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + subq $8,%rdx + cmpq $8,%rdx + jae 6b + jl 1b + jmp 3b +7: + /* + * 32 bytes + */ + movq (%rsi),%r8 + movq 8(%rsi),%r9 + subq (%rdi),%r8 + subq 8(%rdi),%r9 + or %r8,%r9 + jnz 1b + + movq 16(%rsi),%r8 + movq 24(%rsi),%r9 + subq 16(%rdi),%r8 + subq 24(%rdi),%r9 + or %r8,%r9 + jnz 1b + + leaq 32(%rdi),%rdi + leaq 32(%rsi),%rsi + subq $32,%rdx + cmpq $32,%rdx + jae 7b + jnz 1b + jmp 3b +END(memcmp) + /* * memmove(dst, src, cnt) * rdi, rsi, rdx diff --git a/sys/conf/files b/sys/conf/files index b2fcc65773e0..ee18125a034f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4041,7 +4041,6 @@ libkern/murmur3_32.c standard libkern/mcount.c optional profiling-routine libkern/memcchr.c standard libkern/memchr.c standard -libkern/memcmp.c standard libkern/memmem.c optional gdb libkern/qsort.c standard libkern/qsort_r.c standard diff --git a/sys/conf/files.arm b/sys/conf/files.arm index 98d452a8f7ab..087f4c695fa1 100644 --- a/sys/conf/files.arm +++ b/sys/conf/files.arm @@ -163,6 +163,7 @@ libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/lshrdi3.c standard +libkern/memcmp.c standard libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 76e9e8e36479..010fbc7460c7 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -244,6 +244,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard libkern/arm64/crc32c_armv8.S standard cddl/contrib/opensolaris/common/atomic/aarch64/opensolaris_atomic.S optional zfs | dtrace compile-with "${CDDL_C}" diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index d2591137a990..f7a86cf5ee49 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -548,6 +548,7 @@ kern/subr_sfbuf.c standard libkern/divdi3.c standard libkern/ffsll.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard diff --git a/sys/conf/files.mips b/sys/conf/files.mips index 07448f44497c..1977fb9dcce2 100644 --- a/sys/conf/files.mips +++ b/sys/conf/files.mips @@ -65,6 +65,7 @@ libkern/cmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ashldi3.c standard libkern/ashrdi3.c standard +libkern/memcmp.c standard # cfe support dev/cfe/cfe_api.c optional cfe diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index d6df0c878641..faa057bfc882 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -98,6 +98,7 @@ libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/lshrdi3.c optional powerpc | powerpcspe +libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc | powerpcspe libkern/qdivrem.c optional powerpc | powerpcspe diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index daba1826331b..8a70023eb91a 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -22,6 +22,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard libkern/memset.c standard riscv/riscv/autoconf.c standard riscv/riscv/bus_machdep.c standard diff --git a/sys/conf/files.sparc64 b/sys/conf/files.sparc64 index 7d8cc3c5cb88..ed61aa49648e 100644 --- a/sys/conf/files.sparc64 +++ b/sys/conf/files.sparc64 @@ -71,6 +71,7 @@ libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard +libkern/memcmp.c standard sparc64/central/central.c optional central sparc64/ebus/ebus.c optional ebus sparc64/ebus/epic.c optional epic ebus -- cgit v1.3 From 0e59ecce47caa309cf35a0f8eb528931a124ec5c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Sep 2018 15:24:16 +0000 Subject: amd64: clean up copyin/copyout - move the PSL.AC comment to the fault handler - stop testing for zero-sized ops. after several minutes of package building there were no copyin calls with zero bytes and very few copyout. the semantic of returning 0 in this case is preserved - shorten exit paths by clearing %eax earlier - replace xchg with 3 movs. this is what compilers do. a naive benchmark on EPYC suggests about 1% increase in thoughput thanks to this change. - remove the useless movb %cl,%al from copyout. it looks like a leftover from many years ago Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17286 --- sys/amd64/amd64/support.S | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 6bc8c208c751..7c52421dd847 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -404,10 +404,7 @@ END(fillw) .macro COPYOUT smap erms PUSH_FRAME_POINTER movq PCPU(CURPCB),%r9 - /* Trap entry clears PSL.AC */ movq $copy_fault,PCB_ONFAULT(%r9) - testq %rdx,%rdx /* anything to do? */ - jz 2f /* * Check explicitly for non-user addresses. If 486 write protection @@ -432,10 +429,20 @@ END(fillw) cmpq %rcx,%rax ja copy_fault - xchgq %rdi,%rsi - /* bcopy(%rsi, %rdi, %rdx) */ + /* + * Set up arguments for rep movs*. + */ + movq %rdi,%r8 + movq %rsi,%rdi + movq %r8,%rsi movq %rdx,%rcx + /* + * Set return value to zero. Remaining failure mode goes through + * copy_fault. + */ + xorl %eax,%eax + SMAP_DISABLE \smap .if \erms == 0 cmpq $15,%rcx @@ -447,17 +454,16 @@ END(fillw) andb $7,%cl jne 1f SMAP_ENABLE \smap - xorl %eax,%eax movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret .endif + ALIGN_TEXT 1: rep movsb + SMAP_ENABLE \smap -2: - xorl %eax,%eax movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret @@ -487,8 +493,6 @@ END(copyout_smap_erms) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r9 movq $copy_fault,PCB_ONFAULT(%r9) - testq %rdx,%rdx /* anything to do? */ - jz 2f /* * make sure address is valid @@ -500,9 +504,12 @@ END(copyout_smap_erms) cmpq %rcx,%rax ja copy_fault - xchgq %rdi,%rsi + movq %rdi,%r8 + movq %rsi,%rdi + movq %r8,%rsi movq %rdx,%rcx - movb %cl,%al + + xorl %eax,%eax SMAP_DISABLE \smap .if \erms == 0 @@ -511,22 +518,20 @@ END(copyout_smap_erms) shrq $3,%rcx /* copy longword-wise */ rep movsq - movb %al,%cl + movb %dl,%cl andb $7,%cl /* copy remaining bytes */ jne 1f SMAP_ENABLE \smap - xorl %eax,%eax movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret .endif + ALIGN_TEXT 1: rep movsb SMAP_ENABLE \smap -2: - xorl %eax,%eax movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret @@ -549,9 +554,9 @@ ENTRY(copyin_smap_erms) END(copyin_smap_erms) ALIGN_TEXT + /* Trap entry clears PSL.AC */ copy_fault: - movq PCPU(CURPCB),%rdx - movq $0,PCB_ONFAULT(%rdx) + movq $0,PCB_ONFAULT(%r9) movl $EFAULT,%eax POP_FRAME_POINTER ret -- cgit v1.3 From 3d95cc51bbac0db3b512baba91c8cb75f409cfaf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Sep 2018 15:27:53 +0000 Subject: amd64: mostly depessimize copystr - remove a forward branch in the common case - replace xchg + lodsb/stosb loop with simple movs A simple test on Intel(R) Core(TM) i7-4600U CPU @ 2.10GH copying /foo/bar/baz in a loop goes from 295715863 ops/s to 465807408. Further changes are pending. Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17281 --- sys/amd64/amd64/support.S | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 7c52421dd847..9aeef2b0ebb3 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -1122,34 +1122,33 @@ ENTRY(copystr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ - xchgq %rdi,%rsi - incq %rdx + incq %rdx 1: decq %rdx jz 4f - lodsb - stosb - orb %al,%al + movb (%rdi),%al + movb %al,(%rsi) + incq %rsi + incq %rdi + testb %al,%al jnz 1b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax - jmp 6f -4: - /* rdx is zero -- return ENAMETOOLONG */ - movq $ENAMETOOLONG,%rax - -6: - +2: testq %rcx,%rcx - jz 7f + jz 3f /* set *lencopied and return %rax */ subq %rdx,%r8 movq %r8,(%rcx) -7: +3: POP_FRAME_POINTER ret +4: + /* rdx is zero -- return ENAMETOOLONG */ + movl $ENAMETOOLONG,%eax + jmp 2b END(copystr) /* -- cgit v1.3 From e15e0e3e4db57e719c91b67180035f0bfc679d67 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Thu, 27 Sep 2018 15:32:37 +0000 Subject: In in6_pcbpurgeif0() called, e.g., from if_clone_destroy(), once we have a lock, make sure the inp is not marked freed. This can happen since the list traversal and locking was converted to epoch(9). If the inp is marked "freed", skip it. This prevents a NULL pointer deref panic later on. Reported by: slavash (Mellanox) Tested by: slavash (Mellanox) Reviewed by: markj (no formal review but caught my unlock mistake) Approved by: re (kib) --- sys/netinet6/in6_pcb.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sys') diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index a9c73798b18c..b9d8e9e3187a 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -809,6 +809,10 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); + if (__predict_false(in6p->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(in6p); + continue; + } im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* -- cgit v1.3 From 5910b876054c965c6e6ea9d8585721d5869cd9e5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Sep 2018 15:53:36 +0000 Subject: amd64: macroify and mostly depessimize copyinstr See r338968 for details. Reviewed by: kib Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17288 --- sys/amd64/amd64/support.S | 104 ++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 58 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 9aeef2b0ebb3..be92ff2d7234 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -1011,96 +1011,86 @@ fusufault: * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ -ENTRY(copyinstr_nosmap) +.macro COPYINSTR smap PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ - movq %rcx,%r9 /* %r9 = *len */ - xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ - movq PCPU(CURPCB),%rcx - movq $cpystrflt,PCB_ONFAULT(%rcx) + movq PCPU(CURPCB),%r9 + movq $cpystrflt,PCB_ONFAULT(%r9) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ - subq %rsi,%rax + subq %rdi,%rax jbe cpystrflt + SMAP_DISABLE \smap + /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax - jae 1f - movq %rax,%rdx - movq %rax,%r8 + jb 8f 1: incq %rdx - 2: decq %rdx +.if \smap == 0 jz copyinstr_toolong +.else + jz copyinstr_toolong_smap +.endif - lodsb - stosb - orb %al,%al + movb (%rdi),%al + movb %al,(%rsi) + incq %rsi + incq %rdi + testb %al,%al jnz 2b - jmp copyinstr_succ -END(copyinstr_nosmap) - -ENTRY(copyinstr_smap) - PUSH_FRAME_POINTER - movq %rdx,%r8 /* %r8 = maxlen */ - movq %rcx,%r9 /* %r9 = *len */ - xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ - movq PCPU(CURPCB),%rcx - movq $cpystrflt,PCB_ONFAULT(%rcx) - - movq $VM_MAXUSER_ADDRESS,%rax + SMAP_ENABLE \smap - /* make sure 'from' is within bounds */ - subq %rsi,%rax - jbe cpystrflt + /* Success -- 0 byte reached */ + decq %rdx + xorl %eax,%eax - stac + /* set *lencopied and return %eax */ + movq %rax,PCB_ONFAULT(%r9) - /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ - cmpq %rdx,%rax - jae 1f + testq %rcx,%rcx + jz 3f + subq %rdx,%r8 + movq %r8,(%rcx) +3: + POP_FRAME_POINTER + ret + ALIGN_TEXT +8: movq %rax,%rdx movq %rax,%r8 -1: - incq %rdx - -2: - decq %rdx - jz copyinstr_toolong_smap + jmp 1b - lodsb - stosb - orb %al,%al - jnz 2b +.endm - clac +ENTRY(copyinstr_nosmap) + COPYINSTR smap=0 +END(copyinstr_nosmap) -copyinstr_succ: - /* Success -- 0 byte reached */ - decq %rdx - xorl %eax,%eax +ENTRY(copyinstr_smap) + COPYINSTR smap=1 +END(copyinstr_smap) +cpystrflt: + /* Fault entry clears PSL.AC */ + movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ - movq PCPU(CURPCB),%rcx - movq $0,PCB_ONFAULT(%rcx) + movq $0,PCB_ONFAULT(%r9) - testq %r9,%r9 + testq %rcx,%rcx jz 1f subq %rdx,%r8 - movq %r8,(%r9) + movq %r8,(%rcx) 1: POP_FRAME_POINTER ret - /* Fault entry clears PSL.AC */ -cpystrflt: - movq $EFAULT,%rax - jmp cpystrflt_x copyinstr_toolong_smap: clac @@ -1109,11 +1099,9 @@ copyinstr_toolong: movq $VM_MAXUSER_ADDRESS,%rax cmpq %rax,%rsi jae cpystrflt - movq $ENAMETOOLONG,%rax + movl $ENAMETOOLONG,%eax jmp cpystrflt_x -END(copyinstr_smap) - /* * copystr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx -- cgit v1.3 From 83382d027f64411dc5ac57d25496e8c63d106297 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 27 Sep 2018 17:33:59 +0000 Subject: Don't clear DR6 for debug exceptions from userland. This reverts part of r333368. The attempt to clear DR6 was occuring too soon as trapsignal() does not pause to let the debugger notice the SIGTRAP and query DR6. The signal exchange does not occur until much later during ast(). As a result, GDB was no longer recognizing hardware breakpoints and watchpoints on x86. In addition, any userland programs that want to inspect DR6 in a SIGTRAP handler don't have a way to do this if we clear DR6 in the exception handler. Instead of relying on the kernel to clear DR6, debuggers will have to explicitly clear it after a trace trap (which they needed to do on older kernels anyway). Reviewed by: kib Approved by: re (delphij) MFC after: 3 days Differential Revision: https://reviews.freebsd.org/D17319 --- sys/amd64/amd64/trap.c | 6 ------ sys/i386/i386/trap.c | 6 ------ 2 files changed, 12 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 72b9b3e78d96..1a8e5d23ff3a 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -659,12 +659,6 @@ trap(struct trapframe *frame) KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); - /* - * Clear any pending debug exceptions after allowing a - * debugger to read DR6 while stopped in trapsignal(). - */ - if (type == T_TRCTRAP) - load_dr6(0); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 19086221b11b..4b62484533c4 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -762,12 +762,6 @@ kernel_trctrap: KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); - /* - * Clear any pending debug exceptions after allowing a - * debugger to read DR6 while stopped in trapsignal(). - */ - if (type == T_TRCTRAP) - load_dr6(0); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), -- cgit v1.3 From 89f652d14937ac249cd3d79c0d3db49810a20d25 Mon Sep 17 00:00:00 2001 From: Gordon Tetlow Date: Thu, 27 Sep 2018 18:39:54 +0000 Subject: Clear stack allocated data structure to prevent kernel memory leak. Reported by: Thomas Barabosch, Fraunhofer FKIE Reviewed by: wes@ Approved by: re (implicit) Approved by: so Security: FreeBSD-EN-18:12.mem Security: CVE-2018-17155 --- sys/kern/kern_context.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys') diff --git a/sys/kern/kern_context.c b/sys/kern/kern_context.c index 5afb371bfb27..3bd5f31082ac 100644 --- a/sys/kern/kern_context.c +++ b/sys/kern/kern_context.c @@ -70,6 +70,7 @@ sys_getcontext(struct thread *td, struct getcontext_args *uap) if (uap->ucp == NULL) ret = EINVAL; else { + bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; @@ -110,6 +111,7 @@ sys_swapcontext(struct thread *td, struct swapcontext_args *uap) if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { + bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); bzero(uc.__spare__, sizeof(uc.__spare__)); PROC_LOCK(td->td_proc); -- cgit v1.3 From 825eeb55f459bb9221351f45bea01f06ee7a3996 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Sep 2018 20:48:07 +0000 Subject: amd64: fix return value of copyinstr after r338970 The function stopped swapping rdi and rsi, but the error handling code was not updated with the new register name. Approved by: re (implicit) Sponsored by: The FreeBSD Foundation --- sys/amd64/amd64/support.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index be92ff2d7234..6fdb71cb9207 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -1097,7 +1097,7 @@ copyinstr_toolong_smap: copyinstr_toolong: /* rdx is zero - return ENAMETOOLONG or EFAULT */ movq $VM_MAXUSER_ADDRESS,%rax - cmpq %rax,%rsi + cmpq %rax,%rdi jae cpystrflt movl $ENAMETOOLONG,%eax jmp cpystrflt_x -- cgit v1.3 From b7edb6fa77fc1ae8a5ff828b14121a5c8330a8a4 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 27 Sep 2018 21:08:32 +0000 Subject: Centralize compat support for PCIOCGETCONF. The pre-7.x compat for both native and 32-bit code was already in pci_user.c. Use this infrastructure to add implement 32-bit support. This is more correct as ioctl(2) commands only have meaning in the context of a file descriptor. Reviewed by: kib Approved by: re (gjb) Obtained from: CheriBSD Sponsored by: DARPA, AFRL Differential revision: https://reviews.freebsd.org/D17324 --- sys/compat/freebsd32/freebsd32_ioctl.c | 109 --------------------- sys/compat/freebsd32/freebsd32_ioctl.h | 40 -------- sys/dev/pci/pci_user.c | 172 +++++++++++++++++++++++++++++---- 3 files changed, 154 insertions(+), 167 deletions(-) (limited to 'sys') diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index a3226cdcd9a0..b181913b6688 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -58,9 +58,6 @@ __FBSDID("$FreeBSD$"); CTASSERT(sizeof(struct ioc_read_toc_entry32) == 8); CTASSERT(sizeof(struct mem_range_op32) == 12); -CTASSERT(sizeof(struct pci_conf_io32) == 36); -CTASSERT(sizeof(struct pci_match_conf32) == 44); -CTASSERT(sizeof(struct pci_conf32) == 44); static int freebsd32_ioctl_ioc_read_toc(struct thread *td, @@ -147,108 +144,6 @@ freebsd32_ioctl_memrange(struct thread *td, return (error); } -static int -freebsd32_ioctl_pciocgetconf(struct thread *td, - struct freebsd32_ioctl_args *uap, struct file *fp) -{ - struct pci_conf_io pci; - struct pci_conf_io32 pci32; - struct pci_match_conf32 pmc32; - struct pci_match_conf32 *pmc32p; - struct pci_match_conf pmc; - struct pci_match_conf *pmcp; - struct pci_conf32 pc32; - struct pci_conf32 *pc32p; - struct pci_conf pc; - struct pci_conf *pcp; - u_int32_t i; - u_int32_t npat_to_convert; - u_int32_t nmatch_to_convert; - vm_offset_t addr; - int error; - - if ((error = copyin(uap->data, &pci32, sizeof(pci32))) != 0) - return (error); - - CP(pci32, pci, num_patterns); - CP(pci32, pci, offset); - CP(pci32, pci, generation); - - npat_to_convert = pci32.pat_buf_len / sizeof(struct pci_match_conf32); - pci.pat_buf_len = npat_to_convert * sizeof(struct pci_match_conf); - pci.patterns = NULL; - nmatch_to_convert = pci32.match_buf_len / sizeof(struct pci_conf32); - pci.match_buf_len = nmatch_to_convert * sizeof(struct pci_conf); - pci.matches = NULL; - - if ((error = copyout_map(td, &addr, pci.pat_buf_len)) != 0) - goto cleanup; - pci.patterns = (struct pci_match_conf *)addr; - if ((error = copyout_map(td, &addr, pci.match_buf_len)) != 0) - goto cleanup; - pci.matches = (struct pci_conf *)addr; - - npat_to_convert = min(npat_to_convert, pci.num_patterns); - - for (i = 0, pmc32p = (struct pci_match_conf32 *)PTRIN(pci32.patterns), - pmcp = pci.patterns; - i < npat_to_convert; i++, pmc32p++, pmcp++) { - if ((error = copyin(pmc32p, &pmc32, sizeof(pmc32))) != 0) - goto cleanup; - CP(pmc32,pmc,pc_sel); - strlcpy(pmc.pd_name, pmc32.pd_name, sizeof(pmc.pd_name)); - CP(pmc32,pmc,pd_unit); - CP(pmc32,pmc,pc_vendor); - CP(pmc32,pmc,pc_device); - CP(pmc32,pmc,pc_class); - CP(pmc32,pmc,flags); - if ((error = copyout(&pmc, pmcp, sizeof(pmc))) != 0) - goto cleanup; - } - - if ((error = fo_ioctl(fp, PCIOCGETCONF, (caddr_t)&pci, - td->td_ucred, td)) != 0) - goto cleanup; - - nmatch_to_convert = min(nmatch_to_convert, pci.num_matches); - - for (i = 0, pcp = pci.matches, - pc32p = (struct pci_conf32 *)PTRIN(pci32.matches); - i < nmatch_to_convert; i++, pcp++, pc32p++) { - if ((error = copyin(pcp, &pc, sizeof(pc))) != 0) - goto cleanup; - CP(pc,pc32,pc_sel); - CP(pc,pc32,pc_hdr); - CP(pc,pc32,pc_subvendor); - CP(pc,pc32,pc_subdevice); - CP(pc,pc32,pc_vendor); - CP(pc,pc32,pc_device); - CP(pc,pc32,pc_class); - CP(pc,pc32,pc_subclass); - CP(pc,pc32,pc_progif); - CP(pc,pc32,pc_revid); - strlcpy(pc32.pd_name, pc.pd_name, sizeof(pc32.pd_name)); - CP(pc,pc32,pd_unit); - if ((error = copyout(&pc32, pc32p, sizeof(pc32))) != 0) - goto cleanup; - } - - CP(pci, pci32, num_matches); - CP(pci, pci32, offset); - CP(pci, pci32, generation); - CP(pci, pci32, status); - - error = copyout(&pci32, uap->data, sizeof(pci32)); - -cleanup: - if (pci.patterns) - copyout_unmap(td, (vm_offset_t)pci.patterns, pci.pat_buf_len); - if (pci.matches) - copyout_unmap(td, (vm_offset_t)pci.matches, pci.match_buf_len); - - return (error); -} - static int freebsd32_ioctl_barmmap(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) @@ -382,10 +277,6 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) error = freebsd32_ioctl_memrange(td, uap, fp); break; - case PCIOCGETCONF_32: - error = freebsd32_ioctl_pciocgetconf(td, uap, fp); - break; - case SG_IO_32: error = freebsd32_ioctl_sg(td, uap, fp); break; diff --git a/sys/compat/freebsd32/freebsd32_ioctl.h b/sys/compat/freebsd32/freebsd32_ioctl.h index fc9c93a7a29b..c19c27e3618a 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.h +++ b/sys/compat/freebsd32/freebsd32_ioctl.h @@ -56,45 +56,6 @@ struct mem_range_op32 int mo_arg[2]; }; -struct pci_conf32 { - struct pcisel pc_sel; /* domain+bus+slot+function */ - u_int8_t pc_hdr; /* PCI header type */ - u_int16_t pc_subvendor; /* card vendor ID */ - u_int16_t pc_subdevice; /* card device ID, assigned by - card vendor */ - u_int16_t pc_vendor; /* chip vendor ID */ - u_int16_t pc_device; /* chip device ID, assigned by - chip vendor */ - u_int8_t pc_class; /* chip PCI class */ - u_int8_t pc_subclass; /* chip PCI subclass */ - u_int8_t pc_progif; /* chip PCI programming interface */ - u_int8_t pc_revid; /* chip revision ID */ - char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ - u_int32_t pd_unit; /* device unit number */ -}; - -struct pci_match_conf32 { - struct pcisel pc_sel; /* domain+bus+slot+function */ - char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ - u_int32_t pd_unit; /* Unit number */ - u_int16_t pc_vendor; /* PCI Vendor ID */ - u_int16_t pc_device; /* PCI Device ID */ - u_int8_t pc_class; /* PCI class */ - u_int32_t flags; /* Matching expression */ -}; - -struct pci_conf_io32 { - u_int32_t pat_buf_len; /* pattern buffer length */ - u_int32_t num_patterns; /* number of patterns */ - caddr_t32 patterns; /* struct pci_match_conf ptr */ - u_int32_t match_buf_len; /* match buffer length */ - u_int32_t num_matches; /* number of matches returned */ - caddr_t32 matches; /* struct pci_conf ptr */ - u_int32_t offset; /* offset into device list */ - u_int32_t generation; /* device list generation */ - u_int32_t status; /* request status */ -}; - struct pci_bar_mmap32 { uint32_t pbm_map_base; uint32_t pbm_map_length; @@ -110,7 +71,6 @@ struct pci_bar_mmap32 { #define FIODGNAME_32 _IOW('f', 120, struct fiodgname_arg32) #define MEMRANGE_GET32 _IOWR('m', 50, struct mem_range_op32) #define MEMRANGE_SET32 _IOW('m', 51, struct mem_range_op32) -#define PCIOCGETCONF_32 _IOWR('p', 5, struct pci_conf_io32) #define SG_IO_32 _IOWR(SGIOC, 0x85, struct sg_io_hdr32) #define PCIOCBARMMAP_32 _IOWR('p', 8, struct pci_bar_mmap32) diff --git a/sys/dev/pci/pci_user.c b/sys/dev/pci/pci_user.c index 4e40a8167362..380beff0d310 100644 --- a/sys/dev/pci/pci_user.c +++ b/sys/dev/pci/pci_user.c @@ -66,6 +66,49 @@ __FBSDID("$FreeBSD$"); #include "pcib_if.h" #include "pci_if.h" +#ifdef COMPAT_FREEBSD32 +struct pci_conf32 { + struct pcisel pc_sel; /* domain+bus+slot+function */ + u_int8_t pc_hdr; /* PCI header type */ + u_int16_t pc_subvendor; /* card vendor ID */ + u_int16_t pc_subdevice; /* card device ID, assigned by + card vendor */ + u_int16_t pc_vendor; /* chip vendor ID */ + u_int16_t pc_device; /* chip device ID, assigned by + chip vendor */ + u_int8_t pc_class; /* chip PCI class */ + u_int8_t pc_subclass; /* chip PCI subclass */ + u_int8_t pc_progif; /* chip PCI programming interface */ + u_int8_t pc_revid; /* chip revision ID */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_int32_t pd_unit; /* device unit number */ +}; + +struct pci_match_conf32 { + struct pcisel pc_sel; /* domain+bus+slot+function */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_int32_t pd_unit; /* Unit number */ + u_int16_t pc_vendor; /* PCI Vendor ID */ + u_int16_t pc_device; /* PCI Device ID */ + u_int8_t pc_class; /* PCI class */ + u_int32_t flags; /* Matching expression */ +}; + +struct pci_conf_io32 { + u_int32_t pat_buf_len; /* pattern buffer length */ + u_int32_t num_patterns; /* number of patterns */ + u_int32_t patterns; /* struct pci_match_conf ptr */ + u_int32_t match_buf_len; /* match buffer length */ + u_int32_t num_matches; /* number of matches returned */ + u_int32_t matches; /* struct pci_conf ptr */ + u_int32_t offset; /* offset into device list */ + u_int32_t generation; /* device list generation */ + u_int32_t status; /* request status */ +}; + +#define PCIOCGETCONF32 _IOC_NEWTYPE(PCIOCGETCONF, struct pci_conf_io32) +#endif + /* * This is the user interface to PCI configuration space. */ @@ -175,6 +218,73 @@ pci_conf_match_native(struct pci_match_conf *matches, int num_matches, return(1); } +#ifdef COMPAT_FREEBSD32 +static int +pci_conf_match32(struct pci_match_conf32 *matches, int num_matches, + struct pci_conf *match_buf) +{ + int i; + + if ((matches == NULL) || (match_buf == NULL) || (num_matches <= 0)) + return(1); + + for (i = 0; i < num_matches; i++) { + /* + * I'm not sure why someone would do this...but... + */ + if (matches[i].flags == PCI_GETCONF_NO_MATCH) + continue; + + /* + * Look at each of the match flags. If it's set, do the + * comparison. If the comparison fails, we don't have a + * match, go on to the next item if there is one. + */ + if (((matches[i].flags & PCI_GETCONF_MATCH_DOMAIN) != 0) + && (match_buf->pc_sel.pc_domain != + matches[i].pc_sel.pc_domain)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_BUS) != 0) + && (match_buf->pc_sel.pc_bus != matches[i].pc_sel.pc_bus)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_DEV) != 0) + && (match_buf->pc_sel.pc_dev != matches[i].pc_sel.pc_dev)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_FUNC) != 0) + && (match_buf->pc_sel.pc_func != matches[i].pc_sel.pc_func)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_VENDOR) != 0) + && (match_buf->pc_vendor != matches[i].pc_vendor)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_DEVICE) != 0) + && (match_buf->pc_device != matches[i].pc_device)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_CLASS) != 0) + && (match_buf->pc_class != matches[i].pc_class)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_UNIT) != 0) + && (match_buf->pd_unit != matches[i].pd_unit)) + continue; + + if (((matches[i].flags & PCI_GETCONF_MATCH_NAME) != 0) + && (strncmp(matches[i].pd_name, match_buf->pd_name, + sizeof(match_buf->pd_name)) != 0)) + continue; + + return(0); + } + + return(1); +} +#endif /* COMPAT_FREEBSD32 */ + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) #define PRE7_COMPAT @@ -259,20 +369,6 @@ struct pci_match_conf_old32 { pci_getconf_flags_old flags; /* Matching expression */ }; -struct pci_conf_io32 { - uint32_t pat_buf_len; /* pattern buffer length */ - uint32_t num_patterns; /* number of patterns */ - uint32_t patterns; /* pattern buffer - (struct pci_match_conf_old32 *) */ - uint32_t match_buf_len; /* match buffer length */ - uint32_t num_matches; /* number of matches returned */ - uint32_t matches; /* match buffer - (struct pci_conf_old32 *) */ - uint32_t offset; /* offset into device list */ - uint32_t generation; /* device list generation */ - pci_getconf_status status; /* request status */ -}; - #define PCIOCGETCONF_OLD32 _IOWR('p', 1, struct pci_conf_io32) #endif /* COMPAT_FREEBSD32 */ @@ -411,6 +507,9 @@ pci_conf_match_old32(struct pci_match_conf_old32 *matches, int num_matches, union pci_conf_union { struct pci_conf pc; +#ifdef COMPAT_FREEBSD32 + struct pci_conf32 pc32; +#endif #ifdef PRE7_COMPAT struct pci_conf_old pco; #ifdef COMPAT_FREEBSD32 @@ -428,6 +527,11 @@ pci_conf_match(u_long cmd, struct pci_match_conf *matches, int num_matches, case PCIOCGETCONF: return (pci_conf_match_native( (struct pci_match_conf *)matches, num_matches, match_buf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (pci_conf_match32((struct pci_match_conf32 *)matches, + num_matches, match_buf)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (pci_conf_match_old( @@ -544,6 +648,10 @@ pci_match_conf_size(u_long cmd) switch (cmd) { case PCIOCGETCONF: return (sizeof(struct pci_match_conf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (sizeof(struct pci_match_conf32)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (sizeof(struct pci_match_conf_old)); @@ -565,6 +673,10 @@ pci_conf_size(u_long cmd) switch (cmd) { case PCIOCGETCONF: return (sizeof(struct pci_conf)); +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + return (sizeof(struct pci_conf32)); +#endif #ifdef PRE7_COMPAT case PCIOCGETCONF_OLD: return (sizeof(struct pci_conf_old)); @@ -582,7 +694,7 @@ pci_conf_size(u_long cmd) static void pci_conf_io_init(struct pci_conf_io *cio, caddr_t data, u_long cmd) { -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#if defined(COMPAT_FREEBSD32) struct pci_conf_io32 *cio32; #endif @@ -594,8 +706,11 @@ pci_conf_io_init(struct pci_conf_io *cio, caddr_t data, u_long cmd) *cio = *(struct pci_conf_io *)data; return; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: +#ifdef PRE7_COMPAT case PCIOCGETCONF_OLD32: +#endif cio32 = (struct pci_conf_io32 *)data; cio->pat_buf_len = cio32->pat_buf_len; cio->num_patterns = cio32->num_patterns; @@ -620,7 +735,7 @@ pci_conf_io_update_data(const struct pci_conf_io *cio, caddr_t data, u_long cmd) { struct pci_conf_io *d_cio; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#if defined(COMPAT_FREEBSD32) struct pci_conf_io32 *cio32; #endif @@ -636,8 +751,11 @@ pci_conf_io_update_data(const struct pci_conf_io *cio, caddr_t data, d_cio->num_matches = cio->num_matches; return; -#if defined(PRE7_COMPAT) && defined(COMPAT_FREEBSD32) +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: +#ifdef PRE7_COMPAT case PCIOCGETCONF_OLD32: +#endif cio32 = (struct pci_conf_io32 *)data; cio32->status = cio->status; @@ -665,6 +783,24 @@ pci_conf_for_copyout(const struct pci_conf *pcp, union pci_conf_union *pcup, pcup->pc = *pcp; return; +#ifdef COMPAT_FREEBSD32 + case PCIOCGETCONF32: + pcup->pc32.pc_sel = pcp->pc_sel; + pcup->pc32.pc_hdr = pcp->pc_hdr; + pcup->pc32.pc_subvendor = pcp->pc_subvendor; + pcup->pc32.pc_subdevice = pcp->pc_subdevice; + pcup->pc32.pc_vendor = pcp->pc_vendor; + pcup->pc32.pc_device = pcp->pc_device; + pcup->pc32.pc_class = pcp->pc_class; + pcup->pc32.pc_subclass = pcp->pc_subclass; + pcup->pc32.pc_progif = pcp->pc_progif; + pcup->pc32.pc_revid = pcp->pc_revid; + strlcpy(pcup->pc32.pd_name, pcp->pd_name, + sizeof(pcup->pc32.pd_name)); + pcup->pc32.pd_unit = (uint32_t)pcp->pd_unit; + return; +#endif + #ifdef PRE7_COMPAT #ifdef COMPAT_FREEBSD32 case PCIOCGETCONF_OLD32: -- cgit v1.3 From d0addc700eee2b9866d1cfaf17ac45abc68d9a54 Mon Sep 17 00:00:00 2001 From: Glen Barber Date: Fri, 28 Sep 2018 00:01:45 +0000 Subject: Update head from ALPHA7 to ALPHA8 as part of the 12.0-RELEASE cycle. Approved by: re (implicit) Sponsored by: The FreeBSD Foundation --- sys/conf/newvers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index e0cf2cfa9c13..5e749d52126d 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -46,7 +46,7 @@ TYPE="FreeBSD" REVISION="12.0" -BRANCH="ALPHA7" +BRANCH="ALPHA8" if [ -n "${BRANCH_OVERRIDE}" ]; then BRANCH=${BRANCH_OVERRIDE} fi -- cgit v1.3 From 9e024036f5ec62ff6b21b3a8b8b2a3a0dae22412 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Fri, 28 Sep 2018 11:57:40 +0000 Subject: Export ID_AA64ISAR{0,1}_EL1 to userland. As with r338962 also export the instruction set attribute register. This will allow userland to identify optional instructions the hardware supports, for example in a future ifunc handler to decide which implementation of a function to return. Approved by: re (kib) --- sys/arm64/arm64/identcpu.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'sys') diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c index 286b4358f4a5..b9e7826e3435 100644 --- a/sys/arm64/arm64/identcpu.c +++ b/sys/arm64/arm64/identcpu.c @@ -186,6 +186,32 @@ struct mrs_field { #define MRS_FIELD_END { .type = MRS_INVALID, } +static struct mrs_field id_aa64isar0_fields[] = { + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_DP_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SM4_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SM3_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA3_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_RDM_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_ATOMIC_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_CRC32_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA2_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_SHA1_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR0_AES_SHIFT), + MRS_FIELD_END, +}; + +static struct mrs_field id_aa64isar1_fields[] = { + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_GPI_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_GPA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_LRCPC_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_FCMA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_JSCVT_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_API_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_APA_SHIFT), + MRS_FIELD(false, MRS_LOWER, ID_AA64ISAR1_DPB_SHIFT), + MRS_FIELD_END, +}; + static struct mrs_field id_aa64pfr0_fields[] = { MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_SVE_SHIFT), MRS_FIELD(false, MRS_EXACT, ID_AA64PFR0_RAS_SHIFT), @@ -218,6 +244,18 @@ struct mrs_user_reg { }; static struct mrs_user_reg user_regs[] = { + { /* id_aa64isar0_el1 */ + .CRm = 6, + .Op2 = 0, + .offset = __offsetof(struct cpu_desc, id_aa64isar0), + .fields = id_aa64isar0_fields, + }, + { /* id_aa64isar1_el1 */ + .CRm = 6, + .Op2 = 1, + .offset = __offsetof(struct cpu_desc, id_aa64isar1), + .fields = id_aa64isar1_fields, + }, { /* id_aa64pfr0_el1 */ .CRm = 4, .Op2 = 0, -- cgit v1.3 From f76bec2a28be052f2c655ce983594b8fb05e2e04 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 28 Sep 2018 14:08:20 +0000 Subject: Revert part of the r338891 which reordered local invalidation and IPI. For PCID case, there is a dependency between pm_gen zeroing and reading pm_active for IPI target selection, to ensure that the invalidation is not missed. Reported and tested by: mjg Sponsored by: The FreeBSD Foundation Approved by: re (gjb) --- sys/amd64/amd64/pmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 61dfd2607b40..d5a79e453207 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1807,7 +1807,6 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); - smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); if (pmap == kernel_pmap) { invlpg(va); } else { @@ -1815,6 +1814,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) invlpg(va); pmap_invalidate_page_mode(pmap, va); } + smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } @@ -1910,7 +1910,6 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); - smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); @@ -1921,6 +1920,7 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) } pmap_invalidate_range_mode(pmap, sva, eva); } + smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } @@ -2021,8 +2021,8 @@ pmap_invalidate_all(pmap_t pmap) ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); - smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); pmap_invalidate_all_mode(pmap); + smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } -- cgit v1.3 From a60d3db15ef0b4ff188c1c8ff0461645b1be06a8 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 28 Sep 2018 14:10:12 +0000 Subject: In vm_fault_copy_entry(), collect the code to initialize a newly allocated dst_object in a single place. Suggested and reviewed by: alc Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17323 --- sys/vm/vm_fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index d5a6b57f47e3..e3fcce3c74f1 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1633,16 +1633,16 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif + dst_object->domain = src_object->domain; + dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { - dst_object->domain = src_object->domain; dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; - dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, -- cgit v1.3 From 9f25ab83f91986ff68681939620fe74c9dacc548 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 28 Sep 2018 14:11:01 +0000 Subject: In vm_fault_copy_entry(), we should not assert that entry is charged if the dst_object is not of swap type. It can only happen when entry does not require copy, otherwise vm_map_protect() already adds the charge. So the assert was right for the case where swap object was allocated in the vm_fault_copy_entry(), but not when it was just copied from src_entry and its type is not swap. Reported by: andrew using syzkaller Reviewed by: alc Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17323 --- sys/vm/vm_fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index e3fcce3c74f1..d6545ba5f479 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1650,7 +1650,9 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; - } else if (dst_object->cred == NULL) { + } else if ((dst_object->type == OBJT_DEFAULT || + dst_object->type == OBJT_SWAP) && + dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; -- cgit v1.3 From c62637d679d326764563f3fe3b8803fa35610c54 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 28 Sep 2018 14:11:38 +0000 Subject: Correct vm_fault_copy_entry() handling of backing file truncation after the file mapping was wired. if a wired map entry is backed by vnode and the file is truncated, corresponding pages are invalidated. vm_fault_copy_entry() should be aware of it and allow for invalid pages past end of file. Also, such pages should be not mapped into userspace. If userspace accesses the truncated part of the mapping later, it gets a signal, there is no way kernel can prevent the page fault. Reported by: andrew using syzkaller Reviewed by: alc Sponsored by: The FreeBSD Foundation Approved by: re (gjb) MFC after: 1 week Differential revision: https://reviews.freebsd.org/D17323 --- sys/vm/vm_fault.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys') diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index d6545ba5f479..c56e51f3dbfe 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1739,6 +1739,13 @@ again: dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; + if (dst_m->pindex >= dst_object->size) + /* + * We are upgrading. Index can occur + * out of bounds if the object type is + * vnode and the file was truncated. + */ + break; vm_page_xbusy(dst_m); KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, ("invalid dst page %p", dst_m)); -- cgit v1.3 From 46e2054905716cea1b790152a806c37ba654ddc5 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 28 Sep 2018 17:23:54 +0000 Subject: Mark various removed system calls as OBSOL instead of UNIMPL. This is mostly a cosmetic change except that obsolete system calls are assigned meaningful names in the names arrays which means that using tools like kdump or truss against binaries invoking these system calls will print out the name instead of the number. The script I use to generate the XML list of syscalls for GDB also ignores UNIMPL but not OBSOL entries. In general UNIMPL should only be used to reserve placeholders for system calls that have never been implemented while system calls that existed at one time in FreeBSD but were removed should be marked OBSOL instead. Reviewed by: brooks, kib, imp Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17344 --- sys/compat/freebsd32/syscalls.master | 40 ++++++++++++++++++------------------ sys/kern/syscalls.master | 40 ++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 40 deletions(-) (limited to 'sys') diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 99103a3553b5..9b5aafe41244 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -343,10 +343,10 @@ 181 AUE_SETGID NOPROTO { int setgid(gid_t gid); } 182 AUE_SETEGID NOPROTO { int setegid(gid_t egid); } 183 AUE_SETEUID NOPROTO { int seteuid(uid_t euid); } -184 AUE_NULL UNIMPL lfs_bmapv -185 AUE_NULL UNIMPL lfs_markv -186 AUE_NULL UNIMPL lfs_segclean -187 AUE_NULL UNIMPL lfs_segwait +184 AUE_NULL OBSOL lfs_bmapv +185 AUE_NULL OBSOL lfs_markv +186 AUE_NULL OBSOL lfs_segclean +187 AUE_NULL OBSOL lfs_segwait 188 AUE_STAT COMPAT11 { int freebsd32_stat(char *path, \ struct freebsd11_stat32 *ub); } 189 AUE_FSTAT COMPAT11 { int freebsd32_fstat(int fd, \ @@ -414,7 +414,7 @@ int semflg); } 222 AUE_SEMOP NOSTD|NOPROTO { int semop(int semid, \ struct sembuf *sops, u_int nsops); } -223 AUE_NULL UNIMPL semconfig +223 AUE_NULL OBSOL semconfig 224 AUE_MSGCTL COMPAT7|NOSTD { int freebsd32_msgctl( \ int msqid, int cmd, \ struct msqid_ds32_old *buf); } @@ -662,12 +662,12 @@ struct kevent32_freebsd11 *eventlist, \ int nevents, \ const struct timespec32 *timeout); } -364 AUE_NULL UNIMPL __cap_get_proc -365 AUE_NULL UNIMPL __cap_set_proc -366 AUE_NULL UNIMPL __cap_get_fd -367 AUE_NULL UNIMPL __cap_get_file -368 AUE_NULL UNIMPL __cap_set_fd -369 AUE_NULL UNIMPL __cap_set_file +364 AUE_NULL OBSOL __cap_get_proc +365 AUE_NULL OBSOL __cap_set_proc +366 AUE_NULL OBSOL __cap_get_fd +367 AUE_NULL OBSOL __cap_get_file +368 AUE_NULL OBSOL __cap_set_fd +369 AUE_NULL OBSOL __cap_set_file 370 AUE_NULL UNIMPL nosys 371 AUE_EXTATTR_SET_FD NOPROTO { ssize_t extattr_set_fd(int fd, \ int attrnamespace, const char *attrname, \ @@ -679,16 +679,16 @@ int attrnamespace, \ const char *attrname); } 374 AUE_SETUGID NOPROTO { int __setugid(int flag); } -375 AUE_NULL UNIMPL nfsclnt +375 AUE_NULL OBSOL nfsclnt 376 AUE_EACCESS NOPROTO { int eaccess(char *path, int amode); } 377 AUE_NULL UNIMPL afs_syscall 378 AUE_NMOUNT STD { int freebsd32_nmount(struct iovec32 *iovp, \ unsigned int iovcnt, int flags); } -379 AUE_NULL UNIMPL kse_exit -380 AUE_NULL UNIMPL kse_wakeup -381 AUE_NULL UNIMPL kse_create -382 AUE_NULL UNIMPL kse_thr_interrupt -383 AUE_NULL UNIMPL kse_release +379 AUE_NULL OBSOL kse_exit +380 AUE_NULL OBSOL kse_wakeup +381 AUE_NULL OBSOL kse_create +382 AUE_NULL OBSOL kse_thr_interrupt +383 AUE_NULL OBSOL kse_release 384 AUE_NULL UNIMPL __mac_get_proc 385 AUE_NULL UNIMPL __mac_set_proc 386 AUE_NULL UNIMPL __mac_get_fd @@ -787,7 +787,7 @@ 439 AUE_EXTATTR_LIST_LINK NOPROTO { ssize_t extattr_list_link( \ const char *path, int attrnamespace, \ void *data, size_t nbytes); } -440 AUE_NULL UNIMPL kse_switchin +440 AUE_NULL OBSOL kse_switchin 441 AUE_SEMWAIT NOSTD { int freebsd32_ksem_timedwait(semid_t id, \ const struct timespec32 *abstime); } 442 AUE_NULL STD { int freebsd32_thr_suspend( \ @@ -1074,8 +1074,8 @@ 547 AUE_FUTIMESAT STD { int freebsd32_utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } -548 AUE_NULL UNIMPL numa_getaffinity -549 AUE_NULL UNIMPL numa_setaffinity +548 AUE_NULL OBSOL numa_getaffinity +549 AUE_NULL OBSOL numa_setaffinity 550 AUE_FSYNC NOPROTO { int fdatasync(int fd); } 551 AUE_FSTAT STD { int freebsd32_fstat(int fd, \ struct stat32 *ub); } diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 9ea0f1b5353b..08b254655ee5 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -459,10 +459,10 @@ 181 AUE_SETGID STD { int setgid(gid_t gid); } 182 AUE_SETEGID STD { int setegid(gid_t egid); } 183 AUE_SETEUID STD { int seteuid(uid_t euid); } -184 AUE_NULL UNIMPL lfs_bmapv -185 AUE_NULL UNIMPL lfs_markv -186 AUE_NULL UNIMPL lfs_segclean -187 AUE_NULL UNIMPL lfs_segwait +184 AUE_NULL OBSOL lfs_bmapv +185 AUE_NULL OBSOL lfs_markv +186 AUE_NULL OBSOL lfs_segclean +187 AUE_NULL OBSOL lfs_segwait 188 AUE_STAT COMPAT11 { int stat(_In_z_ char *path, \ _Out_ struct freebsd11_stat *ub); } 189 AUE_FSTAT COMPAT11 { int fstat(int fd, \ @@ -536,7 +536,7 @@ 222 AUE_SEMOP NOSTD { int semop(int semid, \ _In_reads_(nsops) struct sembuf *sops, \ size_t nsops); } -223 AUE_NULL UNIMPL semconfig +223 AUE_NULL OBSOL semconfig 224 AUE_MSGCTL COMPAT7|NOSTD { int msgctl(int msqid, int cmd, \ struct msqid_ds_old *buf); } 225 AUE_MSGGET NOSTD { int msgget(key_t key, int msgflg); } @@ -821,12 +821,12 @@ struct kevent_freebsd11 *eventlist, \ int nevents, \ _In_opt_ const struct timespec *timeout); } -364 AUE_NULL UNIMPL __cap_get_proc -365 AUE_NULL UNIMPL __cap_set_proc -366 AUE_NULL UNIMPL __cap_get_fd -367 AUE_NULL UNIMPL __cap_get_file -368 AUE_NULL UNIMPL __cap_set_fd -369 AUE_NULL UNIMPL __cap_set_file +364 AUE_NULL OBSOL __cap_get_proc +365 AUE_NULL OBSOL __cap_set_proc +366 AUE_NULL OBSOL __cap_get_fd +367 AUE_NULL OBSOL __cap_get_file +368 AUE_NULL OBSOL __cap_set_fd +369 AUE_NULL OBSOL __cap_set_file 370 AUE_NULL UNIMPL nosys 371 AUE_EXTATTR_SET_FD STD { ssize_t extattr_set_fd(int fd, \ int attrnamespace, \ @@ -842,7 +842,7 @@ int attrnamespace, \ _In_z_ const char *attrname); } 374 AUE_SETUGID STD { int __setugid(int flag); } -375 AUE_NULL UNIMPL nfsclnt +375 AUE_NULL OBSOL nfsclnt 376 AUE_EACCESS STD { int eaccess(_In_z_ char *path, int amode); } 377 AUE_NULL NOSTD|NOTSTATIC { int afs3_syscall(long syscall, \ long parm1, long parm2, long parm3, \ @@ -850,11 +850,11 @@ 378 AUE_NMOUNT STD { int nmount( \ _In_reads_(iovcnt) struct iovec *iovp, \ unsigned int iovcnt, int flags); } -379 AUE_NULL UNIMPL kse_exit -380 AUE_NULL UNIMPL kse_wakeup -381 AUE_NULL UNIMPL kse_create -382 AUE_NULL UNIMPL kse_thr_interrupt -383 AUE_NULL UNIMPL kse_release +379 AUE_NULL OBSOL kse_exit +380 AUE_NULL OBSOL kse_wakeup +381 AUE_NULL OBSOL kse_create +382 AUE_NULL OBSOL kse_thr_interrupt +383 AUE_NULL OBSOL kse_release 384 AUE_NULL STD { int __mac_get_proc( \ _In_ struct mac *mac_p); } 385 AUE_NULL STD { int __mac_set_proc( \ @@ -994,7 +994,7 @@ int attrnamespace, \ _Out_writes_bytes_opt_(nbytes) \ void *data, size_t nbytes); } -440 AUE_NULL UNIMPL kse_switchin +440 AUE_NULL OBSOL kse_switchin 441 AUE_SEMWAIT NOSTD { int ksem_timedwait(semid_t id, \ _In_opt_ const struct timespec *abstime); } 442 AUE_NULL STD { int thr_suspend( \ @@ -1295,8 +1295,8 @@ _In_reads_(2) \ struct timespec *times, \ int flag); } -548 AUE_NULL UNIMPL numa_getaffinity -549 AUE_NULL UNIMPL numa_setaffinity +548 AUE_NULL OBSOL numa_getaffinity +549 AUE_NULL OBSOL numa_setaffinity 550 AUE_FSYNC STD { int fdatasync(int fd); } 551 AUE_FSTAT STD { int fstat(int fd, _Out_ struct stat *sb); } 552 AUE_FSTATAT STD { int fstatat(int fd, _In_z_ char *path, \ -- cgit v1.3 From ff13c0a24f400a5adfe90088ed38c3cc36dafdb3 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 28 Sep 2018 17:25:28 +0000 Subject: Regenerate after UNIMPL -> OBSOL changes in r339001. Approved by: re (gjb) --- sys/compat/freebsd32/freebsd32_syscall.h | 20 ++++++++++++++++ sys/compat/freebsd32/freebsd32_syscalls.c | 40 +++++++++++++++---------------- sys/compat/freebsd32/freebsd32_sysent.c | 40 +++++++++++++++---------------- sys/kern/init_sysent.c | 40 +++++++++++++++---------------- sys/kern/syscalls.c | 40 +++++++++++++++---------------- sys/sys/syscall.h | 20 ++++++++++++++++ 6 files changed, 120 insertions(+), 80 deletions(-) (limited to 'sys') diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index 6cc72e423997..66ab072edab8 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -171,6 +171,10 @@ #define FREEBSD32_SYS_setgid 181 #define FREEBSD32_SYS_setegid 182 #define FREEBSD32_SYS_seteuid 183 + /* 184 is obsolete lfs_bmapv */ + /* 185 is obsolete lfs_markv */ + /* 186 is obsolete lfs_segclean */ + /* 187 is obsolete lfs_segwait */ #define FREEBSD32_SYS_freebsd11_freebsd32_stat 188 #define FREEBSD32_SYS_freebsd11_freebsd32_fstat 189 #define FREEBSD32_SYS_freebsd11_freebsd32_lstat 190 @@ -194,6 +198,7 @@ #define FREEBSD32_SYS_freebsd7_freebsd32_semctl 220 #define FREEBSD32_SYS_semget 221 #define FREEBSD32_SYS_semop 222 + /* 223 is obsolete semconfig */ #define FREEBSD32_SYS_freebsd7_freebsd32_msgctl 224 #define FREEBSD32_SYS_msgget 225 #define FREEBSD32_SYS_freebsd32_msgsnd 226 @@ -300,12 +305,24 @@ #define FREEBSD32_SYS_getresgid 361 #define FREEBSD32_SYS_kqueue 362 #define FREEBSD32_SYS_freebsd11_freebsd32_kevent 363 + /* 364 is obsolete __cap_get_proc */ + /* 365 is obsolete __cap_set_proc */ + /* 366 is obsolete __cap_get_fd */ + /* 367 is obsolete __cap_get_file */ + /* 368 is obsolete __cap_set_fd */ + /* 369 is obsolete __cap_set_file */ #define FREEBSD32_SYS_extattr_set_fd 371 #define FREEBSD32_SYS_extattr_get_fd 372 #define FREEBSD32_SYS_extattr_delete_fd 373 #define FREEBSD32_SYS___setugid 374 + /* 375 is obsolete nfsclnt */ #define FREEBSD32_SYS_eaccess 376 #define FREEBSD32_SYS_freebsd32_nmount 378 + /* 379 is obsolete kse_exit */ + /* 380 is obsolete kse_wakeup */ + /* 381 is obsolete kse_create */ + /* 382 is obsolete kse_thr_interrupt */ + /* 383 is obsolete kse_release */ #define FREEBSD32_SYS_kenv 390 #define FREEBSD32_SYS_lchflags 391 #define FREEBSD32_SYS_uuidgen 392 @@ -343,6 +360,7 @@ #define FREEBSD32_SYS_extattr_list_fd 437 #define FREEBSD32_SYS_extattr_list_file 438 #define FREEBSD32_SYS_extattr_list_link 439 + /* 440 is obsolete kse_switchin */ #define FREEBSD32_SYS_freebsd32_ksem_timedwait 441 #define FREEBSD32_SYS_freebsd32_thr_suspend 442 #define FREEBSD32_SYS_thr_wake 443 @@ -455,6 +473,8 @@ #define FREEBSD32_SYS_freebsd32_ppoll 545 #define FREEBSD32_SYS_freebsd32_futimens 546 #define FREEBSD32_SYS_freebsd32_utimensat 547 + /* 548 is obsolete numa_getaffinity */ + /* 549 is obsolete numa_setaffinity */ #define FREEBSD32_SYS_fdatasync 550 #define FREEBSD32_SYS_freebsd32_fstat 551 #define FREEBSD32_SYS_freebsd32_fstatat 552 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index cc554b0efb64..c186771f0704 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -193,10 +193,10 @@ const char *freebsd32_syscallnames[] = { "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ - "#184", /* 184 = lfs_bmapv */ - "#185", /* 185 = lfs_markv */ - "#186", /* 186 = lfs_segclean */ - "#187", /* 187 = lfs_segwait */ + "obs_lfs_bmapv", /* 184 = obsolete lfs_bmapv */ + "obs_lfs_markv", /* 185 = obsolete lfs_markv */ + "obs_lfs_segclean", /* 186 = obsolete lfs_segclean */ + "obs_lfs_segwait", /* 187 = obsolete lfs_segwait */ "compat11.freebsd32_stat", /* 188 = freebsd11 freebsd32_stat */ "compat11.freebsd32_fstat", /* 189 = freebsd11 freebsd32_fstat */ "compat11.freebsd32_lstat", /* 190 = freebsd11 freebsd32_lstat */ @@ -232,7 +232,7 @@ const char *freebsd32_syscallnames[] = { "compat7.freebsd32_semctl", /* 220 = freebsd7 freebsd32_semctl */ "semget", /* 221 = semget */ "semop", /* 222 = semop */ - "#223", /* 223 = semconfig */ + "obs_semconfig", /* 223 = obsolete semconfig */ "compat7.freebsd32_msgctl", /* 224 = freebsd7 freebsd32_msgctl */ "msgget", /* 225 = msgget */ "freebsd32_msgsnd", /* 226 = freebsd32_msgsnd */ @@ -373,26 +373,26 @@ const char *freebsd32_syscallnames[] = { "getresgid", /* 361 = getresgid */ "kqueue", /* 362 = kqueue */ "compat11.freebsd32_kevent", /* 363 = freebsd11 freebsd32_kevent */ - "#364", /* 364 = __cap_get_proc */ - "#365", /* 365 = __cap_set_proc */ - "#366", /* 366 = __cap_get_fd */ - "#367", /* 367 = __cap_get_file */ - "#368", /* 368 = __cap_set_fd */ - "#369", /* 369 = __cap_set_file */ + "obs___cap_get_proc", /* 364 = obsolete __cap_get_proc */ + "obs___cap_set_proc", /* 365 = obsolete __cap_set_proc */ + "obs___cap_get_fd", /* 366 = obsolete __cap_get_fd */ + "obs___cap_get_file", /* 367 = obsolete __cap_get_file */ + "obs___cap_set_fd", /* 368 = obsolete __cap_set_fd */ + "obs___cap_set_file", /* 369 = obsolete __cap_set_file */ "#370", /* 370 = nosys */ "extattr_set_fd", /* 371 = extattr_set_fd */ "extattr_get_fd", /* 372 = extattr_get_fd */ "extattr_delete_fd", /* 373 = extattr_delete_fd */ "__setugid", /* 374 = __setugid */ - "#375", /* 375 = nfsclnt */ + "obs_nfsclnt", /* 375 = obsolete nfsclnt */ "eaccess", /* 376 = eaccess */ "#377", /* 377 = afs_syscall */ "freebsd32_nmount", /* 378 = freebsd32_nmount */ - "#379", /* 379 = kse_exit */ - "#380", /* 380 = kse_wakeup */ - "#381", /* 381 = kse_create */ - "#382", /* 382 = kse_thr_interrupt */ - "#383", /* 383 = kse_release */ + "obs_kse_exit", /* 379 = obsolete kse_exit */ + "obs_kse_wakeup", /* 380 = obsolete kse_wakeup */ + "obs_kse_create", /* 381 = obsolete kse_create */ + "obs_kse_thr_interrupt", /* 382 = obsolete kse_thr_interrupt */ + "obs_kse_release", /* 383 = obsolete kse_release */ "#384", /* 384 = __mac_get_proc */ "#385", /* 385 = __mac_set_proc */ "#386", /* 386 = __mac_get_fd */ @@ -449,7 +449,7 @@ const char *freebsd32_syscallnames[] = { "extattr_list_fd", /* 437 = extattr_list_fd */ "extattr_list_file", /* 438 = extattr_list_file */ "extattr_list_link", /* 439 = extattr_list_link */ - "#440", /* 440 = kse_switchin */ + "obs_kse_switchin", /* 440 = obsolete kse_switchin */ "freebsd32_ksem_timedwait", /* 441 = freebsd32_ksem_timedwait */ "freebsd32_thr_suspend", /* 442 = freebsd32_thr_suspend */ "thr_wake", /* 443 = thr_wake */ @@ -580,8 +580,8 @@ const char *freebsd32_syscallnames[] = { "freebsd32_ppoll", /* 545 = freebsd32_ppoll */ "freebsd32_futimens", /* 546 = freebsd32_futimens */ "freebsd32_utimensat", /* 547 = freebsd32_utimensat */ - "#548", /* 548 = numa_getaffinity */ - "#549", /* 549 = numa_setaffinity */ + "obs_numa_getaffinity", /* 548 = obsolete numa_getaffinity */ + "obs_numa_setaffinity", /* 549 = obsolete numa_setaffinity */ "fdatasync", /* 550 = fdatasync */ "freebsd32_fstat", /* 551 = freebsd32_fstat */ "freebsd32_fstatat", /* 552 = freebsd32_fstatat */ diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 27caed81eefa..1a6ff314dfa2 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -240,10 +240,10 @@ struct sysent freebsd32_sysent[] = { { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = obsolete lfs_bmapv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = obsolete lfs_markv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = obsolete lfs_segclean */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = obsolete lfs_segwait */ { compat11(AS(freebsd11_freebsd32_stat_args),freebsd32_stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = freebsd11 freebsd32_stat */ { compat11(AS(freebsd11_freebsd32_fstat_args),freebsd32_fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = freebsd11 freebsd32_fstat */ { compat11(AS(freebsd11_freebsd32_lstat_args),freebsd32_lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = freebsd11 freebsd32_lstat */ @@ -279,7 +279,7 @@ struct sysent freebsd32_sysent[] = { { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 freebsd32_semctl */ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = obsolete semconfig */ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 freebsd32_msgctl */ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */ { AS(freebsd32_msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = freebsd32_msgsnd */ @@ -420,26 +420,26 @@ struct sysent freebsd32_sysent[] = { { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */ { compat11(AS(freebsd11_freebsd32_kevent_args),freebsd32_kevent), AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = freebsd11 freebsd32_kevent */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = obsolete __cap_get_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = obsolete __cap_set_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = obsolete __cap_get_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = obsolete __cap_get_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = obsolete __cap_set_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = obsolete __cap_set_file */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_SETUGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = obsolete nfsclnt */ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs_syscall */ { AS(freebsd32_nmount_args), (sy_call_t *)freebsd32_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = freebsd32_nmount */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = obsolete kse_exit */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = obsolete kse_wakeup */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = obsolete kse_create */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = obsolete kse_thr_interrupt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = obsolete kse_release */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 384 = __mac_get_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 385 = __mac_set_proc */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 386 = __mac_get_fd */ @@ -496,7 +496,7 @@ struct sysent freebsd32_sysent[] = { { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = obsolete kse_switchin */ { AS(freebsd32_ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = freebsd32_ksem_timedwait */ { AS(freebsd32_thr_suspend_args), (sy_call_t *)freebsd32_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = freebsd32_thr_suspend */ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */ @@ -627,8 +627,8 @@ struct sysent freebsd32_sysent[] = { { AS(freebsd32_ppoll_args), (sy_call_t *)freebsd32_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = freebsd32_ppoll */ { AS(freebsd32_futimens_args), (sy_call_t *)freebsd32_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = freebsd32_futimens */ { AS(freebsd32_utimensat_args), (sy_call_t *)freebsd32_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = freebsd32_utimensat */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = obsolete numa_getaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = obsolete numa_setaffinity */ { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */ { AS(freebsd32_fstat_args), (sy_call_t *)freebsd32_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = freebsd32_fstat */ { AS(freebsd32_fstatat_args), (sy_call_t *)freebsd32_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = freebsd32_fstatat */ diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 5ea572ff0381..885b5dc6f030 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -233,10 +233,10 @@ struct sysent sysent[] = { { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = obsolete lfs_bmapv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = obsolete lfs_markv */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = obsolete lfs_segclean */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = obsolete lfs_segwait */ { compat11(AS(freebsd11_stat_args),stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = freebsd11 stat */ { compat11(AS(freebsd11_fstat_args),fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = freebsd11 fstat */ { compat11(AS(freebsd11_lstat_args),lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = freebsd11 lstat */ @@ -272,7 +272,7 @@ struct sysent sysent[] = { { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 __semctl */ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = obsolete semconfig */ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 msgctl */ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */ { AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = msgsnd */ @@ -413,26 +413,26 @@ struct sysent sysent[] = { { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */ { compat11(AS(freebsd11_kevent_args),kevent), AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = freebsd11 kevent */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = obsolete __cap_get_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = obsolete __cap_set_proc */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = obsolete __cap_get_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = obsolete __cap_get_file */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = obsolete __cap_set_fd */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = obsolete __cap_set_file */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_SETUGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = obsolete nfsclnt */ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */ { AS(afs3_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs3_syscall */ { AS(nmount_args), (sy_call_t *)sys_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = nmount */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = obsolete kse_exit */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = obsolete kse_wakeup */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = obsolete kse_create */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = obsolete kse_thr_interrupt */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = obsolete kse_release */ { AS(__mac_get_proc_args), (sy_call_t *)sys___mac_get_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 384 = __mac_get_proc */ { AS(__mac_set_proc_args), (sy_call_t *)sys___mac_set_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 385 = __mac_set_proc */ { AS(__mac_get_fd_args), (sy_call_t *)sys___mac_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 386 = __mac_get_fd */ @@ -489,7 +489,7 @@ struct sysent sysent[] = { { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = obsolete kse_switchin */ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = ksem_timedwait */ { AS(thr_suspend_args), (sy_call_t *)sys_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = thr_suspend */ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */ @@ -597,8 +597,8 @@ struct sysent sysent[] = { { AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */ { AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */ { AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = numa_getaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = numa_setaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 548 = obsolete numa_getaffinity */ + { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 549 = obsolete numa_setaffinity */ { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 550 = fdatasync */ { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 551 = fstat */ { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = fstatat */ diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index a107b5571b3c..8e74163fe6d8 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -190,10 +190,10 @@ const char *syscallnames[] = { "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ - "#184", /* 184 = lfs_bmapv */ - "#185", /* 185 = lfs_markv */ - "#186", /* 186 = lfs_segclean */ - "#187", /* 187 = lfs_segwait */ + "obs_lfs_bmapv", /* 184 = obsolete lfs_bmapv */ + "obs_lfs_markv", /* 185 = obsolete lfs_markv */ + "obs_lfs_segclean", /* 186 = obsolete lfs_segclean */ + "obs_lfs_segwait", /* 187 = obsolete lfs_segwait */ "compat11.stat", /* 188 = freebsd11 stat */ "compat11.fstat", /* 189 = freebsd11 fstat */ "compat11.lstat", /* 190 = freebsd11 lstat */ @@ -229,7 +229,7 @@ const char *syscallnames[] = { "compat7.__semctl", /* 220 = freebsd7 __semctl */ "semget", /* 221 = semget */ "semop", /* 222 = semop */ - "#223", /* 223 = semconfig */ + "obs_semconfig", /* 223 = obsolete semconfig */ "compat7.msgctl", /* 224 = freebsd7 msgctl */ "msgget", /* 225 = msgget */ "msgsnd", /* 226 = msgsnd */ @@ -370,26 +370,26 @@ const char *syscallnames[] = { "getresgid", /* 361 = getresgid */ "kqueue", /* 362 = kqueue */ "compat11.kevent", /* 363 = freebsd11 kevent */ - "#364", /* 364 = __cap_get_proc */ - "#365", /* 365 = __cap_set_proc */ - "#366", /* 366 = __cap_get_fd */ - "#367", /* 367 = __cap_get_file */ - "#368", /* 368 = __cap_set_fd */ - "#369", /* 369 = __cap_set_file */ + "obs___cap_get_proc", /* 364 = obsolete __cap_get_proc */ + "obs___cap_set_proc", /* 365 = obsolete __cap_set_proc */ + "obs___cap_get_fd", /* 366 = obsolete __cap_get_fd */ + "obs___cap_get_file", /* 367 = obsolete __cap_get_file */ + "obs___cap_set_fd", /* 368 = obsolete __cap_set_fd */ + "obs___cap_set_file", /* 369 = obsolete __cap_set_file */ "#370", /* 370 = nosys */ "extattr_set_fd", /* 371 = extattr_set_fd */ "extattr_get_fd", /* 372 = extattr_get_fd */ "extattr_delete_fd", /* 373 = extattr_delete_fd */ "__setugid", /* 374 = __setugid */ - "#375", /* 375 = nfsclnt */ + "obs_nfsclnt", /* 375 = obsolete nfsclnt */ "eaccess", /* 376 = eaccess */ "afs3_syscall", /* 377 = afs3_syscall */ "nmount", /* 378 = nmount */ - "#379", /* 379 = kse_exit */ - "#380", /* 380 = kse_wakeup */ - "#381", /* 381 = kse_create */ - "#382", /* 382 = kse_thr_interrupt */ - "#383", /* 383 = kse_release */ + "obs_kse_exit", /* 379 = obsolete kse_exit */ + "obs_kse_wakeup", /* 380 = obsolete kse_wakeup */ + "obs_kse_create", /* 381 = obsolete kse_create */ + "obs_kse_thr_interrupt", /* 382 = obsolete kse_thr_interrupt */ + "obs_kse_release", /* 383 = obsolete kse_release */ "__mac_get_proc", /* 384 = __mac_get_proc */ "__mac_set_proc", /* 385 = __mac_set_proc */ "__mac_get_fd", /* 386 = __mac_get_fd */ @@ -446,7 +446,7 @@ const char *syscallnames[] = { "extattr_list_fd", /* 437 = extattr_list_fd */ "extattr_list_file", /* 438 = extattr_list_file */ "extattr_list_link", /* 439 = extattr_list_link */ - "#440", /* 440 = kse_switchin */ + "obs_kse_switchin", /* 440 = obsolete kse_switchin */ "ksem_timedwait", /* 441 = ksem_timedwait */ "thr_suspend", /* 442 = thr_suspend */ "thr_wake", /* 443 = thr_wake */ @@ -554,8 +554,8 @@ const char *syscallnames[] = { "ppoll", /* 545 = ppoll */ "futimens", /* 546 = futimens */ "utimensat", /* 547 = utimensat */ - "#548", /* 548 = numa_getaffinity */ - "#549", /* 549 = numa_setaffinity */ + "obs_numa_getaffinity", /* 548 = obsolete numa_getaffinity */ + "obs_numa_setaffinity", /* 549 = obsolete numa_setaffinity */ "fdatasync", /* 550 = fdatasync */ "fstat", /* 551 = fstat */ "fstatat", /* 552 = fstatat */ diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index 6e880f04950c..436186e90d29 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -175,6 +175,10 @@ #define SYS_setgid 181 #define SYS_setegid 182 #define SYS_seteuid 183 + /* 184 is obsolete lfs_bmapv */ + /* 185 is obsolete lfs_markv */ + /* 186 is obsolete lfs_segclean */ + /* 187 is obsolete lfs_segwait */ #define SYS_freebsd11_stat 188 #define SYS_freebsd11_fstat 189 #define SYS_freebsd11_lstat 190 @@ -198,6 +202,7 @@ #define SYS_freebsd7___semctl 220 #define SYS_semget 221 #define SYS_semop 222 + /* 223 is obsolete semconfig */ #define SYS_freebsd7_msgctl 224 #define SYS_msgget 225 #define SYS_msgsnd 226 @@ -306,13 +311,25 @@ #define SYS_getresgid 361 #define SYS_kqueue 362 #define SYS_freebsd11_kevent 363 + /* 364 is obsolete __cap_get_proc */ + /* 365 is obsolete __cap_set_proc */ + /* 366 is obsolete __cap_get_fd */ + /* 367 is obsolete __cap_get_file */ + /* 368 is obsolete __cap_set_fd */ + /* 369 is obsolete __cap_set_file */ #define SYS_extattr_set_fd 371 #define SYS_extattr_get_fd 372 #define SYS_extattr_delete_fd 373 #define SYS___setugid 374 + /* 375 is obsolete nfsclnt */ #define SYS_eaccess 376 #define SYS_afs3_syscall 377 #define SYS_nmount 378 + /* 379 is obsolete kse_exit */ + /* 380 is obsolete kse_wakeup */ + /* 381 is obsolete kse_create */ + /* 382 is obsolete kse_thr_interrupt */ + /* 383 is obsolete kse_release */ #define SYS___mac_get_proc 384 #define SYS___mac_set_proc 385 #define SYS___mac_get_fd 386 @@ -363,6 +380,7 @@ #define SYS_extattr_list_fd 437 #define SYS_extattr_list_file 438 #define SYS_extattr_list_link 439 + /* 440 is obsolete kse_switchin */ #define SYS_ksem_timedwait 441 #define SYS_thr_suspend 442 #define SYS_thr_wake 443 @@ -465,6 +483,8 @@ #define SYS_ppoll 545 #define SYS_futimens 546 #define SYS_utimensat 547 + /* 548 is obsolete numa_getaffinity */ + /* 549 is obsolete numa_setaffinity */ #define SYS_fdatasync 550 #define SYS_fstat 551 #define SYS_fstatat 552 -- cgit v1.3 From 3552f16d8207f6aab37b5c1354276b18c96484d8 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Fri, 28 Sep 2018 19:47:32 +0000 Subject: Fix typo in comment. Reported by: @danfe Approved by: re (kib@) MFC after: 1 week X-MFC: r338941 --- sys/netinet/sctp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index a0b09ddd3336..2d084f018425 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -7210,7 +7210,7 @@ one_more_time: if ((sp->msg_is_complete) && (sp->length == 0)) { if (sp->sender_all_done) { /* - * We are doing defered cleanup. Last time through + * We are doing deferred cleanup. Last time through * when we took all the data the sender_all_done was * not set. */ -- cgit v1.3 From 2a99321a573c4ff944ad8fb0a6436e2b34750270 Mon Sep 17 00:00:00 2001 From: Oleksandr Tymoshenko Date: Sat, 29 Sep 2018 00:35:36 +0000 Subject: [sdhci] Add ACPI identifier for AMD eMMC 5.0 controller Submitted by: Rajesh Kumar Approved by: re (rgrimes) Differential Revision: https://reviews.freebsd.org/D17189 --- sys/dev/sdhci/sdhci_acpi.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys') diff --git a/sys/dev/sdhci/sdhci_acpi.c b/sys/dev/sdhci/sdhci_acpi.c index 844be21d64bc..c202ba054cfc 100644 --- a/sys/dev/sdhci/sdhci_acpi.c +++ b/sys/dev/sdhci/sdhci_acpi.c @@ -79,6 +79,8 @@ static const struct sdhci_acpi_device { SDHCI_QUIRK_MMC_DDR52 | SDHCI_QUIRK_CAPS_BIT63_FOR_MMC_HS400 | SDHCI_QUIRK_PRESET_VALUE_BROKEN }, + { "AMDI0040", 0, "AMD eMMC 5.0 Controller", + SDHCI_QUIRK_32BIT_DMA_SIZE }, { NULL, 0, NULL, 0} }; @@ -87,6 +89,7 @@ static char *sdhci_ids[] = { "80860F16", "80865ACA", "80865ACC", + "AMDI0040", NULL }; -- cgit v1.3 From 9d967dd27d212ee960d806801d641ed9eb564a34 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Sat, 29 Sep 2018 01:26:07 +0000 Subject: Avoid panic when adjusting priority of a read in the face of an IO error PR: 231516 Reported by: sbruno Approved by: re (rgrimes) Obtained from: ZFS-on-Linux X-MFC-with: 334844 Sponsored by: Klara Systems MFV/ZoL: Fix zio->io_priority failed (7 < 6) assert commit c26cf0966d131b722c32f8ccecfe5791a789d975 Author: Tony Hutter Date: Tue May 29 18:13:48 2018 -0700 Fix zio->io_priority failed (7 < 6) assert This fixes an assert in vdev_queue_change_io_priority(): VERIFY3(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE) failed (7 < 6) PANIC at vdev_queue.c:832:vdev_queue_change_io_priority() Reviewed-by: Tom Caputi Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 3fad5a76957b..cf75ab0856c0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -958,6 +958,15 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) vdev_queue_t *vq = &zio->io_vd->vdev_queue; avl_tree_t *tree; + /* + * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio + * code to issue IOs without adding them to the vdev queue. In this + * case, the zio is already going to be issued as quickly as possible + * and so it doesn't need any reprioitization to help. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW) + return; + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -- cgit v1.3 From 1687b1ab24cff6f73fb79b815156691459d08400 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sat, 29 Sep 2018 13:01:23 +0000 Subject: For changing the MTU on tun/tap devices, it should not matter whether it is done via using ifconfig, which uses a SIOCSIFMTU ioctl() command, or doing it using a TUNSIFINFO/TAPSIFINFO ioctl() command. Without this patch, for IPv6 the new MTU is not used when creating routes. Especially, when initiating TCP connections after increasing the MTU, the old MTU is still used to compute the MSS. Thanks to ae@ and bz@ for helping to improve the patch. Reviewed by: ae@, bz@ Approved by: re (kib@) MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D17180 --- sys/net/if.c | 3 +-- sys/net/if_tap.c | 15 ++++++++++++++- sys/net/if_tun.c | 23 ++++++++++++++--------- sys/net/if_var.h | 2 ++ 4 files changed, 31 insertions(+), 12 deletions(-) (limited to 'sys') diff --git a/sys/net/if.c b/sys/net/if.c index 3147eafb7c78..2acea128cdfd 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -264,7 +264,6 @@ static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); -static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); @@ -2512,7 +2511,7 @@ ifr_data_get_ptr(void *ifrp) /* * Hardware specific interface ioctls. */ -static int +int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 2bce8c1e7aad..65feee545b7d 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -723,10 +723,12 @@ tapifstart(struct ifnet *ifp) static int tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { + struct ifreq ifr; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct tapinfo *tapp = NULL; int f; + int error; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; @@ -738,7 +740,18 @@ tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td if (ifp->if_type != tapp->type) return (EPROTOTYPE); mtx_lock(&tp->tap_mtx); - ifp->if_mtu = tapp->mtu; + if (ifp->if_mtu != tapp->mtu) { + strncpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); + ifr.ifr_mtu = tapp->mtu; + CURVNET_SET(ifp->if_vnet); + error = ifhwioctl(SIOCSIFMTU, ifp, + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + mtx_unlock(&tp->tap_mtx); + return (error); + } + } ifp->if_baudrate = tapp->baudrate; mtx_unlock(&tp->tap_mtx); break; diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index cf404012a2e8..fc49ef3bbbd1 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -662,24 +662,29 @@ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { - int error; + struct ifreq ifr; struct tun_softc *tp = dev->si_drv1; struct tuninfo *tunp; + int error; switch (cmd) { case TUNSIFINFO: tunp = (struct tuninfo *)data; - if (tunp->mtu < IF_MINMTU) - return (EINVAL); - if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - error = priv_check(td, PRIV_NET_SETIFMTU); - if (error) - return (error); - } if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); mtx_lock(&tp->tun_mtx); - TUN2IFP(tp)->if_mtu = tunp->mtu; + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + strncpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + ifr.ifr_mtu = tunp->mtu; + CURVNET_SET(TUN2IFP(tp)->if_vnet); + error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + mtx_unlock(&tp->tun_mtx); + return (error); + } + } TUN2IFP(tp)->if_baudrate = tunp->baudrate; mtx_unlock(&tp->tun_mtx); break; diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 68341ef75025..c9452406a151 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -760,6 +760,8 @@ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); +int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); + #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; -- cgit v1.3 From 5f11ee2075a7868e187ac353f80ccda5a4d637f0 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 29 Sep 2018 16:17:35 +0000 Subject: Fix UP build. Reported by: tijl Sponsored by: The FreeBSD Foundation Approved by: re (rgrimes) --- sys/i386/i386/pmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys') diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index a969eef28810..0c303e25a599 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -1456,6 +1456,7 @@ pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) return; } +#ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC @@ -1464,6 +1465,7 @@ pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) */ if (pmap_kextract(sva) == lapic_paddr) return; +#endif if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* -- cgit v1.3 From 916023bda93374d9d76a10e15eaaade8b77ab5bc Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 29 Sep 2018 21:14:54 +0000 Subject: Provide MODULE_PNP_INFO() for iwm(4) so that devmatch(8) can do its job. PR: 231625 Submitted by: Yuri Pankov (yuripv yuripv.net) Approved by: re (kib) --- sys/dev/iwm/if_iwm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys') diff --git a/sys/dev/iwm/if_iwm.c b/sys/dev/iwm/if_iwm.c index 9daa85257ac9..02f16be65aa1 100644 --- a/sys/dev/iwm/if_iwm.c +++ b/sys/dev/iwm/if_iwm.c @@ -6460,6 +6460,8 @@ static driver_t iwm_pci_driver = { static devclass_t iwm_devclass; DRIVER_MODULE(iwm, pci, iwm_pci_driver, iwm_devclass, NULL, NULL); +MODULE_PNP_INFO("U16:device;P:#;T:vendor=0x8086", iwm_pci_driver, iwm, + iwm_devices, nitems(iwm_devices)); MODULE_DEPEND(iwm, firmware, 1, 1, 1); MODULE_DEPEND(iwm, pci, 1, 1, 1); MODULE_DEPEND(iwm, wlan, 1, 1, 1); -- cgit v1.3 From ae0a9a88504a1a2adb6ff3ab3a5d83084540087b Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 30 Sep 2018 12:16:06 +0000 Subject: Increment the corresponding UDP stats counter (udps_opackets) when sending UDP encapsulated SCTP packets. This is consistent with the behaviour that when such packets are received, the corresponding UDP stats counter (udps_ipackets) is incremented. Thanks to Peter Lei for making me aware of this inconsistency. Approved by: re (kib@) MFC after: 1 week --- sys/netinet/sctp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys') diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index 2d084f018425..d0bde9f04a4d 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -4263,6 +4263,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, atomic_subtract_int(&stcb->asoc.refcnt, 1); } #endif + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) @@ -4603,6 +4606,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, sin6->sin6_port = prev_port; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) { @@ -11290,6 +11296,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, return; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); + if (port) { + UDPSTAT_INC(udps_opackets); + } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); -- cgit v1.3 From 81846484257fbf53b2bdeadc71d57601a96098b1 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 30 Sep 2018 16:21:31 +0000 Subject: Fix the handling of ancillary data for SCTP socket. Implement sctp_process_cmsgs_for_init() and sctp_findassociation_cmsgs() similar to sctp_find_cmsg() to improve consistency and avoid the signed/unsigned issues in sctp_process_cmsgs_for_init() and sctp_findassociation_cmsgs(). Thanks to andrew@ for reporting the problem he found using syzcaller. Approved by: re (kib@) MFC after: 1 week --- sys/netinet/sctp_output.c | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'sys') diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index d0bde9f04a4d..9e09954209ab 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -3572,7 +3572,6 @@ static int sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) { struct cmsghdr cmh; - int tlen, at; struct sctp_initmsg initmsg; #ifdef INET struct sockaddr_in sin; @@ -3580,34 +3579,37 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er #ifdef INET6 struct sockaddr_in6 sin6; #endif + int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; - tlen = SCTP_BUF_LEN(control); - at = 0; - while (at < tlen) { - if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + tot_len = SCTP_BUF_LEN(control); + for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { + rem_len = tot_len - off; + if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (1); } - m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (1); } - if (((int)cmh.cmsg_len + at) > tlen) { + if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (1); } + cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); + cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { case SCTP_INIT: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_initmsg)) { + if (cmsg_data_len < (int)sizeof(struct sctp_initmsg)) { *error = EINVAL; return (1); } - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_initmsg), (caddr_t)&initmsg); + m_copydata(control, cmsg_data_off, sizeof(struct sctp_initmsg), (caddr_t)&initmsg); if (initmsg.sinit_max_attempts) stcb->asoc.max_init_times = initmsg.sinit_max_attempts; if (initmsg.sinit_num_ostreams) @@ -3662,7 +3664,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er break; #ifdef INET case SCTP_DSTADDRV4: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { + if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (1); } @@ -3670,7 +3672,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = stcb->rport; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { @@ -3686,7 +3688,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er #endif #ifdef INET6 case SCTP_DSTADDRV6: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { + if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (1); } @@ -3694,7 +3696,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { *error = EINVAL; @@ -3727,7 +3729,6 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er break; } } - at += CMSG_ALIGN(cmh.cmsg_len); } return (0); } @@ -3740,7 +3741,6 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, int *error) { struct cmsghdr cmh; - int tlen, at; struct sctp_tcb *stcb; struct sockaddr *addr; #ifdef INET @@ -3749,31 +3749,34 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, #ifdef INET6 struct sockaddr_in6 sin6; #endif + int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; - tlen = SCTP_BUF_LEN(control); - at = 0; - while (at < tlen) { - if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + tot_len = SCTP_BUF_LEN(control); + for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { + rem_len = tot_len - off; + if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (NULL); } - m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (NULL); } - if (((int)cmh.cmsg_len + at) > tlen) { + if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (NULL); } + cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); + cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { #ifdef INET case SCTP_DSTADDRV4: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { + if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (NULL); } @@ -3781,13 +3784,13 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = port; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); addr = (struct sockaddr *)&sin; break; #endif #ifdef INET6 case SCTP_DSTADDRV6: - if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { + if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (NULL); } @@ -3795,7 +3798,7 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = port; - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); @@ -3816,7 +3819,6 @@ sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, } } } - at += CMSG_ALIGN(cmh.cmsg_len); } return (NULL); } -- cgit v1.3 From 632227a7393cee462beb380b52e4e0704a4f3fc7 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 30 Sep 2018 16:57:30 +0000 Subject: Update x86/ifunc.h. Remove ifunc emulation. Add helper for usermode ifunc resolver definition. Update copyright years. Sponsored by: The FreeBSD Foundation Approved by: re (rgrimes) MFC after: 1 week --- sys/x86/include/ifunc.h | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'sys') diff --git a/sys/x86/include/ifunc.h b/sys/x86/include/ifunc.h index 13f4b886430e..38c82570770e 100644 --- a/sys/x86/include/ifunc.h +++ b/sys/x86/include/ifunc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2015, 2017 The FreeBSD Foundation + * Copyright (c) 2015-2018 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov @@ -32,27 +32,19 @@ #ifndef __X86_IFUNC_H #define __X86_IFUNC_H -#define DECLARE_LIFUNC(ret_type, name, args) \ -ret_type name args - -#define DEFINE_LIFUNC(scope, selector_qual, ret_type, name, args) \ -__asm__ (scope "\t" #name "\n" \ - "\t.type\t" #name ",@function\n" \ - #name ":\n" \ - "\tjmp *" #name "_selector\n" \ - "\t.size\t" #name ",\t. - "#name); \ -selector_qual ret_type (*name##_selector)args __used; \ -DECLARE_LIFUNC(ret_type, name, args) - -#define DEFINE_STATIC_LIFUNC(ret_type, name, args) \ - DEFINE_LIFUNC(".local", static, ret_type, name, args) - -#define DEFINE_GLOBAL_LIFUNC(ret_type, name, args) \ - DEFINE_LIFUNC(".globl", , ret_type, name, args) - -#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ +#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ resolver_qual ret_type (*name##_resolver(void))args __used; \ qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ resolver_qual ret_type (*name##_resolver(void))args +#define DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(uint32_t, uint32_t, \ + uint32_t, uint32_t))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver( \ + uint32_t cpu_feature __unused, \ + uint32_t cpu_feature2 __unused, \ + uint32_t cpu_stdext_feature __unused, \ + uint32_t cpu_stdext_feature2 __unused))args + #endif -- cgit v1.3 From 72851e8598bc1283efd02ecd2e0c2d49b4896f2e Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Sun, 30 Sep 2018 21:23:31 +0000 Subject: Use PNP metadata to allow devmatch to autoload ure(4) Reviewed by: manu imp Approved by: re (kib) X-MFC-with: devmatch Sponsored by: Klara Systems --- sys/dev/usb/net/if_ure.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys') diff --git a/sys/dev/usb/net/if_ure.c b/sys/dev/usb/net/if_ure.c index 2bf0609697ed..24ce36b64a62 100644 --- a/sys/dev/usb/net/if_ure.c +++ b/sys/dev/usb/net/if_ure.c @@ -169,6 +169,7 @@ MODULE_DEPEND(ure, usb, 1, 1, 1); MODULE_DEPEND(ure, ether, 1, 1, 1); MODULE_DEPEND(ure, miibus, 1, 1, 1); MODULE_VERSION(ure, 1); +USB_PNP_HOST_INFO(ure_devs); static const struct usb_ether_methods ure_ue_methods = { .ue_attach_post = ure_attach_post, -- cgit v1.3 From 66bcf0b333e419ce3e5467a17ba80f104d5e8f96 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 30 Sep 2018 21:31:33 +0000 Subject: Plug mbuf leaks in the SCTP output path in error cases. Approved by: re (kib@) MFC after: 1 week CID: 1395307 --- sys/netinet/sctp_output.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys') diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index 9e09954209ab..79701a444a80 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -4367,6 +4367,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } if (net == NULL) { @@ -4431,6 +4432,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } /* Cache the source address */ @@ -4457,6 +4459,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + sctp_m_freem(m); return (EINVAL); } if (over_addr == NULL) { -- cgit v1.3 From 1b084a5e5e2b02277b29bff5c333281008196b35 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 30 Sep 2018 21:54:02 +0000 Subject: Plug mbuf leak in the SCTP input path in an error case. Approved by: re (kib@) MFC after: 1 week CID: 749312 --- sys/netinet/sctp_asconf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys') diff --git a/sys/netinet/sctp_asconf.c b/sys/netinet/sctp_asconf.c index 26b1ba6f508b..611280f1ae42 100644 --- a/sys/netinet/sctp_asconf.c +++ b/sys/netinet/sctp_asconf.c @@ -670,6 +670,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get lookup addr!\n"); /* respond with a missing/invalid mandatory parameter error */ + sctp_m_freem(m_ack); return; } /* param_length is already validated in process_control... */ -- cgit v1.3 From ef08929f90885219ce17c2af89b78996f40e37f2 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Mon, 1 Oct 2018 10:44:33 +0000 Subject: Fix the MODULE_PNP_INFO() for iwm(4) where I got the bus and module arguments wrong in r339020. PR: 231625 Reported by: Yuri Pankov (yuripv yuripv.net) Reviewed by: cem, Yuri Pankov (yuripv yuripv.net) Approved by: re (kib) Pointyhat to: bz (a rather big one for this one) --- sys/dev/iwm/if_iwm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/dev/iwm/if_iwm.c b/sys/dev/iwm/if_iwm.c index 02f16be65aa1..4e08dacf8c4f 100644 --- a/sys/dev/iwm/if_iwm.c +++ b/sys/dev/iwm/if_iwm.c @@ -6460,7 +6460,7 @@ static driver_t iwm_pci_driver = { static devclass_t iwm_devclass; DRIVER_MODULE(iwm, pci, iwm_pci_driver, iwm_devclass, NULL, NULL); -MODULE_PNP_INFO("U16:device;P:#;T:vendor=0x8086", iwm_pci_driver, iwm, +MODULE_PNP_INFO("U16:device;P:#;T:vendor=0x8086", pci, iwm_pci_driver, iwm_devices, nitems(iwm_devices)); MODULE_DEPEND(iwm, firmware, 1, 1, 1); MODULE_DEPEND(iwm, pci, 1, 1, 1); -- cgit v1.3 From 384a5c3c282e27a5eb18d89dbe6dbb266899e72c Mon Sep 17 00:00:00 2001 From: "Andrey V. Elsukov" Date: Mon, 1 Oct 2018 10:46:00 +0000 Subject: Add INP_INFO_WUNLOCK_ASSERT() macro and use it instead of INP_INFO_UNLOCK_ASSERT() in TCP-related code. For encapsulated traffic it is possible, that the code is running in net_epoch_preempt section, and INP_INFO_UNLOCK_ASSERT() is very strict assertion for such case. PR: 231428 Reviewed by: mmacy, tuexen Approved by: re (kib) Differential Revision: https://reviews.freebsd.org/D17335 --- sys/netinet/in_pcb.h | 2 ++ sys/netinet/siftr.c | 2 +- sys/netinet/tcp_hpts.c | 4 ++-- sys/netinet/tcp_input.c | 10 +++++----- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'sys') diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 86c9705cb905..6d2c86d5014e 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -642,6 +642,8 @@ int inp_so_options(const struct inpcb *inp); #define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) +#define INP_INFO_WUNLOCK_ASSERT(ipi) \ + mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) #define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 83af89570d5e..bc59e312bb00 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -710,7 +710,7 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, struct inpcb *inp; /* We need the tcbinfo lock. */ - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 32047180f883..f3737888f3d0 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -1282,7 +1282,7 @@ out: * lock again but we also need some kasserts * here. */ - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); m = n; if (m) @@ -1324,7 +1324,7 @@ out: INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index b61161c3eed8..bbb031439fc8 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -800,7 +800,7 @@ findpcb: if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif #ifdef INET6 @@ -1358,7 +1358,7 @@ tfo_socket_result: INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* @@ -1405,7 +1405,7 @@ dropwithreset: else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif @@ -1429,7 +1429,7 @@ dropunlock: else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); } #endif @@ -1437,7 +1437,7 @@ dropunlock: INP_WUNLOCK(inp); drop: - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) -- cgit v1.3 From 9d2e3f14c41515eb6889ed4fc6fe8455865c3632 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 1 Oct 2018 13:09:18 +0000 Subject: After allocating chunks set the fields in a consistent way. This removes two assignments for the flags field being done twice and adds one, which was missing. Thanks to Felix Weinrank for reporting the issue he found by using fuzz testing of the userland stack. Approved by: re (kib@) MFC after: 1 week --- sys/netinet/sctp_output.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'sys') diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index 79701a444a80..34e91d8b0af6 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -8975,14 +8975,15 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) return; } chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; + chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->send_size = (uint16_t)chunk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = op_err; chk->whoTo = NULL; - chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; - chk->rec.chunk_id.can_take_data = 0; hdr = mtod(op_err, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_OPERATION_ERROR; hdr->chunk_flags = 0; @@ -9204,7 +9205,6 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown_ack; chk->whoTo = net; @@ -9259,7 +9259,6 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown; chk->whoTo = net; @@ -12168,7 +12167,6 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); -- cgit v1.3 From 15a087e55119214fbe129fbbef9a4e49bab6bc90 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 1 Oct 2018 14:05:31 +0000 Subject: Mitigate providing a timing signal if the COOKIE or AUTH validation fails. Thanks to jmg@ for reporting the issue, which was discussed in https://admbugs.freebsd.org/show_bug.cgi?id=878 Approved by: re (TBD@) MFC after: 1 week --- sys/netinet/sctp_auth.c | 2 +- sys/netinet/sctp_input.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'sys') diff --git a/sys/netinet/sctp_auth.c b/sys/netinet/sctp_auth.c index 5a5b7880fd30..d379dd0a143e 100644 --- a/sys/netinet/sctp_auth.c +++ b/sys/netinet/sctp_auth.c @@ -1706,7 +1706,7 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, m, offset, computed_digest); /* compare the computed digest with the one in the AUTH chunk */ - if (memcmp(digest, computed_digest, digestlen) != 0) { + if (timingsafe_bcmp(digest, computed_digest, digestlen) != 0) { SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: HMAC digest check failed\n"); diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index 3a7d93833a0a..86656a7f7eb2 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -2554,7 +2554,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, return (NULL); } /* compare the received digest with the computed digest */ - if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { + if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { @@ -2563,7 +2563,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); /* compare */ - if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) + if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) cookie_ok = 1; } } else { -- cgit v1.3 From 30c5525b3cd28170e736202510ab64d51aa703bf Mon Sep 17 00:00:00 2001 From: Andrew Gallatin Date: Mon, 1 Oct 2018 14:14:21 +0000 Subject: Allow empty NUMA memory domains to support Threadripper2 The AMD Threadripper 2990WX is basically a slightly crippled Epyc. Rather than having 4 memory controllers, one per NUMA domain, it has only 2 memory controllers enabled. This means that only 2 of the 4 NUMA domains can be populated with physical memory, and the others are empty. Add support to FreeBSD for empty NUMA domains by: - creating empty memory domains when parsing the SRAT table, rather than failing to parse the table - not running the pageout deamon threads in empty domains - adding defensive code to UMA to avoid allocating from empty domains - adding defensive code to cpuset to avoid binding to an empty domain Thanks to Jeff for suggesting this strategy. Reviewed by: alc, markj Approved by: re (gjb@) Differential Revision: https://reviews.freebsd.org/D1683 --- sys/kern/kern_cpuset.c | 33 +++++++++++++++++++++++++++++++++ sys/vm/uma_core.c | 42 ++++++++++++++++++++++++++++++++++-------- sys/vm/vm_kern.c | 2 ++ sys/vm/vm_pageout.c | 7 +++++++ sys/vm/vm_pagequeue.h | 3 ++- sys/x86/acpica/srat.c | 16 ++++++++++++++-- 6 files changed, 92 insertions(+), 11 deletions(-) (limited to 'sys') diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index b1612b11f770..3f4a81ff70d7 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include +#include +#include +#include #ifdef DDB #include @@ -478,6 +483,26 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist) } +/* + * Are any of the domains in the mask empty? If so, silently + * remove them. If only empty domains are present, we must + * return failure. + */ +static bool +domainset_empty_vm(struct domainset *domain) +{ + int i, max; + + max = DOMAINSET_FLS(&domain->ds_mask) + 1; + for (i = 0; i < max; i++) { + if (DOMAINSET_ISSET(i, &domain->ds_mask) && + VM_DOMAIN_EMPTY(i)) + DOMAINSET_CLR(i, &domain->ds_mask); + } + + return (DOMAINSET_EMPTY(&domain->ds_mask)); +} + /* * Create or lookup a domainset based on the key held in 'domain'. */ @@ -1360,6 +1385,7 @@ domainset_zero(void) DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; + (void)domainset_empty_vm(dset); curthread->td_domain.dr_policy = _domainset_create(dset, NULL); domainset_copy(dset, &domainset2); @@ -2087,6 +2113,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, DOMAINSET_FILL(&domain.ds_mask); } + /* + * When given an impossible policy, fall back to interleaving + * across all domains + */ + if (domainset_empty_vm(&domain)) + domainset_copy(&domainset2, &domain); + switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 937e78452021..837eb0787915 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -2469,9 +2470,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); - if (zone->uz_flags & UMA_ZONE_NUMA) + if (zone->uz_flags & UMA_ZONE_NUMA) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = UMA_ANYDOMAIN; /* Short-circuit for zones without buckets and low memory. */ @@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags) rdomain = 0; rr = rdomain == UMA_ANYDOMAIN; if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + start = keg->uk_cursor; + do { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); domain = start = keg->uk_cursor; /* Only block on the second pass. */ if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) @@ -2698,8 +2705,11 @@ again: LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } - if (rr) - domain = (domain + 1) % vm_ndomains; + if (rr) { + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); + } } while (domain != start); /* Retry domain scan with blocking. */ @@ -2903,6 +2913,8 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) uma_bucket_t bucket; int max; + CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); + /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) @@ -2970,6 +2982,11 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) item = NULL; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); @@ -3139,9 +3156,11 @@ zfree_start: /* We are no longer associated with this CPU. */ critical_exit(); - if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = 0; zdom = &zone->uz_domain[0]; @@ -3588,7 +3607,9 @@ uma_prealloc(uma_zone_t zone, int items) dom = &keg->uk_domain[slab->us_domain]; LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; - domain = (domain + 1) % vm_ndomains; + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain)); } KEG_UNLOCK(keg); } @@ -3678,6 +3699,11 @@ uma_large_malloc_domain(vm_size_t size, int domain, int wait) vm_offset_t addr; uma_slab_t slab; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 9774e0b4f22d..88fbc74848df 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -502,6 +502,8 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) */ if (vm_ndomains > 1) { domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; + while (VM_DOMAIN_EMPTY(domain)) + domain++; next = roundup2(addr + 1, KVA_QUANTUM); if (next > end || next < start) next = end; diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index c084de7aca44..5c309a04a402 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -2082,6 +2082,13 @@ vm_pageout(void) if (error != 0) panic("starting laundry for domain 0, error %d", error); for (i = 1; i < vm_ndomains; i++) { + if (VM_DOMAIN_EMPTY(i)) { + if (bootverbose) + printf("domain %d empty; skipping pageout\n", + i); + continue; + } + error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index ac04f2b3ea61..758e51d8ef6e 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -151,7 +151,8 @@ struct vm_domain { extern struct vm_domain vm_dom[MAXMEMDOM]; -#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN_EMPTY(n) (vm_dom[(n)].vmd_page_count == 0) #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 08a98429a03d..42b506c4447b 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -311,8 +311,20 @@ check_domains(void) } for (i = 0; i <= max_apic_id; i++) if (cpus[i].enabled && !cpus[i].has_memory) { - printf("SRAT: No memory found for CPU %d\n", i); - return (ENXIO); + found = 0; + for (j = 0; j < num_mem && !found; j++) { + if (mem_info[j].domain == cpus[i].domain) + found = 1; + } + if (!found) { + if (bootverbose) + printf("SRAT: mem dom %d is empty\n", + cpus[i].domain); + mem_info[num_mem].start = 0; + mem_info[num_mem].end = 0; + mem_info[num_mem].domain = cpus[i].domain; + num_mem++; + } } return (0); } -- cgit v1.3 From 47d41ab50e7b8397adf26efbac7b6d1ed5f09463 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Mon, 1 Oct 2018 14:27:53 +0000 Subject: arm64: Raise again L3 table for early devmap The initial raise in r336519 wasn't enough for using big resolution (1920 x 1200 for example). Raise it again. Reported by: bob prohaska Tested by: bob prohaska Approved by: re (gjb@) --- sys/arm64/include/pte.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/arm64/include/pte.h b/sys/arm64/include/pte.h index acd3f81ab41c..7aa216e92b43 100644 --- a/sys/arm64/include/pte.h +++ b/sys/arm64/include/pte.h @@ -109,7 +109,7 @@ typedef uint64_t pt_entry_t; /* page table entry */ /* 0x2 also marks an invalid address */ #define L3_PAGE 0x3 -#define PMAP_MAPDEV_EARLY_SIZE (L2_SIZE * 4) +#define PMAP_MAPDEV_EARLY_SIZE (L2_SIZE * 8) #define L0_ENTRIES_SHIFT 9 #define L0_ENTRIES (1 << L0_ENTRIES_SHIFT) -- cgit v1.3 From c6c770d0419a3e07a5c0514bdfa0c61b24eed161 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 1 Oct 2018 14:47:49 +0000 Subject: Count bootstrap data as resident in the kernel pmap. Such data may later be unmapped. This occurs, for example, when a loader-provided microcode update file is discarded. Reviewed by: alc, kib Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17340 --- sys/amd64/amd64/pmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d5a79e453207..7d0b55e39d9a 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1098,9 +1098,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr) vm_offset_t va; pt_entry_t *pte; uint64_t cr4; + u_long res; int i; KERNend = *firstaddr; + res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; @@ -1120,10 +1122,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr) vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; - virtual_end = VM_MAX_KERNEL_ADDRESS; - /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it @@ -1142,6 +1142,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* * Initialize the kernel pmap (which is statically allocated). + * Count bootstrap data as being resident. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); @@ -1149,6 +1150,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); + kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* -- cgit v1.3 From cb4961abc1dcb48600133a1a3207be4da02a578a Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 1 Oct 2018 18:48:33 +0000 Subject: Apply r339046 to i386. Belatedly add a comment to the amd64 pmap explaining why we initialize the kernel pmap's resident page count. Reviewed by: alc, kib Approved by: re (gjb) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17377 --- sys/amd64/amd64/pmap.c | 3 ++- sys/i386/i386/pmap.c | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 7d0b55e39d9a..bab8072d9ec1 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1142,7 +1142,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* * Initialize the kernel pmap (which is statically allocated). - * Count bootstrap data as being resident. + * Count bootstrap data as being resident in case any of this data is + * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 0c303e25a599..9dd80ad72d49 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -579,8 +579,11 @@ pmap_bootstrap(vm_paddr_t firstaddr) vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; + u_long res; int i; + res = atop(firstaddr - (vm_paddr_t)KERNLOAD); + /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures @@ -598,11 +601,12 @@ pmap_bootstrap(vm_paddr_t firstaddr) * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t)firstaddr; - virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). + * Count bootstrap data as being resident in case any of this data is + * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = IdlePTD; @@ -610,6 +614,7 @@ pmap_bootstrap(vm_paddr_t firstaddr) kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ + kernel_pmap->pm_stats.resident_count = res; TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* -- cgit v1.3 From 8696dcdacfc0ebf0584ce762b982315fafcde0da Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 1 Oct 2018 18:51:08 +0000 Subject: Add kernel ifunc support on arm64. Tested with ifunc resolvers in the kernel and module with calls from kernel to kernel, module to kernel, and module to module. Reviewed by: kib (previous version) Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17370 --- sys/arm64/arm64/elf_machdep.c | 10 +++++++-- sys/arm64/arm64/machdep.c | 1 + sys/arm64/include/ifunc.h | 51 +++++++++++++++++++++++++++++++++++++++++++ sys/conf/kern.pre.mk | 5 +++-- sys/kern/link_elf.c | 2 +- 5 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 sys/arm64/include/ifunc.h (limited to 'sys') diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c index 67244adfb0c9..f39eb3558b5f 100644 --- a/sys/arm64/arm64/elf_machdep.c +++ b/sys/arm64/arm64/elf_machdep.c @@ -133,14 +133,14 @@ bool elf_is_ifunc_reloc(Elf_Size r_info __unused) { - return (false); + return (ELF_R_TYPE(r_info) == R_AARCH64_IRELATIVE); } static int elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, int local, elf_lookup_fn lookup) { - Elf_Addr *where, addr, addend; + Elf_Addr *where, addr, addend, val; Elf_Word rtype, symidx; const Elf_Rel *rel; const Elf_Rela *rela; @@ -183,6 +183,12 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, return (-1); *where = addr + addend; break; + case R_AARCH64_IRELATIVE: + addr = relocbase + addend; + val = ((Elf64_Addr (*)(void))addr)(); + if (*where != val) + *where = val; + break; default: printf("kldload: unexpected relocation type %d\n", rtype); return (-1); diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index e82b9e4ab0bf..613dcbc53a09 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -1004,6 +1004,7 @@ initarm(struct arm64_bootparams *abp) boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); + link_elf_ireloc(kmdp); #ifdef FDT try_load_dtb(kmdp); diff --git a/sys/arm64/include/ifunc.h b/sys/arm64/include/ifunc.h new file mode 100644 index 000000000000..2ce29154180b --- /dev/null +++ b/sys/arm64/include/ifunc.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2015-2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __ARM64_IFUNC_H +#define __ARM64_IFUNC_H + +#define DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(void))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver(void))args + +#define DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual) \ + resolver_qual ret_type (*name##_resolver(uint64_t, uint64_t, \ + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, \ + uint64_t))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + resolver_qual ret_type (*name##_resolver(uint64_t _arg1 __unused, \ + uint64_t _arg2 __unused, uint64_t _arg3 __unused, \ + uint64_t _arg4 __unused, uint64_t _arg5 __unused, \ + uint64_t _arg6 __unused, uint64_t _arg7 __unused, \ + uint64_t _arg8 __unused))args + +#endif diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index 523cea605afd..55072f1046f8 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -121,9 +121,10 @@ CFLAGS+= ${CONF_CFLAGS} LDFLAGS+= -Wl,--build-id=sha1 .endif -.if (${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386") && \ +.if (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ + ${MACHINE_CPUARCH} == "i386") && \ defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" -.error amd64/i386 kernel requires linker ifunc support +.error amd64/arm64/i386 kernel requires linker ifunc support .endif .if ${MACHINE_CPUARCH} == "amd64" LDFLAGS+= -Wl,-z max-page-size=2097152 -Wl,-z common-page-size=4096 -Wl,-z -Wl,ifunc-noplt diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 94fe5c407aeb..9338b06bb268 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -1653,7 +1653,7 @@ link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) return (ef->ddbstrcnt); } -#if defined(__i386__) || defined(__amd64__) +#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) /* * Use this lookup routine when performing relocations early during boot. * The generic lookup routine depends on kobj, which is not initialized -- cgit v1.3 From 93db904d19dc7d7f0f66439c43bc3558b2fdcb01 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 1 Oct 2018 18:51:39 +0000 Subject: Use an unsigned iterator for domain sets. Otherwise (iter % ds->ds_cnt) is not guaranteed to lie in the range [0, MAXMEMDOM). Reported by: pho Reviewed by: kib Approved by: re (rgrimes) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17374 --- sys/sys/_domainset.h | 2 +- sys/vm/vm_domainset.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/sys/_domainset.h b/sys/sys/_domainset.h index 30d8501c8e4a..34d8f61ca9fc 100644 --- a/sys/sys/_domainset.h +++ b/sys/sys/_domainset.h @@ -54,7 +54,7 @@ typedef struct _domainset domainset_t; struct domainset; struct domainset_ref { struct domainset * volatile dr_policy; - int dr_iterator; + unsigned int dr_iterator; }; #endif /* !_SYS__DOMAINSET_H_ */ diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h index 10da5caa0ea7..b1c5766c1c67 100644 --- a/sys/vm/vm_domainset.h +++ b/sys/vm/vm_domainset.h @@ -32,7 +32,7 @@ struct vm_domainset_iter { struct domainset *di_domain; - int *di_iter; + unsigned int *di_iter; vm_pindex_t di_offset; int di_flags; uint16_t di_policy; -- cgit v1.3 From aabac0c17673904e13947cc621c65898e74c3e43 Mon Sep 17 00:00:00 2001 From: "Kenneth D. Merry" Date: Mon, 1 Oct 2018 19:00:46 +0000 Subject: Fix a da(4) driver memory leak for SCSI SMR devices. In the probe case for SCSI SMR Host Aware or Most Managed drives, be sure to free allocated memory. sys/cam/scsi/scsi_da.c: In dadone_probezone(), free the data pointer before returning. MFC after: 3 days Sponsored by: Spectra Logic Approved by: re (kib) --- sys/cam/scsi/scsi_da.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys') diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 40f593f9b629..80dcb34a0a78 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -5674,6 +5674,9 @@ dadone_probezone(struct cam_periph *periph, union ccb *done_ccb) } } } + + free(csio->data_ptr, M_SCSIDA); + daprobedone(periph, done_ccb); return; } -- cgit v1.3 From 2ddefb6d5d71b66bd2f25208fa39d754ffb3755d Mon Sep 17 00:00:00 2001 From: Robert Watson Date: Tue, 2 Oct 2018 15:58:17 +0000 Subject: Rework the logic around quick checks for auditing that take place at system-call entry and whenever audit arguments or return values are captured: 1. Expose a single global, audit_syscalls_enabled, which controls whether the audit framework is entered, rather than exposing components of the policy -- e.g., if the trail is enabled, suspended, etc. 2. Introduce a new function audit_syscalls_enabled_update(), which is called to update audit_syscalls_enabled whenever an aspect of the policy changes, so that the value can be updated. 3. Remove a check of trail enablement/suspension from audit_new() -- at the point where this function has been entered, we believe that system-call auditing is already in force, or we wouldn't get here, so simply proceed to more expensive policy checks. 4. Use an audit-provided global, audit_dtrace_enabled, rather than a dtaudit-provided global, to provide policy indicating whether dtaudit would like system calls to be audited. 5. Do some minor cosmetic renaming to clarify what various variables are for. These changes collectively arrange it so that traditional audit (trail, pipes) or the DTrace audit provider can enable system-call probes without the other configured. Otherwise, dtaudit cannot capture system-call data without auditd(8) started. Reviewed by: gnn Sponsored by: DARPA, AFRL Approved by: re (gjb) Differential Revision: https://reviews.freebsd.org/D17348 --- sys/security/audit/audit.c | 66 +++++++++++++++++++++++++++---------- sys/security/audit/audit.h | 23 +++++++++---- sys/security/audit/audit_dtrace.c | 16 ++++++--- sys/security/audit/audit_private.h | 9 ++++- sys/security/audit/audit_syscalls.c | 25 +++++++------- sys/security/audit/audit_worker.c | 8 +++-- 6 files changed, 103 insertions(+), 44 deletions(-) (limited to 'sys') diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c index 87ceeb670e76..c00ddc126f5a 100644 --- a/sys/security/audit/audit.c +++ b/sys/security/audit/audit.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2005 Apple Inc. - * Copyright (c) 2006-2007, 2016-2017 Robert N. M. Watson + * Copyright (c) 2006-2007, 2016-2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -98,8 +98,12 @@ static SYSCTL_NODE(_security, OID_AUTO, audit, CTLFLAG_RW, 0, * * Define the audit control flags. */ -int __read_frequently audit_enabled; -int audit_suspended; +int audit_trail_enabled; +int audit_trail_suspended; +#ifdef KDTRACE_HOOKS +u_int audit_dtrace_enabled; +#endif +int __read_frequently audit_syscalls_enabled; /* * Flags controlling behavior in low storage situations. Should we panic if @@ -198,6 +202,33 @@ static struct rwlock audit_kinfo_lock; #define KINFO_RUNLOCK() rw_runlock(&audit_kinfo_lock) #define KINFO_WUNLOCK() rw_wunlock(&audit_kinfo_lock) +/* + * Check various policies to see if we should enable system-call audit hooks. + * Note that despite the mutex being held, we want to assign a value exactly + * once, as checks of the flag are performed lock-free for performance + * reasons. The mutex is used to get a consistent snapshot of policy state -- + * e.g., safely accessing the two audit_trail flags. + */ +void +audit_syscalls_enabled_update(void) +{ + + mtx_lock(&audit_mtx); +#ifdef KDTRACE_HOOKS + if (audit_dtrace_enabled) + audit_syscalls_enabled = 1; + else { +#endif + if (audit_trail_enabled && !audit_trail_suspended) + audit_syscalls_enabled = 1; + else + audit_syscalls_enabled = 0; +#ifdef KDTRACE_HOOKS + } +#endif + mtx_unlock(&audit_mtx); +} + void audit_set_kinfo(struct auditinfo_addr *ak) { @@ -303,8 +334,9 @@ static void audit_init(void) { - audit_enabled = 0; - audit_suspended = 0; + audit_trail_enabled = 0; + audit_trail_suspended = 0; + audit_syscalls_enabled = 0; audit_panic_on_write_fail = 0; audit_fail_stop = 0; audit_in_failure = 0; @@ -337,6 +369,9 @@ audit_init(void) sizeof(struct kaudit_record), audit_record_ctor, audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); + /* First initialisation of audit_syscalls_enabled. */ + audit_syscalls_enabled_update(); + /* Initialize the BSM audit subsystem. */ kau_init(); @@ -378,10 +413,6 @@ currecord(void) } /* - * XXXAUDIT: There are a number of races present in the code below due to - * release and re-grab of the mutex. The code should be revised to become - * slightly less racy. - * * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available * pre_q space, suspending the system call until there is room? */ @@ -389,13 +420,6 @@ struct kaudit_record * audit_new(int event, struct thread *td) { struct kaudit_record *ar; - int no_record; - - mtx_lock(&audit_mtx); - no_record = (audit_suspended || !audit_enabled); - mtx_unlock(&audit_mtx); - if (no_record) - return (NULL); /* * Note: the number of outstanding uncommitted audit records is @@ -529,9 +553,13 @@ audit_commit(struct kaudit_record *ar, int error, int retval) /* * Note: it could be that some records initiated while audit was * enabled should still be committed? + * + * NB: The check here is not for audit_syscalls because any + * DTrace-related obligations have been fulfilled above -- we're just + * down to the trail and pipes now. */ mtx_lock(&audit_mtx); - if (audit_suspended || !audit_enabled) { + if (audit_trail_suspended || !audit_trail_enabled) { audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); @@ -557,6 +585,10 @@ audit_commit(struct kaudit_record *ar, int error, int retval) * responsible for deciding whether or not to audit the call (preselection), * and if so, allocating a per-thread audit record. audit_new() will fill in * basic thread/credential properties. + * + * This function will be entered only if audit_syscalls_enabled was set in the + * macro wrapper for this function. It could be cleared by the time this + * function runs, but that is an acceptable race. */ void audit_syscall_enter(unsigned short code, struct thread *td) diff --git a/sys/security/audit/audit.h b/sys/security/audit/audit.h index 055194d3a88d..f24bc1e503b2 100644 --- a/sys/security/audit/audit.h +++ b/sys/security/audit/audit.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2005 Apple Inc. - * Copyright (c) 2016-2017 Robert N. M. Watson + * Copyright (c) 2016-2018 Robert N. M. Watson * All rights reserved. * * This software was developed by BAE Systems, the University of Cambridge @@ -55,14 +55,23 @@ #include /* - * Audit subsystem condition flags. The audit_enabled flag is set and + * Audit subsystem condition flags. The audit_trail_enabled flag is set and * removed automatically as a result of configuring log files, and can be * observed but should not be directly manipulated. The audit suspension * flag permits audit to be temporarily disabled without reconfiguring the * audit target. + * + * As DTrace can also request system-call auditing, a further + * audit_syscalls_enabled flag tracks whether newly entering system calls + * should be considered for auditing or not. + * + * XXXRW: Move trail flags to audit_private.h, as they no longer need to be + * visible outside the audit code...? */ -extern int audit_enabled; -extern int audit_suspended; +extern u_int audit_dtrace_enabled; +extern int audit_trail_enabled; +extern int audit_trail_suspended; +extern int audit_syscalls_enabled; void audit_syscall_enter(unsigned short code, struct thread *td); void audit_syscall_exit(int error, struct thread *td); @@ -139,7 +148,7 @@ void audit_thread_free(struct thread *td); /* * Define macros to wrap the audit_arg_* calls by checking the global - * audit_enabled flag before performing the actual call. + * audit_syscalls_enabled flag before performing the actual call. */ #define AUDITING_TD(td) ((td)->td_pflags & TDP_AUDITREC) @@ -369,7 +378,7 @@ void audit_thread_free(struct thread *td); } while (0) #define AUDIT_SYSCALL_ENTER(code, td) do { \ - if (audit_enabled) { \ + if (audit_syscalls_enabled) { \ audit_syscall_enter(code, td); \ } \ } while (0) @@ -377,7 +386,7 @@ void audit_thread_free(struct thread *td); /* * Wrap the audit_syscall_exit() function so that it is called only when * we have a audit record on the thread. Audit records can persist after - * auditing is disabled, so we don't just check audit_enabled here. + * auditing is disabled, so we don't just check audit_syscalls_enabled here. */ #define AUDIT_SYSCALL_EXIT(error, td) do { \ if (td->td_pflags & TDP_AUDITREC) \ diff --git a/sys/security/audit/audit_dtrace.c b/sys/security/audit/audit_dtrace.c index c456ab71400a..985baf142ab3 100644 --- a/sys/security/audit/audit_dtrace.c +++ b/sys/security/audit/audit_dtrace.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * This software was developed by BAE Systems, the University of Cambridge @@ -147,8 +147,12 @@ static dtrace_provider_id_t dtaudit_id; * maintain a global flag tracking whether any dtaudit probes are enabled. If * not, don't bother doing all that work whenever potential queries about * events turn up during preselection or commit. + * + * NB: We used to maintain our own variable in dtaudit, but now use the + * centralized audit_dtrace_enabled variable imported from the audit code. + * + * static uint_t dtaudit_probes_enabled; */ -static uint_t dtaudit_probes_enabled; /* * Check dtaudit policy for the event to see whether this is an event we would @@ -179,7 +183,7 @@ dtaudit_preselect(au_id_t auid, au_event_t event, au_class_t class) * NB: Lockless reads here may return a slightly stale value; this is * considered better than acquiring a lock, however. */ - if (!dtaudit_probes_enabled) + if (!audit_dtrace_enabled) return (NULL); ene = au_evnamemap_lookup(event); if (ene == NULL) @@ -457,7 +461,8 @@ dtaudit_enable(void *arg, dtrace_id_t id, void *parg) ene->ene_commit_probe_enabled = 1; else ene->ene_bsm_probe_enabled = 1; - refcount_acquire(&dtaudit_probes_enabled); + refcount_acquire(&audit_dtrace_enabled); + audit_syscalls_enabled_update(); } static void @@ -474,7 +479,8 @@ dtaudit_disable(void *arg, dtrace_id_t id, void *parg) ene->ene_commit_probe_enabled = 0; else ene->ene_bsm_probe_enabled = 0; - (void)refcount_release(&dtaudit_probes_enabled); + (void)refcount_release(&audit_dtrace_enabled); + audit_syscalls_enabled_update(); } static void diff --git a/sys/security/audit/audit_private.h b/sys/security/audit/audit_private.h index 5e7a3934835e..4aa811bc1516 100644 --- a/sys/security/audit/audit_private.h +++ b/sys/security/audit/audit_private.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2009 Apple Inc. - * Copyright (c) 2016-2017 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -343,6 +343,13 @@ void audit_commit(struct kaudit_record *ar, int error, int retval); struct kaudit_record *audit_new(int event, struct thread *td); +/* + * Function to update the audit_syscalls_enabled flag, whose value is affected + * by configuration of the audit trail/pipe mechanism and DTrace. Call this + * function when any of the inputs to that policy change. + */ +void audit_syscalls_enabled_update(void); + /* * Functions relating to the conversion of internal kernel audit records to * the BSM file format. diff --git a/sys/security/audit/audit_syscalls.c b/sys/security/audit/audit_syscalls.c index 89af1a7d2817..a092e27b0a74 100644 --- a/sys/security/audit/audit_syscalls.c +++ b/sys/security/audit/audit_syscalls.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2009 Apple Inc. - * Copyright (c) 2016 Robert N. M. Watson + * Copyright (c) 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -368,7 +368,7 @@ sys_auditon(struct thread *td, struct auditon_args *uap) case A_OLDGETCOND: case A_GETCOND: if (uap->length == sizeof(udata.au_cond64)) { - if (audit_enabled && !audit_suspended) + if (audit_trail_enabled && !audit_trail_suspended) udata.au_cond64 = AUC_AUDITING; else udata.au_cond64 = AUC_NOAUDIT; @@ -376,7 +376,7 @@ sys_auditon(struct thread *td, struct auditon_args *uap) } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); - if (audit_enabled && !audit_suspended) + if (audit_trail_enabled && !audit_trail_suspended) udata.au_cond = AUC_AUDITING; else udata.au_cond = AUC_NOAUDIT; @@ -386,25 +386,27 @@ sys_auditon(struct thread *td, struct auditon_args *uap) case A_SETCOND: if (uap->length == sizeof(udata.au_cond64)) { if (udata.au_cond64 == AUC_NOAUDIT) - audit_suspended = 1; + audit_trail_suspended = 1; if (udata.au_cond64 == AUC_AUDITING) - audit_suspended = 0; + audit_trail_suspended = 0; if (udata.au_cond64 == AUC_DISABLED) { - audit_suspended = 1; + audit_trail_suspended = 1; audit_shutdown(NULL, 0); } + audit_syscalls_enabled_update(); break; } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); if (udata.au_cond == AUC_NOAUDIT) - audit_suspended = 1; + audit_trail_suspended = 1; if (udata.au_cond == AUC_AUDITING) - audit_suspended = 0; + audit_trail_suspended = 0; if (udata.au_cond == AUC_DISABLED) { - audit_suspended = 1; + audit_trail_suspended = 1; audit_shutdown(NULL, 0); } + audit_syscalls_enabled_update(); break; case A_GETCLASS: @@ -826,10 +828,11 @@ sys_auditctl(struct thread *td, struct auditctl_args *uap) crhold(cred); /* - * XXXAUDIT: Should audit_suspended actually be cleared by + * XXXAUDIT: Should audit_trail_suspended actually be cleared by * audit_worker? */ - audit_suspended = 0; + audit_trail_suspended = 0; + audit_syscalls_enabled_update(); audit_rotate_vnode(cred, vp); diff --git a/sys/security/audit/audit_worker.c b/sys/security/audit/audit_worker.c index fa6fe66db259..f169ab8d9dd1 100644 --- a/sys/security/audit/audit_worker.c +++ b/sys/security/audit/audit_worker.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2008 Apple Inc. - * Copyright (c) 2006-2008, 2016 Robert N. M. Watson + * Copyright (c) 2006-2008, 2016, 2018 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -305,7 +305,8 @@ fail_enospc: "Audit log space exhausted and fail-stop set."); } (void)audit_send_trigger(AUDIT_TRIGGER_NO_SPACE); - audit_suspended = 1; + audit_trail_suspended = 1; + audit_syscalls_enabled_update(); /* FALLTHROUGH */ fail: @@ -518,7 +519,8 @@ audit_rotate_vnode(struct ucred *cred, struct vnode *vp) audit_vp = vp; audit_size = vattr.va_size; audit_file_rotate_wait = 0; - audit_enabled = (audit_vp != NULL); + audit_trail_enabled = (audit_vp != NULL); + audit_syscalls_enabled_update(); AUDIT_WORKER_UNLOCK(); /* -- cgit v1.3 From 9cffbc68bdc8ce5f3ab4b33314b34ffe3e1a9696 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Tue, 2 Oct 2018 17:29:56 +0000 Subject: After r338257 is was possible to trigger a KASSERT() in ud6_output() using an application trying to use a v4mapped destination address on a kernel without INET support or on a v6only socket. Catch this case and prevent the packet from going anywhere; else, without the KASSERT() armed, a v4mapped destination address might go out on the wire or other undefined behaviour might happen, while with the KASSERT() we panic. PR: 231728 Reported by: Jeremy Faulkner (gldisater gmail.com) Approved by: re (kib) --- sys/netinet6/udp6_usrreq.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index d5ab934c9d41..93aebbd3440b 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -784,8 +784,20 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, return ((*pru->pru_send)(so, flags_arg, m, (struct sockaddr *)sin6, control, td)); } - } + } else #endif + if (sin6 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Given this is either an IPv6-only socket or no INET is + * supported we will fail the send if the given destination + * address is a v4mapped address. + */ + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + return (EINVAL); + } if (control) { if ((error = ip6_setpktopts(control, &opt, -- cgit v1.3 From 8ac2f3ba9fa82076a9292ff24912ebfb333e7836 Mon Sep 17 00:00:00 2001 From: Kevin Bowling Date: Tue, 2 Oct 2018 21:36:00 +0000 Subject: Use nda(4) on powerpc64 Approved by: re@ (kib), krion (mentor), imp Differential Revision: https://reviews.freebsd.org/D17368 --- UPDATING | 7 +++++++ sys/powerpc/conf/GENERIC64 | 1 + 2 files changed, 8 insertions(+) (limited to 'sys') diff --git a/UPDATING b/UPDATING index ce79b09f0c6c..607b16ff6d35 100644 --- a/UPDATING +++ b/UPDATING @@ -31,6 +31,13 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 12.x IS SLOW: disable the most expensive debugging functionality run "ln -s 'abort:false,junk:false' /etc/malloc.conf".) +20181002: + The cam(4) based nda(4) driver will be used over nvd(4) by default on + powerpc64. You may set 'options NVME_USE_NVD=1' in your kernel conf or + loader tunable 'hw.nvme.use_nvd=1' if you wish to use the existing + driver. Make sure to edit /boot/etc/kboot.conf and fstab to use the + nda device name. + 20180913: Reproducible build mode is now on by default, in preparation for FreeBSD 12.0. This eliminates build metadata such as the user, diff --git a/sys/powerpc/conf/GENERIC64 b/sys/powerpc/conf/GENERIC64 index ec8dc06f0980..5913100d037a 100644 --- a/sys/powerpc/conf/GENERIC64 +++ b/sys/powerpc/conf/GENERIC64 @@ -120,6 +120,7 @@ device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # NVM Express (NVMe) support device nvme # base NVMe driver +options NVME_USE_NVD=0 # prefer the cam(4) based nda(4) driver device nvd # expose NVMe namespaces as disks, depends on nvme # SCSI Controllers -- cgit v1.3 From 4364eab875fa13ed5d0e1b64ca4ba3b72705f481 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Tue, 2 Oct 2018 23:23:56 +0000 Subject: Move 32-bit compat support for CDIOREADTOCENTRYS to the right place. ioctl(2) commands only have meaning in the context of a file descriptor so translating them in the syscall layer is incorrect. The new handler users an accessor to retrieve/construct a pointer from the last member of the passed structure and relies on type punning to access the other members which require no translation. Reviewed by: kib (prior version), jhb Approved by: re (rgrimes) Obtained from: CheriBSD Sponsored by: DARPA, AFRL Differential Review: https://reviews.freebsd.org/D17378 --- sys/cam/scsi/scsi_cd.c | 42 +++++++++++++++++++++++++++++++++- sys/compat/freebsd32/freebsd32_ioctl.c | 31 ------------------------- sys/compat/freebsd32/freebsd32_ioctl.h | 8 ------- 3 files changed, 41 insertions(+), 40 deletions(-) (limited to 'sys') diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c index b0d4fe5d11c5..599ebc272629 100644 --- a/sys/cam/scsi/scsi_cd.c +++ b/sys/cam/scsi/scsi_cd.c @@ -63,8 +63,10 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#include #include #include @@ -210,6 +212,17 @@ static struct cd_quirk_entry cd_quirk_table[] = } }; +#ifdef COMPAT_FREEBSD32 +struct ioc_read_toc_entry32 { + u_char address_format; + u_char starting_track; + u_short data_len; + uint32_t data; /* (struct cd_toc_entry *) */ +}; +#define CDIOREADTOCENTRYS_32 \ + _IOC_NEWTYPE(CDIOREADTOCENTRYS, struct ioc_read_toc_entry32) +#endif + static disk_open_t cdopen; static disk_close_t cdclose; static disk_ioctl_t cdioctl; @@ -1272,6 +1285,29 @@ cdgetpagesize(int page_num) return (-1); } +static struct cd_toc_entry * +te_data_get_ptr(void *irtep, u_long cmd) +{ + union { + struct ioc_read_toc_entry irte; +#ifdef COMPAT_FREEBSD32 + struct ioc_read_toc_entry32 irte32; +#endif + } *irteup; + + irteup = irtep; + switch (IOCPARM_LEN(cmd)) { + case sizeof(irteup->irte): + return (irteup->irte.data); +#ifdef COMPAT_FREEBSD32 + case sizeof(irteup->irte32): + return ((struct cd_toc_entry *)(uintptr_t)irteup->irte32.data); +#endif + default: + panic("Unhandled ioctl command %ld", cmd); + } +} + static int cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { @@ -1587,6 +1623,9 @@ cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) } break; case CDIOREADTOCENTRYS: +#ifdef COMPAT_FREEBSD32 + case CDIOREADTOCENTRYS_32: +#endif { struct cd_tocdata *data; struct cd_toc_single *lead; @@ -1712,7 +1751,8 @@ cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) } cam_periph_unlock(periph); - error = copyout(data->entries, te->data, len); + error = copyout(data->entries, te_data_get_ptr(te, cmd), + len); free(data, M_SCSICD); free(lead, M_SCSICD); } diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index b181913b6688..414178a7bffd 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -56,35 +56,8 @@ __FBSDID("$FreeBSD$"); #include #include -CTASSERT(sizeof(struct ioc_read_toc_entry32) == 8); CTASSERT(sizeof(struct mem_range_op32) == 12); -static int -freebsd32_ioctl_ioc_read_toc(struct thread *td, - struct freebsd32_ioctl_args *uap, struct file *fp) -{ - struct ioc_read_toc_entry toce; - struct ioc_read_toc_entry32 toce32; - int error; - - if ((error = copyin(uap->data, &toce32, sizeof(toce32)))) - return (error); - CP(toce32, toce, address_format); - CP(toce32, toce, starting_track); - CP(toce32, toce, data_len); - PTRIN_CP(toce32, toce, data); - - if ((error = fo_ioctl(fp, CDIOREADTOCENTRYS, (caddr_t)&toce, - td->td_ucred, td))) { - CP(toce, toce32, address_format); - CP(toce, toce32, starting_track); - CP(toce, toce32, data_len); - PTROUT_CP(toce, toce32, data); - error = copyout(&toce32, uap->data, sizeof(toce32)); - } - return error; -} - static int freebsd32_ioctl_fiodgname(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) @@ -264,10 +237,6 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) } switch (uap->com) { - case CDIOREADTOCENTRYS_32: - error = freebsd32_ioctl_ioc_read_toc(td, uap, fp); - break; - case FIODGNAME_32: error = freebsd32_ioctl_fiodgname(td, uap, fp); break; diff --git a/sys/compat/freebsd32/freebsd32_ioctl.h b/sys/compat/freebsd32/freebsd32_ioctl.h index c19c27e3618a..1d2312b41c14 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.h +++ b/sys/compat/freebsd32/freebsd32_ioctl.h @@ -38,13 +38,6 @@ typedef __uint32_t caddr_t32; -struct ioc_read_toc_entry32 { - u_char address_format; - u_char starting_track; - u_short data_len; - uint32_t data; /* struct cd_toc_entry* */ -}; - struct fiodgname_arg32 { int len; caddr_t32 buf; @@ -67,7 +60,6 @@ struct pci_bar_mmap32 { int pbm_memattr; }; -#define CDIOREADTOCENTRYS_32 _IOWR('c', 5, struct ioc_read_toc_entry32) #define FIODGNAME_32 _IOW('f', 120, struct fiodgname_arg32) #define MEMRANGE_GET32 _IOWR('m', 50, struct mem_range_op32) #define MEMRANGE_SET32 _IOW('m', 51, struct mem_range_op32) -- cgit v1.3 From 580e30a33e4b11459e9f9ddbfefb2e408f77302b Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Wed, 3 Oct 2018 07:35:16 +0000 Subject: Use strlcpy() instead of strncpy(). Approved by: re (kib@) CID: 1395980, 1395981 X-MFC with: r339012 MFC after: 1 week --- sys/net/if_tap.c | 2 +- sys/net/if_tun.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 65feee545b7d..1efd5ea20c6e 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -741,7 +741,7 @@ tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td return (EPROTOTYPE); mtx_lock(&tp->tap_mtx); if (ifp->if_mtu != tapp->mtu) { - strncpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); + strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); ifr.ifr_mtu = tapp->mtu; CURVNET_SET(ifp->if_vnet); error = ifhwioctl(SIOCSIFMTU, ifp, diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index fc49ef3bbbd1..d47117f17b8e 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -674,7 +674,7 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, return (EPROTOTYPE); mtx_lock(&tp->tun_mtx); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - strncpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), -- cgit v1.3 From 7c179abac7190b7f5fef5facad403a7fb53d6c61 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 3 Oct 2018 14:20:43 +0000 Subject: Fix an inverted test in ucode_load_ap(). This caused microcode to be updated only on the BSP if hyperthreading was disabled, typically resulting in a hang or reset. Approved by: re (kib) Sponsored by: The FreeBSD Foundation --- sys/x86/x86/ucode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys') diff --git a/sys/x86/x86/ucode.c b/sys/x86/x86/ucode.c index 50a5cbe4462b..5b039491345a 100644 --- a/sys/x86/x86/ucode.c +++ b/sys/x86/x86/ucode.c @@ -269,7 +269,7 @@ ucode_load_ap(int cpu) KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present, ("cpu %d not present", cpu)); - if (!cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread) + if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread) return; #endif -- cgit v1.3 From ad7eb8cad5320baab8f5b907450750ce72eaa230 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 3 Oct 2018 17:40:04 +0000 Subject: In PR 227259, a user is reporting that they have code which is using shutdown() to wakeup another thread blocked on a stream listen socket. This code is failing, while it used to work on FreeBSD 10 and still works on Linux. It seems reasonable to add another exception to support something users are actually doing, which used to work on FreeBSD 10, and still works on Linux. And, it seems like it should be acceptable to POSIX, as we still return ENOTCONN. This patch is different to what had been committed to stable/11, since code around listening sockets is different. Patch in D15019 is written by jtl@, slightly modified by me. PR: 227259 Obtained from: jtl Approved by: re (kib) Differential Revision: D15019 --- sys/kern/uipc_socket.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'sys') diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 18a4c6c9e835..af2fd1ff65f7 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -917,12 +917,13 @@ solisten_dequeue(struct socket *head, struct socket **ret, int flags) if (head->so_error) { error = head->so_error; head->so_error = 0; + } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) + error = EWOULDBLOCK; + else + error = 0; + if (error) { SOLISTEN_UNLOCK(head); return (error); - } - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { - SOLISTEN_UNLOCK(head); - return (EWOULDBLOCK); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); @@ -2585,11 +2586,20 @@ soshutdown(struct socket *so, int how) * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ - if (so->so_type != SOCK_DGRAM) + if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } + if (SOLISTENING(so)) { + if (how != SHUT_WR) { + SOLISTEN_LOCK(so); + so->so_error = ECONNABORTED; + solisten_wakeup(so); /* unlocks so */ + } + goto done; + } + CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); @@ -2604,6 +2614,7 @@ soshutdown(struct socket *so, int how) wakeup(&so->so_timeo); CURVNET_RESTORE(); +done: return (soerror_enotconn ? ENOTCONN : 0); } @@ -3279,6 +3290,8 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); + else if ((events & POLLINIGNEOF) == 0 && so->so_error) + revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; @@ -3555,6 +3568,11 @@ filt_soread(struct knote *kn, long hint) if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; + if (so->so_error) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } return (!TAILQ_EMPTY(&so->sol_comp)); } -- cgit v1.3 From 23f2e22802fcf2ec666a31c5727004107b596bd3 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Wed, 3 Oct 2018 20:39:48 +0000 Subject: Move 32-bit compat support for FIODGNAME to the right place. ioctl(2) commands only have meaning in the context of a file descriptor so translating them in the syscall layer is incorrect. The new handler users an accessor to retrieve/construct a pointer from the last member of the passed structure and relies on type punning to access the other member which requires no translation. Reviewed by: kib Approved by: re (rgrimes, gjb) Obtained from: CheriBSD Sponsored by: DARPA, AFRL Differential Review: https://reviews.freebsd.org/D17388 --- sys/compat/freebsd32/freebsd32_ioctl.c | 20 -------------- sys/compat/freebsd32/freebsd32_ioctl.h | 6 ---- sys/fs/devfs/devfs_vnops.c | 50 ++++++++++++++++++++++++++++------ 3 files changed, 42 insertions(+), 34 deletions(-) (limited to 'sys') diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index 414178a7bffd..db6946246b90 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -58,22 +58,6 @@ __FBSDID("$FreeBSD$"); CTASSERT(sizeof(struct mem_range_op32) == 12); -static int -freebsd32_ioctl_fiodgname(struct thread *td, - struct freebsd32_ioctl_args *uap, struct file *fp) -{ - struct fiodgname_arg fgn; - struct fiodgname_arg32 fgn32; - int error; - - if ((error = copyin(uap->data, &fgn32, sizeof fgn32)) != 0) - return (error); - CP(fgn32, fgn, len); - PTRIN_CP(fgn32, fgn, buf); - error = fo_ioctl(fp, FIODGNAME, (caddr_t)&fgn, td->td_ucred, td); - return (error); -} - static int freebsd32_ioctl_memrange(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) @@ -237,10 +221,6 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) } switch (uap->com) { - case FIODGNAME_32: - error = freebsd32_ioctl_fiodgname(td, uap, fp); - break; - case MEMRANGE_GET32: /* FALLTHROUGH */ case MEMRANGE_SET32: error = freebsd32_ioctl_memrange(td, uap, fp); diff --git a/sys/compat/freebsd32/freebsd32_ioctl.h b/sys/compat/freebsd32/freebsd32_ioctl.h index 1d2312b41c14..0c0b18496a04 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.h +++ b/sys/compat/freebsd32/freebsd32_ioctl.h @@ -38,11 +38,6 @@ typedef __uint32_t caddr_t32; -struct fiodgname_arg32 { - int len; - caddr_t32 buf; -}; - struct mem_range_op32 { caddr_t32 mo_desc; @@ -60,7 +55,6 @@ struct pci_bar_mmap32 { int pbm_memattr; }; -#define FIODGNAME_32 _IOW('f', 120, struct fiodgname_arg32) #define MEMRANGE_GET32 _IOWR('m', 50, struct mem_range_op32) #define MEMRANGE_SET32 _IOW('m', 51, struct mem_range_op32) #define SG_IO_32 _IOWR(SGIOC, 0x85, struct sg_io_hdr32) diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 514a5914ab7e..43ac38e0a3e1 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -79,6 +79,14 @@ static struct fileops devfs_ops_f; #include #include +#ifdef COMPAT_FREEBSD32 +struct fiodgname_arg32 { + int len; + uint32_t buf; /* (void *) */ +}; +#define FIODGNAME_32 _IOC_NEWTYPE(FIODGNAME, struct fiodgname_arg32) +#endif + static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; @@ -767,6 +775,29 @@ devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struc return (error); } +static void * +fiodgname_buf_get_ptr(void *fgnp, u_long com) +{ + union { + struct fiodgname_arg fgn; +#ifdef COMPAT_FREEBSD32 + struct fiodgname_arg32 fgn32; +#endif + } *fgnup; + + fgnup = fgnp; + switch (com) { + case FIODGNAME: + return (fgnup->fgn.buf); +#ifdef COMPAT_FREEBSD32 + case FIODGNAME_32: + return ((void *)(uintptr_t)fgnup->fgn32.buf); +#endif + default: + panic("Unhandled ioctl command %ld", com); + } +} + static int devfs_ioctl(struct vop_ioctl_args *ap) { @@ -789,24 +820,27 @@ devfs_ioctl(struct vop_ioctl_args *ap) KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); - if (com == FIODTYPE) { + switch (com) { + case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; - goto out; - } else if (com == FIODGNAME) { + break; + case FIODGNAME: +#ifdef COMPAT_FREEBSD32 + case FIODGNAME_32: +#endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else - error = copyout(p, fgn->buf, i); - goto out; + error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); + break; + default: + error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } - error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); - -out: dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; -- cgit v1.3 From 627e5af85abad768ecca222b082a36ed8840ae8f Mon Sep 17 00:00:00 2001 From: Oleksandr Tymoshenko Date: Thu, 4 Oct 2018 19:54:47 +0000 Subject: [ig4] style(9) clean-up Submitted by: Rajesh Kumar Approved by: re (gjb, kib) --- sys/dev/ichiic/ig4_iic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/dev/ichiic/ig4_iic.c b/sys/dev/ichiic/ig4_iic.c index 237e24c10706..11beb743ed4b 100644 --- a/sys/dev/ichiic/ig4_iic.c +++ b/sys/dev/ichiic/ig4_iic.c @@ -729,9 +729,9 @@ ig4iic_intr(void *cookie) * Workaround to trigger pending interrupt if IG4_REG_INTR_STAT * is changed after clearing it */ - if(sc->access_intr_mask) { + if (sc->access_intr_mask != 0) { status = reg_read(sc, IG4_REG_INTR_MASK); - if(status) { + if (status != 0) { reg_write(sc, IG4_REG_INTR_MASK, 0); reg_write(sc, IG4_REG_INTR_MASK, status); } -- cgit v1.3 From 9657b80ce7682c0737d218e48389b54db1fe16a8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 4 Oct 2018 20:01:48 +0000 Subject: amd64: hide non-erms jump label under non-erms copyin/copyout This change is a no-op in terms of semantics, but has a side effect of removing a perfectly useless nop sled for CPUs with ERMS. Approved by: re (gjb) Sponsored by: The FreeBSD Foundation --- sys/amd64/amd64/support.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 6fdb71cb9207..f0086e298ea5 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -457,9 +457,9 @@ END(fillw) movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret -.endif ALIGN_TEXT 1: +.endif rep movsb @@ -525,9 +525,9 @@ END(copyout_smap_erms) movq %rax,PCB_ONFAULT(%r9) POP_FRAME_POINTER ret -.endif ALIGN_TEXT 1: +.endif rep movsb -- cgit v1.3 From 083a010c62166625fdc9dc1350c8c9203bf1b6cd Mon Sep 17 00:00:00 2001 From: Ryan Stone Date: Thu, 4 Oct 2018 22:03:58 +0000 Subject: Hold a write lock across udp_notify() With the new route cache feature udp_notify() will modify the inp when it needs to invalidate the route cache. Ensure that we hold a write lock on the inp before calling the function to ensure that multiple threads don't race while trying to invalidate the cache (which previously lead to a page fault). Differential Revision: https://reviews.freebsd.org/D17246 Reviewed by: sbruno, bz, karels Sponsored by: Dell EMC Isilon Approved by: re (gjb) --- sys/netinet/udp_usrreq.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'sys') diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index cae044c066c3..429f195ee954 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -756,13 +756,7 @@ struct inpcb * udp_notify(struct inpcb *inp, int errno) { - /* - * While udp_ctlinput() always calls udp_notify() with a read lock - * when invoking it directly, in_pcbnotifyall() currently uses write - * locks due to sharing code with TCP. For now, accept either a read - * or a write lock, but a read lock is sufficient. - */ - INP_LOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); @@ -808,13 +802,13 @@ udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); + ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK_ASSERT(inp); + INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } - INP_RUNLOCK(inp); + INP_WUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, -- cgit v1.3 From 9bc603bd20e6027adfef4e86ce31146eecf490f7 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 4 Oct 2018 23:55:03 +0000 Subject: Revert r339174: Move 32-bit compat support for FIODGNAME to the right place. A case was missed in this commit which breaks sshing into a 32-bit sshd on a 64-bit system. Approved by: re (gjb) --- sys/compat/freebsd32/freebsd32_ioctl.c | 20 ++++++++++++++ sys/compat/freebsd32/freebsd32_ioctl.h | 6 ++++ sys/fs/devfs/devfs_vnops.c | 50 ++++++---------------------------- 3 files changed, 34 insertions(+), 42 deletions(-) (limited to 'sys') diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index db6946246b90..414178a7bffd 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -58,6 +58,22 @@ __FBSDID("$FreeBSD$"); CTASSERT(sizeof(struct mem_range_op32) == 12); +static int +freebsd32_ioctl_fiodgname(struct thread *td, + struct freebsd32_ioctl_args *uap, struct file *fp) +{ + struct fiodgname_arg fgn; + struct fiodgname_arg32 fgn32; + int error; + + if ((error = copyin(uap->data, &fgn32, sizeof fgn32)) != 0) + return (error); + CP(fgn32, fgn, len); + PTRIN_CP(fgn32, fgn, buf); + error = fo_ioctl(fp, FIODGNAME, (caddr_t)&fgn, td->td_ucred, td); + return (error); +} + static int freebsd32_ioctl_memrange(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) @@ -221,6 +237,10 @@ freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) } switch (uap->com) { + case FIODGNAME_32: + error = freebsd32_ioctl_fiodgname(td, uap, fp); + break; + case MEMRANGE_GET32: /* FALLTHROUGH */ case MEMRANGE_SET32: error = freebsd32_ioctl_memrange(td, uap, fp); diff --git a/sys/compat/freebsd32/freebsd32_ioctl.h b/sys/compat/freebsd32/freebsd32_ioctl.h index 0c0b18496a04..1d2312b41c14 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.h +++ b/sys/compat/freebsd32/freebsd32_ioctl.h @@ -38,6 +38,11 @@ typedef __uint32_t caddr_t32; +struct fiodgname_arg32 { + int len; + caddr_t32 buf; +}; + struct mem_range_op32 { caddr_t32 mo_desc; @@ -55,6 +60,7 @@ struct pci_bar_mmap32 { int pbm_memattr; }; +#define FIODGNAME_32 _IOW('f', 120, struct fiodgname_arg32) #define MEMRANGE_GET32 _IOWR('m', 50, struct mem_range_op32) #define MEMRANGE_SET32 _IOW('m', 51, struct mem_range_op32) #define SG_IO_32 _IOWR(SGIOC, 0x85, struct sg_io_hdr32) diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 43ac38e0a3e1..514a5914ab7e 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -79,14 +79,6 @@ static struct fileops devfs_ops_f; #include #include -#ifdef COMPAT_FREEBSD32 -struct fiodgname_arg32 { - int len; - uint32_t buf; /* (void *) */ -}; -#define FIODGNAME_32 _IOC_NEWTYPE(FIODGNAME, struct fiodgname_arg32) -#endif - static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; @@ -775,29 +767,6 @@ devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struc return (error); } -static void * -fiodgname_buf_get_ptr(void *fgnp, u_long com) -{ - union { - struct fiodgname_arg fgn; -#ifdef COMPAT_FREEBSD32 - struct fiodgname_arg32 fgn32; -#endif - } *fgnup; - - fgnup = fgnp; - switch (com) { - case FIODGNAME: - return (fgnup->fgn.buf); -#ifdef COMPAT_FREEBSD32 - case FIODGNAME_32: - return ((void *)(uintptr_t)fgnup->fgn32.buf); -#endif - default: - panic("Unhandled ioctl command %ld", com); - } -} - static int devfs_ioctl(struct vop_ioctl_args *ap) { @@ -820,27 +789,24 @@ devfs_ioctl(struct vop_ioctl_args *ap) KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); - switch (com) { - case FIODTYPE: + if (com == FIODTYPE) { *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; - break; - case FIODGNAME: -#ifdef COMPAT_FREEBSD32 - case FIODGNAME_32: -#endif + goto out; + } else if (com == FIODGNAME) { fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else - error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); - break; - default: - error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); + error = copyout(p, fgn->buf, i); + goto out; } + error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); + +out: dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; -- cgit v1.3 From e8bb589d5603fe7761550bf885703f3f9865af56 Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 5 Oct 2018 05:50:56 +0000 Subject: eliminate locking surrounding ui_vmsize and swap reserve by using atomics Change swap_reserve and swap_total to be in units of pages so that swap reservations can be done using only atomics instead of using a single global mutex for swap_reserve and a single mutex for all processes running under the same uid for uid accounting. Results in mmap speed up and a 70% increase in brk calls / second. Reviewed by: alc@, markj@, kib@ Approved by: re (delphij@) Differential Revision: https://reviews.freebsd.org/D16273 --- sys/kern/kern_resource.c | 3 -- sys/sys/resourcevar.h | 4 +- sys/vm/swap_pager.c | 115 +++++++++++++++++++++++++---------------------- 3 files changed, 63 insertions(+), 59 deletions(-) (limited to 'sys') diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index bed34bf4a7fe..271339e5c4c4 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1276,7 +1276,6 @@ uifind(uid_t uid) racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; - mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* @@ -1291,7 +1290,6 @@ uifind(uid_t uid) } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); - mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); @@ -1352,7 +1350,6 @@ uifree(struct uidinfo *uip) if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); - mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h index c61ac3e1a6cf..a60eb3b6dbb3 100644 --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -93,12 +93,10 @@ struct racct; * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock - * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ - struct mtx ui_vmsize_mtx; - vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ + u_long ui_vmsize; /* (b) pages of swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 0bd4bde8b57e..76a574883619 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -151,12 +151,16 @@ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ -static vm_ooffset_t swap_total; -SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, - "Total amount of available swap storage."); -static vm_ooffset_t swap_reserved; -SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, +static u_long swap_reserved; +static u_long swap_total; +static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); +SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &swap_total, 0, sysctl_page_shift, "A", + "Total amount of available swap storage."); + static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " @@ -173,6 +177,16 @@ SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) +static int +sysctl_page_shift(SYSCTL_HANDLER_ARGS) +{ + uint64_t newval; + u_long value = *(u_long *)arg1; + + newval = ((uint64_t)value) << PAGE_SHIFT; + return (sysctl_handle_64(oidp, &newval, 0, req)); +} + int swap_reserve(vm_ooffset_t incr) { @@ -183,7 +197,7 @@ swap_reserve(vm_ooffset_t incr) int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { - vm_ooffset_t r, s; + u_long r, s, prev, pincr; int res, error; static int curfail; static struct timeval lastfail; @@ -191,8 +205,8 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) uip = cred->cr_ruidinfo; - if (incr & PAGE_MASK) - panic("swap_reserve: & PAGE_MASK"); + KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, + (uintmax_t)incr)); #ifdef RACCT if (racct_enable) { @@ -204,36 +218,33 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) } #endif + pincr = atop(incr); res = 0; - mtx_lock(&sw_dev_mtx); - r = swap_reserved + incr; + prev = atomic_fetchadd_long(&swap_reserved, pincr); + r = prev + pincr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); - s *= PAGE_SIZE; } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; - swap_reserved = r; + } else { + prev = atomic_fetchadd_long(&swap_reserved, -pincr); + if (prev < pincr) + panic("swap_reserved < incr on overcommit fail"); } - mtx_unlock(&sw_dev_mtx); - if (res) { - UIDINFO_VMSIZE_LOCK(uip); + prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && - uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && - priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) + prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && + priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) { res = 0; - else - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); - if (!res) { - mtx_lock(&sw_dev_mtx); - swap_reserved -= incr; - mtx_unlock(&sw_dev_mtx); + prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); + if (prev < pincr) + panic("uip->ui_vmsize < incr on overcommit fail"); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { @@ -242,7 +253,7 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) } #ifdef RACCT - if (!res) { + if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); @@ -256,22 +267,20 @@ void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; + u_long pincr; - mtx_lock(&sw_dev_mtx); - swap_reserved += incr; - mtx_unlock(&sw_dev_mtx); + KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, + (uintmax_t)incr)); -#ifdef RACCT PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_SWAP, incr); - PROC_UNLOCK(curproc); +#ifdef RACCT + if (racct_enable) + racct_add_force(curproc, RACCT_SWAP, incr); #endif - - uip = curthread->td_ucred->cr_ruidinfo; - PROC_LOCK(curproc); - UIDINFO_VMSIZE_LOCK(uip); - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); + pincr = atop(incr); + atomic_add_long(&swap_reserved, pincr); + uip = curproc->p_ucred->cr_ruidinfo; + atomic_add_long(&uip->ui_vmsize, pincr); PROC_UNLOCK(curproc); } @@ -281,7 +290,7 @@ swap_release(vm_ooffset_t decr) struct ucred *cred; PROC_LOCK(curproc); - cred = curthread->td_ucred; + cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } @@ -289,26 +298,26 @@ swap_release(vm_ooffset_t decr) void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { + u_long prev, pdecr; struct uidinfo *uip; uip = cred->cr_ruidinfo; - if (decr & PAGE_MASK) - panic("swap_release: & PAGE_MASK"); + KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, + (uintmax_t)decr)); - mtx_lock(&sw_dev_mtx); - if (swap_reserved < decr) + pdecr = atop(decr); + prev = atomic_fetchadd_long(&swap_reserved, -pdecr); + if (prev < pdecr) panic("swap_reserved < decr"); - swap_reserved -= decr; - mtx_unlock(&sw_dev_mtx); - UIDINFO_VMSIZE_LOCK(uip); - if (uip->ui_vmsize < decr) + prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); + if (prev < pdecr) printf("negative vmsize for uid = %d\n", uip->ui_uid); - uip->ui_vmsize -= decr; - UIDINFO_VMSIZE_UNLOCK(uip); - - racct_sub_cred(cred, RACCT_SWAP, decr); +#ifdef RACCT + if (racct_enable) + racct_sub_cred(cred, RACCT_SWAP, decr); +#endif } #define SWM_POP 0x01 /* pop out */ @@ -2176,7 +2185,7 @@ swapon_check_swzone(void) { unsigned long maxpages, npages; - npages = swap_total / PAGE_SIZE; + npages = swap_total; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES; @@ -2254,7 +2263,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - 2; - swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; + swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); @@ -2351,7 +2360,7 @@ swapoff_one(struct swdevt *sp, struct ucred *cred) mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); - swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; + swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* -- cgit v1.3 From d9f1b8dbf29d1980a0f80c5c78b1b5867b5bb8d2 Mon Sep 17 00:00:00 2001 From: Matt Macy Date: Fri, 5 Oct 2018 05:55:56 +0000 Subject: hwpmc: Refactor sample ring buffer handling to fix races Refactor sample ring buffer ring handling to make it more robust to long running callchain collection handling r338112 introduced a (now fixed) regression that exposed a number of race conditions within the management of the sample buffers. This simplifies the handling and moves the decision to overwrite a callchain sample that has taken too long out of the NMI in to the hardlock handler. With this change the problem no longer shows up as a ring corruption but as the code spending all of its time in callchain collection. - Makes the producer / consumer index incrementing monotonic, making it easier (for me at least) to reason about. - Moves the decision to overwrite a sample from NMI context to interrupt context where we can enforce serialization. - Puts a time limit on waiting to collect a user callchain - putting a bound on head-of-line blocking causing samples to be dropped - Removes the flush routine which was previously needed to purge dangling references to the pmc from the sample buffers but now is only a source of a race condition on unload. Previously one could lock up or crash HEAD by running: pmcstat -S inst_retired.any_p -T and then hitting ^C After this change it is no longer possible. PR: 231793 Reviewed by: markj@ Approved by: re (gjb@) Differential Revision: https://reviews.freebsd.org/D17011 --- sys/dev/hwpmc/hwpmc_logging.c | 15 ++- sys/dev/hwpmc/hwpmc_mod.c | 220 +++++++++++++++++++----------------------- sys/sys/pmc.h | 18 +++- sys/sys/pmckern.h | 10 +- 4 files changed, 127 insertions(+), 136 deletions(-) (limited to 'sys') diff --git a/sys/dev/hwpmc/hwpmc_logging.c b/sys/dev/hwpmc/hwpmc_logging.c index 4d2e08fe157a..8764ac9e922c 100644 --- a/sys/dev/hwpmc/hwpmc_logging.c +++ b/sys/dev/hwpmc/hwpmc_logging.c @@ -234,7 +234,7 @@ static void pmclog_loop(void *arg); static void pmclog_release(struct pmc_owner *po); static uint32_t *pmclog_reserve(struct pmc_owner *po, int length); static void pmclog_schedule_io(struct pmc_owner *po, int wakeup); -static void pmclog_schedule_all(struct pmc_owner *po, int force); +static void pmclog_schedule_all(struct pmc_owner *po); static void pmclog_stop_kthread(struct pmc_owner *po); /* @@ -842,7 +842,7 @@ pmclog_flush(struct pmc_owner *po, int force) goto error; } - pmclog_schedule_all(po, force); + pmclog_schedule_all(po); error: mtx_unlock(&pmc_kthread_mtx); @@ -850,7 +850,7 @@ pmclog_flush(struct pmc_owner *po, int force) } static void -pmclog_schedule_one_cond(struct pmc_owner *po, int force) +pmclog_schedule_one_cond(struct pmc_owner *po) { struct pmclog_buffer *plb; int cpu; @@ -860,8 +860,7 @@ pmclog_schedule_one_cond(struct pmc_owner *po, int force) /* tell hardclock not to run again */ if (PMC_CPU_HAS_SAMPLES(cpu)) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); - if (force) - pmc_flush_samples(cpu); + plb = po->po_curbuf[cpu]; if (plb && plb->plb_ptr != plb->plb_base) pmclog_schedule_io(po, 1); @@ -869,7 +868,7 @@ pmclog_schedule_one_cond(struct pmc_owner *po, int force) } static void -pmclog_schedule_all(struct pmc_owner *po, int force) +pmclog_schedule_all(struct pmc_owner *po) { /* * Schedule the current buffer if any and not empty. @@ -878,7 +877,7 @@ pmclog_schedule_all(struct pmc_owner *po, int force) thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); - pmclog_schedule_one_cond(po, force); + pmclog_schedule_one_cond(po); } thread_lock(curthread); sched_unbind(curthread); @@ -905,7 +904,7 @@ pmclog_close(struct pmc_owner *po) /* * Schedule the current buffer. */ - pmclog_schedule_all(po, 0); + pmclog_schedule_all(po); wakeup_one(po); mtx_unlock(&pmc_kthread_mtx); diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c index 20df37d24bd6..1fad25c55702 100644 --- a/sys/dev/hwpmc/hwpmc_mod.c +++ b/sys/dev/hwpmc/hwpmc_mod.c @@ -211,7 +211,7 @@ static int pmc_debugflags_parse(char *newstr, char *fence); #endif static int load(struct module *module, int cmd, void *arg); -static int pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf); +static int pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf); static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp); static int pmc_attach_process(struct proc *p, struct pmc *pm); @@ -249,7 +249,7 @@ static void pmc_process_csw_out(struct thread *td); static void pmc_process_exit(void *arg, struct proc *p); static void pmc_process_fork(void *arg, struct proc *p1, struct proc *p2, int n); -static void pmc_process_samples(int cpu, int soft); +static void pmc_process_samples(int cpu, ring_type_t soft); static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_process_thread_add(struct thread *td); static void pmc_process_thread_delete(struct thread *td); @@ -342,6 +342,7 @@ static int pmc_nsamples = PMC_NSAMPLES; SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_RDTUN, &pmc_nsamples, 0, "number of PC samples per CPU"); +static uint64_t pmc_sample_mask = PMC_NSAMPLES-1; /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool. @@ -1402,6 +1403,10 @@ pmc_process_csw_in(struct thread *td) if (pm->pm_state != PMC_STATE_RUNNING) continue; + KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + /* increment PMC runcount */ counter_u64_add(pm->pm_runcount, 1); @@ -1596,6 +1601,10 @@ pmc_process_csw_out(struct thread *td) if (pm->pm_pcpu_state[cpu].pps_stalled == 0) pcd->pcd_stop_pmc(cpu, adjri); + KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + /* reduce this PMC's runcount */ counter_u64_add(pm->pm_runcount, -1); @@ -2725,7 +2734,7 @@ pmc_destroy_pmc_descriptor(struct pmc *pm) static void pmc_wait_for_pmc_idle(struct pmc *pm) { -#ifdef HWPMC_DEBUG +#ifdef INVARIANTS volatile int maxloop; maxloop = 100 * pmc_cpu_max(); @@ -2737,7 +2746,7 @@ pmc_wait_for_pmc_idle(struct pmc *pm) pmclog_flush(pm->pm_owner, 1); while (counter_u64_fetch(pm->pm_runcount) > 0) { pmclog_flush(pm->pm_owner, 1); -#ifdef HWPMC_DEBUG +#ifdef INVARIANTS maxloop--; KASSERT(maxloop > 0, ("[pmc,%d] (ri%d, rc%ld) waiting too long for " @@ -4657,7 +4666,7 @@ pmc_post_callchain_callback(void) */ static int -pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) +pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf) { int error, cpu, callchaindepth, inuserspace; struct thread *td; @@ -4672,18 +4681,15 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) cpu = curcpu; psb = pmc_pcpu[cpu]->pc_sb[ring]; inuserspace = TRAPF_USERMODE(tf); - ps = psb->ps_write; - if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { - counter_u64_add(ps->ps_pmc->pm_runcount, -1); - counter_u64_add(pmc_stats.pm_overwrites, 1); - ps->ps_nsamples = 0; - } else if (ps->ps_nsamples) { /* in use, reader hasn't caught up */ + ps = PMC_PROD_SAMPLE(psb); + if (psb->ps_considx != psb->ps_prodidx && + ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_pcpu_state[cpu].pps_stalled = 1; counter_u64_add(pmc_stats.pm_intr_bufferfull, 1); PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); callchaindepth = 1; error = ENOMEM; goto done; @@ -4692,14 +4698,8 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) /* Fill in entry. */ PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); - - KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, - ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, - (unsigned long)counter_u64_fetch(pm->pm_runcount))); - - counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); td = curthread; ps->ps_pmc = pm; @@ -4707,13 +4707,14 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) ps->ps_pid = td->td_proc->p_pid; ps->ps_tid = td->td_tid; ps->ps_tsc = pmc_rdtsc(); - + ps->ps_ticks = ticks; ps->ps_cpu = cpu; ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? pmc_callchaindepth : 1; + MPASS(ps->ps_pc != NULL); if (callchaindepth == 1) ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); else { @@ -4727,26 +4728,27 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf) callchaindepth, tf); } else { pmc_post_callchain_callback(); - callchaindepth = PMC_SAMPLE_INUSE; + callchaindepth = PMC_USER_CALLCHAIN_PENDING; } } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ if (ring == PMC_UR) { ps->ps_nsamples_actual = callchaindepth; /* mark entry as in use */ - ps->ps_nsamples = PMC_SAMPLE_INUSE; + ps->ps_nsamples = PMC_USER_CALLCHAIN_PENDING; } else ps->ps_nsamples = callchaindepth; /* mark entry as in use */ - /* increment write pointer, modulo ring buffer size */ - ps++; - if (ps == psb->ps_fence) - psb->ps_write = psb->ps_samples; - else - psb->ps_write = ps; + KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); + + counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ + /* increment write pointer */ + psb->ps_prodidx++; done: /* mark CPU as needing processing */ - if (callchaindepth != PMC_SAMPLE_INUSE) + if (callchaindepth != PMC_USER_CALLCHAIN_PENDING) DPCPU_SET(pmc_sampled, 1); return (error); @@ -4785,14 +4787,15 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) { struct pmc *pm; struct thread *td; - struct pmc_sample *ps, *ps_end; + struct pmc_sample *ps; struct pmc_samplebuffer *psb; - int nsamples, nrecords, pass; + uint64_t considx, prodidx; + int nsamples, nrecords, pass, iter; #ifdef INVARIANTS int ncallchains; int nfree; + int start_ticks = ticks; #endif - psb = pmc_pcpu[cpu]->pc_sb[ring]; td = curthread; @@ -4810,29 +4813,30 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) if (ring == PMC_UR) nrecords = atomic_readandclear_32(&td->td_pmcpend); + for (iter = 0, considx = psb->ps_considx, prodidx = psb->ps_prodidx; + considx < prodidx && iter < pmc_nsamples; considx++, iter++) { + ps = PMC_CONS_SAMPLE_OFF(psb, considx); + /* * Iterate through all deferred callchain requests. * Walk from the current read pointer to the current * write pointer. */ - ps = psb->ps_read; - ps_end = psb->ps_write; - do { #ifdef INVARIANTS if (ps->ps_nsamples == PMC_SAMPLE_FREE) { nfree++; - goto next; + continue; } if ((ps->ps_pmc == NULL) || (ps->ps_pmc->pm_state != PMC_STATE_RUNNING)) nfree++; #endif - if (ps->ps_nsamples != PMC_SAMPLE_INUSE) - goto next; - if (ps->ps_td != td) - goto next; + if (ps->ps_td != td || + ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING || + ps->ps_pmc->pm_state != PMC_STATE_RUNNING) + continue; KASSERT(ps->ps_cpu == cpu, ("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__, @@ -4865,15 +4869,28 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) if (__predict_true(nsamples < pmc_callchaindepth - 1)) nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples, pmc_callchaindepth - nsamples - 1, tf); - wmb(); - ps->ps_nsamples = nsamples; + + /* + * We have to prevent hardclock from potentially overwriting + * this sample between when we read the value and when we set + * it + */ + spinlock_enter(); + /* + * Verify that the sample hasn't been dropped in the meantime + */ + if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { + ps->ps_nsamples = nsamples; + /* + * If we couldn't get a sample, simply drop the reference + */ + if (nsamples == 0) + counter_u64_add(pm->pm_runcount, -1); + } + spinlock_exit(); if (nrecords-- == 1) break; -next: - /* increment the pointer, modulo sample ring size */ - if (++ps == psb->ps_fence) - ps = psb->ps_samples; - } while (ps != ps_end); + } if (__predict_false(ring == PMC_UR && td->td_pmcpend)) { if (pass == 0) { pass = 1; @@ -4884,60 +4901,20 @@ next: } #ifdef INVARIANTS - if (ring == PMC_HR) - KASSERT(ncallchains > 0 || nfree > 0, - ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__, - cpu)); + if ((ticks - start_ticks) > hz) + log(LOG_ERR, "%s took %d ticks\n", __func__, (ticks - start_ticks)); #endif /* mark CPU as needing processing */ DPCPU_SET(pmc_sampled, 1); } - -static void -pmc_flush_ring(int cpu, int ring) -{ - struct pmc *pm; - struct pmc_sample *ps; - struct pmc_samplebuffer *psb; - int n; - - psb = pmc_pcpu[cpu]->pc_sb[ring]; - - for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ - - ps = psb->ps_read; - if (ps->ps_nsamples == PMC_SAMPLE_FREE) - goto next; - pm = ps->ps_pmc; - counter_u64_add(pm->pm_runcount, -1); - ps->ps_nsamples = PMC_SAMPLE_FREE; - /* increment read pointer, modulo sample size */ - next: - if (++ps == psb->ps_fence) - psb->ps_read = psb->ps_samples; - else - psb->ps_read = ps; - } -} - -void -pmc_flush_samples(int cpu) -{ - int n; - - for (n = 0; n < PMC_NUM_SR; n++) - pmc_flush_ring(cpu, n); -} - - /* * Process saved PC samples. */ static void -pmc_process_samples(int cpu, int ring) +pmc_process_samples(int cpu, ring_type_t ring) { struct pmc *pm; int adjri, n; @@ -4946,20 +4923,25 @@ pmc_process_samples(int cpu, int ring) struct pmc_sample *ps; struct pmc_classdep *pcd; struct pmc_samplebuffer *psb; + uint64_t delta; KASSERT(PCPU_GET(cpuid) == cpu, ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__, PCPU_GET(cpuid), cpu)); psb = pmc_pcpu[cpu]->pc_sb[ring]; + delta = psb->ps_prodidx - psb->ps_considx; + MPASS(delta <= pmc_nsamples); + MPASS(psb->ps_considx <= psb->ps_prodidx); + for (n = 0; psb->ps_considx < psb->ps_prodidx; psb->ps_considx++, n++) { + ps = PMC_CONS_SAMPLE(psb); - for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ - - ps = psb->ps_read; - if (ps->ps_nsamples == PMC_SAMPLE_FREE) - break; - + if (__predict_false(ps->ps_nsamples == PMC_SAMPLE_FREE)) + continue; pm = ps->ps_pmc; + /* skip non-running samples */ + if (pm->pm_state != PMC_STATE_RUNNING) + goto entrydone; KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, @@ -4971,12 +4953,19 @@ pmc_process_samples(int cpu, int ring) ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__, pm, PMC_TO_MODE(pm))); - /* Ignore PMCs that have been switched off */ - if (pm->pm_state != PMC_STATE_RUNNING) - goto entrydone; /* If there is a pending AST wait for completion */ - if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { + if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { + /* if sample is more than 65 ms old, drop it */ + if (ticks - ps->ps_ticks > (hz >> 4)) { + /* + * track how often we hit this as it will + * preferentially lose user samples + * for long running system calls + */ + counter_u64_add(pmc_stats.pm_overwrites, 1); + goto entrydone; + } /* Need a rescan at a later time. */ DPCPU_SET(pmc_sampled, 1); break; @@ -4984,8 +4973,8 @@ pmc_process_samples(int cpu, int ring) PMCDBG6(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu, pm, ps->ps_nsamples, ps->ps_flags, - (int) (psb->ps_write - psb->ps_samples), - (int) (psb->ps_read - psb->ps_samples)); + (int) (psb->ps_prodidx & pmc_sample_mask), + (int) (psb->ps_considx & pmc_sample_mask)); /* * If this is a process-mode PMC that is attached to @@ -5008,13 +4997,11 @@ pmc_process_samples(int cpu, int ring) entrydone: ps->ps_nsamples = 0; /* mark entry as free */ - counter_u64_add(pm->pm_runcount, -1); + KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, + ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, + (unsigned long)counter_u64_fetch(pm->pm_runcount))); - /* increment read pointer, modulo sample size */ - if (++ps == psb->ps_fence) - psb->ps_read = psb->ps_samples; - else - psb->ps_read = ps; + counter_u64_add(pm->pm_runcount, -1); } counter_u64_add(pmc_stats.pm_log_sweeps, 1); @@ -5197,11 +5184,11 @@ pmc_process_exit(void *arg __unused, struct proc *p) } } - counter_u64_add(pm->pm_runcount, -1); - - KASSERT((int) counter_u64_fetch(pm->pm_runcount) >= 0, + KASSERT((int64_t) counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] runcount is %d", __LINE__, ri)); + counter_u64_add(pm->pm_runcount, -1); + (void) pcd->pcd_config_pmc(cpu, adjri, NULL); } @@ -5583,6 +5570,7 @@ pmc_initialize(void) "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } + pmc_sample_mask = pmc_nsamples-1; if (pmc_callchaindepth <= 0 || pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { @@ -5658,8 +5646,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); @@ -5676,8 +5662,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); @@ -5694,8 +5678,6 @@ pmc_initialize(void) sb = malloc_domain(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain, M_WAITOK|M_ZERO); - sb->ps_read = sb->ps_write = sb->ps_samples; - sb->ps_fence = sb->ps_samples + pmc_nsamples; KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); diff --git a/sys/sys/pmc.h b/sys/sys/pmc.h index 0c7a3a331abd..be4c1cda03b0 100644 --- a/sys/sys/pmc.h +++ b/sys/sys/pmc.h @@ -936,6 +936,8 @@ struct pmc_sample { uint16_t ps_flags; /* other flags */ lwpid_t ps_tid; /* thread id */ pid_t ps_pid; /* process PID or -1 */ + int ps_ticks; /* ticks at sample time */ + /* pad */ struct thread *ps_td; /* which thread */ struct pmc *ps_pmc; /* interrupting PMC */ uintptr_t *ps_pc; /* (const) callchain start */ @@ -943,16 +945,23 @@ struct pmc_sample { }; #define PMC_SAMPLE_FREE ((uint16_t) 0) -#define PMC_SAMPLE_INUSE ((uint16_t) 0xFFFF) +#define PMC_USER_CALLCHAIN_PENDING ((uint16_t) 0xFFFF) struct pmc_samplebuffer { - struct pmc_sample * volatile ps_read; /* read pointer */ - struct pmc_sample * volatile ps_write; /* write pointer */ + volatile uint64_t ps_prodidx; /* producer index */ + volatile uint64_t ps_considx; /* consumer index */ uintptr_t *ps_callchains; /* all saved call chains */ - struct pmc_sample *ps_fence; /* one beyond ps_samples[] */ struct pmc_sample ps_samples[]; /* array of sample entries */ }; +#define PMC_CONS_SAMPLE(psb) \ + (&(psb)->ps_samples[(psb)->ps_considx & pmc_sample_mask]) + +#define PMC_CONS_SAMPLE_OFF(psb, off) \ + (&(psb)->ps_samples[(off) & pmc_sample_mask]) + +#define PMC_PROD_SAMPLE(psb) \ + (&(psb)->ps_samples[(psb)->ps_prodidx & pmc_sample_mask]) /* * struct pmc_cpustate @@ -1216,7 +1225,6 @@ int pmc_save_user_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); struct pmc_mdep *pmc_mdep_alloc(int nclasses); void pmc_mdep_free(struct pmc_mdep *md); -void pmc_flush_samples(int cpu); uint64_t pmc_rdtsc(void); #endif /* _KERNEL */ #endif /* _SYS_PMC_H_ */ diff --git a/sys/sys/pmckern.h b/sys/sys/pmckern.h index 27e02af73cf2..e892d658a1ca 100644 --- a/sys/sys/pmckern.h +++ b/sys/sys/pmckern.h @@ -67,10 +67,12 @@ #define PMC_FN_THR_EXIT_LOG 16 #define PMC_FN_PROC_CREATE_LOG 17 -#define PMC_HR 0 /* Hardware ring buffer */ -#define PMC_SR 1 /* Software ring buffer */ -#define PMC_UR 2 /* userret ring buffer */ -#define PMC_NUM_SR (PMC_UR+1) +typedef enum ring_type { + PMC_HR = 0, /* Hardware ring buffer */ + PMC_SR = 1, /* Software ring buffer */ + PMC_UR = 2, /* userret ring buffer */ + PMC_NUM_SR = PMC_UR+1 +} ring_type_t; struct pmckern_procexec { int pm_credentialschanged; -- cgit v1.3 From b6e870116f9c9ddc8cb80e35de670011921d0ca5 Mon Sep 17 00:00:00 2001 From: Tom Jones Date: Fri, 5 Oct 2018 12:51:30 +0000 Subject: Convert UDP length to host byte order When getting the number of bytes to checksum make sure to convert the UDP length to host byte order when the entire header is not in the first mbuf. Reviewed by: jtl, tuexen, ae Approved by: re (gjb), jtl (mentor) Differential Revision: https://reviews.freebsd.org/D17357 --- sys/netinet/ip_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'sys') diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 9d7b9cbe8661..a08806a686bd 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -932,10 +932,11 @@ in_delayed_cksum(struct mbuf *m) if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if udp header is not in the first mbuf copy udplen */ - if (offset + sizeof(struct udphdr) > m->m_len) + if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); - else { + cklen = ntohs(cklen); + } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } -- cgit v1.3 From 1f55b2a4b5a69db7ef1b94a87c0c60efe565520c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 5 Oct 2018 16:05:59 +0000 Subject: Add sysctls for dbuf metadata cache variables added in r336959. Approved by: re (gjb) MFC after: 1 week --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 0a352b1e9798..9012baa0a994 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -292,8 +292,15 @@ uint_t dbuf_cache_lowater_pct = 10; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, + &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, + &dbuf_metadata_cache_shift, 0, + "dbuf metadata cache size as log2 fraction of ARC"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, + &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, -- cgit v1.3