Diffstat (limited to 'sys')
216 files changed, 5204 insertions, 1859 deletions
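A recurring change below: the vmm(4) headers replace a long list of per-operation function-pointer typedefs with a DECLARE_VMMOPS_FUNC() macro that emits both the typedef and the extern prototype from one invocation (the prototypes are needed because vmm.c now defines the vmmops_*() ifunc resolvers with external linkage via DEFINE_IFUNC(, ...)). As an illustrative expansion, not kernel text itself, the amd64 form of the macro behaves as follows for the getreg operation:

	#define DECLARE_VMMOPS_FUNC(ret_type, opname, args)		\
		typedef ret_type (*vmmops_##opname##_t) args;		\
		ret_type vmmops_##opname args

	/*
	 * DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num,
	 *     uint64_t *retval));
	 * expands to:
	 */
	typedef int (*vmmops_getreg_t)(void *vcpui, int num, uint64_t *retval);
	int vmmops_getreg(void *vcpui, int num, uint64_t *retval);

Generating the struct vmm_ops member type and the wrapper prototype from one macro keeps the two from drifting apart as operations are added or renamed.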
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 6cc2d58bbbcc..933f1ac0051f 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -179,7 +179,7 @@ freebsd_brand_info_la57_img_compat(const struct image_params *imgp, return (!prefer_uva_la48); } -static Elf64_Brandinfo freebsd_brand_info_la48 = { +static const Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -190,7 +190,7 @@ static Elf64_Brandinfo freebsd_brand_info_la48 = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, }; -static Elf64_Brandinfo freebsd_brand_info_la57 = { +static const Elf64_Brandinfo freebsd_brand_info_la57 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -216,7 +216,7 @@ sysinit_register_elf64_brand_entries(void *arg __unused) SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, sysinit_register_elf64_brand_entries, NULL); -static Elf64_Brandinfo freebsd_brand_oinfo = { +static const Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -226,11 +226,10 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); -static Elf64_Brandinfo kfreebsd_brand_info = { +static const Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", @@ -240,8 +239,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index e35119af8572..ad67510fecf3 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -170,55 +170,63 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; -typedef int (*vmm_init_func_t)(int ipinum); -typedef int (*vmm_cleanup_func_t)(void); -typedef void (*vmm_suspend_func_t)(void); -typedef void (*vmm_resume_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); -typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, - struct pmap *pmap, struct vm_eventinfo *info); -typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, - int vcpu_id); -typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); -typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); -typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); -typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); -typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); -typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); -typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); -typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); -typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); -typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); -typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); -typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); -typedef int (*vmi_restore_tsc_t)(void 
*vcpui, uint64_t now); +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + typedef ret_type (*vmmops_##opname##_t) args; \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void, modresume, (void)); +DECLARE_VMMOPS_FUNC(void, modsuspend, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, + struct pmap *pmap, struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getdesc, (void *vcpui, int num, + struct seg_desc *desc)); +DECLARE_VMMOPS_FUNC(int, setdesc, (void *vcpui, int num, + struct seg_desc *desc)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, + (vm_offset_t min, vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); +DECLARE_VMMOPS_FUNC(struct vlapic *, vlapic_init, (void *vcpui)); +DECLARE_VMMOPS_FUNC(void, vlapic_cleanup, (struct vlapic *vlapic)); +DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); struct vmm_ops { - vmm_init_func_t modinit; /* module wide initialization */ - vmm_cleanup_func_t modcleanup; - vmm_resume_func_t modsuspend; - vmm_resume_func_t modresume; - - vmi_init_func_t init; /* vm-specific initialization */ - vmi_run_func_t run; - vmi_cleanup_func_t cleanup; - vmi_vcpu_init_func_t vcpu_init; - vmi_vcpu_cleanup_func_t vcpu_cleanup; - vmi_get_register_t getreg; - vmi_set_register_t setreg; - vmi_get_desc_t getdesc; - vmi_set_desc_t setdesc; - vmi_get_cap_t getcap; - vmi_set_cap_t setcap; - vmi_vmspace_alloc vmspace_alloc; - vmi_vmspace_free vmspace_free; - vmi_vlapic_init vlapic_init; - vmi_vlapic_cleanup vlapic_cleanup; + vmmops_modinit_t modinit; /* module wide initialization */ + vmmops_modcleanup_t modcleanup; + vmmops_modresume_t modsuspend; + vmmops_modresume_t modresume; + + vmmops_init_t init; /* vm-specific initialization */ + vmmops_run_t run; + vmmops_cleanup_t cleanup; + vmmops_vcpu_init_t vcpu_init; + vmmops_vcpu_cleanup_t vcpu_cleanup; + vmmops_getreg_t getreg; + vmmops_setreg_t setreg; + vmmops_getdesc_t getdesc; + vmmops_setdesc_t setdesc; + vmmops_getcap_t getcap; + vmmops_setcap_t setcap; + vmmops_vmspace_alloc_t vmspace_alloc; + vmmops_vmspace_free_t vmspace_free; + vmmops_vlapic_init_t vlapic_init; + vmmops_vlapic_cleanup_t vlapic_cleanup; /* checkpoint operations */ - vmi_snapshot_vcpu_t vcpu_snapshot; - vmi_restore_tsc_t restore_tsc; + vmmops_vcpu_snapshot_t vcpu_snapshot; + vmmops_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; @@ -229,7 +237,7 @@ extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int 
vm_reinit(struct vm *vm); @@ -354,6 +362,7 @@ enum vcpu_state { }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); +int vcpu_set_state_all(struct vm *vm, enum vcpu_state state); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline @@ -375,7 +384,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index c8579c5da4ad..890cf01c46a0 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -857,7 +857,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -866,7 +866,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -877,7 +877,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_glibc2brandshort = { +static const Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -888,7 +888,7 @@ static Elf64_Brandinfo linux_glibc2brandshort = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf64_Brandinfo linux_muslbrand = { +static const Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", @@ -900,7 +900,7 @@ static Elf64_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf64_Brandinfo *linux_brandlist[] = { +static const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, @@ -910,7 +910,7 @@ static Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; + const Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 8fac626f9053..735ebb151017 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -954,7 +954,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux32_brandnote = { +static const Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -963,7 +963,7 @@ static Elf_Brandnote linux32_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf32_Brandinfo linux_brand = { +static const Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -974,7 +974,7 @@ static Elf32_Brandinfo linux_brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_glibc2brand = { +static const Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -985,7 +985,7 @@ static Elf32_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_muslbrand = { +static 
const Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -997,7 +997,7 @@ static Elf32_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -static Elf32_Brandinfo *linux_brandlist[] = { +static const Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, @@ -1007,7 +1007,7 @@ static Elf32_Brandinfo *linux_brandlist[] = { static int linux_elf_modevent(module_t mod, int type, void *data) { - Elf32_Brandinfo **brandinfo; + const Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c index c7b75767680a..6b2296de049c 100644 --- a/sys/amd64/pt/pt.c +++ b/sys/amd64/pt/pt.c @@ -42,15 +42,15 @@ */ #include <sys/systm.h> +#include <sys/bus.h> #include <sys/hwt.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> -#include <sys/sdt.h> #include <sys/smp.h> -#include <sys/taskqueue.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -94,12 +94,7 @@ MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); -SDT_PROVIDER_DEFINE(pt); -SDT_PROBE_DEFINE(pt, , , topa__intr); - -TASKQUEUE_FAST_DEFINE_THREAD(pt); - -static void pt_send_buffer_record(void *arg, int pending __unused); +static void pt_send_buffer_record(void *arg); static int pt_topa_intr(struct trapframe *tf); /* @@ -122,29 +117,24 @@ struct pt_buffer { size_t size; struct mtx lock; /* Lock for fields below. */ vm_offset_t offset; - uint64_t wrap_count; - int curpage; }; struct pt_ctx { int id; struct pt_buffer buf; /* ToPA buffer metadata */ - struct task task; /* ToPA buffer notification task */ struct hwt_context *hwt_ctx; uint8_t *save_area; /* PT XSAVE area */ }; /* PT tracing contexts used for CPU mode. */ static struct pt_ctx *pt_pcpu_ctx; -enum pt_cpu_state { - PT_DISABLED = 0, - PT_STOPPED, - PT_ACTIVE -}; +enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE }; static struct pt_cpu { struct pt_ctx *ctx; /* active PT tracing context */ enum pt_cpu_state state; /* used as part of trace stop protocol */ + void *swi_cookie; /* Software interrupt handler context */ + int in_pcint_handler; } *pt_pcpu; /* @@ -199,31 +189,28 @@ static __inline void pt_update_buffer(struct pt_buffer *buf) { uint64_t reg; - int curpage; + uint64_t offset; /* Update buffer offset. */ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); - curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; - mtx_lock_spin(&buf->lock); - /* Check if the output wrapped. 
*/ - if (buf->curpage > curpage) - buf->wrap_count++; - buf->curpage = curpage; - buf->offset = reg >> 32; - mtx_unlock_spin(&buf->lock); - - dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, - buf->wrap_count, buf->curpage, buf->offset); + offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE; + offset += (reg >> 32); + + atomic_store_rel_64(&buf->offset, offset); } static __inline void pt_fill_buffer_record(int id, struct pt_buffer *buf, struct hwt_record_entry *rec) { + vm_offset_t offset; + + offset = atomic_load_acq_64(&buf->offset); + rec->record_type = HWT_RECORD_BUFFER; rec->buf_id = id; - rec->curpage = buf->curpage; - rec->offset = buf->offset + (buf->wrap_count * buf->size); + rec->curpage = offset / PAGE_SIZE; + rec->offset = offset & PAGE_MASK; } /* @@ -273,9 +260,9 @@ pt_cpu_start(void *dummy) MPASS(cpu->ctx != NULL); dprintf("%s: curcpu %d\n", __func__, curcpu); + pt_cpu_set_state(curcpu, PT_ACTIVE); load_cr4(rcr4() | CR4_XSAVE); wrmsr(MSR_IA32_RTIT_STATUS, 0); - pt_cpu_set_state(curcpu, PT_ACTIVE); pt_cpu_toggle_local(cpu->ctx->save_area, true); } @@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy) struct pt_cpu *cpu; struct pt_ctx *ctx; - /* Shutdown may occur before PT gets properly configured. */ - if (pt_cpu_get_state(curcpu) == PT_DISABLED) - return; - cpu = &pt_pcpu[curcpu]; ctx = cpu->ctx; - MPASS(ctx != NULL); - dprintf("%s: curcpu %d\n", __func__, curcpu); - pt_cpu_set_state(curcpu, PT_STOPPED); + dprintf("%s: curcpu %d\n", __func__, curcpu); + /* Shutdown may occur before PT gets properly configured. */ + if (ctx == NULL) { + dprintf("%s: missing context on cpu %d; bailing\n", __func__, + curcpu); + return; + } pt_cpu_toggle_local(cpu->ctx->save_area, false); pt_update_buffer(&ctx->buf); } @@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) return (ENOMEM); dprintf("%s: preparing ToPA buffer\n", __func__); if (pt_topa_prepare(pt_ctx, vm) != 0) { - dprintf("%s: failed to prepare ToPA buffer\n", __func__); free(pt_ctx->save_area, M_PT); return (ENOMEM); } pt_ctx->id = ctx_id; - TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); return (0); } @@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx) if (pt_ctx->save_area != NULL) free(pt_ctx->save_area, M_PT); memset(pt_ctx, 0, sizeof(*pt_ctx)); - pt_ctx->buf.topa_hw = NULL; } /* @@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) XSTATE_XCOMP_BV_COMPACT; pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; pt_pcpu[cpu_id].ctx = pt_ctx; - pt_cpu_set_state(cpu_id, PT_STOPPED); return (0); } @@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) if (ctx->mode == HWT_MODE_CPU) return; - KASSERT(curcpu == cpu_id, ("%s: attempting to disable PT on another cpu", __func__)); + + cpu = &pt_pcpu[cpu_id]; + + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__, + cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + pt_cpu_stop(NULL); CPU_CLR(cpu_id, &ctx->cpu_map); - cpu = &pt_pcpu[cpu_id]; cpu->ctx = NULL; } @@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) static int pt_backend_enable_smp(struct hwt_context *ctx) { - dprintf("%s\n", __func__); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); if (ctx->mode == HWT_MODE_CPU && atomic_swap_32(&cpu_mode_ctr, 1) != 0) return (-1); - KASSERT(ctx->mode == HWT_MODE_CPU, - ("%s: should only be used for CPU mode", 
__func__)); smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); return (0); @@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx) static int pt_backend_disable_smp(struct hwt_context *ctx) { + struct pt_cpu *cpu; dprintf("%s\n", __func__); if (ctx->mode == HWT_MODE_CPU && @@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx) dprintf("%s: empty cpu map\n", __func__); return (-1); } + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + cpu = &pt_pcpu[cpu_id]; + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", + __func__, cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + } smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); return (0); @@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx) int error; dprintf("%s\n", __func__); - if (ctx->mode == HWT_MODE_CPU) { - TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { - error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], - hwt_cpu->vm, hwt_cpu->cpu_id); - if (error) - return (error); - } + if (ctx->mode != HWT_MODE_CPU) + return (0); + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm, + hwt_cpu->cpu_id); + if (error) + return (error); } return (0); @@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx) pt_deinit_ctx(pt_ctx); } } else { - CPU_FOREACH(cpu_id) { - if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + if (pt_pcpu[cpu_id].ctx == NULL) continue; - if (pt_pcpu[cpu_id].ctx != NULL) { - KASSERT(pt_pcpu[cpu_id].ctx == - &pt_pcpu_ctx[cpu_id], - ("%s: CPU mode tracing with non-cpu mode PT" - "context active", - __func__)); - pt_pcpu[cpu_id].ctx = NULL; - } - pt_ctx = &pt_pcpu_ctx[cpu_id]; - pt_deinit_ctx(pt_ctx); - memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_deinit_ctx(pt_pcpu[cpu_id].ctx); + pt_pcpu[cpu_id].ctx = NULL; + atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0); } } @@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, uint64_t *data) { struct pt_buffer *buf; + uint64_t offset; if (vm->ctx->mode == HWT_MODE_THREAD) buf = &((struct pt_ctx *)vm->thr->private)->buf; else buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; - mtx_lock_spin(&buf->lock); - *curpage = buf->curpage; - *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); - mtx_unlock_spin(&buf->lock); + offset = atomic_load_acq_64(&buf->offset); + *curpage = offset / PAGE_SIZE; + *curpage_offset = offset & PAGE_MASK; return (0); } @@ -762,15 +757,13 @@ static struct hwt_backend backend = { * Used as a taskqueue routine from the ToPA interrupt handler. */ static void -pt_send_buffer_record(void *arg, int pending __unused) +pt_send_buffer_record(void *arg) { + struct pt_cpu *cpu = (struct pt_cpu *)arg; struct hwt_record_entry record; - struct pt_ctx *ctx = (struct pt_ctx *)arg; - /* Prepare buffer record. 
*/ - mtx_lock_spin(&ctx->buf.lock); + struct pt_ctx *ctx = cpu->ctx; pt_fill_buffer_record(ctx->id, &ctx->buf, &record); - mtx_unlock_spin(&ctx->buf.lock); hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); } static void @@ -795,36 +788,40 @@ static int pt_topa_intr(struct trapframe *tf) { struct pt_buffer *buf; + struct pt_cpu *cpu; struct pt_ctx *ctx; uint64_t reg; - SDT_PROBE0(pt, , , topa__intr); - - if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { - return (0); - } + cpu = &pt_pcpu[curcpu]; reg = rdmsr(MSR_IA_GLOBAL_STATUS); if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { - /* ACK spurious or leftover interrupt. */ pt_topa_status_clear(); + return (0); + } + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { return (1); } + atomic_set_int(&cpu->in_pcint_handler, 1); - ctx = pt_pcpu[curcpu].ctx; + ctx = cpu->ctx; + KASSERT(ctx != NULL, + ("%s: cpu %d: ToPA PMI interrupt without an active context", + __func__, curcpu)); buf = &ctx->buf; KASSERT(buf->topa_hw != NULL, - ("%s: ToPA PMI interrupt with invalid buffer", __func__)); - + ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__, + curcpu)); pt_cpu_toggle_local(ctx->save_area, false); pt_update_buffer(buf); pt_topa_status_clear(); - taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, - TASKQUEUE_FAIL_IF_PENDING); if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + swi_sched(cpu->swi_cookie, SWI_FROMNMI); pt_cpu_toggle_local(ctx->save_area, true); lapic_reenable_pcint(); } + atomic_set_int(&cpu->in_pcint_handler, 0); return (1); } @@ -839,7 +836,7 @@ static int pt_init(void) { u_int cp[4]; - int error; + int error, i; dprintf("pt: Enumerating part 1\n"); cpuid_count(CPUID_PT_LEAF, 0, cp); @@ -869,20 +866,38 @@ pt_init(void) pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, M_ZERO | M_WAITOK); + for (i = 0; i < mp_ncpus; i++) { + error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record, + &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE, + &pt_pcpu[i].swi_cookie); + if (error != 0) { + dprintf( + "%s: failed to add interrupt handler for cpu: %d\n", + __func__, error); + goto err; + } + } + nmi_register_handler(pt_topa_intr); - if (!lapic_enable_pcint()) { - nmi_remove_handler(pt_topa_intr); - hwt_backend_unregister(&backend); - free(pt_pcpu, M_PT); - free(pt_pcpu_ctx, M_PT); - pt_pcpu = NULL; - pt_pcpu_ctx = NULL; + if (lapic_enable_pcint()) { + initialized = true; + return (0); + } else printf("pt: failed to setup interrupt line\n"); - return (error); +err: + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + if (pt_pcpu[i].swi_cookie != 0) + swi_remove(pt_pcpu[i].swi_cookie); } - initialized = true; + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; - return (0); + return (error); } /* @@ -941,14 +956,24 @@ pt_supported(void) static void pt_deinit(void) { + int i; + struct pt_cpu *cpu; + if (!initialized) return; nmi_remove_handler(pt_topa_intr); lapic_disable_pcint(); hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + cpu = &pt_pcpu[i]; + swi_remove(cpu->swi_cookie); + } + free(pt_pcpu, M_PT); free(pt_pcpu_ctx, M_PT); pt_pcpu = NULL; + pt_pcpu_ctx = NULL; initialized = false; } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c42da02d0bf6..f7c59847140b 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -163,7 +163,6 @@ struct vm { void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ - 
struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (o) guest vcpus */ @@ -201,7 +200,7 @@ vmmops_panic(void) } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ + DEFINE_IFUNC(, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ @@ -499,7 +498,7 @@ MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); @@ -563,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -584,7 +583,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -597,14 +596,13 @@ vm_create(const char *name, struct vm **retvm) VM_MAX_NAMELEN + 1) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48); + if (error != 0) { + free(vm, M_VM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | @@ -685,9 +683,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - free(vm->vcpu, M_VM); sx_destroy(&vm->vcpus_init_lock); mtx_destroy(&vm->rendezvous_mtx); @@ -731,7 +726,7 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; - if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); @@ -741,19 +736,21 @@ int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - vmm_mmio_free(vm->vmspace, gpa, len); + vmm_mmio_free(vm_vmspace(vm), gpa, len); return (0); } static int vm_iommu_map(struct vm *vm) { + pmap_t pmap; vm_paddr_t gpa, hpa; struct vm_mem_map *mm; int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); + pmap = vmspace_pmap(vm_vmspace(vm)); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; @@ -767,7 +764,7 @@ vm_iommu_map(struct vm *vm) mm->flags |= VM_MEMMAP_F_IOMMU; for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { - hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa); + hpa = pmap_extract(pmap, gpa); /* * All mappings in the vmm vmspace must be @@ -816,7 +813,7 @@ vm_iommu_unmap(struct vm *vm) for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( - vmspace_pmap(vm->vmspace), gpa))), + vmspace_pmap(vm_vmspace(vm)), gpa))), ("vm_iommu_unmap: vm %p gpa %jx not wired", vm, (uintmax_t)gpa)); iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); @@ -993,6 +990,54 @@ save_guest_fpustate(struct vcpu *vcpu) static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 
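The vmm.c hunk that follows factors the rendezvous loop out of vm_handle_rendezvous() into vm_rendezvous() and vcpu_wait_idle(), then builds vcpu_set_state_all() on top of them: it freezes every vCPU while executing any pending rendezvous callback on behalf of idle vCPUs, so a vCPU thread calling it cannot deadlock against vm_smp_rendezvous(). A minimal sketch of how a caller might drive the new interface; the caller shown here is hypothetical, not part of this change:

	static int
	vm_do_global_op(struct vm *vm)
	{
		struct vcpu *vcpu;
		int error, i;

		vm_lock_vcpus(vm);	/* exclusive; also blocks vCPU creation */
		error = vcpu_set_state_all(vm, VCPU_FROZEN);
		if (error == 0) {
			/* ... act on the VM while every vCPU is frozen ... */

			/* Release the vCPUs when done. */
			for (i = 0; i < vm->maxcpus; i++) {
				vcpu = vm_vcpu(vm, i);
				if (vcpu != NULL)
					(void)vcpu_set_state(vcpu, VCPU_IDLE,
					    false);
			}
		}
		vm_unlock_vcpus(vm);
		return (error);
	}

On failure, vcpu_set_state_all() rolls back the vCPUs it already froze, so the caller only has to release states on the success path.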
+/* + * Invoke the rendezvous function on the specified vcpu if applicable. Return + * true if the rendezvous is finished, false otherwise. + */ +static bool +vm_rendezvous(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + int vcpuid; + + mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED); + KASSERT(vcpu->vm->rendezvous_func != NULL, + ("vm_rendezvous: no rendezvous pending")); + + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, + &vm->active_cpus); + + vcpuid = vcpu->vcpuid; + if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VMM_CTR0(vcpu, "Calling rendezvous func"); + (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VMM_CTR0(vcpu, "Rendezvous completed"); + CPU_ZERO(&vm->rendezvous_req_cpus); + vm->rendezvous_func = NULL; + wakeup(&vm->rendezvous_func); + return (true); + } + return (false); +} + +static void +vcpu_wait_idle(struct vcpu *vcpu) +{ + KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); + + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VMM_CTR1(vcpu, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +} + static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) @@ -1007,13 +1052,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. */ if (from_idle) { - while (vcpu->state != VCPU_IDLE) { - vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); - VMM_CTR1(vcpu, "vcpu state change from %s to " - "idle requested", vcpu_state2str(vcpu->state)); - msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); - } + while (vcpu->state != VCPU_IDLE) + vcpu_wait_idle(vcpu); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -1065,6 +1105,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, return (0); } +/* + * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks + * with vm_smp_rendezvous(). + * + * The complexity here suggests that the rendezvous mechanism needs a rethink. + */ +int +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) +{ + cpuset_t locked; + struct vcpu *vcpu; + int error, i; + uint16_t maxcpus; + + KASSERT(newstate != VCPU_IDLE, + ("vcpu_set_state_all: invalid target state %d", newstate)); + + error = 0; + CPU_ZERO(&locked); + maxcpus = vm->maxcpus; + + mtx_lock(&vm->rendezvous_mtx); +restart: + if (vm->rendezvous_func != NULL) { + /* + * If we have a pending rendezvous, then the initiator may be + * blocked waiting for other vCPUs to execute the callback. The + * current thread may be a vCPU thread so we must not block + * waiting for the initiator, otherwise we get a deadlock. + * Thus, execute the callback on behalf of any idle vCPUs. + */ + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + if (vcpu->state == VCPU_IDLE) { + (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN, + true); + CPU_SET(i, &locked); + } + if (CPU_ISSET(i, &locked)) { + /* + * We can safely execute the callback on this + * vCPU's behalf. 
+ */ + vcpu_unlock(vcpu); + (void)vm_rendezvous(vcpu); + vcpu_lock(vcpu); + } + vcpu_unlock(vcpu); + } + } + + /* + * Now wait for remaining vCPUs to become idle. This may include the + * initiator of a rendezvous that is currently blocked on the rendezvous + * mutex. + */ + CPU_FOREACH_ISCLR(i, &locked) { + if (i >= maxcpus) + break; + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + while (vcpu->state != VCPU_IDLE) { + mtx_unlock(&vm->rendezvous_mtx); + vcpu_wait_idle(vcpu); + vcpu_unlock(vcpu); + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) + goto restart; + vcpu_lock(vcpu); + } + error = vcpu_set_state_locked(vcpu, newstate, true); + vcpu_unlock(vcpu); + if (error != 0) { + /* Roll back state changes. */ + CPU_FOREACH_ISSET(i, &locked) + (void)vcpu_set_state(vcpu, VCPU_IDLE, false); + break; + } + CPU_SET(i, &locked); + } + mtx_unlock(&vm->rendezvous_mtx); + return (error); +} + static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { @@ -1086,36 +1215,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_rendezvous(struct vcpu *vcpu) { - struct vm *vm = vcpu->vm; + struct vm *vm; struct thread *td; - int error, vcpuid; - error = 0; - vcpuid = vcpu->vcpuid; td = curthread; + vm = vcpu->vm; + mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { - /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ - CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); - - if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && - !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { - VMM_CTR0(vcpu, "Calling rendezvous func"); - (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); - CPU_SET(vcpuid, &vm->rendezvous_done_cpus); - } - if (CPU_CMP(&vm->rendezvous_req_cpus, - &vm->rendezvous_done_cpus) == 0) { - VMM_CTR0(vcpu, "Rendezvous completed"); - CPU_ZERO(&vm->rendezvous_req_cpus); - vm->rendezvous_func = NULL; - wakeup(&vm->rendezvous_func); + if (vm_rendezvous(vcpu)) break; - } + VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { + int error; + mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) @@ -1249,7 +1365,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { - rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm_vmspace(vm)), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", @@ -1259,7 +1375,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) } } - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " @@ -1560,7 +1676,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_req_cpus; evinfo.sptr = &vm->suspend; @@ -2302,12 +2418,6 @@ vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -2519,7 +2629,7 @@ vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) if 
(vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * - vmspace_resident_count(vcpu->vm->vmspace)); + vmspace_resident_count(vm_vmspace(vcpu->vm))); } } @@ -2529,7 +2639,7 @@ vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * - pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); + pmap_wired_count(vmspace_pmap(vm_vmspace(vcpu->vm)))); } } diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c index d8d2b460404c..dfebc9dcadbf 100644 --- a/sys/amd64/vmm/vmm_dev_machdep.c +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -48,6 +48,7 @@ #include <x86/apicreg.h> #include <dev/vmm/vmm_dev.h> +#include <dev/vmm/vmm_mem.h> #include <dev/vmm/vmm_stat.h> #include "vmm_lapic.h" diff --git a/sys/arm/allwinner/aw_sid.c b/sys/arm/allwinner/aw_sid.c index ba5faca33c5e..932c2f189e51 100644 --- a/sys/arm/allwinner/aw_sid.c +++ b/sys/arm/allwinner/aw_sid.c @@ -297,7 +297,7 @@ aw_sid_attach(device_t dev) /* Register ourself so device can resolve who we are */ OF_device_register_xref(OF_xref_from_node(node), dev); - for (i = 0; i < sc->sid_conf->nfuses ;i++) {\ + for (i = 0; i < sc->sid_conf->nfuses; i++) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, sc->sid_conf->efuses[i].name, diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 78883296c5b7..6a0ece1e4d98 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -1246,7 +1246,7 @@ pmap_bootstrap(vm_offset_t firstaddr) } static void -pmap_init_reserved_pages(void) +pmap_init_reserved_pages(void *dummy __unused) { struct pcpu *pc; vm_offset_t pages; diff --git a/sys/arm/arm/unwind.c b/sys/arm/arm/unwind.c index 7ad91a3e01a5..0d77074fae34 100644 --- a/sys/arm/arm/unwind.c +++ b/sys/arm/arm/unwind.c @@ -278,7 +278,7 @@ unwind_module_unloaded(struct linker_file *lf) * the unwind tables might be stripped, so instead we have to use the * _exidx_start/end symbols created by ldscript.arm. 
*/ -static int +static void module_info_init(void *arg __unused) { struct linker_file thekernel; @@ -291,8 +291,6 @@ module_info_init(void *arg __unused) thekernel.exidx_addr = CADDR(&_exidx_start); thekernel.exidx_size = UADDR(&_exidx_end) - UADDR(&_exidx_start); populate_module_info(create_module_info(), &thekernel); - - return (0); } SYSINIT(unwind_init, SI_SUB_KMEM, SI_ORDER_ANY, module_info_init, NULL); diff --git a/sys/arm64/arm64/cpu_errata.c b/sys/arm64/arm64/cpu_errata.c index 989924bc0567..b876703a2a15 100644 --- a/sys/arm64/arm64/cpu_errata.c +++ b/sys/arm64/arm64/cpu_errata.c @@ -52,56 +52,11 @@ struct cpu_quirks { u_int flags; }; -static enum { - SSBD_FORCE_ON, - SSBD_FORCE_OFF, - SSBD_KERNEL, -} ssbd_method = SSBD_KERNEL; - -static cpu_quirk_install install_psci_bp_hardening; -static cpu_quirk_install install_ssbd_workaround; static cpu_quirk_install install_thunderx_bcast_tlbi_workaround; static struct cpu_quirks cpu_quirks[] = { { .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A57,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A72,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A73,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A75,0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, - .midr_value = - CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX2, 0,0), - .quirk_install = install_psci_bp_hardening, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = 0, - .midr_value = 0, - .quirk_install = install_ssbd_workaround, - .flags = CPU_QUIRK_POST_DEVICE, - }, - { - .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, .midr_value = CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX, 0, 0), .quirk_install = install_thunderx_bcast_tlbi_workaround, @@ -114,57 +69,6 @@ static struct cpu_quirks cpu_quirks[] = { }, }; -static void -install_psci_bp_hardening(void) -{ - /* SMCCC depends on PSCI. If PSCI is missing so is SMCCC */ - if (!psci_present) - return; - - if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_1) != SMCCC_RET_SUCCESS) - return; - - PCPU_SET(bp_harden, smccc_arch_workaround_1); -} - -static void -install_ssbd_workaround(void) -{ - char *env; - - if (PCPU_GET(cpuid) == 0) { - env = kern_getenv("kern.cfg.ssbd"); - if (env != NULL) { - if (strcmp(env, "force-on") == 0) { - ssbd_method = SSBD_FORCE_ON; - } else if (strcmp(env, "force-off") == 0) { - ssbd_method = SSBD_FORCE_OFF; - } - } - } - - /* SMCCC depends on PSCI. If PSCI is missing so is SMCCC */ - if (!psci_present) - return; - - /* Enable the workaround on this CPU if it's enabled in the firmware */ - if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_2) != SMCCC_RET_SUCCESS) - return; - - switch(ssbd_method) { - case SSBD_FORCE_ON: - smccc_arch_workaround_2(1); - break; - case SSBD_FORCE_OFF: - smccc_arch_workaround_2(0); - break; - case SSBD_KERNEL: - default: - PCPU_SET(ssbd, smccc_arch_workaround_2); - break; - } -} - /* * Workaround Cavium erratum 27456. 
* diff --git a/sys/arm64/arm64/elf_machdep.c b/sys/arm64/arm64/elf_machdep.c index 13af5c5065d6..207b37180a26 100644 --- a/sys/arm64/arm64/elf_machdep.c +++ b/sys/arm64/arm64/elf_machdep.c @@ -121,7 +121,7 @@ static struct sysentvec elf64_freebsd_sysvec = { }; INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); -static Elf64_Brandinfo freebsd_brand_info = { +static const Elf64_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_AARCH64, .compat_3_brand = "FreeBSD", @@ -131,8 +131,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; - -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); static bool @@ -336,7 +335,7 @@ elf_cpu_parse_dynamic(caddr_t loadbase __unused, Elf_Dyn *dynamic __unused) return (0); } -static Elf_Note gnu_property_note = { +static const Elf_Note gnu_property_note = { .n_namesz = sizeof(GNU_ABI_VENDOR), .n_descsz = 16, .n_type = NT_GNU_PROPERTY_TYPE_0, diff --git a/sys/arm64/arm64/spec_workaround.c b/sys/arm64/arm64/spec_workaround.c new file mode 100644 index 000000000000..7f4f86cdb48c --- /dev/null +++ b/sys/arm64/arm64/spec_workaround.c @@ -0,0 +1,166 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Arm Ltd + * Copyright (c) 2018 Andrew Turner + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237 + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/pcpu.h> +#include <sys/systm.h> + +#include <machine/cpu.h> +#include <machine/cpu_feat.h> + +#include <dev/psci/psci.h> +#include <dev/psci/smccc.h> + +static enum { + SSBD_FORCE_ON, + SSBD_FORCE_OFF, + SSBD_KERNEL, +} ssbd_method = SSBD_KERNEL; + +struct psci_bp_hardening_impl { + u_int midr_mask; + u_int midr_value; +}; + +static struct psci_bp_hardening_impl psci_bp_hardening_impl[] = { + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A57,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A72,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A73,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = CPU_ID_RAW(CPU_IMPL_ARM, CPU_PART_CORTEX_A75,0,0), + }, + { + .midr_mask = CPU_IMPL_MASK | CPU_PART_MASK, + .midr_value = + CPU_ID_RAW(CPU_IMPL_CAVIUM, CPU_PART_THUNDERX2, 0,0), + } +}; + +static cpu_feat_en +psci_bp_hardening_check(const struct cpu_feat *feat __unused, u_int midr) +{ + size_t i; + + for (i = 0; i < nitems(psci_bp_hardening_impl); i++) { + if ((midr & psci_bp_hardening_impl[i].midr_mask) == + psci_bp_hardening_impl[i].midr_value) { + /* SMCCC depends on PSCI. If PSCI is missing so is SMCCC */ + if (!psci_present) + return (FEAT_ALWAYS_DISABLE); + + if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_1) != + SMCCC_RET_SUCCESS) + return (FEAT_ALWAYS_DISABLE); + + return (FEAT_DEFAULT_ENABLE); + } + } + + return (FEAT_ALWAYS_DISABLE); +} + +static bool +psci_bp_hardening_enable(const struct cpu_feat *feat __unused, + cpu_feat_errata errata_status __unused, u_int *errata_list __unused, + u_int errata_count __unused) +{ + PCPU_SET(bp_harden, smccc_arch_workaround_1); + + return (true); +} + +CPU_FEAT(feat_csv2_missing, "Branch Predictor Hardening", + psci_bp_hardening_check, NULL, psci_bp_hardening_enable, NULL, + CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU); + +static cpu_feat_en +ssbd_workaround_check(const struct cpu_feat *feat __unused, u_int midr __unused) +{ + char *env; + + if (PCPU_GET(cpuid) == 0) { + env = kern_getenv("kern.cfg.ssbd"); + if (env != NULL) { + if (strcmp(env, "force-on") == 0) { + ssbd_method = SSBD_FORCE_ON; + } else if (strcmp(env, "force-off") == 0) { + ssbd_method = SSBD_FORCE_OFF; + } + } + } + + /* SMCCC depends on PSCI. 
If PSCI is missing so is SMCCC */ + if (!psci_present) + return (FEAT_ALWAYS_DISABLE); + + /* Enable the workaround on this CPU if it's enabled in the firmware */ + if (smccc_arch_features(SMCCC_ARCH_WORKAROUND_2) != SMCCC_RET_SUCCESS) + return (FEAT_ALWAYS_DISABLE); + + return (FEAT_DEFAULT_ENABLE); +} + +static bool +ssbd_workaround_enable(const struct cpu_feat *feat __unused, + cpu_feat_errata errata_status __unused, u_int *errata_list __unused, + u_int errata_count __unused) +{ + switch(ssbd_method) { + case SSBD_FORCE_ON: + smccc_arch_workaround_2(1); + break; + case SSBD_FORCE_OFF: + smccc_arch_workaround_2(0); + break; + case SSBD_KERNEL: + default: + PCPU_SET(ssbd, smccc_arch_workaround_2); + break; + } + + return (true); +} + +CPU_FEAT(feat_ssbs_missing, "Speculator Store Bypass Disable Workaround", + ssbd_workaround_check, NULL, ssbd_workaround_enable, NULL, + CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU); diff --git a/sys/arm64/coresight/coresight.c b/sys/arm64/coresight/coresight.c index 5928c153f4ae..9b9d3c65ecc9 100644 --- a/sys/arm64/coresight/coresight.c +++ b/sys/arm64/coresight/coresight.c @@ -113,7 +113,7 @@ coresight_get_output_device(struct endpoint *endp, struct endpoint **out_endp) } static void -coresight_init(void) +coresight_init(void *dummy __unused) { mtx_init(&cs_mtx, "ARM Coresight", NULL, MTX_DEF); diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index e839b5dd92c9..696a69669a2a 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -143,10 +143,41 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); +#ifdef notyet +#ifdef BHYVE_SNAPSHOT +DECLARE_VMMOPS_FUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); +#endif +#endif + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -232,7 +263,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void 
vcpu_notify_event(struct vcpu *vcpu); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index 084b7a11b01f..ac05820f89bc 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -584,7 +584,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux64_brandnote = { +static const Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, @@ -593,7 +593,7 @@ static Elf_Brandnote linux64_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf64_Brandinfo linux_glibc2brand = { +static const Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", @@ -604,7 +604,7 @@ static Elf64_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -Elf64_Brandinfo *linux_brandlist[] = { +const Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; @@ -612,8 +612,8 @@ Elf64_Brandinfo *linux_brandlist[] = { static int linux64_elf_modevent(module_t mod, int type, void *data) { - Elf64_Brandinfo **brandinfo; - struct linux_ioctl_handler**lihp; + const Elf64_Brandinfo **brandinfo; + struct linux_ioctl_handler **lihp; int error; error = 0; diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h index f9b74aef7188..f530dab05331 100644 --- a/sys/arm64/vmm/arm64.h +++ b/sys/arm64/vmm/arm64.h @@ -136,37 +136,6 @@ struct hyp { struct hypctx *ctx[]; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) -DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) -#ifdef notyet -#ifdef BHYVE_SNAPSHOT -DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, - struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) -#endif -#endif - uint64_t vmm_call_hyp(uint64_t, ...); #if 0 diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index a551a2807183..bf52dc0fe916 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -88,7 +88,6 @@ struct vcpu { struct vfpstate *guestfpu; /* (a,i) guest fpu state */ }; -#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define 
vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) @@ -126,7 +125,6 @@ struct vm { bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ - struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ @@ -274,6 +272,7 @@ vcpu_cleanup(struct vcpu *vcpu, bool destroy) vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); + free(vcpu, M_VMM); } } @@ -407,7 +406,7 @@ vm_init(struct vm *vm, bool create) { int i; - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); @@ -470,9 +469,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -485,7 +484,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -497,14 +496,13 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, 1ul << 39); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, 1ul << 39); + if (error != 0) { + free(vm, M_VMM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; @@ -558,7 +556,7 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_xlock_memsegs(vm); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); sched_pin(); PCPU_SET(curvmpmap, NULL); sched_unpin(); @@ -582,11 +580,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - - for (i = 0; i < vm->maxcpus; i++) - free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } @@ -1090,12 +1083,6 @@ vcpu_notify_event(struct vcpu *vcpu) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -1416,7 +1403,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) vme = &vcpu->exitinfo; - pmap = vmspace_pmap(vcpu->vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vcpu->vm)); addr = vme->u.paging.gpa; esr = vme->u.paging.esr; @@ -1433,7 +1420,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) panic("%s: Invalid exception (esr = %lx)", __func__, esr); } - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) return (EFAULT); @@ -1507,7 +1494,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c index b518f84454ad..fd128e69f1f1 100644 --- a/sys/cam/scsi/scsi_all.c +++ b/sys/cam/scsi/scsi_all.c @@ -112,7 +112,7 @@ static void fetchtableentries(int 
sense_key, int asc, int ascq, const struct asc_table_entry **); #ifdef _KERNEL -static void init_scsi_delay(void); +static void init_scsi_delay(void *); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS); static int set_scsi_delay(int delay); #endif @@ -686,7 +686,7 @@ scsi_op_desc(uint16_t opcode, struct scsi_inquiry_data *inq_data) opmask = 1 << pd_type; for (j = 0; j < num_tables; j++) { - for (i = 0;i < num_ops[j] && table[j][i].opcode <= opcode; i++){ + for (i = 0; i < num_ops[j] && table[j][i].opcode <= opcode; i++) { if ((table[j][i].opcode == opcode) && ((table[j][i].opmask & opmask) != 0)) return(table[j][i].desc); @@ -9379,7 +9379,7 @@ scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id) } static void -init_scsi_delay(void) +init_scsi_delay(void *dummy __unused) { int delay; diff --git a/sys/cam/scsi/scsi_enc_ses.c b/sys/cam/scsi/scsi_enc_ses.c index 435874a9874a..3a362eaf11a4 100644 --- a/sys/cam/scsi/scsi_enc_ses.c +++ b/sys/cam/scsi/scsi_enc_ses.c @@ -2302,7 +2302,7 @@ ses_print_addl_data_sas_type0(char *sesname, struct sbuf *sbp, sbuf_putc(sbp, '\n'); if (addl->proto_data.sasdev_phys == NULL) return; - for (i = 0;i < addl->proto_hdr.sas->base_hdr.num_phys;i++) { + for (i = 0; i < addl->proto_hdr.sas->base_hdr.num_phys; i++) { phy = &addl->proto_data.sasdev_phys[i]; sbuf_printf(sbp, "%s: phy %d:", sesname, i); if (ses_elm_sas_dev_phy_sata_dev(phy)) @@ -2349,7 +2349,7 @@ ses_print_addl_data_sas_type1(char *sesname, struct sbuf *sbp, sbuf_printf(sbp, "Expander: %d phys", num_phys); if (addl->proto_data.sasexp_phys == NULL) return; - for (i = 0;i < num_phys;i++) { + for (i = 0; i < num_phys; i++) { exp_phy = &addl->proto_data.sasexp_phys[i]; sbuf_printf(sbp, "%s: phy %d: connector %d other %d\n", sesname, i, exp_phy->connector_index, @@ -2360,7 +2360,7 @@ ses_print_addl_data_sas_type1(char *sesname, struct sbuf *sbp, sbuf_printf(sbp, "Port: %d phys", num_phys); if (addl->proto_data.sasport_phys == NULL) return; - for (i = 0;i < num_phys;i++) { + for (i = 0; i < num_phys; i++) { port_phy = &addl->proto_data.sasport_phys[i]; sbuf_printf(sbp, "%s: phy %d: id %d connector %d other %d\n", diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris.c b/sys/cddl/compat/opensolaris/kern/opensolaris.c index 10924977c20d..898b2ea49f96 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris.c @@ -67,7 +67,7 @@ opensolaris_load(void *dummy) SYSINIT(opensolaris_register, SI_SUB_OPENSOLARIS, SI_ORDER_FIRST, opensolaris_load, NULL); static void -opensolaris_unload(void) +opensolaris_unload(void *dummy __unused) { mutex_destroy(&cpu_lock); } diff --git a/sys/compat/ia32/ia32_sysvec.c b/sys/compat/ia32/ia32_sysvec.c index 0ea7d072e911..b9dada4eee7b 100644 --- a/sys/compat/ia32/ia32_sysvec.c +++ b/sys/compat/ia32/ia32_sysvec.c @@ -145,7 +145,7 @@ struct sysentvec ia32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf_ia32_sysvec, &ia32_freebsd_sysvec); -static Elf32_Brandinfo ia32_brand_info = { +static const Elf32_Brandinfo ia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -155,12 +155,10 @@ static Elf32_Brandinfo ia32_brand_info = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_info); -SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_info); - -static Elf32_Brandinfo ia32_brand_oinfo = { +static 
const Elf32_Brandinfo ia32_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -170,12 +168,10 @@ static Elf32_Brandinfo ia32_brand_oinfo = { .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; +C_SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &ia32_brand_oinfo); -SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &ia32_brand_oinfo); - -static Elf32_Brandinfo kia32_brand_info = { +static const Elf32_Brandinfo kia32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -184,10 +180,8 @@ static Elf32_Brandinfo kia32_brand_info = { .brand_note = &elf32_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; - -SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf32_insert_brand_entry, - &kia32_brand_info); +C_SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t)elf32_insert_brand_entry, &kia32_brand_info); void elf32_dump_thread(struct thread *td, void *dst, size_t *off) diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 95b212be1306..7ac48786c77b 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -2216,6 +2216,67 @@ linprocfs_dosysvipc_shm(PFS_FILL_ARGS) return (0); } +static int +linprocfs_doinotify(const char *sysctl, PFS_FILL_ARGS) +{ + size_t size; + int error, val; + + if (uio->uio_rw == UIO_READ) { + size = sizeof(val); + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), &val, &size, NULL, 0, 0, 0); + if (error == 0) + sbuf_printf(sb, "%d\n", val); + } else { + char *endp, *newval; + long vall; + + sbuf_trim(sb); + sbuf_finish(sb); + newval = sbuf_data(sb); + vall = strtol(newval, &endp, 10); + if (vall < 0 || vall > INT_MAX || endp == newval || + *endp != '\0') + return (EINVAL); + val = (int)vall; + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), NULL, NULL, + &val, sizeof(val), 0, 0); + } + return (error); +} + +/* + * Filler function for proc/sys/fs/inotify/max_queued_events + */ +static int +linprocfs_doinotify_max_queued_events(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_queued_events", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_instances + */ +static int +linprocfs_doinotify_max_user_instances(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_instances", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_watches + */ +static int +linprocfs_doinotify_max_user_watches(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_watches", + PFS_FILL_ARGNAMES)); +} + /* * Filler function for proc/sys/fs/mqueue/msg_default */ @@ -2313,9 +2374,7 @@ linprocfs_domqueue_queues_max(PFS_FILL_ARGS) static int linprocfs_init(PFS_INIT_ARGS) { - struct pfs_node *root; - struct pfs_node *dir; - struct pfs_node *sys; + struct pfs_node *dir, *fs, *root, *sys; root = pi->pi_root; @@ -2466,10 +2525,18 @@ linprocfs_init(PFS_INIT_ARGS) NULL, PFS_RD); /* /proc/sys/fs/... 
*/ - pfs_create_dir(sys, &dir, "fs", NULL, NULL, NULL, 0); + pfs_create_dir(sys, &fs, "fs", NULL, NULL, NULL, 0); + + pfs_create_dir(fs, &dir, "inotify", NULL, NULL, NULL, 0); + pfs_create_file(dir, NULL, "max_queued_events", + &linprocfs_doinotify_max_queued_events, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_instances", + &linprocfs_doinotify_max_user_instances, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_watches", + &linprocfs_doinotify_max_user_watches, NULL, NULL, NULL, PFS_RDWR); /* /proc/sys/fs/mqueue/... */ - pfs_create_dir(dir, &dir, "mqueue", NULL, NULL, NULL, 0); + pfs_create_dir(fs, &dir, "mqueue", NULL, NULL, NULL, 0); pfs_create_file(dir, NULL, "msg_default", &linprocfs_domqueue_msg_default, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, NULL, "msgsize_default", &linprocfs_domqueue_msgsize_default, NULL, NULL, NULL, PFS_RD); diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c index 61b207070963..a40f110634f7 100644 --- a/sys/compat/linux/linux.c +++ b/sys/compat/linux/linux.c @@ -578,8 +578,13 @@ bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, return (0); } +/* + * If sap is NULL, then osa points at an already copied-in Linux sockaddr + * that should be edited in place. Otherwise memory is allocated, the + * sockaddr is copied in, and the result is returned in *sap. + */ int -linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, +linux_to_bsd_sockaddr(struct l_sockaddr *osa, struct sockaddr **sap, socklen_t *len) { struct sockaddr *sa; @@ -609,10 +614,12 @@ linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, } #endif - kosa = malloc(salen, M_SONAME, M_WAITOK); - - if ((error = copyin(osa, kosa, *len))) - goto out; + if (sap != NULL) { + kosa = malloc(salen, M_SONAME, M_WAITOK); + if ((error = copyin(osa, kosa, *len))) + goto out; + } else + kosa = osa; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == AF_UNKNOWN) { @@ -686,12 +693,15 @@ linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, sa->sa_family = bdom; sa->sa_len = salen; - *sap = sa; - *len = salen; + if (sap != NULL) { + *sap = sa; + *len = salen; + } return (0); out: - free(kosa, M_SONAME); + if (sap != NULL) + free(kosa, M_SONAME); return (error); } diff --git a/sys/compat/linux/linux_common.h b/sys/compat/linux/linux_common.h index 97f5a259f300..814c183b338a 100644 --- a/sys/compat/linux/linux_common.h +++ b/sys/compat/linux/linux_common.h @@ -43,7 +43,7 @@ sa_family_t bsd_to_linux_domain(sa_family_t domain); #define AF_UNKNOWN UINT8_MAX int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len); -int linux_to_bsd_sockaddr(const struct l_sockaddr *lsa, +int linux_to_bsd_sockaddr(struct l_sockaddr *lsa, struct sockaddr **sap, socklen_t *len); void linux_to_bsd_poll_events(struct thread *td, int fd, short lev, short *bev); diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 37d0142bae8b..0586eb55a8f3 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -251,7 +251,7 @@ linux_futex(struct thread *td, struct linux_futex_args *args) * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
*/ p = td->td_proc; - Elf_Brandinfo *bi = p->p_elf_brandinfo; + const Elf_Brandinfo *bi = p->p_elf_brandinfo; if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) return (EINVAL); args->val3_compare = false; diff --git a/sys/compat/linux/linux_socket.c b/sys/compat/linux/linux_socket.c index 0e07b0a60ced..b1a483ce611c 100644 --- a/sys/compat/linux/linux_socket.c +++ b/sys/compat/linux/linux_socket.c @@ -2146,7 +2146,8 @@ linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) return (ENOPROTOOPT); } - if (name == IPV6_NEXTHOP) { + switch (name) { + case IPV6_NEXTHOP: { len = args->optlen; error = linux_to_bsd_sockaddr(PTRIN(args->optval), &sa, &len); if (error != 0) @@ -2155,7 +2156,34 @@ linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) error = kern_setsockopt(td, args->s, level, name, sa, UIO_SYSSPACE, len); free(sa, M_SONAME); - } else { + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: { + struct group_source_req req; + size_t size; + + size = (name == MCAST_JOIN_SOURCE_GROUP || + name == MCAST_LEAVE_SOURCE_GROUP) ? + sizeof(struct group_source_req) : sizeof(struct group_req); + + if ((error = copyin(PTRIN(args->optval), &req, size))) + return (error); + len = sizeof(struct sockaddr_storage); + if ((error = linux_to_bsd_sockaddr( + (struct l_sockaddr *)&req.gsr_group, NULL, &len))) + return (error); + if (size == sizeof(struct group_source_req) && + (error = linux_to_bsd_sockaddr( + (struct l_sockaddr *)&req.gsr_source, NULL, &len))) + return (error); + error = kern_setsockopt(td, args->s, level, name, &req, + UIO_SYSSPACE, size); + break; + } + default: error = kern_setsockopt(td, args->s, level, name, PTRIN(args->optval), UIO_USERSPACE, args->optlen); } diff --git a/sys/conf/NOTES b/sys/conf/NOTES index ea9b2667607e..9944375c3615 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -679,6 +679,7 @@ options TCP_OFFLOAD # TCP offload support. options TCP_RFC7413 # TCP Fast Open options TCPHPTS +#options TCP_HPTS_KTEST # Add KTEST support for HPTS # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration @@ -2800,7 +2801,7 @@ options MAXFILES=999 # Random number generator # Alternative algorithm. -#options RANDOM_FENESTRASX +options RANDOM_FENESTRASX # Allow the CSPRNG algorithm to be loaded as a module. #options RANDOM_LOADABLE # Select this to allow high-rate but potentially expensive diff --git a/sys/conf/dtb.build.mk b/sys/conf/dtb.build.mk index 327d69106244..7eb0db5e8b80 100644 --- a/sys/conf/dtb.build.mk +++ b/sys/conf/dtb.build.mk @@ -1,7 +1,3 @@ - -.include <bsd.init.mk> -# Grab all the options for a kernel build. For backwards compat, we need to -# do this after bsd.own.mk. .include "kern.opts.mk" DTC?= dtc diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 856ea3af1372..2f412fa3cb1b 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -73,6 +73,7 @@ arm64/arm64/pmap.c standard arm64/arm64/ptrace_machdep.c standard arm64/arm64/sdt_machdep.c optional kdtrace_hooks arm64/arm64/sigtramp.S standard +arm64/arm64/spec_workaround.c standard arm64/arm64/stack_machdep.c optional ddb | stack arm64/arm64/strcmp.S standard arm64/arm64/strncmp.S standard diff --git a/sys/conf/kern.opts.mk b/sys/conf/kern.opts.mk index 045e55d1b19a..cef4dd11ba58 100644 --- a/sys/conf/kern.opts.mk +++ b/sys/conf/kern.opts.mk @@ -4,6 +4,7 @@ # parts to omit (eg CDDL or SOURCELESS_HOST). 
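A note on the two calling modes of the reworked linux_to_bsd_sockaddr() above: with a non-NULL sap the routine allocates a buffer, copies the sockaddr in from user space, and hands it back; with a NULL sap it converts an already copied-in sockaddr in place, which is what the new MCAST_* cases in linux_setsockopt() rely on. A minimal sketch of both modes, assuming a kernel context with the sys/compat/linux headers; the example_* helpers are hypothetical:

static int
example_alloc_mode(struct l_sockaddr *lsa_user, socklen_t len)
{
	struct sockaddr *sa;
	int error;

	/* sap != NULL: malloc + copyin from user space, result in sa. */
	error = linux_to_bsd_sockaddr(lsa_user, &sa, &len);
	if (error != 0)
		return (error);
	/* ... use the converted sockaddr ... */
	free(sa, M_SONAME);
	return (0);
}

static int
example_inplace_mode(struct group_req *req)
{
	socklen_t len = sizeof(struct sockaddr_storage);

	/* sap == NULL: req->gr_group was already copied in; edit in place. */
	return (linux_to_bsd_sockaddr((struct l_sockaddr *)&req->gr_group,
	    NULL, &len));
}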
Some of these will cause # config.mk to define symbols in various opt_*.h files. + # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the @@ -13,17 +14,12 @@ # that haven't been converted over. # -# Note: bsd.own.mk must be included before the rest of kern.opts.mk to make -# building on 10.x and earlier work. This should be removed when that's no -# longer supported since it confounds the defaults (since it uses the host's -# notion of defaults rather than what's default in current when building -# within sys/modules). -.include <bsd.own.mk> - # These options are used by the kernel build process (kern.mk and kmod.mk) # They have to be listed here so we can build modules outside of the # src tree. +.include <bsd.init.mk> + KLDXREF_CMD?= kldxref __DEFAULT_YES_OPTIONS = \ diff --git a/sys/conf/options b/sys/conf/options index b48ad1cf42cf..0b795a8d28fb 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -231,6 +231,7 @@ SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS +TCP_HPTS_KTEST opt_inet.h TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_global.h TCP_BBR opt_inet.h diff --git a/sys/conf/std.debug b/sys/conf/std.debug index f5ed5582c78d..0149779b3e5c 100644 --- a/sys/conf/std.debug +++ b/sys/conf/std.debug @@ -16,3 +16,4 @@ options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones options VERBOSE_SYSINIT=0 # Support debug.verbose_sysinit, off by default options ALT_BREAK_TO_DEBUGGER # Enter debugger on keyboard escape sequence options KDTRACE_MIB_SDT # Add SDT probes to network counters +options TCP_HPTS_KTEST # Add KTEST support for HPTS diff --git a/sys/conf/std.nodebug b/sys/conf/std.nodebug index 4035e28d2a62..79676a1d618f 100644 --- a/sys/conf/std.nodebug +++ b/sys/conf/std.nodebug @@ -16,6 +16,7 @@ nooptions KCOV nooptions MALLOC_DEBUG_MAXZONES nooptions QUEUE_MACRO_DEBUG_TRASH nooptions KDTRACE_MIB_SDT +nooptions TCP_HPTS_KTEST # Net80211 debugging nooptions IEEE80211_DEBUG diff --git a/sys/contrib/libnv/bsd_nvpair.c b/sys/contrib/libnv/bsd_nvpair.c index c73bc2189121..b884dd260b84 100644 --- a/sys/contrib/libnv/bsd_nvpair.c +++ b/sys/contrib/libnv/bsd_nvpair.c @@ -985,13 +985,13 @@ nvpair_unpack_string_array(bool isbe __unused, nvpair_t *nvp, size = nvp->nvp_datasize; tmp = (const char *)ptr; for (ii = 0; ii < nvp->nvp_nitems; ii++) { - len = strnlen(tmp, size - 1) + 1; - size -= len; - if (tmp[len - 1] != '\0') { + if (size <= 0) { ERRNO_SET(EINVAL); return (NULL); } - if (size < 0) { + len = strnlen(tmp, size - 1) + 1; + size -= len; + if (tmp[len - 1] != '\0') { ERRNO_SET(EINVAL); return (NULL); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 393bfaa65ff5..ebc2c0eeb6d2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -163,6 +163,13 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +static void +warn_deprecated_sysctl(const char *old, const char *new) +{ + printf("WARNING: sysctl vfs.zfs.%s is deprecated. 
Use vfs.zfs.%s instead.\n", + old, new); +} + int param_set_arc_max(SYSCTL_HANDLER_ARGS) { @@ -185,9 +192,17 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_max = arc_c_max; + if (arg2 != 0) + warn_deprecated_sysctl("arc_max", "arc.max"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_max, "LU", + "Maximum ARC size in bytes (LEGACY)"); + int param_set_arc_min(SYSCTL_HANDLER_ARGS) { @@ -209,9 +224,17 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) if (val != 0) zfs_arc_min = arc_c_min; + if (arg2 != 0) + warn_deprecated_sysctl("arc_min", "arc.min"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_min, "LU", + "Minimum ARC size in bytes (LEGACY)"); + extern uint_t zfs_arc_free_target; int @@ -232,9 +255,22 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) zfs_arc_free_target = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_free_target", "arc.free_target"); + return (0); } +/* + * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on + * pagedaemon initialization. + */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim" + " (LEGACY)"); + int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { @@ -250,9 +286,193 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) arc_no_grow_shift = val; + if (arg2 != 0) + warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 1, param_set_arc_no_grow_shift, "I", + "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); + +extern uint64_t l2arc_write_max; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, + CTLFLAG_RWTUN, &l2arc_write_max, 0, + "Max write bytes per interval (LEGACY)"); + +extern uint64_t l2arc_write_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, + CTLFLAG_RWTUN, &l2arc_write_boost, 0, + "Extra write bytes during device warmup (LEGACY)"); + +extern uint64_t l2arc_headroom; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, + CTLFLAG_RWTUN, &l2arc_headroom, 0, + "Number of max device writes to precache (LEGACY)"); + +extern uint64_t l2arc_headroom_boost; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, + CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, + "Compressed l2arc_headroom multiplier (LEGACY)"); + +extern uint64_t l2arc_feed_secs; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, + CTLFLAG_RWTUN, &l2arc_feed_secs, 0, + "Seconds between L2ARC writing (LEGACY)"); + +extern uint64_t l2arc_feed_min_ms; + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, + CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, + "Min feed interval in milliseconds (LEGACY)"); + +extern int l2arc_noprefetch; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, + CTLFLAG_RWTUN, &l2arc_noprefetch, 0, + "Skip caching prefetched buffers (LEGACY)"); + +extern int l2arc_feed_again; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, + CTLFLAG_RWTUN, &l2arc_feed_again, 0, + "Turbo L2ARC warmup (LEGACY)"); + +extern int l2arc_norw; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, + CTLFLAG_RWTUN, &l2arc_norw, 0, + "No reads during writes (LEGACY)"); + +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t 
val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +extern arc_state_t ARC_anon; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in anonymous state"); + +extern arc_state_t ARC_mru; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru state"); + +extern arc_state_t ARC_mru_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mru ghost state"); + +extern arc_state_t ARC_mfu; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, "Q", + "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu state"); + +extern arc_state_t ARC_mfu_ghost; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in mfu ghost state"); + +extern arc_state_t ARC_uncached; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of evictable metadata in uncached state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, + &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of evictable data in uncached state"); + +extern arc_state_t ARC_l2c_only; + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | 
CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); + +/* dbuf.c */ + +/* dmu.c */ + +/* dmu_zfetch.c */ + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); + +extern uint32_t zfetch_max_distance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, + CTLFLAG_RWTUN, &zfetch_max_distance, 0, + "Max bytes to prefetch per stream (LEGACY)"); + +extern uint32_t zfetch_max_idistance; + +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, + CTLFLAG_RWTUN, &zfetch_max_idistance, 0, + "Max bytes to prefetch indirects for per stream (LEGACY)"); + +/* dsl_pool.c */ + +/* dnode.c */ + +/* dsl_scan.c */ + /* metaslab.c */ int @@ -313,6 +533,19 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); +extern uint_t zfs_remove_max_segment; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, + CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, + "Largest contiguous segment ZFS will attempt to allocate when removing" + " a device"); + +extern int zfs_removal_suspend_progress; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, + CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, + "Ensures certain actions can happen while in the middle of a removal"); + /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy @@ -532,9 +765,18 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_min_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("min_auto_ashift", + "vdev.min_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_min_auto_ashift, "IU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); + int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -551,9 +793,19 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) zfs_vdev_max_auto_ashift = val; + if (arg2 != 0) + warn_deprecated_sysctl("max_auto_ashift", + "vdev.max_auto_ashift"); + return (0); } +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 1, + param_set_max_auto_ashift, "IU", + "Max ashift used when optimizing for logical -> physical sector size on" + " new top-level vdevs. (LEGACY)"); + /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. @@ -575,6 +827,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); +extern int vdev_validate_skip; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, + CTLFLAG_RDTUN, &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + +/* vdev_mirror.c */ + +/* vdev_queue.c */ + +extern uint_t zfs_vdev_max_active; + +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, + CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device." 
+ " (LEGACY)"); + /* zio.c */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 591e2dade59e..b677f90280d7 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -static arc_state_t ARC_anon; -/* */ arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -/* */ arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; -static arc_state_t ARC_uncached; +arc_state_t ARC_anon; +arc_state_t ARC_mru; +arc_state_t ARC_mru_ghost; +arc_state_t ARC_mfu; +arc_state_t ARC_mfu_ghost; +arc_state_t ARC_l2c_only; +arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -static int l2arc_feed_again = B_TRUE; /* turbo warmup */ -static int l2arc_norw = B_FALSE; /* no reads during writes */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 3d3a9c713568..51165d0bf723 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -static unsigned int zfetch_max_distance = 8 * 1024 * 1024; +unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -static unsigned int zfetch_max_distance = 64 * 1024 * 1024; +unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in 
a stream */ -static unsigned int zfetch_hole_shift = 2; +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 654e034de9e1..c8d7280387a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -static int vdev_validate_skip = B_FALSE; +int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index e69e5598939e..c12713b107bf 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -static uint_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2ce0121324ad..2f7a739da241 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -105,7 +105,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +137,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. 
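The pattern behind the preceding OpenZFS hunks: a tunable loses its static qualifier in the common code so that the FreeBSD glue in sysctl_os.c can attach a legacy vfs.zfs.* name to it through an extern declaration, and handlers registered with arg2 = 1 warn only when reached via the old spelling. A condensed sketch under those assumptions; zfs_foo_limit and param_set_foo_limit are hypothetical stand-ins:

/* module/zfs/foo.c (common code): no longer static. */
uint64_t zfs_foo_limit = 1024;

/* module/os/freebsd/zfs/sysctl_os.c (platform glue): */
extern uint64_t zfs_foo_limit;

static int
param_set_foo_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val = zfs_foo_limit;
	int err;

	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);
	zfs_foo_limit = val;
	if (arg2 != 0)		/* registered under the legacy name */
		warn_deprecated_sysctl("foo_limit", "foo.limit");
	return (0);
}

SYSCTL_PROC(_vfs_zfs, OID_AUTO, foo_limit,
    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
    NULL, 1, param_set_foo_limit, "QU",
    "Foo limit in bytes (LEGACY)");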
*/ -static int zfs_removal_suspend_progress = 0; +int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 54b50c9dba77..127ea188f17f 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -76,8 +76,8 @@ READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled -REMOVAL_SUSPEND_PROGRESS vdev.removal_suspend_progress zfs_removal_suspend_progress -REMOVE_MAX_SEGMENT vdev.remove_max_segment zfs_remove_max_segment +REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress +REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms RESILVER_DEFER_PERCENT resilver_defer_percent zfs_resilver_defer_percent SCAN_LEGACY scan_legacy zfs_scan_legacy diff --git a/sys/crypto/chacha20/chacha.c b/sys/crypto/chacha20/chacha.c index 52f7e18c651c..cb06003b0ecf 100644 --- a/sys/crypto/chacha20/chacha.c +++ b/sys/crypto/chacha20/chacha.c @@ -138,7 +138,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) for (;;) { if (bytes < 64) { #ifndef KEYSTREAM_ONLY - for (i = 0;i < bytes;++i) tmp[i] = m[i]; + for (i = 0; i < bytes; ++i) tmp[i] = m[i]; m = tmp; #endif ctarget = c; @@ -160,7 +160,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) x13 = j13; x14 = j14; x15 = j15; - for (i = 20;i > 0;i -= 2) { + for (i = 20; i > 0; i -= 2) { QUARTERROUND( x0, x4, x8,x12) QUARTERROUND( x1, x5, x9,x13) QUARTERROUND( x2, x6,x10,x14) @@ -240,7 +240,7 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) if (bytes <= 64) { if (bytes < 64) { - for (i = 0;i < bytes;++i) ctarget[i] = c[i]; + for (i = 0; i < bytes; ++i) ctarget[i] = c[i]; } x->input[12] = j12; x->input[13] = j13; diff --git a/sys/crypto/openssl/ossl_sha256.c b/sys/crypto/openssl/ossl_sha256.c index 4613a9409b44..50cb9739d114 100644 --- a/sys/crypto/openssl/ossl_sha256.c +++ b/sys/crypto/openssl/ossl_sha256.c @@ -74,11 +74,11 @@ ossl_sha256_init(void *c_) unsigned int nn; \ switch ((c)->md_len) \ { case SHA224_DIGEST_LENGTH: \ - for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \ + for (nn=0; nn < SHA224_DIGEST_LENGTH / 4; nn++) \ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \ break; \ case SHA256_DIGEST_LENGTH: \ - for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \ + for (nn=0; nn < SHA256_DIGEST_LENGTH / 4; nn++) \ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \ break; \ default: \ diff --git a/sys/dev/aic7xxx/aic79xx.c b/sys/dev/aic7xxx/aic79xx.c index 2b5015b20e41..cee45fa5cc8a 100644 --- a/sys/dev/aic7xxx/aic79xx.c +++ b/sys/dev/aic7xxx/aic79xx.c @@ -7788,8 +7788,8 @@ ahd_abort_scbs(struct ahd_softc *ahd, int target, char channel, } if (role != ROLE_TARGET) { - for (;i < maxtarget; i++) { - for (j = minlun;j < maxlun; j++) { + for (; i < maxtarget; i++) { + for (j = minlun; j < maxlun; j++) { u_int scbid; u_int tcl; diff --git a/sys/dev/aic7xxx/aic7xxx.c b/sys/dev/aic7xxx/aic7xxx.c index c09876e9f589..18f68b806948 100644 --- a/sys/dev/aic7xxx/aic7xxx.c +++ b/sys/dev/aic7xxx/aic7xxx.c @@ -5903,8 +5903,8 @@ ahc_abort_scbs(struct ahc_softc *ahc, int target, char 
channel, } if (role != ROLE_TARGET) { - for (;i < maxtarget; i++) { - for (j = minlun;j < maxlun; j++) { + for (; i < maxtarget; i++) { + for (j = minlun; j < maxlun; j++) { u_int scbid; u_int tcl; diff --git a/sys/dev/dc/if_dc.c b/sys/dev/dc/if_dc.c index bed74c3b6181..5c1d7ff30976 100644 --- a/sys/dev/dc/if_dc.c +++ b/sys/dev/dc/if_dc.c @@ -999,7 +999,7 @@ dc_setfilt_21143(struct dc_softc *sc) else DC_CLRBIT(sc, DC_NETCFG, DC_NETCFG_RX_ALLMULTI); - if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sp); + if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sc); if (if_getflags(ifp) & IFF_BROADCAST) { h = dc_mchash_le(sc, if_getbroadcastaddr(ifp)); diff --git a/sys/dev/enetc/if_enetc.c b/sys/dev/enetc/if_enetc.c index 808397b229a7..53002f9d73ce 100644 --- a/sys/dev/enetc/if_enetc.c +++ b/sys/dev/enetc/if_enetc.c @@ -848,7 +848,7 @@ enetc_hash_vid(uint16_t vid) bool bit; int i; - for (i = 0;i < 6;i++) { + for (i = 0; i < 6; i++) { bit = vid & BIT(i); bit ^= !!(vid & BIT(i + 6)); hash |= bit << i; @@ -1020,7 +1020,7 @@ enetc_msix_intr_assign(if_ctx_t ctx, int msix) ENETC_RBICR0_ICEN | ENETC_RBICR0_SET_ICPT(ENETC_RX_INTR_PKT_THR)); } vector = 0; - for (i = 0;i < sc->tx_num_queues; i++, vector++) { + for (i = 0; i < sc->tx_num_queues; i++, vector++) { tx_queue = &sc->tx_queues[i]; snprintf(irq_name, sizeof(irq_name), "txq%d", i); iflib_softirq_alloc_generic(ctx, &tx_queue->irq, @@ -1130,7 +1130,7 @@ enetc_isc_txd_encap(void *data, if_pkt_info_t ipi) } /* Now add remaining descriptors. */ - for (;i < ipi->ipi_nsegs; i++) { + for (; i < ipi->ipi_nsegs; i++) { desc = &queue->ring[pidx]; bzero(desc, sizeof(*desc)); desc->addr = segs[i].ds_addr; diff --git a/sys/dev/fdt/fdt_slicer.c b/sys/dev/fdt/fdt_slicer.c index 3ba4eddf8b61..50112db5cfae 100644 --- a/sys/dev/fdt/fdt_slicer.c +++ b/sys/dev/fdt/fdt_slicer.c @@ -45,7 +45,7 @@ static int fill_slices(device_t dev, const char *provider, struct flash_slice *slices, int *slices_num); -static void fdt_slicer_init(void); +static void fdt_slicer_init(void *); static int fill_slices_from_node(phandle_t node, struct flash_slice *slices, int *count) @@ -138,7 +138,7 @@ fill_slices(device_t dev, const char *provider __unused, } static void -fdt_slicer_init(void) +fdt_slicer_init(void *dummy __unused) { flash_register_slicer(fill_slices, FLASH_SLICES_TYPE_NAND, false); @@ -147,7 +147,7 @@ fdt_slicer_init(void) } static void -fdt_slicer_cleanup(void) +fdt_slicer_cleanup(void *dummy __unused) { flash_register_slicer(NULL, FLASH_SLICES_TYPE_NAND, true); diff --git a/sys/dev/hptmv/entry.c b/sys/dev/hptmv/entry.c index 5c4718bf582f..f3d58f285b39 100644 --- a/sys/dev/hptmv/entry.c +++ b/sys/dev/hptmv/entry.c @@ -430,7 +430,7 @@ static void device_change(IAL_ADAPTER_T *pAdapter , MV_U8 channelIndex, int plug if(pVDev->pParent) { int iMember; - for(iMember = 0; iMember < pVDev->pParent->u.array.bArnMember; iMember++) + for (iMember = 0; iMember < pVDev->pParent->u.array.bArnMember; iMember++) if((PVDevice)pVDev->pParent->u.array.pMember[iMember] == pVDev) pVDev->pParent->u.array.pMember[iMember] = NULL; pVDev->pParent = NULL; @@ -984,7 +984,7 @@ fRegisterVdevice(IAL_ADAPTER_T *pAdapter) PVBus pVBus; int i,j; - for(i=0;i<MV_SATA_CHANNELS_NUM;i++) { + for (i = 0; i < MV_SATA_CHANNELS_NUM; i++) { pPhysical = &(pAdapter->VDevices[i]); pLogical = pPhysical; while (pLogical->pParent) pLogical = pLogical->pParent; @@ -1027,8 +1027,7 @@ GetSpareDisk(_VBUS_ARG PVDevice pArray) PVDevice pVDevice, pFind = NULL; int i; - for(i=0;i<MV_SATA_CHANNELS_NUM;i++) - { + for (i=0; i < 
MV_SATA_CHANNELS_NUM; i++) { pVDevice = &pAdapter->VDevices[i]; if(!pVDevice) continue; @@ -1356,7 +1355,7 @@ unregister: goto unregister; } - for (i=0; i<MAX_COMMAND_BLOCKS_FOR_EACH_VBUS; i++) { + for (i = 0; i < MAX_COMMAND_BLOCKS_FOR_EACH_VBUS; i++) { FreeCommand(_VBUS_P &(pAdapter->pCommandBlocks[i])); } @@ -1370,7 +1369,7 @@ unregister: memset((void *)pAdapter->pbus_dmamap, 0, sizeof(struct _BUS_DMAMAP) * MAX_QUEUE_COMM); pAdapter->pbus_dmamap_list = 0; - for (i=0; i < MAX_QUEUE_COMM; i++) { + for (i = 0; i < MAX_QUEUE_COMM; i++) { PBUS_DMAMAP pmap = &(pAdapter->pbus_dmamap[i]); pmap->pAdapter = pAdapter; dmamap_put(pmap); @@ -1398,7 +1397,7 @@ unregister: pAdapter->prdTableAlignedAddr = (PUCHAR)(((ULONG_PTR)pAdapter->prdTableAddr + 0x1f) & ~(ULONG_PTR)0x1fL); { PUCHAR PRDTable = pAdapter->prdTableAlignedAddr; - for (i=0; i<PRD_TABLES_FOR_VBUS; i++) + for (i = 0; i < PRD_TABLES_FOR_VBUS; i++) { /* KdPrint(("i=%d,pAdapter->pFreePRDLink=%p\n",i,pAdapter->pFreePRDLink)); */ FreePRDTable(pAdapter, PRDTable); @@ -1447,7 +1446,7 @@ unregister: } #ifdef SUPPORT_ARRAY - for(i = MAX_ARRAY_DEVICE - 1; i >= 0; i--) { + for (i = MAX_ARRAY_DEVICE - 1; i >= 0; i--) { pVDev = ArrayTables(i); mArFreeArrayTable(pVDev); } @@ -1467,7 +1466,7 @@ unregister: _vbus_p->nInstances = 1; fRegisterVdevice(pAdapter); - for (channel=0;channel<MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { pVDev = _vbus_p->pVDevice[channel]; if (pVDev && pVDev->vf_online) fCheckBootable(pVDev); @@ -1567,7 +1566,7 @@ fResetActiveCommands(PVBus _vbus_p) { MV_SATA_ADAPTER *pMvSataAdapter = &((IAL_ADAPTER_T *)_vbus_p->OsExt)->mvSataAdapter; MV_U8 channel; - for (channel=0;channel< MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { if (pMvSataAdapter->sataChannel[channel] && pMvSataAdapter->sataChannel[channel]->outstandingCommands) MvSataResetChannel(pMvSataAdapter,channel); } @@ -1590,7 +1589,7 @@ check_cmds: dataxfer_poll(); xor_poll(); #endif - for (channel=0;channel< MV_SATA_CHANNELS_NUM;channel++) { + for (channel = 0; channel < MV_SATA_CHANNELS_NUM; channel++) { pMvSataChannel = pMvSataAdapter->sataChannel[channel]; if (pMvSataChannel && pMvSataChannel->outstandingCommands) { @@ -1716,7 +1715,7 @@ fDeviceSendCommand(_VBUS_ARG PCommand pCmd) MV_BOOLEAN is48bit; MV_U8 channel; - int i=0; + int i = 0; DECLARE_BUFFER(FPSCAT_GATH, tmpSg); @@ -2141,7 +2140,7 @@ FlushAdapter(IAL_ADAPTER_T *pAdapter) hpt_printk(("flush all devices\n")); /* flush all devices */ - for (i=0; i<MAX_VDEVICE_PER_VBUS; i++) { + for (i = 0; i < MAX_VDEVICE_PER_VBUS; i++) { PVDevice pVDev = pAdapter->VBus.pVDevice[i]; if(pVDev) fFlushVDev(pVDev); } @@ -2174,7 +2173,7 @@ Check_Idle_Call(IAL_ADAPTER_T *pAdapter) { int i; PVDevice pArray; - for(i = 0; i < MAX_ARRAY_PER_VBUS; i++){ + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) { if ((pArray=ArrayTables(i))->u.array.dArStamp==0) continue; else if (pArray->u.array.rf_auto_rebuild) { @@ -2378,7 +2377,7 @@ hpt_free_ccb(union ccb **ccb_Q, union ccb *ccb) static void hpt_worker_thread(void) { - for(;;) { + for (;;) { mtx_lock(&DpcQueue_Lock); while (DpcQueue_First!=DpcQueue_Last) { ST_HPT_DPC p; @@ -2418,7 +2417,7 @@ static void hpt_worker_thread(void) mtx_lock(&pAdapter->lock); _vbus_p = &pAdapter->VBus; - for (i=0;i<MAX_ARRAY_PER_VBUS;i++) + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) { if ((pArray=ArrayTables(i))->u.array.dArStamp==0) continue; @@ -2472,7 +2471,7 @@ launch_worker_thread(void) int i; PVDevice pVDev; - for(i = 
0; i < MAX_ARRAY_PER_VBUS; i++) + for (i = 0; i < MAX_ARRAY_PER_VBUS; i++) if ((pVDev=ArrayTables(i))->u.array.dArStamp==0) continue; else{ diff --git a/sys/dev/hptmv/gui_lib.c b/sys/dev/hptmv/gui_lib.c index d78fdcca69d2..f11044db733a 100644 --- a/sys/dev/hptmv/gui_lib.c +++ b/sys/dev/hptmv/gui_lib.c @@ -86,8 +86,7 @@ check_VDevice_valid(PVDevice p) while(pAdapter != NULL) { _vbus_p = &pAdapter->VBus; - for (i=0;i<MAX_ARRAY_PER_VBUS;i++) - { + for (i = 0; i<MAX_ARRAY_PER_VBUS; i++) { pVDevice=ArrayTables(i); if ((pVDevice->u.array.dArStamp != 0) && (pVDevice == p)) return 0; @@ -244,9 +243,9 @@ static void get_array_info(PVDevice pVDevice, PHPT_ARRAY_INFO pArrayInfo) if(pVDevice->u.array.pMember[i] != NULL) pArrayInfo->Members[pArrayInfo->nDisk++] = VDEV_TO_ID(pVDevice->u.array.pMember[i]); - for(i=pArrayInfo->nDisk; i<MAX_ARRAY_MEMBERS; i++) + for (i = pArrayInfo->nDisk; i < MAX_ARRAY_MEMBERS; i++) pArrayInfo->Members[i] = INVALID_DEVICEID; - } +} static void get_array_info_v2(PVDevice pVDevice, PHPT_ARRAY_INFO_V2 pArrayInfo) { @@ -266,7 +265,7 @@ static void get_array_info_v2(PVDevice pVDevice, PHPT_ARRAY_INFO_V2 pArrayInfo) if(pVDevice->u.array.pMember[i] != NULL) pArrayInfo->Members[pArrayInfo->nDisk++] = VDEV_TO_ID(pVDevice->u.array.pMember[i]); - for(i=pArrayInfo->nDisk; i<MAX_ARRAY_MEMBERS_V2; i++) + for (i = pArrayInfo->nDisk; i < MAX_ARRAY_MEMBERS_V2; i++) pArrayInfo->Members[i] = INVALID_DEVICEID; } #endif @@ -461,8 +460,7 @@ found: pInfo->IoPort = 0; pInfo->ControlPort = 0; - for (i=0; i<2 ;i++) - { + for (i = 0; i < 2; i++) { pInfo->Devices[i] = (DEVICEID)INVALID_DEVICEID; } diff --git a/sys/dev/hptmv/hptproc.c b/sys/dev/hptmv/hptproc.c index 38fe61ee7e04..328750d9034c 100644 --- a/sys/dev/hptmv/hptproc.c +++ b/sys/dev/hptmv/hptproc.c @@ -107,7 +107,7 @@ hpt_set_asc_info(IAL_ADAPTER_T *pAdapter, char *buffer,int length) return -EINVAL; } - for (i=0;i<MV_SATA_CHANNELS_NUM;i++) + for (i = 0; i < MV_SATA_CHANNELS_NUM; i++) if(i == ichan) goto rebuild; diff --git a/sys/dev/ice/ice_common.c b/sys/dev/ice/ice_common.c index ad4ea4c8e7a1..b895f661bc46 100644 --- a/sys/dev/ice/ice_common.c +++ b/sys/dev/ice/ice_common.c @@ -213,6 +213,15 @@ int ice_set_mac_type(struct ice_hw *hw) case ICE_DEV_ID_E830_L_QSFP: case ICE_DEV_ID_E830C_SFP: case ICE_DEV_ID_E830_L_SFP: + case ICE_DEV_ID_E835CC_BACKPLANE: + case ICE_DEV_ID_E835CC_QSFP56: + case ICE_DEV_ID_E835CC_SFP: + case ICE_DEV_ID_E835C_BACKPLANE: + case ICE_DEV_ID_E835C_QSFP: + case ICE_DEV_ID_E835C_SFP: + case ICE_DEV_ID_E835_L_BACKPLANE: + case ICE_DEV_ID_E835_L_QSFP: + case ICE_DEV_ID_E835_L_SFP: hw->mac_type = ICE_MAC_E830; break; default: diff --git a/sys/dev/ice/ice_devids.h b/sys/dev/ice/ice_devids.h index 3f91e9dfbcaf..74712c61ae8e 100644 --- a/sys/dev/ice/ice_devids.h +++ b/sys/dev/ice/ice_devids.h @@ -62,6 +62,24 @@ #define ICE_DEV_ID_E830C_SFP 0x12DA /* Intel(R) Ethernet Controller E830-L for SFP */ #define ICE_DEV_ID_E830_L_SFP 0x12DE +/* Intel(R) Ethernet Controller E835-CC for backplane */ +#define ICE_DEV_ID_E835CC_BACKPLANE 0x1248 +/* Intel(R) Ethernet Controller E835-CC for QSFP */ +#define ICE_DEV_ID_E835CC_QSFP56 0x1249 +/* Intel(R) Ethernet Controller E835-CC for SFP */ +#define ICE_DEV_ID_E835CC_SFP 0x124A +/* Intel(R) Ethernet Controller E835-C for backplane */ +#define ICE_DEV_ID_E835C_BACKPLANE 0x1261 +/* Intel(R) Ethernet Controller E835-C for QSFP */ +#define ICE_DEV_ID_E835C_QSFP 0x1262 +/* Intel(R) Ethernet Controller E835-C for SFP */ +#define ICE_DEV_ID_E835C_SFP 0x1263 +/* Intel(R) Ethernet Controller 
E835-L for backplane */ +#define ICE_DEV_ID_E835_L_BACKPLANE 0x1265 +/* Intel(R) Ethernet Controller E835-L for QSFP */ +#define ICE_DEV_ID_E835_L_QSFP 0x1266 +/* Intel(R) Ethernet Controller E835-L for SFP */ +#define ICE_DEV_ID_E835_L_SFP 0x1267 /* Intel(R) Ethernet Controller E810-C for backplane */ #define ICE_DEV_ID_E810C_BACKPLANE 0x1591 /* Intel(R) Ethernet Controller E810-C for QSFP */ diff --git a/sys/dev/ice/ice_drv_info.h b/sys/dev/ice/ice_drv_info.h index 2a51a7394424..46965f4124bc 100644 --- a/sys/dev/ice/ice_drv_info.h +++ b/sys/dev/ice/ice_drv_info.h @@ -218,6 +218,45 @@ static const pci_vendor_info_t ice_vendor_info_array[] = { "Intel(R) Ethernet Network Adapter E830-XXV-2"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E830_L_SFP, "Intel(R) Ethernet Connection E830-L for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_BACKPLANE, + "Intel(R) Ethernet Connection E835-CC for backplane"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0001, 0, + "Intel(R) Ethernet Network Adapter E835-C-Q2"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0002, 0, + "Intel(R) Ethernet Network Adapter E835-C-Q2 for OCP 3.0"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0003, 0, + "Intel(R) Ethernet Network Adapter E835-CC-Q1"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + ICE_INTEL_VENDOR_ID, 0x0004, 0, + "Intel(R) Ethernet Network Adapter E835-CC-Q1 for OCP 3.0"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_QSFP56, + "Intel(R) Ethernet Connection E835-CC for QSFP56"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0001, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-2 for OCP 3.0"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0003, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-2"), + PVIDV_OEM(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + ICE_INTEL_VENDOR_ID, 0x0004, 0, + "Intel(R) Ethernet Network Adapter E835-XXV-4 for OCP 3.0"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835CC_SFP, + "Intel(R) Ethernet Connection E835-CC for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_BACKPLANE, + "Intel(R) Ethernet Connection E835-C for backplane"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_QSFP, + "Intel(R) Ethernet Connection E835-C for QSFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835C_SFP, + "Intel(R) Ethernet Connection E835-C for SFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_BACKPLANE, + "Intel(R) Ethernet Connection E835-L for backplane"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_QSFP, + "Intel(R) Ethernet Connection E835-L for QSFP"), + PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E835_L_SFP, + "Intel(R) Ethernet Connection E835-L for SFP"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E825C_BACKPLANE, "Intel(R) Ethernet Connection E825-C for backplane"), PVIDV(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E825C_QSFP, diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c index ffa8dc096adc..80e37341b3dc 100644 --- a/sys/dev/iommu/iommu_gas.c +++ b/sys/dev/iommu/iommu_gas.c @@ -77,7 +77,7 @@ static int iommu_check_free; #endif static void -intel_gas_init(void) +intel_gas_init(void *dummy __unused) { iommu_map_entry_zone = uma_zcreate("IOMMU_MAP_ENTRY", diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index 6d08bd49bc04..1d36fd11f368 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -192,6 +192,8 @@ static int ixgbe_if_i2c_req(if_ctx_t, struct ifi2creq *); 
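The if_ix.c changes that follow wire up a driver-private ioctl handler. For orientation, a hedged userland sketch of how such requests reach it: SIOCSDRVSPEC carries a struct ifdrv whose ifd_cmd selects the driver command. The interface name ix0 is an example, s is any open socket descriptor (e.g. from socket(AF_INET, SOCK_DGRAM, 0)), and IXGBE_DEBUG_DUMP plus the buffer layout come from the ixgbe.h hunk further below:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <string.h>

static int
request_debug_dump(int s, void *buf, size_t len)
{
	struct ifdrv ifd;

	memset(&ifd, 0, sizeof(ifd));
	strlcpy(ifd.ifd_name, "ix0", sizeof(ifd.ifd_name));
	ifd.ifd_cmd = IXGBE_DEBUG_DUMP;	/* dispatched by ixgbe_if_priv_ioctl() */
	ifd.ifd_len = len;		/* ixgbe_debug_dump_cmd header + data area */
	ifd.ifd_data = buf;

	/* iflib forwards SIOCSDRVSPEC to the IFDI_PRIV_IOCTL method. */
	return (ioctl(s, SIOCSDRVSPEC, &ifd));
}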
static bool ixgbe_if_needs_restart(if_ctx_t, enum iflib_restart_event); int ixgbe_intr(void *); +static int ixgbe_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data); + /************************************************************************ * Function prototypes ************************************************************************/ @@ -239,6 +241,13 @@ static void ixgbe_setup_vlan_hw_support(if_ctx_t); static void ixgbe_config_gpie(struct ixgbe_softc *); static void ixgbe_config_delay_values(struct ixgbe_softc *); +static void ixgbe_add_debug_sysctls(struct ixgbe_softc *sc); +static void ixgbe_add_debug_dump_sysctls(struct ixgbe_softc *sc); +static int ixgbe_debug_dump_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd); +static u8 ixgbe_debug_dump_print_cluster(struct ixgbe_softc *sc, + struct sbuf *sbuf, u8 cluster_id); +static int ixgbe_nvm_access_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd); + /* Sysctl handlers */ static int ixgbe_sysctl_flowcntl(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_advertise(SYSCTL_HANDLER_ARGS); @@ -260,6 +269,9 @@ static int ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_tso_tcp_flags_mask(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_debug_dump_set_clusters(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_dump_debug_dump(SYSCTL_HANDLER_ARGS); + /* Deferred interrupt tasklets */ static void ixgbe_handle_msf(void *); static void ixgbe_handle_mod(void *); @@ -330,6 +342,7 @@ static device_method_t ixgbe_if_methods[] = { DEVMETHOD(ifdi_get_counter, ixgbe_if_get_counter), DEVMETHOD(ifdi_i2c_req, ixgbe_if_i2c_req), DEVMETHOD(ifdi_needs_restart, ixgbe_if_needs_restart), + DEVMETHOD(ifdi_priv_ioctl, ixgbe_if_priv_ioctl), #ifdef PCI_IOV DEVMETHOD(ifdi_iov_init, ixgbe_if_iov_init), DEVMETHOD(ifdi_iov_uninit, ixgbe_if_iov_uninit), @@ -1015,6 +1028,8 @@ ixgbe_if_attach_pre(if_ctx_t ctx) if (hw->mac.type == ixgbe_mac_E610) ixgbe_init_aci(hw); + sc->do_debug_dump = false; + if (hw->mac.ops.fw_recovery_mode && hw->mac.ops.fw_recovery_mode(hw)) { device_printf(dev, @@ -1395,6 +1410,248 @@ ixgbe_if_needs_restart(if_ctx_t ctx __unused, enum iflib_restart_event event) } /************************************************************************ + * ixgbe_if_priv_ioctl - Ioctl handler for driver + * + * Handler for custom driver specific ioctls + * + * return 0 on success, positive on failure + ************************************************************************/ +static int +ixgbe_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data) +{ + struct ixgbe_softc *sc = iflib_get_softc(ctx); + struct ifdrv *ifd; + device_t dev = sc->dev; + + /* Make sure the command type is valid */ + switch (command) { + case SIOCSDRVSPEC: + case SIOCGDRVSPEC: + /* Accepted commands */ + break; + case SIOCGPRIVATE_0: + /* + * Although we do not support this ioctl command, it's expected + * that iflib will forward it to the IFDI_PRIV_IOCTL handler. + * Do not print a message in this case. + */ + return (ENOTSUP); + default: + /* + * If we get a different command for this function, it's + * definitely unexpected, so log a message indicating what + * command we got for debugging purposes. 
+ */ + device_printf(dev, + "%s: unexpected ioctl command %08lx\n", + __func__, command); + return (EINVAL); + } + + ifd = (struct ifdrv *)data; + + switch (ifd->ifd_cmd) { + case IXGBE_NVM_ACCESS: + IOCTL_DEBUGOUT("ioctl: NVM ACCESS"); + return (ixgbe_nvm_access_ioctl(sc, ifd)); + case IXGBE_DEBUG_DUMP: + IOCTL_DEBUGOUT("ioctl: DEBUG DUMP"); + return (ixgbe_debug_dump_ioctl(sc, ifd)); + default: + IOCTL_DEBUGOUT1( + "ioctl: UNKNOWN SIOC(S|G)DRVSPEC (0x%X) command\n", + (int)ifd->ifd_cmd); + return (EINVAL); + } + + return (0); +} + +/************************************************************************ + * ixgbe_nvm_access_ioctl + * + * Handles an NVM access ioctl request + ************************************************************************/ +static int +ixgbe_nvm_access_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd) +{ + struct ixgbe_nvm_access_data *data; + struct ixgbe_nvm_access_cmd *cmd; + struct ixgbe_hw *hw = &sc->hw; + size_t ifd_len = ifd->ifd_len; + size_t malloc_len; + device_t dev = sc->dev; + u8 *nvm_buffer; + s32 error = 0; + + /* + * ifioctl forwards SIOCxDRVSPEC to iflib without conducting + * a privilege check. Subsequently, iflib passes the ioctl to the driver + * without verifying privileges. To prevent non-privileged threads from + * accessing this interface, perform a privilege check at this point. + */ + error = priv_check(curthread, PRIV_DRIVER); + if (error) + return (error); + + if (ifd_len < sizeof(*cmd)) { + device_printf(dev, + "%s: ifdrv length is too small. Got %zu, " + "but expected %zu\n", + __func__, ifd_len, sizeof(*cmd)); + return (EINVAL); + } + + if (ifd->ifd_data == NULL) { + device_printf(dev, "%s: No ifd data buffer.\n", + __func__); + return (EINVAL); + } + + malloc_len = max(ifd_len, sizeof(*data) + sizeof(*cmd)); + + nvm_buffer = (u8 *)malloc(malloc_len, M_IXGBE, M_ZERO | M_NOWAIT); + if (!nvm_buffer) + return (ENOMEM); + + /* Copy the NVM access command and data in from user space */ + error = copyin(ifd->ifd_data, nvm_buffer, ifd_len); + if (error) { + device_printf(dev, "%s: Failed to copy data in, error: %d\n", + __func__, error); + goto cleanup_free_nvm_buffer; + } + + /* + * The NVM command structure is immediately followed by data which + * varies in size based on the command. + */ + cmd = (struct ixgbe_nvm_access_cmd *)nvm_buffer; + data = (struct ixgbe_nvm_access_data *) + (nvm_buffer + sizeof(struct ixgbe_nvm_access_cmd)); + + /* Handle the NVM access request */ + error = ixgbe_handle_nvm_access(hw, cmd, data); + if (error) { + device_printf(dev, "%s: NVM access request failed, error %d\n", + __func__, error); + } + + /* Copy the possibly modified contents of the handled request out */ + error = copyout(nvm_buffer, ifd->ifd_data, ifd_len); + if (error) { + device_printf(dev, "%s: Copying response back to " + "user space failed, error %d\n", + __func__, error); + goto cleanup_free_nvm_buffer; + } + +cleanup_free_nvm_buffer: + free(nvm_buffer, M_IXGBE); + return (error); +} + +/************************************************************************ + * ixgbe_debug_dump_ioctl + * + * Makes debug dump of internal FW/HW data. 
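Both ixgbe_nvm_access_ioctl() above and the debug dump handler below share one discipline: an explicit priv_check() (since neither ifioctl nor iflib verifies privileges for SIOCxDRVSPEC), strict length validation, then a copyin/act/copyout round trip on a kernel-side bounce buffer. A stripped-down sketch of that shape; the handler name and the M_TEMP malloc type are illustrative, not part of the patch:

static int
example_drvspec_handler(struct ifdrv *ifd)
{
	void *buf;
	int error;

	/* No upstream privilege check on SIOCxDRVSPEC; do it here. */
	error = priv_check(curthread, PRIV_DRIVER);
	if (error != 0)
		return (error);
	if (ifd->ifd_len == 0 || ifd->ifd_data == NULL)
		return (EINVAL);

	buf = malloc(ifd->ifd_len, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(ifd->ifd_data, buf, ifd->ifd_len);
	if (error == 0) {
		/* ... validate and act on the request, updating buf ... */
		error = copyout(buf, ifd->ifd_data, ifd->ifd_len);
	}
	free(buf, M_TEMP);
	return (error);
}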
+ ************************************************************************/ +static int +ixgbe_debug_dump_ioctl(struct ixgbe_softc *sc, struct ifdrv *ifd) +{ + struct ixgbe_debug_dump_cmd *dd_cmd; + struct ixgbe_hw *hw = &sc->hw; + size_t ifd_len = ifd->ifd_len; + device_t dev = sc->dev; + s32 error = 0; + + if (!(sc->feat_en & IXGBE_FEATURE_DBG_DUMP)) + return (ENODEV); + + /* Data returned from ACI command */ + u16 ret_buf_size = 0; + u16 ret_next_cluster = 0; + u16 ret_next_table = 0; + u32 ret_next_index = 0; + + /* + * ifioctl forwards SIOCxDRVSPEC to iflib without conducting + * a privilege check. Subsequently, iflib passes the ioctl to the driver + * without verifying privileges. To prevent non-privileged threads from + * accessing this interface, perform a privilege check at this point. + */ + error = priv_check(curthread, PRIV_DRIVER); + if (error) + return (error); + + if (ifd_len < sizeof(*dd_cmd)) { + device_printf(dev, + "%s: ifdrv length is too small. Got %zu, " + "but expected %zu\n", + __func__, ifd_len, sizeof(*dd_cmd)); + return (EINVAL); + } + + if (ifd->ifd_data == NULL) { + device_printf(dev, "%s: No ifd data buffer.\n", + __func__); + return (EINVAL); + } + + dd_cmd = (struct ixgbe_debug_dump_cmd *)malloc(ifd_len, M_IXGBE, + M_NOWAIT | M_ZERO); + if (!dd_cmd) { + error = ENOMEM; + goto out; + } + /* copy data from userspace */ + error = copyin(ifd->ifd_data, dd_cmd, ifd_len); + if (error) { + device_printf(dev, "%s: Failed to copy data in, error: %d\n", + __func__, error); + goto out; + } + + /* ACI command requires buf_size arg to be greater than 0 */ + if (dd_cmd->data_size == 0) { + device_printf(dev, "%s: data_size must be greater than 0\n", + __func__); + error = EINVAL; + goto out; + } + + /* Zero the data buffer memory space */ + memset(dd_cmd->data, 0, ifd_len - sizeof(*dd_cmd)); + + error = ixgbe_aci_get_internal_data(hw, dd_cmd->cluster_id, + dd_cmd->table_id, dd_cmd->offset, dd_cmd->data, dd_cmd->data_size, + &ret_buf_size, &ret_next_cluster, &ret_next_table, &ret_next_index); + if (error) { + device_printf(dev, + "%s: Failed to get internal FW/HW data, error: %d\n", + __func__, error); + goto out; + } + + dd_cmd->cluster_id = ret_next_cluster; + dd_cmd->table_id = ret_next_table; + dd_cmd->offset = ret_next_index; + dd_cmd->data_size = ret_buf_size; + + error = copyout(dd_cmd, ifd->ifd_data, ifd->ifd_len); + if (error) { + device_printf(dev, + "%s: Failed to copy data out, error: %d\n", + __func__, error); + } + +out: + free(dd_cmd, M_IXGBE); + + return (error); +} + +/************************************************************************ * ixgbe_add_media_types ************************************************************************/ static void @@ -2883,6 +3140,264 @@ ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) } /* ixgbe_sysctl_interrupt_rate_handler */ /************************************************************************ + * ixgbe_debug_dump_print_cluster + ************************************************************************/ +static u8 +ixgbe_debug_dump_print_cluster(struct ixgbe_softc *sc, struct sbuf *sbuf, + u8 cluster_id) +{ + u16 data_buf_size = IXGBE_ACI_MAX_BUFFER_SIZE; + device_t dev = sc->dev; + struct ixgbe_hw *hw = &sc->hw; + const u8 reserved_buf[8] = {}; + int max_aci_calls = 1000; + int error, counter = 0; + u8 *data_buf; + + /* Input parameters / loop variables */ + u16 table_id = 0; + u32 offset = 0; + + /* Data returned from ACI command */ + u16 ret_buf_size = 0; + u16 ret_next_cluster = 0; + u16 ret_next_table
= 0; + u32 ret_next_index = 0; + + data_buf = (u8 *)malloc(data_buf_size, M_IXGBE, M_NOWAIT | M_ZERO); + if (!data_buf) + return (0); + + DEBUGOUT2("%s: dumping cluster id (relative) %d\n", + __func__, cluster_id); + + do { + DEBUGOUT3("table_id 0x%04x offset 0x%08x buf_size %d\n", + table_id, offset, data_buf_size); + + error = ixgbe_aci_get_internal_data(hw, cluster_id, table_id, + offset, data_buf, data_buf_size, &ret_buf_size, + &ret_next_cluster, &ret_next_table, &ret_next_index); + if (error) { + device_printf(dev, + "%s: Failed to get internal FW/HW data, error: %d, " + "last aci status: %d\n", + __func__, error, hw->aci.last_status); + break; + } + + DEBUGOUT3("ret_table_id 0x%04x ret_offset 0x%08x " + "ret_buf_size %d\n", + ret_next_table, ret_next_index, ret_buf_size); + + /* Print cluster id */ + u32 print_cluster_id = (u32)cluster_id; + sbuf_bcat(sbuf, &print_cluster_id, sizeof(print_cluster_id)); + /* Print table id */ + u32 print_table_id = (u32)table_id; + sbuf_bcat(sbuf, &print_table_id, sizeof(print_table_id)); + /* Print table length */ + u32 print_table_length = (u32)ret_buf_size; + sbuf_bcat(sbuf, &print_table_length, + sizeof(print_table_length)); + /* Print current offset */ + u32 print_curr_offset = offset; + sbuf_bcat(sbuf, &print_curr_offset, sizeof(print_curr_offset)); + /* Print reserved bytes */ + sbuf_bcat(sbuf, reserved_buf, sizeof(reserved_buf)); + /* Print data */ + sbuf_bcat(sbuf, data_buf, ret_buf_size); + + /* Prepare for the next loop iteration */ + memset(data_buf, 0, data_buf_size); + + bool last_index = (ret_next_index == 0xffffffff); + bool last_table = ((ret_next_table == 0xff || + ret_next_table == 0xffff) && + last_index); + + if (last_table) { + /* End of the cluster */ + DEBUGOUT1("End of the cluster ID %d\n", cluster_id); + break; + } else if (last_index) { + /* End of the table */ + table_id = ret_next_table; + offset = 0; + } else { + /* More data left in the table */ + offset = ret_next_index; + } + } while (++counter < max_aci_calls); + + if (counter >= max_aci_calls) + device_printf(dev, "Exceeded max number of ACI calls for cluster %d\n", + cluster_id); + + free(data_buf, M_IXGBE); + + return (++cluster_id); +} /* ixgbe_debug_dump_print_cluster */ + +/************************************************************************ + * ixgbe_sysctl_debug_dump_set_clusters + * + * Sets the clusters to dump from FW when a Debug Dump is requested.
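The firmware signals the end of a table with a next-index of 0xffffffff, and the end of a cluster with a next-table of 0xff or 0xffff combined with that index; the loop above decodes this inline. The same decision, factored into a hypothetical helper for clarity (u16/u32 are the driver's fixed-width typedefs):

static bool
dump_walk_advance(u16 *table_id, u32 *offset, u16 next_table, u32 next_index)
{
	bool last_index = (next_index == 0xffffffff);

	if (last_index && (next_table == 0xff || next_table == 0xffff))
		return (false);		/* end of the cluster */
	if (last_index) {
		*table_id = next_table;	/* step to the next table */
		*offset = 0;
	} else {
		*offset = next_index;	/* more data in this table */
	}
	return (true);			/* keep iterating */
}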
+
+/************************************************************************
+ * ixgbe_sysctl_debug_dump_set_clusters
+ *
+ * Sets the clusters to dump from FW when a debug dump is requested.
+ ************************************************************************/
+static int
+ixgbe_sysctl_debug_dump_set_clusters(SYSCTL_HANDLER_ARGS)
+{
+	struct ixgbe_softc *sc = (struct ixgbe_softc *)arg1;
+	u32 clusters = sc->debug_dump_cluster_mask;
+	device_t dev = sc->dev;
+	int error;
+
+	error = sysctl_handle_32(oidp, &clusters, 0, req);
+	if ((error) || !req->newptr)
+		return (error);
+
+	if (clusters & ~(IXGBE_DBG_DUMP_VALID_CLUSTERS_MASK)) {
+		device_printf(dev,
+		    "%s: Unrecognized parameter: %u\n",
+		    __func__, clusters);
+		sc->debug_dump_cluster_mask =
+		    IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID;
+		return (EINVAL);
+	}
+
+	sc->debug_dump_cluster_mask = clusters;
+
+	return (0);
+} /* ixgbe_sysctl_debug_dump_set_clusters */
+
+/************************************************************************
+ * ixgbe_sysctl_dump_debug_dump
+ ************************************************************************/
+static int
+ixgbe_sysctl_dump_debug_dump(SYSCTL_HANDLER_ARGS)
+{
+	struct ixgbe_softc *sc = (struct ixgbe_softc *)arg1;
+	device_t dev = sc->dev;
+	struct sbuf *sbuf;
+	int error = 0;
+
+	UNREFERENCED_PARAMETER(arg2);
+
+	if (!sc->do_debug_dump) {
+		if (req->oldptr == NULL && req->newptr == NULL) {
+			error = SYSCTL_OUT(req, 0, 0);
+			return (error);
+		}
+
+		char input_buf[2] = "";
+		error = sysctl_handle_string(oidp, input_buf,
+		    sizeof(input_buf), req);
+		if ((error) || (req->newptr == NULL))
+			return (error);
+
+		if (input_buf[0] == '1') {
+			if (sc->debug_dump_cluster_mask ==
+			    IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID) {
+				device_printf(dev,
+				    "Debug Dump failed because an invalid "
+				    "cluster was specified.\n");
+				return (EINVAL);
+			}
+
+			sc->do_debug_dump = true;
+			return (0);
+		}
+
+		return (EINVAL);
+	}
+
+	/* Caller just wants the upper bound for size */
+	if (req->oldptr == NULL && req->newptr == NULL) {
+		size_t est_output_len = IXGBE_DBG_DUMP_BASE_SIZE;
+		if (sc->debug_dump_cluster_mask & 0x2)
+			est_output_len += IXGBE_DBG_DUMP_BASE_SIZE;
+		error = SYSCTL_OUT(req, 0, est_output_len);
+		return (error);
+	}
+
+	sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+	sbuf_clear_flags(sbuf, SBUF_INCLUDENUL);
+
+	DEBUGOUT("FW Debug Dump running...\n");
+
+	if (sc->debug_dump_cluster_mask) {
+		for (u8 id = 0; id <= IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX; id++) {
+			if (sc->debug_dump_cluster_mask & BIT(id)) {
+				DEBUGOUT1("Dumping cluster ID %u...\n", id);
+				ixgbe_debug_dump_print_cluster(sc, sbuf, id);
+			}
+		}
+	} else {
+		u8 next_cluster_id = 0;
+		do {
+			DEBUGOUT1("Dumping cluster ID %u...\n",
+			    next_cluster_id);
+			next_cluster_id = ixgbe_debug_dump_print_cluster(sc,
+			    sbuf, next_cluster_id);
+		} while (next_cluster_id != 0 &&
+		    next_cluster_id <= IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX);
+	}
+
+	sbuf_finish(sbuf);
+	sbuf_delete(sbuf);
+
+	sc->do_debug_dump = false;
+
+	return (error);
+} /* ixgbe_sysctl_dump_debug_dump */
+
+/************************************************************************
+ * ixgbe_add_debug_dump_sysctls
+ ************************************************************************/
+static void
+ixgbe_add_debug_dump_sysctls(struct ixgbe_softc *sc)
+{
+	struct sysctl_oid_list *debug_list, *dump_list;
+	struct sysctl_oid *dump_node;
+	struct sysctl_ctx_list *ctx;
+	device_t dev = sc->dev;
+
+	ctx = device_get_sysctl_ctx(dev);
+	debug_list = SYSCTL_CHILDREN(sc->debug_sysctls);
+
+	dump_node = SYSCTL_ADD_NODE(ctx, debug_list, OID_AUTO, "dump",
+	    CTLFLAG_RD, NULL, "Internal FW/HW Dump");
+	dump_list = SYSCTL_CHILDREN(dump_node);
+
+	SYSCTL_ADD_PROC(ctx, dump_list, OID_AUTO,
"clusters", + CTLTYPE_U32 | CTLFLAG_RW, sc, 0, + ixgbe_sysctl_debug_dump_set_clusters, "SU", + IXGBE_SYSCTL_DESC_DEBUG_DUMP_SET_CLUSTER); + + SYSCTL_ADD_PROC(ctx, dump_list, OID_AUTO, "dump", + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + ixgbe_sysctl_dump_debug_dump, "", + IXGBE_SYSCTL_DESC_DUMP_DEBUG_DUMP); +} /* ixgbe_add_debug_dump_sysctls */ + +static void +ixgbe_add_debug_sysctls(struct ixgbe_softc *sc) +{ + struct sysctl_oid_list *ctx_list; + struct sysctl_ctx_list *ctx; + device_t dev = sc->dev; + + ctx = device_get_sysctl_ctx(dev); + ctx_list = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + sc->debug_sysctls = SYSCTL_ADD_NODE(ctx, ctx_list, OID_AUTO, "debug", + CTLFLAG_RD, NULL, "Debug Sysctls"); + + if (sc->feat_en & IXGBE_FEATURE_DBG_DUMP) + ixgbe_add_debug_dump_sysctls(sc); +} /* ixgbe_add_debug_sysctls */ + +/************************************************************************ * ixgbe_add_device_sysctls ************************************************************************/ static void @@ -2992,6 +3507,8 @@ ixgbe_add_device_sysctls(if_ctx_t ctx) CTLTYPE_INT | CTLFLAG_RW, sc, 0, ixgbe_sysctl_eee_state, "I", "EEE Power Save State"); } + + ixgbe_add_debug_sysctls(sc); } /* ixgbe_add_device_sysctls */ /************************************************************************ @@ -5182,6 +5699,7 @@ ixgbe_init_device_features(struct ixgbe_softc *sc) break; case ixgbe_mac_E610: sc->feat_cap |= IXGBE_FEATURE_RECOVERY_MODE; + sc->feat_cap |= IXGBE_FEATURE_DBG_DUMP; break; default: break; @@ -5203,6 +5721,9 @@ ixgbe_init_device_features(struct ixgbe_softc *sc) /* Recovery mode */ if (sc->feat_cap & IXGBE_FEATURE_RECOVERY_MODE) sc->feat_en |= IXGBE_FEATURE_RECOVERY_MODE; + /* FW Debug Dump */ + if (sc->feat_cap & IXGBE_FEATURE_DBG_DUMP) + sc->feat_en |= IXGBE_FEATURE_DBG_DUMP; /* Enabled via global sysctl... */ /* Flow Director */ diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h index 844064bf8543..624b71acabea 100644 --- a/sys/dev/ixgbe/ixgbe.h +++ b/sys/dev/ixgbe/ixgbe.h @@ -46,6 +46,7 @@ #include <sys/module.h> #include <sys/sockio.h> #include <sys/eventhandler.h> +#include <sys/priv.h> #include <net/if.h> #include <net/if_var.h> @@ -475,6 +476,20 @@ struct ixgbe_softc { u32 feat_cap; u32 feat_en; u16 lse_mask; + + struct sysctl_oid *debug_sysctls; + u32 debug_dump_cluster_mask; + bool do_debug_dump; +}; + +struct ixgbe_debug_dump_cmd { + u32 offset; /* offset to read/write from table, in bytes */ + u8 cluster_id; /* also used to get next cluster id */ + u16 table_id; + u16 data_size; /* size of data field, in bytes */ + u16 reserved1; + u32 reserved2; + u8 data[]; }; /* Precision Time Sync (IEEE 1588) defines */ @@ -499,6 +514,43 @@ struct ixgbe_softc { #define IXGBE_PHY_CURRENT_TEMP 0xC820 #define IXGBE_PHY_OVERTEMP_STATUS 0xC830 +/** + * The ioctl command number used by NVM update for accessing the driver for + * NVM access commands. + */ +#define IXGBE_NVM_ACCESS \ + (((((((('E' << 4) + '1') << 4) + 'K') << 4) + 'G') << 4) | 5) + +/* + * The ioctl command number used by a userspace tool for accessing the driver + * for getting debug dump data from the firmware. 
+ */ +#define IXGBE_DEBUG_DUMP \ + (((((((('E' << 4) + '1') << 4) + 'K') << 4) + 'G') << 4) | 6) + +/* Debug Dump related definitions */ +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_INVALID 0xFFFFFF +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_BASE 50 +#define IXGBE_ACI_DBG_DUMP_CLUSTER_ID_MAX 1 + +#define IXGBE_DBG_DUMP_VALID_CLUSTERS_MASK 0x3 +#define IXGBE_DBG_DUMP_BASE_SIZE (2 * 1024 * 1024) + +#define IXGBE_SYSCTL_DESC_DEBUG_DUMP_SET_CLUSTER \ +"\nSelect clusters to dump with \"dump\" sysctl" \ +"\nFlags:" \ +"\n\t 0x1 - Link" \ +"\n\t 0x2 - Full CSR Space, excluding RCW registers" \ +"\n\t" \ +"\nUse \"sysctl -x\" to view flags properly." + +#define IXGBE_SYSCTL_DESC_DUMP_DEBUG_DUMP \ +"\nWrite 1 to output a FW debug dump containing the clusters " \ +"specified by the \"clusters\" sysctl" \ +"\nThe \"-b\" flag must be used in order to dump this data " \ +"as binary data because" \ +"\nthis data is opaque and not a string." + /* Sysctl help messages; displayed with sysctl -d */ #define IXGBE_SYSCTL_DESC_ADV_SPEED \ "\nControl advertised link speed using these flags:\n" \ diff --git a/sys/dev/ixgbe/ixgbe_features.h b/sys/dev/ixgbe/ixgbe_features.h index 0cef334a185f..bee9040319d8 100644 --- a/sys/dev/ixgbe/ixgbe_features.h +++ b/sys/dev/ixgbe/ixgbe_features.h @@ -57,6 +57,7 @@ #define IXGBE_FEATURE_LEGACY_IRQ (u32)(1 << 12) #define IXGBE_FEATURE_NEEDS_CTXD (u32)(1 << 13) #define IXGBE_FEATURE_RECOVERY_MODE (u32)(1 << 15) +#define IXGBE_FEATURE_DBG_DUMP (u32)(1 << 16) /* Check for OS support. Undefine features if not included in the OS */ #ifndef PCI_IOV diff --git a/sys/dev/ixl/if_ixl.c b/sys/dev/ixl/if_ixl.c index 261f76055901..bfaf6cd69e58 100644 --- a/sys/dev/ixl/if_ixl.c +++ b/sys/dev/ixl/if_ixl.c @@ -1480,17 +1480,33 @@ ixl_if_multi_set(if_ctx_t ctx) struct ixl_pf *pf = iflib_get_softc(ctx); struct ixl_vsi *vsi = &pf->vsi; struct i40e_hw *hw = vsi->hw; + enum i40e_status_code status; int mcnt; + if_t ifp = iflib_get_ifp(ctx); IOCTL_DEBUGOUT("ixl_if_multi_set: begin"); /* Delete filters for removed multicast addresses */ ixl_del_multi(vsi, false); - mcnt = min(if_llmaddr_count(iflib_get_ifp(ctx)), MAX_MULTICAST_ADDR); + mcnt = min(if_llmaddr_count(ifp), MAX_MULTICAST_ADDR); if (__predict_false(mcnt == MAX_MULTICAST_ADDR)) { - i40e_aq_set_vsi_multicast_promiscuous(hw, + /* Check if promisc mode is already enabled, if yes return */ + if (vsi->flags & IXL_FLAGS_MC_PROMISC) + return; + + status = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, TRUE, NULL); + if (status != I40E_SUCCESS) + if_printf(ifp, "Failed to enable multicast promiscuous " + "mode, status: %s\n", i40e_stat_str(hw, status)); + else { + if_printf(ifp, "Enabled multicast promiscuous mode\n"); + + /* Set the flag to track promiscuous mode */ + vsi->flags |= IXL_FLAGS_MC_PROMISC; + } + /* Delete all existing MC filters */ ixl_del_multi(vsi, true); return; } @@ -1693,6 +1709,13 @@ ixl_if_promisc_set(if_ctx_t ctx, int flags) return (err); err = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, multi, NULL); + + /* Update the multicast promiscuous flag based on the new state */ + if (multi) + vsi->flags |= IXL_FLAGS_MC_PROMISC; + else + vsi->flags &= ~IXL_FLAGS_MC_PROMISC; + return (err); } diff --git a/sys/dev/ixl/ixl.h b/sys/dev/ixl/ixl.h index 95379448b570..ab0f38307d90 100644 --- a/sys/dev/ixl/ixl.h +++ b/sys/dev/ixl/ixl.h @@ -202,6 +202,7 @@ #define IXL_FLAGS_KEEP_TSO6 (1 << 1) #define IXL_FLAGS_USES_MSIX (1 << 2) #define IXL_FLAGS_IS_VF (1 << 3) +#define IXL_FLAGS_MC_PROMISC (1 << 4) #define IXL_VSI_IS_PF(v) ((v->flags & 
IXL_FLAGS_IS_VF) == 0)
 #define IXL_VSI_IS_VF(v) ((v->flags & IXL_FLAGS_IS_VF) != 0)
diff --git a/sys/dev/ixl/ixl_pf_main.c b/sys/dev/ixl/ixl_pf_main.c
index 1752efc02fff..b62619ced5cb 100644
--- a/sys/dev/ixl/ixl_pf_main.c
+++ b/sys/dev/ixl/ixl_pf_main.c
@@ -593,24 +593,29 @@ ixl_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
  * Routines for multicast and vlan filter management.
  *
  *********************************************************************/
+
+/**
+ * ixl_add_multi - Add multicast filters to the hardware
+ * @vsi: The VSI structure
+ *
+ * If the number of multicast filters in the IFP exceeds 127 entries,
+ * multicast promiscuous mode will be enabled and the filters will be
+ * removed from the hardware.
+ */
 void
 ixl_add_multi(struct ixl_vsi *vsi)
 {
 	if_t ifp = vsi->ifp;
-	struct i40e_hw *hw = vsi->hw;
 	int mcnt = 0;
 	struct ixl_add_maddr_arg cb_arg;
 
 	IOCTL_DEBUGOUT("ixl_add_multi: begin");
 
-	mcnt = if_llmaddr_count(ifp);
-	if (__predict_false(mcnt >= MAX_MULTICAST_ADDR)) {
-		i40e_aq_set_vsi_multicast_promiscuous(hw,
-		    vsi->seid, TRUE, NULL);
-		/* delete all existing MC filters */
-		ixl_del_multi(vsi, true);
-		return;
-	}
+	/*
+	 * There is no need to check if the number of multicast addresses
+	 * exceeds the MAX_MULTICAST_ADDR threshold and set promiscuous mode
+	 * here, as all callers already handle this case.
+	 */
 
 	cb_arg.vsi = vsi;
 	LIST_INIT(&cb_arg.to_add);
@@ -633,30 +638,103 @@ ixl_match_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
 	return (0);
 }
 
+/**
+ * ixl_dis_multi_promisc - Disable multicast promiscuous mode
+ * @vsi: The VSI structure
+ * @vsi_mcnt: Number of multicast filters in the VSI
+ *
+ * Disable multicast promiscuous mode based on the number of entries in the
+ * IFP and the VSI, then re-add multicast filters.
+ */
+static void
+ixl_dis_multi_promisc(struct ixl_vsi *vsi, int vsi_mcnt)
+{
+	struct ifnet *ifp = vsi->ifp;
+	struct i40e_hw *hw = vsi->hw;
+	int ifp_mcnt = 0;
+	enum i40e_status_code status;
+
+	/*
+	 * Check if multicast promiscuous mode was actually enabled.
+	 * If promiscuous mode was not enabled, don't attempt to disable it.
+	 * Also, don't disable if IFF_PROMISC or IFF_ALLMULTI is set.
+	 */
+	if (!(vsi->flags & IXL_FLAGS_MC_PROMISC) ||
+	    (if_getflags(ifp) & (IFF_PROMISC | IFF_ALLMULTI)))
+		return;
+
+	ifp_mcnt = if_llmaddr_count(ifp);
+	/*
+	 * An equal count or an empty ifp list means the list has not changed;
+	 * in that case avoid disabling multicast promiscuous mode, as it was
+	 * not previously enabled. The case where multicast promiscuous mode
+	 * has been enabled is vsi_mcnt == 0 && ifp_mcnt > 0.
+	 */
+	if (ifp_mcnt == vsi_mcnt || ifp_mcnt == 0 ||
+	    ifp_mcnt >= MAX_MULTICAST_ADDR)
+		return;
+
+	status = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid,
+	    FALSE, NULL);
+	if (status != I40E_SUCCESS) {
+		if_printf(ifp, "Failed to disable multicast promiscuous "
+		    "mode, status: %s\n", i40e_stat_str(hw, status));
+
+		return;
+	}
+
+	/* Clear the flag since promiscuous mode is now disabled */
+	vsi->flags &= ~IXL_FLAGS_MC_PROMISC;
+	if_printf(ifp, "Disabled multicast promiscuous mode\n");
+
+	ixl_add_multi(vsi);
+}
+
+/**
+ * ixl_del_multi - Delete multicast filters from the hardware
+ * @vsi: The VSI structure
+ * @all: Bool to determine if all the multicast filters should be removed
+ *
+ * If the number of multicast filters in the IFP drops to 127 entries,
+ * multicast promiscuous mode will be disabled and the filters will be
+ * reapplied to the hardware.
+ */ void ixl_del_multi(struct ixl_vsi *vsi, bool all) { - struct ixl_ftl_head to_del; + int to_del_cnt = 0, vsi_mcnt = 0; if_t ifp = vsi->ifp; struct ixl_mac_filter *f, *fn; - int mcnt = 0; + struct ixl_ftl_head to_del; IOCTL_DEBUGOUT("ixl_del_multi: begin"); LIST_INIT(&to_del); /* Search for removed multicast addresses */ LIST_FOREACH_SAFE(f, &vsi->ftl, ftle, fn) { - if ((f->flags & IXL_FILTER_MC) == 0 || - (!all && (if_foreach_llmaddr(ifp, ixl_match_maddr, f) == 0))) + if ((f->flags & IXL_FILTER_MC) == 0) + continue; + + /* Count all the multicast filters in the VSI for comparison */ + vsi_mcnt++; + + if (!all && if_foreach_llmaddr(ifp, ixl_match_maddr, f) != 0) continue; LIST_REMOVE(f, ftle); LIST_INSERT_HEAD(&to_del, f, ftle); - mcnt++; + to_del_cnt++; } - if (mcnt > 0) - ixl_del_hw_filters(vsi, &to_del, mcnt); + if (to_del_cnt > 0) { + ixl_del_hw_filters(vsi, &to_del, to_del_cnt); + return; + } + + ixl_dis_multi_promisc(vsi, vsi_mcnt); + + IOCTL_DEBUGOUT("ixl_del_multi: end"); } void diff --git a/sys/dev/mii/mv88e151x.c b/sys/dev/mii/mv88e151x.c index 618ad81471c9..fb03b2a7a917 100644 --- a/sys/dev/mii/mv88e151x.c +++ b/sys/dev/mii/mv88e151x.c @@ -97,7 +97,7 @@ mv88e151x_attach(device_t dev) { const struct mii_attach_args *ma; struct mii_softc *sc; - uint32_t cop_cap, cop_extcap; + uint32_t cop_cap = 0, cop_extcap = 0; sc = device_get_softc(dev); ma = device_get_ivars(dev); @@ -224,10 +224,12 @@ mv88e151x_fiber_status(struct mii_softc *phy) else if (reg & MV88E151X_STATUS_LINK && reg & MV88E151X_STATUS_SYNC && (reg & MV88E151X_STATUS_ENERGY) == 0) { - if ((reg & MV88E151X_STATUS_SPEED_MASK) == + if (((reg & MV88E151X_STATUS_SPEED_MASK) >> + MV88E151X_STATUS_SPEED_SHIFT) == MV88E151X_STATUS_SPEED_1000) mii->mii_media_active |= IFM_1000_SX; - else if ((reg & MV88E151X_STATUS_SPEED_MASK) == + else if (((reg & MV88E151X_STATUS_SPEED_MASK) >> + MV88E151X_STATUS_SPEED_SHIFT) == MV88E151X_STATUS_SPEED_100) mii->mii_media_active |= IFM_100_FX; else diff --git a/sys/dev/mps/mps_sas.c b/sys/dev/mps/mps_sas.c index d69c8ea5fded..fa0f817ed67b 100644 --- a/sys/dev/mps/mps_sas.c +++ b/sys/dev/mps/mps_sas.c @@ -858,7 +858,7 @@ mps_detach_sas(struct mps_softc *sc) if (sassc->devq != NULL) cam_simq_free(sassc->devq); - for(i=0; i< sassc->maxtargets ;i++) { + for (i = 0; i < sassc->maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); @@ -3396,7 +3396,7 @@ mpssas_realloc_targets(struct mps_softc *sc, int maxtargets) * the allocated LUNs for each target and then the target buffer * itself. 
*/ - for (i=0; i< maxtargets; i++) { + for (i = 0; i < maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); diff --git a/sys/dev/mpt/mpt_raid.c b/sys/dev/mpt/mpt_raid.c index 5ff08ffcf2b3..2b868f6ef070 100644 --- a/sys/dev/mpt/mpt_raid.c +++ b/sys/dev/mpt/mpt_raid.c @@ -830,7 +830,7 @@ mpt_is_raid_volume(struct mpt_softc *mpt, target_id_t tgt) } ioc_vol = mpt->ioc_page2->RaidVolume; ioc_last_vol = ioc_vol + mpt->ioc_page2->NumActiveVolumes; - for (;ioc_vol != ioc_last_vol; ioc_vol++) { + for (; ioc_vol != ioc_last_vol; ioc_vol++) { if (ioc_vol->VolumeID == tgt) { return (1); } @@ -1406,7 +1406,7 @@ mpt_refresh_raid_data(struct mpt_softc *mpt) ioc_vol = mpt->ioc_page2->RaidVolume; ioc_last_vol = ioc_vol + mpt->ioc_page2->NumActiveVolumes; - for (;ioc_vol != ioc_last_vol; ioc_vol++) { + for (; ioc_vol != ioc_last_vol; ioc_vol++) { struct mpt_raid_volume *mpt_vol; mpt_vol = mpt->raid_volumes + ioc_vol->VolumePageNumber; diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8cc543d54c2e..ac267a66d669 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -738,6 +738,7 @@ nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); + e->obj = NULL; /* reference consumed by vm_map_remove() */ out_rel: vm_object_deallocate(e->obj); e->obj = NULL; diff --git a/sys/dev/nfe/if_nfe.c b/sys/dev/nfe/if_nfe.c index 4625c2616562..265181ef7ad0 100644 --- a/sys/dev/nfe/if_nfe.c +++ b/sys/dev/nfe/if_nfe.c @@ -2078,7 +2078,7 @@ nfe_rxeof(struct nfe_softc *sc, int count, int *rx_npktsp) bus_dmamap_sync(sc->rxq.rx_desc_tag, sc->rxq.rx_desc_map, BUS_DMASYNC_POSTREAD); - for (prog = 0;;NFE_INC(sc->rxq.cur, NFE_RX_RING_COUNT), vtag = 0) { + for (prog = 0; ; NFE_INC(sc->rxq.cur, NFE_RX_RING_COUNT), vtag = 0) { if (count <= 0) break; count--; @@ -2192,7 +2192,7 @@ nfe_jrxeof(struct nfe_softc *sc, int count, int *rx_npktsp) bus_dmamap_sync(sc->jrxq.jrx_desc_tag, sc->jrxq.jrx_desc_map, BUS_DMASYNC_POSTREAD); - for (prog = 0;;NFE_INC(sc->jrxq.jcur, NFE_JUMBO_RX_RING_COUNT), + for (prog = 0; ; NFE_INC(sc->jrxq.jcur, NFE_JUMBO_RX_RING_COUNT), vtag = 0) { if (count <= 0) break; diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c index ead91f0d01fe..d119f9877aaa 100644 --- a/sys/dev/nvme/nvme.c +++ b/sys/dev/nvme/nvme.c @@ -51,7 +51,7 @@ int32_t nvme_retry_count; MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations"); static void -nvme_init(void) +nvme_init(void *dummy __unused) { uint32_t i; @@ -62,7 +62,7 @@ nvme_init(void) SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL); static void -nvme_uninit(void) +nvme_uninit(void *dummy __unused) { } diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index 17c5cdb4db87..f4ea08f129c0 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -1507,9 +1507,7 @@ struct nvme_namespace_data { uint8_t eui64[8]; /** lba format support */ - uint32_t lbaf[16]; - - uint8_t reserved7[192]; + uint32_t lbaf[64]; uint8_t vendor_specific[3712]; } __packed __aligned(4); @@ -2155,8 +2153,6 @@ static inline void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) { #if _BYTE_ORDER != _LITTLE_ENDIAN - int i; - s->nsze = le64toh(s->nsze); s->ncap = le64toh(s->ncap); s->nuse = le64toh(s->nuse); @@ -2175,7 +2171,7 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) s->anagrpid = le32toh(s->anagrpid); 
s->nvmsetid = le16toh(s->nvmsetid); s->endgid = le16toh(s->endgid); - for (i = 0; i < 16; i++) + for (unsigned i = 0; i < nitems(s->lbaf); i++) s->lbaf[i] = le32toh(s->lbaf[i]); #endif } diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 52f9e12f8f9a..52e9fcbbebcd 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -463,13 +463,13 @@ static __inline void nvme_completion_poll(struct nvme_completion_poll_status *status) { int timeout = ticks + 10 * hz; - sbintime_t delta_t = SBT_1US; + sbintime_t delta = SBT_1US; while (!atomic_load_acq_int(&status->done)) { if (timeout - ticks < 0) panic("NVME polled command failed to complete within 10s."); - pause_sbt("nvme", delta_t, 0, C_PREL(1)); - delta_t = min(SBT_1MS, delta_t * 3 / 2); + pause_sbt("nvme", delta, 0, C_PREL(1)); + delta = min(SBT_1MS, delta + delta / 2); } } diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c index a06774a64761..7693aa6d54d3 100644 --- a/sys/dev/nvme/nvme_sim.c +++ b/sys/dev/nvme/nvme_sim.c @@ -391,7 +391,7 @@ nvme_sim_controller_fail(void *ctrlr_arg) struct nvme_consumer *consumer_cookie; static void -nvme_sim_init(void) +nvme_sim_init(void *dummy __unused) { if (nvme_use_nvd) return; @@ -404,7 +404,7 @@ SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY, nvme_sim_init, NULL); static void -nvme_sim_uninit(void) +nvme_sim_uninit(void *dummy __unused) { if (nvme_use_nvd) return; diff --git a/sys/dev/ocs_fc/ocs_mgmt.c b/sys/dev/ocs_fc/ocs_mgmt.c index 726b499f28ba..5b7f6557c017 100644 --- a/sys/dev/ocs_fc/ocs_mgmt.c +++ b/sys/dev/ocs_fc/ocs_mgmt.c @@ -226,7 +226,7 @@ ocs_mgmt_get_list(ocs_t *ocs, ocs_textbuf_t *textbuf) ocs_mgmt_start_unnumbered_section(textbuf, "ocs"); - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { access = 0; if (mgmt_table[i].get_handler) { access |= MGMT_MODE_RD; @@ -305,7 +305,7 @@ ocs_mgmt_get(ocs_t *ocs, char *name, ocs_textbuf_t *textbuf) if (ocs_strncmp(name, qualifier, strlen(qualifier)) == 0) { char *unqualified_name = name + strlen(qualifier) + 1; - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].get_handler) { mgmt_table[i].get_handler(ocs, name, textbuf); @@ -387,7 +387,7 @@ ocs_mgmt_set(ocs_t *ocs, char *name, char *value) char *unqualified_name = name + strlen(qualifier) +1; /* See if it's a value I can set */ - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].set_handler) { return mgmt_table[i].set_handler(ocs, name, value); @@ -469,7 +469,7 @@ ocs_mgmt_exec(ocs_t *ocs, char *action, void *arg_in, char *unqualified_name = action + strlen(qualifier) +1; /* See if it's an action I can perform */ - for (i=0;i<ARRAY_SIZE(mgmt_table); i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (ocs_strcmp(unqualified_name, mgmt_table[i].name) == 0) { if (mgmt_table[i].action_handler) { return mgmt_table[i].action_handler(ocs, action, arg_in, arg_in_length, @@ -527,7 +527,7 @@ ocs_mgmt_get_all(ocs_t *ocs, ocs_textbuf_t *textbuf) ocs_mgmt_start_unnumbered_section(textbuf, "ocs"); - for (i=0;i<ARRAY_SIZE(mgmt_table);i++) { + for (i = 0; i < ARRAY_SIZE(mgmt_table); i++) { if (mgmt_table[i].get_handler) { mgmt_table[i].get_handler(ocs, mgmt_table[i].name, textbuf); } else if (mgmt_table[i].action_handler) { @@ -1212,7 +1212,7 @@ get_sfp_a2(ocs_t 
*ocs, char *name, ocs_textbuf_t *textbuf) int buffer_remaining = (SFP_PAGE_SIZE * 3) + 1; int bytes_added; - for (i=0; i < bytes_read; i++) { + for (i = 0; i < bytes_read; i++) { bytes_added = ocs_snprintf(d, buffer_remaining, "%02x ", *s); ++s; d += bytes_added; @@ -2040,7 +2040,7 @@ get_profile_list(ocs_t *ocs, char *name, ocs_textbuf_t *textbuf) result_buf = ocs_malloc(ocs, BUFFER_SIZE, OCS_M_ZERO); bytes_left = BUFFER_SIZE; - for (i=0; i<result.list->num_descriptors; i++) { + for (i = 0; i < result.list->num_descriptors; i++) { sprintf(result_line, "0x%02x:%s\n", result.list->descriptors[i].profile_id, result.list->descriptors[i].profile_description); if (strlen(result_line) < bytes_left) { diff --git a/sys/dev/pci/controller/pci_n1sdp.c b/sys/dev/pci/controller/pci_n1sdp.c index 487041bc78e4..22f0ea27d45b 100644 --- a/sys/dev/pci/controller/pci_n1sdp.c +++ b/sys/dev/pci/controller/pci_n1sdp.c @@ -345,6 +345,17 @@ n1sdp_pcie_write_config(device_t dev, u_int bus, u_int slot, bus_space_write_4(t, h, offset & ~3, data); } +static int +n1sdp_pcie_acpi_request_feature(device_t pcib __unused, device_t dev __unused, + enum pci_feature feature __unused) +{ + /* + * HotPlug isn't supported on the N1SDP as it causes an interrupt storm + */ + return (EINVAL); +} + + static device_method_t n1sdp_pcie_acpi_methods[] = { DEVMETHOD(device_probe, n1sdp_pcie_acpi_probe), DEVMETHOD(device_attach, n1sdp_pcie_acpi_attach), @@ -352,6 +363,7 @@ static device_method_t n1sdp_pcie_acpi_methods[] = { /* pcib interface */ DEVMETHOD(pcib_read_config, n1sdp_pcie_read_config), DEVMETHOD(pcib_write_config, n1sdp_pcie_write_config), + DEVMETHOD(pcib_request_feature, n1sdp_pcie_acpi_request_feature), DEVMETHOD_END }; diff --git a/sys/dev/ppc/ppc.c b/sys/dev/ppc/ppc.c index 9870379e2eba..de75f4747709 100644 --- a/sys/dev/ppc/ppc.c +++ b/sys/dev/ppc/ppc.c @@ -1389,7 +1389,7 @@ ppc_exec_microseq(device_t dev, struct ppb_microseq **p_msq) /* let's suppose the next instr. 
is the same */ prefetch: - for (;mi->opcode == MS_OP_RASSERT; INCR_PC) + for (; mi->opcode == MS_OP_RASSERT; INCR_PC) w_reg(mi->arg[0].i, ppc, (char)mi->arg[1].i); if (mi->opcode == MS_OP_DELAY) { diff --git a/sys/dev/smartpqi/smartpqi_event.c b/sys/dev/smartpqi/smartpqi_event.c index f000d9ce9db3..88dcf45dd08a 100644 --- a/sys/dev/smartpqi/smartpqi_event.c +++ b/sys/dev/smartpqi/smartpqi_event.c @@ -115,7 +115,7 @@ pqisrc_ack_all_events(void *arg1) pending_event = &softs->pending_events[0]; - for (i=0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { + for (i = 0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { if (pending_event->pending == true) { pending_event->pending = false; pqisrc_acknowledge_event(softs, pending_event); @@ -417,7 +417,7 @@ pqisrc_report_event_config(pqisrc_softstate_t *softs) softs->event_config.num_event_descriptors = MIN(event_config_p->num_event_descriptors, PQI_MAX_EVENT_DESCRIPTORS) ; - for (i=0; i < softs->event_config.num_event_descriptors ;i++){ + for (i = 0; i < softs->event_config.num_event_descriptors; i++) { softs->event_config.descriptors[i].event_type = event_config_p->descriptors[i].event_type; } @@ -477,7 +477,7 @@ pqisrc_set_event_config(pqisrc_softstate_t *softs) event_config_p->num_event_descriptors = softs->event_config.num_event_descriptors; - for (i=0; i < softs->event_config.num_event_descriptors ; i++){ + for (i = 0; i < softs->event_config.num_event_descriptors; i++) { event_config_p->descriptors[i].event_type = softs->event_config.descriptors[i].event_type; if( pqisrc_event_type_to_event_index(event_config_p->descriptors[i].event_type) != -1) diff --git a/sys/dev/smartpqi/smartpqi_queue.c b/sys/dev/smartpqi/smartpqi_queue.c index 2e80b01b5436..f05c951cd4f9 100644 --- a/sys/dev/smartpqi/smartpqi_queue.c +++ b/sys/dev/smartpqi/smartpqi_queue.c @@ -700,7 +700,7 @@ pqisrc_create_op_obq(pqisrc_softstate_t *softs, } else { int i = 0; DBG_WARN("Error Status Descriptors\n"); - for(i = 0; i < 4;i++) + for (i = 0; i < 4; i++) DBG_WARN(" %x ",admin_resp.resp_type.create_op_oq.status_desc[i]); } @@ -743,7 +743,7 @@ pqisrc_create_op_ibq(pqisrc_softstate_t *softs, } else { int i = 0; DBG_WARN("Error Status Decsriptors\n"); - for(i = 0; i < 4;i++) + for (i = 0; i < 4; i++) DBG_WARN(" %x ",admin_resp.resp_type.create_op_iq.status_desc[i]); } diff --git a/sys/dev/sym/sym_hipd.c b/sys/dev/sym/sym_hipd.c index fa65d544e17d..b4e5c1075fb4 100644 --- a/sys/dev/sym/sym_hipd.c +++ b/sys/dev/sym/sym_hipd.c @@ -3266,7 +3266,7 @@ static void sym_init (hcb_p np, int reason) * Reinitialize usrwide. * Prepare sync negotiation according to actual SCSI bus mode. 
*/ - for (i=0;i<SYM_CONF_MAX_TARGET;i++) { + for (i = 0; i < SYM_CONF_MAX_TARGET; i++) { tcb_p tp = &np->target[i]; tp->to_reset = 0; @@ -3715,7 +3715,7 @@ static void sym_log_hard_error(hcb_p np, u_short sist, u_char dstat) } printf ("%s: regdump:", sym_name(np)); - for (i=0; i<24;i++) + for (i = 0; i < 24; i++) printf (" %02x", (unsigned)INB_OFF(i)); printf (".\n"); @@ -5527,8 +5527,8 @@ static int sym_show_msg (u_char * msg) u_char i; printf ("%x",*msg); if (*msg==M_EXTENDED) { - for (i=1;i<8;i++) { - if (i-1>msg[1]) break; + for (i = 1; i < 8; i++) { + if (i - 1 > msg[1]) break; printf ("-%x",msg[i]); } return (i+1); @@ -6744,10 +6744,10 @@ restart_test: /* * Wait 'til done (with timeout) */ - for (i=0; i<SYM_SNOOP_TIMEOUT; i++) + for (i = 0; i < SYM_SNOOP_TIMEOUT; i++) if (INB(nc_istat) & (INTF|SIP|DIP)) break; - if (i>=SYM_SNOOP_TIMEOUT) { + if (i >= SYM_SNOOP_TIMEOUT) { printf ("CACHE TEST FAILED: timeout.\n"); return (0x20); } diff --git a/sys/dev/tws/tws.c b/sys/dev/tws/tws.c index af151c8c4f06..fccd6689a6aa 100644 --- a/sys/dev/tws/tws.c +++ b/sys/dev/tws/tws.c @@ -311,7 +311,7 @@ attach_fail_4: if (sc->cmd_tag) bus_dma_tag_destroy(sc->cmd_tag); attach_fail_3: - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if ( sc->irq_res[i] ){ if (bus_release_resource(sc->tws_dev, SYS_RES_IRQ, sc->irq_res_id[i], sc->irq_res[i])) @@ -369,7 +369,7 @@ tws_detach(device_t dev) tws_teardown_intr(sc); /* Release irq resource */ - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if ( sc->irq_res[i] ){ if (bus_release_resource(sc->tws_dev, SYS_RES_IRQ, sc->irq_res_id[i], sc->irq_res[i])) @@ -402,7 +402,7 @@ tws_detach(device_t dev) TWS_TRACE(sc, "bus release mem resource", 0, sc->reg_res_id); } - for ( i=0; i< tws_queue_depth; i++) { + for (i = 0; i < tws_queue_depth; i++) { if (sc->reqs[i].dma_map) bus_dmamap_destroy(sc->data_tag, sc->reqs[i].dma_map); callout_drain(&sc->reqs[i].timeout); @@ -432,7 +432,7 @@ tws_setup_intr(struct tws_softc *sc, int irqs) { int i, error; - for(i=0;i<irqs;i++) { + for (i = 0; i < irqs; i++) { if (!(sc->intr_handle[i])) { if ((error = bus_setup_intr(sc->tws_dev, sc->irq_res[i], INTR_TYPE_CAM | INTR_MPSAFE, @@ -452,7 +452,7 @@ tws_teardown_intr(struct tws_softc *sc) { int i; - for(i=0;i<sc->irqs;i++) { + for (i = 0; i < sc->irqs; i++) { if (sc->intr_handle[i]) { bus_teardown_intr(sc->tws_dev, sc->irq_res[i], sc->intr_handle[i]); @@ -669,8 +669,7 @@ tws_init_reqs(struct tws_softc *sc, u_int32_t dma_mem_size) bzero(cmd_buf, dma_mem_size); TWS_TRACE_DEBUG(sc, "phy cmd", sc->dma_mem_phys, 0); mtx_lock(&sc->q_lock); - for ( i=0; i< tws_queue_depth; i++) - { + for (i = 0; i < tws_queue_depth; i++) { if (bus_dmamap_create(sc->data_tag, 0, &sc->reqs[i].dma_map)) { /* log a ENOMEM failure msg here */ mtx_unlock(&sc->q_lock); diff --git a/sys/dev/tws/tws_services.c b/sys/dev/tws/tws_services.c index da8bbacc39f7..e5c3d45c533f 100644 --- a/sys/dev/tws/tws_services.c +++ b/sys/dev/tws/tws_services.c @@ -200,7 +200,7 @@ tws_init_qs(struct tws_softc *sc) { mtx_lock(&sc->q_lock); - for(int i=0;i<TWS_MAX_QS;i++) { + for (int i = 0; i < TWS_MAX_QS; i++) { sc->q_head[i] = NULL; sc->q_tail[i] = NULL; } diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 460a508a60dc..4961b21180e1 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -120,18 +120,18 @@ vcpu_unlock_one(struct vcpu *vcpu) vcpu_set_state(vcpu, VCPU_IDLE, false); } +#ifndef __amd64__ static int -vcpu_lock_all(struct vmmdev_softc *sc) +vcpu_set_state_all(struct vm *vm, enum 
vcpu_state newstate) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; error = 0; - vm_slock_vcpus(sc->vm); - maxcpus = vm_get_maxcpus(sc->vm); + maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { - vcpu = vm_vcpu(sc->vm, i); + vcpu = vm_vcpu(vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); @@ -141,16 +141,32 @@ vcpu_lock_all(struct vmmdev_softc *sc) if (error) { for (j = 0; j < i; j++) { - vcpu = vm_vcpu(sc->vm, j); + vcpu = vm_vcpu(vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } - vm_unlock_vcpus(sc->vm); } return (error); } +#endif + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + int error; + + /* + * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked + * in a consistent order so we need to serialize to avoid deadlocks. + */ + vm_lock_vcpus(sc->vm); + error = vcpu_set_state_all(sc->vm, VCPU_FROZEN); + if (error != 0) + vm_unlock_vcpus(sc->vm); + return (error); +} static void vcpu_unlock_all(struct vmmdev_softc *sc) diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c index be59e37de33d..9df31c9ba133 100644 --- a/sys/dev/vmm/vmm_mem.c +++ b/sys/dev/vmm/vmm_mem.c @@ -26,10 +26,14 @@ static void vm_free_memmap(struct vm *vm, int ident); -void -vm_mem_init(struct vm_mem *mem) +int +vm_mem_init(struct vm_mem *mem, vm_offset_t lo, vm_offset_t hi) { + mem->mem_vmspace = vmmops_vmspace_alloc(lo, hi); + if (mem->mem_vmspace == NULL) + return (ENOMEM); sx_init(&mem->mem_segs_lock, "vm_mem_segs"); + return (0); } static bool @@ -93,10 +97,21 @@ vm_mem_destroy(struct vm *vm) for (int i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); + vmmops_vmspace_free(mem->mem_vmspace); + sx_xunlock(&mem->mem_segs_lock); sx_destroy(&mem->mem_segs_lock); } +struct vmspace * +vm_vmspace(struct vm *vm) +{ + struct vm_mem *mem; + + mem = vm_mem(vm); + return (mem->mem_vmspace); +} + void vm_slock_memsegs(struct vm *vm) { @@ -246,7 +261,7 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, struct vm_mem *mem; struct vm_mem_seg *seg; struct vm_mem_map *m, *map; - struct vmspace *vmspace; + struct vm_map *vmmap; vm_ooffset_t last; int i, error; @@ -282,19 +297,19 @@ vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, if (map == NULL) return (ENOSPC); - vmspace = vm_vmspace(vm); - error = vm_map_find(&vmspace->vm_map, seg->object, first, &gpa, - len, 0, VMFS_NO_SPACE, prot, prot, 0); + vmmap = &mem->mem_vmspace->vm_map; + error = vm_map_find(vmmap, seg->object, first, &gpa, len, 0, + VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { - error = vm_map_wire(&vmspace->vm_map, gpa, gpa + len, + error = vm_map_wire(vmmap, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { - vm_map_remove(&vmspace->vm_map, gpa, gpa + len); + vm_map_remove(vmmap, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : EFAULT); } diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h index 856470cf2590..f3d22058c7b8 100644 --- a/sys/dev/vmm/vmm_mem.h +++ b/sys/dev/vmm/vmm_mem.h @@ -36,6 +36,7 @@ enum { struct vm; struct vm_object; +struct vmspace; struct vm_mem_seg { size_t len; @@ -56,12 +57,15 @@ struct vm_mem { struct vm_mem_map mem_maps[VM_MAX_MEMMAPS]; struct vm_mem_seg mem_segs[VM_MAX_MEMSEGS]; struct sx mem_segs_lock; + struct vmspace *mem_vmspace; }; -void vm_mem_init(struct vm_mem *mem); +int vm_mem_init(struct vm_mem *mem, vm_offset_t lo, vm_offset_t hi); void vm_mem_cleanup(struct vm *vm); void vm_mem_destroy(struct vm *vm); +struct vmspace *vm_vmspace(struct vm *vm); + /* * APIs that modify the guest memory map require all vcpus to be frozen. */ diff --git a/sys/dev/xdma/xdma.c b/sys/dev/xdma/xdma.c index 62b781159d03..cdd9ad0b8f39 100644 --- a/sys/dev/xdma/xdma.c +++ b/sys/dev/xdma/xdma.c @@ -555,7 +555,7 @@ xdma_put(xdma_controller_t *xdma) } static void -xdma_init(void) +xdma_init(void *dummy __unused) { mtx_init(&xdma_mtx, "xDMA", NULL, MTX_DEF); diff --git a/sys/dev/xen/bus/xen_intr.c b/sys/dev/xen/bus/xen_intr.c index cb30b6efa484..2b5fa8fb7cd1 100644 --- a/sys/dev/xen/bus/xen_intr.c +++ b/sys/dev/xen/bus/xen_intr.c @@ -460,7 +460,7 @@ xen_intr_handle_upcall(void *unused __unused) return (FILTER_HANDLED); } -static int +static void xen_intr_init(void *dummy __unused) { shared_info_t *s = HYPERVISOR_shared_info; @@ -468,7 +468,7 @@ xen_intr_init(void *dummy __unused) int i; if (!xen_domain()) - return (0); + return; _Static_assert(is_valid_evtchn(0), "is_valid_evtchn(0) fails (unused by Xen, but valid by interface"); @@ -502,8 +502,6 @@ xen_intr_init(void *dummy __unused) if (bootverbose) printf("Xen interrupt system initialized\n"); - - return (0); } SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); diff --git a/sys/fs/devfs/devfs_dir.c b/sys/fs/devfs/devfs_dir.c index 3dc87538017d..aad87606e738 100644 --- a/sys/fs/devfs/devfs_dir.c +++ b/sys/fs/devfs/devfs_dir.c @@ -162,7 +162,7 @@ int devfs_pathpath(const char *p1, const char *p2) { - for (;;p1++, p2++) { + for (;; p1++, p2++) { if (*p1 != *p2) { if (*p1 == '/' && *p2 == '\0') return (1); diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 5c28db29fc63..683ee2f7ad56 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -284,7 +284,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) struct mount *mp = vnode_mount(vp); int err; - if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH)) + if (fsess_not_impl(mp, FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); @@ -292,7 +292,7 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) return err; if (fufh->fuse_open_flags & FOPEN_NOFLUSH && - (!fsess_opt_writeback(vnode_mount(vp)))) + (!fsess_opt_writeback(mp))) return (0); fdisp_init(&fdi, sizeof(*ffi)); diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index ad3f7779e108..7bfdc20a3f67 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -35,11 +35,12 @@ #ifndef FS_NULL_H #define FS_NULL_H -#define NULLM_CACHE 0x0001 - #include <sys/ck.h> #include <vm/uma.h> +#define NULLM_CACHE 0x0001 +#define NULLM_NOUNPBYPASS 0x0002 + struct null_mount { struct mount *nullm_vfs; struct vnode *nullm_lowerrootvp; /* Ref to lower root vnode */ @@ -82,6 +83,16 @@ struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno); #endif extern struct vop_vector null_vnodeops; +extern struct vop_vector 
null_vnodeops_no_unp_bypass; + +static inline bool +null_is_nullfs_vnode(struct vnode *vp) +{ + const struct vop_vector *op; + + op = vp->v_op; + return (op == &null_vnodeops || op == &null_vnodeops_no_unp_bypass); +} extern uma_zone_t null_node_zone; diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c index d7f847d449d0..a843ae44f121 100644 --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -240,7 +240,9 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) */ xp = uma_zalloc_smr(null_node_zone, M_WAITOK); - error = getnewvnode("nullfs", mp, &null_vnodeops, &vp); + error = getnewvnode("nullfs", mp, (MOUNTTONULLMOUNT(mp)->nullm_flags & + NULLM_NOUNPBYPASS) != 0 ? &null_vnodeops_no_unp_bypass : + &null_vnodeops, &vp); if (error) { vput(lowervp); uma_zfree_smr(null_node_zone, xp); diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 4cddf24a5745..170a3dd51cd8 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -85,6 +85,10 @@ nullfs_mount(struct mount *mp) char *target; int error, len; bool isvnunlocked; + static const char cache_opt_name[] = "cache"; + static const char nocache_opt_name[] = "nocache"; + static const char unixbypass_opt_name[] = "unixbypass"; + static const char nounixbypass_opt_name[] = "nounixbypass"; NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp); @@ -116,7 +120,7 @@ nullfs_mount(struct mount *mp) /* * Unlock lower node to avoid possible deadlock. */ - if (mp->mnt_vnodecovered->v_op == &null_vnodeops && + if (null_is_nullfs_vnode(mp->mnt_vnodecovered) && VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) { VOP_UNLOCK(mp->mnt_vnodecovered); isvnunlocked = true; @@ -150,7 +154,7 @@ nullfs_mount(struct mount *mp) /* * Check multi null mount to avoid `lock against myself' panic. */ - if (mp->mnt_vnodecovered->v_op == &null_vnodeops) { + if (null_is_nullfs_vnode(mp->mnt_vnodecovered)) { nn = VTONULL(mp->mnt_vnodecovered); if (nn == NULL || lowerrootvp == nn->null_lowervp) { NULLFSDEBUG("nullfs_mount: multi null mount?\n"); @@ -205,9 +209,10 @@ nullfs_mount(struct mount *mp) MNT_IUNLOCK(mp); } - if (vfs_getopt(mp->mnt_optnew, "cache", NULL, NULL) == 0) { + if (vfs_getopt(mp->mnt_optnew, cache_opt_name, NULL, NULL) == 0) { xmp->nullm_flags |= NULLM_CACHE; - } else if (vfs_getopt(mp->mnt_optnew, "nocache", NULL, NULL) == 0) { + } else if (vfs_getopt(mp->mnt_optnew, nocache_opt_name, NULL, + NULL) == 0) { ; } else if (null_cache_vnodes && (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) == 0) { @@ -219,6 +224,13 @@ nullfs_mount(struct mount *mp) &xmp->notify_node); } + if (vfs_getopt(mp->mnt_optnew, unixbypass_opt_name, NULL, NULL) == 0) { + ; + } else if (vfs_getopt(mp->mnt_optnew, nounixbypass_opt_name, NULL, + NULL) == 0) { + xmp->nullm_flags |= NULLM_NOUNPBYPASS; + } + if (lowerrootvp == mp->mnt_vnodecovered) { vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE); lowerrootvp->v_vflag |= VV_CROSSLOCK; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index ec8a6b10b13f..d4baabeb40ab 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -278,7 +278,7 @@ null_bypass(struct vop_generic_args *ap) * that aren't. (We must always map first vp or vclean fails.) 
*/ if (i != 0 && (*this_vp_p == NULL || - (*this_vp_p)->v_op != &null_vnodeops)) { + !null_is_nullfs_vnode(*this_vp_p))) { old_vps[i] = NULL; } else { old_vps[i] = *this_vp_p; @@ -1256,3 +1256,11 @@ struct vop_vector null_vnodeops = { .vop_copy_file_range = VOP_PANIC, }; VFS_VOP_VECTOR_REGISTER(null_vnodeops); + +struct vop_vector null_vnodeops_no_unp_bypass = { + .vop_default = &null_vnodeops, + .vop_unp_bind = vop_stdunp_bind, + .vop_unp_connect = vop_stdunp_connect, + .vop_unp_detach = vop_stdunp_detach, +}; +VFS_VOP_VECTOR_REGISTER(null_vnodeops_no_unp_bypass); diff --git a/sys/fs/p9fs/p9_transport.c b/sys/fs/p9fs/p9_transport.c index c82d81fedcd7..25eee984265c 100644 --- a/sys/fs/p9fs/p9_transport.c +++ b/sys/fs/p9fs/p9_transport.c @@ -34,9 +34,8 @@ TAILQ_HEAD(, p9_trans_module) transports; static void -p9_transport_init(void) +p9_transport_init(void *dummy __unused) { - TAILQ_INIT(&transports); } diff --git a/sys/fs/udf/osta.c b/sys/fs/udf/osta.c index f79b86993367..1a083d8c26b1 100644 --- a/sys/fs/udf/osta.c +++ b/sys/fs/udf/osta.c @@ -383,7 +383,7 @@ int UDFTransName( int maxFilenameLen; /* Translate extension, and store it in ext. */ for(index = 0; index<EXT_SIZE && - extIndex + index +1 < udfLen; index++ ) { + extIndex + index +1 < udfLen; index++) { current = udfName[extIndex + index + 1]; if (IsIllegal(current) || !UnicodeIsPrint(current)) { @@ -432,7 +432,7 @@ int UDFTransName( /* Place a translated extension at end, if found. */ if (hasExt) { newName[newIndex++] = PERIOD; - for (index = 0;index < localExtIndex ;index++ ) { + for (index = 0; index < localExtIndex; index++) { newName[newIndex++] = ext[index]; } } diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index a14f9ca74305..b6d6db60ca3d 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -587,6 +587,7 @@ unionfs_find_node_status(struct unionfs_node *unp, struct thread *td) struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); @@ -612,6 +613,7 @@ unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status *unsp; pid_t pid; + MPASS(td != NULL); pid = td->td_proc->p_pid; KASSERT(NULL != unspp, ("%s: NULL status", __func__)); diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index 627b2f6e9a1d..66fee97a07d5 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -814,7 +814,7 @@ unionfs_close(struct vop_close_args *ap) unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; - unsp = unionfs_find_node_status(unp, td); + unsp = (td != NULL) ? 
unionfs_find_node_status(unp, td) : NULL; if (unsp == NULL || (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) { @@ -2208,7 +2208,6 @@ unionfs_lock_restart: vholdnz(tvp); VI_UNLOCK(vp); error = VOP_LOCK(tvp, flags); - vdrop(tvp); if (error == 0 && (lvp_locked || VTOUNIONFS(vp) == NULL)) { /* * After dropping the interlock above, there exists a window @@ -2234,6 +2233,7 @@ unionfs_lock_restart: unp = VTOUNIONFS(vp); if (unp == NULL || unp->un_uppervp != NULL) { VOP_UNLOCK(tvp); + vdrop(tvp); /* * If we previously held the lock, the upgrade may * have temporarily dropped the lock, in which case @@ -2249,6 +2249,7 @@ unionfs_lock_restart: goto unionfs_lock_restart; } } + vdrop(tvp); return (error); } @@ -2259,7 +2260,6 @@ unionfs_unlock(struct vop_unlock_args *ap) struct vnode *vp; struct vnode *tvp; struct unionfs_node *unp; - int error; KASSERT_UNIONFS_VNODE(ap->a_vp); @@ -2271,11 +2271,7 @@ unionfs_unlock(struct vop_unlock_args *ap) tvp = (unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp); - vholdnz(tvp); - error = VOP_UNLOCK(tvp); - vdrop(tvp); - - return (error); + return (VOP_UNLOCK(tvp)); } static int diff --git a/sys/geom/part/g_part.c b/sys/geom/part/g_part.c index 4c0d0c3aa902..1e4236507fa4 100644 --- a/sys/geom/part/g_part.c +++ b/sys/geom/part/g_part.c @@ -122,13 +122,13 @@ struct g_part_alias_list { { "ntfs", G_PART_ALIAS_MS_NTFS }, { "openbsd-data", G_PART_ALIAS_OPENBSD_DATA }, { "prep-boot", G_PART_ALIAS_PREP_BOOT }, - { "solaris-boot", G_PART_ALIAS_SOLARIS_BOOT }, - { "solaris-root", G_PART_ALIAS_SOLARIS_ROOT }, - { "solaris-swap", G_PART_ALIAS_SOLARIS_SWAP }, - { "solaris-backup", G_PART_ALIAS_SOLARIS_BACKUP }, - { "solaris-var", G_PART_ALIAS_SOLARIS_VAR }, - { "solaris-home", G_PART_ALIAS_SOLARIS_HOME }, - { "solaris-altsec", G_PART_ALIAS_SOLARIS_ALTSEC }, + { "solaris-boot", G_PART_ALIAS_SOLARIS_BOOT }, + { "solaris-root", G_PART_ALIAS_SOLARIS_ROOT }, + { "solaris-swap", G_PART_ALIAS_SOLARIS_SWAP }, + { "solaris-backup", G_PART_ALIAS_SOLARIS_BACKUP }, + { "solaris-var", G_PART_ALIAS_SOLARIS_VAR }, + { "solaris-home", G_PART_ALIAS_SOLARIS_HOME }, + { "solaris-altsec", G_PART_ALIAS_SOLARIS_ALTSEC }, { "solaris-reserved", G_PART_ALIAS_SOLARIS_RESERVED }, { "u-boot-env", G_PART_ALIAS_U_BOOT_ENV }, { "vmware-reserved", G_PART_ALIAS_VMRESERVED }, diff --git a/sys/i386/i386/in_cksum_machdep.c b/sys/i386/i386/in_cksum_machdep.c index 27ab09d82da0..b658d85bc892 100644 --- a/sys/i386/i386/in_cksum_machdep.c +++ b/sys/i386/i386/in_cksum_machdep.c @@ -84,7 +84,7 @@ in_cksum_skip(struct mbuf *m, int len, int skip) } } - for (;m && len; m = m->m_next) { + for (; m && len; m = m->m_next) { if (m->m_len == 0) continue; w = mtod(m, u_short *); diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 6aac0e968362..3f659432552c 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1605,7 +1605,7 @@ init386(int first) } static void -machdep_init_trampoline(void) +machdep_init_trampoline(void *dummy __unused) { struct region_descriptor r_gdt, r_idt; struct i386tss *tss; diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index b44f5e08bbcf..1cf0867d57c3 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -720,7 +720,7 @@ __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) } static void -pmap_init_reserved_pages(void) +pmap_init_reserved_pages(void *dummy __unused) { struct pcpu *pc; vm_offset_t pages; diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 1bc2491a1a12..a1fabbc86f27 100644 --- a/sys/kern/imgact_elf.c 
+++ b/sys/kern/imgact_elf.c @@ -92,7 +92,7 @@ #define ELF_ABI_ID __CONCAT(elf, __ELF_WORD_SIZE) static int __elfN(check_header)(const Elf_Ehdr *hdr); -static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, +static const Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry); @@ -104,7 +104,7 @@ static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static bool __elfN(check_note)(struct image_params *imgp, - Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, + const Elf_Brandnote *checknote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); @@ -227,7 +227,7 @@ SYSCTL_BOOL(ELF_NODE_OID, OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, "Allow pages to be mapped simultaneously writable and executable"); -static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; +static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) @@ -286,7 +286,7 @@ kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) } int -__elfN(insert_brand_entry)(Elf_Brandinfo *entry) +__elfN(insert_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -305,7 +305,7 @@ __elfN(insert_brand_entry)(Elf_Brandinfo *entry) } int -__elfN(remove_brand_entry)(Elf_Brandinfo *entry) +__elfN(remove_brand_entry)(const Elf_Brandinfo *entry) { int i; @@ -321,7 +321,7 @@ __elfN(remove_brand_entry)(Elf_Brandinfo *entry) } bool -__elfN(brand_inuse)(Elf_Brandinfo *entry) +__elfN(brand_inuse)(const Elf_Brandinfo *entry) { struct proc *p; bool rval = false; @@ -338,12 +338,12 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry) return (rval); } -static Elf_Brandinfo * +static const Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; - Elf_Brandinfo *bi, *bi_m; + const Elf_Brandinfo *bi, *bi_m; bool ret, has_fctl0; int i, interp_name_len; @@ -492,7 +492,7 @@ __elfN(phdr_in_zero_page)(const Elf_Ehdr *hdr) static int __elfN(check_header)(const Elf_Ehdr *hdr) { - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || @@ -1109,7 +1109,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) struct vmspace *vmspace; vm_map_t map; char *interp; - Elf_Brandinfo *brand_info; + const Elf_Brandinfo *brand_info; struct sysentvec *sv; u_long addr, baddr, entry, proghdr; u_long maxalign, maxsalign, mapsz, maxv, maxv1, anon_loc; @@ -1925,7 +1925,7 @@ __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, Elf_Phdr *phdr; Elf_Shdr *shdr; struct phdr_closure phc; - Elf_Brandinfo *bi; + const Elf_Brandinfo *bi; ehdr = (Elf_Ehdr *)hdr; bi = td->td_proc->p_elf_brandinfo; @@ -2831,7 +2831,7 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote, } if ((const char *)note_end - (const char *)note < sizeof(Elf_Note)) { - uprintf("ELF note to short\n"); + uprintf("ELF note too short\n"); goto retf; } if (note->n_namesz != checknote->n_namesz || @@ -2839,9 +2839,9 @@ __elfN(parse_notes)(const struct image_params *imgp, const Elf_Note *checknote, note->n_type != checknote->n_type) goto nextnote; note_name = (const char *)(note + 1); - if (note_name + 
checknote->n_namesz >= - (const char *)note_end || strncmp(note_vendor, - note_name, checknote->n_namesz) != 0) + if (note_name + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + + note->n_descsz >= (const char *)note_end || + strncmp(note_vendor, note_name, checknote->n_namesz) != 0) goto nextnote; if (cb(note, cb_arg, &res)) @@ -2861,7 +2861,7 @@ ret: } struct brandnote_cb_arg { - Elf_Brandnote *brandnote; + const Elf_Brandnote *brandnote; int32_t *osrel; }; @@ -2883,7 +2883,7 @@ brandnote_cb(const Elf_Note *note, void *arg0, bool *res) return (true); } -static Elf_Note fctl_note = { +static const Elf_Note fctl_note = { .n_namesz = sizeof(FREEBSD_ABI_VENDOR), .n_descsz = sizeof(uint32_t), .n_type = NT_FREEBSD_FEATURE_CTL, @@ -2918,7 +2918,7 @@ note_fctl_cb(const Elf_Note *note, void *arg0, bool *res) * as for headers. */ static bool -__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, +__elfN(check_note)(struct image_params *imgp, const Elf_Brandnote *brandnote, int32_t *osrel, bool *has_fctl0, uint32_t *fctl0) { const Elf_Phdr *phdr; diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c index 1fa87955a299..c83255bc74ee 100644 --- a/sys/kern/kern_boottrace.c +++ b/sys/kern/kern_boottrace.c @@ -579,7 +579,7 @@ sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) } static void -boottrace_init(void) +boottrace_init(void *dummy __unused) { if (!boottrace_enabled) diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c index 7a2818c29b1a..a1696225df32 100644 --- a/sys/kern/kern_devctl.c +++ b/sys/kern/kern_devctl.c @@ -140,7 +140,7 @@ static struct devctlbridge { } devctl_notify_hook = { .send_f = NULL }; static void -devctl_init(void) +devctl_init(void *dummy __unused) { int reserve; uma_zone_t z; diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 23d8dc9cf54a..a6333d8011b1 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -156,7 +156,7 @@ static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); -static void knote_init(void); +static void knote_init(void *); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); @@ -2887,7 +2887,7 @@ knote_dequeue(struct knote *kn) } static void -knote_init(void) +knote_init(void *dummy __unused) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 0fc2d0e7f1bc..2bdd6faa025a 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -418,7 +418,7 @@ do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, #endif int error, i, orig_osrel; uint32_t orig_fctl0; - Elf_Brandinfo *orig_brandinfo; + const Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; @@ -1314,7 +1314,7 @@ exec_map_stack(struct image_params *imgp) MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; - vm_map_fixed(map, obj, 0, + error = vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index ab8ed32ad189..c4b1c8201ff2 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -807,7 +807,7 @@ kern_abort2(struct thread *td, const char *why, int nargs, void **uargs) } if (nargs > 0) { sbuf_putc(sb, '('); - for (i = 
0;i < nargs; i++) + for (i = 0; i < nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_putc(sb, ')'); } diff --git a/sys/kern/kern_jailmeta.c b/sys/kern/kern_jailmeta.c index 4e37eccad03a..91bb7155820d 100644 --- a/sys/kern/kern_jailmeta.c +++ b/sys/kern/kern_jailmeta.c @@ -599,22 +599,18 @@ SYSCTL_PROC(_security_jail, OID_AUTO, env, /* Setup and tear down. */ -static int +static void jm_sysinit(void *arg __unused) { meta.osd_slot = osd_jail_register(jm_osd_destructor, meta.methods); env.osd_slot = osd_jail_register(jm_osd_destructor, env.methods); - - return (0); } -static int +static void jm_sysuninit(void *arg __unused) { osd_jail_deregister(meta.osd_slot); osd_jail_deregister(env.osd_slot); - - return (0); } SYSINIT(jailmeta, SI_SUB_DRIVERS, SI_ORDER_ANY, jm_sysinit, NULL); diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index d566bc01bc5e..e2f63cbc0c5a 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -435,7 +435,7 @@ linker_file_register_modules(linker_file_t lf) } static void -linker_init_kernel_modules(void) +linker_init_kernel_modules(void *dummy __unused) { sx_xlock(&kld_sx); diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 653ce1ee556b..fcbfbe64f854 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -303,7 +303,7 @@ sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS) */ #if MALLOC_DEBUG_MAXZONES > 1 static void -tunable_set_numzones(void) +tunable_set_numzones(void *dummy __unused) { TUNABLE_INT_FETCH("debug.malloc.numzones", @@ -1302,7 +1302,7 @@ mallocinit(void *dummy) #endif align, UMA_ZONE_MALLOC); } - for (;i <= size; i+= KMEM_ZBASE) + for (; i <= size; i+= KMEM_ZBASE) kmemsize[i >> KMEM_ZSHIFT] = indx; } } diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 7351e9cb6313..2aab151aba08 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1312,7 +1312,7 @@ static struct kproc_desc racctd_kp = { }; static void -racctd_init(void) +racctd_init(void *dummy __unused) { if (!racct_enable) return; @@ -1322,7 +1322,7 @@ racctd_init(void) SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void -racct_init(void) +racct_init(void *dummy __unused) { if (!racct_enable) return; diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index 3854ffbeec29..cd66bff62608 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -300,7 +300,7 @@ static void rangelock_free_free(struct rl_q_entry *free); static void rangelock_noncheating_destroy(struct rangelock *lock); static void -rangelock_sys_init(void) +rangelock_sys_init(void *dummy __unused) { rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry), diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 4232c71f86fb..682ba86d23ff 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -209,7 +209,7 @@ static struct dict actionnames[] = { { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; -static void rctl_init(void); +static void rctl_init(void *); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; @@ -2175,7 +2175,7 @@ rctl_racct_release(struct racct *racct) } static void -rctl_init(void) +rctl_init(void *dummy __unused) { if (!racct_enable) diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c index 5b8398caaca9..f48d0e3d616b 100644 --- a/sys/kern/kern_sharedpage.c +++ b/sys/kern/kern_sharedpage.c @@ -130,8 +130,7 @@ shared_page_init(void 
*dummy __unused) shared_page_mapping = (char *)addr; } -SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, - NULL); +SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, shared_page_init, NULL); /* * Push the timehands update to the shared page. diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 8efc0886988b..21f765b17f62 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -113,7 +113,7 @@ static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); -static void sigqueue_start(void); +static void sigqueue_start(void *); static void sigfastblock_setpend(struct thread *td, bool resched); static void sig_handle_first_stop(struct thread *td, struct proc *p, int sig); @@ -344,7 +344,7 @@ ast_sigsuspend(struct thread *td, int tda __unused) } static void -sigqueue_start(void) +sigqueue_start(void *dummy __unused) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 2a6f0989f6aa..5b7485c25cd7 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -90,7 +90,7 @@ static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp); -static void itimer_start(void); +static void itimer_start(void *); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); @@ -1170,7 +1170,7 @@ eventratecheck(struct timeval *lasttime, int *cureps, int maxeps) } static void -itimer_start(void) +itimer_start(void *dummy __unused) { static const struct kclock rt_clock = { .timer_create = realtimer_create, diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c index 07a9cc0f57be..c4d0223d484f 100644 --- a/sys/kern/subr_devstat.c +++ b/sys/kern/subr_devstat.c @@ -415,7 +415,7 @@ sysctl_devstat(SYSCTL_HANDLER_ARGS) if (error != 0) return (error); - for (;nds != NULL;) { + while (nds != NULL) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c index 5c14e15830f4..c9a387a5e87b 100644 --- a/sys/kern/subr_pcpu.c +++ b/sys/kern/subr_pcpu.c @@ -140,7 +140,7 @@ uma_zone_t pcpu_zone_32; uma_zone_t pcpu_zone_64; static void -pcpu_zones_startup(void) +pcpu_zones_startup(void *dummy __unused) { pcpu_zone_4 = uma_zcreate("pcpu-4", 4, diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index db0ceb17b9f0..e2070ae3f865 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -766,7 +766,7 @@ reswitch: switch (ch = (u_char)*fmt++) { PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) - for (q=p;*q;q++) + for (q = p; *q; q++) PCHAR(*q); } break; diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index c221106ae067..bc0725230cca 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -586,7 +586,7 @@ soaio_enqueue(struct task *task) } static void -soaio_init(void) +soaio_init(void *dummy __unused) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 340d84666459..90489e99491a 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1069,6 +1069,21 @@ uipc_stream_sbspace(struct sockbuf *sb) return (min(space, mbspace)); } +/* + * UNIX version of generic sbwait() for writes. 
We wait on peer's receive + buffer, using our timeout. + */ +static int +uipc_stream_sbwait(struct socket *so, sbintime_t timeo) +{ + struct sockbuf *sb = &so->so_rcv; + + SOCK_RECVBUF_LOCK_ASSERT(so); + sb->sb_flags |= SB_WAIT; + return (msleep_sbt(&sb->sb_acc, SOCK_RECVBUF_MTX(so), PSOCK | PCATCH, + "sbwait", timeo, 0, 0)); +} + static int uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr, struct uio *uio0, struct mbuf *m, struct mbuf *c, int flags, @@ -1203,7 +1218,8 @@ restart: error = EWOULDBLOCK; goto out4; } - if ((error = sbwait(so2, SO_RCV)) != 0) { + if ((error = uipc_stream_sbwait(so2, + so->so_snd.sb_timeo)) != 0) { SOCK_RECVBUF_UNLOCK(so2); goto out4; } else @@ -1543,15 +1559,19 @@ restart: mc_init_m(&cmc, control); SOCK_RECVBUF_LOCK(so); - MPASS(!(sb->sb_state & SBS_CANTRCVMORE)); - - if (__predict_false(cmc.mc_len + sb->sb_ccc + - sb->sb_ctl > sb->sb_hiwat)) { + if (__predict_false( + (sb->sb_state & SBS_CANTRCVMORE) || + cmc.mc_len + sb->sb_ccc + sb->sb_ctl > + sb->sb_hiwat)) { /* - * Too bad, while unp_externalize() was - * failing, the other side had filled - * the buffer and we can't prepend data - * back. Losing data! + * While the lock was dropped and we + * were failing in unp_externalize(), + * the peer could have a) disconnected, + * b) filled the buffer so that we + * can't prepend data back. + * These are two edge conditions that + * we just can't handle, so lose the + * data and return the error. */ SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); @@ -2397,7 +2417,7 @@ uipc_sendfile_wait(struct socket *so, off_t need, int *space) } if (!sockref) soref(so2); - error = sbwait(so2, SO_RCV); + error = uipc_stream_sbwait(so2, so->so_snd.sb_timeo); if (error == 0 && __predict_false(sb->sb_state & SBS_CANTRCVMORE)) error = EPIPE; diff --git a/sys/libkern/arc4random.c b/sys/libkern/arc4random.c index 016822e9f03c..6fca7c3c4e9d 100644 --- a/sys/libkern/arc4random.c +++ b/sys/libkern/arc4random.c @@ -156,7 +156,7 @@ chacha20_randomstir(struct chacha20_s *chacha20) * Initialize the contexts.
*/ static void -chacha20_init(void) +chacha20_init(void *dummy __unused) { struct chacha20_s *chacha20; @@ -176,7 +176,7 @@ SYSINIT(chacha20, SI_SUB_LOCK, SI_ORDER_ANY, chacha20_init, NULL); static void -chacha20_uninit(void) +chacha20_uninit(void *dummy __unused) { struct chacha20_s *chacha20; diff --git a/sys/libkern/x86/crc32_sse42.c b/sys/libkern/x86/crc32_sse42.c index b79c7afbeeb1..94ffdc178910 100644 --- a/sys/libkern/x86/crc32_sse42.c +++ b/sys/libkern/x86/crc32_sse42.c @@ -199,8 +199,10 @@ crc32c_shift(uint32_t zeros[][256], uint32_t crc) static void #ifndef _KERNEL __attribute__((__constructor__)) -#endif crc32c_init_hw(void) +#else +crc32c_init_hw(void *dummy __unused) +#endif { crc32c_zeros(crc32c_long, LONG); crc32c_zeros(crc32c_2long, 2 * LONG); diff --git a/sys/modules/Makefile b/sys/modules/Makefile index feb9778c23da..63a0b3260e6d 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -577,6 +577,7 @@ _mlx5ib= mlx5ib ${MACHINE_CPUARCH} == "i386" _ena= ena _gve= gve +_igc= igc # gcc13 and earlier lack __builtin_bitcountg used by linux emulation .if !(${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} < 140000) _iwlwifi= iwlwifi @@ -747,7 +748,6 @@ _et= et _ftgpio= ftgpio _ftwd= ftwd _exca= exca -_igc= igc _io= io _itwd= itwd _ix= ix diff --git a/sys/modules/aic7xxx/ahc/Makefile b/sys/modules/aic7xxx/ahc/Makefile index 3741d4fb666f..6f9bdcb1d8bd 100644 --- a/sys/modules/aic7xxx/ahc/Makefile +++ b/sys/modules/aic7xxx/ahc/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/dev/aic7xxx KMOD= ahc SUBDIR+= ahc_isa ahc_pci diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index 2989ad580b97..7ebdc1d51945 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR= cxgb SUBDIR+= cxgb_t3fw diff --git a/sys/modules/dpdk_lpm4/Makefile b/sys/modules/dpdk_lpm4/Makefile index ff68fac78915..9bc2693aeffb 100644 --- a/sys/modules/dpdk_lpm4/Makefile +++ b/sys/modules/dpdk_lpm4/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/contrib/dpdk_rte_lpm KMOD= dpdk_lpm4 diff --git a/sys/modules/dpdk_lpm6/Makefile b/sys/modules/dpdk_lpm6/Makefile index f2248e5d1c1c..9de2c6650422 100644 --- a/sys/modules/dpdk_lpm6/Makefile +++ b/sys/modules/dpdk_lpm6/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/contrib/dpdk_rte_lpm KMOD= dpdk_lpm6 diff --git a/sys/modules/fib_dxr/Makefile b/sys/modules/fib_dxr/Makefile index 7d1996ba510f..f8a28abe957a 100644 --- a/sys/modules/fib_dxr/Makefile +++ b/sys/modules/fib_dxr/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/netinet KMOD= fib_dxr diff --git a/sys/modules/if_enc/Makefile b/sys/modules/if_enc/Makefile index 449d869d6a21..bd865a0216a4 100644 --- a/sys/modules/if_enc/Makefile +++ b/sys/modules/if_enc/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/net KMOD= if_enc diff --git a/sys/modules/if_gif/Makefile b/sys/modules/if_gif/Makefile index efcd6952a8ac..5e3fda3a51c6 100644 --- a/sys/modules/if_gif/Makefile +++ b/sys/modules/if_gif/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/net ${SYSDIR}/netinet ${SYSDIR}/netinet6 KMOD= if_gif diff --git a/sys/modules/if_gre/Makefile b/sys/modules/if_gre/Makefile 
index 9f50708a14d7..58bd03c23785 100644 --- a/sys/modules/if_gre/Makefile +++ b/sys/modules/if_gre/Makefile @@ -1,6 +1,5 @@ SYSDIR?=${SRCTOP}/sys .PATH: ${SYSDIR}/net ${SYSDIR}/netinet ${SYSDIR}/netinet6 -.include "${SYSDIR}/conf/kern.opts.mk" KMOD= if_gre SRCS= if_gre.c opt_inet.h opt_inet6.h opt_rss.h diff --git a/sys/modules/iser/Makefile b/sys/modules/iser/Makefile index 615199ec97a3..ff08ae6f346a 100644 --- a/sys/modules/iser/Makefile +++ b/sys/modules/iser/Makefile @@ -1,6 +1,4 @@ SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/dev/iser/ KMOD= iser diff --git a/sys/modules/ktest/Makefile b/sys/modules/ktest/Makefile index 151db53417df..d5f15576f38b 100644 --- a/sys/modules/ktest/Makefile +++ b/sys/modules/ktest/Makefile @@ -1,8 +1,6 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR= ktest \ ktest_example \ - ktest_netlink_message_writer + ktest_netlink_message_writer \ + ktest_tcphpts .include <bsd.subdir.mk> diff --git a/sys/modules/ktest/ktest/Makefile b/sys/modules/ktest/ktest/Makefile index 3d4f1a8c2cc0..9741662ef709 100644 --- a/sys/modules/ktest/ktest/Makefile +++ b/sys/modules/ktest/ktest/Makefile @@ -1,9 +1,5 @@ PACKAGE= tests - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - -.PATH: ${SYSDIR}/tests +.PATH: ${SRCTOP}/sys/tests KMOD= ktest SRCS= ktest.c diff --git a/sys/modules/ktest/ktest_example/Makefile b/sys/modules/ktest/ktest_example/Makefile index 2b572d867aa5..aacc8f0e4ca5 100644 --- a/sys/modules/ktest/ktest_example/Makefile +++ b/sys/modules/ktest/ktest_example/Makefile @@ -1,9 +1,8 @@ PACKAGE= tests -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" -.PATH: ${SYSDIR}/tests +.PATH: ${SRCTOP}/sys/tests KMOD= ktest_example SRCS= ktest_example.c diff --git a/sys/modules/ktest/ktest_netlink_message_writer/Makefile b/sys/modules/ktest/ktest_netlink_message_writer/Makefile index a91c45755d0d..3f05f9b26785 100644 --- a/sys/modules/ktest/ktest_netlink_message_writer/Makefile +++ b/sys/modules/ktest/ktest_netlink_message_writer/Makefile @@ -1,8 +1,6 @@ PACKAGE= tests SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/netlink KMOD= ktest_netlink_message_writer diff --git a/sys/modules/ktest/ktest_tcphpts/Makefile b/sys/modules/ktest/ktest_tcphpts/Makefile new file mode 100644 index 000000000000..b642c0cb4209 --- /dev/null +++ b/sys/modules/ktest/ktest_tcphpts/Makefile @@ -0,0 +1,13 @@ +PACKAGE= tests +WARNS?= 6 + +SYSDIR?=${SRCTOP}/sys +.include "${SYSDIR}/conf/kern.opts.mk" + +.PATH: ${SYSDIR}/netinet + +KMOD= ktest_tcphpts +SRCS= tcp_hpts_test.c + +.include <bsd.kmod.mk> + diff --git a/sys/modules/miiproxy/Makefile b/sys/modules/miiproxy/Makefile index 730bef4220cd..ab92ebe71b43 100644 --- a/sys/modules/miiproxy/Makefile +++ b/sys/modules/miiproxy/Makefile @@ -3,7 +3,7 @@ KMOD = miiproxy SRCS= miiproxy.c -SRCS+= bus_if.h mdio_if.h miibus_if.h opt_platform.h +SRCS+= bus_if.h device_if.h mdio_if.h miibus_if.h opt_platform.h CFLAGS+= -I${SRCTOP}/sys/dev/etherswitch .include <bsd.kmod.mk> diff --git a/sys/modules/netgraph/Makefile b/sys/modules/netgraph/Makefile index 94560d5c51d7..b2d65af16e7f 100644 --- a/sys/modules/netgraph/Makefile +++ b/sys/modules/netgraph/Makefile @@ -1,5 +1,3 @@ -# $Whistle: Makefile,v 1.5 1999/01/24 06:48:37 archie Exp $ - SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" diff --git a/sys/modules/netgraph/checksum/Makefile b/sys/modules/netgraph/checksum/Makefile index 
4e2b1f547a40..bbbc7363d045 100644 --- a/sys/modules/netgraph/checksum/Makefile +++ b/sys/modules/netgraph/checksum/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - KMOD= ng_checksum SRCS= ng_checksum.c opt_inet.h opt_inet6.h diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index 17b52aec1893..8c114ac51538 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -2,9 +2,6 @@ # Compile netmap as a module, useful if you want a netmap bridge # or loadable drivers. -.include <bsd.own.mk> # FreeBSD 10 and earlier -# .include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net CFLAGS += -I${.CURDIR}/../../ -D INET -D VIMAGE diff --git a/sys/modules/opensolaris/Makefile b/sys/modules/opensolaris/Makefile index 98f52057e45e..7e2d5f9101ad 100644 --- a/sys/modules/opensolaris/Makefile +++ b/sys/modules/opensolaris/Makefile @@ -1,4 +1,4 @@ -SYSDIR?= ${SRCTOP}/sys +SYSDIR?=${SRCTOP}/sys .PATH: ${SYSDIR}/cddl/compat/opensolaris/kern .PATH: ${SYSDIR}/contrib/openzfs/module/os/freebsd/spl diff --git a/sys/modules/ow/Makefile b/sys/modules/ow/Makefile index 76fefe3e63be..7aa9d2de8183 100644 --- a/sys/modules/ow/Makefile +++ b/sys/modules/ow/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR = ow owc ow_temp .include <bsd.subdir.mk> diff --git a/sys/modules/qlnx/Makefile b/sys/modules/qlnx/Makefile index 2121f9d586a6..291b681c809e 100644 --- a/sys/modules/qlnx/Makefile +++ b/sys/modules/qlnx/Makefile @@ -31,9 +31,6 @@ # # -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - SUBDIR=qlnxe SUBDIR+=qlnxev SUBDIR+=qlnxr diff --git a/sys/modules/rtwn/Makefile b/sys/modules/rtwn/Makefile index 9afdd2084ecb..f15cbbe8236b 100644 --- a/sys/modules/rtwn/Makefile +++ b/sys/modules/rtwn/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" KMOD = rtwn SRCS = if_rtwn.c if_rtwn_tx.c if_rtwn_rx.c if_rtwn_beacon.c \ diff --git a/sys/modules/rtwn_pci/Makefile b/sys/modules/rtwn_pci/Makefile index ce2144121e88..3fea80d7d256 100644 --- a/sys/modules/rtwn_pci/Makefile +++ b/sys/modules/rtwn_pci/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn/pci - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" KMOD = if_rtwn_pci SRCS = rtwn_pci_attach.c rtwn_pci_reg.c rtwn_pci_rx.c rtwn_pci_tx.c \ diff --git a/sys/modules/rtwn_usb/Makefile b/sys/modules/rtwn_usb/Makefile index 16899b8a8c49..6a73276d088c 100644 --- a/sys/modules/rtwn_usb/Makefile +++ b/sys/modules/rtwn_usb/Makefile @@ -1,7 +1,5 @@ .PATH: ${SRCTOP}/sys/dev/rtwn/usb - -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" KMOD = if_rtwn_usb SRCS = rtwn_usb_attach.c rtwn_usb_ep.c rtwn_usb_reg.c rtwn_usb_rx.c \ diff --git a/sys/modules/sound/driver/Makefile b/sys/modules/sound/driver/Makefile index ff9499fdf841..02703d4b591a 100644 --- a/sys/modules/sound/driver/Makefile +++ b/sys/modules/sound/driver/Makefile @@ -1,5 +1,4 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" +.include "${SRCTOP}/sys/conf/kern.opts.mk" # Modules that include binary-only blobs of microcode should be selectable by # MK_SOURCELESS_UCODE option (see below). 
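The module Makefile hunks above and below all follow one convention: kern.opts.mk is included only where a Makefile actually tests an MK_* option (as the sound/driver comment notes for MK_SOURCELESS_UCODE), and then via ${SRCTOP} directly rather than through a SYSDIR?= definition that exists only for the include. A minimal sketch of the resulting pattern, using a hypothetical if_example module (illustrative only, not part of this change):

.include "${SRCTOP}/sys/conf/kern.opts.mk"

KMOD=	if_example
SRCS=	if_example.c

# Pull in the binary-only firmware blob only when sourceless
# microcode is permitted by the build options.
.if ${MK_SOURCELESS_UCODE} != "no"
SRCS+=	example_fw.c
.endif

.include <bsd.kmod.mk>

Makefiles that test no MK_* knob simply drop the include altogether, as the hunks here do.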
diff --git a/sys/modules/sound/sound/Makefile b/sys/modules/sound/sound/Makefile index f3978e9bd9cc..169b1a2730ec 100644 --- a/sys/modules/sound/sound/Makefile +++ b/sys/modules/sound/sound/Makefile @@ -1,5 +1,4 @@ SYSDIR?=${SRCTOP}/sys - .PATH: ${SYSDIR}/dev/sound .PATH: ${SYSDIR}/dev/sound/pcm .PATH: ${SYSDIR}/dev/sound/midi diff --git a/sys/modules/tests/fib_lookup/Makefile b/sys/modules/tests/fib_lookup/Makefile index 7d6198396911..b78d4309f145 100644 --- a/sys/modules/tests/fib_lookup/Makefile +++ b/sys/modules/tests/fib_lookup/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - .PATH: ${SYSDIR}/tests/fib_lookup KMOD= test_lookup diff --git a/sys/modules/vnic/Makefile b/sys/modules/vnic/Makefile index 7b975bfebe81..53e208328159 100644 --- a/sys/modules/vnic/Makefile +++ b/sys/modules/vnic/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - CFLAGS+= -DFDT SUBDIR = mrmlbus thunder_mdio thunder_bgx vnicpf vnicvf diff --git a/sys/modules/vnic/mrmlbus/Makefile b/sys/modules/vnic/mrmlbus/Makefile index a3581b7a79a5..a8fe9e5474e1 100644 --- a/sys/modules/vnic/mrmlbus/Makefile +++ b/sys/modules/vnic/mrmlbus/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/thunder_bgx/Makefile b/sys/modules/vnic/thunder_bgx/Makefile index 90df4b25df90..bf46c3194493 100644 --- a/sys/modules/vnic/thunder_bgx/Makefile +++ b/sys/modules/vnic/thunder_bgx/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/thunder_mdio/Makefile b/sys/modules/vnic/thunder_mdio/Makefile index 37032516f3ca..07cc583bfaf8 100644 --- a/sys/modules/vnic/thunder_mdio/Makefile +++ b/sys/modules/vnic/thunder_mdio/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/vnicpf/Makefile b/sys/modules/vnic/vnicpf/Makefile index 37cd29e6fdd8..3cd64d08a788 100644 --- a/sys/modules/vnic/vnicpf/Makefile +++ b/sys/modules/vnic/vnicpf/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/modules/vnic/vnicvf/Makefile b/sys/modules/vnic/vnicvf/Makefile index c6ffaaa2c302..da938b7fd073 100644 --- a/sys/modules/vnic/vnicvf/Makefile +++ b/sys/modules/vnic/vnicvf/Makefile @@ -1,6 +1,3 @@ -SYSDIR?=${SRCTOP}/sys -.include "${SYSDIR}/conf/kern.opts.mk" - S= ${SRCTOP}/sys .PATH: $S/dev/vnic diff --git a/sys/net/route.c b/sys/net/route.c index 7a50bcc43e06..d2c9f3e39c17 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -89,7 +89,7 @@ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, * SI_ORDER_MIDDLE. 
*/ static void -route_init(void) +route_init(void *dummy __unused) { nhops_init(); diff --git a/sys/net/route/route_tables.c b/sys/net/route/route_tables.c index 176ca43fa1c5..3b7bb1385d0e 100644 --- a/sys/net/route/route_tables.c +++ b/sys/net/route/route_tables.c @@ -186,7 +186,7 @@ rtables_prison_destructor(void *data) } static void -rtables_init(void) +rtables_init(void *dummy __unused) { osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_ATTACH] = rtables_check_proc_fib, diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index f0dcc973ca7c..be858428bb3e 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -309,7 +309,7 @@ rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc) } static void -rtsock_init(void) +rtsock_init(void *dummy __unused) { rtsbridge_orig_p = rtsock_callback_p; rtsock_callback_p = &rtsbridge; diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index 3af56a228295..a8a767785fce 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -167,7 +167,7 @@ static ieee80211_send_action_func ht_send_action_ba_delba; static ieee80211_send_action_func ht_send_action_ht_txchwidth; static void -ieee80211_ht_init(void) +ieee80211_ht_init(void *dummy __unused) { /* * Setup HT parameters that depends on the clock frequency. diff --git a/sys/net80211/ieee80211_hwmp.c b/sys/net80211/ieee80211_hwmp.c index b69210768c54..084e67da13db 100644 --- a/sys/net80211/ieee80211_hwmp.c +++ b/sys/net80211/ieee80211_hwmp.c @@ -212,7 +212,7 @@ SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact, "mesh route inactivity timeout (ms)"); static void -ieee80211_hwmp_init(void) +ieee80211_hwmp_init(void *dummy __unused) { /* Default values as per amendment */ ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000); diff --git a/sys/net80211/ieee80211_mesh.c b/sys/net80211/ieee80211_mesh.c index 3f0410a69e3c..7f2e8bdcb963 100644 --- a/sys/net80211/ieee80211_mesh.c +++ b/sys/net80211/ieee80211_mesh.c @@ -548,7 +548,7 @@ mesh_gatemode_cb(void *arg) } static void -ieee80211_mesh_init(void) +ieee80211_mesh_init(void *dummy __unused) { memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths)); diff --git a/sys/net80211/ieee80211_phy.c b/sys/net80211/ieee80211_phy.c index 7f53c717152b..b4d9b16907d2 100644 --- a/sys/net80211/ieee80211_phy.c +++ b/sys/net80211/ieee80211_phy.c @@ -348,7 +348,7 @@ ieee80211_setup_ratetable(struct ieee80211_rate_table *rt) /* Setup all rate tables */ static void -ieee80211_phy_init(void) +ieee80211_phy_init(void *dummy __unused) { static struct ieee80211_rate_table * const ratetables[] = { &ieee80211_half_table, diff --git a/sys/net80211/ieee80211_proto.c b/sys/net80211/ieee80211_proto.c index 0c161d98a55a..4918bf7d025f 100644 --- a/sys/net80211/ieee80211_proto.c +++ b/sys/net80211/ieee80211_proto.c @@ -459,7 +459,7 @@ static const struct ieee80211_authenticator auth_internal = { * Setup internal authenticators once; they are never unregistered. 
*/ static void -ieee80211_auth_setup(void) +ieee80211_auth_setup(void *dummy __unused) { ieee80211_authenticator_register(IEEE80211_AUTH_OPEN, &auth_internal); ieee80211_authenticator_register(IEEE80211_AUTH_SHARED, &auth_internal); diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c index 10a5fc7f08ab..095c4108c768 100644 --- a/sys/net80211/ieee80211_vht.c +++ b/sys/net80211/ieee80211_vht.c @@ -102,7 +102,7 @@ vht_send_action_placeholder(struct ieee80211_node *ni, } static void -ieee80211_vht_init(void) +ieee80211_vht_init(void *dummy __unused) { ieee80211_recv_action_register(IEEE80211_ACTION_CAT_VHT, diff --git a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c index 0181a67ac604..f35712cc8f69 100644 --- a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c +++ b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c @@ -81,9 +81,6 @@ const STRUCT_USB_HOST_ID ubt_rtl_devs[] = { USB_VPI(0x0bda, 0xb00c, 0) }, { USB_VPI(0x0bda, 0xc822, 0) }, - /* Realtek 8822CU Bluetooth devices */ - { USB_VPI(0x13d3, 0x3549, 0) }, - /* Realtek 8851BE Bluetooth devices */ { USB_VPI(0x13d3, 0x3600, 0) }, diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index c20a20cd983d..bc06616dbf93 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -271,7 +271,7 @@ cc_check_default(struct cc_algo *remove_cc) * Initialise CC subsystem on system boot. */ static void -cc_init(void) +cc_init(void *dummy __unused) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); diff --git a/sys/netinet/in_fib_algo.c b/sys/netinet/in_fib_algo.c index 123dacb409e7..95621c300064 100644 --- a/sys/netinet/in_fib_algo.c +++ b/sys/netinet/in_fib_algo.c @@ -767,7 +767,7 @@ struct fib_lookup_module flm_radix4 = { }; static void -fib4_algo_init(void) +fib4_algo_init(void *dummy __unused) { fib_module_register(&flm_bsearch4); diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index f5b20c49ffd2..ba112afbf002 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -159,9 +159,6 @@ static struct ip_moptions * static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); -static struct ifnet * - inp_lookup_mcast_ifp(const struct inpcb *, - const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); @@ -1832,69 +1829,55 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) } /* - * Look up the ifnet to use for a multicast group membership, - * given the IPv4 address of an interface, and the IPv4 group address. - * - * This routine exists to support legacy multicast applications - * which do not understand that multicast memberships are scoped to - * specific physical links in the networking stack, or which need - * to join link-scope groups before IPv4 addresses are configured. - * - * Use this socket's current FIB number for any required FIB lookup. - * If ina is INADDR_ANY, look up the group address in the unicast FIB, - * and use its ifp; usually, this points to the default next-hop. - * - * If the FIB lookup fails, attempt to use the first non-loopback - * interface with multicast capability in the system as a - * last resort. 
The legacy IPv4 ASM API requires that we do - * this in order to allow groups to be joined when the routing - * table has not yet been populated during boot. - * - * Returns NULL if no ifp could be found, otherwise return referenced ifp. + * Look up the ifnet to join a multicast group membership via legacy + * IP_ADD_MEMBERSHIP or via more modern MCAST_JOIN_GROUP. * - * FUTURE: Implement IPv4 source-address selection. + * If the interface index was specified explicitly, just use it. If the + * address was specified (legacy), try to find matching interface. Else + * (index == 0 && no address) do a route lookup. If that fails for a modern + * MCAST_JOIN_GROUP return failure, for legacy IP_ADD_MEMBERSHIP find first + * multicast capable interface. */ static struct ifnet * -inp_lookup_mcast_ifp(const struct inpcb *inp, - const struct sockaddr_in *gsin, const struct in_addr ina) +inp_lookup_mcast_ifp(const struct inpcb *inp, const struct in_addr maddr, +const struct in_addr *ina, const u_int index) { struct ifnet *ifp; struct nhop_object *nh; NET_EPOCH_ASSERT(); - KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__)); - KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); - KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), - ("%s: not multicast", __func__)); - ifp = NULL; - if (!in_nullhost(ina)) { - INADDR_TO_IFP(ina, ifp); + if (index != 0) + return (ifnet_byindex_ref(index)); + + if (ina != NULL && !in_nullhost(*ina)) { + INADDR_TO_IFP(*ina, ifp); if (ifp != NULL) if_ref(ifp); - } else { - nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0); - if (nh != NULL) { - ifp = nh->nh_ifp; - if_ref(ifp); - } else { - struct in_ifaddr *ia; - struct ifnet *mifp; - - mifp = NULL; - CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - mifp = ia->ia_ifp; - if (!(mifp->if_flags & IFF_LOOPBACK) && - (mifp->if_flags & IFF_MULTICAST)) { - ifp = mifp; - if_ref(ifp); - break; - } + return (ifp); + } + + nh = fib4_lookup(inp->inp_inc.inc_fibnum, maddr, 0, NHR_NONE, 0); + if (nh != NULL) { + ifp = nh->nh_ifp; + if_ref(ifp); + return (ifp); + } + + if (ina != NULL) { + struct in_ifaddr *ia; + + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if (!(ia->ia_ifp->if_flags & IFF_LOOPBACK) && + (ia->ia_ifp->if_flags & IFF_MULTICAST)) { + ifp = ia->ia_ifp; + if_ref(ifp); + return (ifp); } } } - return (ifp); + return (NULL); } /* @@ -1926,13 +1909,13 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; + bool mreq; - if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) - error = sooptcopyin(sopt, &mreqn, - sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); - else - error = sooptcopyin(sopt, &mreqn, - sizeof(struct ip_mreq), sizeof(struct ip_mreq)); + mreq = (sopt->sopt_valsize != sizeof(struct ip_mreqn)); + + error = sooptcopyin(sopt, &mreqn, + mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn), + mreq ? sizeof(struct ip_mreq) : sizeof(struct ip_mreqn)); if (error) return (error); @@ -1943,12 +1926,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) return (EINVAL); NET_EPOCH_ENTER(et); - if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && - mreqn.imr_ifindex != 0) - ifp = ifnet_byindex_ref(mreqn.imr_ifindex); - else - ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, - mreqn.imr_address); + ifp = inp_lookup_mcast_ifp(inp, mreqn.imr_multiaddr, + mreq ? &mreqn.imr_address : NULL, + mreq ? 
0 : mreqn.imr_ifindex); NET_EPOCH_EXIT(et); break; } @@ -1971,8 +1951,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_addr = mreqs.imr_sourceaddr; NET_EPOCH_ENTER(et); - ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, - mreqs.imr_interface); + ifp = inp_lookup_mcast_ifp(inp, mreqs.imr_multiaddr, + &mreqs.imr_interface, 0); NET_EPOCH_EXIT(et); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); @@ -2013,7 +1993,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) return (EINVAL); NET_EPOCH_ENTER(et); - ifp = ifnet_byindex_ref(gsr.gsr_interface); + ifp = inp_lookup_mcast_ifp(inp, gsa->sin.sin_addr, NULL, + gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 374b5595fcbc..5b89ca026e85 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -519,7 +519,7 @@ siftr_pkt_manager_thread(void *arg) if (log_buf != NULL) { alq_post_flags(siftr_alq, log_buf, 0); } - for (;cnt > 0; cnt--) { + for (; cnt > 0; cnt--) { pkt_node = STAILQ_FIRST(&tmp_pkt_queue); STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); free(pkt_node, M_SIFTR_PKTNODE); diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 63bbe4bba11b..c54459bb5f01 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -39,15 +39,14 @@ * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules - * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The - * slot is the time from now that the stack wants to be called but it - * must be converted to tcp_hpts's notion of slot. This is done with - * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical + * itself to be called again by calling tcp_hpts_insert(tcpcb, usecs). The + * usecs is the time from now that the stack wants to be called and is + * passing time directly in microseconds. So a typical * call from the tcp_output() routine might look like: * - * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); + * tcp_hpts_insert(tp, 550, NULL); * - * The above would schedule tcp_output() to be called in 550 useconds. + * The above would schedule tcp_output() to be called in 550 microseconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: @@ -149,27 +148,44 @@ #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #include <netinet/tcp_log_buf.h> #ifdef tcp_offload #include <netinet/tcp_offload.h> #endif -/* - * The hpts uses a 102400 wheel. The wheel - * defines the time in 10 usec increments (102400 x 10). - * This gives a range of 10usec - 1024ms to place - * an entry within. If the user requests more than - * 1.024 second, a remaineder is attached and the hpts - * when seeing the remainder will re-insert the - * inpcb forward in time from where it is until - * the remainder is zero. - */ +/* Global instance for TCP HPTS */ +struct tcp_hptsi *tcp_hptsi_pace; + +/* Default function table for production use. 
*/ +const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = { + .microuptime = microuptime, + .swi_add = swi_add, + .swi_remove = swi_remove, + .swi_sched = swi_sched, + .intr_event_bind = intr_event_bind, + .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset, + .callout_init = callout_init, + .callout_reset_sbt_on = callout_reset_sbt_on, + ._callout_stop_safe = _callout_stop_safe, +}; -#define NUM_OF_HPTSI_SLOTS 102400 +#ifdef TCP_HPTS_KTEST +#define microuptime pace->funcs->microuptime +#define swi_add pace->funcs->swi_add +#define swi_remove pace->funcs->swi_remove +#define swi_sched pace->funcs->swi_sched +#define intr_event_bind pace->funcs->intr_event_bind +#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset +#define callout_init pace->funcs->callout_init +#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on +#define _callout_stop_safe pace->funcs->_callout_stop_safe +#endif -/* The number of connections after which the dynamic sleep logic kicks in. */ -#define DEFAULT_CONNECTION_THRESHOLD 100 +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); + +static void tcp_hpts_thread(void *ctx); /* * When using the hpts, a TCP stack must make sure @@ -204,87 +220,22 @@ * * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh * then we do a dynamic adjustment on the time we sleep. - * Our threshold is if the lateness of the first client served (in ticks) is + * Our threshold is if the lateness of the first client served (in slots) is * greater than or equal too slots_indicate_more_sleep (10ms - * or 10000 ticks). If we were that late, the actual sleep time - * is adjusted down by 50%. If the ticks_ran is less than - * slots_indicate_more_sleep (100 ticks or 1000usecs). + * or 10000 slots). If we were that late, the actual sleep time + * is adjusted down by 50%. If the slots_ran is less than + * slots_indicate_more_sleep (100 slots or 1000usecs). * */ -/* Each hpts has its own p_mtx which is used for locking */ -#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) -#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) -#define HPTS_TRYLOCK(hpts) mtx_trylock(&(hpts)->p_mtx) -#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) -struct tcp_hpts_entry { - /* Cache line 0x00 */ - struct mtx p_mtx; /* Mutex for hpts */ - struct timeval p_mysleep; /* Our min sleep time */ - uint64_t syscall_cnt; - uint64_t sleeping; /* What the actual sleep was (if sleeping) */ - uint16_t p_hpts_active; /* Flag that says hpts is awake */ - uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ - uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ - uint32_t p_runningslot; /* Current tick we are at if we are running */ - uint32_t p_prev_slot; /* Previous slot we were on */ - uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ - uint32_t p_nxt_slot; /* The next slot outside the current range of - * slots that the hpts is running on. 
*/ - int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t p_lasttick; /* Last tick before the current one */ - uint8_t p_direct_wake :1, /* boolean */ - p_on_min_sleep:1, /* boolean */ - p_hpts_wake_scheduled:1, /* boolean */ - hit_callout_thresh:1, - p_avail:4; - uint8_t p_fill[3]; /* Fill to 32 bits */ - /* Cache line 0x40 */ - struct hptsh { - TAILQ_HEAD(, tcpcb) head; - uint32_t count; - uint32_t gencnt; - } *p_hptss; /* Hptsi wheel */ - uint32_t p_hpts_sleep_time; /* Current sleep interval having a max - * of 255ms */ - uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ - uint32_t saved_lasttick; /* for logging */ - uint32_t saved_curtick; /* for logging */ - uint32_t saved_curslot; /* for logging */ - uint32_t saved_prev_slot; /* for logging */ - uint32_t p_delayed_by; /* How much were we delayed by */ - /* Cache line 0x80 */ - struct sysctl_ctx_list hpts_ctx; - struct sysctl_oid *hpts_root; - struct intr_event *ie; - void *ie_cookie; - uint16_t p_num; /* The hpts number one per cpu */ - uint16_t p_cpu; /* The hpts CPU */ - /* There is extra space in here */ - /* Cache line 0x100 */ - struct callout co __aligned(CACHE_LINE_SIZE); -} __aligned(CACHE_LINE_SIZE); - -static struct tcp_hptsi { - struct cpu_group **grps; - struct tcp_hpts_entry **rp_ent; /* Array of hptss */ - uint32_t *cts_last_ran; - uint32_t grp_cnt; - uint32_t rp_num_hptss; /* Number of hpts threads */ -} tcp_pace; - -static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -static int tcp_bind_threads = 1; +int tcp_bind_threads = 1; #else -static int tcp_bind_threads = 2; +int tcp_bind_threads = 2; #endif static int tcp_use_irq_cpu = 0; static int hpts_does_tp_logging = 0; - -static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout); -static void tcp_hpts_thread(void *ctx); - +static int32_t tcp_hpts_precision = 120; int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; @@ -295,23 +246,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP Hpts statistics"); -#define timersub(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ - (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ - if ((vvp)->tv_usec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_usec += 1000000; \ - } \ - } while (0) - -static int32_t tcp_hpts_precision = 120; - -static struct hpts_domain_info { - int count; - int cpu[MAXCPU]; -} hpts_domains[MAXMEMDOM]; - counter_u64_t hpts_hopelessly_behind; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -459,14 +393,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, "When we are over the threshold on the pacer do we prohibit wakeups?"); -static uint16_t -hpts_random_cpu(void) +uint16_t +tcp_hptsi_random_cpu(struct tcp_hptsi *pace) { uint16_t cpuid; uint32_t ran; ran = arc4random(); - cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); + cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss); return (cpuid); } @@ -487,13 +421,11 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, log.u_bbr.flex2 = hpts->p_cur_slot; log.u_bbr.flex3 = hpts->p_prev_slot; log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; log.u_bbr.flex7 = 
hpts->p_cpu; log.u_bbr.flex8 = (uint8_t)from_callout; log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usec(tv); log.u_bbr.epoch = hpts->saved_curslot; log.u_bbr.lt_epoch = hpts->saved_prev_slot; @@ -510,11 +442,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, } } +/* + * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI + * for the HPTS entry to run. + */ static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) +tcp_hpts_sleep_timeout(void *arg) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + struct tcp_hpts_entry *hpts; + + hpts = (struct tcp_hpts_entry *)arg; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + swi_sched(hpts->ie_cookie, 0); +} + +/* + * Reset the HPTS callout timer with the provided timeval. Returns the result + * of the callout_reset_sbt_on() function. + */ +static int +tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + sbintime_t sb; + +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + + /* Store off to make visible the actual sleep time */ + hpts->sleeping = tv->tv_usec; + + sb = tvtosbt(*tv); + return (callout_reset_sbt_on( + &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu, + (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)))); +} + +/* + * Schedules the SWI for the HPTS entry to run, if not already scheduled or + * running. + */ +void +tcp_hpts_wake(struct tcp_hpts_entry *hpts) +{ +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif + HPTS_MTX_ASSERT(hpts); +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif + if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { hpts->p_direct_wake = 0; return; @@ -526,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts) } static void -hpts_timeout_swi(void *arg) -{ - struct tcp_hpts_entry *hpts; - - hpts = (struct tcp_hpts_entry *)arg; - swi_sched(hpts->ie_cookie, 0); -} - -static void tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) { struct inpcb *inp = tptoinpcb(tp); @@ -562,13 +541,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) } static struct tcp_hpts_entry * -tcp_hpts_lock(struct tcpcb *tp) +tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; INP_LOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; + hpts = pace->rp_ent[tp->t_hpts_cpu]; HPTS_LOCK(hpts); return (hpts); @@ -595,11 +574,10 @@ tcp_hpts_release(struct tcpcb *tp) * and has never received a first packet. */ void -tcp_hpts_init(struct tcpcb *tp) +__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp) { - if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { - tp->t_hpts_cpu = hpts_random_cpu(); + tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace); MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET)); } } @@ -611,14 +589,14 @@ tcp_hpts_init(struct tcpcb *tp) * INP lock and then get the hpts lock.
*/ void -tcp_hpts_remove(struct tcpcb *tp) +__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; struct hptsh *hptsh; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_ONQUEUE) { hptsh = &hpts->p_hptss[tp->t_hpts_slot]; tp->t_hpts_request = 0; @@ -662,23 +640,19 @@ hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot - * is that plus ticks out? + * is that plus slots out? */ - KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); + KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot)); return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int -tick_to_wheel(uint32_t cts_in_wticks) +cts_to_wheel(uint32_t cts) { /* - * Given a timestamp in ticks (so by - * default to get it to a real time one - * would multiply by 10.. i.e the number - * of ticks in a slot) map it to our limited - * space wheel. + * Given a timestamp in useconds map it to our limited space wheel. */ - return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); + return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS); } static inline int @@ -721,7 +695,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { end_slot = hpts->p_runningslot; - /* Back up one tick */ + /* Back up one slot */ if (end_slot == 0) end_slot = NUM_OF_HPTSI_SLOTS - 1; else @@ -734,7 +708,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * * not active, or we have * completed the pass over * the wheel, we can use the - * prev tick and subtract one from it. This puts us + * prev slot and subtract one from it. This puts us * as far out as possible on the wheel. */ end_slot = hpts->p_prev_slot; @@ -747,7 +721,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note - * that wheel_tick, passed in, should be the current time + * that wheel_slot, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ if (hpts->p_prev_slot != wheel_slot) @@ -824,7 +798,7 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * #ifdef INVARIANTS static void check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, - uint32_t hptsslot, int line) + uint32_t hptsslot) { /* * Sanity checks for the pacer with invariants @@ -855,12 +829,13 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, } #endif -uint32_t -tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) +void +__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; struct timeval tv; - uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; + uint32_t slot, wheel_cts, last_slot, need_new_to = 0; int32_t wheel_slot, maxslots; bool need_wakeup = false; @@ -869,11 +844,13 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE)); /* + * Convert microseconds to slots for internal use. * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. 
*/ - hpts = tcp_hpts_lock(tp); + slot = HPTS_USEC_TO_SLOTS(usecs); + hpts = tcp_hpts_lock(pace, tp); microuptime(&tv); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); @@ -882,8 +859,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; - diag->p_curtick = hpts->p_curtick; - diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; @@ -910,17 +885,15 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ * timeout is not 1. */ hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - return (slot_on); + return; } - /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hpts_slot(&tv); - /* Map it onto the wheel */ - wheel_slot = tick_to_wheel(wheel_cts); + /* Get the current time stamp and map it onto the wheel */ + wheel_cts = tcp_tv_to_usec(&tv); + wheel_slot = cts_to_wheel(wheel_cts); /* Now what's the max we can place it at? */ maxslots = max_slots_available(hpts, wheel_slot, &last_slot); if (diag) { @@ -952,11 +925,11 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tp->t_hpts_slot = last_slot; } if (diag) { - diag->slot_remaining = tp->t_hpts_request; + diag->time_remaining = tp->t_hpts_request; diag->inp_hptsslot = tp->t_hpts_slot; } #ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); + check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot); #endif if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) tcp_hpts_insert_internal(tp, hpts); @@ -995,12 +968,12 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } /* * Now how far is the hpts sleeping to? if active is 1, its - * up and ticking we do nothing, otherwise we may need to + * up and running we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); + tcp_hpts_wake(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; @@ -1008,7 +981,6 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ } else if (need_new_to) { int32_t co_ret; struct timeval tv; - sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; @@ -1016,24 +988,18 @@ tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_ tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } - tv.tv_usec = need_new_to; - sb = tvtosbt(tv); - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? 
*/ + co_ret = tcp_hpts_sleep(hpts, &tv); if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } - slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); - - return (slot_on); } static uint16_t -hpts_cpuid(struct tcpcb *tp, int *failed) +hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed) { struct inpcb *inp = tptoinpcb(tp); u_int cpuid; @@ -1060,7 +1026,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); else return (cpuid); #endif @@ -1071,7 +1037,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) */ if (inp->inp_flowtype == M_HASHTYPE_NONE) { counter_u64_add(cpu_uses_random, 1); - return (hpts_random_cpu()); + return (tcp_hptsi_random_cpu(pace)); } /* * Hash to a thread based on the flowid. If we are using numa, @@ -1086,7 +1052,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed) #ifdef NUMA } else { /* Hash into the cpu's that use that domain */ - di = &hpts_domains[inp->inp_numa_domain]; + di = &pace->domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } #endif @@ -1118,9 +1084,16 @@ tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) } } -static int32_t +static bool +tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run) +{ + return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT)); +} + +int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) { + struct tcp_hptsi *pace; struct tcpcb *tp; struct timeval tv; int32_t slots_to_run, i, error; @@ -1130,6 +1103,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; int32_t orig_exit_slot; + uint32_t cts, cts_last_run; bool completed_measure, seen_endpoint; completed_measure = false; @@ -1137,32 +1111,34 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); + + pace = hpts->p_hptsi; + MPASS(pace != NULL); + /* record previous info for any logging */ - hpts->saved_lasttick = hpts->p_lasttick; - hpts->saved_curtick = hpts->p_curtick; hpts->saved_curslot = hpts->p_cur_slot; hpts->saved_prev_slot = hpts->p_prev_slot; - hpts->p_lasttick = hpts->p_curtick; - hpts->p_curtick = tcp_gethptstick(&tv); - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); - orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + microuptime(&tv); + cts_last_run = pace->cts_last_ran[hpts->p_cpu]; + pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv); + + orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts); if ((hpts->p_on_queue_cnt == 0) || - (hpts->p_lasttick == hpts->p_curtick)) { + !tcp_hpts_different_slots(cts, cts_last_run)) { /* - * No time has yet passed, - * or nothing to do. + * Not enough time has yet passed or nothing to do. 
*/ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; goto no_run; } again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > (NUM_OF_HPTSI_SLOTS - 1)) && - (hpts->p_on_queue_cnt != 0)) { + if ((hpts->p_on_queue_cnt != 0) && + ((cts - cts_last_run) > + ((NUM_OF_HPTSI_SLOTS-1) * HPTS_USECS_PER_SLOT))) { /* * Wheel wrap is occuring, basically we * are behind and the distance between @@ -1238,7 +1214,7 @@ again: uint32_t runningslot; /* - * Calculate our delay, if there are no extra ticks there + * Calculate our delay, if there are no extra slots there * was not any (i.e. if slots_to_run == 1, no delay). */ hpts->p_delayed_by = (slots_to_run - (i + 1)) * @@ -1391,7 +1367,7 @@ again: * gets added to the hpts (not this one) * :-) */ - tcp_set_hpts(tp); + __tcp_set_hpts(pace, tp); } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ @@ -1450,10 +1426,12 @@ no_one: hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run - * more ticks (if we did not hit eno-bufs). + * more slots (if we did not hit eno-bufs). */ hpts->p_prev_slot = hpts->p_cur_slot; - hpts->p_lasttick = hpts->p_curtick; + microuptime(&tv); + cts_last_run = cts; + cts = tcp_tv_to_usec(&tv); if (!from_callout || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have @@ -1465,7 +1443,7 @@ no_one: * can never catch up :( * * We will just lie to this thread - * and let it thing p_curtick is + * and let it think p_curslot is * correct. When it next awakens * it will find itself further behind. */ @@ -1473,20 +1451,19 @@ no_one: counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } - hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + + hpts->p_cur_slot = cts_to_wheel(cts); if (!seen_endpoint) { /* We saw no endpoint but we may be looping */ orig_exit_slot = hpts->p_cur_slot; } - if ((wrap_loop_cnt < 2) && - (hpts->p_lasttick != hpts->p_curtick)) { + if ((wrap_loop_cnt < 2) && tcp_hpts_different_slots(cts, cts_last_run)) { counter_u64_add(hpts_loops, 1); loop_cnt++; goto again; } no_run: - tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usec(&tv); + pace->cts_last_ran[hpts->p_cpu] = cts; /* * Set flag to tell that we are done for * any slot input that happens during @@ -1494,25 +1471,36 @@ no_run: */ hpts->p_wheel_complete = 1; /* - * Now did we spend too long running input and need to run more ticks? - * Note that if wrap_loop_cnt < 2 then we should have the conditions - * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt - * is greater than 2, then the condtion most likely are *not* true. - * Also if we are called not from the callout, we don't run the wheel - * multiple times so the slots may not align either. - */ - KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || - (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, - hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(((hpts->p_lasttick == hpts->p_curtick) - || (wrap_loop_cnt >= 2) || !from_callout), - ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, - hpts->p_lasttick, hpts->p_curtick)); - if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { - hpts->p_curtick = tcp_gethptstick(&tv); + * If enough time has elapsed that we should be processing the next + * slot(s), then we should have kept running and not marked the wheel as + * complete. 
+ * + * But there are several other conditions where we would have stopped + * processing, so the prev/cur slots and cts variables won't match. + * These conditions are: + * + * - Calls not from callouts don't run multiple times + * - The wheel is empty + * - We've processed more than max_pacer_loops times + * - We've wrapped more than 2 times + * + * This assert catches when the logic above has violated this design. + * + */ + KASSERT((!from_callout || (hpts->p_on_queue_cnt == 0) || + (loop_cnt > max_pacer_loops) || (wrap_loop_cnt >= 2) || + ((hpts->p_prev_slot == hpts->p_cur_slot) && + !tcp_hpts_different_slots(cts, cts_last_run))), + ("H:%p Shouldn't be done! prev_slot:%u, cur_slot:%u, " + "cts_last_run:%u, cts:%u, loop_cnt:%d, wrap_loop_cnt:%d", + hpts, hpts->p_prev_slot, hpts->p_cur_slot, + cts_last_run, cts, loop_cnt, wrap_loop_cnt)); + + if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) { + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + hpts->p_cur_slot = cts_to_wheel(cts); counter_u64_add(hpts_loops, 1); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } @@ -1526,16 +1514,16 @@ no_run: } void -tcp_set_hpts(struct tcpcb *tp) +__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(tp); + hpts = tcp_hpts_lock(pace, tp); if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { - tp->t_hpts_cpu = hpts_cpuid(tp, &failed); + tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed); if (failed == 0) tp->t_flags2 |= TF2_HPTS_CPU_SET; } @@ -1543,33 +1531,35 @@ tcp_set_hpts(struct tcpcb *tp) } static struct tcp_hpts_entry * -tcp_choose_hpts_to_run(void) +tcp_choose_hpts_to_run(struct tcp_hptsi *pace) { + struct timeval tv; int i, oldest_idx, start, end; uint32_t cts, time_since_ran, calc; - cts = tcp_get_usecs(NULL); + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); time_since_ran = 0; /* Default is all one group */ start = 0; - end = tcp_pace.rp_num_hptss; + end = pace->rp_num_hptss; /* * If we have more than one L3 group figure out which one * this CPU is in. 
*/ - if (tcp_pace.grp_cnt > 1) { - for (i = 0; i < tcp_pace.grp_cnt; i++) { - if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { - start = tcp_pace.grps[i]->cg_first; - end = (tcp_pace.grps[i]->cg_last + 1); + if (pace->grp_cnt > 1) { + for (i = 0; i < pace->grp_cnt; i++) { + if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) { + start = pace->grps[i]->cg_first; + end = (pace->grps[i]->cg_last + 1); break; } } } oldest_idx = -1; for (i = start; i < end; i++) { - if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) - calc = cts - tcp_pace.cts_last_ran[i]; + if (TSTMP_GT(cts, pace->cts_last_ran[i])) + calc = cts - pace->cts_last_ran[i]; else calc = 0; if (calc > time_since_ran) { @@ -1578,9 +1568,9 @@ tcp_choose_hpts_to_run(void) } } if (oldest_idx >= 0) - return(tcp_pace.rp_ent[oldest_idx]); + return(pace->rp_ent[oldest_idx]); else - return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); + return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]); } static void @@ -1588,9 +1578,9 @@ __tcp_run_hpts(void) { struct epoch_tracker et; struct tcp_hpts_entry *hpts; - int ticks_ran; + int slots_ran; - hpts = tcp_choose_hpts_to_run(); + hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace); if (hpts->p_hpts_active) { /* Already active */ @@ -1606,12 +1596,11 @@ __tcp_run_hpts(void) hpts->syscall_cnt++; counter_u64_add(hpts_direct_call, 1); hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, false); + slots_ran = tcp_hptsi(hpts, false); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { - if (ticks_ran > slots_indicate_less_sleep) { + if (slots_ran > slots_indicate_less_sleep) { struct timeval tv; - sbintime_t sb; hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) @@ -1635,13 +1624,8 @@ __tcp_run_hpts(void) * the dynamic value and set the on_min_sleep * flag so we will not be awoken. */ - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else if (ticks_ran < slots_indicate_more_sleep) { + (void)tcp_hpts_sleep(hpts, &tv); + } else if (slots_ran < slots_indicate_more_sleep) { /* For the further sleep, don't reschedule hpts */ hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) @@ -1658,17 +1642,22 @@ out_with_mtx: static void tcp_hpts_thread(void *ctx) { +#ifdef TCP_HPTS_KTEST + struct tcp_hptsi *pace; +#endif struct tcp_hpts_entry *hpts; struct epoch_tracker et; struct timeval tv; - sbintime_t sb; - int ticks_ran; + int slots_ran; hpts = (struct tcp_hpts_entry *)ctx; +#ifdef TCP_HPTS_KTEST + pace = hpts->p_hptsi; +#endif HPTS_LOCK(hpts); if (hpts->p_direct_wake) { /* Signaled by input or output with low occupancy count. */ - callout_stop(&hpts->co); + _callout_stop_safe(&hpts->co, 0); counter_u64_add(hpts_direct_awakening, 1); } else { /* Timed out, the normal case. */ @@ -1721,7 +1710,7 @@ tcp_hpts_thread(void *ctx) } hpts->sleeping = 0; hpts->p_hpts_active = 1; - ticks_ran = tcp_hptsi(hpts, true); + slots_ran = tcp_hptsi(hpts, true); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { @@ -1737,11 +1726,11 @@ tcp_hpts_thread(void *ctx) * Only adjust sleep time if we were * called from the callout i.e. direct_wake == 0. 
*/ - if (ticks_ran < slots_indicate_more_sleep) { + if (slots_ran < slots_indicate_more_sleep) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; - } else if (ticks_ran > slots_indicate_less_sleep) { + } else if (slots_ran > slots_indicate_less_sleep) { hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; @@ -1797,18 +1786,11 @@ tcp_hpts_thread(void *ctx) hpts->p_hpts_active = 0; back_to_sleep: hpts->p_direct_wake = 0; - sb = tvtosbt(tv); - /* Store off to make visible the actual sleep time */ - hpts->sleeping = tv.tv_usec; - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + (void)tcp_hpts_sleep(hpts, &tv); NET_EPOCH_EXIT(et); HPTS_UNLOCK(hpts); } -#undef timersub - static int32_t hpts_count_level(struct cpu_group *cg) { @@ -1845,57 +1827,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g } } -static void -tcp_hpts_mod_load(void) +/* + * Initialize a tcp_hptsi structure. This performs the core initialization + * without starting threads. + */ +struct tcp_hptsi* +tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl) { + struct tcp_hptsi *pace; struct cpu_group *cpu_top; - int32_t error __diagused; - int32_t i, j, bound = 0, created = 0; + uint32_t i, j, cts; + int32_t count; size_t sz, asz; struct timeval tv; - sbintime_t sb; struct tcp_hpts_entry *hpts; - struct pcpu *pc; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; - int count, domain; + KASSERT(funcs != NULL, ("funcs is NULL")); + + /* Allocate the main structure */ + pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO); + if (pace == NULL) + return (NULL); + + memset(pace, 0, sizeof(*pace)); + pace->funcs = funcs; + + /* Setup CPU topology information */ #ifdef SMP cpu_top = smp_topo(); #else cpu_top = NULL; #endif - tcp_pace.rp_num_hptss = ncpus; - hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); - hpts_loops = counter_u64_alloc(M_WAITOK); - back_tosleep = counter_u64_alloc(M_WAITOK); - combined_wheel_wrap = counter_u64_alloc(M_WAITOK); - wheel_wrap = counter_u64_alloc(M_WAITOK); - hpts_wake_timeout = counter_u64_alloc(M_WAITOK); - hpts_direct_awakening = counter_u64_alloc(M_WAITOK); - hpts_back_tosleep = counter_u64_alloc(M_WAITOK); - hpts_direct_call = counter_u64_alloc(M_WAITOK); - cpu_uses_flowid = counter_u64_alloc(M_WAITOK); - cpu_uses_random = counter_u64_alloc(M_WAITOK); + pace->rp_num_hptss = ncpus; - sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); - tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); - tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); - tcp_pace.grp_cnt = 0; + /* Allocate hpts entry array */ + sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *)); + pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); + + /* Allocate timestamp tracking array */ + sz = (sizeof(uint32_t) * pace->rp_num_hptss); + pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); + + /* Setup CPU groups */ if (cpu_top == NULL) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = 1; } else { /* Find out how many cache level 3 domains we have */ count = 0; - tcp_pace.grp_cnt = hpts_count_level(cpu_top); - if (tcp_pace.grp_cnt == 0) { - tcp_pace.grp_cnt = 1; + pace->grp_cnt = hpts_count_level(cpu_top); + if (pace->grp_cnt == 0) { + pace->grp_cnt = 1; 
} - sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); - tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); + sz = (pace->grp_cnt * sizeof(struct cpu_group *)); + pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK); /* Now populate the groups */ - if (tcp_pace.grp_cnt == 1) { + if (pace->grp_cnt == 1) { /* * All we need is the top level all cpu's are in * the same cache so when we use grp[0]->cg_mask @@ -1903,193 +1891,290 @@ tcp_hpts_mod_load(void) * all cpu's in it. The level here is probably * zero which is ok. */ - tcp_pace.grps[0] = cpu_top; + pace->grps[0] = cpu_top; } else { /* * Here we must find all the level three cache domains * and setup our pointers to them. */ count = 0; - hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); + hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top); } } + + /* Cache the current time for initializing the hpts entries */ + microuptime(&tv); + cts = tcp_tv_to_usec(&tv); + + /* Initialize each hpts entry */ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), + for (i = 0; i < pace->rp_num_hptss; i++) { + pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); - tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); - hpts = tcp_pace.rp_ent[i]; - /* - * Init all the hpts structures that are not specifically - * zero'd by the allocations. Also lets attach them to the - * appropriate sysctl block as well. - */ - mtx_init(&hpts->p_mtx, "tcp_hpts_lck", - "hpts", MTX_DEF | MTX_DUPOK); - for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { - TAILQ_INIT(&hpts->p_hptss[j].head); - hpts->p_hptss[j].count = 0; - hpts->p_hptss[j].gencnt = 0; - } - sysctl_ctx_init(&hpts->hpts_ctx); - sprintf(unit, "%d", i); - hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), - OID_AUTO, - unit, - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - ""); - SYSCTL_ADD_INT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "out_qcnt", CTLFLAG_RD, - &hpts->p_on_queue_cnt, 0, - "Count TCB's awaiting output processing"); - SYSCTL_ADD_U16(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "active", CTLFLAG_RD, - &hpts->p_hpts_active, 0, - "Is the hpts active"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curslot", CTLFLAG_RD, - &hpts->p_cur_slot, 0, - "What the current running pacers goal"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "runtick", CTLFLAG_RD, - &hpts->p_runningslot, 0, - "What the running pacers current slot is"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curtick", CTLFLAG_RD, - &hpts->p_curtick, 0, - "What the running pacers last tick mapped to the wheel was"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "lastran", CTLFLAG_RD, - &tcp_pace.cts_last_ran[i], 0, - "The last usec tick that this hpts ran"); - SYSCTL_ADD_LONG(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "cur_min_sleep", CTLFLAG_RD, - &hpts->p_mysleep.tv_usec, - "What the running pacers is using for p_mysleep.tv_usec"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "now_sleeping", CTLFLAG_RD, - &hpts->sleeping, 0, - "What the running pacers is actually sleeping for"); - SYSCTL_ADD_U64(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "syscall_cnt", CTLFLAG_RD, - 
&hpts->syscall_cnt, 0,
-		    "How many times we had syscalls on this hpts");
+		pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+		    M_WAITOK | M_ZERO);
+		hpts = pace->rp_ent[i];
+
+		/* Basic initialization */
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
-		hpts->p_num = i;
-		hpts->p_curtick = tcp_gethptstick(&tv);
-		tcp_pace.cts_last_ran[i] = tcp_tv_to_usec(&tv);
-		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
-		hpts->p_cpu = 0xffff;
+		hpts->p_cpu = i;
+		pace->cts_last_ran[i] = cts;
+		hpts->p_cur_slot = cts_to_wheel(cts);
+		hpts->p_prev_slot = hpts->p_cur_slot;
 		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
+		hpts->p_hptsi = pace;
+		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+		    MTX_DEF | MTX_DUPOK);
+		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+			TAILQ_INIT(&hpts->p_hptss[j].head);
+		}
+
+		/* Setup SYSCTL if requested */
+		if (enable_sysctl) {
+			sysctl_ctx_init(&hpts->hpts_ctx);
+			sprintf(unit, "%d", i);
+			hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+			    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+			    OID_AUTO,
+			    unit,
+			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+			    "");
+			SYSCTL_ADD_INT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "out_qcnt", CTLFLAG_RD,
+			    &hpts->p_on_queue_cnt, 0,
+			    "Count TCB's awaiting output processing");
+			SYSCTL_ADD_U16(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "active", CTLFLAG_RD,
+			    &hpts->p_hpts_active, 0,
+			    "Is the hpts active");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "curslot", CTLFLAG_RD,
+			    &hpts->p_cur_slot, 0,
+			    "What the current running pacers goal");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "runslot", CTLFLAG_RD,
+			    &hpts->p_runningslot, 0,
+			    "What the running pacers current slot is");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "lastran", CTLFLAG_RD,
+			    &pace->cts_last_ran[i], 0,
+			    "The last usec timestamp that this hpts ran");
+			SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+			    &hpts->p_mysleep.tv_usec,
+			    "What the running pacers is using for p_mysleep.tv_usec");
+			SYSCTL_ADD_U64(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "now_sleeping", CTLFLAG_RD,
+			    &hpts->sleeping, 0,
+			    "What the running pacers is actually sleeping for");
+			SYSCTL_ADD_U64(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+			    &hpts->syscall_cnt, 0,
+			    "How many times we had syscalls on this hpts");
+		}
 	}
-	/* Don't try to bind to NUMA domains if we don't have any */
-	if (vm_ndomains == 1 && tcp_bind_threads == 2)
-		tcp_bind_threads = 0;
-	/*
-	 * Now lets start ithreads to handle the hptss.
-	 */
-	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-		hpts = tcp_pace.rp_ent[i];
-		hpts->p_cpu = i;
+	return (pace);
+}
+
+/*
+ * Create threads for a tcp_hptsi structure and start timers for the current
+ * (minimum) sleep interval.
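Taken together with tcp_hptsi_create() above, the lifecycle these functions form (mirrored by tcp_hpts_mod_load()/tcp_hpts_mod_unload() and by the ktests below) is, as a sketch with error handling elided:

	struct tcp_hptsi *pace;

	pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, false);
	tcp_hptsi_start(pace);		/* spin up SWI threads, arm callouts */
	/* ... connections are paced ... */
	tcp_hptsi_stop(pace);		/* drain callouts, remove SWI handlers */
	tcp_hptsi_destroy(pace);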
+ */ +void +tcp_hptsi_start(struct tcp_hptsi *pace) +{ + struct tcp_hpts_entry *hpts; + struct pcpu *pc; + struct timeval tv; + uint32_t i, j; + int count, domain; + int error __diagused; + + KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL")); + + /* Start threads for each hpts entry */ + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + + KASSERT(hpts->ie_cookie == NULL, + ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i)); error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); KASSERT(error == 0, - ("Can't add hpts:%p i:%d err:%d", - hpts, i, error)); - created++; - hpts->p_mysleep.tv_sec = 0; - hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; + ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); + if (tcp_bind_threads == 1) { - if (intr_event_bind(hpts->ie, i) == 0) - bound++; + (void)intr_event_bind(hpts->ie, i); } else if (tcp_bind_threads == 2) { /* Find the group for this CPU (i) and bind into it */ - for (j = 0; j < tcp_pace.grp_cnt; j++) { - if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { + for (j = 0; j < pace->grp_cnt; j++) { + if (CPU_ISSET(i, &pace->grps[j]->cg_mask)) { if (intr_event_bind_ithread_cpuset(hpts->ie, - &tcp_pace.grps[j]->cg_mask) == 0) { - bound++; + &pace->grps[j]->cg_mask) == 0) { pc = pcpu_find(i); domain = pc->pc_domain; - count = hpts_domains[domain].count; - hpts_domains[domain].cpu[count] = i; - hpts_domains[domain].count++; + count = pace->domains[domain].count; + pace->domains[domain].cpu[count] = i; + pace->domains[domain].count++; break; } } } } + + hpts->p_mysleep.tv_sec = 0; + hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_USECS_PER_SLOT; - hpts->sleeping = tv.tv_usec; - sb = tvtosbt(tv); - callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } - /* - * If we somehow have an empty domain, fall back to choosing - * among all htps threads. - */ - for (i = 0; i < vm_ndomains; i++) { - if (hpts_domains[i].count == 0) { - tcp_bind_threads = 0; - break; - } + (void)tcp_hpts_sleep(hpts, &tv); } - tcp_hpts_softclock = __tcp_run_hpts; - tcp_lro_hpts_init(); - printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", - created, bound, - tcp_bind_threads == 2 ? "NUMA domains" : "cpus"); } -static void -tcp_hpts_mod_unload(void) +/* + * Stop all callouts/threads for a tcp_hptsi structure. + */ +void +tcp_hptsi_stop(struct tcp_hptsi *pace) { + struct tcp_hpts_entry *hpts; int rv __diagused; + uint32_t i; - tcp_lro_hpts_uninit(); - atomic_store_ptr(&tcp_hpts_softclock, NULL); + KASSERT(pace != NULL, ("tcp_hptsi_stop: pace is NULL")); - for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { - struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; + for (i = 0; i < pace->rp_num_hptss; i++) { + hpts = pace->rp_ent[i]; + KASSERT(hpts != NULL, ("tcp_hptsi_stop: hpts[%d] is NULL", i)); + KASSERT(hpts->ie_cookie != NULL, + ("tcp_hptsi_stop: hpts[%d]->ie_cookie is NULL", i)); - rv = callout_drain(&hpts->co); + rv = _callout_stop_safe(&hpts->co, CS_DRAIN); MPASS(rv != 0); rv = swi_remove(hpts->ie_cookie); MPASS(rv == 0); + hpts->ie_cookie = NULL; + } +} - rv = sysctl_ctx_free(&hpts->hpts_ctx); - MPASS(rv == 0); +/* + * Destroy a tcp_hptsi structure initialized by tcp_hptsi_create. 
+ */
+void
+tcp_hptsi_destroy(struct tcp_hptsi *pace)
+{
+	struct tcp_hpts_entry *hpts;
+	uint32_t i;
+
+	KASSERT(pace != NULL, ("tcp_hptsi_destroy: pace is NULL"));
+	KASSERT(pace->rp_ent != NULL, ("tcp_hptsi_destroy: pace->rp_ent is NULL"));
+
+	/* Cleanup each hpts entry */
+	for (i = 0; i < pace->rp_num_hptss; i++) {
+		hpts = pace->rp_ent[i];
+		if (hpts != NULL) {
+			/* Cleanup SYSCTL if it was initialized */
+			if (hpts->hpts_root != NULL) {
+				sysctl_ctx_free(&hpts->hpts_ctx);
+			}
-
-		mtx_destroy(&hpts->p_mtx);
-		free(hpts->p_hptss, M_TCPHPTS);
-		free(hpts, M_TCPHPTS);
+			mtx_destroy(&hpts->p_mtx);
+			free(hpts->p_hptss, M_TCPHPTS);
+			free(hpts, M_TCPHPTS);
+		}
 	}
-	free(tcp_pace.rp_ent, M_TCPHPTS);
-	free(tcp_pace.cts_last_ran, M_TCPHPTS);
+	/* Cleanup main arrays */
+	free(pace->rp_ent, M_TCPHPTS);
+	free(pace->cts_last_ran, M_TCPHPTS);
 #ifdef SMP
-	free(tcp_pace.grps, M_TCPHPTS);
+	free(pace->grps, M_TCPHPTS);
 #endif
+	/* Free the main structure */
+	free(pace, M_TCPHPTS);
+}
+
+static int
+tcp_hpts_mod_load(void)
+{
+	int i;
+
+	/* Don't try to bind to NUMA domains if we don't have any */
+	if (vm_ndomains == 1 && tcp_bind_threads == 2)
+		tcp_bind_threads = 0;
+
+	/* Create the tcp_hptsi structure */
+	tcp_hptsi_pace = tcp_hptsi_create(&tcp_hptsi_default_funcs, true);
+	if (tcp_hptsi_pace == NULL)
+		return (ENOMEM);
+
+	/* Initialize global counters */
+	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
+	hpts_loops = counter_u64_alloc(M_WAITOK);
+	back_tosleep = counter_u64_alloc(M_WAITOK);
+	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+	wheel_wrap = counter_u64_alloc(M_WAITOK);
+	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+	hpts_direct_call = counter_u64_alloc(M_WAITOK);
+	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+	/* Start the threads */
+	tcp_hptsi_start(tcp_hptsi_pace);
+
+	/* Enable the global HPTS softclock function */
+	tcp_hpts_softclock = __tcp_run_hpts;
+
+	/* Initialize LRO HPTS */
+	tcp_lro_hpts_init();
+
+	/*
+	 * If we somehow have an empty domain, fall back to choosing among all
+	 * HPTS threads.
+	 */
+	for (i = 0; i < vm_ndomains; i++) {
+		if (tcp_hptsi_pace->domains[i].count == 0) {
+			tcp_bind_threads = 0;
+			break;
+		}
+	}
+
+	printf("TCP HPTS started %u (%s) swi interrupt threads\n",
+	    tcp_hptsi_pace->rp_num_hptss, (tcp_bind_threads == 0) ?
+	    "unbound" :
+	    (tcp_bind_threads == 1 ? 
"per-cpu" : "per-NUMA-domain")); + + return (0); +} + +static void +tcp_hpts_mod_unload(void) +{ + tcp_lro_hpts_uninit(); + + /* Disable the global HPTS softclock function */ + atomic_store_ptr(&tcp_hpts_softclock, NULL); + + tcp_hptsi_stop(tcp_hptsi_pace); + tcp_hptsi_destroy(tcp_hptsi_pace); + tcp_hptsi_pace = NULL; + + /* Cleanup global counters */ counter_u64_free(hpts_hopelessly_behind); counter_u64_free(hpts_loops); counter_u64_free(back_tosleep); @@ -2104,13 +2189,11 @@ tcp_hpts_mod_unload(void) } static int -tcp_hpts_modevent(module_t mod, int what, void *arg) +tcp_hpts_mod_event(module_t mod, int what, void *arg) { - switch (what) { case MOD_LOAD: - tcp_hpts_mod_load(); - return (0); + return (tcp_hpts_mod_load()); case MOD_QUIESCE: /* * Since we are a dependency of TCP stack modules, they should @@ -2130,7 +2213,7 @@ tcp_hpts_modevent(module_t mod, int what, void *arg) static moduledata_t tcp_hpts_module = { .name = "tcphpts", - .evhand = tcp_hpts_modevent, + .evhand = tcp_hpts_mod_event, }; DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index 6172baf2a062..6b05f9701ac2 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -28,19 +28,11 @@ /* Number of useconds represented by an hpts slot */ #define HPTS_USECS_PER_SLOT 10 -#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) -#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 static inline uint32_t -tcp_tv_to_hpts_slot(const struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_USECS_PER_SLOT)); -} - -static inline uint32_t tcp_tv_to_usec(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -66,7 +58,7 @@ struct hpts_diag { uint32_t p_runningslot; /* bbr->inflight */ uint32_t slot_req; /* bbr->flex3 x */ uint32_t inp_hptsslot; /* bbr->flex4 x */ - uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t time_remaining; /* bbr->flex5 x */ uint32_t have_slept; /* bbr->epoch x */ uint32_t hpts_sleep_time; /* bbr->applimited x */ uint32_t yet_to_sleep; /* bbr->lt_epoch x */ @@ -75,8 +67,6 @@ struct hpts_diag { uint32_t maxslots; /* bbr->delRate x */ uint32_t wheel_cts; /* bbr->rttProp x */ int32_t co_ret; /* bbr->pkts_out x */ - uint32_t p_curtick; /* upper bbr->cur_del_rate */ - uint32_t p_lasttick; /* lower bbr->cur_del_rate */ uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; @@ -92,13 +82,18 @@ struct hpts_diag { #ifdef _KERNEL +extern struct tcp_hptsi *tcp_hptsi_pace; + /* * The following are the definitions for the kernel HPTS interface for managing * the HPTS ring and the TCBs on it. */ -void tcp_hpts_init(struct tcpcb *); -void tcp_hpts_remove(struct tcpcb *); +void __tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_init(tp) __tcp_hpts_init(tcp_hptsi_pace, tp) + +void __tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *); +#define tcp_hpts_remove(tp) __tcp_hpts_remove(tcp_hptsi_pace, tp) static inline bool tcp_in_hpts(struct tcpcb *tp) @@ -132,12 +127,13 @@ tcp_in_hpts(struct tcpcb *tp) * that INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK(). 
*/ -uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, - struct hpts_diag *diag); -#define tcp_hpts_insert(inp, slot) \ - tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) +void __tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t usecs, + struct hpts_diag *diag); +#define tcp_hpts_insert(tp, usecs, diag) \ + __tcp_hpts_insert(tcp_hptsi_pace, (tp), (usecs), (diag)) -void tcp_set_hpts(struct tcpcb *tp); +void __tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp); +#define tcp_set_hpts(tp) __tcp_set_hpts(tcp_hptsi_pace, tp) extern int32_t tcp_min_hptsi_time; @@ -147,17 +143,6 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_USECS_PER_SLOT); } -static inline uint32_t -tcp_gethptstick(struct timeval *sv) -{ - struct timeval tv; - - if (sv == NULL) - sv = &tv; - microuptime(sv); - return (tcp_tv_to_hpts_slot(sv)); -} - static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { @@ -180,12 +165,5 @@ tcp_get_usecs(struct timeval *tv) return (tcp_tv_to_usec(tv)); } -/* - * LRO HPTS initialization and uninitialization, only for internal use by the - * HPTS code. - */ -void tcp_lro_hpts_init(void); -void tcp_lro_hpts_uninit(void); - #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ diff --git a/sys/netinet/tcp_hpts_internal.h b/sys/netinet/tcp_hpts_internal.h new file mode 100644 index 000000000000..8b33e03a6981 --- /dev/null +++ b/sys/netinet/tcp_hpts_internal.h @@ -0,0 +1,184 @@ +/*- + * Copyright (c) 2025 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __tcp_hpts_internal_h__ +#define __tcp_hpts_internal_h__ + +/* + * TCP High Precision Timer System (HPTS) - Internal Definitions + * + * This header contains internal structures, constants, and interfaces that are + * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of + * the HPTS subsystem. + */ + +#if defined(_KERNEL) + +/* + * The hpts uses a 102400 wheel. The wheel + * defines the time in 10 usec increments (102400 x 10). + * This gives a range of 10usec - 1024ms to place + * an entry within. 
If the user requests more than
+ * 1.024 seconds, a remainder is attached and the hpts,
+ * when seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+/* Convert microseconds to HPTS slots */
+#define HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10)
+
+/* The number of connections after which the dynamic sleep logic kicks in. */
+#define DEFAULT_CONNECTION_THRESHOLD 100
+
+extern int tcp_bind_threads;	/* Thread binding configuration
+				 * (0=none, 1=cpu, 2=numa) */
+
+/*
+ * Abstraction layer controlling time, interrupts and callouts.
+ */
+struct tcp_hptsi_funcs {
+	void (*microuptime)(struct timeval *tv);
+	int (*swi_add)(struct intr_event **eventp, const char *name,
+	    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
+	    void **cookiep);
+	int (*swi_remove)(void *cookie);
+	void (*swi_sched)(void *cookie, int flags);
+	int (*intr_event_bind)(struct intr_event *ie, int cpu);
+	int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie,
+	    struct _cpuset *mask);
+	void (*callout_init)(struct callout *c, int mpsafe);
+	int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt,
+	    sbintime_t precision, void (*func)(void *), void *arg, int cpu,
+	    int flags);
+	int (*_callout_stop_safe)(struct callout *c, int flags);
+};
+
+/* Default function table for system operation */
+extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs;
+
+/* Each hpts has its own p_mtx which is used for locking */
+#define HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+#define HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
+#define HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx)
+#define HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
+
+struct tcp_hpts_entry {
+	/* Cache line 0x00 */
+	struct mtx p_mtx;	/* Mutex for hpts */
+	struct timeval p_mysleep;	/* Our min sleep time */
+	uint64_t syscall_cnt;
+	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
+	uint16_t p_hpts_active;	/* Flag that says hpts is awake */
+	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+	uint32_t p_runningslot;	/* Current slot we are at if we are running */
+	uint32_t p_prev_slot;	/* Previous slot we were on */
+	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
+	uint32_t p_nxt_slot;	/* The next slot outside the current range
+				 * of slots that the hpts is running on.
 */
+	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
+	uint8_t p_direct_wake :1, /* boolean */
+		p_on_min_sleep:1, /* boolean */
+		p_hpts_wake_scheduled:1,/* boolean */
+		hit_callout_thresh:1,
+		p_avail:4;
+	uint8_t p_fill[3];	/* Fill to 32 bits */
+	/* Cache line 0x40 */
+	struct hptsh {
+		TAILQ_HEAD(, tcpcb) head;
+		uint32_t count;
+		uint32_t gencnt;
+	} *p_hptss;	/* Hptsi wheel */
+	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
+					 * of 255ms */
+	uint32_t overidden_sleep;	/* what was overridden by min-sleep for logging */
+	uint32_t saved_curslot;		/* for logging */
+	uint32_t saved_prev_slot;	/* for logging */
+	uint32_t p_delayed_by;	/* How much were we delayed by */
+	/* Cache line 0x80 */
+	struct sysctl_ctx_list hpts_ctx;
+	struct sysctl_oid *hpts_root;
+	struct intr_event *ie;
+	void *ie_cookie;
+	uint16_t p_cpu;		/* The hpts CPU */
+	struct tcp_hptsi *p_hptsi;	/* Back pointer to parent hptsi structure */
+	/* There is extra space in here */
+	/* Cache line 0x100 */
+	struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+	struct cpu_group **grps;
+	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
+	uint32_t *cts_last_ran;
+	uint32_t grp_cnt;
+	uint32_t rp_num_hptss;	/* Number of hpts threads */
+	struct hpts_domain_info {
+		int count;
+		int cpu[MAXCPU];
+	} domains[MAXMEMDOM];	/* Per-NUMA domain CPU assignments */
+	const struct tcp_hptsi_funcs *funcs;	/* Function table for testability */
+};
+
+/*
+ * Core tcp_hptsi structure manipulation functions.
+ */
+struct tcp_hptsi* tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs,
+    bool enable_sysctl);
+void tcp_hptsi_destroy(struct tcp_hptsi *pace);
+void tcp_hptsi_start(struct tcp_hptsi *pace);
+void tcp_hptsi_stop(struct tcp_hptsi *pace);
+uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace);
+int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
+
+void tcp_hpts_wake(struct tcp_hpts_entry *hpts);
+
+/*
+ * LRO HPTS initialization and uninitialization, only for internal use by the
+ * HPTS code.
+ */
+void tcp_lro_hpts_init(void);
+void tcp_lro_hpts_uninit(void);
+
+#endif /* defined(_KERNEL) */
+#endif /* __tcp_hpts_internal_h__ */
diff --git a/sys/netinet/tcp_hpts_test.c b/sys/netinet/tcp_hpts_test.c
new file mode 100644
index 000000000000..c5dc9cb5b03b
--- /dev/null
+++ b/sys/netinet/tcp_hpts_test.c
@@ -0,0 +1,1682 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <tests/ktest.h> +#include <sys/cdefs.h> +#include "opt_inet.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> +#include <dev/tcp_log/tcp_log_dev.h> +#include <netinet/tcp_log_buf.h> + +#undef tcp_hpts_init +#undef tcp_hpts_remove +#undef tcp_hpts_insert +#undef tcp_set_hpts + +/* Custom definitions that take the tcp_hptsi */ +#define tcp_hpts_init(pace, tp) __tcp_hpts_init((pace), (tp)) +#define tcp_hpts_remove(pace, tp) __tcp_hpts_remove((pace), (tp)) +#define tcp_hpts_insert(pace, tp, usecs, diag) \ + __tcp_hpts_insert((pace), (tp), (usecs), (diag)) +#define tcp_set_hpts(pace, tp) __tcp_set_hpts((pace), (tp)) + +static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts_test", "TCP hpts test"); + +static int test_exit_on_failure = true; +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts_test, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TCP HPTS test controls"); +SYSCTL_INT(_net_inet_tcp_hpts_test, OID_AUTO, exit_on_failure, CTLFLAG_RW, + &test_exit_on_failure, 0, + "Exit HPTS test immediately on first failure (1) or continue running all tests (0)"); + +#define KTEST_VERIFY(x) do { \ + if (!(x)) { \ + KTEST_ERR(ctx, "FAIL: %s", #x); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s", #x); \ + } \ +} while (0) + +#define KTEST_EQUAL(x, y) do { \ + if ((x) != (y)) { \ + KTEST_ERR(ctx, "FAIL: %s != %s (%d != %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s == %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_NEQUAL(x, y) do { \ + if ((x) == (y)) { \ + KTEST_ERR(ctx, "FAIL: %s == %s (%d == %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s != %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_GREATER_THAN(x, y) do { \ + if ((x) <= (y)) { \ + KTEST_ERR(ctx, "FAIL: %s <= %s (%d <= %d)", #x, #y, (x), (y)); \ + if (test_exit_on_failure) \ + return (EINVAL); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s > %s", #x, #y); \ + } \ +} while (0) + +#define KTEST_VERIFY_RET(x, y) do { \ + if (!(x)) { \ + KTEST_ERR(ctx, "FAIL: %s", #x); \ + if (test_exit_on_failure) \ + return (y); \ + } else { \ + KTEST_LOG(ctx, "PASS: %s", #x); \ + } \ +} while (0) + +#ifdef TCP_HPTS_KTEST + +static void +dump_hpts_entry(struct ktest_test_context *ctx, struct tcp_hpts_entry *hpts) +{ + KTEST_LOG(ctx, "tcp_hpts_entry(%p)", hpts); + KTEST_LOG(ctx, " p_cur_slot: %u", hpts->p_cur_slot); + KTEST_LOG(ctx, " p_prev_slot: %u", hpts->p_prev_slot); + KTEST_LOG(ctx, " p_nxt_slot: %u", hpts->p_nxt_slot); + KTEST_LOG(ctx, " p_runningslot: %u", 
hpts->p_runningslot); + KTEST_LOG(ctx, " p_on_queue_cnt: %d", hpts->p_on_queue_cnt); + KTEST_LOG(ctx, " p_hpts_active: %u", hpts->p_hpts_active); + KTEST_LOG(ctx, " p_wheel_complete: %u", hpts->p_wheel_complete); + KTEST_LOG(ctx, " p_direct_wake: %u", hpts->p_direct_wake); + KTEST_LOG(ctx, " p_on_min_sleep: %u", hpts->p_on_min_sleep); + KTEST_LOG(ctx, " p_hpts_wake_scheduled: %u", hpts->p_hpts_wake_scheduled); + KTEST_LOG(ctx, " hit_callout_thresh: %u", hpts->hit_callout_thresh); + KTEST_LOG(ctx, " p_hpts_sleep_time: %u", hpts->p_hpts_sleep_time); + KTEST_LOG(ctx, " p_delayed_by: %u", hpts->p_delayed_by); + KTEST_LOG(ctx, " overidden_sleep: %u", hpts->overidden_sleep); + KTEST_LOG(ctx, " saved_curslot: %u", hpts->saved_curslot); + KTEST_LOG(ctx, " saved_prev_slot: %u", hpts->saved_prev_slot); + KTEST_LOG(ctx, " syscall_cnt: %lu", hpts->syscall_cnt); + KTEST_LOG(ctx, " sleeping: %lu", hpts->sleeping); + KTEST_LOG(ctx, " p_cpu: %u", hpts->p_cpu); + KTEST_LOG(ctx, " ie_cookie: %p", hpts->ie_cookie); + KTEST_LOG(ctx, " p_hptsi: %p", hpts->p_hptsi); + KTEST_LOG(ctx, " p_mysleep: %ld.%06ld", hpts->p_mysleep.tv_sec, hpts->p_mysleep.tv_usec); +} + +static void +dump_tcpcb(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct inpcb *inp = &tp->t_inpcb; + + KTEST_LOG(ctx, "tcp_control_block(%p)", tp); + + /* HPTS-specific fields */ + KTEST_LOG(ctx, " t_in_hpts: %d", tp->t_in_hpts); + KTEST_LOG(ctx, " t_hpts_cpu: %u", tp->t_hpts_cpu); + KTEST_LOG(ctx, " t_hpts_slot: %d", tp->t_hpts_slot); + KTEST_LOG(ctx, " t_hpts_gencnt: %u", tp->t_hpts_gencnt); + KTEST_LOG(ctx, " t_hpts_request: %u", tp->t_hpts_request); + + /* LRO CPU field */ + KTEST_LOG(ctx, " t_lro_cpu: %u", tp->t_lro_cpu); + + /* TCP flags that affect HPTS */ + KTEST_LOG(ctx, " t_flags2: 0x%x", tp->t_flags2); + KTEST_LOG(ctx, " TF2_HPTS_CPU_SET: %s", (tp->t_flags2 & TF2_HPTS_CPU_SET) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_HPTS_CALLS: %s", (tp->t_flags2 & TF2_HPTS_CALLS) ? "YES" : "NO"); + KTEST_LOG(ctx, " TF2_SUPPORTS_MBUFQ: %s", (tp->t_flags2 & TF2_SUPPORTS_MBUFQ) ? "YES" : "NO"); + + /* Input PCB fields that HPTS uses */ + KTEST_LOG(ctx, " inp_flags: 0x%x", inp->inp_flags); + KTEST_LOG(ctx, " INP_DROPPED: %s", (inp->inp_flags & INP_DROPPED) ? "YES" : "NO"); + KTEST_LOG(ctx, " inp_flowid: 0x%x", inp->inp_flowid); + KTEST_LOG(ctx, " inp_flowtype: %u", inp->inp_flowtype); + KTEST_LOG(ctx, " inp_numa_domain: %d", inp->inp_numa_domain); +} + +/* Enum for call counting indices */ +enum test_call_counts { + CCNT_MICROUPTIME = 0, + CCNT_SWI_ADD, + CCNT_SWI_REMOVE, + CCNT_SWI_SCHED, + CCNT_INTR_EVENT_BIND, + CCNT_INTR_EVENT_BIND_CPUSET, + CCNT_CALLOUT_INIT, + CCNT_CALLOUT_RESET_SBT_ON, + CCNT_CALLOUT_STOP_SAFE, + CCNT_TCP_OUTPUT, + CCNT_TCP_TFB_DO_QUEUED_SEGMENTS, + CCNT_MAX +}; + +static uint32_t call_counts[CCNT_MAX]; + +static uint64_t test_time_usec = 0; + +/* + * Reset all test global variables to a clean state. 
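Because the mocked microuptime() below derives its answer from test_time_usec, the tests drive the clock explicitly; the usual pattern is:

	test_hpts_init();	/* zero the call counters and the fake clock */
	test_time_usec += 500;	/* advance virtual time by 500 usec */
	/* subsequent test_microuptime() calls now report the new time */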
+ */ +static void +test_hpts_init(void) +{ + memset(call_counts, 0, sizeof(call_counts)); + test_time_usec = 0; +} + +static void +test_microuptime(struct timeval *tv) +{ + call_counts[CCNT_MICROUPTIME]++; + tv->tv_sec = test_time_usec / 1000000; + tv->tv_usec = test_time_usec % 1000000; +} + +static int +test_swi_add(struct intr_event **eventp, const char *name, + driver_intr_t handler, void *arg, int pri, enum intr_type flags, + void **cookiep) +{ + call_counts[CCNT_SWI_ADD]++; + /* Simulate successful SWI creation */ + *eventp = (struct intr_event *)0xfeedface; /* Mock event */ + *cookiep = (void *)0xdeadbeef; /* Mock cookie */ + return (0); +} + +static int +test_swi_remove(void *cookie) +{ + call_counts[CCNT_SWI_REMOVE]++; + /* Simulate successful removal */ + return (0); +} + +static void +test_swi_sched(void *cookie, int flags) +{ + call_counts[CCNT_SWI_SCHED]++; + /* Simulate successful SWI scheduling */ +} + +static int +test_intr_event_bind(struct intr_event *ie, int cpu) +{ + call_counts[CCNT_INTR_EVENT_BIND]++; + /* Simulate successful binding */ + return (0); +} + +static int +test_intr_event_bind_ithread_cpuset(struct intr_event *ie, struct _cpuset *mask) +{ + call_counts[CCNT_INTR_EVENT_BIND_CPUSET]++; + /* Simulate successful cpuset binding */ + return (0); +} + +static void +test_callout_init(struct callout *c, int mpsafe) +{ + call_counts[CCNT_CALLOUT_INIT]++; + memset(c, 0, sizeof(*c)); +} + +static int +test_callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*func)(void *), void *arg, int cpu, int flags) +{ + call_counts[CCNT_CALLOUT_RESET_SBT_ON]++; + /* Return 1 to simulate successful timer scheduling */ + return (1); +} + +static int +test_callout_stop_safe(struct callout *c, int flags) +{ + call_counts[CCNT_CALLOUT_STOP_SAFE]++; + /* Return 1 to simulate successful timer stopping */ + return (1); +} + +static const struct tcp_hptsi_funcs test_funcs = { + .microuptime = test_microuptime, + .swi_add = test_swi_add, + .swi_remove = test_swi_remove, + .swi_sched = test_swi_sched, + .intr_event_bind = test_intr_event_bind, + .intr_event_bind_ithread_cpuset = test_intr_event_bind_ithread_cpuset, + .callout_init = test_callout_init, + .callout_reset_sbt_on = test_callout_reset_sbt_on, + ._callout_stop_safe = test_callout_stop_safe, +}; + +#define TP_REMOVE_FROM_HPTS(tp) tp->bits_spare +#define TP_LOG_TEST(tp) tp->t_log_state_set + +static int +test_tcp_output(struct tcpcb *tp) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_OUTPUT]++; + if (TP_LOG_TEST(tp)) { + KTEST_LOG(ctx, "=> tcp_output(%p)", tp); + dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* tcp_output unlocks on error */ + return (-1); /* Simulate tcp_output error */ + } + + return (0); +} + +static int +test_tfb_do_queued_segments(struct tcpcb *tp, int flag) +{ + struct ktest_test_context *ctx = tp->t_fb_ptr; + struct tcp_hptsi *pace = (struct tcp_hptsi*)tp->t_tfo_pending; + struct tcp_hpts_entry *hpts = pace->rp_ent[tp->t_hpts_cpu]; + + call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS]++; + KTEST_LOG(ctx, "=> tfb_do_queued_segments(%p, %d)", tp, flag); + dump_tcpcb(tp); + dump_hpts_entry(ctx, hpts); + + 
if ((TP_REMOVE_FROM_HPTS(tp) & 1) != 0) { + if (TP_LOG_TEST(tp)) + KTEST_LOG(ctx, "=> tcp_hpts_remove(%p)", tp); + tcp_hpts_remove(pace, tp); + } + + if ((TP_REMOVE_FROM_HPTS(tp) & 2) != 0) { + INP_WUNLOCK(&tp->t_inpcb); /* do_queued_segments unlocks on error */ + return (-1); /* Simulate do_queued_segments error */ + } + + return (0); +} + +static struct tcp_function_block test_tcp_fb = { + .tfb_tcp_block_name = "hpts_test_tcp", + .tfb_tcp_output = test_tcp_output, + .tfb_do_queued_segments = test_tfb_do_queued_segments, +}; + +/* + * Create a minimally initialized tcpcb that can be safely inserted into HPTS. + * This function allocates and initializes all the fields that HPTS code + * reads or writes. + */ +static struct tcpcb * +test_hpts_create_tcpcb(struct ktest_test_context *ctx, struct tcp_hptsi *pace) +{ + struct tcpcb *tp; + + tp = malloc(sizeof(struct tcpcb), M_TCPHPTS, M_WAITOK | M_ZERO); + if (tp) { + rw_init_flags(&tp->t_inpcb.inp_lock, "test-inp", + RW_RECURSE | RW_DUPOK); + refcount_init(&tp->t_inpcb.inp_refcount, 1); + tp->t_inpcb.inp_pcbinfo = &V_tcbinfo; + tp->t_fb = &test_tcp_fb; + tp->t_hpts_cpu = HPTS_CPU_NONE; + STAILQ_INIT(&tp->t_inqueue); + tcp_hpts_init(pace, tp); + + /* Stuff some pointers in the tcb for test purposes. */ + tp->t_fb_ptr = ctx; + tp->t_tfo_pending = (unsigned int*)pace; + } + + return (tp); +} + +/* + * Free a test tcpcb created by test_hpts_create_tcpcb() + */ +static void +test_hpts_free_tcpcb(struct tcpcb *tp) +{ + if (tp == NULL) + return; + + INP_LOCK_DESTROY(&tp->t_inpcb); + free(tp, M_TCPHPTS); +} + +/* + * *********************************************** + * * KTEST functions for testing the HPTS module * + * *********************************************** + */ + +/* + * Validates that the HPTS module is properly loaded and initialized by checking + * that the minimum HPTS time is configured. + */ +KTEST_FUNC(module_load) +{ + test_hpts_init(); + KTEST_NEQUAL(tcp_min_hptsi_time, 0); + KTEST_VERIFY(tcp_bind_threads >= 0 && tcp_bind_threads <= 2); + KTEST_NEQUAL(tcp_hptsi_pace, NULL); + return (0); +} + +/* + * Validates the creation and destruction of tcp_hptsi structures, ensuring + * proper initialization of internal fields and clean destruction. + */ +KTEST_FUNC(hptsi_create_destroy) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_NEQUAL(pace->rp_ent, NULL); + KTEST_NEQUAL(pace->cts_last_ran, NULL); + KTEST_VERIFY(pace->rp_num_hptss > 0); + KTEST_VERIFY(pace->rp_num_hptss <= MAXCPU); /* Reasonable upper bound */ + KTEST_VERIFY(pace->grp_cnt >= 1); /* At least one group */ + KTEST_EQUAL(pace->funcs, &test_funcs); /* Verify function pointer was set */ + + /* Verify individual HPTS entries are properly initialized */ + for (uint32_t i = 0; i < pace->rp_num_hptss; i++) { + KTEST_NEQUAL(pace->rp_ent[i], NULL); + KTEST_EQUAL(pace->rp_ent[i]->p_cpu, i); + KTEST_EQUAL(pace->rp_ent[i]->p_hptsi, pace); + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + } + + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcp_hptsi structures can be started and stopped properly, + * including verification that threads are created during start and cleaned up + * during stop operations. 
+ */ +KTEST_FUNC(hptsi_start_stop) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + tcp_hptsi_start(pace); + + /* Verify that entries have threads started */ + struct tcp_hpts_entry *hpts = pace->rp_ent[0]; + KTEST_NEQUAL(hpts->ie_cookie, NULL); /* Should have SWI handler */ + KTEST_EQUAL(hpts->p_hptsi, pace); /* Should point to our pace */ + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that multiple tcp_hptsi instances can coexist independently, with + * different configurations and CPU assignments without interfering with each + * other. + */ +KTEST_FUNC(hptsi_independence) +{ + struct tcp_hptsi *pace1, *pace2; + uint16_t cpu1, cpu2; + + test_hpts_init(); + + pace1 = tcp_hptsi_create(&test_funcs, false); + pace2 = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace1, NULL); + KTEST_NEQUAL(pace2, NULL); + KTEST_NEQUAL(pace2->rp_ent, NULL); + + cpu1 = tcp_hptsi_random_cpu(pace1); + cpu2 = tcp_hptsi_random_cpu(pace2); + KTEST_VERIFY(cpu1 < pace1->rp_num_hptss); + KTEST_VERIFY(cpu2 < pace2->rp_num_hptss); + + /* Verify both instances have independent entry arrays */ + KTEST_NEQUAL(pace1->rp_ent, pace2->rp_ent); + /* Verify they may have different CPU counts but both reasonable */ + KTEST_VERIFY(pace1->rp_num_hptss > 0 && pace1->rp_num_hptss <= MAXCPU); + KTEST_VERIFY(pace2->rp_num_hptss > 0 && pace2->rp_num_hptss <= MAXCPU); + + tcp_hptsi_destroy(pace1); + tcp_hptsi_destroy(pace2); + + return (0); +} + +/* + * Validates that custom function injection works correctly, ensuring that + * test-specific implementations of microuptime and others are properly + * called by the HPTS system. + */ +KTEST_FUNC(function_injection) +{ + struct tcp_hptsi *pace; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + KTEST_EQUAL(pace->funcs, &test_funcs); + KTEST_VERIFY(call_counts[CCNT_MICROUPTIME] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_INIT] > 0); + + tcp_hptsi_start(pace); + KTEST_VERIFY(call_counts[CCNT_SWI_ADD] > 0); + KTEST_VERIFY(tcp_bind_threads == 0 || + call_counts[CCNT_INTR_EVENT_BIND] > 0 || + call_counts[CCNT_INTR_EVENT_BIND_CPUSET] > 0); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] > 0); + + tcp_hptsi_stop(pace); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_STOP_SAFE] > 0); + KTEST_VERIFY(call_counts[CCNT_SWI_REMOVE] > 0); + + tcp_hptsi_destroy(pace); + + /* Verify we have a reasonable balance of create/destroy calls */ + KTEST_EQUAL(call_counts[CCNT_SWI_ADD], call_counts[CCNT_SWI_REMOVE]); + KTEST_VERIFY(call_counts[CCNT_CALLOUT_RESET_SBT_ON] <= call_counts[CCNT_CALLOUT_STOP_SAFE]); + + return (0); +} + +/* + * Validates that a tcpcb can be properly initialized for HPTS compatibility, + * ensuring all required fields are set correctly and function pointers are + * valid for safe HPTS operations. 
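The slot assertions in the next tests rest on the rounding behavior of HPTS_USEC_TO_SLOTS(); worked values:

	/*
	 * HPTS_USEC_TO_SLOTS(x) = ((x) + 9) / 10 rounds up to whole slots:
	 *   HPTS_USEC_TO_SLOTS(1)   == 1
	 *   HPTS_USEC_TO_SLOTS(10)  == 1
	 *   HPTS_USEC_TO_SLOTS(500) == 50
	 */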
+ */ +KTEST_FUNC(tcpcb_initialization) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Verify the tcpcb is properly initialized for HPTS */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_NEQUAL(tp->t_fb, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_tcp_output, NULL); + KTEST_NEQUAL(tp->t_fb->tfb_do_queued_segments, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & (TF2_HPTS_CPU_SET | TF2_HPTS_CALLS)), 0); + + /* Verify that HPTS-specific fields are initialized */ + KTEST_EQUAL(tp->t_hpts_gencnt, 0); + KTEST_EQUAL(tp->t_hpts_slot, 0); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_lro_cpu, 0); + KTEST_VERIFY(tp->t_hpts_cpu < pace->rp_num_hptss); + KTEST_EQUAL(tp->t_inpcb.inp_refcount, 1); + KTEST_VERIFY(!(tp->t_inpcb.inp_flags & INP_DROPPED)); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcb structures can be successfully inserted into and removed + * from the HPTS wheel, with proper state tracking and slot assignment during + * the process. + */ +KTEST_FUNC(tcpcb_insertion) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + uint32_t timeout_usecs = 10; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL((tp->t_flags2 & TF2_HPTS_CALLS), 0); + + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); + tcp_hpts_insert(pace, tp, timeout_usecs, NULL); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + KTEST_VERIFY(tcp_in_hpts(tp)); + KTEST_VERIFY(tp->t_hpts_slot >= 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(timeout_usecs)); + //KTEST_EQUAL(tp->t_hpts_gencnt, 1); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_VERIFY(!tcp_in_hpts(tp)); + + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the core HPTS timer functionality by verifying that scheduled + * tcpcb entries trigger tcp_output calls at appropriate times, simulating + * real-world timer-driven TCP processing. 
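The expected wheel positions in this test follow directly from the 10 usec slot width:

	/*
	 * insert at 500 usec -> t_hpts_slot = HPTS_USEC_TO_SLOTS(500) = 50
	 * clock at 499 usec  -> wheel slot 499 / 10 = 49; slot 50 has not
	 *                       been reached, so tcp_output() must not run
	 * clock at 500 usec  -> wheel slot 50; the entry fires exactly once
	 */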
+ */ +KTEST_FUNC(timer_functionality) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp; + int32_t slots_ran; + uint32_t i; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Create and insert the tcpcb into the HPTS wheel to wait for 500 usec */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + dump_tcpcb(tp); + TP_LOG_TEST(tp) = 1; /* Enable logging for this tcpcb */ + + KTEST_LOG(ctx, "=> tcp_hpts_insert(%p)", tp); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; /* Mark as needing HPTS processing */ + tcp_hpts_insert(pace, tp, 500, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + dump_tcpcb(tp); + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + KTEST_EQUAL(hpts->p_prev_slot, 0); + KTEST_EQUAL(hpts->p_cur_slot, 0); + KTEST_EQUAL(hpts->p_runningslot, 0); + KTEST_EQUAL(hpts->p_nxt_slot, 1); + KTEST_EQUAL(hpts->p_hpts_active, 0); + + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + + /* Set our test flag to indicate the tcpcb should be removed from the + * wheel when tcp_output is called. */ + TP_REMOVE_FROM_HPTS(tp) = 1; + + /* Test early exit condition: advance time by insufficient amount */ + KTEST_LOG(ctx, "Testing early exit with insufficient time advancement"); + test_time_usec += 1; /* Very small advancement - should cause early exit */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 slots due to insufficient time advancement */ + KTEST_EQUAL(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); /* No processing should occur */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); /* Connection still queued */ + + /* Wait for 498 more usecs and trigger the HPTS workers and verify + * nothing happens yet (total 499 usec) */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 498; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 49); + KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 49); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_request, 0); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(500)); + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Wait for 1 more usec and trigger the HPTS workers and verify it + * triggers tcp_output this time */ + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 0); + test_time_usec += 1; + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%p)", pace->rp_ent[i]); + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, pace->rp_ent[i]); + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(pace->rp_ent[i]->p_prev_slot, 50); + 
KTEST_EQUAL(pace->rp_ent[i]->p_cur_slot, 50); + } + + dump_tcpcb(tp); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS scalability by creating and inserting a LOT of tcpcbs into + * the HPTS wheel, testing performance under high load conditions. + */ +KTEST_FUNC(scalability_tcpcbs) +{ + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, num_tcpcbs = 100000, total_queued = 0; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to all tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create a LOT of tcpcbs */ + KTEST_LOG(ctx, "Creating %u tcpcbs...", num_tcpcbs); + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + if (tcpcbs[i] == NULL) { + KTEST_ERR(ctx, "FAIL: tcpcbs[i] == NULL"); + return (EINVAL); + } + } + + /* Insert all created tcpcbs into HPTS */ + KTEST_LOG(ctx, "Inserting all tcpcbs into HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + /* Insert with varying future timeouts to distribute across slots */ + tcp_hpts_insert(pace, tcpcbs[i], 100 + (i % 1000), NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify total queue counts across all CPUs */ + for (i = 0; i < pace->rp_num_hptss; i++) { + total_queued += pace->rp_ent[i]->p_on_queue_cnt; + } + KTEST_EQUAL(total_queued, num_tcpcbs); + + for (i = 0; i < pace->rp_num_hptss; i++) + dump_hpts_entry(ctx, pace->rp_ent[i]); + + /* Remove all tcpcbs from HPTS */ + KTEST_LOG(ctx, "Removing all tcpcbs from HPTS..."); + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Verify all queues are now empty */ + for (i = 0; i < pace->rp_num_hptss; i++) { + if (pace->rp_ent[i]->p_on_queue_cnt != 0) { + KTEST_ERR(ctx, "FAIL: pace->rp_ent[i]->p_on_queue_cnt != 0"); + return (EINVAL); + } + } + + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates wheel wrap scenarios where the timer falls significantly behind + * and needs to process more than one full wheel revolution worth of slots. 
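In wheel terms, the advance used below exceeds a full revolution:

	/*
	 * One revolution = NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT
	 *                = 102400 * 10 = 1024000 usec (1.024 sec).
	 * The test advances (102400 + 5000) * 10 = 1074000 usec, so the
	 * wheel wraps and a single tcp_hptsi() pass can walk at most all
	 * slots once: slots_ran == NUM_OF_HPTSI_SLOTS - 1, as asserted.
	 */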
+ */ +KTEST_FUNC(wheel_wrap_recovery) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + uint32_t i, timeout_usecs, num_tcpcbs = 500; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Allocate array to hold pointers to tcpcbs */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + /* Create tcpcbs and insert them across many slots */ + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; + + timeout_usecs = ((i * NUM_OF_HPTSI_SLOTS) / num_tcpcbs) * HPTS_USECS_PER_SLOT; /* Spread across slots */ + + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tcpcbs[i], timeout_usecs, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + /* Fast forward time significantly to trigger wheel wrap */ + test_time_usec += (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; + + for (i = 0; i < pace->rp_num_hptss; i++) { + KTEST_LOG(ctx, "=> tcp_hptsi(%u)", i); + KTEST_NEQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + + HPTS_LOCK(pace->rp_ent[i]); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(pace->rp_ent[i], true); + HPTS_UNLOCK(pace->rp_ent[i]); + NET_EPOCH_EXIT(et); + + KTEST_EQUAL(slots_ran, NUM_OF_HPTSI_SLOTS-1); /* Should process all slots */ + KTEST_EQUAL(pace->rp_ent[i]->p_on_queue_cnt, 0); + KTEST_NEQUAL(pace->rp_ent[i]->p_cur_slot, + pace->rp_ent[i]->p_prev_slot); + } + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + INP_WLOCK(&tcpcbs[i]->t_inpcb); + if (tcpcbs[i]->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tcpcbs[i]); + } + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates proper handling of tcpcbs in the IHPTS_MOVING state, which occurs + * when a tcpcb is being processed by the HPTS thread but gets removed. 
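For reference, the t_in_hpts states this test manipulates (the enum itself lives in tcp_var.h; the semantics below are inferred from the test's own usage):

	/*
	 * IHPTS_NONE    - tcpcb is not on the wheel.
	 * IHPTS_ONQUEUE - tcpcb is linked into a wheel slot awaiting service.
	 * IHPTS_MOVING  - the hpts thread holds the tcpcb aside while the
	 *                 slot is processed; t_hpts_slot == -1 additionally
	 *                 marks it for removal instead of re-insertion.
	 */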
+ */ +KTEST_FUNC(tcpcb_moving_state) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create two tcpcbs on the same CPU/slot */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force them to the same CPU for predictable testing */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert both into the same slot */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + hpts = pace->rp_ent[0]; + + /* Manually transition tp1 to MOVING state to simulate race condition */ + HPTS_LOCK(hpts); + tp1->t_in_hpts = IHPTS_MOVING; + tp1->t_hpts_slot = -1; /* Mark for removal */ + HPTS_UNLOCK(hpts); + + /* Set time and run HPTS to process the moving state */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); /* Shouldn't call on both */ + + /* tp1 should be cleaned up and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + /* tp2 should have been processed normally */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates that tcpcbs with deferred requests (t_hpts_request > 0) are + * properly handled and re-inserted into appropriate future slots after + * the wheel processes enough slots to accommodate the original request. 
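Rough numbers behind the two oversized requests used below (10 usec per slot):

	/*
	 * large: (102400 + 5000) slots requested vs. 102400 on the wheel
	 *        -> ~5000 slots are parked in t_hpts_request at insert.
	 * huge:  3 * 102400 slots -> ~2 * 102400 remain after the insert,
	 *        and each full wheel pass retires at most NUM_OF_HPTSI_SLOTS
	 *        of the remainder, hence the multi-cycle loop in the test.
	 */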
+ */ +KTEST_FUNC(deferred_requests) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp, *tp2; + struct tcp_hpts_entry *hpts; + uint32_t large_timeout_usecs = (NUM_OF_HPTSI_SLOTS + 5000) * HPTS_USECS_PER_SLOT; /* Beyond wheel capacity */ + uint32_t huge_timeout_usecs = (NUM_OF_HPTSI_SLOTS * 3) * HPTS_USECS_PER_SLOT; /* 3x wheel capacity */ + uint32_t initial_request; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Insert with a request that exceeds current wheel capacity */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, large_timeout_usecs, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + /* Verify it was inserted with a deferred request */ + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_request > 0); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Advance time to process deferred requests */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + + /* Process the wheel to handle deferred requests */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + dump_hpts_entry(ctx, hpts); + KTEST_GREATER_THAN(slots_ran, 0); + dump_tcpcb(tp); + KTEST_EQUAL(tp->t_hpts_request, 0); + + /* Test incremental deferred request processing over multiple cycles */ + KTEST_LOG(ctx, "Testing incremental deferred request processing"); + + /* Create a new connection with an even larger request */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = tp->t_hpts_cpu; /* Same CPU for predictable testing */ + + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, huge_timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify initial deferred request */ + initial_request = tp2->t_hpts_request; + KTEST_VERIFY(initial_request > NUM_OF_HPTSI_SLOTS); + + /* Process one wheel cycle - should reduce but not eliminate request */ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Request should be reduced but not zero */ + KTEST_GREATER_THAN(initial_request, tp2->t_hpts_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); /* Still queued */ + + /* For huge_timeout_usecs = NUM_OF_HPTSI_SLOTS * 3 * HPTS_USECS_PER_SLOT, we need ~3 cycles to complete. + * Each cycle can reduce the request by at most NUM_OF_HPTSI_SLOTS. 
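+	 * Roughly, in slot units:
+	 *	after cycle 1: request ~= 2 * NUM_OF_HPTSI_SLOTS
+	 *	after cycle 2: request ~= 1 * NUM_OF_HPTSI_SLOTS
+	 *	after cycle 3: request reaches 0 and the tcpcb is due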
*/ + test_time_usec += NUM_OF_HPTSI_SLOTS * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* After second cycle, request should be reduced significantly (likely by ~NUM_OF_HPTSI_SLOTS) */ + KTEST_VERIFY(tp2->t_hpts_request < initial_request); + KTEST_VERIFY(tp2->t_hpts_request > 0); /* But not yet zero for such a large request */ + + /* Clean up second connection */ + INP_WLOCK(&tp2->t_inpcb); + if (tp2->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp2); + } + INP_WUNLOCK(&tp2->t_inpcb); + test_hpts_free_tcpcb(tp2); + + /* Clean up */ + INP_WLOCK(&tp->t_inpcb); + if (tp->t_in_hpts != IHPTS_NONE) { + tcp_hpts_remove(pace, tp); + } + INP_WUNLOCK(&tp->t_inpcb); + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates CPU assignment and affinity mechanisms, including flowid-based + * assignment, random fallback scenarios, and explicit CPU setting. Tests + * the actual cpu assignment logic in hpts_cpuid via tcp_set_hpts. + */ +KTEST_FUNC(cpu_assignment) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2, *tp2_dup, *tp3; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + + /* Test random CPU assignment (no flowid) */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_inpcb.inp_flowtype = M_HASHTYPE_NONE; + INP_WLOCK(&tp1->t_inpcb); + tcp_set_hpts(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + KTEST_VERIFY(tp1->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp1->t_flags2 & TF2_HPTS_CPU_SET); + + /* Test flowid-based assignment */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2->t_inpcb); + tcp_set_hpts(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + KTEST_VERIFY(tp2->t_hpts_cpu < pace->rp_num_hptss); + KTEST_VERIFY(tp2->t_flags2 & TF2_HPTS_CPU_SET); + + /* With the same flowid, should get same CPU assignment */ + tp2_dup = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2_dup, NULL); + tp2_dup->t_inpcb.inp_flowtype = M_HASHTYPE_RSS_TCP_IPV4; + tp2_dup->t_inpcb.inp_flowid = 12345; + INP_WLOCK(&tp2_dup->t_inpcb); + tcp_set_hpts(pace, tp2_dup); + INP_WUNLOCK(&tp2_dup->t_inpcb); + KTEST_EQUAL(tp2_dup->t_hpts_cpu, tp2->t_hpts_cpu); + + /* Test explicit CPU setting */ + tp3 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp3, NULL); + tp3->t_hpts_cpu = 1; /* Assume we have at least 2 CPUs */ + tp3->t_flags2 |= TF2_HPTS_CPU_SET; + INP_WLOCK(&tp3->t_inpcb); + tcp_set_hpts(pace, tp3); + INP_WUNLOCK(&tp3->t_inpcb); + KTEST_EQUAL(tp3->t_hpts_cpu, 1); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + test_hpts_free_tcpcb(tp2_dup); + test_hpts_free_tcpcb(tp3); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates edge cases in slot calculation including boundary conditions + * around slot 0, maximum slots, and slot wrapping arithmetic. 
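+ *
+ * Conceptually the slot math is modular (a sketch; the real conversion
+ * lives in the HPTS insert path):
+ *
+ *	slot = (p_cur_slot + timeout_usecs / HPTS_USECS_PER_SLOT)
+ *	    % NUM_OF_HPTSI_SLOTS;
+ *
+ * so a timeout of 0 maps onto the current slot and the largest in-wheel
+ * timeout, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, lands on the
+ * slot just behind it.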
+ */ +KTEST_FUNC(slot_boundary_conditions) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Test insertion at slot 0 */ + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 0, NULL); /* Should insert immediately (0 timeout) */ + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_VERIFY(tp->t_hpts_slot < NUM_OF_HPTSI_SLOTS); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test insertion at maximum slot value */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, (NUM_OF_HPTSI_SLOTS - 1) * HPTS_USECS_PER_SLOT, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + /* Test very small timeout values */ + INP_WLOCK(&tp->t_inpcb); + tp->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp, 1, NULL); + INP_WUNLOCK(&tp->t_inpcb); + KTEST_EQUAL(tp->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp->t_hpts_slot, HPTS_USEC_TO_SLOTS(1)); /* Should convert 1 usec to slot */ + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_remove(pace, tp); + INP_WUNLOCK(&tp->t_inpcb); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS behavior under high load conditions, including proper + * processing of many connections and connection count tracking. + */ +KTEST_FUNC(dynamic_sleep_adjustment) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb **tcpcbs; + struct tcp_hpts_entry *hpts; + uint32_t i, num_tcpcbs = DEFAULT_CONNECTION_THRESHOLD + 50; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + /* Create many connections to exceed threshold */ + tcpcbs = malloc(num_tcpcbs * sizeof(struct tcpcb *), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_VERIFY_RET(tcpcbs != NULL, ENOMEM); + + for (i = 0; i < num_tcpcbs; i++) { + tcpcbs[i] = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tcpcbs[i], NULL); + tcpcbs[i]->t_hpts_cpu = 0; /* Force all to CPU 0 */ + INP_WLOCK(&tcpcbs[i]->t_inpcb); + tcpcbs[i]->t_flags2 |= TF2_HPTS_CALLS; + TP_REMOVE_FROM_HPTS(tcpcbs[i]) = 1; /* Will be removed after output */ + tcp_hpts_insert(pace, tcpcbs[i], 100, NULL); + INP_WUNLOCK(&tcpcbs[i]->t_inpcb); + } + + hpts = pace->rp_ent[0]; + dump_hpts_entry(ctx, hpts); + + /* Verify we're above threshold */ + KTEST_GREATER_THAN(hpts->p_on_queue_cnt, DEFAULT_CONNECTION_THRESHOLD); + + /* Run HPTS to process many connections */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify HPTS processed slots and connections correctly */ + KTEST_GREATER_THAN(slots_ran, 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], num_tcpcbs); + + /* Verify all connections were removed from queue */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + /* Cleanup */ + for (i = 0; i < num_tcpcbs; i++) { + test_hpts_free_tcpcb(tcpcbs[i]); + } + free(tcpcbs, M_TCPHPTS); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates handling of concurrent insert/remove operations and race conditions + * between HPTS processing and 
user operations. + */ +KTEST_FUNC(concurrent_operations) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp1, *tp2; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp1 = test_hpts_create_tcpcb(ctx, pace); + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + KTEST_NEQUAL(tp2, NULL); + + /* Force all to CPU 0 */ + tp1->t_hpts_cpu = 0; + tp2->t_hpts_cpu = 0; + + /* Insert tp1 */ + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, 100, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Insert tp2 into same slot */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, 100, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify both are inserted */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify they're both assigned to the same slot */ + KTEST_EQUAL(tp1->t_hpts_slot, tp2->t_hpts_slot); + + /* Verify queue count reflects both connections */ + KTEST_EQUAL(tp1->t_hpts_cpu, tp2->t_hpts_cpu); /* Should be on same CPU */ + hpts = pace->rp_ent[tp1->t_hpts_cpu]; + KTEST_EQUAL(hpts->p_on_queue_cnt, 2); + + /* Remove tp1 while tp2 is still there */ + INP_WLOCK(&tp1->t_inpcb); + tcp_hpts_remove(pace, tp1); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify tp1 removed, tp2 still there */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Verify queue count decreased by one */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 1); + + /* Remove tp2 */ + INP_WLOCK(&tp2->t_inpcb); + tcp_hpts_remove(pace, tp2); + INP_WUNLOCK(&tp2->t_inpcb); + + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); + + /* Verify queue is now completely empty */ + KTEST_EQUAL(hpts->p_on_queue_cnt, 0); + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the queued segments processing path via tfb_do_queued_segments, + * which is an alternative to direct tcp_output calls. 
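+ *
+ * A stack opts in to this path essentially the way the test does below
+ * (in real use the mbufs are queued by LRO rather than hand-rolled):
+ *
+ *	tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
+ *	STAILQ_INSERT_TAIL(&tp->t_inqueue, m, m_stailqpkt);
+ *
+ * after which HPTS invokes tp->t_fb->tfb_do_queued_segments() for the
+ * connection instead of tcp_output().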
+ */ +KTEST_FUNC(queued_segments_processing) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + struct mbuf *fake_mbuf; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + + /* Create a minimal fake mbuf that has valid STAILQ pointers */ + fake_mbuf = malloc(sizeof(struct mbuf), M_TCPHPTS, M_WAITOK | M_ZERO); + KTEST_NEQUAL(fake_mbuf, NULL); + + /* Set up for queued segments path */ + tp->t_flags2 |= (TF2_HPTS_CALLS | TF2_SUPPORTS_MBUFQ); + STAILQ_INSERT_TAIL(&tp->t_inqueue, fake_mbuf, m_stailqpkt); + + INP_WLOCK(&tp->t_inpcb); + tcp_hpts_insert(pace, tp, 100, NULL); + INP_WUNLOCK(&tp->t_inpcb); + + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Run HPTS and verify queued segments path is taken */ + test_time_usec += 100; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + KTEST_VERIFY(slots_ran >= 0); + KTEST_EQUAL(call_counts[CCNT_TCP_TFB_DO_QUEUED_SEGMENTS], 1); + + /* Connection should be removed from HPTS after processing */ + KTEST_EQUAL(tp->t_in_hpts, IHPTS_NONE); + + /* Clean up the fake mbuf if it's still in the queue */ + if (!STAILQ_EMPTY(&tp->t_inqueue)) { + struct mbuf *m = STAILQ_FIRST(&tp->t_inqueue); + STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); + free(m, M_TCPHPTS); + } + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates the direct wake mechanism and wake inhibition logic when + * the connection count exceeds thresholds. + */ +KTEST_FUNC(direct_wake_mechanism) +{ + struct tcp_hptsi *pace; + struct tcpcb *tp; + struct tcp_hpts_entry *hpts; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + tp = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp, NULL); + hpts = pace->rp_ent[tp->t_hpts_cpu]; + + /* Test direct wake when not over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 50; /* Below threshold */ + hpts->p_hpts_wake_scheduled = 0; + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 1); + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 1); + HPTS_UNLOCK(hpts); + + /* Reset for next test */ + hpts->p_hpts_wake_scheduled = 0; + call_counts[CCNT_SWI_SCHED] = 0; + + /* Test wake inhibition when over threshold */ + HPTS_LOCK(hpts); + hpts->p_on_queue_cnt = 200; /* Above threshold */ + hpts->p_direct_wake = 1; /* Request direct wake */ + tcp_hpts_wake(hpts); + KTEST_EQUAL(hpts->p_hpts_wake_scheduled, 0); /* Should be inhibited */ + KTEST_EQUAL(hpts->p_direct_wake, 0); /* Should be cleared */ + KTEST_EQUAL(call_counts[CCNT_SWI_SCHED], 0); /* No SWI scheduled */ + HPTS_UNLOCK(hpts); + + test_hpts_free_tcpcb(tp); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates HPTS collision detection when attempting to run HPTS while + * it's already active. 
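+ *
+ * The guard amounts to checking p_hpts_active under HPTS_LOCK, along
+ * the lines of (sketch):
+ *
+ *	if (hpts->p_hpts_active && from_callout == false)
+ *		return (0);	-- someone else already owns the wheel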
+ */ +KTEST_FUNC(hpts_collision_detection) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Mark HPTS as active */ + HPTS_LOCK(hpts); + hpts->p_hpts_active = 1; + HPTS_UNLOCK(hpts); + + /* Attempt to run HPTS again - should detect collision */ + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, false); /* from_callout = false */ + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Should return 0 indicating no work done due to collision */ + KTEST_EQUAL(slots_ran, 0); + + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +/* + * Validates generation count handling for race condition detection between + * HPTS processing and connection insertion/removal operations. + */ +KTEST_FUNC(generation_count_validation) +{ + struct epoch_tracker et; + struct tcp_hptsi *pace; + struct tcp_hpts_entry *hpts; + struct tcpcb *tp1, *tp2; + uint32_t initial_gencnt, slot_to_test = 10; + uint32_t timeout_usecs = slot_to_test * HPTS_USECS_PER_SLOT; + uint32_t tp2_original_gencnt; + int32_t slots_ran; + + test_hpts_init(); + + pace = tcp_hptsi_create(&test_funcs, false); + KTEST_NEQUAL(pace, NULL); + tcp_hptsi_start(pace); + + hpts = pace->rp_ent[0]; + + /* Record initial generation count for the test slot */ + initial_gencnt = hpts->p_hptss[slot_to_test].gencnt; + + /* Create and insert first connection */ + tp1 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp1, NULL); + tp1->t_hpts_cpu = 0; /* Force to CPU 0 */ + + INP_WLOCK(&tp1->t_inpcb); + tp1->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp1, timeout_usecs, NULL); + INP_WUNLOCK(&tp1->t_inpcb); + + /* Verify connection stored the generation count */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_ONQUEUE); + KTEST_EQUAL(tp1->t_hpts_slot, slot_to_test); + KTEST_EQUAL(tp1->t_hpts_gencnt, initial_gencnt); + + /* Create second connection but don't insert yet */ + tp2 = test_hpts_create_tcpcb(ctx, pace); + KTEST_NEQUAL(tp2, NULL); + tp2->t_hpts_cpu = 0; /* Force to CPU 0 */ + + /* Force generation count increment by processing the slot */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Verify processing occurred */ + KTEST_VERIFY(slots_ran > 0); + KTEST_EQUAL(call_counts[CCNT_TCP_OUTPUT], 1); + + /* Verify generation count was incremented */ + KTEST_EQUAL(hpts->p_hptss[slot_to_test].gencnt, initial_gencnt + 1); + + /* Verify first connection was processed and removed */ + KTEST_EQUAL(tp1->t_in_hpts, IHPTS_NONE); + + /* Insert second connection and record its generation count */ + INP_WLOCK(&tp2->t_inpcb); + tp2->t_flags2 |= TF2_HPTS_CALLS; + tcp_hpts_insert(pace, tp2, timeout_usecs, NULL); + INP_WUNLOCK(&tp2->t_inpcb); + + /* Verify connection was inserted successfully */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_ONQUEUE); + + /* Record the generation count that tp2 received */ + tp2_original_gencnt = tp2->t_hpts_gencnt; + + /* Test generation count mismatch detection during processing */ + /* Manually set stale generation count to simulate race condition */ + tp2->t_hpts_gencnt = tp2_original_gencnt + 100; /* Force a mismatch */ + + /* Process the slot to trigger generation count validation */ + test_time_usec += (slot_to_test + 1) * HPTS_USECS_PER_SLOT; + HPTS_LOCK(hpts); + 
NET_EPOCH_ENTER(et); + slots_ran = tcp_hptsi(hpts, true); + HPTS_UNLOCK(hpts); + NET_EPOCH_EXIT(et); + + /* Connection should be processed despite generation count mismatch */ + KTEST_EQUAL(tp2->t_in_hpts, IHPTS_NONE); /* Processed and released */ + + /* The key test: HPTS should handle mismatched generation counts gracefully */ + KTEST_VERIFY(slots_ran > 0); /* Processing should still occur */ + + test_hpts_free_tcpcb(tp1); + test_hpts_free_tcpcb(tp2); + tcp_hptsi_stop(pace); + tcp_hptsi_destroy(pace); + + return (0); +} + +static const struct ktest_test_info tests[] = { + KTEST_INFO(module_load), + KTEST_INFO(hptsi_create_destroy), + KTEST_INFO(hptsi_start_stop), + KTEST_INFO(hptsi_independence), + KTEST_INFO(function_injection), + KTEST_INFO(tcpcb_initialization), + KTEST_INFO(tcpcb_insertion), + KTEST_INFO(timer_functionality), + KTEST_INFO(scalability_tcpcbs), + KTEST_INFO(wheel_wrap_recovery), + KTEST_INFO(tcpcb_moving_state), + KTEST_INFO(deferred_requests), + KTEST_INFO(cpu_assignment), + KTEST_INFO(slot_boundary_conditions), + KTEST_INFO(dynamic_sleep_adjustment), + KTEST_INFO(concurrent_operations), + KTEST_INFO(queued_segments_processing), + KTEST_INFO(direct_wake_mechanism), + KTEST_INFO(hpts_collision_detection), + KTEST_INFO(generation_count_validation), +}; + +#else /* TCP_HPTS_KTEST */ + +/* + * Stub to indicate that the TCP HPTS ktest is not enabled. + */ +KTEST_FUNC(module_load_without_tests) +{ + KTEST_LOG(ctx, "Warning: TCP HPTS ktest is not enabled"); + return (0); +} + +static const struct ktest_test_info tests[] = { + KTEST_INFO(module_load_without_tests), +}; + +#endif + +KTEST_MODULE_DECLARE(ktest_tcphpts, tests); +KTEST_MODULE_DEPEND(ktest_tcphpts, tcphpts); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index dd27ec77c1af..2146b0cac48f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -219,7 +219,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); -VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 64efa4bf060f..9b5baf115855 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1475,10 +1475,11 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) } /* create sequence number */ - lc->lro_mbuf_data[lc->lro_mbuf_count].seq = - (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | - (((uint64_t)mb->m_pkthdr.flowid) << 24) | - ((uint64_t)lc->lro_mbuf_count); + lc->lro_mbuf_data[lc->lro_mbuf_count].seq = lc->lro_mbuf_count; + if (M_HASHTYPE_ISHASH(mb)) + lc->lro_mbuf_data[lc->lro_mbuf_count].seq |= + (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | + (((uint64_t)mb->m_pkthdr.flowid) << 24); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; diff --git a/sys/netinet/tcp_lro_hpts.c b/sys/netinet/tcp_lro_hpts.c index 43587285fe26..ac1a27a4290a 100644 --- a/sys/netinet/tcp_lro_hpts.c +++ b/sys/netinet/tcp_lro_hpts.c @@ -29,6 +29,8 @@ #include "opt_inet6.h" #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> @@ -62,6 +64,7 @@ #include <netinet/tcp_lro.h> #include <netinet/tcp_var.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_hpts_internal.h> #ifdef 
TCP_BLACKBOX #include <netinet/tcp_log_buf.h> #endif diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 2dfb7faf56e3..208f72c4661c 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -123,7 +123,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); -VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; +VNET_DEFINE(int, tcp_autosndbuf_max) = 8*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f2d7867df9b4..66983edcdd73 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -480,7 +480,7 @@ bbr_find_lowest_rsm(struct tcp_bbr *bbr); static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which); static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, @@ -489,7 +489,7 @@ bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); static void @@ -724,7 +724,7 @@ bbr_minseg(struct tcp_bbr *bbr) } static void -bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) +bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t pacing_delay, uint32_t tot_len) { struct inpcb *inp = tptoinpcb(tp); struct hpts_diag diag; @@ -751,40 +751,40 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = 0; prev_delay = bbr->r_ctl.rc_last_delay_val; if (bbr->r_ctl.rc_last_delay_val && - (slot == 0)) { + (pacing_delay == 0)) { /* * If a previous pacer delay was in place we * are not coming from the output side (where * we calculate a delay, more likely a timer). */ - slot = bbr->r_ctl.rc_last_delay_val; + pacing_delay = bbr->r_ctl.rc_last_delay_val; if (TSTMP_GT(cts, bbr->rc_pacer_started)) { /* Compensate for time passed */ delay_calc = cts - bbr->rc_pacer_started; - if (delay_calc <= slot) - slot -= delay_calc; + if (delay_calc <= pacing_delay) + pacing_delay -= delay_calc; } } /* Do we have early to make up for by pushing out the pacing time? */ if (bbr->r_agg_early_set) { - bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); - slot += bbr->r_ctl.rc_agg_early; + bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, pacing_delay, 0, bbr->r_agg_early_set, 2); + pacing_delay += bbr->r_ctl.rc_agg_early; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; } /* Are we running a total debt that needs to be compensated for? 
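 * (Worked illustration with made-up numbers: with a 700 usec debt, a
 * computed pacing_delay of 1000 usec shrinks to 300 usec and the debt
 * clears; a pacing_delay of 500 usec instead pays 500 off the debt and
 * we pace for the minimal 100 usec.)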
*/ if (bbr->r_ctl.rc_hptsi_agg_delay) { - if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { + if (pacing_delay > bbr->r_ctl.rc_hptsi_agg_delay) { /* We nuke the delay */ - slot -= bbr->r_ctl.rc_hptsi_agg_delay; + pacing_delay -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } else { /* We nuke some of the delay, put in a minimal 100usecs */ - bbr->r_ctl.rc_hptsi_agg_delay -= slot; - bbr->r_ctl.rc_last_delay_val = slot = 100; + bbr->r_ctl.rc_hptsi_agg_delay -= pacing_delay; + bbr->r_ctl.rc_last_delay_val = pacing_delay = 100; } } - bbr->r_ctl.rc_last_delay_val = slot; + bbr->r_ctl.rc_last_delay_val = pacing_delay; hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (bbr->rc_in_persist == 0) { @@ -810,7 +810,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; hpts_timeout = delayed_ack; } - if (slot) { + if (pacing_delay) { /* Mark that we have a pacing timer up */ BBR_STAT_INC(bbr_paced_segments); bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; @@ -820,7 +820,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (pacing_delay == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -849,7 +849,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ if (left < hpts_timeout) hpts_timeout = left; } - if (bbr->r_ctl.rc_incr_tmrs && slot && + if (bbr->r_ctl.rc_incr_tmrs && pacing_delay && (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * If configured to do so, and the timer is either @@ -867,7 +867,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * this extra delay but this is easier and being more * conservative is probably better. */ - hpts_timeout += slot; + hpts_timeout += pacing_delay; } if (hpts_timeout) { /* @@ -879,10 +879,10 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } else bbr->r_ctl.rc_timer_exp = 0; - if ((slot) && + if ((pacing_delay) && (bbr->rc_use_google || bbr->output_error_seen || - (slot <= hpts_timeout)) ) { + (pacing_delay <= hpts_timeout)) ) { /* * Tell LRO that it can queue packets while * we pace. @@ -900,17 +900,15 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, pacing_delay, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); /* - * We add the flag here as well if the slot is set, + * We add the flag here as well if the pacing delay is set, * since hpts will call in to clear the queue first before * calling the output routine (which does our timers). * We don't want to set the flag if its just a timer @@ -919,7 +917,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ * on a keep-alive timer and a request comes in for * more data. 
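 * (Illustration: both arms funnel into the same call,
 *	tcp_hpts_insert(tp, pacing_delay, &diag);	pacing path
 *	tcp_hpts_insert(tp, hpts_timeout, &diag);	timer path
 * and rc_pacer_started is only restarted when a pacing delay is
 * actually in effect.)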
*/ - if (slot) + if (pacing_delay) bbr->rc_pacer_started = cts; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { @@ -936,12 +934,12 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ TF2_DONT_SACK_QUEUE); } bbr->bbr_timer_src = frm; - bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); + bbr_log_to_start(bbr, cts, hpts_timeout, pacing_delay, 0); bbr_log_hpts_diag(bbr, cts, &diag); bbr->rc_timer_first = 1; } bbr->rc_tmr_stopped = 0; - bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); + bbr_log_type_bbrsnd(bbr, tot_len, pacing_delay, delay_calc, cts, frm, prev_delay); } static void @@ -1033,8 +1031,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock } /* * Ok the timer originally started is not what we want now. We will - * force the hpts to be stopped if any, and restart with the slot - * set to what was in the saved slot. + * force the hpts to be stopped if any, and restart with the pacing + * delay set to what was in the saved delay. */ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { @@ -2397,7 +2395,7 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -2411,9 +2409,6 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) log.u_bbr.bw_inuse = diag->wheel_slot; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, @@ -2473,7 +2468,7 @@ bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, } static void -bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; @@ -2483,7 +2478,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_tp->t_flags2; @@ -2733,13 +2728,13 @@ bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, } static void -bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t pacing_delay, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) { if (tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = del_by; log.u_bbr.flex3 = prev_delay; log.u_bbr.flex4 = line; @@ -5205,7 +5200,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t left = 
bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); - tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); return (1); } bbr->rc_tmr_stopped = 0; @@ -5254,7 +5249,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) else time_since_send = 0; if (bbr->r_ctl.rc_last_delay_val > time_since_send) { - /* Cut down our slot time */ + /* Cut down our pacing_delay time */ bbr->r_ctl.rc_last_delay_val -= time_since_send; } else { bbr->r_ctl.rc_last_delay_val = 0; @@ -5888,7 +5883,7 @@ bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next - * slot (11). + * pacing delay (11). * */ INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -11856,7 +11851,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) struct bbr_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; struct inpcb *inp; struct sockbuf *sb; bool hpts_calling; @@ -11986,8 +11981,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) delay_calc -= bbr->r_ctl.rc_last_delay_val; else { /* - * We are early setup to adjust - * our slot time. + * We are early setup to adjust our pacing delay. */ uint64_t merged_val; @@ -12104,7 +12098,7 @@ again: #endif error = 0; tso = 0; - slot = 0; + pacing_delay = 0; mtu = 0; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sb_offset = tp->snd_max - tp->snd_una; @@ -12126,7 +12120,7 @@ recheck_resend: tot_len = tp->t_maxseg; if (hpts_calling) /* Retry in a ms */ - slot = 1001; + pacing_delay = 1001; goto just_return_nolock; } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); @@ -12699,9 +12693,9 @@ just_return: SOCK_SENDBUF_UNLOCK(so); just_return_nolock: if (tot_len) - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; if (tot_len == 0) { if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= tp->snd_wnd) { @@ -12751,7 +12745,7 @@ just_return_nolock: /* Dont update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 9, pacing_delay, tot_len); bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ @@ -12787,7 +12781,7 @@ send: flags &= ~TH_FIN; if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { /* Lets not send this */ - slot = 0; + pacing_delay = 0; goto just_return; } } @@ -13053,7 +13047,7 @@ send: /* * We have outstanding data, don't send a fin by itself!.
*/ - slot = 0; + pacing_delay = 0; goto just_return; } /* @@ -13763,7 +13757,7 @@ nomore: if (tp->snd_cwnd < maxseg) tp->snd_cwnd = maxseg; } - slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; BBR_STAT_INC(bbr_saw_enobuf); if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_hdwr_pacing_enobuf, 1); @@ -13812,18 +13806,18 @@ nomore: } /* * Nuke all other things that can interfere - * with slot + * with pacing delay */ if ((tot_len + len) && (len >= tp->t_maxseg)) { - slot = bbr_get_pacing_delay(bbr, + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, (tot_len + len), cts, 0); - if (slot < bbr_error_base_paceout) - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + if (pacing_delay < bbr_error_base_paceout) + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; } else - slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 10, slot, + bbr_start_hpts_timer(bbr, tp, cts, 10, pacing_delay, tot_len); return (error); } @@ -13841,9 +13835,9 @@ nomore: } /* FALLTHROUGH */ default: - slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; + pacing_delay = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; - bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); + bbr_start_hpts_timer(bbr, tp, cts, 11, pacing_delay, 0); return (error); } #ifdef STATS @@ -13981,12 +13975,12 @@ skip_again: tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* - * Calculate/Re-Calculate the hptsi slot in usecs based on + * Calculate/Re-Calculate the hptsi timeout in usecs based on * what we have sent so far */ - slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + pacing_delay = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) - slot = 0; + pacing_delay = 0; } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: @@ -13999,8 +13993,8 @@ enobufs: (more_to_rxt || ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { /* Rack cheats and shotguns out all rxt's 1ms apart */ - if (slot > 1000) - slot = 1000; + if (pacing_delay > 1000) + pacing_delay = 1000; } if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { /* @@ -14014,7 +14008,7 @@ enobufs: tcp_bbr_tso_size_check(bbr, cts); } } - bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); + bbr_start_hpts_timer(bbr, tp, cts, 12, pacing_delay, tot_len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; @@ -14132,8 +14126,7 @@ bbr_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); bbr_log_hpts_diag(bbr, cts, &diag); } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 11ef5ba706c5..c7962b57a69e 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -250,11 +250,11 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the co static int32_t rack_persist_min = 250000; /* 250usec */ static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */ -static uint32_t rack_max_reduce = 10; /* 
Percent we can reduce slot by */ +static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ @@ -278,7 +278,7 @@ static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ -static int32_t rack_slot_reduction = 4; +static int32_t rack_pacing_delay_reduction = 4; static int32_t rack_wma_divisor = 8; /* For WMA calculation */ static int32_t rack_cwnd_block_ends_measure = 0; static int32_t rack_rwnd_block_ends_measure = 0; @@ -478,7 +478,7 @@ rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint16_t flex7, uint8_t mod); static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality); static struct rack_sendmap * @@ -1107,7 +1107,7 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "burst_reduces", CTLFLAG_RW, - &rack_slot_reduction, 4, + &rack_pacing_delay_reduction, 4, "When doing only burst mitigation what is the reduce divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1399,7 +1399,7 @@ rack_init_sysctls(void) SYSCTL_CHILDREN(rack_timers), OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, &rack_max_reduce, 10, - "Max percentage we will reduce slot by for pacing when we are behind"); + "Max percentage we will reduce pacing delay by for pacing when we are behind"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, @@ -2700,7 +2700,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t } static void -rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) { if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -2710,7 +2710,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.flex1 = rack->rc_tp->t_srtt; log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = slot; + log.u_bbr.flex4 = pacing_delay; log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; @@ -3034,14 +3034,14 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, } static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, 
int line) { if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; else @@ -3139,7 +3139,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg } static void -rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay, uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) { if (tcp_bblogging_on(rack->rc_tp)) { @@ -3148,7 +3148,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; if (rack->rack_no_prr) @@ -6482,7 +6482,7 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; - log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex5 = diag->time_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; @@ -6497,9 +6497,6 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.timeStamp = cts; log.u_bbr.delRate = diag->maxslots; - log.u_bbr.cur_del_rate = diag->p_curtick; - log.u_bbr.cur_del_rate <<= 32; - log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -6532,14 +6529,14 @@ rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uin static void rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, - int32_t slot, uint32_t tot_len_this_send, int sup_rack) + int32_t usecs, uint32_t tot_len_this_send, int sup_rack) { struct hpts_diag diag; struct inpcb *inp = tptoinpcb(tp); struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; - uint32_t entry_slot = slot; + uint32_t entry_usecs = usecs; uint8_t stopped; uint32_t left = 0; uint32_t us_cts; @@ -6560,7 +6557,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, rack->r_ctl.rc_hpts_flags = 0; us_cts = tcp_get_usecs(&tv); /* Now early/late accounting */ - rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0); if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { /* * We have a early carry over set, @@ -6571,7 +6568,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * penalize the next timer for being awoke * by an ack aka the rc_agg_early (non-paced mode). */ - slot += rack->r_ctl.rc_agg_early; + usecs += rack->r_ctl.rc_agg_early; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; } @@ -6583,29 +6580,29 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * really depends on what * the current pacing time is. */ - if (rack->r_ctl.rc_agg_delayed >= slot) { + if (rack->r_ctl.rc_agg_delayed >= usecs) { /* * We can't compensate for it all. * And we have to have some time * on the clock. We always have a min - * 10 slots (10 x 10 i.e. 100 usecs). 
+ * 10 HPTS timer units (10 x 10 i.e. 100 usecs). */ - if (slot <= HPTS_USECS_PER_SLOT) { + if (usecs <= HPTS_USECS_PER_SLOT) { /* We gain delay */ - rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs); + usecs = HPTS_USECS_PER_SLOT; } else { /* We take off some */ - rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); - slot = HPTS_USECS_PER_SLOT; + rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT); + usecs = HPTS_USECS_PER_SLOT; } } else { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; /* Make sure we have 100 useconds at minimum */ - if (slot < HPTS_USECS_PER_SLOT) { - rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; - slot = HPTS_USECS_PER_SLOT; + if (usecs < HPTS_USECS_PER_SLOT) { + rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs; + usecs = HPTS_USECS_PER_SLOT; } if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; @@ -6614,17 +6611,17 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, /* r_use_hpts_min is on and so is DGP */ uint32_t max_red; - max_red = (slot * rack->r_ctl.max_reduction) / 100; + max_red = (usecs * rack->r_ctl.max_reduction) / 100; if (max_red >= rack->r_ctl.rc_agg_delayed) { - slot -= rack->r_ctl.rc_agg_delayed; + usecs -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; } else { - slot -= max_red; + usecs -= max_red; rack->r_ctl.rc_agg_delayed -= max_red; } } if ((rack->r_use_hpts_min == 1) && - (slot > 0) && + (usecs > 0) && (rack->dgp_on == 1)) { /* * We are enforcing a min pacing timer @@ -6633,8 +6630,8 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, uint32_t min; min = get_hpts_min_sleep_time(); - if (min > slot) { - slot = min; + if (min > usecs) { + usecs = min; } } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); @@ -6652,7 +6649,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && - (slot == 0)) { + (usecs == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* @@ -6709,10 +6706,10 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } - rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); if ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0) && - (hpts_timeout < slot) && + (hpts_timeout < usecs) && (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * We have no good estimate yet for the @@ -6722,7 +6719,7 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * pace that long since we know the calculation * so far is not accurate. */ - slot = hpts_timeout; + usecs = hpts_timeout; } /** * Turn off all the flags for queuing by default. The @@ -6754,11 +6751,11 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * so LRO can call into us. 
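 * (Sketch of the flag interplay: TF2_MBUF_QUEUE_READY lets LRO keep
 * queuing mbufs while a pacing timer runs, and TF2_DONT_SACK_QUEUE
 * additionally suppresses wakeups for anything short of a SACK; both
 * are cleared here and re-set below only as the timers require.)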
*/ tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); - if (slot) { + if (usecs) { rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; - rack->r_ctl.rc_last_output_to = us_cts + slot; + rack->r_ctl.rc_last_output_to = us_cts + usecs; /* - * A pacing timer (slot) is being set, in + * A pacing timer (usecs microseconds) is being set, in * such a case we cannot send (we are blocked by * the timer). So lets tell LRO that it should not * wake us unless there is a SACK. Note this only @@ -6799,20 +6796,18 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } if ((rack->use_rack_rr) && (rack->r_rr_config < 2) && - ((hpts_timeout) && (hpts_timeout < slot))) { + ((hpts_timeout) && (hpts_timeout < usecs))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), - __LINE__, &diag); + tcp_hpts_insert(tp, usecs, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 1); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 1); } } else if (hpts_timeout) { /* @@ -6824,22 +6819,21 @@ rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * at the start of this block) are good enough. */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), - __LINE__, &diag); + tcp_hpts_insert(tp, hpts_timeout, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); - rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { - panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", - tp, rack, tot_len_this_send, cts, slot, hpts_timeout); + panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?", + tp, rack, tot_len_this_send, cts, usecs, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; - if (slot) - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); + if (usecs) + rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__); } static void @@ -8016,7 +8010,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; ret = -3; left = rack->r_ctl.rc_timer_exp - cts; - tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); + tcp_hpts_insert(tp, left, NULL); rack_log_to_processing(rack, cts, ret, left); return (1); } @@ -14377,8 +14371,7 @@ rack_switch_failed(struct tcpcb *tp) } } else toval = HPTS_USECS_PER_SLOT; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), - __LINE__, &diag); + tcp_hpts_insert(tp, toval, &diag); rack_log_hpts_diag(rack, cts, &diag, &tv); } @@ -14973,8 +14966,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (tov) { struct hpts_diag diag; - (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), - __LINE__, &diag); + tcp_hpts_insert(tp, tov, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); } } @@ -16367,7 +16359,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct rack_sendmap *rsm; int32_t prev_state = 0; 
int no_output = 0; - int slot_remaining = 0; + int time_remaining = 0; #ifdef TCP_ACCOUNTING int ack_val_set = 0xf; #endif @@ -16416,7 +16408,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * could be, if a sack is present, we want to be awoken and * so should process the packets. */ - slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; + time_remaining = rack->r_ctl.rc_last_output_to - us_cts; if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { no_output = 1; } else { @@ -16436,7 +16428,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, (*ts_ptr == TCP_LRO_TS_OPTION))) no_output = 1; } - if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { + if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) { /* * It is unrealistic to think we can pace in less than * the minimum granularity of the pacer (def:250usec). So @@ -16919,10 +16911,10 @@ do_output_now: (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use - * the remaining time (slot_remaining) to restart the timer. + * the remaining time (time_remaining) to restart the timer. */ - KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); - rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); + KASSERT ((time_remaining != 0), ("time remaining is zero for rack:%p tp:%p", rack, tp)); + rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0); rack_free_trim(rack); } /* Clear the flag, it may have been cleared by output but we may not have */ @@ -17102,7 +17094,7 @@ check_it: } static void -rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality) { @@ -17125,7 +17117,7 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, } } memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = slot; + log.u_bbr.flex1 = pacing_delay; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; @@ -17284,25 +17276,25 @@ rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uin } static int32_t -pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) { uint64_t lentim, fill_bw; rack->r_via_fill_cw = 0; if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) - return (slot); + return (pacing_delay); if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) - return (slot); + return (pacing_delay); if (rack->r_ctl.rc_last_us_rtt == 0) - return (slot); + return (pacing_delay); if (rack->rc_pace_fill_if_rttin_range && (rack->r_ctl.rc_last_us_rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { /* The rtt is huge, N * smallest, lets not fill */ - return (slot); + return (pacing_delay); } if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) - return (slot); + return (pacing_delay); /* * first lets calculate the b/w based on the last us-rtt * and the smallest send window.
@@ -17368,7 +17360,7 @@ at_lt_bw: if (non_paced) *rate_wanted = fill_bw; if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) - return (slot); + return (pacing_delay); rack->r_via_fill_cw = 1; if (rack->r_rack_hw_rate_caps && (rack->r_ctl.crte != NULL)) { @@ -17423,19 +17415,19 @@ at_lt_bw: lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; lentim /= fill_bw; *rate_wanted = fill_bw; - if (non_paced || (lentim < slot)) { - rack_log_pacing_delay_calc(rack, len, slot, fill_bw, + if (non_paced || (lentim < pacing_delay)) { + rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw, 0, lentim, 12, __LINE__, NULL, 0); return ((int32_t)lentim); } else - return (slot); + return (pacing_delay); } static int32_t rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) { uint64_t srtt; - int32_t slot = 0; + int32_t pacing_delay = 0; int can_start_hw_pacing = 1; int err; int pace_one; @@ -17483,25 +17475,25 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * cwnd. Which in that case we are just waiting for * a ACK. */ - slot = len / tr_perms; + pacing_delay = len / tr_perms; /* Now do we reduce the time so we don't run dry? */ - if (slot && rack_slot_reduction) { - reduce = (slot / rack_slot_reduction); - if (reduce < slot) { - slot -= reduce; + if (pacing_delay && rack_pacing_delay_reduction) { + reduce = (pacing_delay / rack_pacing_delay_reduction); + if (reduce < pacing_delay) { + pacing_delay -= reduce; } else - slot = 0; + pacing_delay = 0; } else reduce = 0; - slot *= HPTS_USEC_IN_MSEC; + pacing_delay *= HPTS_USEC_IN_MSEC; if (rack->rc_pace_to_cwnd) { uint64_t rate_wanted = 0; - slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1); rack->rc_ack_can_sendout_data = 1; - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); } else - rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); /*******************************************************/ /* RRS: We insert non-paced call to stats here for len */ /*******************************************************/ @@ -17575,7 +17567,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str segs *= oh; lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; res = lentim / rate_wanted; - slot = (uint32_t)res; + pacing_delay = (uint32_t)res; if (rack_hw_rate_min && (rate_wanted < rack_hw_rate_min)) { can_start_hw_pacing = 0; @@ -17635,7 +17627,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * We want to pace at our rate *or* faster to * fill the cwnd to the max if its not full. 
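 * (Worked example, made-up numbers: sending len = 64000 bytes at a
 * fill_bw of 125000000 bytes/sec gives
 *	lentim = 64000 * HPTS_USEC_IN_SEC / 125000000 = 512 usec,
 * so when the gain-based pacing_delay is longer than that we switch
 * to the shorter fill-cw schedule.)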
*/ - slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); + pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0); /* Re-check to make sure we are not exceeding our max b/w */ if ((rack->r_ctl.crte != NULL) && (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { @@ -17786,15 +17778,15 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str srtt = rack->rc_tp->t_srtt; else srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ - if (srtt < (uint64_t)slot) { - rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); - slot = srtt; + if (srtt < (uint64_t)pacing_delay) { + rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); + pacing_delay = srtt; } } /*******************************************************************/ /* RRS: We insert paced call to stats here for len and rate_wanted */ /*******************************************************************/ - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); + rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); } if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { /* @@ -17811,9 +17803,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str hw_boost_delay = rack_enobuf_hw_max; else if (hw_boost_delay < rack_enobuf_hw_min) hw_boost_delay = rack_enobuf_hw_min; - slot += hw_boost_delay; + pacing_delay += hw_boost_delay; } - return (slot); + return (pacing_delay); } static void @@ -18482,7 +18474,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma struct tcpopt to; u_char opt[TCP_MAXOLEN]; uint32_t hdrlen, optlen; - int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; + int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0; uint16_t flags; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; @@ -18688,9 +18680,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } if (rack->r_ctl.crte != NULL) { /* See if we can send via the hw queue */ - slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); + pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); /* If there is nothing in queue (no pacing time) we can send via the hw queue */ - if (slot == 0) + if (pacing_delay == 0) ip_sendflag = 0; } tcp_set_flags(th, flags); @@ -18955,20 +18947,20 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma rack_log_queue_level(tp, rack, len, tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); } counter_u64_add(rack_saw_enobuf, 1); } else { - slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); } - rack_start_hpts_timer(rack, tp, cts, slot, len, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0); #ifdef 
TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19071,7 +19063,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, #ifdef TCP_ACCOUNTING int cnt_thru = 1; #endif - int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; + int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; uint16_t flags; uint32_t s_soff; uint32_t if_hw_tsomaxsegcount = 0, startseq; @@ -19519,8 +19511,8 @@ again: } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); - rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); + pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -19707,7 +19699,7 @@ rack_output(struct tcpcb *tp) struct rack_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; - int32_t slot = 0; + int32_t pacing_delay = 0; int32_t sup_rack = 0; uint32_t cts, ms_cts, delayed, early; uint32_t add_flag = RACK_SENT_SP; @@ -20070,7 +20062,7 @@ again: if (rsm == NULL) { if (hpts_calling) /* Retry in a ms */ - slot = (1 * HPTS_USEC_IN_MSEC); + pacing_delay = (1 * HPTS_USEC_IN_MSEC); so = inp->inp_socket; sb = &so->so_snd; goto just_return_nolock; @@ -20877,7 +20869,7 @@ just_return_nolock: } if (tot_len_this_send > 0) { rack->r_ctl.fsb.recwin = recwin; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); if ((error == 0) && rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && @@ -21060,8 +21052,8 @@ just_return_nolock: /* Yes lets make sure to move to persist before timer-start */ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); } - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); - rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack); + rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use); } #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && @@ -21100,8 +21092,8 @@ send: * we come around to again, the flag will be clear. 
*/ check_done = 1; - slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); - if (slot) { + pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); + if (pacing_delay) { rack->r_ctl.rc_agg_delayed = 0; rack->r_ctl.rc_agg_early = 0; rack->r_early = 0; @@ -22358,11 +22350,11 @@ nomore: rack_log_queue_level(tp, rack, len, &tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 0x7f) rack->rc_enobuf++; - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) + pacing_delay = 10 * HPTS_USEC_IN_MSEC; if (rack->r_ctl.crte != NULL) { counter_u64_add(rack_saw_enobuf_hw, 1); tcp_rl_log_enobuf(rack->r_ctl.crte); @@ -22389,8 +22381,8 @@ nomore: goto again; } } - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22412,8 +22404,8 @@ nomore: } /* FALLTHROUGH */ default: - slot = 10 * HPTS_USEC_IN_MSEC; - rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + pacing_delay = 10 * HPTS_USEC_IN_MSEC; + rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { @@ -22456,18 +22448,18 @@ enobufs: /* * We don't send again after sending a RST. */ - slot = 0; + pacing_delay = 0; sendalot = 0; if (error == 0) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); - } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { + } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) { /* * Get our pacing rate, if an error * occurred in sending (ENOBUF) we would * hit the else if with pacing_delay preset. Other * errors return. */ - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); + pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); } /* We have sent, clear the flag */ rack->r_ent_rec_ns = 0; @@ -22499,7 +22491,7 @@ enobufs: */ tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); } - if (slot) { + if (pacing_delay) { /* set the rack tcb into the slot N */ if ((error == 0) && rack_use_rfo && @@ -22564,7 +22556,7 @@ skip_all_send: /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); + rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount() - ts_val; if (tot_len_this_send) { diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index f842a5678fa1..be20fb44a820 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1046,6 +1046,8 @@ abort: * * On syncache_socket() success the newly created socket * has its underlying inp locked. + * + * *lsop is updated, if and only if 1 is returned.
*/ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -1094,12 +1096,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(syncookies disabled)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } if (sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { @@ -1109,12 +1113,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(no syncache entry)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } SCH_UNLOCK(sch); } @@ -1128,11 +1134,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, TCPSTAT_INC(tcps_sc_recvcookie); } else { TCPSTAT_INC(tcps_sc_failcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. */ @@ -1206,9 +1214,9 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "%s; %s: SEG.TSval %u < TS.Recent %u, " "segment dropped\n", s, __func__, to->to_tsval, sc->sc_tsreflect); - free(s, M_TCPLOG); } SCH_UNLOCK(sch); + free(s, M_TCPLOG); return (-1); /* Do not send RST */ } @@ -1225,7 +1233,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "expected, segment processed normally\n", s, __func__); free(s, M_TCPLOG); - s = NULL; } } @@ -1312,16 +1319,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (sc != &scs) syncache_free(sc); return (1); -failed: - if (sc != NULL) { - TCPSTATES_DEC(TCPS_SYN_RECEIVED); - if (sc != &scs) - syncache_free(sc); - } - if (s != NULL) - free(s, M_TCPLOG); - *lsop = NULL; - return (0); } static struct socket * diff --git a/sys/netinet6/in6_fib_algo.c b/sys/netinet6/in6_fib_algo.c index 10ffe7ab0265..ef5cfc6d5ef6 100644 --- a/sys/netinet6/in6_fib_algo.c +++ b/sys/netinet6/in6_fib_algo.c @@ -351,7 +351,7 @@ struct fib_lookup_module flm_radix6 = { }; static void -fib6_algo_init(void) +fib6_algo_init(void *dummy __unused) { fib_module_register(&flm_radix6_lockless); diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index 737d4a50098a..05a01b75e0bb 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -750,7 +750,7 @@ static struct xformsw ipcomp_xformsw = { }; static void -ipcomp_attach(void) +ipcomp_attach(void *dummy __unused) { #ifdef INET @@ -763,7 +763,7 @@ ipcomp_attach(void) } static void -ipcomp_detach(void) +ipcomp_detach(void *dummy __unused) { #ifdef INET diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c index 03116cb0641c..3a8de2b2bfee 100644 --- a/sys/netpfil/ipfw/ip_dn_io.c +++ b/sys/netpfil/ipfw/ip_dn_io.c @@ -43,6 +43,7 @@ #include <sys/priv.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/sdt.h> #include <sys/socket.h> #include <sys/time.h> #include 
<sys/sysctl.h> @@ -70,6 +71,9 @@ #endif #include <netpfil/ipfw/dn_sched.h> +SDT_PROVIDER_DEFINE(dummynet); +SDT_PROBE_DEFINE2(dummynet, , , drop, "struct mbuf *", "struct dn_queue *"); + /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) @@ -545,6 +549,7 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) drop: V_dn_cfg.io_pkt_drop++; + SDT_PROBE2(dummynet, , , drop, m, q); q->ni.drops++; ni->drops++; FREE_PKT(m); @@ -1001,6 +1006,7 @@ done: dropit: V_dn_cfg.io_pkt_drop++; + SDT_PROBE2(dummynet, , , drop, m, q); DN_BH_WUNLOCK(); if (m) FREE_PKT(m); diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index b3f52322425f..d522f9da0fbe 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -1150,7 +1150,7 @@ copy_data_helper(void *_o, void *_arg) return 0; /* not a pipe */ /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { + for (; r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; /* Found a valid entry, copy and we are done */ @@ -1183,7 +1183,7 @@ copy_data_helper(void *_o, void *_arg) if (n >= DN_MAX_ID) return 0; /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { + for (; r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; if (copy_flowset(a, fs, 0)) diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index b59d8d08bf80..d15d7760d7f1 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -3578,11 +3578,9 @@ sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) /* * Stuff that must be initialised only on boot or module load */ -static int -ipfw_init(void) +static void +ipfw_init(void *dummy __unused) { - int error = 0; - /* * Only print out this stuff the first time around, * when called from the sysinit code. @@ -3627,14 +3625,13 @@ ipfw_init(void) ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); - return (error); } /* * Called for the removal of the last instance only on module unload. 
*/ static void -ipfw_destroy(void) +ipfw_destroy(void *dummy __unused) { ipfw_iface_destroy(); diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c index 1e2ff1bca290..8bd27f6885ab 100644 --- a/sys/netpfil/ipfw/ip_fw_nat.c +++ b/sys/netpfil/ipfw/ip_fw_nat.c @@ -999,9 +999,11 @@ ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; - int i; + int error, i; - sooptcopyin(sopt, &i, sizeof i, sizeof i); + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error != 0) + return (error); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); @@ -1104,7 +1106,7 @@ ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; - int i, size; + int error, i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; @@ -1134,9 +1136,9 @@ ipfw_nat_get_log(struct sockopt *sopt) i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); - sooptcopyout(sopt, data, size); + error = sooptcopyout(sopt, data, size); free(data, M_IPFW); - return(0); + return (error); } static int @@ -1166,7 +1168,7 @@ vnet_ipfw_nat_uninit(const void *arg __unused) } static void -ipfw_nat_init(void) +ipfw_nat_init(void *dummy __unused) { /* init ipfw hooks */ @@ -1183,7 +1185,7 @@ ipfw_nat_init(void) } static void -ipfw_nat_destroy(void) +ipfw_nat_destroy(void *dummy __unused) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c index d58af6e5ec4d..a4557f139ae5 100644 --- a/sys/netpfil/pf/pf_ioctl.c +++ b/sys/netpfil/pf/pf_ioctl.c @@ -259,7 +259,7 @@ static void dehook_pf_eth(void); static void dehook_pf(void); static int shutdown_pf(void); static int pf_load(void); -static void pf_unload(void); +static void pf_unload(void *); static struct cdevsw pf_cdevsw = { .d_ioctl = pfioctl, @@ -7082,7 +7082,7 @@ pf_unload_vnet(void) } static void -pf_unload(void) +pf_unload(void *dummy __unused) { sx_xlock(&pf_end_lock); diff --git a/sys/nfs/nfs_diskless.c b/sys/nfs/nfs_diskless.c index 42cfee63d184..0f0cf80feeec 100644 --- a/sys/nfs/nfs_diskless.c +++ b/sys/nfs/nfs_diskless.c @@ -428,7 +428,7 @@ decode_nfshandle(char *ev, u_char *fh, int maxfh) #if !defined(BOOTP_NFSROOT) static void -nfs_rootconf(void) +nfs_rootconf(void *dummy __unused) { nfs_setup_diskless(); diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 796b1719b8ba..01bf4c7e90a8 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -297,7 +297,7 @@ static u_int moea64_clear_bit(vm_page_t, uint64_t); static void moea64_kremove(vm_offset_t); static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); -static void moea64_pmap_init_qpages(void); +static void moea64_pmap_init_qpages(void *); static void moea64_remove_locked(pmap_t, vm_offset_t, vm_offset_t, struct pvo_dlist *); @@ -1284,7 +1284,7 @@ moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) } static void -moea64_pmap_init_qpages(void) +moea64_pmap_init_qpages(void *dummy __unused) { struct pcpu *pc; int i; diff --git a/sys/powerpc/cpufreq/pmcr.c b/sys/powerpc/cpufreq/pmcr.c index dd489b607606..6ae0777a8ac7 100644 --- a/sys/powerpc/cpufreq/pmcr.c +++ b/sys/powerpc/cpufreq/pmcr.c @@ -40,7 +40,8 @@ static int pstate_ids[256]; static int pstate_freqs[256]; static int npstates; -static void parse_pstates(void) +static void +parse_pstates(void *dummy __unused) { phandle_t node; diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index 
de7119dd534a..e227dd825966 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -123,10 +123,33 @@ struct vm_eventinfo { int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (void)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t scause)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -212,7 +235,6 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); -struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); diff --git a/sys/riscv/vmm/riscv.h b/sys/riscv/vmm/riscv.h index 870d0d6c5cd1..917a333520ed 100644 --- a/sys/riscv/vmm/riscv.h +++ b/sys/riscv/vmm/riscv.h @@ -122,29 +122,6 @@ struct hyptrap { uint64_t htinst; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (void)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t scause)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) -DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) - #define dprintf(fmt, ...) 
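The DECLARE_VMMOPS_FUNC() macro introduced above (replacing the DEFINE_VMMOPS_IFUNC() block removed from riscv.h) only stamps out ordinary prototypes; note that the semicolon now lives at the invocation site rather than inside the macro. Hand-expanding one invocation makes that concrete (illustrative expansion, not compiler output):

/* DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); */
/* expands, after token pasting, to the plain declaration: */
int vmmops_getreg(void *vcpui, int num, uint64_t *retval);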
struct hypctx *riscv_get_active_vcpu(void); diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index ec4514f70fa6..4c9b1fa53f7a 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -92,7 +92,6 @@ struct vcpu { struct fpreg *guestfpu; /* (a,i) guest fpu state */ }; -#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) @@ -121,7 +120,6 @@ struct vm { bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ - struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ @@ -174,6 +172,7 @@ vcpu_cleanup(struct vcpu *vcpu, bool destroy) vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); + free(vcpu, M_VMM); } } @@ -285,7 +284,7 @@ vm_init(struct vm *vm, bool create) { int i; - vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); @@ -347,9 +346,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -362,7 +361,7 @@ int vm_create(const char *name, struct vm **retvm) { struct vm *vm; - struct vmspace *vmspace; + int error; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -374,14 +373,13 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); - vmspace = vmmops_vmspace_alloc(0, 1ul << 39); - if (vmspace == NULL) - return (ENOMEM); - vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, 1ul << 39); + if (error != 0) { + free(vm, M_VMM); + return (error); + } strcpy(vm->name, name); - vm->vmspace = vmspace; - vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; @@ -450,11 +448,6 @@ vm_cleanup(struct vm *vm, bool destroy) if (destroy) { vm_mem_destroy(vm); - vmmops_vmspace_free(vm->vmspace); - vm->vmspace = NULL; - - for (i = 0; i < vm->maxcpus; i++) - free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } @@ -760,12 +753,6 @@ vcpu_notify_event(struct vcpu *vcpu) vcpu_unlock(vcpu); } -struct vmspace * -vm_vmspace(struct vm *vm) -{ - return (vm->vmspace); -} - struct vm_mem * vm_mem(struct vm *vm) { @@ -1084,7 +1071,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) vm = vcpu->vm; vme = &vcpu->exitinfo; - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); addr = (vme->htval << 2) & ~(PAGE_SIZE - 1); dprintf("%s: %lx\n", __func__, addr); @@ -1107,7 +1094,7 @@ vm_handle_paging(struct vcpu *vcpu, bool *retu) if (pmap_fault(pmap, addr, ftype)) return (0); - map = &vm->vmspace->vm_map; + map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) { printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n", @@ -1189,7 +1176,7 @@ vm_run(struct vcpu *vcpu) if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); - pmap = vmspace_pmap(vm->vmspace); + pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = NULL; 
evinfo.sptr = &vm->suspend; diff --git a/sys/rpc/auth.h b/sys/rpc/auth.h index 33c33ffd594d..648fb99a3a27 100644 --- a/sys/rpc/auth.h +++ b/sys/rpc/auth.h @@ -354,6 +354,10 @@ __END_DECLS #define RPCSEC_GSS 6 /* RPCSEC_GSS */ #define AUTH_TLS 7 /* Initiate RPC-over-TLS */ +/* RFC 5531's prescribed limits for variable-length arrays. */ +#define AUTH_SYS_MAX_HOSTNAME 255 +#define AUTH_SYS_MAX_GROUPS 16 /* Supplementary groups. */ + /* * Pseudo auth flavors for RPCSEC_GSS. */ diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c index b107d5541c50..ff4c12c3f52e 100644 --- a/sys/rpc/authunix_prot.c +++ b/sys/rpc/authunix_prot.c @@ -30,7 +30,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> /* * authunix_prot.c * XDR for UNIX style authentication parameters for RPC */ @@ -40,8 +39,7 @@ #include <sys/param.h> #include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/systm.h> +#include <sys/libkern.h> #include <sys/ucred.h> #include <rpc/types.h> @@ -50,9 +48,6 @@ #include <rpc/rpc_com.h> -/* gids compose part of a credential; there may not be more than 16 of them */ -#define NGRPS 16 - /* * XDR for unix authentication parameters. */ @@ -60,25 +55,23 @@ bool_t xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) { uint32_t namelen; - uint32_t ngroups, i; + uint32_t supp_ngroups, i; uint32_t junk; char hostbuf[MAXHOSTNAMELEN]; + if (xdrs->x_op == XDR_FREE) + /* This function does not allocate auxiliary memory. */ + return (TRUE); + if (xdrs->x_op == XDR_ENCODE) { - /* - * Restrict name length to 255 according to RFC 1057. - */ getcredhostname(NULL, hostbuf, sizeof(hostbuf)); namelen = strlen(hostbuf); - if (namelen > 255) - namelen = 255; - } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + namelen = AUTH_SYS_MAX_HOSTNAME; + } else namelen = 0; - } - junk = 0; - if (!xdr_uint32_t(xdrs, time) - || !xdr_uint32_t(xdrs, &namelen)) + if (!xdr_uint32_t(xdrs, time) || !xdr_uint32_t(xdrs, &namelen)) return (FALSE); /* @@ -88,43 +81,65 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (!xdr_opaque(xdrs, hostbuf, namelen)) return (FALSE); } else { + if (namelen > AUTH_SYS_MAX_HOSTNAME) + return (FALSE); xdr_setpos(xdrs, xdr_getpos(xdrs) + RNDUP(namelen)); } if (!xdr_uint32_t(xdrs, &cred->cr_uid)) return (FALSE); + + /* + * Safety check: The protocol needs at least one group (access to + * 'cr_gid', the decrement of 'cr_ngroups' below). + */ + if (xdrs->x_op == XDR_ENCODE && cred->cr_ngroups == 0) + return (FALSE); if (!xdr_uint32_t(xdrs, &cred->cr_gid)) return (FALSE); if (xdrs->x_op == XDR_ENCODE) { /* - * Note that this is a `struct xucred`, which maintains its - * historical layout of preserving the egid in cr_ngroups and - * cr_groups[0] == egid. + * Note that this is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. We subtract 1 to obtain + * the number of "supplementary" groups, passed in the AUTH_SYS + * credentials variable-length array called gids[] in RFC 5531. */ - ngroups = cred->cr_ngroups - 1; - if (ngroups > NGRPS) - ngroups = NGRPS; + MPASS(cred->cr_ngroups <= XU_NGROUPS); + supp_ngroups = cred->cr_ngroups - 1; + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + /* With current values, this should never execute.
*/ + supp_ngroups = AUTH_SYS_MAX_GROUPS; } - if (!xdr_uint32_t(xdrs, &ngroups)) + if (!xdr_uint32_t(xdrs, &supp_ngroups)) return (FALSE); - for (i = 0; i < ngroups; i++) { - if (i < ngroups_max) { - if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1])) - return (FALSE); - } else { - if (!xdr_uint32_t(xdrs, &junk)) - return (FALSE); - } - } - if (xdrs->x_op == XDR_DECODE) { - if (ngroups > ngroups_max) - cred->cr_ngroups = ngroups_max + 1; - else - cred->cr_ngroups = ngroups + 1; - } + /* + * Because we cannot store more than XU_NGROUPS in total (16 at the time + * of this writing), for now we choose to be strict with respect to RFC + * 5531's maximum number of supplementary groups (AUTH_SYS_MAX_GROUPS). + * That also acts as an accidental DoS prevention measure, in case the + * request handling code were to reassemble it in full without any + * size limits. Although AUTH_SYS_MAX_GROUPS and XU_NGROUPS are equal, + * since the latter includes the "effective" GID, we cannot store the + * last group of a message with exactly AUTH_SYS_MAX_GROUPS + * supplementary groups. We accept such messages so as not to violate + * the protocol, silently dropping the last group on the floor. + */ + + if (xdrs->x_op != XDR_ENCODE && supp_ngroups > AUTH_SYS_MAX_GROUPS) + return (FALSE); + + junk = 0; + for (i = 0; i < supp_ngroups; ++i) + if (!xdr_uint32_t(xdrs, i < XU_NGROUPS - 1 ? + &cred->cr_sgroups[i] : &junk)) + return (FALSE); + + if (xdrs->x_op != XDR_ENCODE) + cred->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); return (TRUE); } diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c index 963f4f272964..aa0fc585865f 100644 --- a/sys/rpc/svc_auth_unix.c +++ b/sys/rpc/svc_auth_unix.c @@ -41,18 +41,12 @@ */ #include <sys/param.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/systm.h> #include <sys/ucred.h> #include <rpc/rpc.h> #include <rpc/rpc_com.h> -#define MAX_MACHINE_NAME 255 -#define NGRPS 16 - /* * Unix longhand authenticator */ @@ -62,11 +56,8 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) enum auth_stat stat; XDR xdrs; int32_t *buf; - uint32_t time; struct xucred *xcr; - u_int auth_len; - size_t str_len, gid_len; - u_int i; + uint32_t auth_len, time; xcr = rqst->rq_clntcred; auth_len = (u_int)msg->rm_call.cb_cred.oa_length; @@ -74,51 +65,58 @@ XDR_DECODE); buf = XDR_INLINE(&xdrs, auth_len); if (buf != NULL) { + /* 'time', 'str_len', UID, GID and 'supp_ngroups'. */ + const uint32_t min_len = 5 * BYTES_PER_XDR_UNIT; + uint32_t str_len, supp_ngroups; + + if (auth_len < min_len) + goto badcred; time = IXDR_GET_UINT32(buf); - str_len = (size_t)IXDR_GET_UINT32(buf); - if (str_len > MAX_MACHINE_NAME) { - stat = AUTH_BADCRED; - goto done; - } + str_len = IXDR_GET_UINT32(buf); + if (str_len > AUTH_SYS_MAX_HOSTNAME) + goto badcred; str_len = RNDUP(str_len); + /* + * Recheck message length now that we know the value of + * 'str_len' (and that it won't cause an overflow in additions + * below) to protect access to the credentials part. + */ + if (auth_len < min_len + str_len) + goto badcred; buf += str_len / sizeof (int32_t); xcr->cr_uid = IXDR_GET_UINT32(buf); xcr->cr_gid = IXDR_GET_UINT32(buf); - gid_len = (size_t)IXDR_GET_UINT32(buf); - if (gid_len > NGRPS) { - stat = AUTH_BADCRED; - goto done; - } - for (i = 0; i < gid_len; i++) { - /* - * Note that this is a `struct xucred`, which maintains - * its historical layout of preserving the egid in - * cr_ngroups and cr_groups[0] == egid.
- */ - if (i + 1 < XU_NGROUPS) - xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf); - else - buf++; - } - if (gid_len + 1 > XU_NGROUPS) - xcr->cr_ngroups = XU_NGROUPS; - else - xcr->cr_ngroups = gid_len + 1; + supp_ngroups = IXDR_GET_UINT32(buf); + /* + * See the comment before a similar test at the end of + * xdr_authunix_parms() for why we strictly respect RFC 5531 and + * why we may have to drop the last supplementary group when + * there are AUTH_SYS_MAX_GROUPS of them. + */ + if (supp_ngroups > AUTH_SYS_MAX_GROUPS) + goto badcred; + /* + * Final message length check, as we now know how much we will + * read in total. + */ + if (auth_len < min_len + str_len + + supp_ngroups * BYTES_PER_XDR_UNIT) + goto badcred; /* - * five is the smallest unix credentials structure - - * timestamp, hostname len (0), uid, gid, and gids len (0). + * Note that 'xcr' is a 'struct xucred', which still has the + * historical layout where the effective GID is in cr_groups[0] + * and is accounted in 'cr_ngroups'. */ - if ((5 + gid_len) * BYTES_PER_XDR_UNIT + str_len > auth_len) { - (void) printf("bad auth_len gid %ld str %ld auth %u\n", - (long)gid_len, (long)str_len, auth_len); - stat = AUTH_BADCRED; - goto done; + for (uint32_t i = 0; i < supp_ngroups; ++i) { + if (i < XU_NGROUPS - 1) + xcr->cr_sgroups[i] = IXDR_GET_INT32(buf); + else + buf++; } - } else if (! xdr_authunix_parms(&xdrs, &time, xcr)) { - stat = AUTH_BADCRED; - goto done; - } + xcr->cr_ngroups = MIN(supp_ngroups + 1, XU_NGROUPS); + } else if (!xdr_authunix_parms(&xdrs, &time, xcr)) + goto badcred; rqst->rq_verf = _null_auth; stat = AUTH_OK; @@ -126,6 +124,10 @@ done: XDR_DESTROY(&xdrs); return (stat); + +badcred: + stat = AUTH_BADCRED; + goto done; } diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c index 7ec50d990d4e..876776e5f62e 100644 --- a/sys/security/audit/audit.c +++ b/sys/security/audit/audit.c @@ -329,7 +329,7 @@ audit_record_dtor(void *mem, int size, void *arg) * call into the BSM assembly code to initialize it. */ static void -audit_init(void) +audit_init(void *dummy __unused) { audit_trail_enabled = 0; diff --git a/sys/security/mac/mac_framework.c b/sys/security/mac/mac_framework.c index d742b5dcbc3a..b0776160cc74 100644 --- a/sys/security/mac/mac_framework.c +++ b/sys/security/mac/mac_framework.c @@ -320,7 +320,7 @@ mac_policy_xlock_assert(void) * Initialize the MAC subsystem, including appropriate SMP locks. */ static void -mac_init(void) +mac_init(void *dummy __unused) { LIST_INIT(&mac_static_policy_list); @@ -340,7 +340,7 @@ mac_init(void) * kernel, or loaded before the kernel startup. */ static void -mac_late_init(void) +mac_late_init(void *dummy __unused) { mac_late = 1; diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index 2845a9dbc1e2..9e2a233248b4 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -86,7 +86,7 @@ typedef struct { struct sysentvec *sysvec; const char *interp_newpath; int flags; - Elf_Brandnote *brand_note; + const Elf_Brandnote *brand_note; bool (*header_supported)(const struct image_params *, const int32_t *, const uint32_t *); /* High 8 bits of flags is private to the ABI */ @@ -111,9 +111,9 @@ struct sseg_closure { size_t size; /* Total size of all writable segments.
*/ }; -bool __elfN(brand_inuse)(Elf_Brandinfo *entry); -int __elfN(insert_brand_entry)(Elf_Brandinfo *entry); -int __elfN(remove_brand_entry)(Elf_Brandinfo *entry); +bool __elfN(brand_inuse)(const Elf_Brandinfo *entry); +int __elfN(insert_brand_entry)(const Elf_Brandinfo *entry); +int __elfN(remove_brand_entry)(const Elf_Brandinfo *entry); int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *); int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int); size_t __elfN(populate_note)(int, void *, void *, size_t, void **); diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 9140cee56885..8c0729d3ec66 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -741,7 +741,7 @@ struct proc { reaper which spawned our subtree. */ uint64_t p_elf_flags; /* (x) ELF flags */ - void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for + const void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for non ELF binaries. */ sbintime_t p_umtx_min_timeout; /* End area that is copied on creation. */ diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index b4593f38f592..739723754b7d 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -62,7 +62,7 @@ #include <sys/_sx.h> #include <sys/_task.h> -#define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ +#define SB_MAX (8*1024*1024) /* default for max chars in sockbuf */ struct ktls_session; struct mbuf; diff --git a/sys/sys/socket.h b/sys/sys/socket.h index cdd4fa3b4b89..cf1d95da6168 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -396,6 +396,7 @@ struct sockproto { #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP +#define PF_HYPERV AF_HYPERV #define PF_DIVERT AF_DIVERT #define PF_IPFWLOG AF_IPFWLOG diff --git a/sys/sys/sockopt.h b/sys/sys/sockopt.h index bfe12d8510d7..d2b0ff5ed2c8 100644 --- a/sys/sys/sockopt.h +++ b/sys/sys/sockopt.h @@ -57,8 +57,10 @@ struct sockopt { int sosetopt(struct socket *so, struct sockopt *sopt); int sogetopt(struct socket *so, struct sockopt *sopt); -int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); -int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); +int __result_use_check sooptcopyin(struct sockopt *sopt, void *buf, size_t len, + size_t minlen); +int __result_use_check sooptcopyout(struct sockopt *sopt, const void *buf, + size_t len); int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h index 1714fa5a7416..6de391dcc03e 100644 --- a/sys/sys/sysent.h +++ b/sys/sys/sysent.h @@ -343,8 +343,7 @@ void exec_free_abi_mappings(struct proc *p); void exec_onexec_old(struct thread *td); #define INIT_SYSENTVEC(name, sv) \ - SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ - (sysinit_cfunc_t)exec_sysvec_init, sv); + SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, exec_sysvec_init, sv) #endif /* _KERNEL */ diff --git a/sys/sys/tree.h b/sys/sys/tree.h index c11bccfb387c..194ad505b038 100644 --- a/sys/sys/tree.h +++ b/sys/sys/tree.h @@ -334,10 +334,13 @@ struct { \ #define _RB_L ((__uintptr_t)1) #define _RB_R ((__uintptr_t)2) #define _RB_LR ((__uintptr_t)3) -#define _RB_BITS(elm) (*(__uintptr_t *)&elm) +#define _RB_BITS(elm) ((__uintptr_t)elm) #define _RB_BITSUP(elm, field) _RB_BITS(_RB_UP(elm, field)) -#define _RB_PTR(elm) (__typeof(elm)) \ - ((__uintptr_t)elm & ~_RB_LR) +#define _RB_PTR_OP(elm, op, dir) ((__typeof(elm)) \ + ((__uintptr_t)(elm) op (dir))) +#define 
_RB_PTR(elm) _RB_PTR_OP((elm), &, ~_RB_LR) +#define _RB_MOD_OR(elm, dir) ((elm) = _RB_PTR_OP((elm), |, (dir))) +#define _RB_MOD_XOR(elm, dir) ((elm) = _RB_PTR_OP((elm), ^, (dir))) #define RB_PARENT(elm, field) _RB_PTR(_RB_UP(elm, field)) #define RB_LEFT(elm, field) _RB_LINK(elm, _RB_L, field) @@ -346,8 +349,8 @@ struct { \ #define RB_EMPTY(head) (RB_ROOT(head) == NULL) #define RB_SET_PARENT(dst, src, field) do { \ - _RB_BITSUP(dst, field) = (__uintptr_t)src | \ - (_RB_BITSUP(dst, field) & _RB_LR); \ + _RB_UP(dst, field) = (__typeof(src))((__uintptr_t)src | \ + (_RB_BITSUP(dst, field) & _RB_LR)); \ } while (/*CONSTCOND*/ 0) #define RB_SET(elm, parent, field) do { \ @@ -546,12 +549,12 @@ name##_RB_INSERT_COLOR(struct name *head, \ elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \ if (_RB_BITS(gpar) & elmdir) { \ /* shorten the parent-elm edge to rebalance */ \ - _RB_BITSUP(parent, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), elmdir); \ return (NULL); \ } \ sibdir = elmdir ^ _RB_LR; \ /* the other edge must change length */ \ - _RB_BITSUP(parent, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), sibdir); \ if ((_RB_BITS(gpar) & _RB_LR) == 0) { \ /* both edges now short, retry from parent */ \ child = elm; \ @@ -583,11 +586,14 @@ name##_RB_INSERT_COLOR(struct name *head, \ RB_ROTATE(elm, child, elmdir, field); \ child_up = _RB_UP(child, field); \ if (_RB_BITS(child_up) & sibdir) \ - _RB_BITSUP(parent, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + elmdir); \ if (_RB_BITS(child_up) & elmdir) \ - _RB_BITSUP(elm, field) ^= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(elm, field), \ + _RB_LR); \ else \ - _RB_BITSUP(elm, field) ^= elmdir; \ + _RB_MOD_XOR(_RB_UP(elm, field), \ + elmdir); \ /* if child is a leaf, don't augment elm, \ * since it is restored to be a leaf again. */ \ if ((_RB_BITS(child_up) & _RB_LR) == 0) \ @@ -656,7 +662,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ /* the rank of the tree rooted at elm shrank */ \ gpar = _RB_UP(parent, field); \ elmdir = RB_RIGHT(parent, field) == elm ? _RB_R : _RB_L; \ - _RB_BITS(gpar) ^= elmdir; \ + _RB_MOD_XOR(gpar, elmdir); \ if (_RB_BITS(gpar) & elmdir) { \ /* lengthen the parent-elm edge to rebalance */ \ _RB_UP(parent, field) = gpar; \ @@ -664,7 +670,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ } \ if (_RB_BITS(gpar) & _RB_LR) { \ /* shorten other edge, retry from parent */ \ - _RB_BITS(gpar) ^= _RB_LR; \ + _RB_MOD_XOR(gpar, _RB_LR); \ _RB_UP(parent, field) = gpar; \ gpar = _RB_PTR(gpar); \ continue; \ @@ -672,7 +678,7 @@ name##_RB_REMOVE_COLOR(struct name *head, \ sibdir = elmdir ^ _RB_LR; \ sib = _RB_LINK(parent, sibdir, field); \ up = _RB_UP(sib, field); \ - _RB_BITS(up) ^= _RB_LR; \ + _RB_MOD_XOR(up, _RB_LR); \ if ((_RB_BITS(up) & _RB_LR) == 0) { \ /* shorten edges descending from sib, retry */ \ _RB_UP(sib, field) = up; \ @@ -703,24 +709,29 @@ name##_RB_REMOVE_COLOR(struct name *head, \ /* elm is a 1-child. First rotate at elm. */ \ RB_ROTATE(sib, elm, sibdir, field); \ up = _RB_UP(elm, field); \ - _RB_BITSUP(parent, field) ^= \ - (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir; \ - _RB_BITSUP(sib, field) ^= \ - (_RB_BITS(up) & sibdir) ? _RB_LR : sibdir; \ - _RB_BITSUP(elm, field) |= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + (_RB_BITS(up) & elmdir) ? _RB_LR : elmdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + (_RB_BITS(up) & sibdir) ? 
_RB_LR : sibdir); \ + _RB_MOD_OR(_RB_UP(elm, field), _RB_LR); \ } else { \ if ((_RB_BITS(up) & elmdir) == 0 && \ RB_STRICT_HST && elm != NULL) { \ /* if parent does not become a leaf, \ do not demote parent yet. */ \ - _RB_BITSUP(parent, field) ^= sibdir; \ - _RB_BITSUP(sib, field) ^= _RB_LR; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + sibdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + _RB_LR); \ } else if ((_RB_BITS(up) & elmdir) == 0) { \ /* demote parent. */ \ - _RB_BITSUP(parent, field) ^= elmdir; \ - _RB_BITSUP(sib, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(parent, field), \ + elmdir); \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + sibdir); \ } else \ - _RB_BITSUP(sib, field) ^= sibdir; \ + _RB_MOD_XOR(_RB_UP(sib, field), \ + sibdir); \ elm = sib; \ } \ \ diff --git a/sys/sys/user.h b/sys/sys/user.h index 3183f0792256..1704bc089d85 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -617,7 +617,8 @@ struct kinfo_vmobject { } kvo_type_spec; /* Type-specific union */ uint64_t kvo_me; /* Uniq handle for anon obj */ uint64_t kvo_laundry; /* Number of laundry pages. */ - uint64_t _kvo_qspare[5]; + uint64_t kvo_wired; /* Number of wired pages. */ + uint64_t _kvo_qspare[4]; uint32_t kvo_swapped; /* Number of swapped pages */ uint32_t kvo_flags; uint32_t _kvo_ispare[6]; diff --git a/sys/tests/ktest.h b/sys/tests/ktest.h index c767aa31e8e5..75d7a75e2fff 100644 --- a/sys/tests/ktest.h +++ b/sys/tests/ktest.h @@ -57,6 +57,8 @@ struct ktest_test_info { ktest_parse_t parse; }; +#define KTEST_FUNC(X) static int __ktest_##X(struct ktest_test_context *ctx) + struct ktest_module_info { const char *name; const struct ktest_test_info *tests; @@ -64,6 +66,8 @@ struct ktest_module_info { void *module_ptr; }; +#define KTEST_INFO(X) { "test_" #X, "Test " #X, __ktest_##X, NULL } + int ktest_default_modevent(module_t mod, int type, void *arg); bool ktest_start_msg(struct ktest_test_context *ctx); @@ -84,6 +88,9 @@ void ktest_end_msg(struct ktest_test_context *ctx); #define KTEST_LOG(_ctx, _fmt, ...) \ KTEST_LOG_LEVEL(_ctx, LOG_DEBUG, _fmt, ## __VA_ARGS__) +#define KTEST_ERR(_ctx, _fmt, ...) \ + KTEST_LOG_LEVEL(_ctx, LOG_ERR, _fmt, ## __VA_ARGS__) + #define KTEST_MAX_BUF 512 #define KTEST_MODULE_DECLARE(_n, _t) \ @@ -104,6 +111,9 @@ MODULE_VERSION(ktest_##_n, 1); \ MODULE_DEPEND(ktest_##_n, ktestmod, 1, 1, 1); \ MODULE_DEPEND(ktest_##_n, netlink, 1, 1, 1); \ +#define KTEST_MODULE_DEPEND(_n, _d) \ +MODULE_DEPEND(ktest_##_n, _d, 1, 1, 1); \ + #endif /* _KERNEL */ /* genetlink definitions */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 970536a13aa5..f47cfd08f75a 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -653,8 +653,8 @@ done: for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2: blkno %d newblks %jd != i_db %jd", - i, (intmax_t)newblks[UFS_NDADDR + level], - (intmax_t)DIP(ip, i_ib[level])); + i, (intmax_t)newblks[i], + (intmax_t)DIP(ip, i_db[i])); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 679b2e20e88b..b80b5cc781f7 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -4009,21 +4009,15 @@ restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). - * - * To avoid races we run the iterator with the keg lock held, but that - * means that we cannot allow the vm_domainset layer to sleep. Thus, - * clear M_WAITOK and handle low memory conditions locally. 
*/ rr = rdomain == UMA_ANYDOMAIN; + aflags = flags; if (rr) { - aflags = (flags & ~M_WAITOK) | M_NOWAIT; if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags) != 0) return (NULL); - } else { - aflags = flags; + } else domain = rdomain; - } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); @@ -4053,13 +4047,8 @@ restart: if ((flags & M_WAITOK) == 0) break; vm_wait_domain(domain); - } else if (vm_domainset_iter_policy(&di, &domain) != 0) { - if ((flags & M_WAITOK) != 0) { - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); - goto restart; - } + } else if (vm_domainset_iter_policy(&di, &domain) != 0) break; - } } /* @@ -5245,7 +5234,7 @@ uma_prealloc(uma_zone_t zone, int items) KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { - aflags = M_NOWAIT; + aflags = M_WAITOK; if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags) != 0) panic("%s: Domainset is empty", __func__); @@ -5266,7 +5255,8 @@ uma_prealloc(uma_zone_t zone, int items) break; } if (vm_domainset_iter_policy(&di, &domain) != 0) - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); + panic("%s: Cannot allocate from any domain", + __func__); } } } diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 9fa17da954f7..c25ed0cc2267 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -113,7 +113,6 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain) int d; d = di->di_offset % di->di_domain->ds_cnt; - *di->di_iter = d; *domain = di->di_domain->ds_order[d]; } @@ -260,9 +259,14 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, * are immutable and unsynchronized. Updates can race but pointer * loads are assumed to be atomic. */ - if (obj != NULL && obj->domain.dr_policy != NULL) + if (obj != NULL && obj->domain.dr_policy != NULL) { + /* + * This write lock protects non-atomic increments of the + * iterator index in vm_domainset_iter_rr(). + */ + VM_OBJECT_ASSERT_WLOCKED(obj); dr = &obj->domain; - else + } else dr = &curthread->td_domain; vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex); diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index e0f1807a1b32..18d789c59281 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -441,19 +441,16 @@ vm_thread_kstack_arena_release(void *arena, vmem_addr_t addr, vmem_size_t size) * Create the kernel stack for a new thread. */ static vm_offset_t -vm_thread_stack_create(struct domainset *ds, int pages) +vm_thread_stack_create(struct domainset *ds, int pages, int flags) { vm_page_t ma[KSTACK_MAX_PAGES]; struct vm_domainset_iter di; - int req = VM_ALLOC_NORMAL; - vm_object_t obj; + int req; vm_offset_t ks; int domain, i; - obj = vm_thread_kstack_size_to_obj(pages); - if (vm_ndomains > 1) - obj->domain.dr_policy = ds; - vm_domainset_iter_page_init(&di, obj, 0, &domain, &req); + vm_domainset_iter_policy_init(&di, ds, &domain, &flags); + req = malloc2vm_flags(flags); do { /* * Get a kernel virtual address for this thread's kstack. 
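The uma_core.c hunks above and the vm_glue.c rework of vm_thread_stack_create() converge on one vm_domainset iterator idiom: initialize the iterator from a policy, attempt the allocation once per domain, and stop when the iterator is exhausted instead of special-casing M_WAITOK locally. A condensed sketch of that loop shape follows; try_alloc_from() is an assumed placeholder for the per-domain attempt, while the iterator calls are the interfaces used in the hunks above:

/*
 * Sketch of the domainset-iterator allocation loop (kernel context
 * assumed; the real iterator lives in sys/vm/vm_domainset.c).
 */
static void *
alloc_with_policy(struct domainset_ref *dr, int flags)
{
	struct vm_domainset_iter di;
	void *res;
	int domain;

	if (vm_domainset_iter_policy_ref_init(&di, dr, &domain, &flags) != 0)
		return (NULL);			/* empty domain set */
	do {
		/* try_alloc_from() stands in for the real allocation. */
		res = try_alloc_from(domain, flags);
	} while (res == NULL &&
	    vm_domainset_iter_policy(&di, &domain) == 0);
	return (res);
}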
@@ -480,7 +477,7 @@ vm_thread_stack_create(struct domainset *ds, int pages) vm_page_valid(ma[i]); pmap_qenter(ks, ma, pages); return (ks); - } while (vm_domainset_iter_page(&di, obj, &domain, NULL) == 0); + } while (vm_domainset_iter_policy(&di, &domain) == 0); return (0); } @@ -532,15 +529,9 @@ vm_thread_new(struct thread *td, int pages) ks = 0; if (pages == kstack_pages && kstack_cache != NULL) ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT); - - /* - * Ensure that kstack objects can draw pages from any memory - * domain. Otherwise a local memory shortage can block a process - * swap-in. - */ if (ks == 0) ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)), - pages); + pages, M_NOWAIT); if (ks == 0) return (0); @@ -660,7 +651,8 @@ kstack_import(void *arg, void **store, int cnt, int domain, int flags) ds = DOMAINSET_PREF(domain); for (i = 0; i < cnt; i++) { - store[i] = (void *)vm_thread_stack_create(ds, kstack_pages); + store[i] = (void *)vm_thread_stack_create(ds, kstack_pages, + flags); if (store[i] == NULL) break; } diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index fef28bb883e4..fee50f49c844 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -96,7 +96,7 @@ struct vmmeter __read_mostly vm_cnt = { u_long __exclusive_cache_line vm_user_wire_count; static void -vmcounter_startup(void) +vmcounter_startup(void *dummy __unused) { counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 6d9ea8bf9d93..5b4517d2bf0c 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -2522,15 +2522,13 @@ vm_object_list_handler(struct sysctl_req *req, bool swap_only) continue; } mtx_unlock(&vm_object_list_mtx); + + memset(kvo, 0, sizeof(*kvo)); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = atomic_load_int(&obj->shadow_count); kvo->kvo_memattr = obj->memattr; - kvo->kvo_active = 0; - kvo->kvo_inactive = 0; - kvo->kvo_laundry = 0; - kvo->kvo_flags = 0; if (!swap_only) { vm_page_iter_init(&pages, obj); VM_RADIX_FOREACH(m, &pages) { @@ -2549,12 +2547,12 @@ vm_object_list_handler(struct sysctl_req *req, bool swap_only) kvo->kvo_inactive++; else if (vm_page_in_laundry(m)) kvo->kvo_laundry++; + + if (vm_page_wired(m)) + kvo->kvo_wired++; } } - kvo->kvo_vn_fileid = 0; - kvo->kvo_vn_fsid = 0; - kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; vp = NULL; diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 3f1be78342c9..418a9cff8abf 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -120,7 +120,7 @@ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); -static void vm_pageout_init(void); +static void vm_pageout_init(void *); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, @@ -2333,7 +2333,7 @@ vm_pageout_init_domain(int domain) } static void -vm_pageout_init(void) +vm_pageout_init(void *dummy __unused) { u_long freecount; int i; diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c index 4b40f343ac90..735efe307215 100644 --- a/sys/x86/x86/mca.c +++ b/sys/x86/x86/mca.c @@ -46,9 +46,11 @@ #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/sbuf.h> #include <sys/sched.h> #include <sys/smp.h> #include <sys/sysctl.h> +#include <sys/syslog.h> #include <sys/systm.h> #include <sys/taskqueue.h> #include <machine/intr_machdep.h> @@ -135,6 +137,11 @@ 
SYSCTL_INT(_hw_mca, OID_AUTO, fake_bank, CTLFLAG_RW, "Bank to use for artificial MCAs (testing purpose only)"); #endif +static bool mca_uselog = false; +SYSCTL_BOOL(_hw_mca, OID_AUTO, uselog, CTLFLAG_RWTUN, &mca_uselog, 0, + "Should the system send non-fatal machine check errors to the log " + "(instead of the console)?"); + static STAILQ_HEAD(, mca_internal) mca_freelist; static int mca_freecount; static STAILQ_HEAD(, mca_internal) mca_records; @@ -147,12 +154,40 @@ static struct timeout_task mca_scan_task; static struct mtx mca_lock; static bool mca_startup_done = false; -/* Statistics on number of MCA events by type, updated atomically. */ +/* Static buffer to compose messages while in an interrupt context. */ +static char mca_msg_buf[1024]; +static struct mtx mca_msg_buf_lock; + +/* Statistics on number of MCA events by type, updated with the mca_lock. */ static uint64_t mca_stats[MCA_T_COUNT]; SYSCTL_OPAQUE(_hw_mca, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_SKIP, mca_stats, MCA_T_COUNT * sizeof(mca_stats[0]), "S", "Array of MCA events by type"); +/* Variables to track and control message rate limiting. */ +static struct timeval mca_last_log_time; +static struct timeval mca_log_interval; +static int mca_log_skipped; + +static int +sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS) +{ + int error; + u_int val; + + val = mca_log_interval.tv_sec; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + mca_log_interval.tv_sec = val; + return (0); +} +SYSCTL_PROC(_hw_mca, OID_AUTO, log_interval, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_log_interval, 0, + sysctl_mca_log_interval, "IU", + "Minimum number of seconds between logging correctable MCAs" + " (0 = no limit)"); + static unsigned int mca_ia32_ctl_reg(int bank) { @@ -448,98 +483,111 @@ mca_mute(const struct mca_record *rec) /* Dump details about a single machine check. */ static void -mca_log(const struct mca_record *rec) +mca_log(enum scan_mode mode, const struct mca_record *rec, bool fatal) { + int error, numskipped; uint16_t mca_error; enum mca_stat_types event_type; + struct sbuf sb; + bool uncor, using_shared_buf; if (mca_mute(rec)) return; - if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 && - (!tes_supported(rec->mr_mcg_cap) || + uncor = (rec->mr_status & MC_STATUS_UC) != 0; + + if (!log_corrected && !uncor && (!tes_supported(rec->mr_mcg_cap) || ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2)) return; - printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, + /* Try to use an allocated buffer when not in an interrupt context. 
*/ + if (mode == POLLED && sbuf_new(&sb, NULL, 512, SBUF_AUTOEXTEND) != NULL) + using_shared_buf = false; + else { + using_shared_buf = true; + mtx_lock_spin(&mca_msg_buf_lock); + sbuf_new(&sb, mca_msg_buf, sizeof(mca_msg_buf), SBUF_FIXEDLEN); + } + + sbuf_printf(&sb, "MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + sbuf_printf(&sb, "MCA: Global Cap 0x%016llx, Status 0x%016llx\n", (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); - printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, - rec->mr_cpu_id, rec->mr_apic_id); - printf("MCA: CPU %d ", rec->mr_cpu); + sbuf_printf(&sb, "MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", + cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id); + sbuf_printf(&sb, "MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) - printf("UNCOR "); + sbuf_printf(&sb, "UNCOR "); else { - printf("COR "); + sbuf_printf(&sb, "COR "); if (cmci_supported(rec->mr_mcg_cap)) - printf("(%lld) ", ((long long)rec->mr_status & + sbuf_printf(&sb, "(%lld) ", ((long long)rec->mr_status & MC_STATUS_COR_COUNT) >> 38); if (tes_supported(rec->mr_mcg_cap)) { switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) { case 0x1: - printf("(Green) "); + sbuf_printf(&sb, "(Green) "); break; case 0x2: - printf("(Yellow) "); + sbuf_printf(&sb, "(Yellow) "); break; } } } if (rec->mr_status & MC_STATUS_EN) - printf("EN "); + sbuf_printf(&sb, "EN "); if (rec->mr_status & MC_STATUS_PCC) - printf("PCC "); + sbuf_printf(&sb, "PCC "); if (ser_supported(rec->mr_mcg_cap)) { if (rec->mr_status & MC_STATUS_S) - printf("S "); + sbuf_printf(&sb, "S "); if (rec->mr_status & MC_STATUS_AR) - printf("AR "); + sbuf_printf(&sb, "AR "); } if (rec->mr_status & MC_STATUS_OVER) - printf("OVER "); + sbuf_printf(&sb, "OVER "); mca_error = rec->mr_status & MC_STATUS_MCA_ERROR; event_type = MCA_T_COUNT; switch (mca_error) { /* Simple error codes. 
@@ -548,7 +596,8 @@ mca_log(const struct mca_record *rec)
 		break;
 	default:
 		if ((mca_error & 0xfc00) == 0x0400) {
-			printf("internal error %x", mca_error & 0x03ff);
+			sbuf_printf(&sb, "internal error %x",
+			    mca_error & 0x03ff);
 			event_type = MCA_T_INTERNAL;
 			break;
 		}
@@ -557,14 +606,16 @@ mca_log(const struct mca_record *rec)
 
 		/* Memory hierarchy error. */
 		if ((mca_error & 0xeffc) == 0x000c) {
-			printf("%s memory error", mca_error_level(mca_error));
+			sbuf_printf(&sb, "%s memory error",
+			    mca_error_level(mca_error));
 			event_type = MCA_T_MEMORY;
 			break;
 		}
 
 		/* TLB error. */
 		if ((mca_error & 0xeff0) == 0x0010) {
-			printf("%sTLB %s error", mca_error_ttype(mca_error),
+			sbuf_printf(&sb, "%sTLB %s error",
+			    mca_error_ttype(mca_error),
 			    mca_error_level(mca_error));
 			event_type = MCA_T_TLB;
 			break;
@@ -572,19 +623,19 @@ mca_log(const struct mca_record *rec)
 
 		/* Memory controller error. */
 		if ((mca_error & 0xef80) == 0x0080) {
-			printf("%s channel ", mca_error_mmtype(mca_error,
-			    &event_type));
+			sbuf_printf(&sb, "%s channel ",
+			    mca_error_mmtype(mca_error, &event_type));
 			if ((mca_error & 0x000f) != 0x000f)
-				printf("%d", mca_error & 0x000f);
+				sbuf_printf(&sb, "%d", mca_error & 0x000f);
 			else
-				printf("??");
-			printf(" memory error");
+				sbuf_printf(&sb, "??");
+			sbuf_printf(&sb, " memory error");
 			break;
 		}
 
 		/* Cache error. */
 		if ((mca_error & 0xef00) == 0x0100) {
-			printf("%sCACHE %s %s error",
+			sbuf_printf(&sb, "%sCACHE %s %s error",
 			    mca_error_ttype(mca_error),
 			    mca_error_level(mca_error),
 			    mca_error_request(mca_error));
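The compound-error cases above all follow one shape: mask the 16-bit MCA error code and compare against a pattern. A small self-contained classifier using the same mask/pattern pairs is sketched below; mca_error_class() is a hypothetical helper for illustration, and the simple codes (0x0000-0x0006 and so on) that the driver handles in the switch are deliberately left out.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the mask/pattern tests above. */
static const char *
mca_error_class(uint16_t mca_error)
{
	if ((mca_error & 0xeffc) == 0x000c)
		return ("memory hierarchy");
	if ((mca_error & 0xeff0) == 0x0010)
		return ("TLB");
	if ((mca_error & 0xef80) == 0x0080)
		return ("memory controller");
	if ((mca_error & 0xef00) == 0x0100)
		return ("cache");
	if ((mca_error & 0xef80) == 0x0280)
		return ("extended memory");
	if ((mca_error & 0xe800) == 0x0800)
		return ("bus/interconnect");
	return ("unknown or simple code");
}

int
main(void)
{
	/* 0x0151 matches the cache pattern; 0x0800 the bus pattern. */
	printf("0x0151: %s\n", mca_error_class(0x0151));
	printf("0x0800: %s\n", mca_error_class(0x0800));
	return (0);
}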
@@ -594,77 +645,129 @@ mca_log(const struct mca_record *rec)
 
 		/* Extended memory error. */
 		if ((mca_error & 0xef80) == 0x0280) {
-			printf("%s channel ", mca_error_mmtype(mca_error,
-			    &event_type));
+			sbuf_printf(&sb, "%s channel ",
+			    mca_error_mmtype(mca_error, &event_type));
 			if ((mca_error & 0x000f) != 0x000f)
-				printf("%d", mca_error & 0x000f);
+				sbuf_printf(&sb, "%d", mca_error & 0x000f);
 			else
-				printf("??");
-			printf(" extended memory error");
+				sbuf_printf(&sb, "??");
+			sbuf_printf(&sb, " extended memory error");
 			break;
 		}
 
 		/* Bus and/or Interconnect error. */
 		if ((mca_error & 0xe800) == 0x0800) {
-			printf("BUS%s ", mca_error_level(mca_error));
+			sbuf_printf(&sb, "BUS%s ", mca_error_level(mca_error));
 			event_type = MCA_T_BUS;
 			switch ((mca_error & 0x0600) >> 9) {
 			case 0:
-				printf("Source");
+				sbuf_printf(&sb, "Source");
 				break;
 			case 1:
-				printf("Responder");
+				sbuf_printf(&sb, "Responder");
 				break;
 			case 2:
-				printf("Observer");
+				sbuf_printf(&sb, "Observer");
 				break;
 			default:
-				printf("???");
+				sbuf_printf(&sb, "???");
 				break;
 			}
-			printf(" %s ", mca_error_request(mca_error));
+			sbuf_printf(&sb, " %s ", mca_error_request(mca_error));
 			switch ((mca_error & 0x000c) >> 2) {
 			case 0:
-				printf("Memory");
+				sbuf_printf(&sb, "Memory");
 				break;
 			case 2:
-				printf("I/O");
+				sbuf_printf(&sb, "I/O");
 				break;
 			case 3:
-				printf("Other");
+				sbuf_printf(&sb, "Other");
 				break;
 			default:
-				printf("???");
+				sbuf_printf(&sb, "???");
 				break;
 			}
 			if (mca_error & 0x0100)
-				printf(" timed out");
+				sbuf_printf(&sb, " timed out");
 			break;
 		}
 
-		printf("unknown error %x", mca_error);
+		sbuf_printf(&sb, "unknown error %x", mca_error);
 		event_type = MCA_T_UNKNOWN;
 		break;
 	}
-	printf("\n");
+	sbuf_printf(&sb, "\n");
 	if (rec->mr_status & MC_STATUS_ADDRV) {
-		printf("MCA: Address 0x%llx", (long long)rec->mr_addr);
+		sbuf_printf(&sb, "MCA: Address 0x%llx",
+		    (long long)rec->mr_addr);
 		if (ser_supported(rec->mr_mcg_cap) &&
 		    (rec->mr_status & MC_STATUS_MISCV)) {
-			printf(" (Mode: %s, LSB: %d)",
+			sbuf_printf(&sb, " (Mode: %s, LSB: %d)",
 			    mca_addres_mode(rec->mr_misc),
 			    (int)(rec->mr_misc & MC_MISC_RA_LSB));
 		}
-		printf("\n");
+		sbuf_printf(&sb, "\n");
 	}
 	if (rec->mr_status & MC_STATUS_MISCV)
-		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
+		sbuf_printf(&sb, "MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
+
 	if (event_type < 0 || event_type >= MCA_T_COUNT) {
 		KASSERT(0, ("%s: invalid event type (%d)", __func__,
 		    event_type));
 		event_type = MCA_T_UNKNOWN;
 	}
-	atomic_add_64(&mca_stats[event_type], 1);
+	numskipped = 0;
+	if (!fatal && !uncor) {
+		/*
+		 * Update statistics and check the rate limit for
+		 * correctable errors. The rate limit is only applied
+		 * after the system records a reasonable number of errors
+		 * of the same type. The goal is to reduce the impact of
+		 * the system seeing and attempting to log a burst of
+		 * similar errors, which (especially when printed to the
+		 * console) can be expensive.
+		 */
+		mtx_lock_spin(&mca_lock);
+		mca_stats[event_type]++;
+		if (mca_log_interval.tv_sec > 0 && mca_stats[event_type] > 50 &&
+		    ratecheck(&mca_last_log_time, &mca_log_interval) == 0) {
+			mca_log_skipped++;
+			mtx_unlock_spin(&mca_lock);
+			goto done;
+		}
+		numskipped = mca_log_skipped;
+		mca_log_skipped = 0;
+		mtx_unlock_spin(&mca_lock);
+	}
+
+	error = sbuf_finish(&sb);
+	if (fatal || !mca_uselog) {
+		if (numskipped > 0)
+			printf("MCA: %d events skipped due to rate limit\n",
+			    numskipped);
+		if (error)
+			printf("MCA: error logging message (sbuf error %d)\n",
+			    error);
+		else
+			sbuf_putbuf(&sb);
+	} else {
+		if (numskipped > 0)
+			log(LOG_ERR,
+			    "MCA: %d events skipped due to rate limit\n",
+			    numskipped);
+		if (error)
+			log(LOG_ERR,
+			    "MCA: error logging message (sbuf error %d)\n",
+			    error);
+		else
+			log(uncor ? LOG_CRIT : LOG_ERR, "%s", sbuf_data(&sb));
	}
+
+done:
+	sbuf_delete(&sb);
+	if (using_shared_buf)
+		mtx_unlock_spin(&mca_msg_buf_lock);
 }
 
 static bool
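Rate limiting here hinges on ratecheck(9): once more than 50 events of a type have been recorded, a correctable error is logged only if at least hw.mca.log_interval seconds have elapsed, and skipped messages are counted so the next report can mention them. A rough userland analogue is sketched below; my_ratecheck() is a simplified stand-in for the kernel routine, not its actual implementation.

#include <sys/time.h>
#include <stdio.h>

static int
my_ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
	struct timeval now, delta;

	gettimeofday(&now, NULL);
	timersub(&now, lasttime, &delta);
	if (timercmp(&delta, mininterval, <))
		return (0);	/* too soon: caller should suppress */
	*lasttime = now;
	return (1);
}

int
main(void)
{
	struct timeval last = { 0, 0 };
	const struct timeval interval = { 1, 0 };	/* 1 second */
	int i, skipped = 0;

	/* A tight burst: only the first event in each interval is logged. */
	for (i = 0; i < 1000000; i++) {
		if (my_ratecheck(&last, &interval) == 0) {
			skipped++;	/* mirrors mca_log_skipped */
			continue;
		}
		printf("event logged, %d skipped since last\n", skipped);
		skipped = 0;
	}
	return (0);
}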
@@ -825,7 +928,7 @@ mca_record_entry(enum scan_mode mode, const struct mca_record *record)
 	if (rec == NULL) {
 		mtx_unlock_spin(&mca_lock);
 		printf("MCA: Unable to allocate space for an event.\n");
-		mca_log(record);
+		mca_log(mode, record, false);
 		return;
 	}
 	STAILQ_REMOVE_HEAD(&mca_freelist, link);
@@ -982,7 +1085,7 @@ mca_scan(enum scan_mode mode, bool *recoverablep)
 		if (*recoverablep)
 			mca_record_entry(mode, &rec);
 		else
-			mca_log(&rec);
+			mca_log(mode, &rec, true);
 	}
 
 #ifdef DEV_APIC
@@ -1066,7 +1169,7 @@ mca_process_records(enum scan_mode mode)
 	mtx_unlock_spin(&mca_lock);
 
 	STAILQ_FOREACH(mca, &tmplist, link)
-		mca_log(&mca->rec);
+		mca_log(mode, &mca->rec, false);
 
 	mtx_lock_spin(&mca_lock);
 	while ((mca = STAILQ_FIRST(&tmplist)) != NULL) {
@@ -1231,6 +1334,7 @@ mca_setup(uint64_t mcg_cap)
 
 	mca_banks = mcg_cap & MCG_CAP_COUNT;
 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
+	mtx_init(&mca_msg_buf_lock, "mca_msg_buf", NULL, MTX_SPIN);
 	STAILQ_INIT(&mca_records);
 	STAILQ_INIT(&mca_pending);
 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index a1a5d8140b14..3b873d9dae73 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -650,7 +650,7 @@ retry:
 #endif /* SMP */
 
 static void
-init_TSC_tc(void)
+init_TSC_tc(void *dummy __unused)
 {
 	uint64_t max_freq;
 	int shift;
diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c
index 994dc3e0804c..43a253cc2860 100644
--- a/sys/x86/xen/xen_apic.c
+++ b/sys/x86/xen/xen_apic.c
@@ -330,7 +330,7 @@ xen_cpu_ipi_init(int cpu)
 }
 
 static void
-xen_setup_cpus(void)
+xen_setup_cpus(void *dummy __unused)
 {
 	uint32_t regs[4];
 	int i;
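The tsc.c and xen_apic.c hunks give SYSINIT-style handlers the argument-taking signature the dispatcher expects, rather than relying on a cast from void (*)(void); calling a function through a pointer of an incompatible type is undefined behavior. The sketch below illustrates the general principle with a hypothetical initfunc_t type and demo functions; it is not the kernel's actual SYSINIT machinery.

#include <stdio.h>

/* Callbacks are invoked with one argument, so declare them that way. */
typedef void (*initfunc_t)(void *);

/* Correct: the signature matches the registration type exactly. */
static void
init_tsc_demo(void *dummy)
{
	(void)dummy;	/* unused, like the kernel's __unused marker */
	printf("initialized\n");
}

static void
run_init(initfunc_t fn, void *arg)
{
	fn(arg);	/* always called with an argument, as declared */
}

int
main(void)
{
	/* No cast needed; the compiler can check the pointer type. */
	run_init(init_tsc_demo, NULL);
	return (0);
}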