diff options
Diffstat (limited to 'sys')
73 files changed, 840 insertions, 253 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 66d8991d36e8..ad67510fecf3 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -237,7 +237,7 @@ extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -362,6 +362,7 @@ enum vcpu_state { }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); +int vcpu_set_state_all(struct vm *vm, enum vcpu_state state); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c index c7b75767680a..6b2296de049c 100644 --- a/sys/amd64/pt/pt.c +++ b/sys/amd64/pt/pt.c @@ -42,15 +42,15 @@ */ #include <sys/systm.h> +#include <sys/bus.h> #include <sys/hwt.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> -#include <sys/sdt.h> #include <sys/smp.h> -#include <sys/taskqueue.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -94,12 +94,7 @@ MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); -SDT_PROVIDER_DEFINE(pt); -SDT_PROBE_DEFINE(pt, , , topa__intr); - -TASKQUEUE_FAST_DEFINE_THREAD(pt); - -static void pt_send_buffer_record(void *arg, int pending __unused); +static void pt_send_buffer_record(void *arg); static int pt_topa_intr(struct trapframe *tf); /* @@ -122,29 +117,24 @@ struct pt_buffer { size_t size; struct mtx lock; /* Lock for fields below. */ vm_offset_t offset; - uint64_t wrap_count; - int curpage; }; struct pt_ctx { int id; struct pt_buffer buf; /* ToPA buffer metadata */ - struct task task; /* ToPA buffer notification task */ struct hwt_context *hwt_ctx; uint8_t *save_area; /* PT XSAVE area */ }; /* PT tracing contexts used for CPU mode. */ static struct pt_ctx *pt_pcpu_ctx; -enum pt_cpu_state { - PT_DISABLED = 0, - PT_STOPPED, - PT_ACTIVE -}; +enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE }; static struct pt_cpu { struct pt_ctx *ctx; /* active PT tracing context */ enum pt_cpu_state state; /* used as part of trace stop protocol */ + void *swi_cookie; /* Software interrupt handler context */ + int in_pcint_handler; } *pt_pcpu; /* @@ -199,31 +189,28 @@ static __inline void pt_update_buffer(struct pt_buffer *buf) { uint64_t reg; - int curpage; + uint64_t offset; /* Update buffer offset. */ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); - curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; - mtx_lock_spin(&buf->lock); - /* Check if the output wrapped. */ - if (buf->curpage > curpage) - buf->wrap_count++; - buf->curpage = curpage; - buf->offset = reg >> 32; - mtx_unlock_spin(&buf->lock); - - dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, - buf->wrap_count, buf->curpage, buf->offset); + offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE; + offset += (reg >> 32); + + atomic_store_rel_64(&buf->offset, offset); } static __inline void pt_fill_buffer_record(int id, struct pt_buffer *buf, struct hwt_record_entry *rec) { + vm_offset_t offset; + + offset = atomic_load_acq_64(&buf->offset); + rec->record_type = HWT_RECORD_BUFFER; rec->buf_id = id; - rec->curpage = buf->curpage; - rec->offset = buf->offset + (buf->wrap_count * buf->size); + rec->curpage = offset / PAGE_SIZE; + rec->offset = offset & PAGE_MASK; } /* @@ -273,9 +260,9 @@ pt_cpu_start(void *dummy) MPASS(cpu->ctx != NULL); dprintf("%s: curcpu %d\n", __func__, curcpu); + pt_cpu_set_state(curcpu, PT_ACTIVE); load_cr4(rcr4() | CR4_XSAVE); wrmsr(MSR_IA32_RTIT_STATUS, 0); - pt_cpu_set_state(curcpu, PT_ACTIVE); pt_cpu_toggle_local(cpu->ctx->save_area, true); } @@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy) struct pt_cpu *cpu; struct pt_ctx *ctx; - /* Shutdown may occur before PT gets properly configured. */ - if (pt_cpu_get_state(curcpu) == PT_DISABLED) - return; - cpu = &pt_pcpu[curcpu]; ctx = cpu->ctx; - MPASS(ctx != NULL); - dprintf("%s: curcpu %d\n", __func__, curcpu); - pt_cpu_set_state(curcpu, PT_STOPPED); + dprintf("%s: curcpu %d\n", __func__, curcpu); + /* Shutdown may occur before PT gets properly configured. */ + if (ctx == NULL) { + dprintf("%s: missing context on cpu %d; bailing\n", __func__, + curcpu); + return; + } pt_cpu_toggle_local(cpu->ctx->save_area, false); pt_update_buffer(&ctx->buf); } @@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) return (ENOMEM); dprintf("%s: preparing ToPA buffer\n", __func__); if (pt_topa_prepare(pt_ctx, vm) != 0) { - dprintf("%s: failed to prepare ToPA buffer\n", __func__); free(pt_ctx->save_area, M_PT); return (ENOMEM); } pt_ctx->id = ctx_id; - TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); return (0); } @@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx) if (pt_ctx->save_area != NULL) free(pt_ctx->save_area, M_PT); memset(pt_ctx, 0, sizeof(*pt_ctx)); - pt_ctx->buf.topa_hw = NULL; } /* @@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) XSTATE_XCOMP_BV_COMPACT; pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; pt_pcpu[cpu_id].ctx = pt_ctx; - pt_cpu_set_state(cpu_id, PT_STOPPED); return (0); } @@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) if (ctx->mode == HWT_MODE_CPU) return; - KASSERT(curcpu == cpu_id, ("%s: attempting to disable PT on another cpu", __func__)); + + cpu = &pt_pcpu[cpu_id]; + + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__, + cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + pt_cpu_stop(NULL); CPU_CLR(cpu_id, &ctx->cpu_map); - cpu = &pt_pcpu[cpu_id]; cpu->ctx = NULL; } @@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) static int pt_backend_enable_smp(struct hwt_context *ctx) { - dprintf("%s\n", __func__); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); if (ctx->mode == HWT_MODE_CPU && atomic_swap_32(&cpu_mode_ctr, 1) != 0) return (-1); - KASSERT(ctx->mode == HWT_MODE_CPU, - ("%s: should only be used for CPU mode", __func__)); smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); return (0); @@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx) static int pt_backend_disable_smp(struct hwt_context *ctx) { + struct pt_cpu *cpu; dprintf("%s\n", __func__); if (ctx->mode == HWT_MODE_CPU && @@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx) dprintf("%s: empty cpu map\n", __func__); return (-1); } + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + cpu = &pt_pcpu[cpu_id]; + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", + __func__, cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + } smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); return (0); @@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx) int error; dprintf("%s\n", __func__); - if (ctx->mode == HWT_MODE_CPU) { - TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { - error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], - hwt_cpu->vm, hwt_cpu->cpu_id); - if (error) - return (error); - } + if (ctx->mode != HWT_MODE_CPU) + return (0); + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm, + hwt_cpu->cpu_id); + if (error) + return (error); } return (0); @@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx) pt_deinit_ctx(pt_ctx); } } else { - CPU_FOREACH(cpu_id) { - if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + if (pt_pcpu[cpu_id].ctx == NULL) continue; - if (pt_pcpu[cpu_id].ctx != NULL) { - KASSERT(pt_pcpu[cpu_id].ctx == - &pt_pcpu_ctx[cpu_id], - ("%s: CPU mode tracing with non-cpu mode PT" - "context active", - __func__)); - pt_pcpu[cpu_id].ctx = NULL; - } - pt_ctx = &pt_pcpu_ctx[cpu_id]; - pt_deinit_ctx(pt_ctx); - memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_deinit_ctx(pt_pcpu[cpu_id].ctx); + pt_pcpu[cpu_id].ctx = NULL; + atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0); } } @@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, uint64_t *data) { struct pt_buffer *buf; + uint64_t offset; if (vm->ctx->mode == HWT_MODE_THREAD) buf = &((struct pt_ctx *)vm->thr->private)->buf; else buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; - mtx_lock_spin(&buf->lock); - *curpage = buf->curpage; - *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); - mtx_unlock_spin(&buf->lock); + offset = atomic_load_acq_64(&buf->offset); + *curpage = offset / PAGE_SIZE; + *curpage_offset = offset & PAGE_MASK; return (0); } @@ -762,15 +757,13 @@ static struct hwt_backend backend = { * Used as a taskqueue routine from the ToPA interrupt handler. */ static void -pt_send_buffer_record(void *arg, int pending __unused) +pt_send_buffer_record(void *arg) { + struct pt_cpu *cpu = (struct pt_cpu *)arg; struct hwt_record_entry record; - struct pt_ctx *ctx = (struct pt_ctx *)arg; - /* Prepare buffer record. */ - mtx_lock_spin(&ctx->buf.lock); + struct pt_ctx *ctx = cpu->ctx; pt_fill_buffer_record(ctx->id, &ctx->buf, &record); - mtx_unlock_spin(&ctx->buf.lock); hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); } static void @@ -795,36 +788,40 @@ static int pt_topa_intr(struct trapframe *tf) { struct pt_buffer *buf; + struct pt_cpu *cpu; struct pt_ctx *ctx; uint64_t reg; - SDT_PROBE0(pt, , , topa__intr); - - if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { - return (0); - } + cpu = &pt_pcpu[curcpu]; reg = rdmsr(MSR_IA_GLOBAL_STATUS); if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { - /* ACK spurious or leftover interrupt. */ pt_topa_status_clear(); + return (0); + } + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { return (1); } + atomic_set_int(&cpu->in_pcint_handler, 1); - ctx = pt_pcpu[curcpu].ctx; + ctx = cpu->ctx; + KASSERT(ctx != NULL, + ("%s: cpu %d: ToPA PMI interrupt without an active context", + __func__, curcpu)); buf = &ctx->buf; KASSERT(buf->topa_hw != NULL, - ("%s: ToPA PMI interrupt with invalid buffer", __func__)); - + ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__, + curcpu)); pt_cpu_toggle_local(ctx->save_area, false); pt_update_buffer(buf); pt_topa_status_clear(); - taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, - TASKQUEUE_FAIL_IF_PENDING); if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + swi_sched(cpu->swi_cookie, SWI_FROMNMI); pt_cpu_toggle_local(ctx->save_area, true); lapic_reenable_pcint(); } + atomic_set_int(&cpu->in_pcint_handler, 0); return (1); } @@ -839,7 +836,7 @@ static int pt_init(void) { u_int cp[4]; - int error; + int error, i; dprintf("pt: Enumerating part 1\n"); cpuid_count(CPUID_PT_LEAF, 0, cp); @@ -869,20 +866,38 @@ pt_init(void) pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, M_ZERO | M_WAITOK); + for (i = 0; i < mp_ncpus; i++) { + error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record, + &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE, + &pt_pcpu[i].swi_cookie); + if (error != 0) { + dprintf( + "%s: failed to add interrupt handler for cpu: %d\n", + __func__, error); + goto err; + } + } + nmi_register_handler(pt_topa_intr); - if (!lapic_enable_pcint()) { - nmi_remove_handler(pt_topa_intr); - hwt_backend_unregister(&backend); - free(pt_pcpu, M_PT); - free(pt_pcpu_ctx, M_PT); - pt_pcpu = NULL; - pt_pcpu_ctx = NULL; + if (lapic_enable_pcint()) { + initialized = true; + return (0); + } else printf("pt: failed to setup interrupt line\n"); - return (error); +err: + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + if (pt_pcpu[i].swi_cookie != 0) + swi_remove(pt_pcpu[i].swi_cookie); } - initialized = true; + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; - return (0); + return (error); } /* @@ -941,14 +956,24 @@ pt_supported(void) static void pt_deinit(void) { + int i; + struct pt_cpu *cpu; + if (!initialized) return; nmi_remove_handler(pt_topa_intr); lapic_disable_pcint(); hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + cpu = &pt_pcpu[i]; + swi_remove(cpu->swi_cookie); + } + free(pt_pcpu, M_PT); free(pt_pcpu_ctx, M_PT); pt_pcpu = NULL; + pt_pcpu_ctx = NULL; initialized = false; } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 2ac076551165..f7c59847140b 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -562,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void @@ -990,6 +990,54 @@ save_guest_fpustate(struct vcpu *vcpu) static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); +/* + * Invoke the rendezvous function on the specified vcpu if applicable. Return + * true if the rendezvous is finished, false otherwise. + */ +static bool +vm_rendezvous(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + int vcpuid; + + mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED); + KASSERT(vcpu->vm->rendezvous_func != NULL, + ("vm_rendezvous: no rendezvous pending")); + + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, + &vm->active_cpus); + + vcpuid = vcpu->vcpuid; + if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VMM_CTR0(vcpu, "Calling rendezvous func"); + (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VMM_CTR0(vcpu, "Rendezvous completed"); + CPU_ZERO(&vm->rendezvous_req_cpus); + vm->rendezvous_func = NULL; + wakeup(&vm->rendezvous_func); + return (true); + } + return (false); +} + +static void +vcpu_wait_idle(struct vcpu *vcpu) +{ + KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); + + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VMM_CTR1(vcpu, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +} + static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) @@ -1004,13 +1052,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. */ if (from_idle) { - while (vcpu->state != VCPU_IDLE) { - vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); - VMM_CTR1(vcpu, "vcpu state change from %s to " - "idle requested", vcpu_state2str(vcpu->state)); - msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); - } + while (vcpu->state != VCPU_IDLE) + vcpu_wait_idle(vcpu); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -1062,6 +1105,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, return (0); } +/* + * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks + * with vm_smp_rendezvous(). + * + * The complexity here suggests that the rendezvous mechanism needs a rethink. + */ +int +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) +{ + cpuset_t locked; + struct vcpu *vcpu; + int error, i; + uint16_t maxcpus; + + KASSERT(newstate != VCPU_IDLE, + ("vcpu_set_state_all: invalid target state %d", newstate)); + + error = 0; + CPU_ZERO(&locked); + maxcpus = vm->maxcpus; + + mtx_lock(&vm->rendezvous_mtx); +restart: + if (vm->rendezvous_func != NULL) { + /* + * If we have a pending rendezvous, then the initiator may be + * blocked waiting for other vCPUs to execute the callback. The + * current thread may be a vCPU thread so we must not block + * waiting for the initiator, otherwise we get a deadlock. + * Thus, execute the callback on behalf of any idle vCPUs. + */ + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + if (vcpu->state == VCPU_IDLE) { + (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN, + true); + CPU_SET(i, &locked); + } + if (CPU_ISSET(i, &locked)) { + /* + * We can safely execute the callback on this + * vCPU's behalf. + */ + vcpu_unlock(vcpu); + (void)vm_rendezvous(vcpu); + vcpu_lock(vcpu); + } + vcpu_unlock(vcpu); + } + } + + /* + * Now wait for remaining vCPUs to become idle. This may include the + * initiator of a rendezvous that is currently blocked on the rendezvous + * mutex. + */ + CPU_FOREACH_ISCLR(i, &locked) { + if (i >= maxcpus) + break; + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + while (vcpu->state != VCPU_IDLE) { + mtx_unlock(&vm->rendezvous_mtx); + vcpu_wait_idle(vcpu); + vcpu_unlock(vcpu); + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) + goto restart; + vcpu_lock(vcpu); + } + error = vcpu_set_state_locked(vcpu, newstate, true); + vcpu_unlock(vcpu); + if (error != 0) { + /* Roll back state changes. */ + CPU_FOREACH_ISSET(i, &locked) + (void)vcpu_set_state(vcpu, VCPU_IDLE, false); + break; + } + CPU_SET(i, &locked); + } + mtx_unlock(&vm->rendezvous_mtx); + return (error); +} + static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { @@ -1083,36 +1215,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_rendezvous(struct vcpu *vcpu) { - struct vm *vm = vcpu->vm; + struct vm *vm; struct thread *td; - int error, vcpuid; - error = 0; - vcpuid = vcpu->vcpuid; td = curthread; + vm = vcpu->vm; + mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { - /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ - CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); - - if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && - !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { - VMM_CTR0(vcpu, "Calling rendezvous func"); - (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); - CPU_SET(vcpuid, &vm->rendezvous_done_cpus); - } - if (CPU_CMP(&vm->rendezvous_req_cpus, - &vm->rendezvous_done_cpus) == 0) { - VMM_CTR0(vcpu, "Rendezvous completed"); - CPU_ZERO(&vm->rendezvous_req_cpus); - vm->rendezvous_func = NULL; - wakeup(&vm->rendezvous_func); + if (vm_rendezvous(vcpu)) break; - } + VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { + int error; + mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) diff --git a/sys/arm/arm/elf_machdep.c b/sys/arm/arm/elf_machdep.c index ea6437f320ce..881c4fcff475 100644 --- a/sys/arm/arm/elf_machdep.c +++ b/sys/arm/arm/elf_machdep.c @@ -106,7 +106,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_ARM, .compat_3_brand = "FreeBSD", @@ -118,7 +118,7 @@ static Elf32_Brandinfo freebsd_brand_info = { .header_supported= elf32_arm_abi_supported, }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); diff --git a/sys/arm/ti/ti_pruss.c b/sys/arm/ti/ti_pruss.c index 4e9f2022240c..bae1de9f2ddf 100644 --- a/sys/arm/ti/ti_pruss.c +++ b/sys/arm/ti/ti_pruss.c @@ -793,6 +793,7 @@ static const struct filterops ti_pruss_kq_read = { .f_isfd = 1, .f_detach = ti_pruss_irq_kqread_detach, .f_event = ti_pruss_irq_kqevent, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index 84b286a60b38..696a69669a2a 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -177,7 +177,7 @@ DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index aeda689f3b1a..bf52dc0fe916 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -469,9 +469,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index c3587421c176..b44ab866dfe7 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -206,7 +206,8 @@ static struct cdevsw pass_cdevsw = { static const struct filterops passread_filtops = { .f_isfd = 1, .f_detach = passreadfiltdetach, - .f_event = passreadfilt + .f_event = passreadfilt, + .f_copy = knote_triv_copy, }; static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c index 21c78e35dadc..39ce2bcea8f4 100644 --- a/sys/cam/scsi/scsi_target.c +++ b/sys/cam/scsi/scsi_target.c @@ -108,6 +108,7 @@ static const struct filterops targread_filtops = { .f_isfd = 1, .f_detach = targreadfiltdetach, .f_event = targreadfilt, + .f_copy = knote_triv_copy, }; static struct cdevsw targ_cdevsw = { diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 95b212be1306..7ac48786c77b 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -2216,6 +2216,67 @@ linprocfs_dosysvipc_shm(PFS_FILL_ARGS) return (0); } +static int +linprocfs_doinotify(const char *sysctl, PFS_FILL_ARGS) +{ + size_t size; + int error, val; + + if (uio->uio_rw == UIO_READ) { + size = sizeof(val); + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), &val, &size, NULL, 0, 0, 0); + if (error == 0) + sbuf_printf(sb, "%d\n", val); + } else { + char *endp, *newval; + long vall; + + sbuf_trim(sb); + sbuf_finish(sb); + newval = sbuf_data(sb); + vall = strtol(newval, &endp, 10); + if (vall < 0 || vall > INT_MAX || endp == newval || + *endp != '\0') + return (EINVAL); + val = (int)vall; + error = kernel_sysctlbyname(curthread, + __DECONST(void *, sysctl), NULL, NULL, + &val, sizeof(val), 0, 0); + } + return (error); +} + +/* + * Filler function for proc/sys/fs/inotify/max_queued_events + */ +static int +linprocfs_doinotify_max_queued_events(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_queued_events", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_instances + */ +static int +linprocfs_doinotify_max_user_instances(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_instances", + PFS_FILL_ARGNAMES)); +} + +/* + * Filler function for proc/sys/fs/inotify/max_user_watches + */ +static int +linprocfs_doinotify_max_user_watches(PFS_FILL_ARGS) +{ + return (linprocfs_doinotify("vfs.inotify.max_user_watches", + PFS_FILL_ARGNAMES)); +} + /* * Filler function for proc/sys/fs/mqueue/msg_default */ @@ -2313,9 +2374,7 @@ linprocfs_domqueue_queues_max(PFS_FILL_ARGS) static int linprocfs_init(PFS_INIT_ARGS) { - struct pfs_node *root; - struct pfs_node *dir; - struct pfs_node *sys; + struct pfs_node *dir, *fs, *root, *sys; root = pi->pi_root; @@ -2466,10 +2525,18 @@ linprocfs_init(PFS_INIT_ARGS) NULL, PFS_RD); /* /proc/sys/fs/... */ - pfs_create_dir(sys, &dir, "fs", NULL, NULL, NULL, 0); + pfs_create_dir(sys, &fs, "fs", NULL, NULL, NULL, 0); + + pfs_create_dir(fs, &dir, "inotify", NULL, NULL, NULL, 0); + pfs_create_file(dir, NULL, "max_queued_events", + &linprocfs_doinotify_max_queued_events, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_instances", + &linprocfs_doinotify_max_user_instances, NULL, NULL, NULL, PFS_RDWR); + pfs_create_file(dir, NULL, "max_user_watches", + &linprocfs_doinotify_max_user_watches, NULL, NULL, NULL, PFS_RDWR); /* /proc/sys/fs/mqueue/... */ - pfs_create_dir(dir, &dir, "mqueue", NULL, NULL, NULL, 0); + pfs_create_dir(fs, &dir, "mqueue", NULL, NULL, NULL, 0); pfs_create_file(dir, NULL, "msg_default", &linprocfs_domqueue_msg_default, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, NULL, "msgsize_default", diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c index e88791659f1f..fc3ef7c3e90a 100644 --- a/sys/compat/linux/linux_event.c +++ b/sys/compat/linux/linux_event.c @@ -104,7 +104,7 @@ static int epoll_create_common(struct thread *td, int flags) { - return (kern_kqueue(td, flags, NULL)); + return (kern_kqueue(td, flags, false, NULL)); } #ifdef LINUX_LEGACY_SYSCALLS diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 458744a9fec6..ff0f477ea8cc 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1171,12 +1171,14 @@ static const struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/conf/NOTES b/sys/conf/NOTES index a25ee8f6e1af..9944375c3615 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -2801,7 +2801,7 @@ options MAXFILES=999 # Random number generator # Alternative algorithm. -#options RANDOM_FENESTRASX +options RANDOM_FENESTRASX # Allow the CSPRNG algorithm to be loaded as a module. #options RANDOM_LOADABLE # Select this to allow high-rate but potentially expensive diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 0dd2ecd7fd8d..3ddbfcb97184 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -183,6 +183,7 @@ static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, + .f_copy = knote_triv_copy, }; extern uint_t zfs_geom_probe_vdev_key; diff --git a/sys/dev/ahci/ahciem.c b/sys/dev/ahci/ahciem.c index c9e6c35f4233..012796aa5d6f 100644 --- a/sys/dev/ahci/ahciem.c +++ b/sys/dev/ahci/ahciem.c @@ -479,7 +479,7 @@ ahci_em_emulate_ses_on_led(device_t dev, union ccb *ccb) else ads->common.bytes[0] |= SES_OBJSTAT_NOTINSTALLED; if (ch->disablephy) - ads->common.bytes[3] |= SESCTL_DEVOFF; + ads->bytes[2] |= SESCTL_DEVOFF; ahci_putch(ch); } ccb->ccb_h.status = CAM_REQ_CMP; diff --git a/sys/dev/atkbdc/psm.c b/sys/dev/atkbdc/psm.c index 8563b5f93aa2..137758b104d3 100644 --- a/sys/dev/atkbdc/psm.c +++ b/sys/dev/atkbdc/psm.c @@ -5287,6 +5287,7 @@ static const struct filterops psmfiltops = { .f_isfd = 1, .f_detach = psmfilter_detach, .f_event = psmfilter, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/cyapa/cyapa.c b/sys/dev/cyapa/cyapa.c index ed755f992949..464b03c0ab64 100644 --- a/sys/dev/cyapa/cyapa.c +++ b/sys/dev/cyapa/cyapa.c @@ -1121,7 +1121,8 @@ static int cyapafilt(struct knote *, long); static const struct filterops cyapa_filtops = { .f_isfd = 1, .f_detach = cyapafiltdetach, - .f_event = cyapafilt + .f_event = cyapafilt, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/dc/if_dc.c b/sys/dev/dc/if_dc.c index bed74c3b6181..5c1d7ff30976 100644 --- a/sys/dev/dc/if_dc.c +++ b/sys/dev/dc/if_dc.c @@ -999,7 +999,7 @@ dc_setfilt_21143(struct dc_softc *sc) else DC_CLRBIT(sc, DC_NETCFG, DC_NETCFG_RX_ALLMULTI); - if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sp); + if_foreach_llmaddr(ifp, dc_hash_maddr_21143, sc); if (if_getflags(ifp) & IFF_BROADCAST) { h = dc_mchash_le(sc, if_getbroadcastaddr(ifp)); diff --git a/sys/dev/evdev/cdev.c b/sys/dev/evdev/cdev.c index 9fe1299a0937..dd4115cdfc71 100644 --- a/sys/dev/evdev/cdev.c +++ b/sys/dev/evdev/cdev.c @@ -96,6 +96,7 @@ static const struct filterops evdev_cdev_filterops = { .f_attach = NULL, .f_detach = evdev_kqdetach, .f_event = evdev_kqread, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/evdev/uinput.c b/sys/dev/evdev/uinput.c index 9ac9fee8a157..76a530479c02 100644 --- a/sys/dev/evdev/uinput.c +++ b/sys/dev/evdev/uinput.c @@ -104,6 +104,7 @@ static const struct filterops uinput_filterops = { .f_attach = NULL, .f_detach = uinput_kqdetach, .f_event = uinput_kqread, + .f_copy = knote_triv_copy, }; struct uinput_cdev_state diff --git a/sys/dev/gpio/gpioc.c b/sys/dev/gpio/gpioc.c index 6c6f79227166..f690140af97b 100644 --- a/sys/dev/gpio/gpioc.c +++ b/sys/dev/gpio/gpioc.c @@ -158,7 +158,8 @@ static const struct filterops gpioc_read_filterops = { .f_attach = NULL, .f_detach = gpioc_kqdetach, .f_event = gpioc_kqread, - .f_touch = NULL + .f_touch = NULL, + .f_copy = knote_triv_copy, }; static struct gpioc_pin_event * diff --git a/sys/dev/hid/hidraw.c b/sys/dev/hid/hidraw.c index 4855843cd265..d17356642042 100644 --- a/sys/dev/hid/hidraw.c +++ b/sys/dev/hid/hidraw.c @@ -182,6 +182,7 @@ static const struct filterops hidraw_filterops_read = { .f_isfd = 1, .f_detach = hidraw_kqdetach, .f_event = hidraw_kqread, + .f_copy = knote_triv_copy, }; static void diff --git a/sys/dev/hid/u2f.c b/sys/dev/hid/u2f.c index 08f1a5ceedba..4232322c61df 100644 --- a/sys/dev/hid/u2f.c +++ b/sys/dev/hid/u2f.c @@ -132,6 +132,7 @@ static struct filterops u2f_filterops_read = { .f_isfd = 1, .f_detach = u2f_kqdetach, .f_event = u2f_kqread, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 8cc543d54c2e..9fb4370129f3 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -738,6 +738,7 @@ nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); + e->obj = NULL; /* reference consumed by vm_map_remove() */ out_rel: vm_object_deallocate(e->obj); e->obj = NULL; @@ -1406,19 +1407,34 @@ netmap_knwrite(struct knote *kn, long hint) return netmap_knrw(kn, hint, POLLOUT); } +static int +netmap_kncopy(struct knote *kn, struct proc *p1) +{ + struct netmap_priv_d *priv; + struct nm_selinfo *si; + + priv = kn->kn_hook; + si = priv->np_si[kn->kn_filter == EVFILT_WRITE ? NR_TX : NR_RX]; + NMG_LOCK(); + si->kqueue_users++; + NMG_UNLOCK(); + return (0); +} + static const struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, + .f_copy = netmap_kncopy, }; static const struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, + .f_copy = netmap_kncopy, }; - /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). diff --git a/sys/dev/null/null.c b/sys/dev/null/null.c index 8525eb9543c3..c4f138b102c7 100644 --- a/sys/dev/null/null.c +++ b/sys/dev/null/null.c @@ -61,12 +61,14 @@ static int zero_ev(struct knote *kn, long hint); static const struct filterops one_fop = { .f_isfd = 1, - .f_event = one_ev + .f_event = one_ev, + .f_copy = knote_triv_copy, }; static const struct filterops zero_fop = { .f_isfd = 1, - .f_event = zero_ev + .f_event = zero_ev, + .f_copy = knote_triv_copy, }; static struct cdevsw full_cdevsw = { diff --git a/sys/dev/pci/controller/pci_n1sdp.c b/sys/dev/pci/controller/pci_n1sdp.c index 487041bc78e4..22f0ea27d45b 100644 --- a/sys/dev/pci/controller/pci_n1sdp.c +++ b/sys/dev/pci/controller/pci_n1sdp.c @@ -345,6 +345,17 @@ n1sdp_pcie_write_config(device_t dev, u_int bus, u_int slot, bus_space_write_4(t, h, offset & ~3, data); } +static int +n1sdp_pcie_acpi_request_feature(device_t pcib __unused, device_t dev __unused, + enum pci_feature feature __unused) +{ + /* + * HotPlug isn't supported on the N1SDP as it causes an interrupt storm + */ + return (EINVAL); +} + + static device_method_t n1sdp_pcie_acpi_methods[] = { DEVMETHOD(device_probe, n1sdp_pcie_acpi_probe), DEVMETHOD(device_attach, n1sdp_pcie_acpi_attach), @@ -352,6 +363,7 @@ static device_method_t n1sdp_pcie_acpi_methods[] = { /* pcib interface */ DEVMETHOD(pcib_read_config, n1sdp_pcie_read_config), DEVMETHOD(pcib_write_config, n1sdp_pcie_write_config), + DEVMETHOD(pcib_request_feature, n1sdp_pcie_acpi_request_feature), DEVMETHOD_END }; diff --git a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c index 67e1d4ad2cab..c5b745bb78fb 100644 --- a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c +++ b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c @@ -89,6 +89,7 @@ static struct filterops adf_state_read_filterops = { .f_attach = NULL, .f_detach = adf_state_kqread_detach, .f_event = adf_state_kqread_event, + .f_copy = knote_triv_copy, }; static struct cdev *adf_processes_dev; diff --git a/sys/dev/usb/usb_dev.c b/sys/dev/usb/usb_dev.c index 293b0c72587f..e58d6a674ec0 100644 --- a/sys/dev/usb/usb_dev.c +++ b/sys/dev/usb/usb_dev.c @@ -1231,12 +1231,14 @@ static const struct filterops usb_filtops_write = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_write, + .f_copy = knote_triv_copy, }; static const struct filterops usb_filtops_read = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_read, + .f_copy = knote_triv_copy, }; /* ARGSUSED */ diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 460a508a60dc..4961b21180e1 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -120,18 +120,18 @@ vcpu_unlock_one(struct vcpu *vcpu) vcpu_set_state(vcpu, VCPU_IDLE, false); } +#ifndef __amd64__ static int -vcpu_lock_all(struct vmmdev_softc *sc) +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; error = 0; - vm_slock_vcpus(sc->vm); - maxcpus = vm_get_maxcpus(sc->vm); + maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { - vcpu = vm_vcpu(sc->vm, i); + vcpu = vm_vcpu(vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); @@ -141,16 +141,32 @@ vcpu_lock_all(struct vmmdev_softc *sc) if (error) { for (j = 0; j < i; j++) { - vcpu = vm_vcpu(sc->vm, j); + vcpu = vm_vcpu(vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } - vm_unlock_vcpus(sc->vm); } return (error); } +#endif + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + int error; + + /* + * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked + * in a consistent order so we need to serialize to avoid deadlocks. + */ + vm_lock_vcpus(sc->vm); + error = vcpu_set_state_all(sc->vm, VCPU_FROZEN); + if (error != 0) + vm_unlock_vcpus(sc->vm); + return (error); +} static void vcpu_unlock_all(struct vmmdev_softc *sc) diff --git a/sys/fs/cuse/cuse.c b/sys/fs/cuse/cuse.c index d63a7d4691cf..b2524324584a 100644 --- a/sys/fs/cuse/cuse.c +++ b/sys/fs/cuse/cuse.c @@ -195,12 +195,14 @@ static const struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops cuse_client_kqfilter_write_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, + .f_copy = knote_triv_copy, }; static d_open_t cuse_client_open; diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index 57b3559731f7..75bc0357571f 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -126,11 +126,13 @@ static const struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, + .f_copy = knote_triv_copy, }; static const struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, + .f_copy = knote_triv_copy, }; /**************************** diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index 27c65f15d5e3..db0bc77a752f 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -82,6 +82,7 @@ static const struct filterops gdev_filterops_vnode = { .f_isfd = 1, .f_detach = gdev_filter_detach, .f_event = gdev_filter_vnode, + .f_copy = knote_triv_copy, }; static struct cdevsw g_dev_cdevsw = { diff --git a/sys/i386/i386/elf_machdep.c b/sys/i386/i386/elf_machdep.c index 13769af0fbca..14c942942d08 100644 --- a/sys/i386/i386/elf_machdep.c +++ b/sys/i386/i386/elf_machdep.c @@ -92,7 +92,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -103,11 +103,11 @@ static Elf32_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); -static Elf32_Brandinfo freebsd_brand_oinfo = { +static const Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -118,11 +118,11 @@ static Elf32_Brandinfo freebsd_brand_oinfo = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); -static Elf32_Brandinfo kfreebsd_brand_info = { +static const Elf32_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", @@ -133,7 +133,7 @@ static Elf32_Brandinfo kfreebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; -SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &kfreebsd_brand_info); diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c index 85877bf40997..14b5f64388d2 100644 --- a/sys/i386/linux/linux_sysvec.c +++ b/sys/i386/linux/linux_sysvec.c @@ -796,7 +796,7 @@ linux_vdso_reloc(char *mapping, Elf_Addr offset) } } -static Elf_Brandnote linux_brandnote = { +static const Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, @@ -805,7 +805,7 @@ static Elf_Brandnote linux_brandnote = { .trans_osrel = linux_trans_osrel }; -static Elf32_Brandinfo linux_brand = { +static const Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -816,7 +816,7 @@ static Elf32_Brandinfo linux_brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_glibc2brand = { +static const Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -827,7 +827,7 @@ static Elf32_Brandinfo linux_glibc2brand = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -static Elf32_Brandinfo linux_muslbrand = { +static const Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", @@ -839,7 +839,7 @@ static Elf32_Brandinfo linux_muslbrand = { LINUX_BI_FUTEX_REQUEUE }; -Elf32_Brandinfo *linux_brandlist[] = { +const Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, @@ -849,7 +849,7 @@ Elf32_Brandinfo *linux_brandlist[] = { static int linux_elf_modevent(module_t mod, int type, void *data) { - Elf32_Brandinfo **brandinfo; + const Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index a1fabbc86f27..779158b41221 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -231,7 +231,7 @@ static const Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) -Elf_Brandnote __elfN(freebsd_brandnote) = { +const Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = NT_FREEBSD_ABI_TAG, @@ -254,7 +254,7 @@ __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) static int GNU_KFREEBSD_ABI_DESC = 3; -Elf_Brandnote __elfN(kfreebsd_brandnote) = { +const Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 19118eb7f275..a71a601733e5 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2486,7 +2486,7 @@ fdunshare(struct thread *td) if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; - tmp = fdcopy(p->p_fd); + tmp = fdcopy(p->p_fd, p); fdescfree(td); p->p_fd = tmp; } @@ -2515,14 +2515,17 @@ pdunshare(struct thread *td) * this is to ease callers, not catch errors. */ struct filedesc * -fdcopy(struct filedesc *fdp) +fdcopy(struct filedesc *fdp, struct proc *p1) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; + struct file *fp; int i, lastfile; + bool fork_pass; MPASS(fdp != NULL); + fork_pass = false; newfdp = fdinit(); FILEDESC_SLOCK(fdp); for (;;) { @@ -2533,10 +2536,35 @@ fdcopy(struct filedesc *fdp) fdgrowtable(newfdp, lastfile + 1); FILEDESC_SLOCK(fdp); } - /* copy all passable descriptors (i.e. not kqueue) */ + + /* + * Copy all passable descriptors (i.e. not kqueue), and + * prepare to handle copyable but not passable descriptors + * (kqueues). + * + * The pass to handle copying is performed after all passable + * files are installed into the new file descriptor's table, + * since kqueues need all referenced file descriptors already + * valid, including other kqueues. For the same reason the + * copying is done in two passes by itself, first installing + * not fully initialized ('empty') copyable files into the new + * fd table, and then giving the subsystems a second chance to + * really fill the copied file backing structure with the + * content. + */ newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { - if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + const struct fileops *ops; + + ops = ofde->fde_file->f_ops; + fp = NULL; + if ((ops->fo_flags & DFLAG_FORK) != 0 && + (ofde->fde_flags & UF_FOCLOSE) == 0) { + if (ops->fo_fork(newfdp, ofde->fde_file, &fp, p1, + curthread) != 0) + continue; + fork_pass = true; + } else if ((ops->fo_flags & DFLAG_PASSABLE) == 0 || (ofde->fde_flags & UF_FOCLOSE) != 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) @@ -2545,11 +2573,30 @@ fdcopy(struct filedesc *fdp) } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; + if (fp != NULL) + nfde->fde_file = fp; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } MPASS(newfdp->fd_freefile != -1); FILEDESC_SUNLOCK(fdp); + + /* + * Now handle copying kqueues, since all fds, including + * kqueues, are in place. + */ + if (__predict_false(fork_pass)) { + FILEDESC_FOREACH_FDE(newfdp, i, nfde) { + const struct fileops *ops; + + ops = nfde->fde_file->f_ops; + if ((ops->fo_flags & DFLAG_FORK) == 0 || + nfde->fde_file == NULL) + continue; + ops->fo_fork(newfdp, NULL, &nfde->fde_file, p1, + curthread); + } + } return (newfdp); } diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c index a1696225df32..a37cb23efed8 100644 --- a/sys/kern/kern_devctl.c +++ b/sys/kern/kern_devctl.c @@ -130,6 +130,7 @@ static const struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, + .f_copy = knote_triv_copy, }; static struct cdev *devctl_dev; diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index a6333d8011b1..1baa24d278bf 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -134,6 +134,7 @@ static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; +static fo_fork_t kqueue_fork; static const struct fileops kqueueops = { .fo_read = invfo_rdwr, @@ -148,7 +149,9 @@ static const struct fileops kqueueops = { .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = file_kcmp_generic, + .fo_fork = kqueue_fork, .fo_fill_kinfo = kqueue_fill_kinfo, + .fo_flags = DFLAG_FORK, }; static int knote_attach(struct knote *kn, struct kqueue *kq); @@ -176,6 +179,7 @@ static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); +static int filt_timercopy(struct knote *kn, struct proc *p1); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); @@ -187,11 +191,13 @@ static void filt_usertouch(struct knote *kn, struct kevent *kev, static const struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, + .f_copy = knote_triv_copy, }; static const struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, + .f_copy = knote_triv_copy, }; /* XXX - move to kern_proc.c? */ static const struct filterops proc_filtops = { @@ -199,12 +205,14 @@ static const struct filterops proc_filtops = { .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, + .f_copy = knote_triv_copy, }; static const struct filterops jail_filtops = { .f_isfd = 0, .f_attach = filt_jailattach, .f_detach = filt_jaildetach, .f_event = filt_jail, + .f_copy = knote_triv_copy, }; static const struct filterops timer_filtops = { .f_isfd = 0, @@ -212,12 +220,14 @@ static const struct filterops timer_filtops = { .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, + .f_copy = filt_timercopy, }; static const struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, + .f_copy = knote_triv_copy, }; static uma_zone_t knote_zone; @@ -347,6 +357,7 @@ filt_nullattach(struct knote *kn) static const struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, + .f_copy = knote_triv_copy, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ @@ -940,6 +951,30 @@ filt_timerattach(struct knote *kn) return (0); } +static int +filt_timercopy(struct knote *kn, struct proc *p) +{ + struct kq_timer_cb_data *kc_src, *kc; + + if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { + atomic_subtract_int(&kq_ncallouts, 1); + return (ENOMEM); + } + + kn->kn_status &= ~KN_DETACHED; + kc_src = kn->kn_ptr.p_v; + kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); + kc->kn = kn; + kc->p = p; + kc->flags = kc_src->flags & ~KQ_TIMER_CB_ENQUEUED; + kc->next = kc_src->next; + kc->to = kc_src->to; + kc->cpuid = PCPU_GET(cpuid); + callout_init(&kc->c, 1); + kqtimer_sched_callout(kc); + return (0); +} + static void filt_timerstart(struct knote *kn, sbintime_t to) { @@ -1151,7 +1186,7 @@ int sys_kqueue(struct thread *td, struct kqueue_args *uap) { - return (kern_kqueue(td, 0, NULL)); + return (kern_kqueue(td, 0, false, NULL)); } int @@ -1159,55 +1194,76 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap) { int flags; - if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0) + if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0) return (EINVAL); flags = 0; if ((uap->flags & KQUEUE_CLOEXEC) != 0) flags |= O_CLOEXEC; - return (kern_kqueue(td, flags, NULL)); + return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0, + NULL)); } static void -kqueue_init(struct kqueue *kq) +kqueue_init(struct kqueue *kq, bool cponfork) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); + if (cponfork) + kq->kq_state |= KQ_CPONFORK; } -int -kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) +static int +kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip, + struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork, + struct kqueue **kqp) { - struct filedesc *fdp; - struct kqueue *kq; - struct file *fp; struct ucred *cred; - int fd, error; + struct kqueue *kq; + int error; - fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); - error = falloc_caps(td, &fp, &fd, flags, fcaps); + error = fdip != NULL ? falloc_caps(td, fpp, fdip, flags, fcaps) : + _falloc_noinstall(td, fpp, 1); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ - kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); - kqueue_init(kq); + kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO); + kqueue_init(kq, cponfork); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); - FILEDESC_XLOCK(fdp); + if (fdip != NULL) + FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); - FILEDESC_XUNLOCK(fdp); + if (fdip != NULL) + FILEDESC_XUNLOCK(fdp); + + finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); + *kqp = kq; + return (0); +} + +int +kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps) +{ + struct kqueue *kq; + struct file *fp; + int fd, error; + + error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags, + fcaps, cponfork, &kq); + if (error != 0) + return (error); - finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; @@ -1488,7 +1544,7 @@ kern_kevent_anonymous(struct thread *td, int nevents, struct kqueue kq = {}; int error; - kqueue_init(&kq); + kqueue_init(&kq, false); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); @@ -1576,7 +1632,7 @@ kqueue_fo_release(int filt) mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, - ("filter object refcount not valid on release")); + ("filter object %d refcount not valid on release", filt)); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } @@ -1855,17 +1911,8 @@ done: } static int -kqueue_acquire(struct file *fp, struct kqueue **kqp) +kqueue_acquire_ref(struct kqueue *kq) { - int error; - struct kqueue *kq; - - error = 0; - - kq = fp->f_data; - if (fp->f_type != DTYPE_KQUEUE || kq == NULL) - return (EINVAL); - *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); @@ -1873,8 +1920,22 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp) } kq->kq_refcnt++; KQ_UNLOCK(kq); + return (0); +} - return error; +static int +kqueue_acquire(struct file *fp, struct kqueue **kqp) +{ + struct kqueue *kq; + int error; + + kq = fp->f_data; + if (fp->f_type != DTYPE_KQUEUE || kq == NULL) + return (EINVAL); + error = kqueue_acquire_ref(kq); + if (error == 0) + *kqp = kq; + return (error); } static void @@ -2937,6 +2998,152 @@ noacquire: return (error); } +static int +kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct thread *td) +{ + struct kqueue *kq, *kq1; + int error; + + MPASS(fp->f_type == DTYPE_KQUEUE); + kq = fp->f_data; + if ((kq->kq_state & KQ_CPONFORK) == 0) + return (EOPNOTSUPP); + error = kqueue_acquire_ref(kq); + if (error != 0) + return (error); + error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1); + if (error == 0) { + kq1->kq_forksrc = kq; + (*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC | + O_CLOEXEC | O_CLOFORK); + } else { + kqueue_release(kq, 0); + } + return (error); +} + +static void +kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1, + struct filedesc *fdp) +{ + struct knote *kn1; + const struct filterops *fop; + int error; + + fop = kn->kn_fop; + if (fop->f_copy == NULL || (fop->f_isfd && + fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL)) + return; + error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK); + if (error != 0) + return; + + kn1 = knote_alloc(M_WAITOK); + *kn1 = *kn; + kn1->kn_status |= KN_DETACHED; + kn1->kn_status &= ~KN_QUEUED; + kn1->kn_kq = kq1; + error = fop->f_copy(kn1, p1); + if (error != 0) { + knote_free(kn1); + return; + } + (void)kqueue_fo_find(kn->kn_kevent.filter); + if (fop->f_isfd && !fhold(kn1->kn_fp)) { + fop->f_detach(kn1); + kqueue_fo_release(kn->kn_kevent.filter); + knote_free(kn1); + return; + } + if (kn->kn_knlist != NULL) + knlist_add(kn->kn_knlist, kn1, 0); + KQ_LOCK(kq1); + knote_attach(kn1, kq1); + kn1->kn_influx = 0; + if ((kn->kn_status & KN_QUEUED) != 0) + knote_enqueue(kn1); + KQ_UNLOCK(kq1); +} + +static void +kqueue_fork_copy_list(struct klist *knlist, struct knote *marker, + struct kqueue *kq, struct kqueue *kq1, struct proc *p1, + struct filedesc *fdp) +{ + struct knote *kn; + + KQ_OWNED(kq); + kn = SLIST_FIRST(knlist); + while (kn != NULL) { + if ((kn->kn_status & KN_DETACHED) != 0 || + (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) { + kn = SLIST_NEXT(kn, kn_link); + continue; + } + kn_enter_flux(kn); + SLIST_INSERT_AFTER(kn, marker, kn_link); + KQ_UNLOCK(kq); + kqueue_fork_copy_knote(kq1, kn, p1, fdp); + KQ_LOCK(kq); + kn_leave_flux(kn); + kn = SLIST_NEXT(marker, kn_link); + /* XXXKIB switch kn_link to LIST? */ + SLIST_REMOVE(knlist, marker, knote, kn_link); + } +} + +static int +kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1, + struct proc *p1, struct thread *td) +{ + struct kqueue *kq, *kq1; + struct knote *marker; + int error, i; + + error = 0; + MPASS(fp == NULL); + MPASS(fp1->f_type == DTYPE_KQUEUE); + + kq1 = fp1->f_data; + kq = kq1->kq_forksrc; + marker = knote_alloc(M_WAITOK); + marker->kn_status = KN_MARKER; + + KQ_LOCK(kq); + for (i = 0; i < kq->kq_knlistsize; i++) { + kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1, + p1, fdp); + } + if (kq->kq_knhashmask != 0) { + for (i = 0; i <= kq->kq_knhashmask; i++) { + kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq, + kq1, p1, fdp); + } + } + kqueue_release(kq, 1); + kq1->kq_forksrc = NULL; + KQ_UNLOCK(kq); + + knote_free(marker); + return (error); +} + +static int +kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct proc *p1, struct thread *td) +{ + if (*fp1 == NULL) + return (kqueue_fork_alloc(fdp, fp, fp1, td)); + return (kqueue_fork_copy(fdp, fp, *fp1, p1, td)); +} + +int +knote_triv_copy(struct knote *kn __unused, struct proc *p1 __unused) +{ + return (0); +} + struct knote_status_export_bit { int kn_status_bit; int knt_status_bit; diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 7f6abae187b3..8b237b6dbd17 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -423,7 +423,7 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread * pd = pdshare(p1->p_pd); else pd = pdcopy(p1->p_pd); - fd = fdcopy(p1->p_fd); + fd = fdcopy(p1->p_fd, p2); fdtol = NULL; } else { if (fr->fr_flags2 & FR2_SHARE_PATHS) diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c index 3f322b271400..a564393d3366 100644 --- a/sys/kern/kern_jaildesc.c +++ b/sys/kern/kern_jaildesc.c @@ -344,6 +344,7 @@ static const struct filterops jaildesc_kqops = { .f_isfd = 1, .f_detach = jaildesc_kqops_detach, .f_event = jaildesc_kqops_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 21f765b17f62..a55f3c761449 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -124,6 +124,7 @@ const struct filterops sig_filtops = { .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, + .f_copy = knote_triv_copy, }; static int kern_forcesigexit = 1; diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c index 5380902e602f..aac35a56130e 100644 --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -79,6 +79,7 @@ static const struct filterops log_read_filterops = { .f_attach = NULL, .f_detach = logkqdetach, .f_event = logkqread, + .f_copy = knote_triv_copy, }; static struct logsoftc { diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index c2a0f67cae85..04ed107c933d 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -85,13 +85,16 @@ static int filt_eventfdwrite(struct knote *kn, long hint); static const struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, - .f_event = filt_eventfdread + .f_event = filt_eventfdread, + .f_copy = knote_triv_copy, }; + static const struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, - .f_event = filt_eventfdwrite + .f_event = filt_eventfdwrite, + .f_copy = knote_triv_copy, }; struct eventfd { diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 57ebe8dc85f0..6531cea31423 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -181,20 +181,23 @@ static int filt_pipedump(struct proc *p, struct knote *kn, static const struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, - .f_event = filt_pipenotsup + .f_event = filt_pipenotsup, /* no userdump */ + .f_copy = knote_triv_copy, }; static const struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead, .f_userdump = filt_pipedump, + .f_copy = knote_triv_copy, }; static const struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite, .f_userdump = filt_pipedump, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index acaf1241cb2e..c5db21544b0f 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -486,6 +486,7 @@ static const struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/tty.c b/sys/kern/tty.c index c8e2c561b7cf..067471eb949a 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -754,12 +754,14 @@ static const struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, .f_event = tty_kqops_write_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index 1291770a9ccb..2672935c2d89 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -491,11 +491,13 @@ static const struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, + .f_copy = knote_triv_copy, }; static const struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index a8aec397b352..4c1bb1ff228e 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -281,11 +281,13 @@ static const struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, + .f_copy = knote_triv_copy, }; static const struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index fe2d8d056062..eb9544628137 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -191,16 +191,19 @@ static const struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, + .f_copy = knote_triv_copy, }; static const struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, + .f_copy = knote_triv_copy, }; static const struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, + .f_copy = knote_triv_copy, }; so_gen_t so_gencnt; /* generation count for sockets */ diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 90489e99491a..807271488af2 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1855,11 +1855,13 @@ static const struct filterops uipc_write_filtops = { .f_isfd = 1, .f_detach = uipc_filt_sowdetach, .f_event = uipc_filt_sowrite, + .f_copy = knote_triv_copy, }; static const struct filterops uipc_empty_filtops = { .f_isfd = 1, .f_detach = uipc_filt_sowdetach, .f_event = uipc_filt_soempty, + .f_copy = knote_triv_copy, }; static int diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index e63fa4c01434..60916a9fbd32 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -345,12 +345,14 @@ static const struct filterops aio_filtops = { .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, + .f_copy = knote_triv_copy, }; static const struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, - .f_event = filt_lio + .f_event = filt_lio, + .f_copy = knote_triv_copy, }; static eventhandler_tag exit_tag, exec_tag; diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c index b265a5ff3a62..e60d8426ee42 100644 --- a/sys/kern/vfs_inotify.c +++ b/sys/kern/vfs_inotify.c @@ -111,6 +111,7 @@ static const struct filterops inotify_rfiltops = { .f_isfd = 1, .f_detach = filt_inotifydetach, .f_event = filt_inotifyevent, + .f_copy = knote_triv_copy, }; static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures"); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 73e110c05bc1..58975f7ac932 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -6545,6 +6545,7 @@ const struct filterops fs_filtops = { .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent, + .f_copy = knote_triv_copy, }; static int @@ -6624,24 +6625,28 @@ static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static int filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin); +static int filt_vfscopy(struct knote *kn, struct proc *p1); static const struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static const struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static const struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode, .f_userdump = filt_vfsdump, + .f_copy = filt_vfscopy, }; static void @@ -6825,6 +6830,16 @@ filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) return (0); } +static int +filt_vfscopy(struct knote *kn, struct proc *p1) +{ + struct vnode *vp; + + vp = (struct vnode *)kn->kn_hook; + vhold(vp); + return (0); +} + int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { diff --git a/sys/modules/Makefile b/sys/modules/Makefile index feb9778c23da..63a0b3260e6d 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -577,6 +577,7 @@ _mlx5ib= mlx5ib ${MACHINE_CPUARCH} == "i386" _ena= ena _gve= gve +_igc= igc # gcc13 and earlier lack __builtin_bitcountg used by linux emulation .if !(${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} < 140000) _iwlwifi= iwlwifi @@ -747,7 +748,6 @@ _et= et _ftgpio= ftgpio _ftwd= ftwd _exca= exca -_igc= igc _io= io _itwd= itwd _ix= ix diff --git a/sys/net/bpf.c b/sys/net/bpf.c index a347dbe2eb73..f598733773d0 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -253,12 +253,14 @@ static const struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, + .f_copy = knote_triv_copy, }; static const struct filterops bpfwrite_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfwrite, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index c8dbb6aa8893..56bb90cce9bc 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -261,6 +261,7 @@ static const struct filterops tun_read_filterops = { .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, + .f_copy = knote_triv_copy, }; static const struct filterops tun_write_filterops = { @@ -268,6 +269,7 @@ static const struct filterops tun_write_filterops = { .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, + .f_copy = knote_triv_copy, }; static struct tuntap_driver { diff --git a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c index 0181a67ac604..f35712cc8f69 100644 --- a/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c +++ b/sys/netgraph/bluetooth/drivers/ubt/ng_ubt_rtl.c @@ -81,9 +81,6 @@ const STRUCT_USB_HOST_ID ubt_rtl_devs[] = { USB_VPI(0x0bda, 0xb00c, 0) }, { USB_VPI(0x0bda, 0xc822, 0) }, - /* Realtek 8822CU Bluetooth devices */ - { USB_VPI(0x13d3, 0x3549, 0) }, - /* Realtek 8851BE Bluetooth devices */ { USB_VPI(0x13d3, 0x3600, 0) }, diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index c143d74a2f45..41f0a328daec 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -2181,7 +2181,7 @@ LibAliasInit(struct libalias *la) #undef malloc /* XXX: ugly */ la = malloc(sizeof *la, M_ALIAS, M_WAITOK | M_ZERO); #else - la = calloc(sizeof *la, 1); + la = calloc(1, sizeof *la); if (la == NULL) return (la); #endif diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index f842a5678fa1..be20fb44a820 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1046,6 +1046,8 @@ abort: * * On syncache_socket() success the newly created socket * has its underlying inp locked. + * + * *lsop is updated, if and only if 1 is returned. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -1094,12 +1096,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(syncookies disabled)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } if (sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { @@ -1109,12 +1113,14 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_spurcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected " "(no syncache entry)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } SCH_UNLOCK(sch); } @@ -1128,11 +1134,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, TCPSTAT_INC(tcps_sc_recvcookie); } else { TCPSTAT_INC(tcps_sc_failcookie); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); - goto failed; + free(s, M_TCPLOG); + } + return (0); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* If received ACK has MD5 signature, check it. */ @@ -1206,9 +1214,9 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "%s; %s: SEG.TSval %u < TS.Recent %u, " "segment dropped\n", s, __func__, to->to_tsval, sc->sc_tsreflect); - free(s, M_TCPLOG); } SCH_UNLOCK(sch); + free(s, M_TCPLOG); return (-1); /* Do not send RST */ } @@ -1225,7 +1233,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, "expected, segment processed normally\n", s, __func__); free(s, M_TCPLOG); - s = NULL; } } @@ -1312,16 +1319,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (sc != &scs) syncache_free(sc); return (1); -failed: - if (sc != NULL) { - TCPSTATES_DEC(TCPS_SYN_RECEIVED); - if (sc != &scs) - syncache_free(sc); - } - if (s != NULL) - free(s, M_TCPLOG); - *lsop = NULL; - return (0); } static struct socket * diff --git a/sys/powerpc/powerpc/elf32_machdep.c b/sys/powerpc/powerpc/elf32_machdep.c index e1118713bff0..6b904c02ea15 100644 --- a/sys/powerpc/powerpc/elf32_machdep.c +++ b/sys/powerpc/powerpc/elf32_machdep.c @@ -143,7 +143,7 @@ struct sysentvec elf32_freebsd_sysvec = { }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); -static Elf32_Brandinfo freebsd_brand_info = { +static const Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC, .compat_3_brand = "FreeBSD", @@ -158,11 +158,11 @@ static Elf32_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); -static Elf32_Brandinfo freebsd_brand_oinfo = { +static const Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC, .compat_3_brand = "FreeBSD", @@ -173,7 +173,7 @@ static Elf32_Brandinfo freebsd_brand_oinfo = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); diff --git a/sys/powerpc/powerpc/elf64_machdep.c b/sys/powerpc/powerpc/elf64_machdep.c index a999f742caeb..0be40bed69cb 100644 --- a/sys/powerpc/powerpc/elf64_machdep.c +++ b/sys/powerpc/powerpc/elf64_machdep.c @@ -154,7 +154,7 @@ static bool ppc64_elfv1_header_match(const struct image_params *params, static bool ppc64_elfv2_header_match(const struct image_params *params, const int32_t *, const uint32_t *); -static Elf64_Brandinfo freebsd_brand_info_elfv1 = { +static const Elf64_Brandinfo freebsd_brand_info_elfv1 = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -166,11 +166,11 @@ static Elf64_Brandinfo freebsd_brand_info_elfv1 = { .header_supported = &ppc64_elfv1_header_match }; -SYSINIT(elf64v1, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(elf64v1, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_info_elfv1); -static Elf64_Brandinfo freebsd_brand_info_elfv2 = { +static const Elf64_Brandinfo freebsd_brand_info_elfv2 = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -182,11 +182,11 @@ static Elf64_Brandinfo freebsd_brand_info_elfv2 = { .header_supported = &ppc64_elfv2_header_match }; -SYSINIT(elf64v2, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(elf64v2, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_info_elfv2); -static Elf64_Brandinfo freebsd_brand_oinfo = { +static const Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_PPC64, .compat_3_brand = "FreeBSD", @@ -198,7 +198,7 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .header_supported = &ppc64_elfv1_header_match }; -SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, +C_SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo); diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index bc00474ed0fd..e227dd825966 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -149,7 +149,7 @@ DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); -void vm_slock_vcpus(struct vm *vm); +void vm_lock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); diff --git a/sys/riscv/riscv/elf_machdep.c b/sys/riscv/riscv/elf_machdep.c index 67b1fcc4c1a9..5bd4af4c15f8 100644 --- a/sys/riscv/riscv/elf_machdep.c +++ b/sys/riscv/riscv/elf_machdep.c @@ -100,7 +100,7 @@ static struct sysentvec elf64_freebsd_sysvec = { }; INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); -static Elf64_Brandinfo freebsd_brand_info = { +static const Elf64_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_RISCV, .compat_3_brand = "FreeBSD", @@ -110,7 +110,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, +C_SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); static void diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index 790dcc576507..4c9b1fa53f7a 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -346,9 +346,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) } void -vm_slock_vcpus(struct vm *vm) +vm_lock_vcpus(struct vm *vm) { - sx_slock(&vm->vcpus_init_lock); + sx_xlock(&vm->vcpus_init_lock); } void diff --git a/sys/security/audit/audit_pipe.c b/sys/security/audit/audit_pipe.c index fb773fd04297..4d9815467e1a 100644 --- a/sys/security/audit/audit_pipe.c +++ b/sys/security/audit/audit_pipe.c @@ -243,6 +243,7 @@ static const struct filterops audit_pipe_read_filterops = { .f_attach = NULL, .f_detach = audit_pipe_kqdetach, .f_event = audit_pipe_kqread, + .f_copy = knote_triv_copy, }; /* diff --git a/sys/sys/event.h b/sys/sys/event.h index 084eaafcbdc0..ebbcdb703183 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -228,6 +228,7 @@ struct freebsd11_kevent32 { /* Flags for kqueuex(2) */ #define KQUEUE_CLOEXEC 0x00000001 /* close on exec */ +#define KQUEUE_CPONFORK 0x00000002 /* copy on fork */ struct knote; SLIST_HEAD(klist, knote); @@ -283,6 +284,7 @@ struct filterops { void (*f_touch)(struct knote *kn, struct kevent *kev, u_long type); int (*f_userdump)(struct proc *p, struct knote *kn, struct kinfo_knote *kin); + int (*f_copy)(struct knote *kn, struct proc *p1); }; /* @@ -346,6 +348,7 @@ struct rwlock; void knote(struct knlist *list, long hint, int lockflags); void knote_fork(struct knlist *list, int pid); +int knote_triv_copy(struct knote *kn, struct proc *p1); struct knlist *knlist_alloc(struct mtx *lock); void knlist_detach(struct knlist *knl); void knlist_add(struct knlist *knl, struct knote *kn, int islocked); diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h index 7fec444447f9..7cb3269f1fdf 100644 --- a/sys/sys/eventvar.h +++ b/sys/sys/eventvar.h @@ -55,12 +55,14 @@ struct kqueue { #define KQ_CLOSING 0x10 #define KQ_TASKSCHED 0x20 /* task scheduled */ #define KQ_TASKDRAIN 0x40 /* waiting for task to drain */ +#define KQ_CPONFORK 0x80 int kq_knlistsize; /* size of knlist */ struct klist *kq_knlist; /* list of knotes */ u_long kq_knhashmask; /* size of knhash */ struct klist *kq_knhash; /* hash table for knotes */ struct task kq_task; struct ucred *kq_cred; + struct kqueue *kq_forksrc; }; #endif /* !_SYS_EVENTVAR_H_ */ diff --git a/sys/sys/file.h b/sys/sys/file.h index c44fd0f28929..e0195c7c6c2a 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -139,6 +139,8 @@ typedef int fo_fspacectl_t(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td); typedef int fo_cmp_t(struct file *fp, struct file *fp1, struct thread *td); +typedef int fo_fork_t(struct filedesc *fdp, struct file *fp, struct file **fp1, + struct proc *p1, struct thread *td); typedef int fo_spare_t(struct file *fp); typedef int fo_flags_t; @@ -163,12 +165,14 @@ struct fileops { fo_fallocate_t *fo_fallocate; fo_fspacectl_t *fo_fspacectl; fo_cmp_t *fo_cmp; + fo_fork_t *fo_fork; fo_spare_t *fo_spares[8]; /* Spare slots */ fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ +#define DFLAG_FORK 0x04 /* copy on fork */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 0a388c90de26..4817855443af 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -265,7 +265,7 @@ int fdcheckstd(struct thread *td); void fdclose(struct thread *td, struct file *fp, int idx); void fdcloseexec(struct thread *td); void fdsetugidsafety(struct thread *td); -struct filedesc *fdcopy(struct filedesc *fdp); +struct filedesc *fdcopy(struct filedesc *fdp, struct proc *p1); void fdunshare(struct thread *td); void fdescfree(struct thread *td); int fdlastfile(struct filedesc *fdp); diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index 9e2a233248b4..25ce7871ba7c 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -87,7 +87,7 @@ typedef struct { const char *interp_newpath; int flags; const Elf_Brandnote *brand_note; - bool (*header_supported)(const struct image_params *, + bool (*const header_supported)(const struct image_params *, const int32_t *, const uint32_t *); /* High 8 bits of flags is private to the ABI */ #define BI_CAN_EXEC_DYN 0x0001 @@ -132,8 +132,8 @@ bool __elfN(parse_notes)(const struct image_params *, const Elf_Note *, void __elfN(dump_thread)(struct thread *, void *, size_t *); extern int __elfN(fallback_brand); -extern Elf_Brandnote __elfN(freebsd_brandnote); -extern Elf_Brandnote __elfN(kfreebsd_brandnote); +extern const Elf_Brandnote __elfN(freebsd_brandnote); +extern const Elf_Brandnote __elfN(kfreebsd_brandnote); #endif /* _KERNEL */ #endif /* !_SYS_IMGACT_ELF_H_ */ diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 8237165b84ce..d32690634059 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -211,7 +211,8 @@ int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); int kern_kill(struct thread *td, pid_t pid, int signum); -int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps); +int kern_kqueue(struct thread *td, int flags, bool cponfork, + struct filecaps *fcaps); int kern_kldload(struct thread *td, const char *file, int *fileid); int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); int kern_kldunload(struct thread *td, int fileid, int flags); diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c index 9fa17da954f7..c25ed0cc2267 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -113,7 +113,6 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain) int d; d = di->di_offset % di->di_domain->ds_cnt; - *di->di_iter = d; *domain = di->di_domain->ds_order[d]; } @@ -260,9 +259,14 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, * are immutable and unsynchronized. Updates can race but pointer * loads are assumed to be atomic. */ - if (obj != NULL && obj->domain.dr_policy != NULL) + if (obj != NULL && obj->domain.dr_policy != NULL) { + /* + * This write lock protects non-atomic increments of the + * iterator index in vm_domainset_iter_rr(). + */ + VM_OBJECT_ASSERT_WLOCKED(obj); dr = &obj->domain; - else + } else dr = &curthread->td_domain; vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex); diff --git a/sys/x86/acpica/acpi_apm.c b/sys/x86/acpica/acpi_apm.c index 8e5785cf0ed6..919f76949dd4 100644 --- a/sys/x86/acpica/acpi_apm.c +++ b/sys/x86/acpica/acpi_apm.c @@ -64,6 +64,7 @@ static const struct filterops apm_readfiltops = { .f_isfd = 1, .f_detach = apmreadfiltdetach, .f_event = apmreadfilt, + .f_copy = knote_triv_copy, }; static struct cdevsw apm_cdevsw = { |