Diffstat (limited to 'sys')
144 files changed, 3707 insertions, 1716 deletions
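Editor's note: the largest change below reworks the amd64 KVA layout so that region bounds which used to be compile-time macros (KASAN_MIN_ADDRESS, KMSAN_SHAD_MIN_ADDRESS, the sysctl_kmaps sentinel, and so on) are instead read from the boot-selected kva_layout structure, letting one kernel handle both 4-level and 5-level (LA57) paging. A minimal sketch of the consumer pattern this enables; the helper name is hypothetical and not part of the commit:

	/* Hypothetical helper; kva_layout is declared in <machine/pmap.h>. */
	static inline bool
	va_in_kasan_shadow(vm_offset_t va)
	{
		/* Bounds are picked at boot: kva_layout vs. kva_layout_la57. */
		return (va >= kva_layout.kasan_shadow_low &&
		    va < kva_layout.kasan_shadow_high);
	}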
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index cd8ab58a07ab..8df082f6c5dc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -481,6 +481,8 @@ vm_paddr_t KERNend;	/* and the end */
 struct kva_layout_s kva_layout = {
 	.kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
 	.dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
 	.dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
 	.lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
@@ -489,10 +491,20 @@ struct kva_layout_s kva_layout = {
 	.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1),
 	.rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+	    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+	    0, 0, 0),
 };

 struct kva_layout_s kva_layout_la57 = {
 	.kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0),	/* == rec_pt */
+	.kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
 	.dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
 	.dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
 	.lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
@@ -501,6 +513,14 @@ struct kva_layout_s kva_layout_la57 = {
 	.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
 	    NPDEPG - 1, NPTEPG - 1),
 	.rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+	    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+	    0, 0, 0),
 };

 /*
@@ -2003,7 +2023,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 			 */
 			p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
 			    X86_PG_M | X86_PG_V | pg_nx;
-		} else if (i >= DMPML5I && i < DMPML5I + NDMPML5E) {
+		} else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
 			/* Connect DMAP pml4 pages to PML5. */
 			p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
 			    X86_PG_RW | X86_PG_V | pg_nx;
@@ -5942,17 +5962,18 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 	if (mpte == NULL) {
 		/*
 		 * Invalidate the 2MB page mapping and return "failure" if the
-		 * mapping was never accessed.
+		 * mapping was never accessed and not wired.
 		 */
 		if ((oldpde & PG_A) == 0) {
-			KASSERT((oldpde & PG_W) == 0,
-			    ("pmap_demote_pde: a wired mapping is missing PG_A"));
-			pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
-			return (false);
-		}
-
-		mpte = pmap_remove_pt_page(pmap, va);
-		if (mpte == NULL) {
+			if ((oldpde & PG_W) == 0) {
+				pmap_demote_pde_abort(pmap, va, pde, oldpde,
+				    lockp);
+				return (false);
+			}
+			mpte = pmap_remove_pt_page(pmap, va);
+			/* Fill the PTP with PTEs that have PG_A cleared. */
+			mpte->valid = 0;
+		} else if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL) {
 			KASSERT((oldpde & PG_W) == 0,
 			    ("pmap_demote_pde: page table page for a wired mapping is missing"));
@@ -6004,7 +6025,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 	/*
 	 * If the PTP is not leftover from an earlier promotion or it does not
 	 * have PG_A set in every PTE, then fill it.  The new PTEs will all
-	 * have PG_A set.
+	 * have PG_A set, unless this is a wired mapping with PG_A clear.
 	 */
 	if (!vm_page_all_valid(mpte))
 		pmap_fill_ptp(firstpte, newpte);
@@ -11880,9 +11901,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
 	    mode, range->pdpes, range->pdes, range->ptes);

 	/* Reset to sentinel value. */
-	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range->sva = kva_layout.kva_max;
 }

 /*
@@ -11923,12 +11942,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
  */
 static void
 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
-    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
-    pt_entry_t pte)
+    vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+    pd_entry_t pde, pt_entry_t pte)
 {
 	pt_entry_t attrs;

-	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	if (la57) {
+		attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+		attrs |= pml4e & pg_nx;
+		attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+	} else {
+		attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	}

 	attrs |= pdpe & pg_nx;
 	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -11961,13 +11986,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 {
 	struct pmap_kernel_map_range range;
 	struct sbuf sbuf, *sb;
+	pml5_entry_t pml5e;
 	pml4_entry_t pml4e;
 	pdp_entry_t *pdp, pdpe;
 	pd_entry_t *pd, pde;
 	pt_entry_t *pt, pte;
 	vm_offset_t sva;
 	vm_paddr_t pa;
-	int error, i, j, k, l;
+	int error, j, k, l;
+	bool first;

 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
@@ -11976,9 +12003,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

 	/* Sentinel value. */
-	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range.sva = kva_layout.kva_max;
+	pml5e = 0;	/* no UB for la48 */

 	/*
 	 * Iterate over the kernel page tables without holding the kernel pmap
@@ -11987,44 +12013,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	 * Within the large map, ensure that PDP and PD page addresses are
 	 * valid before descending.
 	 */
-	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
-		switch (i) {
-		case PML4PML4I:
-			if (!la57)
-				sbuf_printf(sb, "\nRecursive map:\n");
-			break;
-		case DMPML4I:
-			if (!la57)
-				sbuf_printf(sb, "\nDirect map:\n");
-			break;
+	for (first = true, sva = 0; sva != 0 || first; first = false) {
+		if (sva == kva_layout.rec_pt)
+			sbuf_printf(sb, "\nRecursive map:\n");
+		else if (sva == kva_layout.dmap_low)
+			sbuf_printf(sb, "\nDirect map:\n");
 #ifdef KASAN
-		case KASANPML4I:
+		else if (sva == kva_layout.kasan_shadow_low)
 			sbuf_printf(sb, "\nKASAN shadow map:\n");
-			break;
 #endif
 #ifdef KMSAN
-		case KMSANSHADPML4I:
+		else if (sva == kva_layout.kmsan_shadow_low)
 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
-			break;
-		case KMSANORIGPML4I:
+		else if (sva == kva_layout.kmsan_origin_low)
 			sbuf_printf(sb, "\nKMSAN origin map:\n");
-			break;
 #endif
-		case KPML4BASE:
+		else if (sva == kva_layout.km_low)
 			sbuf_printf(sb, "\nKernel map:\n");
-			break;
-		case LMSPML4I:
-			if (!la57)
-				sbuf_printf(sb, "\nLarge map:\n");
-			break;
-		}
+		else if (sva == kva_layout.lm_low)
+			sbuf_printf(sb, "\nLarge map:\n");

 		/* Convert to canonical form.
 		 */
-		if (sva == 1ul << 47)
-			sva |= -1ul << 48;
+		if (la57) {
+			if (sva == 1ul << 56) {
+				sva |= -1ul << 57;
+				continue;
+			}
+		} else {
+			if (sva == 1ul << 47) {
+				sva |= -1ul << 48;
+				continue;
+			}
+		}

 restart:
-		pml4e = kernel_pml4[i];
+		if (la57) {
+			pml5e = *pmap_pml5e(kernel_pmap, sva);
+			if ((pml5e & X86_PG_V) == 0) {
+				sva = rounddown2(sva, NBPML5);
+				sysctl_kmaps_dump(sb, &range, sva);
+				sva += NBPML5;
+				continue;
+			}
+		}
+		pml4e = *pmap_pml4e(kernel_pmap, sva);
 		if ((pml4e & X86_PG_V) == 0) {
 			sva = rounddown2(sva, NBPML4);
 			sysctl_kmaps_dump(sb, &range, sva);
@@ -12045,8 +12077,8 @@ restart:
 			pa = pdpe & PG_FRAME;
 			if ((pdpe & PG_PS) != 0) {
 				sva = rounddown2(sva, NBPDP);
-				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
-				    0, 0);
+				sysctl_kmaps_check(sb, &range, sva, pml5e,
+				    pml4e, pdpe, 0, 0);
 				range.pdpes++;
 				sva += NBPDP;
 				continue;
@@ -12058,6 +12090,7 @@ restart:
 				 * freed. Validate the next-level address
 				 * before descending.
 				 */
+				sva += NBPDP;
 				goto restart;
 			}
 			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12074,7 +12107,7 @@ restart:
 				if ((pde & PG_PS) != 0) {
 					sva = rounddown2(sva, NBPDR);
 					sysctl_kmaps_check(sb, &range, sva,
-					    pml4e, pdpe, pde, 0);
+					    pml5e, pml4e, pdpe, pde, 0);
 					range.pdes++;
 					sva += NBPDR;
 					continue;
@@ -12086,6 +12119,7 @@ restart:
 					 * may be freed. Validate the
 					 * next-level address before descending.
 					 */
+					sva += NBPDR;
 					goto restart;
 				}
 				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12099,7 +12133,7 @@ restart:
 					continue;
 				}
 				sysctl_kmaps_check(sb, &range, sva,
-				    pml4e, pdpe, pde, pte);
+				    pml5e, pml4e, pdpe, pde, pte);
 				range.ptes++;
 			}
 		}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index a0ca97f2d5a0..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -557,6 +557,7 @@ pmap_pml5e_index(vm_offset_t va)

 struct kva_layout_s {
 	vm_offset_t kva_min;
+	vm_offset_t kva_max;
 	vm_offset_t dmap_low;	/* DMAP_MIN_ADDRESS */
 	vm_offset_t dmap_high;	/* DMAP_MAX_ADDRESS */
 	vm_offset_t lm_low;	/* LARGEMAP_MIN_ADDRESS */
@@ -564,6 +565,12 @@ struct kva_layout_s {
 	vm_offset_t km_low;	/* VM_MIN_KERNEL_ADDRESS */
 	vm_offset_t km_high;	/* VM_MAX_KERNEL_ADDRESS */
 	vm_offset_t rec_pt;
+	vm_offset_t kasan_shadow_low;	/* KASAN_MIN_ADDRESS */
+	vm_offset_t kasan_shadow_high;	/* KASAN_MAX_ADDRESS */
+	vm_offset_t kmsan_shadow_low;	/* KMSAN_SHAD_MIN_ADDRESS */
+	vm_offset_t kmsan_shadow_high;	/* KMSAN_SHAD_MAX_ADDRESS */
+	vm_offset_t kmsan_origin_low;	/* KMSAN_ORIG_MIN_ADDRESS */
+	vm_offset_t kmsan_origin_high;	/* KMSAN_ORIG_MAX_ADDRESS */
 };
 extern struct kva_layout_s kva_layout;
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index a9c73b75213b..0b3daed4f69e 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -649,6 +649,8 @@ struct vm_inout_str {
 	int		addrsize;
 	enum vm_reg_name seg_name;
 	struct seg_desc seg_desc;
+	int		cs_d;
+	uint64_t	cs_base;
 };

 enum task_switch_reason {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 1f86538ce5f3..441330fd57b8 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -29,6 +29,8 @@
 #ifndef _VMM_DEV_H_
 #define	_VMM_DEV_H_

+#include <sys/domainset.h>
+
 #include <machine/vmm.h>
 #include <machine/vmm_snapshot.h>

@@ -52,7 +54,10 @@ struct vm_munmap {
 struct vm_memseg {
 	int		segid;
 	size_t		len;
-	char		name[VM_MAX_SUFFIXLEN + 1];
+	char		name[VM_MAX_SUFFIXLEN + 1];
+	domainset_t	*ds_mask;
+	size_t		ds_mask_size;
+	int		ds_policy;
 };

 struct vm_register {
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index d5f0363cfb41..1fb0f97682a7 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -31,6 +31,31 @@

 #include <sys/mman.h>

+/* struct vie_op.op_type */
+enum {
+	VIE_OP_TYPE_NONE = 0,
+	VIE_OP_TYPE_MOV,
+	VIE_OP_TYPE_MOVSX,
+	VIE_OP_TYPE_MOVZX,
+	VIE_OP_TYPE_AND,
+	VIE_OP_TYPE_OR,
+	VIE_OP_TYPE_SUB,
+	VIE_OP_TYPE_TWO_BYTE,
+	VIE_OP_TYPE_PUSH,
+	VIE_OP_TYPE_CMP,
+	VIE_OP_TYPE_POP,
+	VIE_OP_TYPE_MOVS,
+	VIE_OP_TYPE_GROUP1,
+	VIE_OP_TYPE_STOS,
+	VIE_OP_TYPE_BITTEST,
+	VIE_OP_TYPE_TWOB_GRP15,
+	VIE_OP_TYPE_ADD,
+	VIE_OP_TYPE_TEST,
+	VIE_OP_TYPE_BEXTR,
+	VIE_OP_TYPE_OUTS,
+	VIE_OP_TYPE_LAST
+};
+
 /*
  * Callback functions to read and write memory regions.
  */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index ef352e776af6..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -200,16 +200,14 @@
 #define	VM_MIN_KERNEL_ADDRESS	kva_layout.km_low
 #define	VM_MAX_KERNEL_ADDRESS	kva_layout.km_high

-#define	KASAN_MIN_ADDRESS	KV4ADDR(KASANPML4I, 0, 0, 0)
-#define	KASAN_MAX_ADDRESS	KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define	KASAN_MIN_ADDRESS	(kva_layout.kasan_shadow_low)
+#define	KASAN_MAX_ADDRESS	(kva_layout.kasan_shadow_high)

-#define	KMSAN_SHAD_MIN_ADDRESS	KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define	KMSAN_SHAD_MAX_ADDRESS	KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
-	0, 0, 0)
+#define	KMSAN_SHAD_MIN_ADDRESS	(kva_layout.kmsan_shadow_low)
+#define	KMSAN_SHAD_MAX_ADDRESS	(kva_layout.kmsan_shadow_high)

-#define	KMSAN_ORIG_MIN_ADDRESS	KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define	KMSAN_ORIG_MAX_ADDRESS	KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
-	0, 0, 0)
+#define	KMSAN_ORIG_MIN_ADDRESS	(kva_layout.kmsan_origin_low)
+#define	KMSAN_ORIG_MAX_ADDRESS	(kva_layout.kmsan_origin_high)

 /*
  * Formally kernel mapping starts at KERNBASE, but kernel linker
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 6c16daaa47c2..2fe6a5bc3584 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -317,6 +317,33 @@ svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset)
 #define	MSR_AMD7TH_START	0xC0010000UL
 #define	MSR_AMD7TH_END		0xC0011FFFUL

+static void
+svm_get_cs_info(struct vmcb *vmcb, struct vm_guest_paging *paging, int *cs_d,
+    uint64_t *base)
+{
+	struct vmcb_segment seg;
+	int error __diagused;
+
+	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+	KASSERT(error == 0, ("%s: vmcb_seg error %d", __func__, error));
+
+	switch (paging->cpu_mode) {
+	case CPU_MODE_REAL:
+		*base = seg.base;
+		*cs_d = 0;
+		break;
+	case CPU_MODE_PROTECTED:
+	case CPU_MODE_COMPATIBILITY:
+		*cs_d = !!(seg.attrib & VMCB_CS_ATTRIB_D);
+		*base = seg.base;
+		break;
+	default:
+		*base = 0;
+		*cs_d = 0;
+		break;
+	}
+}
+
 /*
  * Get the index and bit position for a MSR in permission bitmap.
  * Two bits are used for each MSR: lower bit for read and higher bit for write.
@@ -735,10 +762,29 @@ svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in,

 	if (in) {
 		vis->seg_name = VM_REG_GUEST_ES;
-	} else {
-		/* The segment field has standard encoding */
+	} else if (decode_assist()) {
+		/*
+		 * The effective segment number in EXITINFO1[12:10] is populated
+		 * only if the processor has the DecodeAssist capability.
+		 *
+		 * XXX this is not specified explicitly in APMv2 but can be
+		 * verified empirically.
+		 */
 		s = (info1 >> 10) & 0x7;
+
+		/* The segment field has standard encoding */
 		vis->seg_name = vm_segment_name(s);
+	} else {
+		/*
+		 * The segment register needs to be manually decoded by fetching
+		 * the instructions near ip. However, we are unable to fetch it
+		 * while the interrupts are disabled. Therefore, we leave the
+		 * value unset until the generic ins/outs handler runs.
+		 */
+		vis->seg_name = VM_REG_LAST;
+		svm_get_cs_info(vcpu->vmcb, &vis->paging, &vis->cs_d,
+		    &vis->cs_base);
+		return;
 	}

 	error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
@@ -798,16 +844,6 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
 	info1 = ctrl->exitinfo1;
 	inout_string = info1 & BIT(2) ? 1 : 0;

-	/*
-	 * The effective segment number in EXITINFO1[12:10] is populated
-	 * only if the processor has the DecodeAssist capability.
-	 *
-	 * XXX this is not specified explicitly in APMv2 but can be verified
-	 * empirically.
-	 */
-	if (inout_string && !decode_assist())
-		return (UNHANDLED);
-
 	vmexit->exitcode = VM_EXITCODE_INOUT;
 	vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
 	vmexit->u.inout.string = inout_string;
@@ -825,6 +861,8 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
 		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
 		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
 		vis->addrsize = svm_inout_str_addrsize(info1);
+		vis->cs_d = 0;
+		vis->cs_base = 0;
 		svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis);
 	}
@@ -866,10 +904,9 @@ static void
 svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
 {
 	struct vm_guest_paging *paging;
-	struct vmcb_segment seg;
 	struct vmcb_ctrl *ctrl;
 	char *inst_bytes;
-	int error __diagused, inst_len;
+	int inst_len;

 	ctrl = &vmcb->ctrl;
 	paging = &vmexit->u.inst_emul.paging;
@@ -879,29 +916,8 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
 	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
 	svm_paging_info(vmcb, paging);

-	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
-	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
-
-	switch(paging->cpu_mode) {
-	case CPU_MODE_REAL:
-		vmexit->u.inst_emul.cs_base = seg.base;
-		vmexit->u.inst_emul.cs_d = 0;
-		break;
-	case CPU_MODE_PROTECTED:
-	case CPU_MODE_COMPATIBILITY:
-		vmexit->u.inst_emul.cs_base = seg.base;
-
-		/*
-		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
-		 */
-		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
-		    1 : 0;
-		break;
-	default:
-		vmexit->u.inst_emul.cs_base = 0;
-		vmexit->u.inst_emul.cs_d = 0;
-		break;
-	}
+	svm_get_cs_info(vmcb, paging, &vmexit->u.inst_emul.cs_d,
+	    &vmexit->u.inst_emul.cs_base);

 	/*
 	 * Copy the instruction bytes into 'vie' if available.
 	 */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 957217ab2258..842281ab862e 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -2659,6 +2659,8 @@ vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
 			vis->index = inout_str_index(vcpu, in);
 			vis->count = inout_str_count(vcpu, vis->inout.rep);
 			vis->addrsize = inout_str_addrsize(inst_info);
+			vis->cs_d = 0;
+			vis->cs_base = 0;
 			inout_str_seginfo(vcpu, inst_info, in, vis);
 		}
 		SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit);
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index c53e32889000..c54b6e6d0074 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -65,30 +65,6 @@
 #include <x86/psl.h>
 #include <x86/specialreg.h>

-/* struct vie_op.op_type */
-enum {
-	VIE_OP_TYPE_NONE = 0,
-	VIE_OP_TYPE_MOV,
-	VIE_OP_TYPE_MOVSX,
-	VIE_OP_TYPE_MOVZX,
-	VIE_OP_TYPE_AND,
-	VIE_OP_TYPE_OR,
-	VIE_OP_TYPE_SUB,
-	VIE_OP_TYPE_TWO_BYTE,
-	VIE_OP_TYPE_PUSH,
-	VIE_OP_TYPE_CMP,
-	VIE_OP_TYPE_POP,
-	VIE_OP_TYPE_MOVS,
-	VIE_OP_TYPE_GROUP1,
-	VIE_OP_TYPE_STOS,
-	VIE_OP_TYPE_BITTEST,
-	VIE_OP_TYPE_TWOB_GRP15,
-	VIE_OP_TYPE_ADD,
-	VIE_OP_TYPE_TEST,
-	VIE_OP_TYPE_BEXTR,
-	VIE_OP_TYPE_LAST
-};
-
 /* struct vie_op.op_flags */
 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
@@ -152,6 +128,16 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_byte = 0x3B,
 		.op_type = VIE_OP_TYPE_CMP,
 	},
+	[0x6E] = {
+		.op_byte = 0x6E,
+		.op_type = VIE_OP_TYPE_OUTS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+	},
+	[0x6F] = {
+		.op_byte = 0x6F,
+		.op_type = VIE_OP_TYPE_OUTS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+	},
 	[0x88] = {
 		.op_byte = 0x88,
 		.op_type = VIE_OP_TYPE_MOV,
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc1ecab9f209..8aab28f5e68e 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -145,9 +145,49 @@ emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
 }

 static int
+decode_segment(struct vcpu *vcpu, enum vm_reg_name *segment)
+{
+	struct vm_guest_paging *paging;
+	struct vie vie;
+	struct vm_exit *vme;
+	int err;
+	int fault;
+
+	vme = vm_exitinfo(vcpu);
+	paging = &vme->u.inout_str.paging;
+
+	vie_init(&vie, NULL, 0);
+	err = vmm_fetch_instruction(vcpu, paging,
+	    vme->rip + vme->u.inout_str.cs_base, VIE_INST_SIZE, &vie, &fault);
+	if (err || fault)
+		return (err);
+
+	err = vmm_decode_instruction(vcpu, VIE_INVALID_GLA, paging->cpu_mode,
+	    vme->u.inout_str.cs_d, &vie);
+
+	if (err || vie.op.op_type != VIE_OP_TYPE_OUTS)
+		return (EINVAL);
+	if (vie.segment_override)
+		*segment = vie.segment_register;
+	else
+		*segment = VM_REG_GUEST_DS;
+
+	return (0);
+}
+
+static int
 emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
 {
+	int err;
+
 	*retu = true;
+	if (vmexit->u.inout_str.seg_name == VM_REG_LAST) {
+		err = decode_segment(vcpu, &vmexit->u.inout_str.seg_name);
+		if (err)
+			return (err);
+		return (vm_get_seg_desc(vcpu, vmexit->u.inout_str.seg_name,
+		    &vmexit->u.inout_str.seg_desc));
+	}
 	return (0);	/* Return to userspace to finish emulation */
 }
diff --git a/sys/arm/allwinner/aw_mmc.c b/sys/arm/allwinner/aw_mmc.c
index 6bebf5e5fb5e..a8add957dc74 100644
--- a/sys/arm/allwinner/aw_mmc.c
+++ b/sys/arm/allwinner/aw_mmc.c
@@ -84,21 +84,26 @@

 struct aw_mmc_conf {
 	uint32_t	dma_xferlen;
+	uint32_t	dma_desc_shift;
 	bool		mask_data0;
 	bool		can_calibrate;
 	bool		new_timing;
+	bool		zero_is_skip;
 };

 static const struct aw_mmc_conf a10_mmc_conf = {
 	.dma_xferlen = 0x2000,
+	.dma_desc_shift = 0,
 };

 static const struct aw_mmc_conf a13_mmc_conf = {
 	.dma_xferlen = 0x10000,
+	.dma_desc_shift = 0,
 };

 static const struct aw_mmc_conf a64_mmc_conf = {
 	.dma_xferlen = 0x10000,
+	.dma_desc_shift = 0,
 	.mask_data0 = true,
 	.can_calibrate = true,
 	.new_timing = true,
@@ -106,13 +111,24 @@ static const struct aw_mmc_conf a64_mmc_conf = {

 static const struct aw_mmc_conf a64_emmc_conf = {
 	.dma_xferlen = 0x2000,
+	.dma_desc_shift = 0,
 	.can_calibrate = true,
 };

+static const struct aw_mmc_conf d1_mmc_conf = {
+	.dma_xferlen = 0x1000,
+	.dma_desc_shift = 2,
+	.mask_data0 = true,
+	.can_calibrate = true,
+	.new_timing = true,
+	.zero_is_skip = true,
+};
+
 static struct ofw_compat_data compat_data[] = {
 	{"allwinner,sun4i-a10-mmc", (uintptr_t)&a10_mmc_conf},
 	{"allwinner,sun5i-a13-mmc", (uintptr_t)&a13_mmc_conf},
 	{"allwinner,sun7i-a20-mmc", (uintptr_t)&a13_mmc_conf},
+	{"allwinner,sun20i-d1-mmc", (uintptr_t)&d1_mmc_conf},
 	{"allwinner,sun50i-a64-mmc", (uintptr_t)&a64_mmc_conf},
 	{"allwinner,sun50i-a64-emmc", (uintptr_t)&a64_emmc_conf},
 	{NULL, 0}
@@ -607,16 +623,18 @@ aw_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
 	dma_desc = sc->aw_dma_desc;
 	for (i = 0; i < nsegs; i++) {
-		if (segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen)
+		if ((segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) &&
+		    !sc->aw_mmc_conf->zero_is_skip)
 			dma_desc[i].buf_size = 0; /* Size of 0 indicates max len */
 		else
 			dma_desc[i].buf_size = segs[i].ds_len;
-		dma_desc[i].buf_addr = segs[i].ds_addr;
+		dma_desc[i].buf_addr = segs[i].ds_addr >>
+		    sc->aw_mmc_conf->dma_desc_shift;
 		dma_desc[i].config = AW_MMC_DMA_CONFIG_CH |
-		    AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
-
-		dma_desc[i].next = sc->aw_dma_desc_phys +
-		    ((i + 1) * sizeof(struct aw_mmc_dma_desc));
+		    AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
+		dma_desc[i].next = (sc->aw_dma_desc_phys +
+		    (i + 1) * sizeof(struct aw_mmc_dma_desc)) >>
+		    sc->aw_mmc_conf->dma_desc_shift;
 	}

 	dma_desc[0].config |= AW_MMC_DMA_CONFIG_FD;
@@ -678,7 +696,8 @@ aw_mmc_prepare_dma(struct aw_mmc_softc *sc)
 	AW_MMC_WRITE_4(sc, AW_MMC_IDIE, val);

 	/* Set DMA descriptor list address */
-	AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys);
+	AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys >>
+	    sc->aw_mmc_conf->dma_desc_shift);

 	/* FIFO trigger level */
 	AW_MMC_WRITE_4(sc, AW_MMC_FWLR, AW_MMC_DMA_FTRGLEVEL);
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index 459cc8ebe505..2152f7fcc1c6 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -8501,18 +8501,20 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,

 		/*
 		 * Invalidate the 2MB page mapping and return "failure" if the
-		 * mapping was never accessed.
+		 * mapping was never accessed and not wired.
 		 */
 		if ((oldl2 & ATTR_AF) == 0) {
-			KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
-			    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
-			pmap_demote_l2_abort(pmap, va, l2, lockp);
-			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
-			    va, pmap);
-			goto fail;
-		}
-
-		if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
+			if ((oldl2 & ATTR_SW_WIRED) == 0) {
+				pmap_demote_l2_abort(pmap, va, l2, lockp);
+				CTR2(KTR_PMAP,
+				    "pmap_demote_l2: failure for va %#lx in pmap %p",
+				    va, pmap);
+				goto fail;
+			}
+			ml3 = pmap_remove_pt_page(pmap, va);
+			/* Fill the PTP with L3Es that have ATTR_AF cleared.
+			 */
+			ml3->valid = 0;
+		} else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
 			KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
 			    ("pmap_demote_l2: page table page for a wired mapping"
 			    " is missing"));
@@ -8568,7 +8570,7 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
 	/*
 	 * If the PTP is not leftover from an earlier promotion or it does not
 	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
-	 * have ATTR_AF set.
+	 * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
 	 *
 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
 	 * performs a dsb().  That dsb() ensures that the stores for filling
diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h
index 938bea47c7f8..219f1116c728 100644
--- a/sys/arm64/include/vmm_dev.h
+++ b/sys/arm64/include/vmm_dev.h
@@ -27,6 +27,8 @@
 #ifndef _VMM_DEV_H_
 #define	_VMM_DEV_H_

+#include <sys/domainset.h>
+
 #include <machine/vmm.h>

 struct vm_memmap {
@@ -49,6 +51,9 @@ struct vm_memseg {
 	int		segid;
 	size_t		len;
 	char		name[VM_MAX_SUFFIXLEN + 1];
+	domainset_t	*ds_mask;
+	size_t		ds_mask_size;
+	int		ds_policy;
 };

 struct vm_register {
diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c
index 2ec736e7f4ac..cae29226d13c 100644
--- a/sys/cam/cam_xpt.c
+++ b/sys/cam/cam_xpt.c
@@ -2515,6 +2515,15 @@ xpt_action(union ccb *start_ccb)
 	    ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
 		xpt_action_name(start_ccb->ccb_h.func_code)));

+	/*
+	 * Either it isn't queued, or it has a real priority. There are still
+	 * too many places that reuse CCBs with a real priority to do immediate
+	 * queries to do the other side of this assert.
+	 */
+	KASSERT((start_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
+	    start_ccb->ccb_h.pinfo.priority != CAM_PRIORITY_NONE,
+	    ("%s: queued ccb and CAM_PRIORITY_NONE illegal.", __func__));
+
 	start_ccb->ccb_h.status = CAM_REQ_INPROG;
 	(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
 }
diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c
index 7f8bf3516804..322141a72707 100644
--- a/sys/cam/mmc/mmc_da.c
+++ b/sys/cam/mmc/mmc_da.c
@@ -1081,7 +1081,7 @@ sdda_start_init_task(void *context, int pending)
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n"));
 	new_ccb = xpt_alloc_ccb();
 	xpt_setup_ccb(&new_ccb->ccb_h, periph->path,
-	    CAM_PRIORITY_NONE);
+	    CAM_PRIORITY_NORMAL);

 	cam_periph_lock(periph);
 	cam_periph_hold(periph, PRIBIO|PCATCH);
diff --git a/sys/cam/mmc/mmc_xpt.c b/sys/cam/mmc/mmc_xpt.c
index 4fce03004994..f5f66f5214a8 100644
--- a/sys/cam/mmc/mmc_xpt.c
+++ b/sys/cam/mmc/mmc_xpt.c
@@ -610,7 +610,6 @@ mmcprobe_start(struct cam_periph *periph, union ccb *start_ccb)
 		CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE,
 		    ("Start with PROBE_RESET\n"));
 		/* FALLTHROUGH */
 	case PROBE_IDENTIFY:
-		xpt_path_inq(&start_ccb->cpi, periph->path);
 		CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE,
 		    ("Start with PROBE_IDENTIFY\n"));
 		init_standard_ccb(start_ccb, XPT_MMC_GET_TRAN_SETTINGS);
 		break;
diff --git a/sys/cddl/dev/sdt/sdt.c b/sys/cddl/dev/sdt/sdt.c
index a8da618204af..0a9059104671 100644
--- a/sys/cddl/dev/sdt/sdt.c
+++ b/sys/cddl/dev/sdt/sdt.c
@@ -72,6 +72,7 @@ static void	sdt_load(void);
 static int	sdt_unload(void);
 static void	sdt_create_provider(struct sdt_provider *);
 static void	sdt_create_probe(struct sdt_probe *);
+static void	sdt_init_probe(struct sdt_probe *, linker_file_t);
 static void	sdt_kld_load(void *, struct linker_file *);
 static void	sdt_kld_unload_try(void *, struct linker_file *, int *);
@@ -204,6 +205,14 @@ sdt_create_probe(struct sdt_probe *probe)
 	(void)dtrace_probe_create(prov->id, mod, func, name, aframes, probe);
 }

+static void
+sdt_init_probe(struct sdt_probe *probe, linker_file_t lf)
+{
+	probe->sdtp_lf = lf;
+	TAILQ_INIT(&probe->argtype_list);
+	STAILQ_INIT(&probe->tracepoint_list);
+}
+
 /*
  * Probes are created through the SDT module load/unload hook, so this function
  * has nothing to do.  It only exists because the DTrace provider framework
@@ -361,12 +370,19 @@ static void
 sdt_kld_load_providers(struct linker_file *lf)
 {
 	struct sdt_provider **prov, **begin, **end;
+	struct sdt_probe **p_begin, **p_end;

 	if (linker_file_lookup_set(lf, "sdt_providers_set", &begin, &end,
 	    NULL) == 0) {
 		for (prov = begin; prov < end; prov++)
 			sdt_create_provider(*prov);
 	}
+
+	if (linker_file_lookup_set(lf, "sdt_probes_set", &p_begin, &p_end,
+	    NULL) == 0) {
+		for (struct sdt_probe **probe = p_begin; probe < p_end; probe++)
+			sdt_init_probe(*probe, lf);
+	}
 }

 static void
@@ -378,13 +394,8 @@ sdt_kld_load_probes(struct linker_file *lf)

 	if (linker_file_lookup_set(lf, "sdt_probes_set", &p_begin, &p_end,
 	    NULL) == 0) {
-		for (struct sdt_probe **probe = p_begin; probe < p_end;
-		    probe++) {
-			(*probe)->sdtp_lf = lf;
+		for (struct sdt_probe **probe = p_begin; probe < p_end; probe++)
 			sdt_create_probe(*probe);
-			TAILQ_INIT(&(*probe)->argtype_list);
-			STAILQ_INIT(&(*probe)->tracepoint_list);
-		}
 	}

 	if (linker_file_lookup_set(lf, "sdt_argtypes_set", &a_begin, &a_end,
diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c
index cfb054235489..1c6d64d6b8bc 100644
--- a/sys/compat/linprocfs/linprocfs.c
+++ b/sys/compat/linprocfs/linprocfs.c
@@ -1911,7 +1911,7 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
 			    "kern.sigqueue.max_pending_per_proc",
 			    &res, &size, 0, 0, 0, 0);
 			if (error != 0)
-				goto out;
+				continue;
 			rl.rlim_cur = res;
 			rl.rlim_max = res;
 			break;
@@ -1919,7 +1919,7 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
 			error = kernel_sysctlbyname(td, "kern.ipc.msgmnb",
 			    &res, &size, 0, 0, 0, 0);
 			if (error != 0)
-				goto out;
+				continue;
 			rl.rlim_cur = res;
 			rl.rlim_max = res;
 			break;
@@ -1941,9 +1941,9 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
 		    li->desc, (unsigned long long)rl.rlim_cur,
 		    (unsigned long long)rl.rlim_max, li->unit);
 	}
-out:
+
 	lim_free(limp);
-	return (error);
+	return (0);
 }

 /*
diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c
index 86834a7ecea8..a4be5313aa96 100644
--- a/sys/compat/linux/linux_file.c
+++ b/sys/compat/linux/linux_file.c
@@ -1792,7 +1792,7 @@ linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
 	if ((flags & MFD_ALLOW_SEALING) != 0)
 		shmflags |= SHM_ALLOW_SEALING;
 	return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL,
-	    memfd_name));
+	    memfd_name, NULL));
 }

 int
diff --git a/sys/conf/files b/sys/conf/files
index dd0d390962f2..b7c19fae0b8e 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3768,6 +3768,7 @@ gnu/gcov/gcov_subr.c		optional gcov
 kern/bus_if.m			standard
 kern/clock_if.m			standard
+kern/coredump_vnode.c		standard
 kern/cpufreq_if.m		standard
 kern/device_if.m		standard
 kern/imgact_binmisc.c		optional imgact_binmisc
@@ -3856,6 +3857,7 @@ kern/kern_time.c		standard
 kern/kern_timeout.c		standard
 kern/kern_tslog.c		optional tslog
 kern/kern_ubsan.c		optional kubsan
+kern/kern_ucoredump.c		standard
 kern/kern_umtx.c		standard
 kern/kern_uuid.c		standard
 kern/kern_vnodedumper.c		standard
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 901da27e63f2..641001efab5e 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -368,6 +368,10 @@ dev/ice/irdma_di_if.m		optional ice pci \
 	compile-with "${NORMAL_M} -I$S/dev/ice"
 dev/ice/ice_ddp_common.c	optional ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c		optional ice pci pci_iov \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c		optional ice pci pci_iov \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
 ice_ddp.c			optional ice_ddp \
 	compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \
 	no-ctfconvert no-implicit-rule before-depend local \
diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
index df206b314b38..9976e9cfec5d 100644
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
@@ -62,6 +62,7 @@ dev/acpi_support/acpi_wmi_if.m	standard
 dev/agp/agp_amd64.c		optional agp
 dev/agp/agp_i810.c		optional agp
 dev/agp/agp_via.c		optional agp
+dev/amdsmu/amdsmu.c		optional amdsmu pci
 dev/amdsbwd/amdsbwd.c		optional amdsbwd
 dev/amdsmn/amdsmn.c		optional amdsmn | amdtemp
 dev/amdtemp/amdtemp.c		optional amdtemp
diff --git a/sys/dev/amdsmu/amdsmu.c b/sys/dev/amdsmu/amdsmu.c
new file mode 100644
index 000000000000..416f875c6176
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu.c
@@ -0,0 +1,466 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/rman.h>
+#include <sys/sysctl.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/amdsmu/amdsmu.h>
+
+static bool
+amdsmu_match(device_t dev, const struct amdsmu_product **product_out)
+{
+	const uint16_t vendorid = pci_get_vendor(dev);
+	const uint16_t deviceid = pci_get_device(dev);
+
+	for (size_t i = 0; i < nitems(amdsmu_products); i++) {
+		const struct amdsmu_product *prod = &amdsmu_products[i];
+
+		if (vendorid == prod->amdsmu_vendorid &&
+		    deviceid == prod->amdsmu_deviceid) {
+			if (product_out != NULL)
+				*product_out = prod;
+			return (true);
+		}
+	}
+	return (false);
+}
+
+static void
+amdsmu_identify(driver_t *driver, device_t parent)
+{
+	if (device_find_child(parent, "amdsmu", -1) != NULL)
+		return;
+
+	if (amdsmu_match(parent, NULL)) {
+		if (device_add_child(parent, "amdsmu", -1) == NULL)
+			device_printf(parent, "add amdsmu child failed\n");
+	}
+}
+
+static int
+amdsmu_probe(device_t dev)
+{
+	if (resource_disabled("amdsmu", 0))
+		return (ENXIO);
+	if (!amdsmu_match(device_get_parent(dev), NULL))
+		return (ENXIO);
+	device_set_descf(dev, "AMD System Management Unit");
+
+	return (BUS_PROBE_GENERIC);
+}
+
+static enum amdsmu_res
+amdsmu_wait_res(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	enum amdsmu_res res;
+
+	/*
+	 * The SMU has a response ready for us when the response register is
+	 * set.  Otherwise, we must wait.
+	 */
+	for (size_t i = 0; i < SMU_RES_READ_MAX; i++) {
+		res = amdsmu_read4(sc, SMU_REG_RESPONSE);
+		if (res != SMU_RES_WAIT)
+			return (res);
+		pause_sbt("amdsmu", ustosbt(SMU_RES_READ_PERIOD_US), 0,
+		    C_HARDCLOCK);
+	}
+	device_printf(dev, "timed out waiting for response from SMU\n");
+	return (SMU_RES_WAIT);
+}
+
+static int
+amdsmu_cmd(device_t dev, enum amdsmu_msg msg, uint32_t arg, uint32_t *ret)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	enum amdsmu_res res;
+
+	/* Wait for SMU to be ready. */
+	if (amdsmu_wait_res(dev) == SMU_RES_WAIT)
+		return (ETIMEDOUT);
+
+	/* Clear previous response.
+	 */
+	amdsmu_write4(sc, SMU_REG_RESPONSE, SMU_RES_WAIT);
+
+	/* Write out command to registers. */
+	amdsmu_write4(sc, SMU_REG_MESSAGE, msg);
+	amdsmu_write4(sc, SMU_REG_ARGUMENT, arg);
+
+	/* Wait for SMU response and handle it. */
+	res = amdsmu_wait_res(dev);
+
+	switch (res) {
+	case SMU_RES_WAIT:
+		return (ETIMEDOUT);
+	case SMU_RES_OK:
+		if (ret != NULL)
+			*ret = amdsmu_read4(sc, SMU_REG_ARGUMENT);
+		return (0);
+	case SMU_RES_REJECT_BUSY:
+		device_printf(dev, "SMU is busy\n");
+		return (EBUSY);
+	case SMU_RES_REJECT_PREREQ:
+	case SMU_RES_UNKNOWN:
+	case SMU_RES_FAILED:
+		device_printf(dev, "SMU error: %02x\n", res);
+		return (EIO);
+	}
+
+	return (EINVAL);
+}
+
+static int
+amdsmu_get_vers(device_t dev)
+{
+	int err;
+	uint32_t smu_vers;
+	struct amdsmu_softc *sc = device_get_softc(dev);
+
+	err = amdsmu_cmd(dev, SMU_MSG_GETSMUVERSION, 0, &smu_vers);
+	if (err != 0) {
+		device_printf(dev, "failed to get SMU version\n");
+		return (err);
+	}
+	sc->smu_program = (smu_vers >> 24) & 0xFF;
+	sc->smu_maj = (smu_vers >> 16) & 0xFF;
+	sc->smu_min = (smu_vers >> 8) & 0xFF;
+	sc->smu_rev = smu_vers & 0xFF;
+	device_printf(dev, "SMU version: %d.%d.%d (program %d)\n",
+	    sc->smu_maj, sc->smu_min, sc->smu_rev, sc->smu_program);
+
+	return (0);
+}
+
+static int
+amdsmu_get_ip_blocks(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	const uint16_t deviceid = pci_get_device(dev);
+	int err;
+	struct amdsmu_metrics *m = &sc->metrics;
+	bool active;
+	char sysctl_descr[32];
+
+	/* Get IP block count. */
+	switch (deviceid) {
+	case PCI_DEVICEID_AMD_REMBRANDT_ROOT:
+		sc->ip_block_count = 12;
+		break;
+	case PCI_DEVICEID_AMD_PHOENIX_ROOT:
+		sc->ip_block_count = 21;
+		break;
+	/* TODO How many IP blocks does Strix Point (and the others) have? */
+	case PCI_DEVICEID_AMD_STRIX_POINT_ROOT:
+	default:
+		sc->ip_block_count = nitems(amdsmu_ip_blocks_names);
+	}
+	KASSERT(sc->ip_block_count <= nitems(amdsmu_ip_blocks_names),
+	    ("too many IP blocks for array"));
+
+	/* Get and print out IP blocks. */
+	err = amdsmu_cmd(dev, SMU_MSG_GET_SUP_CONSTRAINTS, 0,
+	    &sc->active_ip_blocks);
+	if (err != 0) {
+		device_printf(dev, "failed to get IP blocks\n");
+		return (err);
+	}
+	device_printf(dev, "Active IP blocks: ");
+	for (size_t i = 0; i < sc->ip_block_count; i++) {
+		active = (sc->active_ip_blocks & (1 << i)) != 0;
+		sc->ip_blocks_active[i] = active;
+		if (!active)
+			continue;
+		printf("%s%s", amdsmu_ip_blocks_names[i],
+		    i + 1 < sc->ip_block_count ? " " : "\n");
+	}
+
+	/* Create a sysctl node for IP blocks. */
+	sc->ip_blocks_sysctlnode = SYSCTL_ADD_NODE(sc->sysctlctx,
+	    SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO, "ip_blocks",
+	    CTLFLAG_RD, NULL, "SMU metrics");
+	if (sc->ip_blocks_sysctlnode == NULL) {
+		device_printf(dev, "could not add sysctl node for IP blocks\n");
+		return (ENOMEM);
+	}
+
+	/* Create a sysctl node for each IP block. */
+	for (size_t i = 0; i < sc->ip_block_count; i++) {
+		/* Create the sysctl node itself for the IP block. */
+		snprintf(sysctl_descr, sizeof sysctl_descr,
+		    "Metrics about the %s AMD IP block",
+		    amdsmu_ip_blocks_names[i]);
+		sc->ip_block_sysctlnodes[i] = SYSCTL_ADD_NODE(sc->sysctlctx,
+		    SYSCTL_CHILDREN(sc->ip_blocks_sysctlnode), OID_AUTO,
+		    amdsmu_ip_blocks_names[i], CTLFLAG_RD, NULL, sysctl_descr);
+		if (sc->ip_block_sysctlnodes[i] == NULL) {
+			device_printf(dev,
+			    "could not add sysctl node for \"%s\"\n", sysctl_descr);
+			continue;
+		}
+		/*
+		 * Create sysctls for if the IP block is currently active, last
+		 * active time, and total active time.
+		 */
+		SYSCTL_ADD_BOOL(sc->sysctlctx,
+		    SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+		    "active", CTLFLAG_RD, &sc->ip_blocks_active[i], 0,
+		    "IP block is currently active");
+		SYSCTL_ADD_U64(sc->sysctlctx,
+		    SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+		    "last_time", CTLFLAG_RD, &m->ip_block_last_active_time[i],
+		    0, "How long the IP block was active for during the last"
+		    " sleep (us)");
+#ifdef IP_BLOCK_TOTAL_ACTIVE_TIME
+		SYSCTL_ADD_U64(sc->sysctlctx,
+		    SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+		    "total_time", CTLFLAG_RD, &m->ip_block_total_active_time[i],
+		    0, "How long the IP block was active for during sleep in"
+		    " total (us)");
+#endif
+	}
+	return (0);
+}
+
+static int
+amdsmu_init_metrics(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	int err;
+	uint32_t metrics_addr_lo, metrics_addr_hi;
+	uint64_t metrics_addr;
+
+	/* Get physical address of logging buffer. */
+	err = amdsmu_cmd(dev, SMU_MSG_LOG_GETDRAM_ADDR_LO, 0, &metrics_addr_lo);
+	if (err != 0)
+		return (err);
+	err = amdsmu_cmd(dev, SMU_MSG_LOG_GETDRAM_ADDR_HI, 0, &metrics_addr_hi);
+	if (err != 0)
+		return (err);
+	metrics_addr = ((uint64_t) metrics_addr_hi << 32) | metrics_addr_lo;
+
+	/* Map memory of logging buffer. */
+	err = bus_space_map(sc->bus_tag, metrics_addr,
+	    sizeof(struct amdsmu_metrics), 0, &sc->metrics_space);
+	if (err != 0) {
+		device_printf(dev, "could not map bus space for SMU metrics\n");
+		return (err);
+	}
+
+	/* Start logging for metrics. */
+	amdsmu_cmd(dev, SMU_MSG_LOG_RESET, 0, NULL);
+	amdsmu_cmd(dev, SMU_MSG_LOG_START, 0, NULL);
+	return (0);
+}
+
+static int
+amdsmu_dump_metrics(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	int err;
+
+	err = amdsmu_cmd(dev, SMU_MSG_LOG_DUMP_DATA, 0, NULL);
+	if (err != 0) {
+		device_printf(dev, "failed to dump metrics\n");
+		return (err);
+	}
+	bus_space_read_region_4(sc->bus_tag, sc->metrics_space, 0,
+	    (uint32_t *)&sc->metrics, sizeof(sc->metrics) / sizeof(uint32_t));
+
+	return (0);
+}
+
+static void
+amdsmu_fetch_idlemask(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+
+	sc->idlemask = amdsmu_read4(sc, SMU_REG_IDLEMASK);
+}
+
+static int
+amdsmu_attach(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	int err;
+	uint32_t physbase_addr_lo, physbase_addr_hi;
+	uint64_t physbase_addr;
+	int rid = 0;
+	struct sysctl_oid *node;
+
+	/*
+	 * Find physical base address for SMU.
+	 * XXX I am a little confused about the masks here.  I'm just copying
+	 * what Linux does in the amd-pmc driver to get the base address.
+	 */
+	pci_write_config(dev, SMU_INDEX_ADDRESS, SMU_PHYSBASE_ADDR_LO, 4);
+	physbase_addr_lo = pci_read_config(dev, SMU_INDEX_DATA, 4) & 0xFFF00000;
+
+	pci_write_config(dev, SMU_INDEX_ADDRESS, SMU_PHYSBASE_ADDR_HI, 4);
+	physbase_addr_hi = pci_read_config(dev, SMU_INDEX_DATA, 4) & 0x0000FFFF;
+
+	physbase_addr = (uint64_t)physbase_addr_hi << 32 | physbase_addr_lo;
+
+	/* Map memory for SMU and its registers.
+	 */
+	sc->res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
+	if (sc->res == NULL) {
+		device_printf(dev, "could not allocate resource\n");
+		return (ENXIO);
+	}
+
+	sc->bus_tag = rman_get_bustag(sc->res);
+
+	if (bus_space_map(sc->bus_tag, physbase_addr,
+	    SMU_MEM_SIZE, 0, &sc->smu_space) != 0) {
+		device_printf(dev, "could not map bus space for SMU\n");
+		err = ENXIO;
+		goto err_smu_space;
+	}
+	if (bus_space_map(sc->bus_tag, physbase_addr + SMU_REG_SPACE_OFF,
+	    SMU_MEM_SIZE, 0, &sc->reg_space) != 0) {
+		device_printf(dev, "could not map bus space for SMU regs\n");
+		err = ENXIO;
+		goto err_reg_space;
+	}
+
+	/* sysctl stuff. */
+	sc->sysctlctx = device_get_sysctl_ctx(dev);
+	sc->sysctlnode = device_get_sysctl_tree(dev);
+
+	/* Get version & add sysctls. */
+	if ((err = amdsmu_get_vers(dev)) != 0)
+		goto err_dump;
+
+	SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+	    "program", CTLFLAG_RD, &sc->smu_program, 0, "SMU program number");
+	SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+	    "version_major", CTLFLAG_RD, &sc->smu_maj, 0,
+	    "SMU firmware major version number");
+	SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+	    "version_minor", CTLFLAG_RD, &sc->smu_min, 0,
+	    "SMU firmware minor version number");
+	SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+	    "version_revision", CTLFLAG_RD, &sc->smu_rev, 0,
+	    "SMU firmware revision number");
+
+	/* Set up for getting metrics & add sysctls. */
+	if ((err = amdsmu_init_metrics(dev)) != 0)
+		goto err_dump;
+	if ((err = amdsmu_dump_metrics(dev)) != 0)
+		goto err_dump;
+
+	node = SYSCTL_ADD_NODE(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode),
+	    OID_AUTO, "metrics", CTLFLAG_RD, NULL, "SMU metrics");
+	if (node == NULL) {
+		device_printf(dev, "could not add sysctl node for metrics\n");
+		err = ENOMEM;
+		goto err_dump;
+	}
+
+	SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "table_version", CTLFLAG_RD, &sc->metrics.table_version, 0,
+	    "SMU metrics table version");
+	SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "hint_count", CTLFLAG_RD, &sc->metrics.hint_count, 0,
+	    "How many times the sleep hint was set");
+	SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "s0i3_last_entry_status", CTLFLAG_RD,
+	    &sc->metrics.s0i3_last_entry_status, 0,
+	    "1 if last S0i3 entry was successful");
+	SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "time_last_in_s0i2", CTLFLAG_RD, &sc->metrics.time_last_in_s0i2, 0,
+	    "Time spent in S0i2 during last sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "time_last_entering_s0i3", CTLFLAG_RD,
+	    &sc->metrics.time_last_entering_s0i3, 0,
+	    "Time spent entering S0i3 during last sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "total_time_entering_s0i3", CTLFLAG_RD,
+	    &sc->metrics.total_time_entering_s0i3, 0,
+	    "Total time spent entering S0i3 (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "time_last_resuming", CTLFLAG_RD, &sc->metrics.time_last_resuming,
+	    0, "Time spent resuming from last sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "total_time_resuming", CTLFLAG_RD, &sc->metrics.total_time_resuming,
+	    0, "Total time spent resuming from sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "time_last_in_s0i3", CTLFLAG_RD, &sc->metrics.time_last_in_s0i3, 0,
+	    "Time spent in S0i3 during last sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "total_time_in_s0i3", CTLFLAG_RD, &sc->metrics.total_time_in_s0i3,
+	    0, "Total time spent in S0i3 (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "time_last_in_sw_drips", CTLFLAG_RD,
+	    &sc->metrics.time_last_in_sw_drips, 0,
+	    "Time spent in awake during last sleep (us)");
+	SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+	    "total_time_in_sw_drips", CTLFLAG_RD,
+	    &sc->metrics.total_time_in_sw_drips, 0,
+	    "Total time spent awake (us)");
+
+	/* Get IP blocks & add sysctls. */
+	err = amdsmu_get_ip_blocks(dev);
+	if (err != 0)
+		goto err_dump;
+
+	/* Get idlemask & add sysctl. */
+	amdsmu_fetch_idlemask(dev);
+	SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+	    "idlemask", CTLFLAG_RD, &sc->idlemask, 0, "SMU idlemask. This "
+	    "value is not documented - only used to help AMD internally debug "
+	    "issues");
+
+	return (0);
+err_dump:
+	bus_space_unmap(sc->bus_tag, sc->reg_space, SMU_MEM_SIZE);
+err_reg_space:
+	bus_space_unmap(sc->bus_tag, sc->smu_space, SMU_MEM_SIZE);
+err_smu_space:
+	bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
+	return (err);
+}
+
+static int
+amdsmu_detach(device_t dev)
+{
+	struct amdsmu_softc *sc = device_get_softc(dev);
+	int rid = 0;
+
+	bus_space_unmap(sc->bus_tag, sc->smu_space, SMU_MEM_SIZE);
+	bus_space_unmap(sc->bus_tag, sc->reg_space, SMU_MEM_SIZE);
+
+	bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
+	return (0);
+}
+
+static device_method_t amdsmu_methods[] = {
+	DEVMETHOD(device_identify, amdsmu_identify),
+	DEVMETHOD(device_probe, amdsmu_probe),
+	DEVMETHOD(device_attach, amdsmu_attach),
+	DEVMETHOD(device_detach, amdsmu_detach),
+	DEVMETHOD_END
+};
+
+static driver_t amdsmu_driver = {
+	"amdsmu",
+	amdsmu_methods,
+	sizeof(struct amdsmu_softc),
+};
+
+DRIVER_MODULE(amdsmu, hostb, amdsmu_driver, NULL, NULL);
+MODULE_VERSION(amdsmu, 1);
+MODULE_DEPEND(amdsmu, amdsmn, 1, 1, 1);
+MODULE_PNP_INFO("U16:vendor;U16:device", pci, amdsmu, amdsmu_products,
+    nitems(amdsmu_products));
diff --git a/sys/dev/amdsmu/amdsmu.h b/sys/dev/amdsmu/amdsmu.h
new file mode 100644
index 000000000000..025887f7fe5a
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu.h
@@ -0,0 +1,95 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+#ifndef _AMDSMU_H_
+#define	_AMDSMU_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <machine/bus.h>
+#include <x86/cputypes.h>
+
+#include <dev/amdsmu/amdsmu_reg.h>
+
+#define	SMU_RES_READ_PERIOD_US	50
+#define	SMU_RES_READ_MAX	20000
+
+static const struct amdsmu_product {
+	uint16_t	amdsmu_vendorid;
+	uint16_t	amdsmu_deviceid;
+} amdsmu_products[] = {
+	{ CPU_VENDOR_AMD,	PCI_DEVICEID_AMD_REMBRANDT_ROOT },
+	{ CPU_VENDOR_AMD,	PCI_DEVICEID_AMD_PHOENIX_ROOT },
+	{ CPU_VENDOR_AMD,	PCI_DEVICEID_AMD_STRIX_POINT_ROOT },
+};
+
+static const char *const amdsmu_ip_blocks_names[] = {
+	"DISPLAY",
+	"CPU",
+	"GFX",
+	"VDD",
+	"ACP",
+	"VCN",
+	"ISP",
+	"NBIO",
+	"DF",
+	"USB3_0",
+	"USB3_1",
+	"LAPIC",
+	"USB3_2",
+	"USB3_3",
+	"USB3_4",
+	"USB4_0",
+	"USB4_1",
+	"MPM",
+	"JPEG",
+	"IPU",
+	"UMSCH",
+	"VPE",
+};
+
+CTASSERT(nitems(amdsmu_ip_blocks_names) <= 32);
+
+struct amdsmu_softc {
+	struct sysctl_ctx_list	*sysctlctx;
+	struct sysctl_oid	*sysctlnode;
+
+	struct resource		*res;
+	bus_space_tag_t		bus_tag;
+
+	bus_space_handle_t	smu_space;
+	bus_space_handle_t	reg_space;
+
+	uint8_t			smu_program;
+	uint8_t			smu_maj, smu_min, smu_rev;
+
+	uint32_t		active_ip_blocks;
+	struct sysctl_oid	*ip_blocks_sysctlnode;
+	size_t			ip_block_count;
+	struct sysctl_oid	*ip_block_sysctlnodes[nitems(amdsmu_ip_blocks_names)];
+	bool			ip_blocks_active[nitems(amdsmu_ip_blocks_names)];
+
+	bus_space_handle_t	metrics_space;
+	struct amdsmu_metrics	metrics;
+	uint32_t		idlemask;
+};
+
+static inline uint32_t
+amdsmu_read4(const struct amdsmu_softc *sc, bus_size_t reg)
+{
+	return (bus_space_read_4(sc->bus_tag, sc->reg_space, reg));
+}
+
+static inline void
+amdsmu_write4(const struct amdsmu_softc *sc, bus_size_t reg, uint32_t val)
+{
+	bus_space_write_4(sc->bus_tag, sc->reg_space, reg, val);
+}
+
+#endif /* _AMDSMU_H_ */
diff --git a/sys/dev/amdsmu/amdsmu_reg.h b/sys/dev/amdsmu/amdsmu_reg.h
new file mode 100644
index 000000000000..e685b34e6883
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu_reg.h
@@ -0,0 +1,84 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+#ifndef _AMDSMU_REG_H_
+#define	_AMDSMU_REG_H_
+
+#include <sys/types.h>
+
+/*
+ * TODO These are in common with amdtemp; should we find a way to factor these
+ * out?  Also, there are way more of these.  I couldn't find a centralized place
+ * which lists them though.
+ */
+#define	PCI_DEVICEID_AMD_REMBRANDT_ROOT		0x14B5
+#define	PCI_DEVICEID_AMD_PHOENIX_ROOT		0x14E8
+#define	PCI_DEVICEID_AMD_STRIX_POINT_ROOT	0x14A4
+
+#define	SMU_INDEX_ADDRESS	0xB8
+#define	SMU_INDEX_DATA		0xBC
+
+#define	SMU_PHYSBASE_ADDR_LO	0x13B102E8
+#define	SMU_PHYSBASE_ADDR_HI	0x13B102EC
+
+#define	SMU_MEM_SIZE		0x1000
+#define	SMU_REG_SPACE_OFF	0x10000
+
+#define	SMU_REG_MESSAGE		0x538
+#define	SMU_REG_RESPONSE	0x980
+#define	SMU_REG_ARGUMENT	0x9BC
+#define	SMU_REG_IDLEMASK	0xD14
+
+enum amdsmu_res {
+	SMU_RES_WAIT		= 0x00,
+	SMU_RES_OK		= 0x01,
+	SMU_RES_REJECT_BUSY	= 0xFC,
+	SMU_RES_REJECT_PREREQ	= 0xFD,
+	SMU_RES_UNKNOWN		= 0xFE,
+	SMU_RES_FAILED		= 0xFF,
+};
+
+enum amdsmu_msg {
+	SMU_MSG_GETSMUVERSION		= 0x02,
+	SMU_MSG_LOG_GETDRAM_ADDR_HI	= 0x04,
+	SMU_MSG_LOG_GETDRAM_ADDR_LO	= 0x05,
+	SMU_MSG_LOG_START		= 0x06,
+	SMU_MSG_LOG_RESET		= 0x07,
+	SMU_MSG_LOG_DUMP_DATA		= 0x08,
+	SMU_MSG_GET_SUP_CONSTRAINTS	= 0x09,
+};
+
+/* XXX Copied from Linux struct smu_metrics.
+ */
+struct amdsmu_metrics {
+	uint32_t	table_version;
+	uint32_t	hint_count;
+	uint32_t	s0i3_last_entry_status;
+	uint32_t	time_last_in_s0i2;
+	uint64_t	time_last_entering_s0i3;
+	uint64_t	total_time_entering_s0i3;
+	uint64_t	time_last_resuming;
+	uint64_t	total_time_resuming;
+	uint64_t	time_last_in_s0i3;
+	uint64_t	total_time_in_s0i3;
+	uint64_t	time_last_in_sw_drips;
+	uint64_t	total_time_in_sw_drips;
+	/*
+	 * This is how long each IP block was active for (us), i.e., blocking
+	 * entry to S0i3.  In Linux, these are called "timecondition_notmet_*".
+	 *
+	 * XXX Total active time for IP blocks seems to be buggy and reporting
+	 * garbage (at least on Phoenix), so it's disabled for now.  The last
+	 * active time for the USB4_0 IP block also seems to be buggy.
+	 */
+	uint64_t	ip_block_last_active_time[32];
+#ifdef IP_BLOCK_TOTAL_ACTIVE_TIME
+	uint64_t	ip_block_total_active_time[32];
+#endif
+} __attribute__((packed));
+
+#endif /* _AMDSMU_REG_H_ */
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 8547f21586e1..7a6b1cbdd736 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -703,7 +703,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;

-			if ((m->m_flags & M_NOTAVAIL) != 0)
+			if ((m->m_flags & M_NOTREADY) != 0)
 				break;
 			if (m->m_flags & M_EXTPG) {
 #ifdef KERN_TLS
@@ -787,7 +787,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)

 		/* nothing to send */
 		if (plen == 0) {
-			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
+			KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0,
 			    ("%s: nothing to send, but m != NULL is ready",
 			    __func__));
 			break;
@@ -880,7 +880,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 		toep->txsd_avail--;

 		t4_l2t_send(sc, wr, toep->l2te);
-	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);
+	} while (m != NULL && (m->m_flags & M_NOTREADY) == 0);

 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c
index c6377980fca9..27c16b9988ae 100644
--- a/sys/dev/cxgbe/tom/t4_tls.c
+++ b/sys/dev/cxgbe/tom/t4_tls.c
@@ -563,7 +563,7 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 		 * If there is no ready data to send, wait until more
 		 * data arrives.
 		 */
-		if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) {
+		if (m == NULL || (m->m_flags & M_NOTREADY) != 0) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
@@ -614,7 +614,7 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)

 		/* Shove if there is no additional data pending. */
 		shove = ((m->m_next == NULL ||
-		    (m->m_next->m_flags & M_NOTAVAIL) != 0)) &&
+		    (m->m_next->m_flags & M_NOTREADY) != 0)) &&
 		    (tp->t_flags & TF_MORETOCOME) == 0;

 		if (sb->sb_flags & SB_AUTOSIZE &&
diff --git a/sys/dev/drm2/drm_fb_helper.c b/sys/dev/drm2/drm_fb_helper.c
index f67cc9f60d02..1f4abd255690 100644
--- a/sys/dev/drm2/drm_fb_helper.c
+++ b/sys/dev/drm2/drm_fb_helper.c
@@ -51,7 +51,7 @@ struct vt_kms_softc {
 	struct task	fb_mode_task;
 };

-/* Call restore out of vt(9) locks. */
+/* Call restore out of vt(4) locks.
+ */
 static void
 vt_restore_fbdev_mode(void *arg, int pending)
 {
diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c
index b0fa33daeca7..b55c1c191077 100644
--- a/sys/dev/efidev/efirt.c
+++ b/sys/dev/efidev/efirt.c
@@ -107,7 +107,8 @@ static int efi_status2err[25] = {

 enum efi_table_type {
 	TYPE_ESRT = 0,
-	TYPE_PROP
+	TYPE_PROP,
+	TYPE_MEMORY_ATTR
 };

 static int efi_enter(void);
@@ -445,6 +446,42 @@ get_table_length(enum efi_table_type type, size_t *table_len, void **taddr)
 		free(buf, M_TEMP);
 		return (0);
 	}
+	case TYPE_MEMORY_ATTR:
+	{
+		efi_guid_t guid = EFI_MEMORY_ATTRIBUTES_TABLE;
+		struct efi_memory_attribute_table *tbl_addr, *mem_addr;
+		int error;
+		void *buf;
+		size_t len = sizeof(struct efi_memory_attribute_table);
+
+		error = efi_get_table(&guid, (void **)&tbl_addr);
+		if (error)
+			return (error);
+
+		buf = malloc(len, M_TEMP, M_WAITOK);
+		error = physcopyout((vm_paddr_t)tbl_addr, buf, len);
+		if (error) {
+			free(buf, M_TEMP);
+			return (error);
+		}
+
+		mem_addr = (struct efi_memory_attribute_table *)buf;
+		if (mem_addr->version != 2) {
+			free(buf, M_TEMP);
+			return (EINVAL);
+		}
+		len += mem_addr->descriptor_size * mem_addr->num_ents;
+		if (len > EFI_TABLE_ALLOC_MAX) {
+			free(buf, M_TEMP);
+			return (ENOMEM);
+		}
+
+		*table_len = len;
+		if (taddr != NULL)
+			*taddr = tbl_addr;
+		free(buf, M_TEMP);
+		return (0);
+	}
 	}
 	return (ENOENT);
 }
@@ -457,7 +494,8 @@ copy_table(efi_guid_t *guid, void **buf, size_t buf_len, size_t *table_len)
 		enum efi_table_type type;
 	} tables[] = {
 		{ EFI_TABLE_ESRT, TYPE_ESRT },
-		{ EFI_PROPERTIES_TABLE, TYPE_PROP }
+		{ EFI_PROPERTIES_TABLE, TYPE_PROP },
+		{ EFI_MEMORY_ATTRIBUTES_TABLE, TYPE_MEMORY_ATTR }
 	};
 	size_t table_idx;
 	void *taddr;
diff --git a/sys/dev/iicbus/iichid.c b/sys/dev/iicbus/iichid.c
index 9c0324a24685..3f1d7a0cefba 100644
--- a/sys/dev/iicbus/iichid.c
+++ b/sys/dev/iicbus/iichid.c
@@ -275,62 +275,36 @@ iichid_cmd_read(struct iichid_softc* sc, void *buf, iichid_size_t maxlen,
 	 * 6.1.3 - Retrieval of Input Reports
 	 * DEVICE returns the length (2 Bytes) and the entire Input Report.
 	 */
-	uint8_t actbuf[2] = { 0, 0 };
-	/* Read actual input report length. */
+
+	memset(buf, 0xaa, 2); // In case nothing gets read
 	struct iic_msg msgs[] = {
-	    { sc->addr, IIC_M_RD | IIC_M_NOSTOP, sizeof(actbuf), actbuf },
+	    { sc->addr, IIC_M_RD, maxlen, buf },
 	};
-	uint16_t actlen;
 	int error;

 	error = iicbus_transfer(sc->dev, msgs, nitems(msgs));
 	if (error != 0)
 		return (error);

-	actlen = actbuf[0] | actbuf[1] << 8;
-#ifdef IICHID_SAMPLING
-	if ((actlen == 0 && sc->sampling_rate_slow < 0) ||
-	    (maxlen == 0 && sc->sampling_rate_slow >= 0)) {
-#else
+	DPRINTFN(sc, 5, "%*D\n", msgs[0].len, msgs[0].buf, " ");
+
+	uint16_t actlen = le16dec(buf);
+	if (actlen == 0) {
-#endif
-		/* Read and discard reset command response. */
-		msgs[0] = (struct iic_msg)
-		    { sc->addr, IIC_M_RD | IIC_M_NOSTART,
-		      le16toh(sc->desc.wMaxInputLength) - 2, sc->intr_buf };
-		actlen = 0;
 		if (!sc->reset_acked) {
 			mtx_lock(&sc->mtx);
 			sc->reset_acked = true;
 			wakeup(&sc->reset_acked);
 			mtx_unlock(&sc->mtx);
 		}
-#ifdef IICHID_SAMPLING
-	} else if ((actlen <= 2 || actlen == 0xFFFF) &&
-	    sc->sampling_rate_slow >= 0) {
-		/* Read and discard 1 byte to send I2C STOP condition. */
-		msgs[0] = (struct iic_msg)
-		    { sc->addr, IIC_M_RD | IIC_M_NOSTART, 1, actbuf };
-		actlen = 0;
-#endif
-	} else {
-		actlen -= 2;
-		if (actlen > maxlen) {
-			DPRINTF(sc, "input report too big. requested=%d "
-			    "received=%d\n", maxlen, actlen);
-			actlen = maxlen;
-		}
-		/* Read input report itself. */
-		msgs[0] = (struct iic_msg)
-		    { sc->addr, IIC_M_RD | IIC_M_NOSTART, actlen, buf };
 	}
-	error = iicbus_transfer(sc->dev, msgs, 1);
-	if (error == 0 && actual_len != NULL)
+	if (actlen <= 2 || actlen > maxlen) {
+		actlen = 0;
+	}
+	if (actual_len != NULL) {
 		*actual_len = actlen;
-
-	DPRINTFN(sc, 5,
-	    "%*D - %*D\n", 2, actbuf, " ", msgs[0].len, msgs[0].buf, " ");
+	}

 	return (error);
 }
@@ -566,7 +540,7 @@ iichid_sampling_task(void *context, int pending)
 	error = iichid_cmd_read(sc, sc->intr_buf, sc->intr_bufsize, &actual);
 	if (error == 0) {
 		if (actual > 0) {
-			sc->intr_handler(sc->intr_ctx, sc->intr_buf, actual);
+			sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, actual);
 			sc->missing_samples = 0;
 			if (sc->dup_size != actual ||
 			    memcmp(sc->dup_buf, sc->intr_buf, actual) != 0) {
@@ -577,7 +551,7 @@ iichid_sampling_task(void *context, int pending)
 				++sc->dup_samples;
 		} else {
 			if (++sc->missing_samples == 1)
-				sc->intr_handler(sc->intr_ctx, sc->intr_buf, 0);
+				sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, 0);
 			sc->dup_samples = 0;
 		}
 	} else
@@ -632,7 +606,7 @@ iichid_intr(void *context)
 	if (error == 0) {
 		if (sc->power_on && sc->open) {
 			if (actual != 0)
-				sc->intr_handler(sc->intr_ctx, sc->intr_buf,
+				sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2,
 				    actual);
 			else
 				DPRINTF(sc, "no data received\n");
@@ -842,11 +816,12 @@ iichid_intr_setup(device_t dev, device_t child __unused, hid_intr_t intr,
 	sc = device_get_softc(dev);
 	/*
-	 * Do not rely on wMaxInputLength, as some devices may set it to
-	 * a wrong length. Find the longest input report in report descriptor.
+	 * Do not rely just on wMaxInputLength, as some devices (which?)
+	 * may set it to a wrong length. Also find the longest input report
+	 * in report descriptor, and add two for the length field.
 	 */
-	rdesc->rdsize =
-	    MAX(rdesc->isize, le16toh(sc->desc.wMaxInputLength) - 2);
+	rdesc->rdsize = 2 +
+	    MAX(rdesc->isize, le16toh(sc->desc.wMaxInputLength));
 	/* Write and get/set_report sizes are limited by I2C-HID protocol.
 	 */
 	rdesc->grsize = rdesc->srsize = IICHID_SIZE_MAX;
 	rdesc->wrsize = IICHID_SIZE_MAX;
@@ -919,7 +894,7 @@ iichid_intr_poll(device_t dev, device_t child __unused)
 	sc = device_get_softc(dev);
 	error = iichid_cmd_read(sc, sc->intr_buf, sc->intr_bufsize, &actual);
 	if (error == 0 && actual != 0)
-		sc->intr_handler(sc->intr_ctx, sc->intr_buf, actual);
+		sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, actual);
 }

 /*
@@ -946,6 +921,7 @@ iichid_read(device_t dev, device_t child __unused, void *buf,
 {
 	struct iichid_softc *sc;
 	device_t parent;
+	uint8_t *tmpbuf;
 	int error;

 	if (maxlen > IICHID_SIZE_MAX)
@@ -954,8 +930,12 @@ iichid_read(device_t dev, device_t child __unused, void *buf,
 	parent = device_get_parent(sc->dev);
 	error = iicbus_request_bus(parent, sc->dev, IIC_WAIT);
 	if (error == 0) {
-		error = iichid_cmd_read(sc, buf, maxlen, actlen);
+		tmpbuf = malloc(maxlen + 2, M_DEVBUF, M_WAITOK | M_ZERO);
+		error = iichid_cmd_read(sc, tmpbuf, maxlen + 2, actlen);
 		iicbus_release_bus(parent, sc->dev);
+		if (*actlen > 0)
+			memcpy(buf, tmpbuf + 2, *actlen);
+		free(tmpbuf, M_DEVBUF);
 	}
 	return (iic2errno(error));
 }
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 29dc0c880e3a..ec1664fac701 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -89,6 +89,8 @@
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/disk.h>
+#include <sys/param.h>
+#include <sys/bus.h>

 #include <geom/geom.h>
 #include <geom/geom_int.h>
@@ -2082,8 +2084,10 @@ g_md_init(struct g_class *mp __unused)
 {
 	caddr_t mod;
 	u_char *ptr, *name, *type;
+	u_char scratch[40];
 	unsigned len;
 	int i;
+	vm_offset_t paddr;

 	/* figure out log2(NINDIR) */
 	for (i = NINDIR, nshift = -1; i; nshift++)
@@ -2123,6 +2127,25 @@ g_md_init(struct g_class *mp __unused)
 			sx_xunlock(&md_sx);
 		}
 	}
+
+	/*
+	 * Load up to 32 pre-loaded disks
+	 */
+	for (int i = 0; i < 32; i++) {
+		if (resource_long_value("md", i, "physaddr",
+		    (long *) &paddr) != 0 ||
+		    resource_int_value("md", i, "len", &len) != 0)
+			break;
+		ptr = (char *)pmap_map(NULL, paddr, paddr + len, VM_PROT_READ);
+		if (ptr != NULL && len != 0) {
+			sprintf(scratch, "preload%d 0x%016jx", i,
+			    (uintmax_t)paddr);
+			sx_xlock(&md_sx);
+			md_preloaded(ptr, len, scratch);
+			sx_xunlock(&md_sx);
+		}
+	}
+
 	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 73a7cee4aad0..fd7f00ced14b 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -48,7 +48,7 @@
 #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */

 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
-    struct nvme_async_event_request *aer);
+    struct nvme_async_event_request *aer);

 static void
 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -680,96 +680,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
 }

 static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
-{
-	struct nvme_async_event_request *aer = arg;
-	struct nvme_health_information_page *health_info;
-	struct nvme_ns_list *nsl;
-	struct nvme_error_information_entry *err;
-	int i;
-
-	/*
-	 * If the log page fetch for some reason completed with an error,
-	 * don't pass log page data to the consumers. In practice, this case
-	 * should never happen.
-	 */
-	if (nvme_completion_is_error(cpl))
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, NULL, 0);
-	else {
-		/* Convert data to host endian */
-		switch (aer->log_page_id) {
-		case NVME_LOG_ERROR:
-			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
-			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
-				nvme_error_information_entry_swapbytes(err++);
-			break;
-		case NVME_LOG_HEALTH_INFORMATION:
-			nvme_health_information_page_swapbytes(
-			    (struct nvme_health_information_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_CHANGED_NAMESPACE:
-			nvme_ns_list_swapbytes(
-			    (struct nvme_ns_list *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_COMMAND_EFFECT:
-			nvme_command_effects_page_swapbytes(
-			    (struct nvme_command_effects_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_RES_NOTIFICATION:
-			nvme_res_notification_page_swapbytes(
-			    (struct nvme_res_notification_page *)aer->log_page_buffer);
-			break;
-		case NVME_LOG_SANITIZE_STATUS:
-			nvme_sanitize_status_page_swapbytes(
-			    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
-			break;
-		default:
-			break;
-		}
-
-		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
-			health_info = (struct nvme_health_information_page *)
-			    aer->log_page_buffer;
-			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
-			    health_info->critical_warning);
-			/*
-			 * Critical warnings reported through the
-			 * SMART/health log page are persistent, so
-			 * clear the associated bits in the async event
-			 * config so that we do not receive repeated
-			 * notifications for the same event.
-			 */
-			aer->ctrlr->async_event_config &=
-			    ~health_info->critical_warning;
-			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
-			    aer->ctrlr->async_event_config, NULL, NULL);
-		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
-		    !nvme_use_nvd) {
-			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
-			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
-				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
-					break;
-				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
-			}
-		}
-
-		/*
-		 * Pass the cpl data from the original async event completion,
-		 * not the log page fetch.
-		 */
-		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
-	}
-
-	/*
-	 * Repost another asynchronous event request to replace the one
-	 * that just completed.
-	 */
-	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
-static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request *aer = arg;
@@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 		return;
 	}
 
-	/* Associated log page is in bits 23:16 of completion entry dw0. */
+	/*
+	 * Save the completion status; the id of the associated log page is
+	 * in bits 23:16 of completion entry dw0. Print a message and queue
+	 * it for further processing.
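+	 * The log page itself is fetched from the task, since that can
+	 * sleep and this completion callback cannot.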
+	 */
+	memcpy(&aer->cpl, cpl, sizeof(*cpl));
 	aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
 	    " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
 	    NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0), aer->log_page_id);
-
-	if (is_log_page_id_valid(aer->log_page_id)) {
-		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
-		    aer->log_page_id);
-		memcpy(&aer->cpl, cpl, sizeof(*cpl));
-		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
-		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
-		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
-		    aer);
-		/* Wait to notify consumers until after log page is fetched. */
-	} else {
-		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
-		    NULL, 0);
-
-		/*
-		 * Repost another asynchronous event request to replace the one
-		 * that just completed.
-		 */
-		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-	}
+	taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
 }
 
 static void
@@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 {
 	struct nvme_request *req;
 
-	aer->ctrlr = ctrlr;
 	/*
-	 * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
-	 * callback context. AER completions should be handled on a dedicated
-	 * thread.
+	 * We're racing the reset thread, so let that process submit this again.
+	 * XXX does this really solve that race? And is that race even possible,
+	 * since we only reset when we've not heard from the card in a long
+	 * time? Why would we get an AER in the middle of that just before we
+	 * kick off the reset?
 	 */
-	req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+	if (ctrlr->is_resetting)
+		return;
+
+	aer->ctrlr = ctrlr;
+	req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
 	    aer);
 	aer->req = req;
+	aer->log_page_id = 0;	/* Not a valid page */
 
 	/*
 	 * Disable timeout here, since asynchronous event requests should by
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
 
+static void
+nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl)
+{
+	struct nvme_async_event_request *aer = arg;
+
+	mtx_lock(&aer->mtx);
+	if (nvme_completion_is_error(cpl))
+		aer->log_page_size = (uint32_t)-1;
+	else
+		aer->log_page_size = nvme_ctrlr_get_log_page_size(
+		    aer->ctrlr, aer->log_page_id);
+	wakeup(aer);
+	mtx_unlock(&aer->mtx);
+}
+
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+	struct nvme_async_event_request *aer = arg;
+	struct nvme_controller *ctrlr = aer->ctrlr;
+	uint32_t len;
+
+	/*
+	 * We're resetting, so just punt.
+	 */
+	if (ctrlr->is_resetting)
+		return;
+
+	if (!is_log_page_id_valid(aer->log_page_id)) {
+		/*
+		 * Repost another asynchronous event request to replace the one
+		 * that just completed.
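+		 * (The repost itself happens at the "out" label below, once
+		 * the consumers have been notified.)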
+ */ + nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id, + NULL, 0); + nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); + goto out; + } + + aer->log_page_size = 0; + len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); + nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, + NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len, + nvme_ctrlr_aer_done, aer); + mtx_lock(&aer->mtx); + while (aer->log_page_size == 0) + mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0); + mtx_unlock(&aer->mtx); + + if (aer->log_page_size != (uint32_t)-1) { + /* + * If the log page fetch for some reason completed with an + * error, don't pass log page data to the consumers. In + * practice, this case should never happen. + */ + nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, + aer->log_page_id, NULL, 0); + goto out; + } + + /* Convert data to host endian */ + switch (aer->log_page_id) { + case NVME_LOG_ERROR: { + struct nvme_error_information_entry *err = + (struct nvme_error_information_entry *)aer->log_page_buffer; + for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) + nvme_error_information_entry_swapbytes(err++); + break; + } + case NVME_LOG_HEALTH_INFORMATION: + nvme_health_information_page_swapbytes( + (struct nvme_health_information_page *)aer->log_page_buffer); + break; + case NVME_LOG_CHANGED_NAMESPACE: + nvme_ns_list_swapbytes( + (struct nvme_ns_list *)aer->log_page_buffer); + break; + case NVME_LOG_COMMAND_EFFECT: + nvme_command_effects_page_swapbytes( + (struct nvme_command_effects_page *)aer->log_page_buffer); + break; + case NVME_LOG_RES_NOTIFICATION: + nvme_res_notification_page_swapbytes( + (struct nvme_res_notification_page *)aer->log_page_buffer); + break; + case NVME_LOG_SANITIZE_STATUS: + nvme_sanitize_status_page_swapbytes( + (struct nvme_sanitize_status_page *)aer->log_page_buffer); + break; + default: + break; + } + + if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { + struct nvme_health_information_page *health_info = + (struct nvme_health_information_page *)aer->log_page_buffer; + + /* + * Critical warnings reported through the SMART/health log page + * are persistent, so clear the associated bits in the async + * event config so that we do not receive repeated notifications + * for the same event. + */ + nvme_ctrlr_log_critical_warnings(aer->ctrlr, + health_info->critical_warning); + aer->ctrlr->async_event_config &= + ~health_info->critical_warning; + nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, + aer->ctrlr->async_event_config, NULL, NULL); + } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) { + struct nvme_ns_list *nsl = + (struct nvme_ns_list *)aer->log_page_buffer; + for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { + if (nsl->ns[i] > NVME_MAX_NAMESPACES) + break; + nvme_notify_ns(aer->ctrlr, nsl->ns[i]); + } + } + + /* + * Pass the cpl data from the original async event completion, not the + * log page fetch. + */ + nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, + aer->log_page_id, aer->log_page_buffer, aer->log_page_size); + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ +out: + nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); +} + /* * Poll all the queues enabled on the device for completion. */ @@ -1574,13 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) /* * Create 2 threads for the taskqueue. The reset thread will block when * it detects that the controller has failed until all I/O has been - * failed up the stack. 
The fail_req task needs to be able to run in - * this case to finish the request failure for some cases. - * - * We could partially solve this race by draining the failed requeust - * queue before proceding to free the sim, though nothing would stop - * new I/O from coming in after we do that drain, but before we reach - * cam_sim_free, so this big hammer is used instead. + * failed up the stack. The second thread is used for AER events, which + * can block, but only briefly for memory and log page fetching. */ ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); @@ -1590,7 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) ctrlr->is_initialized = false; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); - STAILQ_INIT(&ctrlr->fail_req); + for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) { + struct nvme_async_event_request *aer = &ctrlr->aer[i]; + + TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer); + mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF); + } ctrlr->is_failed = false; make_dev_args_init(&md_args); @@ -1678,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) } noadminq: - if (ctrlr->taskqueue) + if (ctrlr->taskqueue) { taskqueue_free(ctrlr->taskqueue); + for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) { + struct nvme_async_event_request *aer = &ctrlr->aer[i]; + + mtx_destroy(&aer->mtx); + } + } if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 949e69ec9290..36f00fedc48e 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -123,6 +123,8 @@ struct nvme_request { struct nvme_async_event_request { struct nvme_controller *ctrlr; struct nvme_request *req; + struct task task; + struct mtx mtx; struct nvme_completion cpl; uint32_t log_page_id; uint32_t log_page_size; @@ -307,8 +309,6 @@ struct nvme_controller { bool isr_warned; bool is_initialized; - STAILQ_HEAD(, nvme_request) fail_req; - /* Host Memory Buffer */ int hmb_nchunks; size_t hmb_chunk; diff --git a/sys/dev/nvmf/controller/nvmft_subr.c b/sys/dev/nvmf/controller/nvmft_subr.c index bb2bc0988e81..245971813854 100644 --- a/sys/dev/nvmf/controller/nvmft_subr.c +++ b/sys/dev/nvmf/controller/nvmft_subr.c @@ -26,46 +26,6 @@ nvmf_nqn_valid(const char *nqn) len = strnlen(nqn, NVME_NQN_FIELD_SIZE); if (len == 0 || len > NVMF_NQN_MAX_LEN) return (false); - -#ifdef STRICT_CHECKS - /* - * Stricter checks from the spec. Linux does not seem to - * require these. - */ - - /* - * NVMF_NQN_MIN_LEN does not include '.', and require at least - * one character of a domain name. - */ - if (len < NVMF_NQN_MIN_LEN + 2) - return (false); - if (memcmp("nqn.", nqn, strlen("nqn.")) != 0) - return (false); - nqn += strlen("nqn."); - - /* Next 4 digits must be a year. */ - for (u_int i = 0; i < 4; i++) { - if (!isdigit(nqn[i])) - return (false); - } - nqn += 4; - - /* '-' between year and month. */ - if (nqn[0] != '-') - return (false); - nqn++; - - /* 2 digit month. */ - for (u_int i = 0; i < 2; i++) { - if (!isdigit(nqn[i])) - return (false); - } - nqn += 2; - - /* '.' between month and reverse domain name. 
*/ - if (nqn[0] != '.') - return (false); -#endif return (true); } diff --git a/sys/dev/pci/pci_iov.c b/sys/dev/pci/pci_iov.c index 1f72391fb6b4..0efcfeac9eff 100644 --- a/sys/dev/pci/pci_iov.c +++ b/sys/dev/pci/pci_iov.c @@ -734,11 +734,18 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) first_rid = pci_get_rid(dev) + rid_off; last_rid = first_rid + (num_vfs - 1) * rid_stride; - /* We don't yet support allocating extra bus numbers for VFs. */ if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) { - device_printf(dev, "not enough PCIe bus numbers for VFs\n"); - error = ENOSPC; - goto out; + int rid = 0; + uint16_t last_rid_bus = PCI_RID2BUS(last_rid); + + iov->iov_bus_res = bus_alloc_resource(bus, PCI_RES_BUS, &rid, + last_rid_bus, last_rid_bus, 1, RF_ACTIVE); + if (iov->iov_bus_res == NULL) { + device_printf(dev, + "failed to allocate PCIe bus number for VFs\n"); + error = ENOSPC; + goto out; + } } if (!ari_enabled && PCI_RID2SLOT(last_rid) != 0) { @@ -786,6 +793,11 @@ out: } } + if (iov->iov_bus_res != NULL) { + bus_release_resource(bus, iov->iov_bus_res); + iov->iov_bus_res = NULL; + } + if (iov->iov_flags & IOV_RMAN_INITED) { rman_fini(&iov->rman); iov->iov_flags &= ~IOV_RMAN_INITED; @@ -896,6 +908,11 @@ pci_iov_delete_iov_children(struct pci_devinfo *dinfo) } } + if (iov->iov_bus_res != NULL) { + bus_release_resource(bus, iov->iov_bus_res); + iov->iov_bus_res = NULL; + } + if (iov->iov_flags & IOV_RMAN_INITED) { rman_fini(&iov->rman); iov->iov_flags &= ~IOV_RMAN_INITED; diff --git a/sys/dev/pci/pci_iov_private.h b/sys/dev/pci/pci_iov_private.h index 7ae2219b936d..ecf0a9b21be5 100644 --- a/sys/dev/pci/pci_iov_private.h +++ b/sys/dev/pci/pci_iov_private.h @@ -39,6 +39,8 @@ struct pcicfg_iov { struct cdev *iov_cdev; nvlist_t *iov_schema; + struct resource *iov_bus_res; + struct pci_iov_bar iov_bar[PCIR_MAX_BAR_0 + 1]; struct rman rman; char rman_name[64]; diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c index 9d23d5df1d2b..4ad190374f87 100644 --- a/sys/dev/qlnx/qlnxe/qlnx_os.c +++ b/sys/dev/qlnx/qlnxe/qlnx_os.c @@ -2308,8 +2308,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) else if (device_id == QLOGIC_PCI_DEVICE_ID_1644) if_setbaudrate(ifp, IF_Gbps(100)); - if_setcapabilities(ifp, IFCAP_LINKSTATE); - if_setinitfn(ifp, qlnx_init); if_setsoftc(ifp, ha); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); @@ -2343,7 +2341,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) if_setcapabilities(ifp, IFCAP_HWCSUM); if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0); - if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0); @@ -2352,6 +2349,8 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); if_setcapabilitiesbit(ifp, IFCAP_LRO, 0); + if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0); + if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0); if_sethwtsomax(ifp, QLNX_MAX_TSO_FRAME_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 819debadd1ac..9f2b009d02ec 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -30,7 +30,8 @@ #include <dev/vmm/vmm_mem.h> #include <dev/vmm/vmm_stat.h> -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 struct vm_memseg_12 { int segid; size_t len; @@ -42,7 +43,22 @@ _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI"); 
_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12) #define VM_GET_MEMSEG_12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12) -#endif +#endif /* COMPAT_FREEBSD12 */ +#ifdef COMPAT_FREEBSD14 +struct vm_memseg_14 { + int segid; + size_t len; + char name[VM_MAX_SUFFIXLEN + 1]; +}; +_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16), + "COMPAT_FREEBSD14 ABI"); + +#define VM_ALLOC_MEMSEG_14 \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14) +#define VM_GET_MEMSEG_14 \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14) +#endif /* COMPAT_FREEBSD14 */ +#endif /* __amd64__ */ struct devmem_softc { int segid; @@ -257,7 +273,8 @@ get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) } static int -alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len, + struct domainset *domainset) { char *name; int error; @@ -278,8 +295,7 @@ alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) if (error) goto done; } - - error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset); if (error) goto done; @@ -295,6 +311,20 @@ done: return (error); } +#if defined(__amd64__) && \ + (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12)) +/* + * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts. + */ +static void +adjust_segid(struct vm_memseg *mseg) +{ + if (mseg->segid != VM_SYSMEM) { + mseg->segid += (VM_BOOTROM - 1); + } +} +#endif + static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) @@ -353,10 +383,16 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STAT_DESC, 0), -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif +#ifdef COMPAT_FREEBSD14 + VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14, + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), +#endif +#endif /* __amd64__ */ VMMDEV_IOCTL(VM_ALLOC_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MMAP_MEMSEG, @@ -366,9 +402,14 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_REINIT, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#if defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif +#ifdef COMPAT_FREEBSD14 + VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS), +#endif +#endif /* __amd64__ */ VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS), @@ -388,6 +429,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vmmdev_softc *sc; struct vcpu *vcpu; const struct vmmdev_ioctl *ioctl; + struct vm_memseg *mseg; int error, vcpuid; sc = vmmdev_lookup2(cdev); @@ -499,20 +541,77 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; } -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_12: - error = alloc_memseg(sc, (struct vm_memseg *)data, - sizeof(((struct vm_memseg_12 *)0)->name)); + mseg = (struct vm_memseg *)data; + + 
adjust_segid(mseg); + error = alloc_memseg(sc, mseg, + sizeof(((struct vm_memseg_12 *)0)->name), NULL); break; case VM_GET_MEMSEG_12: - error = get_memseg(sc, (struct vm_memseg *)data, + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name)); break; -#endif - case VM_ALLOC_MEMSEG: - error = alloc_memseg(sc, (struct vm_memseg *)data, - sizeof(((struct vm_memseg *)0)->name)); +#endif /* COMPAT_FREEBSD12 */ +#ifdef COMPAT_FREEBSD14 + case VM_ALLOC_MEMSEG_14: + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = alloc_memseg(sc, mseg, + sizeof(((struct vm_memseg_14 *)0)->name), NULL); + break; + case VM_GET_MEMSEG_14: + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = get_memseg(sc, mseg, + sizeof(((struct vm_memseg_14 *)0)->name)); + break; +#endif /* COMPAT_FREEBSD14 */ +#endif /* __amd64__ */ + case VM_ALLOC_MEMSEG: { + domainset_t *mask; + struct domainset *domainset, domain; + + domainset = NULL; + mseg = (struct vm_memseg *)data; + if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) { + if (mseg->ds_mask_size < sizeof(domainset_t) || + mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + memset(&domain, 0, sizeof(domain)); + mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK); + error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size); + if (error) { + free(mask, M_VMMDEV); + break; + } + error = domainset_populate(&domain, mask, mseg->ds_policy, + mseg->ds_mask_size); + if (error) { + free(mask, M_VMMDEV); + break; + } + domainset = domainset_create(&domain); + if (domainset == NULL) { + error = EINVAL; + free(mask, M_VMMDEV); + break; + } + free(mask, M_VMMDEV); + } + error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); + break; + } case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); @@ -820,7 +919,6 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); - strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred); @@ -830,7 +928,7 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", - NULL); + "Destroy a vmm(4) instance (legacy interface)"); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", @@ -909,7 +1007,6 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); - strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_create(buf, req->td->td_ucred); @@ -919,7 +1016,7 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", - NULL); + "Create a vmm(4) instance (legacy interface)"); static int vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td) diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c index c61ae2d44b96..be59e37de33d 100644 --- a/sys/dev/vmm/vmm_mem.c +++ b/sys/dev/vmm/vmm_mem.c @@ -7,6 +7,7 @@ #include <sys/types.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/sx.h> #include <sys/systm.h> @@ -156,10 +157,11 @@ 
vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) } int -vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem, + struct domainset *obj_domainset) { - struct vm_mem *mem; struct vm_mem_seg *seg; + struct vm_mem *mem; vm_object_t obj; mem = vm_mem(vm); @@ -179,13 +181,22 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) return (EINVAL); } + /* + * When given an impossible policy, signal an + * error to the user. + */ + if (obj_domainset != NULL && domainset_empty_vm(obj_domainset)) + return (EINVAL); obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; + if (obj_domainset != NULL) + seg->object->domain.dr_policy = obj_domainset; seg->sysmem = sysmem; + return (0); } diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h index a4be4c1c57aa..856470cf2590 100644 --- a/sys/dev/vmm/vmm_mem.h +++ b/sys/dev/vmm/vmm_mem.h @@ -8,6 +8,27 @@ #ifndef _DEV_VMM_MEM_H_ #define _DEV_VMM_MEM_H_ +/* Maximum number of NUMA domains in a guest. */ +#define VM_MAXMEMDOM 8 +#define VM_MAXSYSMEM VM_MAXMEMDOM + +/* + * Identifiers for memory segments. + * Each guest NUMA domain is represented by a single system + * memory segment from [VM_SYSMEM, VM_MAXSYSMEM). + * The remaining identifiers can be used to create devmem segments. + */ +enum { + VM_SYSMEM = 0, + VM_BOOTROM = VM_MAXSYSMEM, + VM_FRAMEBUFFER, + VM_PCIROM, + VM_MEMSEG_END +}; + +#define VM_MAX_MEMSEGS VM_MEMSEG_END +#define VM_MAX_MEMMAPS (VM_MAX_MEMSEGS * 2) + #ifdef _KERNEL #include <sys/types.h> @@ -31,9 +52,6 @@ struct vm_mem_map { int flags; }; -#define VM_MAX_MEMSEGS 4 -#define VM_MAX_MEMMAPS 8 - struct vm_mem { struct vm_mem_map mem_maps[VM_MAX_MEMMAPS]; struct vm_mem_seg mem_segs[VM_MAX_MEMSEGS]; @@ -55,7 +73,8 @@ void vm_assert_memseg_xlocked(struct vm *vm); int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); -int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem, + struct domainset *obj_domainset); void vm_free_memseg(struct vm *vm, int ident); /* diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c index 64039575c0ad..675c0573bd7e 100644 --- a/sys/dev/vt/hw/vga/vt_vga.c +++ b/sys/dev/vt/hw/vga/vt_vga.c @@ -1347,7 +1347,7 @@ vga_postswitch(struct vt_device *vd) /* Reinit VGA mode, to restore view after app which change mode. */ vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE)); - /* Ask vt(9) to update chars on visible area. */ + /* Ask vt(4) to update chars on visible area. 
*/ vd->vd_flags |= VDF_INVALID; } diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index b0f58b38a6f1..b51ef6766de4 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -125,10 +125,10 @@ static const struct terminal_class vt_termclass = { (vw)->vw_number) static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "vt(9) parameters"); + "vt(4) parameters"); static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)"); static VT_SYSCTL_INT(enable_bell, 0, "Enable bell"); -static VT_SYSCTL_INT(debug, 0, "vt(9) debug level"); +static VT_SYSCTL_INT(debug, 0, "vt(4) debug level"); static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode"); static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend"); diff --git a/sys/fs/fuse/fuse_internal.h b/sys/fs/fuse/fuse_internal.h index cddf88095840..932012b5f52a 100644 --- a/sys/fs/fuse/fuse_internal.h +++ b/sys/fs/fuse/fuse_internal.h @@ -208,9 +208,9 @@ fuse_match_cred(struct ucred *basecred, struct ucred *usercred) if (basecred->cr_uid == usercred->cr_uid && basecred->cr_uid == usercred->cr_ruid && basecred->cr_uid == usercred->cr_svuid && - basecred->cr_groups[0] == usercred->cr_groups[0] && - basecred->cr_groups[0] == usercred->cr_rgid && - basecred->cr_groups[0] == usercred->cr_svgid) + basecred->cr_gid == usercred->cr_gid && + basecred->cr_gid == usercred->cr_rgid && + basecred->cr_gid == usercred->cr_svgid) return (0); return (EPERM); diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c index 0b6048644d32..a751c09159ff 100644 --- a/sys/fs/fuse/fuse_ipc.c +++ b/sys/fs/fuse/fuse_ipc.c @@ -868,7 +868,7 @@ fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick, ihead->pid = pid; ihead->uid = cred->cr_uid; - ihead->gid = cred->cr_groups[0]; + ihead->gid = cred->cr_gid; } /* diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index ae28617537fd..32872e8f3f3a 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -884,7 +884,7 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " "support different credentials for infd and outfd")); - if (incred->cr_groups[0] != outcred->cr_groups[0]) + if (incred->cr_gid != outcred->cr_gid) return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " "support different credentials for infd and outfd")); diff --git a/sys/fs/nfs/nfs_commonport.c b/sys/fs/nfs/nfs_commonport.c index 0c94f4e7dc52..222cfc03e4b3 100644 --- a/sys/fs/nfs/nfs_commonport.c +++ b/sys/fs/nfs/nfs_commonport.c @@ -379,7 +379,8 @@ newnfs_setroot(struct ucred *cred) { cred->cr_uid = 0; - cred->cr_groups[0] = 0; + cred->cr_gid = 0; + /* XXXKE Fix this if cr_gid gets separated out. */ cred->cr_ngroups = 1; } diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 2f3c59b68518..36b534be531e 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -6933,7 +6933,8 @@ nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) { tcred = NFSNEWCRED(cred); tcred->cr_uid = flp->nfsfl_ffm[mirror].user; - tcred->cr_groups[0] = flp->nfsfl_ffm[mirror].group; + tcred->cr_gid = flp->nfsfl_ffm[mirror].group; + /* XXXKE Fix this if cr_gid gets separated out. 
*/ tcred->cr_ngroups = 1; } else tcred = cred; diff --git a/sys/fs/pseudofs/pseudofs_vnops.c b/sys/fs/pseudofs/pseudofs_vnops.c index 0bdfedffafcb..8cd092118d0e 100644 --- a/sys/fs/pseudofs/pseudofs_vnops.c +++ b/sys/fs/pseudofs/pseudofs_vnops.c @@ -850,7 +850,7 @@ pfs_readdir(struct vop_readdir_args *va) struct uio *uio; struct pfsentry *pfsent, *pfsent2; struct pfsdirentlist lst; - off_t offset; + off_t coffset, offset; int error, i, resid; STAILQ_INIT(&lst); @@ -860,6 +860,9 @@ pfs_readdir(struct vop_readdir_args *va) PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid)); pfs_assert_not_owned(pd); + if (va->a_eofflag != NULL) + *va->a_eofflag = 0; + if (vn->v_type != VDIR) PFS_RETURN (ENOTDIR); KASSERT_PN_IS_DIR(pd); @@ -878,6 +881,10 @@ pfs_readdir(struct vop_readdir_args *va) if (pid != NO_PID && !pfs_lookup_proc(pid, &proc)) PFS_RETURN (ENOENT); + /* + * The allproc lock is required in pfs_iterate() for procdir + * directories. + */ sx_slock(&allproc_lock); pfs_lock(pd); @@ -897,23 +904,15 @@ pfs_readdir(struct vop_readdir_args *va) } } - /* skip unwanted entries */ - for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) { + for (pn = NULL, p = NULL, coffset = 0; resid >= PFS_DELEN; + coffset += PFS_DELEN) { if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) { - /* nothing left... */ - if (proc != NULL) { - _PRELE(proc); - PROC_UNLOCK(proc); - } - pfs_unlock(pd); - sx_sunlock(&allproc_lock); - PFS_RETURN (0); + if (va->a_eofflag != NULL) + *va->a_eofflag = 1; + break; } - } - - /* fill in entries */ - while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 && - resid >= PFS_DELEN) { + if (coffset < offset) + continue; if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV, M_NOWAIT | M_ZERO)) == NULL) { error = ENOMEM; diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c index 35454998fc8e..8c484381ed59 100644 --- a/sys/fs/smbfs/smbfs_io.c +++ b/sys/fs/smbfs/smbfs_io.c @@ -71,7 +71,7 @@ SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, " #define DE_SIZE (sizeof(struct dirent)) static int -smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) +smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred, int *eofp) { struct dirent de; struct componentname cn; @@ -86,6 +86,8 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) SMBVDEBUG("dirname='%s'\n", np->n_name); scred = smbfs_malloc_scred(); smb_makescred(scred, uio->uio_td, cred); + if (eofp != NULL) + *eofp = 0; offset = uio->uio_offset / DE_SIZE; /* offset in the directory */ limit = uio->uio_resid / DE_SIZE; if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0) { @@ -138,8 +140,7 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) if (error) { smbfs_findclose(np->n_dirseq, scred); np->n_dirseq = NULL; - error = ENOENT ? 
0 : error; - goto out; + goto out1; } } error = 0; @@ -170,16 +171,21 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) if (error) break; } - if (error == ENOENT) - error = 0; uio->uio_offset = offset * DE_SIZE; +out1: + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } out: smbfs_free_scred(scred); return error; } int -smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred) +smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, + int *eofp) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); @@ -209,7 +215,7 @@ smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred) lks = LK_EXCLUSIVE; /* lockstatus(vp->v_vnlock); */ if (lks == LK_SHARED) vn_lock(vp, LK_UPGRADE | LK_RETRY); - error = smbfs_readvdir(vp, uiop, cred); + error = smbfs_readvdir(vp, uiop, cred, eofp); if (lks == LK_SHARED) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); return error; diff --git a/sys/fs/smbfs/smbfs_node.h b/sys/fs/smbfs/smbfs_node.h index f28f0007100a..8c8ce038b913 100644 --- a/sys/fs/smbfs/smbfs_node.h +++ b/sys/fs/smbfs/smbfs_node.h @@ -93,7 +93,7 @@ u_int32_t smbfs_hash(const u_char *name, int nmlen); int smbfs_getpages(struct vop_getpages_args *); int smbfs_putpages(struct vop_putpages_args *); -int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred); +int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *eofp); int smbfs_writevnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag); void smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap); int smbfs_attr_cachelookup(struct vnode *vp ,struct vattr *va); diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c index 5d412cabadb8..63b249c93771 100644 --- a/sys/fs/smbfs/smbfs_vnops.c +++ b/sys/fs/smbfs/smbfs_vnops.c @@ -466,7 +466,7 @@ smbfs_read(struct vop_read_args *ap) SMBVDEBUG("\n"); if (vp->v_type != VREG && vp->v_type != VDIR) return EPERM; - return smbfs_readvnode(vp, uio, ap->a_cred); + return smbfs_readvnode(vp, uio, ap->a_cred, NULL); } static int @@ -748,7 +748,6 @@ smbfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; - int error; if (vp->v_type != VDIR) return (EPERM); @@ -758,8 +757,7 @@ smbfs_readdir(struct vop_readdir_args *ap) return (EOPNOTSUPP); } #endif - error = smbfs_readvnode(vp, uio, ap->a_cred); - return error; + return (smbfs_readvnode(vp, uio, ap->a_cred, ap->a_eofflag)); } /* ARGSUSED */ diff --git a/sys/geom/concat/g_concat.c b/sys/geom/concat/g_concat.c index 2b1cb575cac8..2173a84c7acf 100644 --- a/sys/geom/concat/g_concat.c +++ b/sys/geom/concat/g_concat.c @@ -590,6 +590,7 @@ g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no) strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name); + error = EINVAL; goto fail; } diff --git a/sys/geom/geom.h b/sys/geom/geom.h index dcd6f793f9f7..908ce86f03a6 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -282,7 +282,7 @@ void g_detach(struct g_consumer *cp); void g_error_provider(struct g_provider *pp, int error); struct g_provider *g_provider_by_name(char const *arg); int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len); -#define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v)) +#define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof(*(v))) int g_handleattr(struct bio *bp, const char *attribute, const void *val, 
int len); int g_handleattr_int(struct bio *bp, const char *attribute, int val); diff --git a/sys/geom/geom_ccd.c b/sys/geom/geom_ccd.c index 5700399ee5d1..2140d005160e 100644 --- a/sys/geom/geom_ccd.c +++ b/sys/geom/geom_ccd.c @@ -730,17 +730,17 @@ g_ccd_create(struct gctl_req *req, struct g_class *mp) int i, error; g_topology_assert(); - unit = gctl_get_paraml(req, "unit", sizeof (*unit)); + unit = gctl_get_paraml(req, "unit", sizeof(*unit)); if (unit == NULL) { gctl_error(req, "unit parameter not given"); return; } - ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave)); + ileave = gctl_get_paraml(req, "ileave", sizeof(*ileave)); if (ileave == NULL) { gctl_error(req, "ileave parameter not given"); return; } - nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider)); + nprovider = gctl_get_paraml(req, "nprovider", sizeof(*nprovider)); if (nprovider == NULL) { gctl_error(req, "nprovider parameter not given"); return; @@ -769,7 +769,7 @@ g_ccd_create(struct gctl_req *req, struct g_class *mp) } gp = g_new_geomf(mp, "ccd%d", *unit); - sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO); + sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); gp->softc = sc; sc->sc_ndisks = *nprovider; @@ -872,7 +872,7 @@ g_ccd_list(struct gctl_req *req, struct g_class *mp) struct g_geom *gp; int i, unit, *up; - up = gctl_get_paraml(req, "unit", sizeof (*up)); + up = gctl_get_paraml(req, "unit", sizeof(*up)); if (up == NULL) { gctl_error(req, "unit parameter not given"); return; diff --git a/sys/geom/geom_event.c b/sys/geom/geom_event.c index 0a76fd6c6f57..341233a6ef47 100644 --- a/sys/geom/geom_event.c +++ b/sys/geom/geom_event.c @@ -145,7 +145,7 @@ g_attr_changed(struct g_provider *pp, const char *attr, int flag) struct g_attrchanged_args *args; int error; - args = g_malloc(sizeof *args, flag); + args = g_malloc(sizeof(*args), flag); if (args == NULL) return (ENOMEM); args->pp = pp; diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index 8d6b9a926e1d..247a623bf1bf 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -278,7 +278,7 @@ g_io_init(void) g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); - biozone = uma_zcreate("g_bio", sizeof (struct bio), + biozone = uma_zcreate("g_bio", sizeof(struct bio), NULL, NULL, NULL, NULL, 0, 0); diff --git a/sys/geom/geom_slice.c b/sys/geom/geom_slice.c index 8cfffc478849..0491b0069be4 100644 --- a/sys/geom/geom_slice.c +++ b/sys/geom/geom_slice.c @@ -57,7 +57,7 @@ g_slice_alloc(unsigned nslice, unsigned scsize) { struct g_slicer *gsp; - gsp = g_malloc(sizeof *gsp, M_WAITOK | M_ZERO); + gsp = g_malloc(sizeof(*gsp), M_WAITOK | M_ZERO); if (scsize > 0) gsp->softc = g_malloc(scsize, M_WAITOK | M_ZERO); else @@ -463,9 +463,9 @@ g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int r } gsl = gsp->hotspot; if(idx >= gsp->nhotspot) { - gsl2 = g_malloc((idx + 1) * sizeof *gsl2, M_WAITOK | M_ZERO); + gsl2 = g_malloc((idx + 1) * sizeof(*gsl2), M_WAITOK | M_ZERO); if (gsp->hotspot != NULL) - bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof *gsl2); + bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof(*gsl2)); gsp->hotspot = gsl2; if (gsp->hotspot != NULL) g_free(gsl); diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c index 41cc115225f9..1429c84942ed 100644 --- a/sys/geom/geom_subr.c +++ b/sys/geom/geom_subr.c @@ -267,7 +267,7 @@ g_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); - hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); + hh = 
g_malloc(sizeof(*hh), M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be @@ -351,7 +351,7 @@ g_retaste(struct g_class *mp) if (mp->taste == NULL) return (EINVAL); - hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); + hh = g_malloc(sizeof(*hh), M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { @@ -381,8 +381,8 @@ g_new_geomf(struct g_class *mp, const char *fmt, ...) sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); - gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); - gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); + gp = g_malloc(sizeof(*gp) + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); + gp->name = (char *)(gp + 1); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); @@ -420,7 +420,6 @@ g_destroy_geom(struct g_geom *gp) g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); - g_free(gp->name); g_free(gp); } @@ -528,7 +527,7 @@ g_new_consumer(struct g_geom *gp) ("g_new_consumer on geom(%s) (class %s) without orphan", gp->name, gp->class->name)); - cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); + cp = g_malloc(sizeof(*cp), M_WAITOK | M_ZERO); cp->geom = gp; cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); @@ -617,7 +616,7 @@ g_new_providerf(struct g_geom *gp, const char *fmt, ...) sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); - pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); + pp = g_malloc(sizeof(*pp) + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); @@ -749,7 +748,7 @@ g_resize_provider(struct g_provider *pp, off_t size) if (size == pp->mediasize) return; - hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); + hh = g_malloc(sizeof(*hh), M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); @@ -1083,21 +1082,21 @@ int g_handleattr_int(struct bio *bp, const char *attribute, int val) { - return (g_handleattr(bp, attribute, &val, sizeof val)); + return (g_handleattr(bp, attribute, &val, sizeof(val))); } int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) { - return (g_handleattr(bp, attribute, &val, sizeof val)); + return (g_handleattr(bp, attribute, &val, sizeof(val))); } int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { - return (g_handleattr(bp, attribute, &val, sizeof val)); + return (g_handleattr(bp, attribute, &val, sizeof(val))); } int diff --git a/sys/geom/multipath/g_multipath.c b/sys/geom/multipath/g_multipath.c index 23088c895541..a4935df7eaa1 100644 --- a/sys/geom/multipath/g_multipath.c +++ b/sys/geom/multipath/g_multipath.c @@ -321,7 +321,7 @@ g_multipath_resize(struct g_consumer *cp) if (sc->sc_uuid[0] != 0) { pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); - memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); + memcpy(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid)); strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = size; @@ -552,8 +552,8 @@ g_multipath_create(struct g_class *mp, struct g_multipath_metadata *md) gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "multipath", NULL, MTX_DEF); - memcpy(sc->sc_uuid, md->md_uuid, sizeof (sc->sc_uuid)); - memcpy(sc->sc_name, md->md_name, sizeof (sc->sc_name)); + memcpy(sc->sc_uuid, md->md_uuid, sizeof(sc->sc_uuid)); + memcpy(sc->sc_name, md->md_name, sizeof(sc->sc_name)); 
sc->sc_active_active = md->md_active_active; sc->sc_size = md->md_size; gp->softc = sc; @@ -906,7 +906,7 @@ g_multipath_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) char buf[16]; u_long rand = random(); - snprintf(buf, sizeof (buf), "%s-%lu", md.md_name, rand); + snprintf(buf, sizeof(buf), "%s-%lu", md.md_name, rand); printf("GEOM_MULTIPATH: geom %s/%s exists already\n", sc->sc_name, sc->sc_uuid); printf("GEOM_MULTIPATH: %s will be (temporarily) %s\n", @@ -1200,7 +1200,7 @@ g_multipath_ctl_configure(struct gctl_req *req, struct g_class *mp) cp = sc->sc_active; pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); - memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); + memcpy(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid)); strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = pp->mediasize; diff --git a/sys/geom/virstor/g_virstor.c b/sys/geom/virstor/g_virstor.c index b8cf32875660..c7d737493f11 100644 --- a/sys/geom/virstor/g_virstor.c +++ b/sys/geom/virstor/g_virstor.c @@ -202,7 +202,7 @@ virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) int *force, *nargs; int i; - nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; @@ -211,7 +211,7 @@ virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) gctl_error(req, "Invalid number of arguments"); return; } - force = gctl_get_paraml(req, "force", sizeof *force); + force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; @@ -315,7 +315,7 @@ virstor_ctl_add(struct gctl_req *req, struct g_class *cp) u_int nc; u_int j; - snprintf(aname, sizeof aname, "arg%d", i); + snprintf(aname, sizeof(aname), "arg%d", i); pp = gctl_get_provider(req, aname); if (pp == NULL) { /* This is the most common error so be verbose about it */ @@ -487,12 +487,12 @@ fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, { struct g_virstor_component *c; - bzero(md, sizeof *md); + bzero(md, sizeof(*md)); c = &sc->components[nc]; - strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); + strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof(md->md_magic)); md->md_version = G_VIRSTOR_VERSION; - strncpy(md->md_name, sc->geom->name, sizeof md->md_name); + strncpy(md->md_name, sc->geom->name, sizeof(md->md_name)); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; @@ -500,7 +500,7 @@ fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, if (hardcode) { strncpy(md->provider, c->gcons->provider->name, - sizeof md->provider); + sizeof(md->provider)); } md->no = nc; md->provsize = c->gcons->provider->mediasize; @@ -589,7 +589,7 @@ virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, - found * sizeof(*sc->components)); + (sc->n_components - (found + 1)) * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", @@ -959,7 +959,7 @@ virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); - bzero(sc, sizeof *sc); + bzero(sc, sizeof(*sc)); free(sc, M_GVIRSTOR); 
pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ @@ -1213,7 +1213,7 @@ virstor_check_and_run(struct g_virstor_softc *sc) sc->provider->name, sc->chunk_count * (off_t)sc->chunk_size); } - sc->map_size = sc->chunk_count * sizeof *(sc->map); + sc->map_size = sc->chunk_count * sizeof(*(sc->map)); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", @@ -1267,7 +1267,7 @@ virstor_check_and_run(struct g_virstor_softc *sc) bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; - n += bs / sizeof *(sc->map); + n += bs / sizeof(*(sc->map)); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); @@ -1306,8 +1306,8 @@ virstor_check_and_run(struct g_virstor_softc *sc) sc->components[index].chunk_next); } - sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); - if (sc->sectorsize % sizeof *(sc->map) != 0) { + sc->me_per_sector = sc->sectorsize / sizeof(*(sc->map)); + if (sc->sectorsize % sizeof(*(sc->map)) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); @@ -1653,7 +1653,7 @@ g_virstor_start(struct bio *b) * XXX: this will prevent the fs from * being umounted! */ struct g_virstor_bio_q *biq; - biq = malloc(sizeof *biq, M_GVIRSTOR, + biq = malloc(sizeof(*biq), M_GVIRSTOR, M_NOWAIT); if (biq == NULL) { bioq_dismantle(&bq); @@ -1703,7 +1703,7 @@ g_virstor_start(struct bio *b) * map array. * sc_offset will end up pointing to the drive * sector. */ - s_offset = chunk_index * sizeof *me; + s_offset = chunk_index * sizeof(*me); s_offset = rounddown(s_offset, sc->sectorsize); /* data_me points to map entry sector diff --git a/sys/kern/coredump_vnode.c b/sys/kern/coredump_vnode.c new file mode 100644 index 000000000000..8b857e9aa4a2 --- /dev/null +++ b/sys/kern/coredump_vnode.c @@ -0,0 +1,562 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * - kern_sig.c + */ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * -kern_exec.c + */ + +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/compressor.h> +#include <sys/devctl.h> +#include <sys/fcntl.h> +#include <sys/jail.h> +#include <sys/limits.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/sbuf.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/ucoredump.h> +#include <sys/unistd.h> +#include <sys/vnode.h> + +#include <security/audit/audit.h> + +#define GZIP_SUFFIX ".gz" +#define ZSTD_SUFFIX ".zst" + +#define MAX_NUM_CORE_FILES 100000 +#ifndef NUM_CORE_FILES +#define NUM_CORE_FILES 5 +#endif + +static coredumper_handle_fn coredump_vnode; +static struct coredumper vnode_coredumper = { + .cd_name = "vnode_coredumper", + .cd_handle = coredump_vnode, +}; + +SYSINIT(vnode_coredumper_register, SI_SUB_EXEC, SI_ORDER_ANY, + coredumper_register, &vnode_coredumper); + +_Static_assert(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES, + "NUM_CORE_FILES is out of range (0 to " __STRING(MAX_NUM_CORE_FILES) ")"); +static int num_cores = NUM_CORE_FILES; + +static int capmode_coredump; +SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, + &capmode_coredump, 0, "Allow processes in capability mode to dump core"); + +static int set_core_nodump_flag = 0; +SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, + 0, "Enable setting the NODUMP flag on coredump files"); + +static int coredump_devctl = 0; +SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, + 0, "Generate a devctl notification when processes coredump"); + +/* + * corefilename[] is protected by the allproc_lock. + */ +static char corefilename[MAXPATHLEN] = { "%N.core" }; +TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); + +static int +sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) +{ + int error; + + sx_xlock(&allproc_lock); + error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), + req); + sx_xunlock(&allproc_lock); + + return (error); +} +SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | + CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", + "Process corefile name format string"); + +static int +sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) +{ + int error; + int new_val; + + new_val = num_cores; + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (new_val > MAX_NUM_CORE_FILES) + new_val = MAX_NUM_CORE_FILES; + if (new_val < 0) + new_val = 0; + num_cores = new_val; + return (0); +} +SYSCTL_PROC(_debug, OID_AUTO, ncores, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), + sysctl_debug_num_cores_check, "I", + "Maximum number of generated process corefiles while using index format"); + +static void +vnode_close_locked(struct thread *td, struct vnode *vp) +{ + + VOP_UNLOCK(vp); + vn_close(vp, FWRITE, td->td_ucred, td); +} + +int +core_vn_write(const struct coredump_writer *cdw, const void *base, size_t len, + off_t offset, enum uio_seg seg, struct ucred *cred, size_t *resid, + struct thread *td) +{ + struct coredump_vnode_ctx *ctx = cdw->ctx; + + return (vn_rdwr_inchunks(UIO_WRITE, ctx->vp, __DECONST(void *, base), + len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, + cred, ctx->fcred, resid, td)); +} + +int +core_vn_extend(const struct coredump_writer *cdw, off_t newsz, + struct ucred *cred) +{ + struct coredump_vnode_ctx *ctx = cdw->ctx; + struct mount *mp; + int error; + + error = vn_start_write(ctx->vp, &mp, V_WAIT); + 
if (error != 0)
+		return (error);
+	vn_lock(ctx->vp, LK_EXCLUSIVE | LK_RETRY);
+	error = vn_truncate_locked(ctx->vp, newsz, false, cred);
+	VOP_UNLOCK(ctx->vp);
+	vn_finished_write(mp);
+	return (error);
+}
+
+/*
+ * If the core format has a %I in it, then we need to check
+ * for existing corefiles before defining a name.
+ * To do this we iterate over 0..ncores to find a
+ * non-existing core file name to use. If all core files are
+ * already used we choose the oldest one.
+ */
+static int
+corefile_open_last(struct thread *td, char *name, int indexpos,
+    int indexlen, int ncores, struct vnode **vpp)
+{
+	struct vnode *oldvp, *nextvp, *vp;
+	struct vattr vattr;
+	struct nameidata nd;
+	int error, i, flags, oflags, cmode;
+	char ch;
+	struct timespec lasttime;
+
+	nextvp = oldvp = NULL;
+	cmode = S_IRUSR | S_IWUSR;
+	oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+	    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+	for (i = 0; i < ncores; i++) {
+		flags = O_CREAT | FWRITE | O_NOFOLLOW;
+
+		ch = name[indexpos + indexlen];
+		(void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
+		    i);
+		name[indexpos + indexlen] = ch;
+
+		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+		    NULL);
+		if (error != 0)
+			break;
+
+		vp = nd.ni_vp;
+		NDFREE_PNBUF(&nd);
+		if ((flags & O_CREAT) == O_CREAT) {
+			nextvp = vp;
+			break;
+		}
+
+		error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+		if (error != 0) {
+			vnode_close_locked(td, vp);
+			break;
+		}
+
+		if (oldvp == NULL ||
+		    lasttime.tv_sec > vattr.va_mtime.tv_sec ||
+		    (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
+		    lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
+			if (oldvp != NULL)
+				vn_close(oldvp, FWRITE, td->td_ucred, td);
+			oldvp = vp;
+			VOP_UNLOCK(oldvp);
+			lasttime = vattr.va_mtime;
+		} else {
+			vnode_close_locked(td, vp);
+		}
+	}
+
+	if (oldvp != NULL) {
+		if (nextvp == NULL) {
+			if ((td->td_proc->p_flag & P_SUGID) != 0) {
+				error = EFAULT;
+				vn_close(oldvp, FWRITE, td->td_ucred, td);
+			} else {
+				nextvp = oldvp;
+				error = vn_lock(nextvp, LK_EXCLUSIVE);
+				if (error != 0) {
+					vn_close(nextvp, FWRITE, td->td_ucred,
+					    td);
+					nextvp = NULL;
+				}
+			}
+		} else {
+			vn_close(oldvp, FWRITE, td->td_ucred, td);
+		}
+	}
+	if (error != 0) {
+		if (nextvp != NULL)
+			vnode_close_locked(td, oldvp);
+	} else {
+		*vpp = nextvp;
+	}
+
+	return (error);
+}
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, signum, vpp, namep)
+ * Expand the name described in corefilename, using name, uid, and pid
+ * and open/create core file.
+ * corefilename is a printf-like string, with these format specifiers:
+ *	%H	hostname of the system
+ *	%I	autoincrementing index
+ *	%N	name of process ("name")
+ *	%P	process id (pid)
+ *	%S	number of the signal that triggered the dump
+ *	%U	user id (uid)
+ * For example, "%N.core" is the default; they can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
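+ *
+ * As a purely illustrative example (values invented here), setting
+ *	kern.corefile=/var/coredumps/%U/%N.%P.core
+ * would expand, for pid 1234 of process "foo" running as uid 1001, to
+ *	/var/coredumps/1001/foo.1234.core
+ * with a ".gz" or ".zst" suffix appended whenever
+ * kern.compress_user_cores selects a compressor.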
+ */ +static int +corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, + int compress, int signum, struct vnode **vpp, char **namep) +{ + struct sbuf sb; + struct nameidata nd; + const char *format; + char *hostname, *name; + int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; + + hostname = NULL; + format = corefilename; + name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); + indexlen = 0; + indexpos = -1; + ncores = num_cores; + (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); + sx_slock(&allproc_lock); + for (i = 0; format[i] != '\0'; i++) { + switch (format[i]) { + case '%': /* Format character */ + i++; + switch (format[i]) { + case '%': + sbuf_putc(&sb, '%'); + break; + case 'H': /* hostname */ + if (hostname == NULL) { + hostname = malloc(MAXHOSTNAMELEN, + M_TEMP, M_WAITOK); + } + getcredhostname(td->td_ucred, hostname, + MAXHOSTNAMELEN); + sbuf_cat(&sb, hostname); + break; + case 'I': /* autoincrementing index */ + if (indexpos != -1) { + sbuf_printf(&sb, "%%I"); + break; + } + + indexpos = sbuf_len(&sb); + sbuf_printf(&sb, "%u", ncores - 1); + indexlen = sbuf_len(&sb) - indexpos; + break; + case 'N': /* process name */ + sbuf_printf(&sb, "%s", comm); + break; + case 'P': /* process id */ + sbuf_printf(&sb, "%u", pid); + break; + case 'S': /* signal number */ + sbuf_printf(&sb, "%i", signum); + break; + case 'U': /* user id */ + sbuf_printf(&sb, "%u", uid); + break; + default: + log(LOG_ERR, + "Unknown format character %c in " + "corename `%s'\n", format[i], format); + break; + } + break; + default: + sbuf_putc(&sb, format[i]); + break; + } + } + sx_sunlock(&allproc_lock); + free(hostname, M_TEMP); + if (compress == COMPRESS_GZIP) + sbuf_cat(&sb, GZIP_SUFFIX); + else if (compress == COMPRESS_ZSTD) + sbuf_cat(&sb, ZSTD_SUFFIX); + if (sbuf_error(&sb) != 0) { + log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " + "long\n", (long)pid, comm, (u_long)uid); + sbuf_delete(&sb); + free(name, M_TEMP); + return (ENOMEM); + } + sbuf_finish(&sb); + sbuf_delete(&sb); + + if (indexpos != -1) { + error = corefile_open_last(td, name, indexpos, indexlen, ncores, + vpp); + if (error != 0) { + log(LOG_ERR, + "pid %d (%s), uid (%u): Path `%s' failed " + "on initial open test, error = %d\n", + pid, comm, uid, name, error); + } + } else { + cmode = S_IRUSR | S_IWUSR; + oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | + (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); + flags = O_CREAT | FWRITE | O_NOFOLLOW; + if ((td->td_proc->p_flag & P_SUGID) != 0) + flags |= O_EXCL; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); + error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, + NULL); + if (error == 0) { + *vpp = nd.ni_vp; + NDFREE_PNBUF(&nd); + } + } + + if (error != 0) { +#ifdef AUDIT + audit_proc_coredump(td, name, error); +#endif + free(name, M_TEMP); + return (error); + } + *namep = name; + return (0); +} + +/* + * The vnode dumper is the traditional coredump handler. Our policy and limits + * are generally checked already, so it creates the coredump name and passes on + * a vnode and a size limit to the process-specific coredump routine if there is + * one. If there _is not_ one, it returns ENOSYS; otherwise it returns the + * error from the process-specific routine. 
+ */
+static int
+coredump_vnode(struct thread *td, off_t limit)
+{
+	struct proc *p = td->td_proc;
+	struct ucred *cred = td->td_ucred;
+	struct vnode *vp;
+	struct coredump_vnode_ctx wctx;
+	struct coredump_writer cdw = { };
+	struct flock lf;
+	struct vattr vattr;
+	size_t fullpathsize;
+	int error, error1, jid, locked, ppid, sig;
+	char *name;			/* name of corefile */
+	void *rl_cookie;
+	char *fullpath, *freepath = NULL;
+	struct sbuf *sb;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	ppid = p->p_oppid;
+	sig = p->p_sig;
+	jid = p->p_ucred->cr_prison->pr_id;
+	PROC_UNLOCK(p);
+
+	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
+	    compress_user_cores, sig, &vp, &name);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Don't dump to non-regular files or files with links.
+	 * Do not dump into system files. Effective user must own the corefile.
+	 */
+	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+	    vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
+	    vattr.va_uid != cred->cr_uid) {
+		VOP_UNLOCK(vp);
+		error = EFAULT;
+		goto out;
+	}
+
+	VOP_UNLOCK(vp);
+
+	/* Postpone other writers, including core dumps of other processes. */
+	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+
+	lf.l_whence = SEEK_SET;
+	lf.l_start = 0;
+	lf.l_len = 0;
+	lf.l_type = F_WRLCK;
+	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
+	VATTR_NULL(&vattr);
+	vattr.va_size = 0;
+	if (set_core_nodump_flag)
+		vattr.va_flags = UF_NODUMP;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	VOP_SETATTR(vp, &vattr, cred);
+	VOP_UNLOCK(vp);
+	PROC_LOCK(p);
+	p->p_acflag |= ACORE;
+	PROC_UNLOCK(p);
+
+	wctx.vp = vp;
+	wctx.fcred = NOCRED;
+
+	cdw.ctx = &wctx;
+	cdw.write_fn = core_vn_write;
+	cdw.extend_fn = core_vn_extend;
+
+	if (p->p_sysent->sv_coredump != NULL) {
+		error = p->p_sysent->sv_coredump(td, &cdw, limit, 0);
+	} else {
+		error = ENOSYS;
+	}
+
+	if (locked) {
+		lf.l_type = F_UNLCK;
+		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+	}
+	vn_rangelock_unlock(vp, rl_cookie);
+
+	/*
+	 * Notify the userland helper that a process triggered a core dump.
+	 * This allows the helper to run an automated debugging session.
+	 */
+	if (error != 0 || coredump_devctl == 0)
+		goto out;
+	sb = sbuf_new_auto();
+	if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
+		goto out2;
+	sbuf_cat(sb, "comm=\"");
+	devctl_safe_quote_sb(sb, fullpath);
+	free(freepath, M_TEMP);
+	sbuf_cat(sb, "\" core=\"");
+
+	/*
+	 * We can't look up the core file vp directly. When we're replacing a
+	 * core, and other random times, we flush the name cache, so it will
+	 * fail. Instead, if the path of the core is relative, add the current
+	 * dir in front of it.
+ */ + if (name[0] != '/') { + fullpathsize = MAXPATHLEN; + freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); + if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { + free(freepath, M_TEMP); + goto out2; + } + devctl_safe_quote_sb(sb, fullpath); + free(freepath, M_TEMP); + sbuf_putc(sb, '/'); + } + devctl_safe_quote_sb(sb, name); + sbuf_putc(sb, '"'); + + sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d", + jid, p->p_pid, ppid, sig); + if (sbuf_finish(sb) == 0) + devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); +out2: + sbuf_delete(sb); +out: + error1 = vn_close(vp, FWRITE, cred, td); + if (error == 0) + error = error1; +#ifdef AUDIT + audit_proc_coredump(td, name, error); +#endif + free(name, M_TEMP); + return (error); +} diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index b7ffbe68b483..2690ad3b2679 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -64,6 +64,7 @@ #include <sys/syscall.h> #include <sys/sysctl.h> #include <sys/sysent.h> +#include <sys/ucoredump.h> #include <sys/vnode.h> #include <sys/syslog.h> #include <sys/eventhandler.h> @@ -1562,9 +1563,6 @@ struct note_info { TAILQ_HEAD(note_info_list, note_info); -extern int compress_user_cores; -extern int compress_user_cores_level; - static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_dumpable_segment(struct thread *, segment_callback, void *, @@ -1595,7 +1593,7 @@ core_compressed_write(void *base, size_t len, off_t offset, void *arg) } int -__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) +__elfN(coredump)(struct thread *td, struct coredump_writer *cdw, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int compm, error = 0; @@ -1625,9 +1623,8 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) /* Set up core dump parameters. */ params.offset = 0; params.active_cred = cred; - params.file_cred = NOCRED; params.td = td; - params.vp = vp; + params.cdw = cdw; params.comp = NULL; #ifdef RACCT @@ -1662,6 +1659,12 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); } + if (cdw->init_fn != NULL) { + error = (*cdw->init_fn)(cdw, ¶ms); + if (error != 0) + goto done; + } + /* * Allocate memory for building the header, fill it up, * and write it out following the notes. diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index 5d9e2f2f326b..d7eb82d5f259 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -530,7 +530,7 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist) * remove them and update the domainset accordingly. If only empty * domains are present, we must return failure. 
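The coredump_writer hooks wired up above decouple the ELF dumper from its output: __elfN(coredump) only ever goes through cdw->write_fn, cdw->extend_fn, and the optional cdw->init_fn. As a minimal sketch of an alternative backend, assuming only the signatures visible in this change (all names below are invented), a writer that simply discards the dump might look like:

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/ucoredump.h>

static int
null_core_write(const struct coredump_writer *cdw, const void *base,
    size_t len, off_t offset, enum uio_seg seg, struct ucred *cred,
    size_t *resid, struct thread *td)
{
	/* Claim the whole range was written. */
	if (resid != NULL)
		*resid = 0;
	return (0);
}

static int
null_core_extend(const struct coredump_writer *cdw, off_t newsz,
    struct ucred *cred)
{
	/* Nothing backs the dump, so extending it costs nothing. */
	return (0);
}

static struct coredump_writer null_cdw = {
	.write_fn = null_core_write,
	.extend_fn = null_core_extend,
	/* .init_fn may stay NULL; __elfN(coredump) checks before calling. */
};

Such a writer would be handed to sv_coredump() exactly as coredump_vnode() passes its vnode-backed cdw above.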
*/ -static bool +bool domainset_empty_vm(struct domainset *domain) { domainset_t empty; @@ -2409,82 +2409,92 @@ sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) } int -kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, - id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, - const struct cpuset_copy_cb *cb) +domainset_populate(struct domainset *domain, const domainset_t *mask, int policy, + size_t mask_size) { - struct cpuset *nset; - struct cpuset *set; - struct thread *ttd; - struct proc *p; - struct domainset domain; - domainset_t *mask; - int error; - if (domainsetsize < sizeof(domainset_t) || - domainsetsize > DOMAINSET_MAXSIZE / NBBY) - return (ERANGE); if (policy <= DOMAINSET_POLICY_INVALID || - policy > DOMAINSET_POLICY_MAX) + policy > DOMAINSET_POLICY_MAX) { return (EINVAL); - error = cpuset_check_capabilities(td, level, which, id); - if (error != 0) - return (error); - memset(&domain, 0, sizeof(domain)); - mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); - error = cb->cpuset_copyin(maskp, mask, domainsetsize); - if (error) - goto out; + } + /* * Verify that no high bits are set. */ - if (domainsetsize > sizeof(domainset_t)) { - char *end; - char *cp; + if (mask_size > sizeof(domainset_t)) { + const char *end; + const char *cp; - end = cp = (char *)&mask->__bits; - end += domainsetsize; + end = cp = (const char *)&mask->__bits; + end += mask_size; cp += sizeof(domainset_t); - while (cp != end) + while (cp != end) { if (*cp++ != 0) { - error = EINVAL; - goto out; + return (EINVAL); } + } } if (DOMAINSET_EMPTY(mask)) { - error = EDEADLK; - goto out; + return (EDEADLK); } - DOMAINSET_COPY(mask, &domain.ds_mask); - domain.ds_policy = policy; + DOMAINSET_COPY(mask, &domain->ds_mask); + domain->ds_policy = policy; /* * Sanitize the provided mask. */ - if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) { - error = EINVAL; - goto out; + if (!DOMAINSET_SUBSET(&all_domains, &domain->ds_mask)) { + return (EINVAL); } /* Translate preferred policy into a mask and fallback. */ if (policy == DOMAINSET_POLICY_PREFER) { /* Only support a single preferred domain. */ - if (DOMAINSET_COUNT(&domain.ds_mask) != 1) { - error = EINVAL; - goto out; + if (DOMAINSET_COUNT(&domain->ds_mask) != 1) { + return (EINVAL); } - domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; + domain->ds_prefer = DOMAINSET_FFS(&domain->ds_mask) - 1; /* This will be constrained by domainset_shadow(). */ - DOMAINSET_COPY(&all_domains, &domain.ds_mask); + DOMAINSET_COPY(&all_domains, &domain->ds_mask); } + return (0); +} + +int +kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, + id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, + const struct cpuset_copy_cb *cb) +{ + struct cpuset *nset; + struct cpuset *set; + struct thread *ttd; + struct proc *p; + struct domainset domain; + domainset_t *mask; + int error; + + error = cpuset_check_capabilities(td, level, which, id); + if (error != 0) + return (error); + if (domainsetsize < sizeof(domainset_t) || + domainsetsize > DOMAINSET_MAXSIZE / NBBY) + return (ERANGE); + memset(&domain, 0, sizeof(domain)); + mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); + error = cb->cpuset_copyin(maskp, mask, domainsetsize); + if (error) + goto out; + error = domainset_populate(&domain, mask, policy, domainsetsize); + if (error) + goto out; + /* * When given an impossible policy, fall back to interleaving * across all domains. 
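For orientation, the checks factored into domainset_populate() above are what a userland cpuset_setdomain(2) request must satisfy; for instance, DOMAINSET_POLICY_PREFER is rejected unless exactly one domain is set in the mask. A sketch of a well-formed call through the documented wrapper (illustrative values only):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <err.h>

int
main(void)
{
	domainset_t mask;

	/* DOMAINSET_POLICY_PREFER requires exactly one domain in the mask. */
	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);
	if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
		err(1, "cpuset_setdomain");
	return (0);
}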
*/ if (domainset_empty_vm(&domain)) domainset_copy(domainset2, &domain); - switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 03268365891e..0fc2d0e7f1bc 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -70,6 +70,7 @@ #include <sys/sysent.h> #include <sys/sysproto.h> #include <sys/timers.h> +#include <sys/ucoredump.h> #include <sys/umtxvar.h> #include <sys/vnode.h> #include <sys/wait.h> @@ -2002,10 +2003,14 @@ int core_write(struct coredump_params *cp, const void *base, size_t len, off_t offset, enum uio_seg seg, size_t *resid) { + return ((*cp->cdw->write_fn)(cp->cdw, base, len, offset, seg, + cp->active_cred, resid, cp->td)); +} - return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base), - len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, - cp->active_cred, cp->file_cred, resid, cp->td)); +static int +core_extend(struct coredump_params *cp, off_t newsz) +{ + return ((*cp->cdw->extend_fn)(cp->cdw, newsz, cp->active_cred)); } int @@ -2013,7 +2018,6 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp, void *tmpbuf) { vm_map_t map; - struct mount *mp; size_t resid, runlen; int error; bool success; @@ -2068,14 +2072,7 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp, } } if (!success) { - error = vn_start_write(cp->vp, &mp, V_WAIT); - if (error != 0) - break; - vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY); - error = vn_truncate_locked(cp->vp, offset + runlen, - false, cp->td->td_ucred); - VOP_UNLOCK(cp->vp); - vn_finished_write(mp); + error = core_extend(cp, offset + runlen); if (error != 0) break; } diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index d4529e096929..7ef1d19f0ea8 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -3466,7 +3466,7 @@ prison_check_af(struct ucred *cred, int af) pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ - if (prison_owns_vnet(cred)) + if (prison_owns_vnet(pr)) return (0); #endif @@ -3531,7 +3531,7 @@ prison_if(struct ucred *cred, const struct sockaddr *sa) KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE - if (prison_owns_vnet(cred)) + if (prison_owns_vnet(cred->cr_prison)) return (0); #endif @@ -3648,7 +3648,7 @@ jailed_without_vnet(struct ucred *cred) if (!jailed(cred)) return (false); #ifdef VIMAGE - if (prison_owns_vnet(cred)) + if (prison_owns_vnet(cred->cr_prison)) return (false); #endif @@ -3711,20 +3711,17 @@ getjailname(struct ucred *cred, char *name, size_t len) #ifdef VIMAGE /* - * Determine whether the prison represented by cred owns - * its vnet rather than having it inherited. - * - * Returns true in case the prison owns the vnet, false otherwise. + * Determine whether the prison owns its VNET. */ bool -prison_owns_vnet(struct ucred *cred) +prison_owns_vnet(struct prison *pr) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. 
*/ - return ((cred->cr_prison->pr_flags & PR_VNET) != 0); + return ((pr->pr_flags & PR_VNET) != 0); } #endif @@ -4425,7 +4422,7 @@ sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; - havevnet = jailed(cred) && prison_owns_vnet(cred); + havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison); #else havevnet = 0; #endif diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index d9aeec68e620..0f0bc056cafd 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -287,7 +287,7 @@ sys_getgid(struct thread *td, struct getgid_args *uap) td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) - td->td_retval[1] = td->td_ucred->cr_groups[0]; + td->td_retval[1] = td->td_ucred->cr_gid; #endif return (0); } @@ -307,7 +307,7 @@ int sys_getegid(struct thread *td, struct getegid_args *uap) { - td->td_retval[0] = td->td_ucred->cr_groups[0]; + td->td_retval[0] = td->td_ucred->cr_gid; return (0); } @@ -1080,7 +1080,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap) gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ - gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ + gid != oldcred->cr_gid && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0) goto fail; @@ -1092,7 +1092,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap) */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ - gid == oldcred->cr_groups[0] || + gid == oldcred->cr_gid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0) @@ -1121,7 +1121,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap) * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. 
*/ - if (oldcred->cr_groups[0] != gid) { + if (oldcred->cr_gid != gid) { change_egid(newcred, gid); setsugid(p); } @@ -1167,7 +1167,7 @@ sys_setegid(struct thread *td, struct setegid_args *uap) (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0) goto fail; - if (oldcred->cr_groups[0] != egid) { + if (oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } @@ -1393,12 +1393,12 @@ sys_setregid(struct thread *td, struct setregid_args *uap) if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || - (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && + (egid != (gid_t)-1 && egid != oldcred->cr_gid && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0) goto fail; - if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + if (egid != (gid_t)-1 && oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } @@ -1406,9 +1406,9 @@ sys_setregid(struct thread *td, struct setregid_args *uap) change_rgid(newcred, rgid); setsugid(p); } - if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && - newcred->cr_svgid != newcred->cr_groups[0]) { - change_svgid(newcred, newcred->cr_groups[0]); + if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) && + newcred->cr_svgid != newcred->cr_gid) { + change_svgid(newcred, newcred->cr_gid); setsugid(p); } proc_set_cred(p, newcred); @@ -1547,17 +1547,17 @@ sys_setresgid(struct thread *td, struct setresgid_args *uap) if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && - rgid != oldcred->cr_groups[0]) || + rgid != oldcred->cr_gid) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && - egid != oldcred->cr_groups[0]) || + egid != oldcred->cr_gid) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && - sgid != oldcred->cr_groups[0])) && + sgid != oldcred->cr_gid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0) goto fail; - if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + if (egid != (gid_t)-1 && oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } @@ -1626,8 +1626,8 @@ sys_getresgid(struct thread *td, struct getresgid_args *uap) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) - error2 = copyout(&cred->cr_groups[0], - uap->egid, sizeof(cred->cr_groups[0])); + error2 = copyout(&cred->cr_gid, + uap->egid, sizeof(cred->cr_gid)); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); @@ -1737,7 +1737,7 @@ groupmember(gid_t gid, const struct ucred *cred) groups_check_positive_len(cred->cr_ngroups); - if (gid == cred->cr_groups[0]) + if (gid == cred->cr_gid) return (true); return (group_is_supplementary(gid, cred)); @@ -3015,7 +3015,7 @@ void change_egid(struct ucred *newcred, gid_t egid) { - newcred->cr_groups[0] = egid; + newcred->cr_gid = egid; } /*- diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index 35b258e68701..8438298afc0e 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -698,10 +698,13 @@ sendfile_wait_generic(struct socket *so, off_t need, int *space) */ error = 0; SOCK_SENDBUF_LOCK(so); - if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) - so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; - if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE) - so->so_snd.sb_lowat = PAGE_SIZE; + if (so->so_snd.sb_flags & SB_AUTOLOWAT) { + if (so->so_snd.sb_lowat < 
so->so_snd.sb_hiwat / 2) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; + if (so->so_snd.sb_lowat < PAGE_SIZE && + so->so_snd.sb_hiwat >= PAGE_SIZE) + so->so_snd.sb_lowat = PAGE_SIZE; + } retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 5d51aa675cb7..da0efac0598d 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -45,10 +45,10 @@ #include <sys/vnode.h> #include <sys/acct.h> #include <sys/capsicum.h> -#include <sys/compressor.h> #include <sys/condvar.h> #include <sys/devctl.h> #include <sys/event.h> +#include <sys/exec.h> #include <sys/fcntl.h> #include <sys/imgact.h> #include <sys/jail.h> @@ -80,6 +80,7 @@ #include <sys/syslog.h> #include <sys/sysproto.h> #include <sys/timers.h> +#include <sys/ucoredump.h> #include <sys/unistd.h> #include <sys/vmmeter.h> #include <sys/wait.h> @@ -101,7 +102,6 @@ SDT_PROBE_DEFINE2(proc, , , signal__clear, SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); -static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); @@ -126,11 +126,6 @@ const struct filterops sig_filtops = { .f_event = filt_signal, }; -static int kern_logsigexit = 1; -SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, - &kern_logsigexit, 0, - "Log processes quitting on abnormal signals to syslog(3)"); - static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); @@ -193,26 +188,6 @@ SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) -static int sugid_coredump; -SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, - &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); - -static int capmode_coredump; -SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, - &capmode_coredump, 0, "Allow processes in capability mode to dump core"); - -static int do_coredump = 1; -SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, - &do_coredump, 0, "Enable/Disable coredumps"); - -static int set_core_nodump_flag = 0; -SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, - 0, "Enable setting the NODUMP flag on coredump files"); - -static int coredump_devctl = 0; -SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, - 0, "Generate a devctl notification when processes coredump"); - /* * Signal properties and actions. 
* The array below categorizes the signals and their default actions @@ -784,6 +759,13 @@ sigprop(int sig) return (0); } +bool +sig_do_core(int sig) +{ + + return ((sigprop(sig) & SIGPROP_CORE) != 0); +} + static bool sigact_flag_test(const struct sigaction *act, int flag) { @@ -2665,6 +2647,8 @@ static void ptrace_coredumpreq(struct thread *td, struct proc *p, struct thr_coredump_req *tcq) { + struct coredump_vnode_ctx wctx; + struct coredump_writer cdw; void *rl_cookie; if (p->p_sysent->sv_coredump == NULL) { @@ -2672,8 +2656,15 @@ ptrace_coredumpreq(struct thread *td, struct proc *p, return; } + wctx.vp = tcq->tc_vp; + wctx.fcred = NOCRED; + + cdw.ctx = &wctx; + cdw.write_fn = core_vn_write; + cdw.extend_fn = core_vn_extend; + rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); - tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, + tcq->tc_error = p->p_sysent->sv_coredump(td, &cdw, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); } @@ -3635,82 +3626,6 @@ killproc(struct proc *p, const char *why) } /* - * Force the current process to exit with the specified signal, dumping core - * if appropriate. We bypass the normal tests for masked and caught signals, - * allowing unrecoverable failures to terminate the process without changing - * signal state. Mark the accounting record with the signal termination. - * If dumping core, save the signal number for the debugger. Calls exit and - * does not return. - */ -void -sigexit(struct thread *td, int sig) -{ - struct proc *p = td->td_proc; - const char *coreinfo; - int rv; - bool logexit; - - PROC_LOCK_ASSERT(p, MA_OWNED); - proc_set_p2_wexit(p); - - p->p_acflag |= AXSIG; - if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0) - logexit = kern_logsigexit != 0; - else - logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0; - - /* - * We must be single-threading to generate a core dump. This - * ensures that the registers in the core file are up-to-date. - * Also, the ELF dump handler assumes that the thread list doesn't - * change out from under it. - * - * XXX If another thread attempts to single-thread before us - * (e.g. via fork()), we won't get a dump at all. - */ - if ((sigprop(sig) & SIGPROP_CORE) && - thread_single(p, SINGLE_NO_EXIT) == 0) { - p->p_sig = sig; - /* - * Log signals which would cause core dumps - * (Log as LOG_INFO to appease those who don't want - * these messages.) - * XXX : Todo, as well as euid, write out ruid too - * Note that coredump() drops proc lock. - */ - rv = coredump(td); - switch (rv) { - case 0: - sig |= WCOREFLAG; - coreinfo = " (core dumped)"; - break; - case EFAULT: - coreinfo = " (no core dump - bad address)"; - break; - case EINVAL: - coreinfo = " (no core dump - invalid argument)"; - break; - case EFBIG: - coreinfo = " (no core dump - too large)"; - break; - default: - coreinfo = " (no core dump - other error)"; - break; - } - if (logexit) - log(LOG_INFO, - "pid %d (%s), jid %d, uid %d: exited on " - "signal %d%s\n", p->p_pid, p->p_comm, - p->p_ucred->cr_prison->pr_id, - td->td_ucred->cr_uid, - sig &~ WCOREFLAG, coreinfo); - } else - PROC_UNLOCK(p); - exit1(td, 0, sig); - /* NOTREACHED */ -} - -/* * Send queued SIGCHLD to parent when child process's state * is changed. 
*/ @@ -3803,477 +3718,6 @@ childproc_exited(struct proc *p) sigparent(p, reason, status); } -#define MAX_NUM_CORE_FILES 100000 -#ifndef NUM_CORE_FILES -#define NUM_CORE_FILES 5 -#endif -CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); -static int num_cores = NUM_CORE_FILES; - -static int -sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) -{ - int error; - int new_val; - - new_val = num_cores; - error = sysctl_handle_int(oidp, &new_val, 0, req); - if (error != 0 || req->newptr == NULL) - return (error); - if (new_val > MAX_NUM_CORE_FILES) - new_val = MAX_NUM_CORE_FILES; - if (new_val < 0) - new_val = 0; - num_cores = new_val; - return (0); -} -SYSCTL_PROC(_debug, OID_AUTO, ncores, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), - sysctl_debug_num_cores_check, "I", - "Maximum number of generated process corefiles while using index format"); - -#define GZIP_SUFFIX ".gz" -#define ZSTD_SUFFIX ".zst" - -int compress_user_cores = 0; - -static int -sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) -{ - int error, val; - - val = compress_user_cores; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error != 0 || req->newptr == NULL) - return (error); - if (val != 0 && !compressor_avail(val)) - return (EINVAL); - compress_user_cores = val; - return (error); -} -SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, - CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), - sysctl_compress_user_cores, "I", - "Enable compression of user corefiles (" - __XSTRING(COMPRESS_GZIP) " = gzip, " - __XSTRING(COMPRESS_ZSTD) " = zstd)"); - -int compress_user_cores_level = 6; -SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, - &compress_user_cores_level, 0, - "Corefile compression level"); - -/* - * Protect the access to corefilename[] by allproc_lock. - */ -#define corefilename_lock allproc_lock - -static char corefilename[MAXPATHLEN] = {"%N.core"}; -TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); - -static int -sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) -{ - int error; - - sx_xlock(&corefilename_lock); - error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), - req); - sx_xunlock(&corefilename_lock); - - return (error); -} -SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | - CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", - "Process corefile name format string"); - -static void -vnode_close_locked(struct thread *td, struct vnode *vp) -{ - - VOP_UNLOCK(vp); - vn_close(vp, FWRITE, td->td_ucred, td); -} - -/* - * If the core format has a %I in it, then we need to check - * for existing corefiles before defining a name. - * To do this we iterate over 0..ncores to find a - * non-existing core file name to use. If all core files are - * already used we choose the oldest one. - */ -static int -corefile_open_last(struct thread *td, char *name, int indexpos, - int indexlen, int ncores, struct vnode **vpp) -{ - struct vnode *oldvp, *nextvp, *vp; - struct vattr vattr; - struct nameidata nd; - int error, i, flags, oflags, cmode; - char ch; - struct timespec lasttime; - - nextvp = oldvp = NULL; - cmode = S_IRUSR | S_IWUSR; - oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | - (capmode_coredump ? 
VN_OPEN_NOCAPCHECK : 0); - - for (i = 0; i < ncores; i++) { - flags = O_CREAT | FWRITE | O_NOFOLLOW; - - ch = name[indexpos + indexlen]; - (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, - i); - name[indexpos + indexlen] = ch; - - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); - error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, - NULL); - if (error != 0) - break; - - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); - if ((flags & O_CREAT) == O_CREAT) { - nextvp = vp; - break; - } - - error = VOP_GETATTR(vp, &vattr, td->td_ucred); - if (error != 0) { - vnode_close_locked(td, vp); - break; - } - - if (oldvp == NULL || - lasttime.tv_sec > vattr.va_mtime.tv_sec || - (lasttime.tv_sec == vattr.va_mtime.tv_sec && - lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { - if (oldvp != NULL) - vn_close(oldvp, FWRITE, td->td_ucred, td); - oldvp = vp; - VOP_UNLOCK(oldvp); - lasttime = vattr.va_mtime; - } else { - vnode_close_locked(td, vp); - } - } - - if (oldvp != NULL) { - if (nextvp == NULL) { - if ((td->td_proc->p_flag & P_SUGID) != 0) { - error = EFAULT; - vn_close(oldvp, FWRITE, td->td_ucred, td); - } else { - nextvp = oldvp; - error = vn_lock(nextvp, LK_EXCLUSIVE); - if (error != 0) { - vn_close(nextvp, FWRITE, td->td_ucred, - td); - nextvp = NULL; - } - } - } else { - vn_close(oldvp, FWRITE, td->td_ucred, td); - } - } - if (error != 0) { - if (nextvp != NULL) - vnode_close_locked(td, oldvp); - } else { - *vpp = nextvp; - } - - return (error); -} - -/* - * corefile_open(comm, uid, pid, td, compress, vpp, namep) - * Expand the name described in corefilename, using name, uid, and pid - * and open/create core file. - * corefilename is a printf-like string, with three format specifiers: - * %N name of process ("name") - * %P process id (pid) - * %U user id (uid) - * For example, "%N.core" is the default; they can be disabled completely - * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". - * This is controlled by the sysctl variable kern.corefile (see above). 
- */ -static int -corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, - int compress, int signum, struct vnode **vpp, char **namep) -{ - struct sbuf sb; - struct nameidata nd; - const char *format; - char *hostname, *name; - int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; - - hostname = NULL; - format = corefilename; - name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); - indexlen = 0; - indexpos = -1; - ncores = num_cores; - (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); - sx_slock(&corefilename_lock); - for (i = 0; format[i] != '\0'; i++) { - switch (format[i]) { - case '%': /* Format character */ - i++; - switch (format[i]) { - case '%': - sbuf_putc(&sb, '%'); - break; - case 'H': /* hostname */ - if (hostname == NULL) { - hostname = malloc(MAXHOSTNAMELEN, - M_TEMP, M_WAITOK); - } - getcredhostname(td->td_ucred, hostname, - MAXHOSTNAMELEN); - sbuf_cat(&sb, hostname); - break; - case 'I': /* autoincrementing index */ - if (indexpos != -1) { - sbuf_printf(&sb, "%%I"); - break; - } - - indexpos = sbuf_len(&sb); - sbuf_printf(&sb, "%u", ncores - 1); - indexlen = sbuf_len(&sb) - indexpos; - break; - case 'N': /* process name */ - sbuf_printf(&sb, "%s", comm); - break; - case 'P': /* process id */ - sbuf_printf(&sb, "%u", pid); - break; - case 'S': /* signal number */ - sbuf_printf(&sb, "%i", signum); - break; - case 'U': /* user id */ - sbuf_printf(&sb, "%u", uid); - break; - default: - log(LOG_ERR, - "Unknown format character %c in " - "corename `%s'\n", format[i], format); - break; - } - break; - default: - sbuf_putc(&sb, format[i]); - break; - } - } - sx_sunlock(&corefilename_lock); - free(hostname, M_TEMP); - if (compress == COMPRESS_GZIP) - sbuf_cat(&sb, GZIP_SUFFIX); - else if (compress == COMPRESS_ZSTD) - sbuf_cat(&sb, ZSTD_SUFFIX); - if (sbuf_error(&sb) != 0) { - log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " - "long\n", (long)pid, comm, (u_long)uid); - sbuf_delete(&sb); - free(name, M_TEMP); - return (ENOMEM); - } - sbuf_finish(&sb); - sbuf_delete(&sb); - - if (indexpos != -1) { - error = corefile_open_last(td, name, indexpos, indexlen, ncores, - vpp); - if (error != 0) { - log(LOG_ERR, - "pid %d (%s), uid (%u): Path `%s' failed " - "on initial open test, error = %d\n", - pid, comm, uid, name, error); - } - } else { - cmode = S_IRUSR | S_IWUSR; - oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | - (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); - flags = O_CREAT | FWRITE | O_NOFOLLOW; - if ((td->td_proc->p_flag & P_SUGID) != 0) - flags |= O_EXCL; - - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); - error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, - NULL); - if (error == 0) { - *vpp = nd.ni_vp; - NDFREE_PNBUF(&nd); - } - } - - if (error != 0) { -#ifdef AUDIT - audit_proc_coredump(td, name, error); -#endif - free(name, M_TEMP); - return (error); - } - *namep = name; - return (0); -} - -/* - * Dump a process' core. The main routine does some - * policy checking, and creates the name of the coredump; - * then it passes on a vnode and a size limit to the process-specific - * coredump routine if there is one; if there _is not_ one, it returns - * ENOSYS; otherwise it returns the error from the process-specific routine. 
- */ - -static int -coredump(struct thread *td) -{ - struct proc *p = td->td_proc; - struct ucred *cred = td->td_ucred; - struct vnode *vp; - struct flock lf; - struct vattr vattr; - size_t fullpathsize; - int error, error1, jid, locked, ppid, sig; - char *name; /* name of corefile */ - void *rl_cookie; - off_t limit; - char *fullpath, *freepath = NULL; - struct sbuf *sb; - - PROC_LOCK_ASSERT(p, MA_OWNED); - MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); - - if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || - (p->p_flag2 & P2_NOTRACE) != 0) { - PROC_UNLOCK(p); - return (EFAULT); - } - - /* - * Note that the bulk of limit checking is done after - * the corefile is created. The exception is if the limit - * for corefiles is 0, in which case we don't bother - * creating the corefile at all. This layout means that - * a corefile is truncated instead of not being created, - * if it is larger than the limit. - */ - limit = (off_t)lim_cur(td, RLIMIT_CORE); - if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { - PROC_UNLOCK(p); - return (EFBIG); - } - - ppid = p->p_oppid; - sig = p->p_sig; - jid = p->p_ucred->cr_prison->pr_id; - PROC_UNLOCK(p); - - error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, - compress_user_cores, p->p_sig, &vp, &name); - if (error != 0) - return (error); - - /* - * Don't dump to non-regular files or files with links. - * Do not dump into system files. Effective user must own the corefile. - */ - if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || - vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || - vattr.va_uid != cred->cr_uid) { - VOP_UNLOCK(vp); - error = EFAULT; - goto out; - } - - VOP_UNLOCK(vp); - - /* Postpone other writers, including core dumps of other processes. */ - rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); - - lf.l_whence = SEEK_SET; - lf.l_start = 0; - lf.l_len = 0; - lf.l_type = F_WRLCK; - locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); - - VATTR_NULL(&vattr); - vattr.va_size = 0; - if (set_core_nodump_flag) - vattr.va_flags = UF_NODUMP; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - VOP_SETATTR(vp, &vattr, cred); - VOP_UNLOCK(vp); - PROC_LOCK(p); - p->p_acflag |= ACORE; - PROC_UNLOCK(p); - - if (p->p_sysent->sv_coredump != NULL) { - error = p->p_sysent->sv_coredump(td, vp, limit, 0); - } else { - error = ENOSYS; - } - - if (locked) { - lf.l_type = F_UNLCK; - VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); - } - vn_rangelock_unlock(vp, rl_cookie); - - /* - * Notify the userland helper that a process triggered a core dump. - * This allows the helper to run an automated debugging session. - */ - if (error != 0 || coredump_devctl == 0) - goto out; - sb = sbuf_new_auto(); - if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) - goto out2; - sbuf_cat(sb, "comm=\""); - devctl_safe_quote_sb(sb, fullpath); - free(freepath, M_TEMP); - sbuf_cat(sb, "\" core=\""); - - /* - * We can't lookup core file vp directly. When we're replacing a core, and - * other random times, we flush the name cache, so it will fail. Instead, - * if the path of the core is relative, add the current dir in front if it. 
- */ - if (name[0] != '/') { - fullpathsize = MAXPATHLEN; - freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); - if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { - free(freepath, M_TEMP); - goto out2; - } - devctl_safe_quote_sb(sb, fullpath); - free(freepath, M_TEMP); - sbuf_putc(sb, '/'); - } - devctl_safe_quote_sb(sb, name); - sbuf_putc(sb, '"'); - - sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d", - jid, p->p_pid, ppid, sig); - if (sbuf_finish(sb) == 0) - devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); -out2: - sbuf_delete(sb); -out: - error1 = vn_close(vp, FWRITE, cred, td); - if (error == 0) - error = error1; -#ifdef AUDIT - audit_proc_coredump(td, name, error); -#endif - free(name, M_TEMP); - return (error); -} - /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 46226cc31980..25da134661e9 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -2368,7 +2368,7 @@ sysctl_root(SYSCTL_HANDLER_ARGS) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && - prison_owns_vnet(req->td->td_ucred)) + prison_owns_vnet(req->td->td_ucred->cr_prison)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index f853af193016..50b040132396 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -571,7 +571,7 @@ threadinit(void) /* * Thread structures are specially aligned so that (at least) the - * 5 lower bits of a pointer to 'struct thead' must be 0. These bits + * 5 lower bits of a pointer to 'struct thread' must be 0. These bits * are used by synchronization primitives to store flags in pointers to * such structures. */ diff --git a/sys/kern/kern_ucoredump.c b/sys/kern/kern_ucoredump.c new file mode 100644 index 000000000000..d425596b5f24 --- /dev/null +++ b/sys/kern/kern_ucoredump.c @@ -0,0 +1,299 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/acct.h> +#include <sys/compressor.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/racct.h> +#include <sys/resourcevar.h> +#include <sys/rmlock.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/ucoredump.h> +#include <sys/wait.h> + +static int coredump(struct thread *td, const char **); + +int compress_user_cores = 0; + +static SLIST_HEAD(, coredumper) coredumpers = + SLIST_HEAD_INITIALIZER(coredumpers); +static struct rmlock coredump_rmlock; +RM_SYSINIT(coredump_lock, &coredump_rmlock, "coredump_lock"); + +static int kern_logsigexit = 1; +SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, + &kern_logsigexit, 0, + "Log processes quitting on abnormal signals to syslog(3)"); + +static int sugid_coredump; +SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, + &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); + +static int do_coredump = 1; +SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, + &do_coredump, 0, "Enable/Disable coredumps"); + +static int +sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = compress_user_cores; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val != 0 && !compressor_avail(val)) + return (EINVAL); + compress_user_cores = val; + return (error); +} +SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), + sysctl_compress_user_cores, "I", + "Enable compression of user corefiles (" + __XSTRING(COMPRESS_GZIP) " = gzip, " + __XSTRING(COMPRESS_ZSTD) " = zstd)"); + +int compress_user_cores_level = 6; +SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, + &compress_user_cores_level, 0, + "Corefile compression level"); + +void +coredumper_register(struct coredumper *cd) +{ + + blockcount_init(&cd->cd_refcount); + rm_wlock(&coredump_rmlock); + SLIST_INSERT_HEAD(&coredumpers, cd, cd_entry); + rm_wunlock(&coredump_rmlock); +} + +void +coredumper_unregister(struct coredumper *cd) +{ + + rm_wlock(&coredump_rmlock); + SLIST_REMOVE(&coredumpers, cd, coredumper, cd_entry); + rm_wunlock(&coredump_rmlock); + + /* + * Wait for any in-process coredumps to finish before returning. + */ + blockcount_wait(&cd->cd_refcount, NULL, "dumpwait", 0); +} + +/* + * Force the current process to exit with the specified signal, dumping core + * if appropriate. We bypass the normal tests for masked and caught signals, + * allowing unrecoverable failures to terminate the process without changing + * signal state. Mark the accounting record with the signal termination. + * If dumping core, save the signal number for the debugger. Calls exit and + * does not return. 
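coredumper_register() and coredumper_unregister() above are the whole backend-facing API. As a sketch of how an out-of-tree dumper might plug in (all names below are invented; the field layout and SYSINIT hook follow vnode_coredumper earlier in this change, and the probe contract follows the selection loop in coredump() below):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/ucoredump.h>

static int
example_probe(struct thread *td)
{
	/*
	 * A negative return declines this dumper; anything above
	 * COREDUMPER_GENERIC outbids probe-less dumpers such as
	 * vnode_coredumper.
	 */
	return (COREDUMPER_GENERIC + 1);
}

static int
example_handle(struct thread *td, off_t limit)
{
	/*
	 * Called with the proc lock held; drop it before returning,
	 * since coredump() asserts MA_NOTOWNED afterwards.  A real
	 * handler would write out the dump, honoring 'limit'.
	 */
	PROC_UNLOCK(td->td_proc);
	return (ENOSYS);
}

static struct coredumper example_coredumper = {
	.cd_name = "example_coredumper",
	.cd_probe = example_probe,
	.cd_handle = example_handle,
};

SYSINIT(example_coredumper_register, SI_SUB_EXEC, SI_ORDER_ANY,
    coredumper_register, &example_coredumper);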
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+	struct proc *p = td->td_proc;
+	int rv;
+	bool logexit;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	proc_set_p2_wexit(p);
+
+	p->p_acflag |= AXSIG;
+	if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
+		logexit = kern_logsigexit != 0;
+	else
+		logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
+
+	/*
+	 * We must be single-threading to generate a core dump. This
+	 * ensures that the registers in the core file are up-to-date.
+	 * Also, the ELF dump handler assumes that the thread list doesn't
+	 * change out from under it.
+	 *
+	 * XXX If another thread attempts to single-thread before us
+	 * (e.g. via fork()), we won't get a dump at all.
+	 */
+	if (sig_do_core(sig) && thread_single(p, SINGLE_NO_EXIT) == 0) {
+		const char *err = NULL;
+
+		p->p_sig = sig;
+		/*
+		 * Log signals which would cause core dumps
+		 * (Log as LOG_INFO to appease those who don't want
+		 * these messages.)
+		 * XXX : Todo, as well as euid, write out ruid too
+		 * Note that coredump() drops proc lock.
+		 */
+		rv = coredump(td, &err);
+		if (rv == 0) {
+			MPASS(err == NULL);
+			sig |= WCOREFLAG;
+		} else if (err == NULL) {
+			switch (rv) {
+			case EFAULT:
+				err = "bad address";
+				break;
+			case EINVAL:
+				err = "invalid argument";
+				break;
+			case EFBIG:
+				err = "too large";
+				break;
+			default:
+				err = "other error";
+				break;
+			}
+		}
+		if (logexit)
+			log(LOG_INFO,
+			    "pid %d (%s), jid %d, uid %d: exited on "
+			    "signal %d (%s%s)\n", p->p_pid, p->p_comm,
+			    p->p_ucred->cr_prison->pr_id,
+			    td->td_ucred->cr_uid, sig &~ WCOREFLAG,
+			    err != NULL ? "no core dump - " : "core dumped",
+			    err != NULL ? err : "");
+	} else
+		PROC_UNLOCK(p);
+	exit1(td, 0, sig);
+	/* NOTREACHED */
+}
+
+/*
+ * Dump a process' core. This routine does the generic policy and limit
+ * checking, then selects the highest-priority registered coredumper and
+ * passes the size limit on to its handler; whatever error the chosen
+ * handler returns is returned to the caller.
+ */
+static int
+coredump(struct thread *td, const char **errmsg)
+{
+	struct coredumper *iter, *chosen;
+	struct proc *p = td->td_proc;
+	struct rm_priotracker tracker;
+	off_t limit;
+	int error, priority;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+
+	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
+	    (p->p_flag2 & P2_NOTRACE) != 0) {
+		PROC_UNLOCK(p);
+
+		if (!do_coredump)
+			*errmsg = "denied by kern.coredump";
+		else if ((p->p_flag2 & P2_NOTRACE) != 0)
+			*errmsg = "process has trace disabled";
+		else
+			*errmsg = "sugid process denied by kern.sugid_coredump";
+		return (EFAULT);
+	}
+
+	/*
+	 * Note that the bulk of limit checking is done after
+	 * the corefile is created. The exception is if the limit
+	 * for corefiles is 0, in which case we don't bother
+	 * creating the corefile at all. This layout means that
+	 * a corefile is truncated instead of not being created,
+	 * if it is larger than the limit.
+	 */
+	limit = (off_t)lim_cur(td, RLIMIT_CORE);
+	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+		PROC_UNLOCK(p);
+		*errmsg = "coredumpsize limit is 0";
+		return (EFBIG);
+	}
+
+	rm_rlock(&coredump_rmlock, &tracker);
+	priority = -1;
+	chosen = NULL;
+	SLIST_FOREACH(iter, &coredumpers, cd_entry) {
+		if (iter->cd_probe == NULL) {
+			/*
+			 * If we haven't found anything of a higher priority
+			 * yet, we'll call this a GENERIC. Ideally, we want
+			 * coredumper modules to include a probe function.
+ */ + if (priority < 0) { + priority = COREDUMPER_GENERIC; + chosen = iter; + } + + continue; + } + + error = (*iter->cd_probe)(td); + if (error < 0) + continue; + + /* + * Higher priority than previous options. + */ + if (error > priority) { + priority = error; + chosen = iter; + } + } + + /* + * Acquire our refcount before we drop the lock so that + * coredumper_unregister() can safely assume that the refcount will only + * go down once it's dropped the rmlock. + */ + blockcount_acquire(&chosen->cd_refcount, 1); + rm_runlock(&coredump_rmlock, &tracker); + + /* Currently, we always have the vnode dumper built in. */ + MPASS(chosen != NULL); + error = ((*chosen->cd_handle)(td, limit)); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + + blockcount_release(&chosen->cd_refcount, 1); + + return (error); +} diff --git a/sys/kern/subr_compressor.c b/sys/kern/subr_compressor.c index 280264881241..5d59622e0455 100644 --- a/sys/kern/subr_compressor.c +++ b/sys/kern/subr_compressor.c @@ -538,6 +538,12 @@ compressor_init(compressor_cb_t cb, int format, size_t maxiosize, int level, return (s); } +int +compressor_format(const struct compressor *stream) +{ + return (stream->methods->format); +} + void compressor_reset(struct compressor *stream) { diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index b472aaea89e6..5606b36f772f 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -2269,6 +2269,7 @@ exterr_copyout(struct thread *td) ue.error = 0; sz = sizeof(ue.error); } else { + ktrexterr(td); sz = sizeof(ue) - __offsetof(struct uexterror, error); } error = copyout(&ue.error, uloc, sz); @@ -2335,7 +2336,6 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1, td->td_kexterr.p1 = pp1; td->td_kexterr.p2 = pp2; td->td_kexterr.src_line = line; - ktrexterr(td); } return (eerror); } diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index ce09042abdac..66ce1b5a081d 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1207,7 +1207,7 @@ sb_mark_notready(struct sockbuf *sb) for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); - KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", + KASSERT((m->m_flags & M_NOTREADY) == 0, ("%s: mbuf not ready", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 6f83b875a6b6..85fe48ddd466 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1134,10 +1134,10 @@ shm_doremove(struct shm_mapping *map) int kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, - int shmflags, struct filecaps *fcaps, const char *name __unused) + int shmflags, struct filecaps *fcaps, const char *name __unused, + struct shmfd *shmfd) { struct pwddesc *pdp; - struct shmfd *shmfd; struct file *fp; char *path; void *rl_cookie; @@ -1214,23 +1214,41 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, if (error != 0) goto outnofp; - /* A SHM_ANON path pointer creates an anonymous object. */ + /* + * A SHM_ANON path pointer creates an anonymous object. We allow other + * parts of the kernel to pre-populate a shmfd and then materialize an + * fd for it here as a means to pass data back up to userland. This + * doesn't really make sense for named shm objects, but it makes plenty + * of sense for anonymous objects. + */ if (userpath == SHM_ANON) { - /* A read-only anonymous object is pointless. 
*/ - if ((flags & O_ACCMODE) == O_RDONLY) { - error = EINVAL; - goto out; - } - shmfd = shm_alloc(td->td_ucred, cmode, largepage); - if (shmfd == NULL) { - error = ENOMEM; - goto out; + if (shmfd != NULL) { + shm_hold(shmfd); + } else { + /* + * A read-only anonymous object is pointless, unless it + * was pre-populated by the kernel with the expectation + * that a shmfd would later be created for userland to + * access it through. + */ + if ((flags & O_ACCMODE) == O_RDONLY) { + error = EINVAL; + goto out; + } + shmfd = shm_alloc(td->td_ucred, cmode, largepage); + if (shmfd == NULL) { + error = ENOMEM; + goto out; + } + + shmfd->shm_seals = initial_seals; + shmfd->shm_flags = shmflags; } - shmfd->shm_seals = initial_seals; - shmfd->shm_flags = shmflags; } else { fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); + + MPASS(shmfd == NULL); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ @@ -2173,7 +2191,7 @@ kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode, struct filecaps *caps) { - return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL)); + return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL, NULL)); } /* @@ -2191,7 +2209,7 @@ sys_shm_open2(struct thread *td, struct shm_open2_args *uap) { return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, - uap->shmflags, NULL, uap->name)); + uap->shmflags, NULL, uap->name, NULL)); } int diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index ec00878cd9a5..745702bd4a4f 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -195,14 +195,14 @@ int sbready(struct sockbuf *sb, struct mbuf *m0, int count) { struct mbuf *m; - u_int blocker; + bool blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); m = m0; - blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; + blocker = (sb->sb_fnrdy == m); while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, @@ -217,8 +217,7 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count) m->m_epg_nrdy = 0; } else count--; - - m->m_flags &= ~(M_NOTREADY | blocker); + m->m_flags &= ~M_NOTREADY; if (blocker) sb->sb_acc += m->m_len; m = m->m_next; @@ -240,12 +239,8 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count) } /* This one was blocking all the queue. */ - for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { - KASSERT(m->m_flags & M_BLOCKED, - ("%s: m %p !M_BLOCKED", __func__, m)); - m->m_flags &= ~M_BLOCKED; + for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) sb->sb_acc += m->m_len; - } sb->sb_fnrdy = m; sbready_compress(sb, m0, m); @@ -269,8 +264,7 @@ sballoc(struct sockbuf *sb, struct mbuf *m) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; - } else - m->m_flags |= M_BLOCKED; + } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; @@ -287,29 +281,29 @@ sballoc(struct sockbuf *sb, struct mbuf *m) void sbfree(struct sockbuf *sb, struct mbuf *m) { + struct mbuf *n; #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. 
*/ SOCKBUF_LOCK_ASSERT(sb); #endif - sb->sb_ccc -= m->m_len; - if (!(m->m_flags & M_NOTAVAIL)) - sb->sb_acc -= m->m_len; - if (m == sb->sb_fnrdy) { - struct mbuf *n; - KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { - n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; + } else { + /* Assert that mbuf is not behind sb_fnrdy. */ + for (n = sb->sb_fnrdy; n != NULL; n = n->m_next) + KASSERT(n != m, ("%s: sb %p freeing %p behind sb_fnrdy", + __func__, sb, m)); + sb->sb_acc -= m->m_len; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) @@ -779,6 +773,7 @@ sbsetopt(struct socket *so, struct sockopt *sopt) * high-water. */ *lowat = (cc > *hiwat) ? *hiwat : cc; + *flags &= ~SB_AUTOLOWAT; break; } @@ -1128,13 +1123,7 @@ sbcheck(struct sockbuf *sb, const char *file, int line) } fnrdy = m; } - if (fnrdy) { - if (!(m->m_flags & M_NOTAVAIL)) { - printf("sb %p: fnrdy %p, m %p is avail\n", - sb, sb->sb_fnrdy, m); - goto fail; - } - } else + if (fnrdy == NULL) acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; @@ -1601,8 +1590,8 @@ sbcut_internal(struct sockbuf *sb, int len) next = m->m_nextpkt; } if (m->m_len > len) { - KASSERT(!(m->m_flags & M_NOTAVAIL), - ("%s: m %p M_NOTAVAIL", __func__, m)); + KASSERT(!(m->m_flags & M_NOTREADY), + ("%s: m %p M_NOTREADY", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 6c9eb7139cd1..fe2d8d056062 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1211,7 +1211,8 @@ solisten_clone(struct socket *head) so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; - so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; + so->so_snd.sb_flags = head->sol_sbsnd_flags & + (SB_AUTOSIZE | SB_AUTOLOWAT); if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; @@ -2988,8 +2989,8 @@ dontblock: */ moff = 0; offset = 0; - while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 - && error == 0) { + while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 && + error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. 
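+		 *
+		 * Note (illustrative summary of this change): with M_BLOCKED
+		 * gone, the socket buffer keeps a single invariant instead of
+		 * per-mbuf blocked flags: sb_fnrdy points at the first
+		 * M_NOTREADY mbuf, everything before it is ready and counted
+		 * in sb_acc, and everything at or after it stays uncounted
+		 * even once its own M_NOTREADY has been cleared, e.g.:
+		 *
+		 *	sb_mb -> [rdy] -> [rdy] -> [NOTREADY] -> [rdy]
+		 *	                           ^ sb_fnrdy
+		 *
+		 * Stopping this loop at the first M_NOTREADY mbuf is thus
+		 * equivalent to the old M_NOTAVAIL (M_NOTREADY | M_BLOCKED)
+		 * test.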
@@ -3341,7 +3342,7 @@ deliver: for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { - KASSERT(!(m->m_flags & M_NOTAVAIL), + KASSERT(!(m->m_flags & M_NOTREADY), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; @@ -4514,6 +4515,9 @@ sokqfilter_generic(struct socket *so, struct knote *kn) SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; + if ((kn->kn_sfflags & NOTE_LOWAT) && + (sb->sb_flags & SB_AUTOLOWAT)) + sb->sb_flags &= ~SB_AUTOLOWAT; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 918b256e6c59..29774cf87393 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -6533,17 +6533,6 @@ vop_read_pgcache_post(void *ap, int rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } -void -vop_readdir_post(void *ap, int rc) -{ - struct vop_readdir_args *a = ap; - - if (!rc) { - VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); - INOTIFY(a->a_vp, IN_ACCESS); - } -} - static struct knlist fs_knlist; static void diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index c71e0d9ee569..25d40a9806cb 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -2253,10 +2253,10 @@ kern_accessat(struct thread *td, int fd, const char *path, cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || - cred->cr_rgid != cred->cr_groups[0]))) { + cred->cr_rgid != cred->cr_gid))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; - usecred->cr_groups[0] = cred->cr_rgid; + usecred->cr_gid = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 38138a4af921..2e63215b2f97 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -242,8 +242,8 @@ vop_read_pgcache { %% write vp L L L -%! write pre VOP_WRITE_PRE -%! write post VOP_WRITE_POST +%! write pre vop_write_pre +%! write post vop_write_post vop_write { IN struct vnode *vp; @@ -380,6 +380,7 @@ vop_symlink { %% readdir vp L L L +%! readdir pre vop_readdir_pre %! readdir post vop_readdir_post vop_readdir { diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 7cb6e2124326..99c9ec9dcd01 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -34,6 +34,7 @@ SUBDIR= \ alq \ ${_amd_ecc_inject} \ ${_amdgpio} \ + ${_amdsmu} \ ${_amdsbwd} \ ${_amdsmn} \ ${_amdtemp} \ @@ -772,6 +773,7 @@ _acpi= acpi _aesni= aesni .endif _amd_ecc_inject=amd_ecc_inject +_amdsmu= amdsmu _amdsbwd= amdsbwd _amdsmn= amdsmn _amdtemp= amdtemp diff --git a/sys/modules/amdsmu/Makefile b/sys/modules/amdsmu/Makefile new file mode 100644 index 000000000000..752f57173d61 --- /dev/null +++ b/sys/modules/amdsmu/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2025 The FreeBSD Foundation +# +# This software was developed by Aymeric Wibo <obiwac@freebsd.org> +# under sponsorship from the FreeBSD Foundation. 
+ +.PATH: ${SRCTOP}/sys/dev/amdsmu + +KMOD= amdsmu +SRCS= amdsmu.c +SRCS+= bus_if.h device_if.h pci_if.h + +.include <bsd.kmod.mk> diff --git a/sys/modules/efirt/Makefile b/sys/modules/efirt/Makefile index 4738996fd4e6..c46484465b68 100644 --- a/sys/modules/efirt/Makefile +++ b/sys/modules/efirt/Makefile @@ -9,7 +9,7 @@ SRCS+= device_if.h bus_if.h clock_if.h DPSRCS+= assym.inc .if ${MACHINE_CPUARCH} == "amd64" -SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h +SRCS+= opt_acpi.h opt_hwpmc_hooks.h opt_kstack_pages.h .endif efirt_support.o: efirt_support.S assym.inc diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 5b3ee740d75e..0a35fb4095fb 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -76,31 +76,34 @@ * heterogeneous bridges). */ -#include <sys/cdefs.h> #include "opt_inet.h" #include "opt_inet6.h" +#define EXTERR_CATEGORY EXTERR_CAT_BRIDGE + #include <sys/param.h> +#include <sys/ctype.h> /* string functions */ #include <sys/eventhandler.h> -#include <sys/mbuf.h> +#include <sys/exterrvar.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/lock.h> #include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/proc.h> #include <sys/protosw.h> +#include <sys/random.h> #include <sys/systm.h> -#include <sys/jail.h> -#include <sys/time.h> #include <sys/socket.h> /* for net/if.h */ #include <sys/sockio.h> -#include <sys/ctype.h> /* string functions */ -#include <sys/kernel.h> -#include <sys/random.h> #include <sys/syslog.h> #include <sys/sysctl.h> +#include <sys/time.h> + #include <vm/uma.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/lock.h> -#include <sys/mutex.h> #include <net/bpf.h> #include <net/if.h> @@ -254,8 +257,8 @@ struct bridge_iflist { uint32_t bif_addrcnt; /* cur. 
# of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ struct epoch_context bif_epoch_ctx; - ether_vlanid_t bif_untagged; /* untagged vlan id */ - ifbvlan_set_t bif_vlan_set; /* allowed tagged vlans */ + ether_vlanid_t bif_pvid; /* port vlan id */ + ifbvlan_set_t bif_vlan_set; /* if allowed tagged vlans */ }; /* @@ -404,7 +407,7 @@ static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); -static int bridge_ioctl_sifuntagged(struct bridge_softc *, void *); +static int bridge_ioctl_sifpvid(struct bridge_softc *, void *); static int bridge_ioctl_sifvlanset(struct bridge_softc *, void *); static int bridge_ioctl_gifvlanset(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); @@ -625,7 +628,7 @@ static const struct bridge_control bridge_control_table[] = { { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_sifuntagged, sizeof(struct ifbreq), + { bridge_ioctl_sifpvid, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifvlanset, sizeof(struct ifbif_vlan_req), @@ -986,31 +989,37 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCGDRVSPEC: case SIOCSDRVSPEC: if (ifd->ifd_cmd >= bridge_control_table_size) { - error = EINVAL; + error = EXTERROR(EINVAL, "Invalid control command"); break; } bc = &bridge_control_table[ifd->ifd_cmd]; if (cmd == SIOCGDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) == 0) { - error = EINVAL; + error = EXTERROR(EINVAL, + "Inappropriate ioctl for command " + "(expected SIOCSDRVSPEC)"); break; } else if (cmd == SIOCSDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) != 0) { - error = EINVAL; + error = EXTERROR(EINVAL, + "Inappropriate ioctl for command " + "(expected SIOCGDRVSPEC)"); break; } if (bc->bc_flags & BC_F_SUSER) { error = priv_check(td, PRIV_NET_BRIDGE); - if (error) + if (error) { + EXTERROR(error, "PRIV_NET_BRIDGE required"); break; + } } if (ifd->ifd_len != bc->bc_argsize || ifd->ifd_len > sizeof(args)) { - error = EINVAL; + error = EXTERROR(EINVAL, "Invalid argument size"); break; } @@ -1062,7 +1071,8 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) oldmtu = sc->sc_ifp->if_mtu; if (ifr->ifr_mtu < IF_MINMTU) { - error = EINVAL; + error = EXTERROR(EINVAL, + "Requested MTU is lower than IF_MINMTU"); break; } if (CK_LIST_EMPTY(&sc->sc_iflist)) { @@ -1088,6 +1098,8 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) (*bif->bif_ifp->if_ioctl)(bif->bif_ifp, SIOCSIFMTU, (caddr_t)ifr); } + EXTERROR(error, + "Failed to set MTU on member interface"); } else { sc->sc_ifp->if_mtu = ifr->ifr_mtu; } @@ -1125,14 +1137,14 @@ bridge_mutecaps(struct bridge_softc *sc) mask = BRIDGE_IFCAPS_MASK; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { - /* Every member must support it or its disabled */ + /* Every member must support it or it's disabled */ mask &= bif->bif_savedcaps; } CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { enabled = bif->bif_ifp->if_capenable; enabled &= ~BRIDGE_IFCAPS_STRIP; - /* strip off mask bits and enable them again if allowed */ + /* Strip off mask bits and enable them again if allowed */ enabled &= ~BRIDGE_IFCAPS_MASK; enabled |= mask; bridge_set_ifcap(sc, bif, enabled); @@ -1282,7 +1294,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, #endif break; } - /* reneable any interface capabilities 
*/ + /* Re-enable any interface capabilities */ bridge_set_ifcap(sc, bif, bif->bif_savedcaps); } bstp_destroy(&bif->bif_stp); /* prepare to free */ @@ -1318,21 +1330,48 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "No such interface", + req->ifbr_ifsname)); if (ifs->if_ioctl == NULL) /* must be supported */ - return (EINVAL); + return (EXTERROR(EINVAL, "Interface must support ioctl(2)")); + + /* + * If the new interface is a vlan(4), it could be a bridge SVI. + * Don't allow such things to be added to bridges. + */ + if (ifs->if_type == IFT_L2VLAN) { + struct ifnet *parent; + struct epoch_tracker et; + bool is_bridge; + + /* + * Entering NET_EPOCH with BRIDGE_LOCK held, but this is okay + * since we don't sleep here. + */ + NET_EPOCH_ENTER(et); + parent = VLAN_TRUNKDEV(ifs); + is_bridge = (parent != NULL && parent->if_type == IFT_BRIDGE); + NET_EPOCH_EXIT(et); + + if (is_bridge) + return (EXTERROR(EINVAL, + "Bridge SVI cannot be added to a bridge")); + } /* If it's in the span list, it can't be a member. */ CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) - return (EBUSY); + return (EXTERROR(EBUSY, + "Span interface cannot be a member")); if (ifs->if_bridge) { struct bridge_iflist *sbif = ifs->if_bridge; if (sbif->bif_sc == sc) - return (EEXIST); + return (EXTERROR(EEXIST, + "Interface is already a member of this bridge")); - return (EBUSY); + return (EXTERROR(EBUSY, + "Interface is already a member of another bridge")); } switch (ifs->if_type) { @@ -1342,7 +1381,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) /* permitted interface types */ break; default: - return (EINVAL); + return (EXTERROR(EINVAL, "Unsupported interface type")); } #ifdef INET6 @@ -1394,11 +1433,15 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) CK_STAILQ_FOREACH(ifa, &ifs->if_addrhead, ifa_link) { #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) - return (EINVAL); + return (EXTERROR(EINVAL, + "Member interface may not have " + "an IPv4 address configured")); #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) - return (EINVAL); + return (EXTERROR(EINVAL, + "Member interface may not have " + "an IPv6 address configured")); #endif } } @@ -1420,7 +1463,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) " new member %s\n", sc->sc_ifp->if_xname, ifr.ifr_mtu, ifs->if_xname); - return (EINVAL); + return (EXTERROR(EINVAL, + "Failed to set MTU on new member")); } } @@ -1482,7 +1526,7 @@ bridge_ioctl_del(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); bridge_delete_member(sc, bif, 0); @@ -1498,7 +1542,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); bp = &bif->bif_stp; req->ifbr_ifsflags = bif->bif_flags; @@ -1512,7 +1556,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; - req->ifbr_untagged = bif->bif_untagged; + req->ifbr_pvid = bif->bif_pvid; /* Copy STP state options as flags */ if (bp->bp_operedge) @@ -1541,12 +1585,12 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, 
req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ - return (EINVAL); + return (EXTERROR(EINVAL, "Span interface cannot be modified")); NET_EPOCH_ENTER(et); @@ -1555,7 +1599,8 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) error = bstp_enable(&bif->bif_stp); if (error) { NET_EPOCH_EXIT(et); - return (error); + return (EXTERROR(error, + "Failed to enable STP")); } } } else { @@ -1724,7 +1769,7 @@ bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) { NET_EPOCH_EXIT(et); - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); } /* bridge_rtupdate() may acquire the lock. */ @@ -1858,7 +1903,7 @@ bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); } @@ -1871,7 +1916,7 @@ bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); } @@ -1884,28 +1929,28 @@ bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); bif->bif_addrmax = req->ifbr_addrmax; return (0); } static int -bridge_ioctl_sifuntagged(struct bridge_softc *sc, void *arg) +bridge_ioctl_sifpvid(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); - if (req->ifbr_untagged > DOT1Q_VID_MAX) - return (EINVAL); + if (req->ifbr_pvid > DOT1Q_VID_MAX) + return (EXTERROR(EINVAL, "Invalid VLAN ID")); - if (req->ifbr_untagged != DOT1Q_VID_NULL) + if (req->ifbr_pvid != DOT1Q_VID_NULL) bif->bif_flags |= IFBIF_VLANFILTER; - bif->bif_untagged = req->ifbr_untagged; + bif->bif_pvid = req->ifbr_pvid; return (0); } @@ -1917,12 +1962,12 @@ bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->bv_ifname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); /* Reject invalid VIDs. 
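+	 * (Assumed values, for context: DOT1Q_VID_NULL is 0 and
+	 * DOT1Q_VID_RSVD_IMPL is 4095, the two VIDs that IEEE 802.1Q
+	 * reserves, so neither may appear in a member set.)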
*/ if (BRVLAN_TEST(&req->bv_set, DOT1Q_VID_NULL) || BRVLAN_TEST(&req->bv_set, DOT1Q_VID_RSVD_IMPL)) - return (EINVAL); + return (EXTERROR(EINVAL, "Invalid VLAN ID in set")); switch (req->bv_op) { /* Replace the existing vlan set with the new set */ @@ -1942,7 +1987,8 @@ bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg) /* Invalid or unknown operation */ default: - return (EINVAL); + return (EXTERROR(EINVAL, + "Unsupported BRDGSIFVLANSET operation")); } /* @@ -1962,7 +2008,7 @@ bridge_ioctl_gifvlanset(struct bridge_softc *sc, void *arg) bif = bridge_lookup_member(sc, req->bv_ifname); if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a bridge member")); BIT_COPY(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set); return (0); @@ -1977,14 +2023,16 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "No such interface")); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) - return (EBUSY); + return (EXTERROR(EBUSY, + "Interface is already a span port")); if (ifs->if_bridge != NULL) - return (EBUSY); + return (EXTERROR(EEXIST, + "Interface is already a bridge member")); switch (ifs->if_type) { case IFT_ETHER: @@ -1992,7 +2040,7 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) case IFT_L2VLAN: break; default: - return (EINVAL); + return (EXTERROR(EINVAL, "Unsupported interface type")); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); @@ -2016,14 +2064,14 @@ bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "No such interface")); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) break; if (bif == NULL) - return (ENOENT); + return (EXTERROR(ENOENT, "Interface is not a span port")); bridge_delete_span(sc, bif); @@ -2278,8 +2326,8 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m, * the VLAN header. */ if ((bif->bif_flags & IFBIF_VLANFILTER) && - bif->bif_untagged != DOT1Q_VID_NULL && - VLANTAGOF(m) == bif->bif_untagged) { + bif->bif_pvid != DOT1Q_VID_NULL && + VLANTAGOF(m) == bif->bif_pvid) { m->m_flags &= ~M_VLANTAG; m->m_pkthdr.ether_vtag = 0; } @@ -3145,14 +3193,14 @@ bridge_vfilter_in(const struct bridge_iflist *sbif, struct mbuf *m) * The frame doesn't have a tag. If the interface does not * have an untagged vlan configured, drop the frame. */ - if (sbif->bif_untagged == DOT1Q_VID_NULL) + if (sbif->bif_pvid == DOT1Q_VID_NULL) return (false); /* * Otherwise, insert a new tag based on the interface's * untagged vlan id. */ - m->m_pkthdr.ether_vtag = sbif->bif_untagged; + m->m_pkthdr.ether_vtag = sbif->bif_pvid; m->m_flags |= M_VLANTAG; } else { /* @@ -3213,7 +3261,7 @@ bridge_vfilter_out(const struct bridge_iflist *dbif, const struct mbuf *m) * If the frame's vlan matches the interfaces's untagged vlan, * allow it. */ - if (vlan == dbif->bif_untagged) + if (vlan == dbif->bif_pvid) return (true); /* @@ -3244,10 +3292,11 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc); /* Check the source address is valid and not multicast. 
*/ - if (ETHER_IS_MULTICAST(dst) || - (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && - dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) - return (EINVAL); + if (ETHER_IS_MULTICAST(dst)) + return (EXTERROR(EINVAL, "Multicast address not permitted")); + if (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && + dst[3] == 0 && dst[4] == 0 && dst[5] == 0) + return (EXTERROR(EINVAL, "Zero address not permitted")); /* * A route for this destination might already exist. If so, @@ -3266,13 +3315,14 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, if (sc->sc_brtcnt >= sc->sc_brtmax) { sc->sc_brtexceeded++; BRIDGE_RT_UNLOCK(sc); - return (ENOSPC); + return (EXTERROR(ENOSPC, "Address table is full")); } /* Check per interface address limits (if enabled) */ if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) { bif->bif_addrexceeded++; BRIDGE_RT_UNLOCK(sc); - return (ENOSPC); + return (EXTERROR(ENOSPC, + "Interface address limit exceeded")); } /* @@ -3283,7 +3333,8 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) { BRIDGE_RT_UNLOCK(sc); - return (ENOMEM); + return (EXTERROR(ENOMEM, + "Cannot allocate address node")); } brt->brt_vnet = curvnet; @@ -3631,7 +3682,7 @@ bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) do { dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) - return (EEXIST); + return (EXTERROR(EEXIST, "Address already exists")); if (dir > 0) { CK_LIST_INSERT_BEFORE(lbrt, brt, brt_hash); goto out; diff --git a/sys/net/if_bridgevar.h b/sys/net/if_bridgevar.h index 97b63e3d4416..c458dcc152a0 100644 --- a/sys/net/if_bridgevar.h +++ b/sys/net/if_bridgevar.h @@ -124,7 +124,7 @@ #define BRDGSPROTO 28 /* set protocol (ifbrparam) */ #define BRDGSTXHC 29 /* set tx hold count (ifbrparam) */ #define BRDGSIFAMAX 30 /* set max interface addrs (ifbreq) */ -#define BRDGSIFUNTAGGED 31 /* set if untagged vlan */ +#define BRDGSIFPVID 31 /* set if PVID */ #define BRDGSIFVLANSET 32 /* set if vlan set */ #define BRDGGIFVLANSET 33 /* get if vlan set */ @@ -144,7 +144,7 @@ struct ifbreq { uint32_t ifbr_addrcnt; /* member if addr number */ uint32_t ifbr_addrmax; /* member if addr max */ uint32_t ifbr_addrexceeded; /* member if addr violations */ - ether_vlanid_t ifbr_untagged; /* member if untagged vlan */ + ether_vlanid_t ifbr_pvid; /* member if PVID */ uint8_t pad[32]; }; diff --git a/sys/net/if_ovpn.c b/sys/net/if_ovpn.c index 7bdbc565f4ca..fe3e7bbd7fff 100644 --- a/sys/net/if_ovpn.c +++ b/sys/net/if_ovpn.c @@ -34,11 +34,13 @@ #include <sys/epoch.h> #include <sys/file.h> #include <sys/filedesc.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> #include <sys/nv.h> +#include <sys/osd.h> #include <sys/priv.h> #include <sys/protosw.h> #include <sys/rmlock.h> @@ -79,7 +81,6 @@ #include "if_ovpn.h" struct ovpn_kkey_dir { - int refcount; uint8_t key[32]; uint8_t keylen; uint8_t nonce[8]; @@ -132,6 +133,9 @@ struct ovpn_notification { /* Delete notification */ enum ovpn_del_reason del_reason; struct ovpn_peer_counters counters; + + /* Float notification */ + struct sockaddr_storage address; }; struct ovpn_softc; @@ -196,6 +200,10 @@ struct ovpn_softc { struct epoch_context epoch_ctx; }; +struct ovpn_mtag { + struct sockaddr_storage addr; +}; + static struct ovpn_kpeer *ovpn_find_peer(struct ovpn_softc *, uint32_t); static bool ovpn_udp_input(struct mbuf *, int, 
struct inpcb *, const struct sockaddr *, void *); @@ -205,7 +213,10 @@ static int ovpn_encap(struct ovpn_softc *, uint32_t, struct mbuf *); static int ovpn_get_af(struct mbuf *); static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *); static bool ovpn_check_replay(struct ovpn_kkey_dir *, uint32_t); -static int ovpn_peer_compare(struct ovpn_kpeer *, struct ovpn_kpeer *); +static int ovpn_peer_compare(const struct ovpn_kpeer *, + const struct ovpn_kpeer *); +static bool ovpn_sockaddr_compare(const struct sockaddr *, + const struct sockaddr *); static RB_PROTOTYPE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare); static RB_GENERATE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare); @@ -278,11 +289,48 @@ SYSCTL_INT(_net_link_openvpn, OID_AUTO, netisr_queue, "Use netisr_queue() rather than netisr_dispatch()."); static int -ovpn_peer_compare(struct ovpn_kpeer *a, struct ovpn_kpeer *b) +ovpn_peer_compare(const struct ovpn_kpeer *a, const struct ovpn_kpeer *b) { return (a->peerid - b->peerid); } +static bool +ovpn_sockaddr_compare(const struct sockaddr *a, + const struct sockaddr *b) +{ + if (a->sa_family != b->sa_family) + return (false); + MPASS(a->sa_len == b->sa_len); + + switch (a->sa_family) { + case AF_INET: { + const struct sockaddr_in *a4, *b4; + + a4 = (const struct sockaddr_in *)a; + b4 = (const struct sockaddr_in *)b; + + if (a4->sin_port != b4->sin_port) + return (false); + + return (a4->sin_addr.s_addr == b4->sin_addr.s_addr); + } + case AF_INET6: { + const struct sockaddr_in6 *a6, *b6; + + a6 = (const struct sockaddr_in6 *)a; + b6 = (const struct sockaddr_in6 *)b; + + if (a6->sin6_port != b6->sin6_port) + return (false); + + return (memcmp(&a6->sin6_addr, &b6->sin6_addr, + sizeof(a6->sin6_addr)) == 0); + } + default: + panic("Unknown address family %d", a->sa_family); + } +} + static struct ovpn_kpeer * ovpn_find_peer(struct ovpn_softc *sc, uint32_t peerid) { @@ -304,15 +352,15 @@ ovpn_find_only_peer(struct ovpn_softc *sc) } static uint16_t -ovpn_get_port(struct sockaddr_storage *s) +ovpn_get_port(const struct sockaddr_storage *s) { switch (s->ss_family) { case AF_INET: { - struct sockaddr_in *in = (struct sockaddr_in *)s; + const struct sockaddr_in *in = (const struct sockaddr_in *)s; return (in->sin_port); } case AF_INET6: { - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s; + const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)s; return (in6->sin6_port); } default: @@ -320,6 +368,25 @@ ovpn_get_port(struct sockaddr_storage *s) } } +static void +ovpn_set_port(struct sockaddr_storage *s, unsigned short port) +{ + switch (s->ss_family) { + case AF_INET: { + struct sockaddr_in *in = (struct sockaddr_in *)s; + in->sin_port = port; + break; + } + case AF_INET6: { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s; + in6->sin6_port = port; + break; + } + default: + panic("Unsupported address family %d", s->ss_family); + } +} + static int ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa) { @@ -333,14 +400,16 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa) return (EINVAL); af = nvlist_get_number(nvl, "af"); - switch (af) { #ifdef INET case AF_INET: { struct sockaddr_in *in = (struct sockaddr_in *)sa; size_t len; const void *addr = nvlist_get_binary(nvl, "address", &len); + + memset(in, 0, sizeof(*in)); in->sin_family = af; + in->sin_len = sizeof(*in); if (len != sizeof(in->sin_addr)) return (EINVAL); @@ -354,7 +423,10 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa) struct sockaddr_in6 *in6 
= (struct sockaddr_in6 *)sa; size_t len; const void *addr = nvlist_get_binary(nvl, "address", &len); + + memset(in6, 0, sizeof(*in6)); in6->sin6_family = af; + in6->sin6_len = sizeof(*in6); if (len != sizeof(in6->sin6_addr)) return (EINVAL); @@ -370,31 +442,42 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa) return (0); } -static bool -ovpn_has_peers(struct ovpn_softc *sc) +static int +ovpn_add_sockaddr(nvlist_t *parent, const char *name, const struct sockaddr *s) { - OVPN_ASSERT(sc); - - return (sc->peercount > 0); -} + nvlist_t *nvl; -static void -ovpn_rele_so(struct ovpn_softc *sc) -{ - bool has_peers; + nvl = nvlist_create(0); + if (nvl == NULL) + return (ENOMEM); - OVPN_WASSERT(sc); + nvlist_add_number(nvl, "af", s->sa_family); - if (sc->so == NULL) - return; + switch (s->sa_family) { + case AF_INET: { + const struct sockaddr_in *s4 = (const struct sockaddr_in *)s; - has_peers = ovpn_has_peers(sc); + nvlist_add_number(nvl, "port", s4->sin_port); + nvlist_add_binary(nvl, "address", &s4->sin_addr, + sizeof(s4->sin_addr)); + break; + } + case AF_INET6: { + const struct sockaddr_in6 *s6 = (const struct sockaddr_in6 *)s; - if (! has_peers) { - MPASS(sc->peercount == 0); - } else { - MPASS(sc->peercount > 0); + nvlist_add_number(nvl, "port", s6->sin6_port); + nvlist_add_binary(nvl, "address", &s6->sin6_addr, + sizeof(s6->sin6_addr)); + break; + } + default: + nvlist_destroy(nvl); + return (EINVAL); } + + nvlist_move_nvlist(parent, name, nvl); + + return (0); } static void @@ -449,6 +532,33 @@ ovpn_notify_key_rotation(struct ovpn_softc *sc, struct ovpn_kpeer *peer) } } +static int +ovpn_notify_float(struct ovpn_softc *sc, uint32_t peerid, + const struct sockaddr_storage *remote) +{ + struct ovpn_notification *n; + + n = malloc(sizeof(*n), M_OVPN, M_NOWAIT | M_ZERO); + if (n == NULL) + return (ENOMEM); + + n->peerid = peerid; + n->type = OVPN_NOTIF_FLOAT; + memcpy(&n->address, remote, sizeof(n->address)); + + if (buf_ring_enqueue(sc->notifring, n) != 0) { + free(n, M_OVPN); + return (ENOMEM); + } else if (sc->so != NULL) { + /* Wake up userspace */ + sc->so->so_error = EAGAIN; + sorwakeup(sc->so); + sowwakeup(sc->so); + } + + return (0); +} + static void ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked) { @@ -485,8 +595,6 @@ ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked) ovpn_free_kkey_dir(peer->keys[i].decrypt); } - ovpn_rele_so(sc); - callout_stop(&peer->ping_send); callout_stop(&peer->ping_rcv); uma_zfree_pcpu(pcpu_zone_4, peer->last_active); @@ -502,7 +610,7 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl) #ifdef INET6 struct epoch_tracker et; #endif - struct sockaddr_storage remote; + struct sockaddr_storage local, remote; struct ovpn_kpeer *peer = NULL; struct file *fp = NULL; struct ovpn_softc *sc = ifp->if_softc; @@ -571,20 +679,37 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl) callout_init_rm(&peer->ping_send, &sc->lock, CALLOUT_SHAREDLOCK); callout_init_rm(&peer->ping_rcv, &sc->lock, 0); - peer->local.ss_len = sizeof(peer->local); - ret = sosockaddr(so, (struct sockaddr *)&peer->local); - if (ret) + memset(&local, 0, sizeof(local)); + local.ss_len = sizeof(local); + ret = sosockaddr(so, (struct sockaddr *)&local); + if (ret != 0) goto error; + if (nvlist_exists_nvlist(nvl, "local")) { + struct sockaddr_storage local1; + + ret = ovpn_nvlist_to_sockaddr(nvlist_get_nvlist(nvl, "local"), + &local1); + if (ret != 0) + goto error; - if (ovpn_get_port(&peer->local) == 0) { + /* + * openvpn doesn't provide a port here when 
in multihome mode,
+		 * just steal the one the socket is bound to.
+		 */
+		if (ovpn_get_port(&local1) == 0)
+			ovpn_set_port(&local1, ovpn_get_port(&local));
+		memcpy(&local, &local1, sizeof(local1));
+	}
+	if (ovpn_get_port(&local) == 0) {
 		ret = EINVAL;
 		goto error;
 	}
-	if (peer->local.ss_family != remote.ss_family) {
+	if (local.ss_family != remote.ss_family) {
 		ret = EINVAL;
 		goto error;
 	}
+	memcpy(&peer->local, &local, sizeof(local));
 	memcpy(&peer->remote, &remote, sizeof(remote));
 
 #ifdef INET6
@@ -633,6 +758,7 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
 	 * a new one.
 	 */
 	ret = udp_set_kernel_tunneling(sc->so, NULL, NULL, NULL);
+	MPASS(ret == 0);
 	sorele(sc->so);
 	sc->so = NULL;
 }
@@ -1364,12 +1490,36 @@ opvn_get_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
 	}
 	nvlist_add_number(nvl, "peerid", n->peerid);
 	nvlist_add_number(nvl, "notification", n->type);
-	if (n->type == OVPN_NOTIF_DEL_PEER) {
+	switch (n->type) {
+	case OVPN_NOTIF_DEL_PEER: {
 		nvlist_add_number(nvl, "del_reason", n->del_reason);
 
 		/* No error handling, because we want to send the notification
 		 * even if we can't attach the counters. */
 		ovpn_notif_add_counters(nvl, n);
+		break;
+	}
+	case OVPN_NOTIF_FLOAT: {
+		int ret;
+
+		ret = ovpn_add_sockaddr(nvl, "address",
+		    (struct sockaddr *)&n->address);
+
+		if (ret) {
+			/*
+			 * Try to re-enqueue the notification.  Maybe we'll
+			 * have better luck next time.  No error handling,
+			 * because if we fail to re-enqueue there's nothing
+			 * we can do.
+			 */
+			(void)ovpn_notify_float(sc, n->peerid, &n->address);
+			nvlist_destroy(nvl);
+			free(n, M_OVPN);
+			return (ret);
+		}
+		break;
+	}
+	default:
+		break;
 	}
 
 	free(n, M_OVPN);
@@ -1525,6 +1675,7 @@ ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
     struct rm_priotracker *_ovpn_lock_trackerp)
 {
 	uint32_t af;
+	struct m_tag *mtag;
 
 	OVPN_RASSERT(sc);
 	NET_EPOCH_ASSERT();
@@ -1543,6 +1694,38 @@ ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
 
 	OVPN_RUNLOCK(sc);
 
+	/* Check if the peer changed to a new source address. */
+	mtag = m_tag_find(m, PACKET_TAG_OVPN, NULL);
+	if (mtag != NULL) {
+		struct ovpn_mtag *ot = (struct ovpn_mtag *)(mtag + 1);
+
+		OVPN_WLOCK(sc);
+
+		/*
+		 * Check the address against the peer's remote again,
+		 * because we may race against ourselves (i.e. we may have
+		 * tagged multiple packets to indicate we floated).
+		 */
+		if (ovpn_sockaddr_compare((struct sockaddr *)&ot->addr,
+		    (struct sockaddr *)&peer->remote)) {
+			OVPN_WUNLOCK(sc);
+			goto skip_float;
+		}
+
+		/* And notify userspace. */
+		if (ovpn_notify_float(sc, peer->peerid, &ot->addr) == 0) {
+			/*
+			 * Update the 'remote' for this peer, but only if
+			 * we've actually enqueued the notification.
+			 * Otherwise we can try again later.
+			 */
+			memcpy(&peer->remote, &ot->addr, sizeof(peer->remote));
+		}
+
+		OVPN_WUNLOCK(sc);
+	}
+
+skip_float:
 	OVPN_COUNTER_ADD(sc, received_data_pkts, 1);
 	OVPN_COUNTER_ADD(sc, tunnel_bytes_received, m->m_pkthdr.len);
 	OVPN_PEER_COUNTER_ADD(peer, pkt_in, 1);
@@ -2305,6 +2488,29 @@ ovpn_udp_input(struct mbuf *m, int off, struct inpcb *inp,
 		return (true);
 	}
 
+	/*
+	 * If we got this from a different address than we expected,
+	 * tag the packet.  We'll deal with notifying userspace later,
+	 * after we've decrypted and verified.
+	 */
+	if (!ovpn_sockaddr_compare((struct sockaddr *)&peer->remote, sa)) {
+		struct m_tag *mt;
+		struct ovpn_mtag *ot;
+
+		MPASS(sa->sa_len <= sizeof(ot->addr));
+		mt = m_tag_get(PACKET_TAG_OVPN, sizeof(*ot), M_NOWAIT);
+		/*
+		 * If we fail to allocate here we'll just try again on the
+		 * next packet.
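+		 * The tag only records the new source address; it is
+		 * consumed in ovpn_finish_rx() after the packet has been
+		 * decrypted and verified, so a spoofed datagram cannot
+		 * move a peer's remote on its own.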
+ */ + if (mt != NULL) { + ot = (struct ovpn_mtag *)(mt + 1); + memcpy(&ot->addr, sa, sa->sa_len); + + m_tag_prepend(m, mt); + } + } + if (key->decrypt->cipher == OVPN_CIPHER_ALG_NONE) { /* Now remove the outer headers */ m_adj_decap(m, sizeof(struct udphdr) + ohdrlen); @@ -2519,6 +2725,7 @@ ovpn_clone_destroy_cb(struct epoch_context *ctx) COUNTER_ARRAY_FREE(sc->counters, OVPN_COUNTER_SIZE); + rm_destroy(&sc->lock); if_free(sc->ifp); free(sc, M_OVPN); } @@ -2579,23 +2786,53 @@ vnet_ovpn_init(const void *unused __unused) VNET_SYSINIT(vnet_ovpn_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ovpn_init, NULL); -static void -vnet_ovpn_uninit(const void *unused __unused) +static int +ovpn_prison_remove(void *obj, void *data __unused) { - if_clone_detach(V_ovpn_cloner); +#ifdef VIMAGE + struct prison *pr; + + pr = obj; + if (prison_owns_vnet(pr)) { + CURVNET_SET(pr->pr_vnet); + if (V_ovpn_cloner != NULL) { + ifc_detach_cloner(V_ovpn_cloner); + V_ovpn_cloner = NULL; + } + CURVNET_RESTORE(); + } +#endif + return (0); } -VNET_SYSUNINIT(vnet_ovpn_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, - vnet_ovpn_uninit, NULL); static int ovpnmodevent(module_t mod, int type, void *data) { + static int ovpn_osd_jail_slot; + switch (type) { - case MOD_LOAD: - /* Done in vnet_ovpn_init() */ + case MOD_LOAD: { + /* + * Registration is handled in vnet_ovpn_init(), but cloned + * interfaces must be destroyed via PR_METHOD_REMOVE since they + * hold a reference to the prison via the UDP socket, which + * prevents the prison from being destroyed. + */ + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_REMOVE] = ovpn_prison_remove, + }; + ovpn_osd_jail_slot = osd_jail_register(NULL, methods); break; + } case MOD_UNLOAD: - /* Done in vnet_ovpn_uninit() */ + if (ovpn_osd_jail_slot != 0) + osd_jail_deregister(ovpn_osd_jail_slot); + CURVNET_SET(vnet0); + if (V_ovpn_cloner != NULL) { + ifc_detach_cloner(V_ovpn_cloner); + V_ovpn_cloner = NULL; + } + CURVNET_RESTORE(); break; default: return (EOPNOTSUPP); diff --git a/sys/net/if_ovpn.h b/sys/net/if_ovpn.h index 2d6b8c1e7eff..2a24c35788a9 100644 --- a/sys/net/if_ovpn.h +++ b/sys/net/if_ovpn.h @@ -37,6 +37,7 @@ enum ovpn_notif_type { OVPN_NOTIF_DEL_PEER, OVPN_NOTIF_ROTATE_KEY, + OVPN_NOTIF_FLOAT, }; enum ovpn_del_reason { diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 3bab04aa4d38..5e6f65c04b2f 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -74,6 +74,7 @@ #include <sys/malloc.h> #include <sys/random.h> #include <sys/ctype.h> +#include <sys/osd.h> #include <net/ethernet.h> #include <net/if.h> @@ -178,6 +179,7 @@ struct tuntap_softc { static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; +static int tuntap_osd_jail_slot; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; @@ -497,6 +499,10 @@ vmnet_clone_match(struct if_clone *ifc, const char *name) return (0); } +/* + * Create a clone via the ifnet cloning mechanism. Note that this is invoked + * indirectly by tunclone() below. + */ static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) @@ -532,15 +538,19 @@ tun_clone_create(struct if_clone *ifc, char *name, size_t len, if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); if (i == 0) { - dev_ref(dev); + struct tuntap_softc *tp; + tuncreate(dev); - struct tuntap_softc *tp = dev->si_drv1; + tp = dev->si_drv1; *ifpp = tp->tun_ifp; } return (i); } +/* + * Create a clone via devfs access. 
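+ * (For example, the first open("/dev/tun0") arrives here via the
+ * dev_clone event handler registered below; the cdev is created on
+ * demand and if_clone_create() then instantiates the matching ifnet.)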
+ */ static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) @@ -595,11 +605,12 @@ tunclone(void *arg, struct ucred *cred, char *name, int namelen, } i = tun_create_device(drv, u, cred, dev, name); - } - if (i == 0) { + } else { + /* Consumed by the dev_clone invoker. */ dev_ref(*dev); - if_clone_create(name, namelen, NULL); } + if (i == 0) + if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } @@ -670,16 +681,6 @@ VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void -vnet_tun_uninit(const void *unused __unused) -{ - - for (u_int i = 0; i < NDRV; ++i) - if_clone_detach(V_tuntap_driver_cloners[i]); -} -VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, - vnet_tun_uninit, NULL); - -static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; @@ -689,6 +690,16 @@ tun_uninit(const void *unused __unused) EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); + CURVNET_SET(vnet0); + for (u_int i = 0; i < NDRV; i++) { + if_clone_detach(V_tuntap_driver_cloners[i]); + V_tuntap_driver_cloners[i] = NULL; + } + CURVNET_RESTORE(); + + if (tuntap_osd_jail_slot != 0) + osd_jail_deregister(tuntap_osd_jail_slot); + mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); @@ -724,6 +735,30 @@ tuntap_driver_from_ifnet(const struct ifnet *ifp) return (NULL); } +/* + * Remove devices that were created by devfs cloning, as they hold references + * which prevent the prison from collapsing, in which state VNET sysuninits will + * not be invoked. + */ +static int +tuntap_prison_remove(void *obj, void *data __unused) +{ +#ifdef VIMAGE + struct prison *pr; + + pr = obj; + if (prison_owns_vnet(pr)) { + CURVNET_SET(pr->pr_vnet); + for (u_int i = 0; i < NDRV; i++) { + if_clone_detach(V_tuntap_driver_cloners[i]); + V_tuntap_driver_cloners[i] = NULL; + } + CURVNET_RESTORE(); + } +#endif + return (0); +} + static int tuntapmodevent(module_t mod, int type, void *data) { @@ -738,8 +773,12 @@ tuntapmodevent(module_t mod, int type, void *data) clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_REMOVE] = tuntap_prison_remove, + }; + tuntap_osd_jail_slot = osd_jail_register(NULL, methods); arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, - tunrename, 0, 1000); + tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); @@ -747,7 +786,7 @@ tuntapmodevent(module_t mod, int type, void *data) return (ENOMEM); break; case MOD_UNLOAD: - /* See tun_uninit, so it's done after the vnet_sysuninit() */ + /* See tun_uninit(). */ break; default: return EOPNOTSUPP; @@ -798,6 +837,8 @@ tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { + mtx_destroy(&tp->tun_mtx); + cv_destroy(&tp->tun_cv); free(tp, M_TUN); return (error); } @@ -914,7 +955,6 @@ tap_transmit(struct ifnet *ifp, struct mbuf *m) return (error); } -/* XXX: should return an error code so it can fail. 
*/ static void tuncreate(struct cdev *dev) { diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 22fcb7bf7c64..61000018e5a4 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -2336,6 +2336,18 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = ENOENT; break; } + + /* + * If the ifp is in a bridge, do not allow setting the device + * to a bridge; this prevents having a bridge SVI as a bridge + * member (which is not permitted). + */ + if (ifp->if_bridge != NULL && p->if_type == IFT_BRIDGE) { + if_rele(p); + error = EINVAL; + break; + } + if (vlr.vlr_proto == 0) vlr.vlr_proto = ETHERTYPE_VLAN; oldmtu = ifp->if_mtu; diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 452a8eb4024b..d55afe750869 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -331,6 +331,14 @@ MALLOC_DECLARE(M_PF_RULE_ITEM); SDT_PROVIDER_DECLARE(pf); SDT_PROBE_DECLARE(pf, , test, reason_set); +SDT_PROBE_DECLARE(pf, , log, log); + +#define DPFPRINTF(n, fmt, x...) \ + do { \ + SDT_PROBE2(pf, , log, log, (n), fmt); \ + if (V_pf_status.debug >= (n)) \ + printf(fmt "\n", ##x); \ + } while (0) struct pfi_dynaddr { TAILQ_ENTRY(pfi_dynaddr) entry; @@ -1676,6 +1684,9 @@ struct pf_pdesc { u_int32_t fragoff; /* fragment header offset */ u_int32_t jumbolen; /* length from v6 jumbo header */ u_int32_t badopts; /* v4 options or v6 routing headers */ +#define PF_OPT_OTHER 0x0001 +#define PF_OPT_JUMBO 0x0002 +#define PF_OPT_ROUTER_ALERT 0x0004 u_int16_t *ip_sum; u_int16_t flags; /* Let SCRUB trigger behavior in diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c index b889131b544b..538cd43a88a3 100644 --- a/sys/netinet/in_fib_dxr.c +++ b/sys/netinet/in_fib_dxr.c @@ -345,7 +345,7 @@ initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) struct heap_entry *fhp = &da->heap[0]; struct rtentry *rt; struct route_nhop_data rnd; - + da->heap_index = 0; da->dst.sin_addr.s_addr = htonl(dst_u32); rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, @@ -1143,7 +1143,7 @@ dxr_destroy(void *data) free(da, M_DXRAUX); } -static void +static void epoch_dxr_destroy(epoch_context_t ctx) { struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); @@ -1202,7 +1202,7 @@ dxr_dump_end(void *data, struct fib_dp *dp) static enum flm_op_result dxr_dump_rib_item(struct rtentry *rt, void *data) { - + return (FLM_SUCCESS); } diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c index 66af716eea52..7d8cb965ab09 100644 --- a/sys/netinet/sctp_timer.c +++ b/sys/netinet/sctp_timer.c @@ -35,7 +35,6 @@ #define _IP_VHL #include <netinet/sctp_os.h> #include <netinet/sctp_pcb.h> - #include <netinet/sctp_var.h> #include <netinet/sctp_sysctl.h> #include <netinet/sctp_timer.h> diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index 3e7eef8a1cda..f8c064b6a104 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -377,12 +377,12 @@ extern int32_t tcp_trace_point_count; /* * Returns true if any sort of BB logging is enabled, - * commonly used throughout the codebase. + * commonly used throughout the codebase. 
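+ * A typical (illustrative) caller guards log construction with it so
+ * the setup cost is only paid when logging is enabled:
+ *
+ *	if (tcp_bblogging_on(tp)) {
+ *		union tcp_log_stackspecific log;
+ *
+ *		memset(&log, 0, sizeof(log));
+ *		...
+ *	}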
*/ static inline int tcp_bblogging_on(struct tcpcb *tp) { - if (tp->_t_logstate <= TCP_LOG_STATE_OFF) + if (tp->_t_logstate <= TCP_LOG_STATE_OFF) return (0); if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) return (0); @@ -427,7 +427,7 @@ tcp_set_bblog_state(struct tcpcb *tp, uint8_t ls, uint8_t bbpoint) } } -static inline uint32_t +static inline uint32_t tcp_get_bblog_state(struct tcpcb *tp) { return (tp->_t_logstate); diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 10afed17bf3b..7512679bd4e9 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1301,9 +1301,9 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h return (TCP_LRO_CANNOT); #endif if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != - ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || + ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { - /* + /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. @@ -1334,7 +1334,7 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h #endif /* If no hardware or arrival stamp on the packet add timestamp */ if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) { - m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); + m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); m->m_flags |= M_TSTMP_LRO; } @@ -1429,9 +1429,9 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) int error; if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != - ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || + ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { - /* + /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. @@ -1481,7 +1481,7 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) ((mb->m_flags & M_TSTMP) == 0)) { /* Add in an LRO time since no hardware */ binuptime(&lc->lro_last_queue_time); - mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); + mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); mb->m_flags |= M_TSTMP_LRO; } diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index 90d789f0e224..4405098a8620 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -744,7 +744,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) while (cur != NULL) { if (!(sblkp >= sack_blocks)) { if (((loss_sblks >= tcprexmtthresh) || - (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) + (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) break; loss_thresh += loss_hiack - cur->end; loss_hiack = cur->start; diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index d2636f01714e..b232d3f08fe6 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -5126,8 +5126,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; @@ -10141,7 +10141,7 @@ bbr_init(struct tcpcb *tp, void **ptr) * flags. */ bbr_stop_all_timers(tp, bbr); - /* + /* * Validate the timers are not in usec, if they are convert. * BBR should in theory move to USEC and get rid of a * lot of the TICKS_2 calls.. 
but for now we stay @@ -11544,7 +11544,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (nxt_pkt == 0) { if ((bbr->r_wanted_output != 0) || - (tp->t_flags & TF_ACKNOW)) { + (tp->t_flags & TF_ACKNOW)) { bbr->rc_output_starts_timer = 0; did_out = 1; @@ -13172,11 +13172,7 @@ send: mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - ((rsm == NULL) ? hw_tls : 0) -#ifdef NETFLIX_COPY_ARGS - , NULL, NULL -#endif - ); + ((rsm == NULL) ? hw_tls : 0)); if (len <= maxseg) { /* * Must have ran out of mbufs for the copy @@ -13806,8 +13802,8 @@ nomore: tp->t_maxseg = old_maxseg - 40; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 834e1347a152..940a4024bb73 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -7888,8 +7888,8 @@ drop_it: tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; @@ -14638,9 +14638,6 @@ rack_init(struct tcpcb *tp, void **ptr) if (rack->r_ctl.pcm_s == NULL) { rack->r_ctl.pcm_i.cnt_alloc = 0; } -#ifdef NETFLIX_STATS - rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask; -#endif rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; if (rack_enable_shared_cwnd) @@ -15564,7 +15561,7 @@ rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; struct timeval tv; - + (void)tcp_get_usecs(&tv); memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); @@ -19915,7 +19912,7 @@ rack_output(struct tcpcb *tp) goto nomore; } else { /* Return == 0, if there is more we can send tot_len wise fall through and send */ - if (tot_len_this_send >= pace_max_seg) + if (tot_len_this_send >= pace_max_seg) return (ret); #ifdef TCP_ACCOUNTING /* We need to re-pin since fast_output un-pined */ @@ -21556,11 +21553,7 @@ send: m->m_next = tcp_m_copym( mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - ((rsm == NULL) ? hw_tls : 0) -#ifdef NETFLIX_COPY_ARGS - , &s_mb, &s_moff -#endif - ); + ((rsm == NULL) ? hw_tls : 0)); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c index b0e300847c4a..101e6826536c 100644 --- a/sys/netinet/tcp_stacks/rack_pcm.c +++ b/sys/netinet/tcp_stacks/rack_pcm.c @@ -172,7 +172,7 @@ rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint3 goto skip_ack_accounting; } /* - * Record ACK data. + * Record ACK data. 
*/ ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) { @@ -305,7 +305,7 @@ skip_ack_accounting: 0, &log, false, NULL, NULL, 0, &tv); } } - /* + /* * Here we need a lot to be added including: * 1) Some form of measurement, where if we think the measurement * is valid we iterate over the PCM data and come up with a path diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c index fc9ee8454a1e..2b70548f3cc6 100644 --- a/sys/netinet/tcp_stacks/sack_filter.c +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -400,7 +400,7 @@ sack_filter_run(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq break; } /* Copy it out to the outbound */ - memcpy(&in[at], &blkboard[i], sizeof(struct sackblk)); + memcpy(&in[at], &blkboard[i], sizeof(struct sackblk)); at++; room--; /* now lets add it to our sack-board */ @@ -588,7 +588,7 @@ sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, i sf->sf_ack = th_ack; for(i=0, sf->sf_cur=0; i<numblks; i++) { - if ((in[i].end != tp->snd_max) && + if ((in[i].end != tp->snd_max) && ((in[i].end - in[i].start) < segmax)) { /* * We do not accept blocks less than a MSS minus all @@ -707,7 +707,7 @@ main(int argc, char **argv) out = stdout; memset(&tp, 0, sizeof(tp)); tp.t_maxseg = 1460; - + while ((i = getopt(argc, argv, "dIi:o:?hS:")) != -1) { switch (i) { case 'S': @@ -883,7 +883,7 @@ main(int argc, char **argv) } else { printf("can't open sack_setup.bin -- sorry no load\n"); } - + } else if (strncmp(buffer, "help", 4) == 0) { help: fprintf(out, "You can input:\n"); diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h index b12fcf84567c..a1c0684a4359 100644 --- a/sys/netinet/tcp_stacks/sack_filter.h +++ b/sys/netinet/tcp_stacks/sack_filter.h @@ -42,7 +42,7 @@ * previously processed sack information. * * The second thing that the sack filter does is help protect against malicious - * attackers that are trying to attack any linked lists (or other data structures) + * attackers that are trying to attack any linked lists (or other data structures) * that are used in sack processing. Consider an attacker sending in sacks for * every other byte of data outstanding. This could in theory drastically split * up any scoreboard you are maintaining and make you search through a very large diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index db415f6bdf03..26e7e53d540c 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -4537,7 +4537,7 @@ tcp_change_time_units(struct tcpcb *tp, int granularity) panic("Unknown granularity:%d tp:%p", granularity, tp); } -#endif +#endif } void diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 32ce3001929c..3b9fe7a317b0 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -757,8 +757,8 @@ tcp_timer_rexmt(struct tcpcb *tp) tp->t_maxseg = tp->t_pmtud_saved_maxseg; if (tp->t_maxseg < V_tcp_mssdflt) { /* - * The MSS is so small we should not - * process incoming SACK's since we are + * The MSS is so small we should not + * process incoming SACK's since we are * subject to attack in such a case. 
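+				 * (For instance, a malicious receiver could
+				 * SACK every other byte of the window,
+				 * shattering the scoreboard into thousands
+				 * of tiny blocks; see the discussion in
+				 * sack_filter.h.)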
*/ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 687b0d538666..98c934955121 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -164,7 +164,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) goto out; so->so_rcv.sb_flags |= SB_AUTOSIZE; - so->so_snd.sb_flags |= SB_AUTOSIZE; + so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE); error = in_pcballoc(so, &V_tcbinfo); if (error) goto out; @@ -1768,9 +1768,9 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) /* * Release the ref count the lookup * acquired. - */ + */ refcount_release(&blk->tfb_refcnt); - /* + /* * Now there is a chance that the * init() function mucked with some * things before it failed, such as @@ -1800,7 +1800,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) * new one already. */ refcount_release(&tp->t_fb->tfb_refcnt); - /* + /* * Set in the new stack. */ tp->t_fb = blk; @@ -1934,7 +1934,7 @@ tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt) CC_LIST_RUNLOCK(); return(ESRCH); } - /* + /* * With a reference the algorithm cannot be removed * so we hold a reference through the change process. */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 059b2aff689d..b90f65e83cb1 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -182,7 +182,7 @@ struct tcp_sendfile_track { * snd_una). When the response comes back indicating * that there was data (return value 1), then the caller * can build a sendmap entry based on the range and the - * times. The next query would then be done at the + * times. The next query would then be done at the * newly created sendmap_end. Repeated until sendmap_end == snd_max. * * Flags in sendmap_flags are defined below as well. @@ -197,7 +197,7 @@ struct tcp_sendfile_track { * The rack_times are a misc collection of information that * the old stack might possibly fill in. Of course its possible * that an old stack may not have a piece of information. If so - * then setting that value to zero is advised. Setting any + * then setting that value to zero is advised. Setting any * timestamp passed should only place a zero in it when it * is unfilled. This may mean that a time is off by a micro-second * but this is ok in the grand scheme of things. @@ -205,13 +205,13 @@ struct tcp_sendfile_track { * When switching stacks it is desireable to get as much information * from the old stack to the new stack as possible. Though not always * will the stack be compatible in the types of information. The - * init() function needs to take care when it begins changing + * init() function needs to take care when it begins changing * things such as inp_flags2 and the timer units to position these * changes at a point where it is unlikely they will fail after * making such changes. A stack optionally can have an "undo" - * function + * function * - * To transfer information to the old stack from the new in + * To transfer information to the old stack from the new in * respect to LRO and the inp_flags2, the new stack should set * the inp_flags2 to what it supports. The old stack in its * fini() function should call the tcp_handle_orphaned_packets() @@ -544,13 +544,13 @@ typedef enum { * do is: * a) Make sure that the inp_flags2 is setup correctly * for LRO. There are two flags that the previous - * stack may have set INP_MBUF_ACKCMP and + * stack may have set INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. 
If the new stack does not * support these it *should* clear the flags. * b) Make sure that the timers are in the proper * granularity that the stack wants. The stack * should check the t_tmr_granularity field. Currently - * there are two values that it may hold + * there are two values that it may hold * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. * Use the functions tcp_timer_convert(tp, granularity); * to move the timers to the correct format for your stack. @@ -558,14 +558,14 @@ typedef enum { * The new stack may also optionally query the tfb_chg_query * function if the old stack has one. The new stack may ask * for one of three entries and can also state to the old - * stack its support for the INP_MBUF_ACKCMP and + * stack its support for the INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. This is important since if there are * queued ack's without that statement the old stack will * be forced to discard the queued acks. The requests that * can be made for information by the new stacks are: * * Note also that the tfb_tcp_fb_init() when called can - * determine if a query is needed by looking at the + * determine if a query is needed by looking at the * value passed in the ptr. The ptr is designed to be * set in with any allocated memory, but the address * of the condtion (ptr == &tp->t_fb_ptr) will be @@ -573,17 +573,17 @@ typedef enum { * setup of a tcb (which means no query would be needed). * If, however, the value is not t_fb_ptr, then the caller * is in the middle of a stack switch and is the new stack. - * A query would be appropriate (if the new stack support + * A query would be appropriate (if the new stack support * the query mechanism). * * TCP_QUERY_SENDMAP - Query of outstanding data. * TCP_QUERY_TIMERS_UP - Query about running timers. - * TCP_SUPPORTED_LRO - Declaration in req_param of - * the inp_flags2 supported by + * TCP_SUPPORTED_LRO - Declaration in req_param of + * the inp_flags2 supported by * the new stack. * TCP_QUERY_RACK_TIMES - Enquire about various timestamps * and states the old stack may be in. - * + * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index dafbaf6dc672..42cfb919e263 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -243,7 +243,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in6 udp_in6; #endif struct udpcb *up; - bool filtered; INP_LOCK_ASSERT(inp); @@ -252,13 +251,19 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { + bool filtered; + in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0], up->u_tun_ctx); INP_RLOCK(inp); - if (filtered) - return (in_pcbrele_rlocked(inp)); + if (in_pcbrele_rlocked(inp)) + return (1); + if (filtered) { + INP_RUNLOCK(inp); + return (1); + } } off += sizeof(struct udphdr); diff --git a/sys/netinet6/scope6.c b/sys/netinet6/scope6.c index 0987ea7e99ad..08702a2e81ab 100644 --- a/sys/netinet6/scope6.c +++ b/sys/netinet6/scope6.c @@ -505,8 +505,23 @@ in6_set_unicast_scopeid(struct in6_addr *in6, uint32_t scopeid) struct ifnet* in6_getlinkifnet(uint32_t zoneid) { + struct ifnet *ifp; - return (ifnet_byindex((u_short)zoneid)); + ifp = ifnet_byindex((u_short)zoneid); + + if (ifp == NULL) + return (NULL); + + /* An interface might not be IPv6 capable. 
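in6_getlinkifnet() now folds two failure modes into a single NULL return, so callers resolving an embedded scope id need only one check. A toy model of that contract (reduced types, invented names):

    #include <stddef.h>
    #include <stdint.h>

    struct ifnet_lite {
            const char *if_name;
            void *if_afdata_inet6;  /* NULL => no IPv6 on this ifnet */
    };

    /*
     * One NULL return covers both "no interface at that index" and
     * "interface has no IPv6 AF data", as in the change below.
     */
    static struct ifnet_lite *
    getlinkifnet_lite(struct ifnet_lite **tbl, size_t n, uint32_t zoneid)
    {
            if (zoneid >= n || tbl[zoneid] == NULL)
                    return (NULL);
            if (tbl[zoneid]->if_afdata_inet6 == NULL)
                    return (NULL);
            return (tbl[zoneid]);
    }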
*/ + if (ifp->if_afdata[AF_INET6] == NULL) { + log(LOG_NOTICE, + "%s: embedded scope points to an interface without " + "IPv6: %s%%%d.\n", __func__, + if_name(ifp), zoneid); + return (NULL); + } + + return (ifp); } /* diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index 304effa26e01..b3ed16fda713 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -142,7 +142,6 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct socket *so; struct mbuf *opts = NULL, *tmp_opts; struct udpcb *up; - bool filtered; INP_LOCK_ASSERT(inp); @@ -151,13 +150,19 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off, */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { + bool filtered; + in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0], up->u_tun_ctx); INP_RLOCK(inp); - if (filtered) - return (in_pcbrele_rlocked(inp)); + if (in_pcbrele_rlocked(inp)) + return (1); + if (filtered) { + INP_RUNLOCK(inp); + return (1); + } } off += sizeof(struct udphdr); diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index 923633d76df7..c129c8c49921 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -196,7 +196,7 @@ SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, - "Only do a single pass through ipfw when using dummynet(4)"); + "Only do a single pass through ipfw when using dummynet(4), ipfw_nat or other divert(4)-like interfaces"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c index 4e03584b8f85..ee10a997c977 100644 --- a/sys/netpfil/pf/if_pfsync.c +++ b/sys/netpfil/pf/if_pfsync.c @@ -110,8 +110,6 @@ #include <netpfil/pf/pfsync_nv.h> -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - struct pfsync_bucket; struct pfsync_softc; @@ -597,9 +595,9 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) if ((rpool_first == NULL) || (TAILQ_NEXT(rpool_first, entries) != NULL)) { DPFPRINTF(PF_DEBUG_MISC, - ("%s: can't recover routing information " - "because of empty or bad redirection pool\n", - __func__)); + "%s: can't recover routing information " + "because of empty or bad redirection pool", + __func__); return ((flags & PFSYNC_SI_IOCTL) ? EINVAL : 0); } rt = r->rt; @@ -610,8 +608,8 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) * give up on recovering. */ DPFPRINTF(PF_DEBUG_MISC, - ("%s: can't recover routing information " - "because of different ruleset\n", __func__)); + "%s: can't recover routing information " + "because of different ruleset", __func__); return ((flags & PFSYNC_SI_IOCTL) ? EINVAL : 0); } break; @@ -624,8 +622,8 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) rt_kif = pfi_kkif_find(sp->pfs_1400.rt_ifname); if (rt_kif == NULL) { DPFPRINTF(PF_DEBUG_MISC, - ("%s: unknown route interface: %s\n", - __func__, sp->pfs_1400.rt_ifname)); + "%s: unknown route interface: %s", + __func__, sp->pfs_1400.rt_ifname); return ((flags & PFSYNC_SI_IOCTL) ? 
EINVAL : 0); } rt = sp->pfs_1400.rt; diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 009f7e4d78b1..79c298c18b46 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -119,8 +119,6 @@ #include <machine/in_cksum.h> #include <security/mac/mac_framework.h> -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - SDT_PROVIDER_DEFINE(pf); SDT_PROBE_DEFINE2(pf, , test, reason_set, "int", "int"); SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *", @@ -161,6 +159,7 @@ SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *"); SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match, "int", "struct pf_keth_rule *"); SDT_PROBE_DEFINE2(pf, purge, state, rowcount, "int", "size_t"); +SDT_PROBE_DEFINE2(pf, , log, log, "int", "const char *"); /* * Global variables @@ -375,6 +374,8 @@ static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, int, u_int16_t); static int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); +static int pf_walk_option(struct pf_pdesc *, struct ip *, + int, int, u_short *); static int pf_walk_header(struct pf_pdesc *, struct ip *, u_short *); #ifdef INET6 static int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *, @@ -4615,8 +4616,8 @@ pf_match_rcvif(struct mbuf *m, struct pf_krule *r) if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: kif == NULL, @%d via %s\n", __func__, r->nr, - r->rcv_ifname)); + "%s: kif == NULL, @%d via %s", __func__, r->nr, + r->rcv_ifname); return (0); } @@ -4975,7 +4976,7 @@ pf_socket_lookup(struct pf_pdesc *pd) } INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; - pd->lookup.gid = inp->inp_cred->cr_groups[0]; + pd->lookup.gid = inp->inp_cred->cr_gid; INP_RUNLOCK(inp); return (1); @@ -5242,8 +5243,8 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0) if (__predict_false(m->m_len < sizeof(struct ether_header)) && (m = *m0 = m_pullup(*m0, sizeof(struct ether_header))) == NULL) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: m_len < sizeof(struct ether_header)" - ", pullup failed\n", __func__)); + "%s: m_len < sizeof(struct ether_header)" + ", pullup failed", __func__); return (PF_DROP); } e = mtod(m, struct ether_header *); @@ -5759,7 +5760,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; - pd->lookup.gid = inp->inp_cred->cr_groups[0]; + pd->lookup.gid = inp->inp_cred->cr_gid; pd->lookup.done = 1; } @@ -6168,8 +6169,8 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx, &s->src, &s->dst, &ctx->rewrite)) { /* This really shouldn't happen!!! 
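Throughout pf, the double-parenthesized DPFPRINTF((...)) form is being replaced by a true variadic macro, which is why every converted call site drops the inner parentheses and the trailing \n. The new definition is not part of this diff (presumably it appends the newline itself and can feed the new pf:log SDT probe); the mechanical difference looks like this (##__VA_ARGS__ is the GNU/C2x extension the kernel already uses):

    #include <stdio.h>

    static int debug_level = 1;

    /* Old style: the argument list must be double-parenthesized. */
    #define DPFPRINTF_OLD(n, x)     if (debug_level >= (n)) printf x

    /* New style: true variadic macro, newline appended by the macro. */
    #define DPFPRINTF_NEW(n, fmt, ...)                              \
            do {                                                    \
                    if (debug_level >= (n))                         \
                            printf(fmt "\n", ##__VA_ARGS__);        \
            } while (0)

    int
    main(void)
    {
            DPFPRINTF_OLD(1, ("old: %d\n", 42));
            DPFPRINTF_NEW(1, "new: %d", 42);
            return (0);
    }

The do/while(0) wrapper also makes the new form safe in unbraced if/else bodies, which the old expansion was not.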
*/ DPFPRINTF(PF_DEBUG_URGENT, - ("%s: tcp normalize failed on first " - "pkt\n", __func__)); + "%s: tcp normalize failed on first " + "pkt", __func__); goto csfailed; } } else if (pd->proto == IPPROTO_SCTP) { @@ -7398,7 +7399,7 @@ pf_sctp_multihome_delayed(struct pf_pdesc *pd, struct pfi_kkif *kif, { struct pf_sctp_multihome_job *j, *tmp; struct pf_sctp_source *i; - int ret __unused; + int ret; struct pf_kstate *sm = NULL; struct pf_krule *ra = NULL; struct pf_krule *r = &V_pf_default_rule; @@ -7965,8 +7966,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(ip)\n")); + "pf: ICMP error message too short " + "(ip)"); return (PF_DROP); } /* @@ -7996,8 +7997,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(ip6)\n")); + "pf: ICMP error message too short " + "(ip6)"); return (PF_DROP); } pd2.off = ipoff2; @@ -8049,8 +8050,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, pd2.off, th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(tcp)\n")); + "pf: ICMP error message too short " + "(tcp)"); return (PF_DROP); } pd2.pcksum = &pd2.hdr.tcp.th_sum; @@ -8244,8 +8245,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, pd2.off, uh, sizeof(*uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(udp)\n")); + "pf: ICMP error message too short " + "(udp)"); return (PF_DROP); } pd2.pcksum = &pd2.hdr.udp.uh_sum; @@ -8376,8 +8377,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (! 
pf_pull_hdr(pd->m, pd2.off, sh, sizeof(*sh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(sctp)\n")); + "pf: ICMP error message too short " + "(sctp)"); return (PF_DROP); } pd2.pcksum = &pd2.sctp_dummy_sum; @@ -8407,8 +8408,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (src->scrub->pfss_v_tag != sh->v_tag) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message has incorrect " - "SCTP v_tag\n")); + "pf: ICMP error message has incorrect " + "SCTP v_tag"); return (PF_DROP); } @@ -8531,8 +8532,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, pd2.off, iih, ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short i" - "(icmp)\n")); + "pf: ICMP error message too short i" + "(icmp)"); return (PF_DROP); } pd2.pcksum = &pd2.hdr.icmp.icmp_cksum; @@ -8651,8 +8652,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, if (!pf_pull_hdr(pd->m, pd2.off, iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: ICMP error message too short " - "(icmp6)\n")); + "pf: ICMP error message too short " + "(icmp6)"); return (PF_DROP); } pd2.pcksum = &pd2.hdr.icmp6.icmp6_cksum; @@ -9082,7 +9083,7 @@ pf_route(struct pf_krule *r, struct ifnet *oifp, } if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: m0->m_len < sizeof(struct ip)\n", __func__)); + "%s: m0->m_len < sizeof(struct ip)", __func__); SDT_PROBE1(pf, ip, route_to, drop, __LINE__); goto bad; } @@ -9387,8 +9388,8 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp, } if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", - __func__)); + "%s: m0->m_len < sizeof(struct ip6_hdr)", + __func__); SDT_PROBE1(pf, ip6, route_to, drop, __LINE__); goto bad; } @@ -9683,7 +9684,7 @@ pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname)); + "%s: kif == NULL, if_xname %s", __func__, ifp->if_xname); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) @@ -9798,6 +9799,62 @@ pf_dummynet_route(struct pf_pdesc *pd, struct pf_kstate *s, } static int +pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end, + u_short *reason) +{ + uint8_t type, length, opts[15 * 4 - sizeof(struct ip)]; + + /* IP header in payload of ICMP packet may be too short */ + if (pd->m->m_pkthdr.len < end) { + DPFPRINTF(PF_DEBUG_MISC, "IP option too short"); + REASON_SET(reason, PFRES_SHORT); + return (PF_DROP); + } + + MPASS(end - off <= sizeof(opts)); + m_copydata(pd->m, off, end - off, opts); + end -= off; + off = 0; + + while (off < end) { + type = opts[off]; + if (type == IPOPT_EOL) + break; + if (type == IPOPT_NOP) { + off++; + continue; + } + if (off + 2 > end) { + DPFPRINTF(PF_DEBUG_MISC, "IP length opt"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); + } + length = opts[off + 1]; + if (length < 2) { + DPFPRINTF(PF_DEBUG_MISC, "IP short opt"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); + } + if (off + length > end) { + DPFPRINTF(PF_DEBUG_MISC, "IP long opt"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); + } + switch (type) { + case IPOPT_RA: + pd->badopts |= PF_OPT_ROUTER_ALERT; + break; + default: + pd->badopts |= PF_OPT_OTHER; + break; + } + off += length; + } + + return (PF_PASS); +} + +static int pf_walk_header(struct pf_pdesc 
*pd, struct ip *h, u_short *reason) { struct ah ext; @@ -9809,11 +9866,28 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason) REASON_SET(reason, PFRES_SHORT); return (PF_DROP); } - if (hlen != sizeof(struct ip)) - pd->badopts++; + if (hlen != sizeof(struct ip)) { + if (pf_walk_option(pd, h, pd->off + sizeof(struct ip), + pd->off + hlen, reason) != PF_PASS) + return (PF_DROP); + /* header options which contain only padding is fishy */ + if (pd->badopts == 0) + pd->badopts |= PF_OPT_OTHER; + } end = pd->off + ntohs(h->ip_len); pd->off += hlen; pd->proto = h->ip_p; + /* IGMP packets have router alert options, allow them */ + if (pd->proto == IPPROTO_IGMP) { + /* According to RFC 1112 ttl must be set to 1. */ + if ((h->ip_ttl != 1) || + !IN_MULTICAST(ntohl(h->ip_dst.s_addr))) { + DPFPRINTF(PF_DEBUG_MISC, "Invalid IGMP"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); + } + pd->badopts &= ~PF_OPT_ROUTER_ALERT; + } /* stop walking over non initial fragments */ if ((h->ip_off & htons(IP_OFFMASK)) != 0) return (PF_PASS); @@ -9826,7 +9900,7 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason) return (PF_PASS); if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext), NULL, reason, AF_INET)) { - DPFPRINTF(PF_DEBUG_MISC, ("IP short exthdr")); + DPFPRINTF(PF_DEBUG_MISC, "IP short exthdr"); return (PF_DROP); } pd->off += (ext.ah_len + 2) * 4; @@ -9836,7 +9910,7 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason) return (PF_PASS); } } - DPFPRINTF(PF_DEBUG_MISC, ("IPv4 nested authentication header limit")); + DPFPRINTF(PF_DEBUG_MISC, "IPv4 nested authentication header limit"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } @@ -9852,7 +9926,7 @@ pf_walk_option6(struct pf_pdesc *pd, struct ip6_hdr *h, int off, int end, while (off < end) { if (!pf_pull_hdr(pd->m, off, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short opt type")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt type"); return (PF_DROP); } if (opt.ip6o_type == IP6OPT_PAD1) { @@ -9861,41 +9935,48 @@ pf_walk_option6(struct pf_pdesc *pd, struct ip6_hdr *h, int off, int end, } if (!pf_pull_hdr(pd->m, off, &opt, sizeof(opt), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short opt")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt"); return (PF_DROP); } if (off + sizeof(opt) + opt.ip6o_len > end) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 long opt")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 long opt"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } switch (opt.ip6o_type) { + case IP6OPT_PADN: + break; case IP6OPT_JUMBO: + pd->badopts |= PF_OPT_JUMBO; if (pd->jumbolen != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple jumbo")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple jumbo"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } if (ntohs(h->ip6_plen) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 bad jumbo plen")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 bad jumbo plen"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } if (!pf_pull_hdr(pd->m, off, &jumbo, sizeof(jumbo), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short jumbo")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbo"); return (PF_DROP); } memcpy(&pd->jumbolen, jumbo.ip6oj_jumbo_len, sizeof(pd->jumbolen)); pd->jumbolen = ntohl(pd->jumbolen); if (pd->jumbolen < IPV6_MAXPACKET) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short jumbolen")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbolen"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } break; + case 
IP6OPT_ROUTER_ALERT: + pd->badopts |= PF_OPT_ROUTER_ALERT; + break; default: + pd->badopts |= PF_OPT_OTHER; break; } off += sizeof(opt) + opt.ip6o_len; @@ -9909,6 +9990,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) { struct ip6_frag frag; struct ip6_ext ext; + struct icmp6_hdr icmp6; struct ip6_rthdr rthdr; uint32_t end; int hdr_cnt, fraghdr_cnt = 0, rthdr_cnt = 0; @@ -9920,27 +10002,40 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) for (hdr_cnt = 0; hdr_cnt < PF_HDR_LIMIT; hdr_cnt++) { switch (pd->proto) { case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: - pd->badopts++; + pd->badopts |= PF_OPT_OTHER; + break; + case IPPROTO_HOPOPTS: + if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext), + NULL, reason, AF_INET6)) { + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr"); + return (PF_DROP); + } + if (pf_walk_option6(pd, h, pd->off + sizeof(ext), + pd->off + (ext.ip6e_len + 1) * 8, + reason) != PF_PASS) + return (PF_DROP); + /* option header which contains only padding is fishy */ + if (pd->badopts == 0) + pd->badopts |= PF_OPT_OTHER; break; } switch (pd->proto) { case IPPROTO_FRAGMENT: if (fraghdr_cnt++) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple fragment")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple fragment"); REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* jumbo payload packets cannot be fragmented */ if (pd->jumbolen != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 fragmented jumbo")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 fragmented jumbo"); REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } if (!pf_pull_hdr(pd->m, pd->off, &frag, sizeof(frag), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short fragment")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short fragment"); return (PF_DROP); } /* stop walking over non initial fragments */ @@ -9956,7 +10051,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) break; case IPPROTO_ROUTING: if (rthdr_cnt++) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple rthdr")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple rthdr"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } @@ -9968,11 +10063,11 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) } if (!pf_pull_hdr(pd->m, pd->off, &rthdr, sizeof(rthdr), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short rthdr")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short rthdr"); return (PF_DROP); } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 rthdr0")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 rthdr0"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } @@ -9980,7 +10075,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) case IPPROTO_HOPOPTS: /* RFC2460 4.1: Hop-by-Hop only after IPv6 header */ if (pd->proto == IPPROTO_HOPOPTS && hdr_cnt > 0) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 hopopts not first")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 hopopts not first"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } @@ -9989,7 +10084,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) case IPPROTO_DSTOPTS: if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext), NULL, reason, AF_INET6)) { - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short exthdr")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr"); return (PF_DROP); } /* fragments may be short */ @@ -10001,18 +10096,11 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) /* reassembly needs the ext header before the frag */ if (pd->fragoff == 0) 
pd->extoff = pd->off; - if (pd->proto == IPPROTO_HOPOPTS && pd->fragoff == 0) { - if (pf_walk_option6(pd, h, - pd->off + sizeof(ext), - pd->off + (ext.ip6e_len + 1) * 8, reason) - != PF_PASS) - return (PF_DROP); - if (ntohs(h->ip6_plen) == 0 && pd->jumbolen != 0) { - DPFPRINTF(PF_DEBUG_MISC, - ("IPv6 missing jumbo")); - REASON_SET(reason, PFRES_IPOPTIONS); - return (PF_DROP); - } + if (pd->proto == IPPROTO_HOPOPTS && pd->fragoff == 0 && + ntohs(h->ip6_plen) == 0 && pd->jumbolen != 0) { + DPFPRINTF(PF_DEBUG_MISC, "IPv6 missing jumbo"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); } if (pd->proto == IPPROTO_AH) pd->off += (ext.ip6e_len + 2) * 4; @@ -10020,10 +10108,45 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) pd->off += (ext.ip6e_len + 1) * 8; pd->proto = ext.ip6e_nxt; break; + case IPPROTO_ICMPV6: + /* fragments may be short, ignore inner header then */ + if (pd->fragoff != 0 && end < pd->off + sizeof(icmp6)) { + pd->off = pd->fragoff; + pd->proto = IPPROTO_FRAGMENT; + return (PF_PASS); + } + if (!pf_pull_hdr(pd->m, pd->off, &icmp6, sizeof(icmp6), + NULL, reason, AF_INET6)) { + DPFPRINTF(PF_DEBUG_MISC, + "IPv6 short icmp6hdr"); + return (PF_DROP); + } + /* ICMP multicast packets have router alert options */ + switch (icmp6.icmp6_type) { + case MLD_LISTENER_QUERY: + case MLD_LISTENER_REPORT: + case MLD_LISTENER_DONE: + case MLDV2_LISTENER_REPORT: + /* + * According to RFC 2710 all MLD messages are + * sent with hop-limit (ttl) set to 1, and link + * local source address. If either one is + * missing then MLD message is invalid and + * should be discarded. + */ + if ((h->ip6_hlim != 1) || + !IN6_IS_ADDR_LINKLOCAL(&h->ip6_src)) { + DPFPRINTF(PF_DEBUG_MISC, "Invalid MLD"); + REASON_SET(reason, PFRES_IPOPTIONS); + return (PF_DROP); + } + pd->badopts &= ~PF_OPT_ROUTER_ALERT; + break; + } + return (PF_PASS); case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: - case IPPROTO_ICMPV6: /* fragments may be short, ignore inner header then */ if (pd->fragoff != 0 && end < pd->off + (pd->proto == IPPROTO_TCP ? 
sizeof(struct tcphdr) : @@ -10038,7 +10161,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason) return (PF_PASS); } } - DPFPRINTF(PF_DEBUG_MISC, ("IPv6 nested extension header limit")); + DPFPRINTF(PF_DEBUG_MISC, "IPv6 nested extension header limit"); REASON_SET(reason, PFRES_IPOPTIONS); return (PF_DROP); } @@ -10083,8 +10206,15 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, if (__predict_false((*m0)->m_len < sizeof(struct ip)) && (pd->m = *m0 = m_pullup(*m0, sizeof(struct ip))) == NULL) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: m_len < sizeof(struct ip), pullup failed\n", - __func__)); + "%s: m_len < sizeof(struct ip), pullup failed", + __func__); + *action = PF_DROP; + REASON_SET(reason, PFRES_SHORT); + return (-1); + } + + h = mtod(pd->m, struct ip *); + if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) { *action = PF_DROP; REASON_SET(reason, PFRES_SHORT); return (-1); @@ -10097,13 +10227,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, return (-1); } *m0 = pd->m; - h = mtod(pd->m, struct ip *); - if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) { - *action = PF_DROP; - REASON_SET(reason, PFRES_SHORT); - return (-1); - } if (pf_walk_header(pd, h, reason) != PF_PASS) { *action = PF_DROP; @@ -10133,8 +10257,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, if (__predict_false((*m0)->m_len < sizeof(struct ip6_hdr)) && (pd->m = *m0 = m_pullup(*m0, sizeof(struct ip6_hdr))) == NULL) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: m_len < sizeof(struct ip6_hdr)" - ", pullup failed\n", __func__)); + "%s: m_len < sizeof(struct ip6_hdr)" + ", pullup failed", __func__); *action = PF_DROP; REASON_SET(reason, PFRES_SHORT); return (-1); @@ -10148,6 +10272,15 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, return (-1); } + /* + * we do not support jumbogram. if we keep going, zero ip6_plen + * will do something bad, so drop the packet for now. + */ + if (htons(h->ip6_plen) == 0) { + *action = PF_DROP; + return (-1); + } + if (pf_walk_header6(pd, h, reason) != PF_PASS) { *action = PF_DROP; return (-1); @@ -10167,15 +10300,6 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, pd->virtual_proto = (pd->fragoff != 0) ? PF_VPROTO_FRAGMENT : pd->proto; - /* - * we do not support jumbogram. if we keep going, zero ip6_plen - * will do something bad, so drop the packet for now. 
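pf_walk_header(), as extended above, defers IPv4 options to pf_walk_option(): a defensive TLV walk in which EOL terminates, NOP advances one byte, and every other option must carry a length byte of at least 2 that stays inside the option area. The loop in isolation (userspace rendition over a flat buffer):

    #include <stdint.h>
    #include <stddef.h>

    #define IPOPT_EOL 0
    #define IPOPT_NOP 1

    /* Returns 0 if the option area parses cleanly, -1 otherwise. */
    static int
    walk_ip_options(const uint8_t *opts, size_t len)
    {
            size_t off = 0;

            while (off < len) {
                    uint8_t type = opts[off];

                    if (type == IPOPT_EOL)
                            break;
                    if (type == IPOPT_NOP) {
                            off++;
                            continue;
                    }
                    if (off + 2 > len)      /* no room for a length byte */
                            return (-1);
                    uint8_t olen = opts[off + 1];
                    if (olen < 2 || off + olen > len)
                            return (-1);    /* short or overlong option */
                    off += olen;
            }
            return (0);
    }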
- */ - if (htons(h->ip6_plen) == 0) { - *action = PF_DROP; - return (-1); - } - /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(pd->fragoff, reason, pd) != PF_PASS) { @@ -10494,8 +10618,8 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 if (__predict_false(kif == NULL)) { DPFPRINTF(PF_DEBUG_URGENT, - ("%s: kif == NULL, if_xname %s\n", - __func__, ifp->if_xname)); + "%s: kif == NULL, if_xname %s", + __func__, ifp->if_xname); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { @@ -10699,14 +10823,14 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 action = PF_DROP; REASON_SET(&reason, PFRES_NORM); DPFPRINTF(PF_DEBUG_MISC, - ("dropping IPv6 packet with ICMPv4 payload")); + "dropping IPv6 packet with ICMPv4 payload"); break; } if (pd.virtual_proto == IPPROTO_ICMPV6 && af != AF_INET6) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); DPFPRINTF(PF_DEBUG_MISC, - ("pf: dropping IPv4 packet with ICMPv6 payload\n")); + "pf: dropping IPv4 packet with ICMPv6 payload"); break; } action = pf_test_state_icmp(&s, &pd, &reason); @@ -10732,12 +10856,12 @@ done: if (s) memcpy(&pd.act, &s->act, sizeof(s->act)); - if (action == PF_PASS && pd.badopts && !pd.act.allow_opts) { + if (action == PF_PASS && pd.badopts != 0 && !pd.act.allow_opts) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, - ("pf: dropping packet with dangerous headers\n")); + "pf: dropping packet with dangerous headers"); } if (pd.act.max_pkt_size && pd.act.max_pkt_size && @@ -10746,7 +10870,7 @@ done: REASON_SET(&reason, PFRES_NORM); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, - ("pf: dropping overly long packet\n")); + "pf: dropping overly long packet"); } if (s) { @@ -10778,7 +10902,7 @@ done: REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, - ("pf: failed to allocate 802.1q mtag\n")); + "pf: failed to allocate 802.1q mtag"); } } @@ -10835,7 +10959,7 @@ done: REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, - ("pf: failed to allocate tag\n")); + "pf: failed to allocate tag"); } else { pd.pf_mtag->flags |= PF_MTAG_FLAG_FASTFWD_OURS_PRESENT; @@ -10852,7 +10976,7 @@ done: REASON_SET(&reason, PFRES_MEMORY); pd.act.log = PF_LOG_FORCE; DPFPRINTF(PF_DEBUG_MISC, - ("pf: failed to allocate divert tag\n")); + "pf: failed to allocate divert tag"); } } /* XXX: Anybody working on it?! 
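pd->badopts is now a bitmask instead of a counter, which is what lets IGMP and MLD clear only the router-alert bit while any other suspicious option still triggers the dangerous-headers drop above unless allow-opts is set. A distilled model (the PF_OPT_* values here are illustrative; the real ones live in pfvar.h):

    #include <stdint.h>
    #include <stdbool.h>

    #define PF_OPT_ROUTER_ALERT     0x01    /* values illustrative only */
    #define PF_OPT_JUMBO            0x02
    #define PF_OPT_OTHER            0x04

    /* IGMP/MLD legitimately carry router alert: clear just that bit. */
    static bool
    drop_for_options(uint8_t badopts, bool is_mld_or_igmp, bool allow_opts)
    {
            if (is_mld_or_igmp)
                    badopts &= (uint8_t)~PF_OPT_ROUTER_ALERT;
            return (badopts != 0 && !allow_opts);
    }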
*/ diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c index 5c69c395c5fc..9abc07c36788 100644 --- a/sys/netpfil/pf/pf_ioctl.c +++ b/sys/netpfil/pf/pf_ioctl.c @@ -217,8 +217,6 @@ static u_int16_t tagname2tag(struct pf_tagset *, const char *); static u_int16_t pf_tagname2tag(const char *); static void tag_unref(struct pf_tagset *, u_int16_t); -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - struct cdev *pf_dev; /* @@ -2094,19 +2092,18 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, int rs_num; int error = 0; - if ((rule->return_icmp >> 8) > ICMP_MAXTYPE) { - error = EINVAL; - goto errout_unlocked; - } +#define ERROUT(x) ERROUT_FUNCTION(errout, x) +#define ERROUT_UNLOCKED(x) ERROUT_FUNCTION(errout_unlocked, x) -#define ERROUT(x) ERROUT_FUNCTION(errout, x) + if ((rule->return_icmp >> 8) > ICMP_MAXTYPE) + ERROUT_UNLOCKED(EINVAL); if ((error = pf_rule_checkaf(rule))) - ERROUT(error); + ERROUT_UNLOCKED(error); if (pf_validate_range(rule->src.port_op, rule->src.port)) - ERROUT(EINVAL); + ERROUT_UNLOCKED(EINVAL); if (pf_validate_range(rule->dst.port_op, rule->dst.port)) - ERROUT(EINVAL); + ERROUT_UNLOCKED(EINVAL); if (rule->ifname[0]) kif = pf_kkif_create(M_WAITOK); @@ -2143,14 +2140,14 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, ERROUT(EINVAL); if (ticket != ruleset->rules[rs_num].inactive.ticket) { DPFPRINTF(PF_DEBUG_MISC, - ("ticket: %d != [%d]%d\n", ticket, rs_num, - ruleset->rules[rs_num].inactive.ticket)); + "ticket: %d != [%d]%d", ticket, rs_num, + ruleset->rules[rs_num].inactive.ticket); ERROUT(EBUSY); } if (pool_ticket != V_ticket_pabuf) { DPFPRINTF(PF_DEBUG_MISC, - ("pool_ticket: %d != %d\n", pool_ticket, - V_ticket_pabuf)); + "pool_ticket: %d != %d", pool_ticket, + V_ticket_pabuf); ERROUT(EBUSY); } /* @@ -2296,6 +2293,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, return (0); #undef ERROUT +#undef ERROUT_UNLOCKED errout: PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); @@ -2469,7 +2467,7 @@ pf_start(void) V_pf_status.since = time_uptime; new_unrhdr64(&V_pf_stateid, time_second); - DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); + DPFPRINTF(PF_DEBUG_MISC, "pf: started"); } sx_xunlock(&V_pf_ioctl_lock); @@ -2489,7 +2487,7 @@ pf_stop(void) dehook_pf(); dehook_pf_eth(); V_pf_status.since = time_uptime; - DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); + DPFPRINTF(PF_DEBUG_MISC, "pf: stopped"); } sx_xunlock(&V_pf_ioctl_lock); @@ -3264,9 +3262,9 @@ DIOCGETETHRULE_error: if (nvlist_get_number(nvl, "ticket") != ruleset->inactive.ticket) { DPFPRINTF(PF_DEBUG_MISC, - ("ticket: %d != %d\n", + "ticket: %d != %d", (u_int32_t)nvlist_get_number(nvl, "ticket"), - ruleset->inactive.ticket)); + ruleset->inactive.ticket); ERROUT(EBUSY); } @@ -4340,7 +4338,7 @@ DIOCGETSTATESV2_full: if (error == 0) V_pf_altq_running = 1; PF_RULES_WUNLOCK(); - DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); + DPFPRINTF(PF_DEBUG_MISC, "altq: started"); break; } @@ -4359,7 +4357,7 @@ DIOCGETSTATESV2_full: if (error == 0) V_pf_altq_running = 0; PF_RULES_WUNLOCK(); - DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); + DPFPRINTF(PF_DEBUG_MISC, "altq: stopped"); break; } @@ -6457,9 +6455,9 @@ shutdown_pf(void) for (rs_num = 0; rs_num < PF_RULESET_MAX; ++rs_num) { if ((error = pf_begin_rules(&t[rs_num], rs_num, anchor->path)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: " - "anchor.path=%s rs_num=%d\n", - __func__, anchor->path, rs_num)); + DPFPRINTF(PF_DEBUG_MISC, "%s: " + "anchor.path=%s rs_num=%d", + __func__, anchor->path, rs_num); goto error; /* XXX: rollback? 
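The ERROUT/ERROUT_UNLOCKED pair introduced above is the usual pf error-exit idiom built on ERROUT_FUNCTION(); the new _UNLOCKED variant exists so argument validation can bail out before the rules lock is taken. Spelled out so the sketch stands alone:

    /* Same shape as pf's ERROUT_FUNCTION(): record and jump. */
    #define ERROUT_FUNCTION(target, x)      do { error = (x); goto target; } while (0)
    #define ERROUT(x)               ERROUT_FUNCTION(errout, x)
    #define ERROUT_UNLOCKED(x)      ERROUT_FUNCTION(errout_unlocked, x)

    static int
    demo(int validated)
    {
            int error = 0;

            if (!validated)
                    ERROUT_UNLOCKED(22);    /* EINVAL-style, no lock yet */
            /* the lock would be taken here */
            ERROUT(16);                     /* EBUSY-style, lock held */
    errout:
            /* the unlock would go here */
    errout_unlocked:
            return (error);
    }

    int
    main(void)
    {
            return (demo(0) == 22 && demo(1) == 16 ? 0 : 1);
    }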
*/ } } @@ -6481,9 +6479,9 @@ shutdown_pf(void) eth_anchor->refcnt = 1; if ((error = pf_begin_eth(&t[0], eth_anchor->path)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: eth " - "anchor.path=%s\n", __func__, - eth_anchor->path)); + DPFPRINTF(PF_DEBUG_MISC, "%s: eth " + "anchor.path=%s", __func__, + eth_anchor->path); goto error; } error = pf_commit_eth(t[0], eth_anchor->path); @@ -6492,27 +6490,27 @@ shutdown_pf(void) if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: SCRUB\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: SCRUB", __func__); break; } if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: FILTER\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: FILTER", __func__); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: NAT\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: NAT", __func__); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: BINAT\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: BINAT", __func__); break; /* XXX: rollback? */ } if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: RDR\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: RDR", __func__); break; /* XXX: rollback? */ } @@ -6531,7 +6529,7 @@ shutdown_pf(void) break; if ((error = pf_begin_eth(&t[0], &nn)) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: eth\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: eth", __func__); break; } error = pf_commit_eth(t[0], &nn); @@ -6539,7 +6537,7 @@ shutdown_pf(void) #ifdef ALTQ if ((error = pf_begin_altq(&t[0])) != 0) { - DPFPRINTF(PF_DEBUG_MISC, ("%s: ALTQ\n", __func__)); + DPFPRINTF(PF_DEBUG_MISC, "%s: ALTQ", __func__); break; } pf_commit_altq(t[0]); diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index 9c7863bb301e..ea0d6facf695 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -71,8 +71,6 @@ #define V_pf_rdr_srcport_rewrite_tries VNET(pf_rdr_srcport_rewrite_tries) VNET_DEFINE_STATIC(int, pf_rdr_srcport_rewrite_tries) = 16; -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - static uint64_t pf_hash(struct pf_addr *, struct pf_addr *, struct pf_poolhashkey *, sa_family_t); struct pf_krule *pf_match_translation(int, struct pf_test_ctx *); @@ -904,19 +902,19 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, if (pf_get_mape_sport(pd, r, naddr, nportp, &ctx->udp_mapping, rpool)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: MAP-E port allocation (%u/%u/%u)" - " failed\n", + "pf: MAP-E port allocation (%u/%u/%u)" + " failed", rpool->mape.offset, rpool->mape.psidlen, - rpool->mape.psid)); + rpool->mape.psid); reason = PFRES_MAPFAILED; goto notrans; } } else if (pf_get_sport(pd, r, naddr, nportp, low, high, rpool, &ctx->udp_mapping, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: NAT proxy port allocation (%u-%u) failed\n", - rpool->proxy_port[0], rpool->proxy_port[1])); + "pf: NAT proxy port allocation (%u-%u) failed", + rpool->proxy_port[0], rpool->proxy_port[1]); reason = PFRES_MAPFAILED; goto notrans; } @@ -1085,13 +1083,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, * the state may be reused if the TCP state is terminal. 
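The MAP-E allocation failure logged above comes from pf_get_mape_sport(), which may only pick source ports whose embedded PSID bits match the pool's offset/psidlen/psid configuration. A sketch of the RFC 7597 port-set membership test (helper name invented; assumes a_offset + k_psidlen <= 16):

    #include <stdint.h>
    #include <stdbool.h>

    /*
     * RFC 7597 splits a 16-bit port into A (a bits), PSID (k bits)
     * and M (remaining bits); a port belongs to the set iff its
     * PSID field equals the configured value.
     */
    static bool
    port_in_psid(uint16_t port, unsigned a_offset, unsigned k_psidlen,
        uint16_t psid)
    {
            if (k_psidlen == 0)
                    return (true);
            return (((port >> (16 - a_offset - k_psidlen)) &
                ((1u << k_psidlen) - 1)) == psid);
    }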
*/ DPFPRINTF(PF_DEBUG_MISC, - ("pf: RDR source port allocation failed\n")); + "pf: RDR source port allocation failed"); break; out: DPFPRINTF(PF_DEBUG_MISC, - ("pf: RDR source port allocation %u->%u\n", - ntohs(pd->nsport), ntohs(ctx->nk->port[0]))); + "pf: RDR source port allocation %u->%u", + ntohs(pd->nsport), ntohs(ctx->nk->port[0])); break; } default: @@ -1140,8 +1138,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) if (pf_get_sport(pd, r, &nsaddr, &nport, r->nat.proxy_port[0], r->nat.proxy_port[1], &r->nat, NULL, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: af-to NAT proxy port allocation (%u-%u) failed", - r->nat.proxy_port[0], r->nat.proxy_port[1])); + "pf: af-to NAT proxy port allocation (%u-%u) failed", + r->nat.proxy_port[0], r->nat.proxy_port[1]); return (-1); } diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c index 369292ca365e..8cea9557633c 100644 --- a/sys/netpfil/pf/pf_norm.c +++ b/sys/netpfil/pf/pf_norm.c @@ -160,13 +160,6 @@ static int pf_reassemble6(struct mbuf **, struct ip6_frag *, uint16_t, uint16_t, u_short *); #endif /* INET6 */ -#define DPFPRINTF(x) do { \ - if (V_pf_status.debug >= PF_DEBUG_MISC) { \ - printf("%s: ", __func__); \ - printf x ; \ - } \ -} while(0) - #ifdef INET static void pf_ip2key(struct ip *ip, struct pf_frnode *key) @@ -262,7 +255,8 @@ pf_purge_fragments(uint32_t expire) if (frag->fr_timeout > expire) break; - DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + DPFPRINTF(PF_DEBUG_MISC, "expiring %d(%p)", + frag->fr_id, frag); pf_free_fragment(frag); } @@ -281,7 +275,7 @@ pf_flush_fragments(void) PF_FRAG_ASSERT(); goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; - DPFPRINTF(("trying to free %d frag entriess\n", goal)); + DPFPRINTF(PF_DEBUG_MISC, "trying to free %d frag entriess", goal); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) @@ -573,26 +567,30 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, /* No empty fragments. */ if (frent->fe_len == 0) { - DPFPRINTF(("bad fragment: len 0\n")); + DPFPRINTF(PF_DEBUG_MISC, "bad fragment: len 0"); goto bad_fragment; } /* All fragments are 8 byte aligned. */ if (frent->fe_mff && (frent->fe_len & 0x7)) { - DPFPRINTF(("bad fragment: mff and len %d\n", frent->fe_len)); + DPFPRINTF(PF_DEBUG_MISC, "bad fragment: mff and len %d", + frent->fe_len); goto bad_fragment; } /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { - DPFPRINTF(("bad fragment: max packet %d\n", - frent->fe_off + frent->fe_len)); + DPFPRINTF(PF_DEBUG_MISC, "bad fragment: max packet %d", + frent->fe_off + frent->fe_len); goto bad_fragment; } - DPFPRINTF((key->fn_af == AF_INET ? - "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n", - id, frent->fe_off, frent->fe_off + frent->fe_len)); + if (key->fn_af == AF_INET) + DPFPRINTF(PF_DEBUG_MISC, "reass frag %d @ %d-%d\n", + id, frent->fe_off, frent->fe_off + frent->fe_len); + else + DPFPRINTF(PF_DEBUG_MISC, "reass frag %#08x @ %d-%d", + id, frent->fe_off, frent->fe_off + frent->fe_len); /* Fully buffer all of the fragments in this fragment queue. 
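The alignment and length checks above are the classic reassembly invariants: a fragment with more-fragments set must be a multiple of 8 bytes (the offset field counts 8-byte units), and no fragment may extend past IP_MAXPACKET. In isolation:

    #include <stdint.h>
    #include <stdbool.h>

    #define IP_MAXPACKET 65535

    /* mff = "more fragments flag"; mirrors pf_fillup_fragment(). */
    static bool
    fragment_sane(uint16_t off, uint16_t len, bool mff)
    {
            if (len == 0)
                    return (false);         /* no empty fragments */
            if (mff && (len & 0x7))
                    return (false);         /* non-final must be 8-aligned */
            if ((uint32_t)off + len > IP_MAXPACKET)
                    return (false);         /* total length bound */
            return (true);
    }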
*/ frag = pf_find_fragment(key, id); @@ -690,10 +688,10 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, precut = prev->fe_off + prev->fe_len - frent->fe_off; if (precut >= frent->fe_len) { - DPFPRINTF(("new frag overlapped\n")); + DPFPRINTF(PF_DEBUG_MISC, "new frag overlapped"); goto drop_fragment; } - DPFPRINTF(("frag head overlap %d\n", precut)); + DPFPRINTF(PF_DEBUG_MISC, "frag head overlap %d", precut); m_adj(frent->fe_m, precut); frent->fe_off += precut; frent->fe_len -= precut; @@ -705,7 +703,8 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, aftercut = frent->fe_off + frent->fe_len - after->fe_off; if (aftercut < after->fe_len) { - DPFPRINTF(("frag tail overlap %d", aftercut)); + DPFPRINTF(PF_DEBUG_MISC, "frag tail overlap %d", + aftercut); m_adj(after->fe_m, aftercut); /* Fragment may switch queue as fe_off changes */ pf_frent_remove(frag, after); @@ -713,7 +712,8 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, after->fe_len -= aftercut; /* Insert into correct queue */ if (pf_frent_insert(frag, after, prev)) { - DPFPRINTF(("fragment requeue limit exceeded")); + DPFPRINTF(PF_DEBUG_MISC, + "fragment requeue limit exceeded"); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); /* There is not way to recover */ @@ -723,7 +723,7 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, } /* This fragment is completely overlapped, lose it. */ - DPFPRINTF(("old frag overlapped\n")); + DPFPRINTF(PF_DEBUG_MISC, "old frag overlapped"); next = TAILQ_NEXT(after, fr_next); pf_frent_remove(frag, after); m_freem(after->fe_m); @@ -732,7 +732,7 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id, /* If part of the queue gets too long, there is not way to recover. */ if (pf_frent_insert(frag, frent, prev)) { - DPFPRINTF(("fragment queue limit exceeded\n")); + DPFPRINTF(PF_DEBUG_MISC, "fragment queue limit exceeded"); goto bad_fragment; } @@ -748,7 +748,7 @@ free_fragment: * fragment, the entire datagram (and any constituent fragments) MUST * be silently discarded. */ - DPFPRINTF(("flush overlapping fragments\n")); + DPFPRINTF(PF_DEBUG_MISC, "flush overlapping fragments"); pf_free_fragment(frag); bad_fragment: @@ -826,7 +826,8 @@ pf_reassemble(struct mbuf **m0, u_short *reason) m = *m0 = NULL; if (frag->fr_holes) { - DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes)); + DPFPRINTF(PF_DEBUG_MISC, "frag %d, holes %d", + frag->fr_id, frag->fr_holes); return (PF_PASS); /* drop because *m0 is NULL, no error */ } @@ -872,14 +873,14 @@ pf_reassemble(struct mbuf **m0, u_short *reason) ip->ip_off &= ~(IP_MF|IP_OFFMASK); if (hdrlen + total > IP_MAXPACKET) { - DPFPRINTF(("drop: too big: %d\n", total)); + DPFPRINTF(PF_DEBUG_MISC, "drop: too big: %d", total); ip->ip_len = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test() */ return (PF_DROP); } - DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); + DPFPRINTF(PF_DEBUG_MISC, "complete: %p(%d)", m, ntohs(ip->ip_len)); return (PF_PASS); } #endif /* INET */ @@ -931,8 +932,8 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr, m = *m0 = NULL; if (frag->fr_holes) { - DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, - frag->fr_holes)); + DPFPRINTF(PF_DEBUG_MISC, "frag %d, holes %d", frag->fr_id, + frag->fr_holes); PF_FRAG_UNLOCK(); return (PF_PASS); /* Drop because *m0 is NULL, no error. 
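The precut/aftercut arithmetic above trims partial overlaps instead of discarding the datagram; only a fully duplicated fragment, or an overlap the flush policy forbids, tears down the queue. A compact model of the head-overlap case, with m_adj() replaced by pointer arithmetic (invented types):

    #include <stdint.h>

    struct frag_lite {
            uint32_t off;
            uint32_t len;
            const uint8_t *data;
    };

    /*
     * Head overlap: the previous fragment ends past our start, so
     * drop the overlapped prefix, as the kernel does with m_adj().
     * Returns -1 when the new fragment is fully covered.
     */
    static int
    trim_head_overlap(const struct frag_lite *prev, struct frag_lite *cur)
    {
            uint32_t prev_end = prev->off + prev->len;

            if (prev_end <= cur->off)
                    return (0);             /* no overlap */
            uint32_t precut = prev_end - cur->off;
            if (precut >= cur->len)
                    return (-1);            /* fully overlapped: drop */
            cur->data += precut;
            cur->off += precut;
            cur->len -= precut;
            return (0);
    }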
*/ } @@ -993,14 +994,15 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr, ip6->ip6_nxt = proto; if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) { - DPFPRINTF(("drop: too big: %d\n", total)); + DPFPRINTF(PF_DEBUG_MISC, "drop: too big: %d", total); ip6->ip6_plen = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */ return (PF_DROP); } - DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip6->ip6_plen))); + DPFPRINTF(PF_DEBUG_MISC, "complete: %p(%d)", m, + ntohs(ip6->ip6_plen)); return (PF_PASS); fail: @@ -1090,7 +1092,7 @@ pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag, action = PF_PASS; } else { /* Drop expects an mbuf to free. */ - DPFPRINTF(("refragment error %d\n", error)); + DPFPRINTF(PF_DEBUG_MISC, "refragment error %d", error); action = PF_DROP; } for (; m; m = t) { @@ -1230,7 +1232,7 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd) * no-df above, fine. Otherwise drop it. */ if (h->ip_off & htons(IP_DF)) { - DPFPRINTF(("IP_DF\n")); + DPFPRINTF(PF_DEBUG_MISC, "IP_DF"); goto bad; } @@ -1238,13 +1240,13 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd) /* All fragments are 8 byte aligned */ if (mff && (ip_len & 0x7)) { - DPFPRINTF(("mff and %d\n", ip_len)); + DPFPRINTF(PF_DEBUG_MISC, "mff and %d", ip_len); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { - DPFPRINTF(("max packet %d\n", fragoff + ip_len)); + DPFPRINTF(PF_DEBUG_MISC, "max packet %d", fragoff + ip_len); goto bad; } @@ -1256,7 +1258,8 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd) /* Fully buffer all of the fragments * Might return a completely reassembled mbuf, or NULL */ PF_FRAG_LOCK(); - DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); + DPFPRINTF(PF_DEBUG_MISC, "reass frag %d @ %d-%d", + h->ip_id, fragoff, max); verdict = pf_reassemble(&pd->m, reason); PF_FRAG_UNLOCK(); @@ -1282,7 +1285,7 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd) return (PF_PASS); bad: - DPFPRINTF(("dropping bad fragment\n")); + DPFPRINTF(PF_DEBUG_MISC, "dropping bad fragment"); REASON_SET(reason, PFRES_FRAG); drop: if (r != NULL && r->log) @@ -1711,7 +1714,7 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd, (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - (state->creation / 1000) > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { - DPFPRINTF(("src idled out of PAWS\n")); + DPFPRINTF(PF_DEBUG_MISC, "src idled out of PAWS"); pf_print_state(state); printf("\n"); } @@ -1721,7 +1724,7 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd, if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { - DPFPRINTF(("dst idled out of PAWS\n")); + DPFPRINTF(PF_DEBUG_MISC, "dst idled out of PAWS"); pf_print_state(state); printf("\n"); } @@ -1826,22 +1829,22 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd, * an old timestamp. */ - DPFPRINTF(("Timestamp failed %c%c%c%c\n", + DPFPRINTF(PF_DEBUG_MISC, "Timestamp failed %c%c%c%c", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', - SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' ')); - DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " - "idle: %jus %lums\n", + SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? 
'3' : ' '); + DPFPRINTF(PF_DEBUG_MISC, " tsval: %u tsecr: %u +ticks: " + "%u idle: %jus %lums", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, - delta_ts.tv_usec / 1000)); - DPFPRINTF((" src->tsval: %u tsecr: %u\n", - src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); - DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" - "\n", dst->scrub->pfss_tsval, - dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); + delta_ts.tv_usec / 1000); + DPFPRINTF(PF_DEBUG_MISC, " src->tsval: %u tsecr: %u", + src->scrub->pfss_tsval, src->scrub->pfss_tsecr); + DPFPRINTF(PF_DEBUG_MISC, " dst->tsval: %u tsecr: %u " + "tsval0: %u", dst->scrub->pfss_tsval, + dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(tcp_get_flags(th)); @@ -1891,8 +1894,8 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd, * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { - DPFPRINTF(("Did not receive expected RFC1323 " - "timestamp\n")); + DPFPRINTF(PF_DEBUG_MISC, "Did not receive expected " + "RFC1323 timestamp"); pf_print_state(state); pf_print_flags(tcp_get_flags(th)); printf("\n"); @@ -1919,9 +1922,9 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd, if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ - DPFPRINTF(("Broken RFC1323 stack did not " - "timestamp data packet. Disabled PAWS " - "security.\n")); + DPFPRINTF(PF_DEBUG_MISC, "Broken RFC1323 stack did " + "not timestamp data packet. Disabled PAWS " + "security."); pf_print_state(state); pf_print_flags(tcp_get_flags(th)); printf("\n"); diff --git a/sys/netpfil/pf/pf_osfp.c b/sys/netpfil/pf/pf_osfp.c index 3e00cc7c80a2..150626c5f3fb 100644 --- a/sys/netpfil/pf/pf_osfp.c +++ b/sys/netpfil/pf/pf_osfp.c @@ -40,9 +40,6 @@ #endif static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints"); -#define DPFPRINTF(format, x...) 
\ - if (V_pf_status.debug >= PF_DEBUG_NOISY) \ - printf(format , ##x) SLIST_HEAD(pf_osfp_list, pf_os_fingerprint); VNET_DEFINE_STATIC(struct pf_osfp_list, pf_osfp_list) = @@ -189,8 +186,8 @@ pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const st optlen = MAX(optlen, 1); /* paranoia */ } - DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) " - "(TS=%s,M=%s%d,W=%s%d)\n", + DPFPRINTF(PF_DEBUG_NOISY, "fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) " + "(TS=%s,M=%s%d,W=%s%d)", srcname, ntohs(tcp->th_sport), fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0, fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt, @@ -219,7 +216,7 @@ pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os) if (os == PF_OSFP_ANY) return (1); if (list == NULL) { - DPFPRINTF("osfp no match against %x\n", os); + DPFPRINTF(PF_DEBUG_NOISY, "osfp no match against %x", os); return (os == PF_OSFP_UNKNOWN); } PF_OSFP_UNPACK(os, os_class, os_version, os_subtype); @@ -228,13 +225,13 @@ pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os) if ((os_class == PF_OSFP_ANY || en_class == os_class) && (os_version == PF_OSFP_ANY || en_version == os_version) && (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) { - DPFPRINTF("osfp matched %s %s %s %x==%x\n", + DPFPRINTF(PF_DEBUG_NOISY, "osfp matched %s %s %s %x==%x", entry->fp_class_nm, entry->fp_version_nm, entry->fp_subtype_nm, os, entry->fp_os); return (1); } } - DPFPRINTF("fingerprint 0x%x didn't match\n", os); + DPFPRINTF(PF_DEBUG_NOISY, "fingerprint 0x%x didn't match", os); return (0); } @@ -275,8 +272,8 @@ pf_osfp_add(struct pf_osfp_ioctl *fpioc) fpadd.fp_ttl = fpioc->fp_ttl; #if 0 /* XXX RYAN wants to fix logging */ - DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " - "(TS=%s,M=%s%d,W=%s%d) %x\n", + DPFPRINTF(PF_DEBUG_NOISY, "adding osfp %s %s %s =" + " %s%d:%d:%d:%s%d:0x%llx %d (TS=%s,M=%s%d,W=%s%d) %x", fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, fpioc->fp_os.fp_subtype_nm, (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" : diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c index 2e5165a9900c..43b51f2933f4 100644 --- a/sys/netpfil/pf/pf_ruleset.c +++ b/sys/netpfil/pf/pf_ruleset.c @@ -59,9 +59,6 @@ #error "Kernel only file. Please use sbin/pfctl/pf_ruleset.c instead." #endif -#define DPFPRINTF(format, x...) \ - if (V_pf_status.debug >= PF_DEBUG_NOISY) \ - printf(format , ##x) #define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO) #define rs_free(x) free(x, M_TEMP) @@ -386,7 +383,8 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s, strlcpy(path, s->anchor->path, MAXPATHLEN); while (name[0] == '.' && name[1] == '.' && name[2] == '/') { if (!path[0]) { - DPFPRINTF("%s: .. beyond root\n", __func__); + DPFPRINTF(PF_DEBUG_NOISY, "%s: .. beyond root", + __func__); rs_free(path); return (1); } @@ -408,7 +406,7 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s, ruleset = pf_find_or_create_kruleset(path); rs_free(path); if (ruleset == NULL || ruleset == &pf_main_ruleset) { - DPFPRINTF("%s: ruleset\n", __func__); + DPFPRINTF(PF_DEBUG_NOISY, "%s: ruleset", __func__); return (1); } r->anchor = ruleset->anchor; @@ -690,7 +688,8 @@ pf_keth_anchor_setup(struct pf_keth_rule *r, const struct pf_keth_ruleset *s, strlcpy(path, s->anchor->path, MAXPATHLEN); while (name[0] == '.' && name[1] == '.' && name[2] == '/') { if (!path[0]) { - DPFPRINTF("%s: .. beyond root\n", __func__); + DPFPRINTF(PF_DEBUG_NOISY, "%s: .. 
beyond root", + __func__); rs_free(path); return (1); } @@ -712,7 +711,7 @@ pf_keth_anchor_setup(struct pf_keth_rule *r, const struct pf_keth_ruleset *s, ruleset = pf_find_or_create_keth_ruleset(path); rs_free(path); if (ruleset == NULL || ruleset->anchor == NULL) { - DPFPRINTF("%s: ruleset\n", __func__); + DPFPRINTF(PF_DEBUG_NOISY, "%s: ruleset", __func__); return (1); } r->anchor = ruleset->anchor; diff --git a/sys/netpfil/pf/pf_syncookies.c b/sys/netpfil/pf/pf_syncookies.c index 66757fa4b756..4a935bc65767 100644 --- a/sys/netpfil/pf/pf_syncookies.c +++ b/sys/netpfil/pf/pf_syncookies.c @@ -88,8 +88,6 @@ #include <net/pfvar.h> #include <netpfil/pf/pf_nv.h> -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - union pf_syncookie { uint8_t cookie; struct { @@ -281,7 +279,7 @@ pf_synflood_check(struct pf_pdesc *pd) pf_syncookie_rotate, curvnet); V_pf_status.syncookies_active = true; DPFPRINTF(LOG_WARNING, - ("synflood detected, enabling syncookies\n")); + "synflood detected, enabling syncookies"); // XXXTODO V_pf_status.lcounters[LCNT_SYNFLOODS]++; } @@ -367,7 +365,7 @@ pf_syncookie_rotate(void *arg) V_pf_status.syncookies_mode == PF_SYNCOOKIES_NEVER) ) { V_pf_status.syncookies_active = false; - DPFPRINTF(PF_DEBUG_MISC, ("syncookies disabled\n")); + DPFPRINTF(PF_DEBUG_MISC, "syncookies disabled"); } /* nothing in flight any more? delete keys and return */ diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c index 9c0151b7da2b..ecc185f89ad7 100644 --- a/sys/netpfil/pf/pf_table.c +++ b/sys/netpfil/pf/pf_table.c @@ -49,8 +49,6 @@ #include <net/vnet.h> #include <net/pfvar.h> -#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x - #define ACCEPT_FLAGS(flags, oklist) \ do { \ if ((flags & ~(oklist)) & \ @@ -2189,7 +2187,7 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, if ((ke == NULL || ke->pfrke_not) != notrule) { if (op_pass != PFR_OP_PASS) DPFPRINTF(PF_DEBUG_URGENT, - ("pfr_update_stats: assertion failed.\n")); + "pfr_update_stats: assertion failed."); op_pass = PFR_OP_XPASS; } pfr_kstate_counter_add(&kt->pfrkt_packets[dir_out][op_pass], 1); diff --git a/sys/netsmb/smb_conn.c b/sys/netsmb/smb_conn.c index 259635e2d8d5..ab6cd130a057 100644 --- a/sys/netsmb/smb_conn.c +++ b/sys/netsmb/smb_conn.c @@ -422,7 +422,7 @@ smb_vc_create(struct smb_vcspec *vcspec, if (uid == SMBM_ANY_OWNER) uid = realuid; if (gid == SMBM_ANY_GROUP) - gid = cred->cr_groups[0]; + gid = cred->cr_gid; vcp->vc_uid = uid; vcp->vc_grp = gid; @@ -765,7 +765,7 @@ smb_share_create(struct smb_vc *vcp, struct smb_sharespec *shspec, if (uid == SMBM_ANY_OWNER) uid = realuid; if (gid == SMBM_ANY_GROUP) - gid = cred->cr_groups[0]; + gid = cred->cr_gid; ssp = smb_zmalloc(sizeof(*ssp), M_SMBCONN, M_WAITOK); smb_co_init(SSTOCP(ssp), SMBL_SHARE, "smbss ilock", "smbss"); ssp->obj.co_free = smb_share_free; diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner index 73fa9660e2d2..7a4ff6b9c62e 100644 --- a/sys/riscv/allwinner/files.allwinner +++ b/sys/riscv/allwinner/files.allwinner @@ -1,5 +1,6 @@ arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt +arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_syscon.c optional syscon arm/allwinner/aw_sid.c optional aw_sid nvmem diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner index 2b1e0d4e09dc..34fe195b01ba 100644 --- a/sys/riscv/conf/std.allwinner +++ b/sys/riscv/conf/std.allwinner @@ -7,6 +7,7 @@ options 
diff --git a/sys/netsmb/smb_conn.c b/sys/netsmb/smb_conn.c index 259635e2d8d5..ab6cd130a057 100644 --- a/sys/netsmb/smb_conn.c +++ b/sys/netsmb/smb_conn.c @@ -422,7 +422,7 @@ smb_vc_create(struct smb_vcspec *vcspec, if (uid == SMBM_ANY_OWNER) uid = realuid; if (gid == SMBM_ANY_GROUP) - gid = cred->cr_groups[0]; + gid = cred->cr_gid; vcp->vc_uid = uid; vcp->vc_grp = gid; @@ -765,7 +765,7 @@ smb_share_create(struct smb_vc *vcp, struct smb_sharespec *shspec, if (uid == SMBM_ANY_OWNER) uid = realuid; if (gid == SMBM_ANY_GROUP) - gid = cred->cr_groups[0]; + gid = cred->cr_gid; ssp = smb_zmalloc(sizeof(*ssp), M_SMBCONN, M_WAITOK); smb_co_init(SSTOCP(ssp), SMBL_SHARE, "smbss ilock", "smbss"); ssp->obj.co_free = smb_share_free; diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner index 73fa9660e2d2..7a4ff6b9c62e 100644 --- a/sys/riscv/allwinner/files.allwinner +++ b/sys/riscv/allwinner/files.allwinner @@ -1,5 +1,6 @@ arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt +arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_syscon.c optional syscon arm/allwinner/aw_sid.c optional aw_sid nvmem diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner index 2b1e0d4e09dc..34fe195b01ba 100644 --- a/sys/riscv/conf/std.allwinner +++ b/sys/riscv/conf/std.allwinner @@ -7,6 +7,7 @@ options SOC_ALLWINNER_D1 device aw_ccu # Allwinner clock controller device aw_gpio # Allwinner GPIO controller +device aw_mmc # Allwinner SD/MMC controller device aw_rtc # Allwinner Real-time Clock device aw_sid # Allwinner Secure ID EFUSE device aw_timer # Allwinner Timer diff --git a/sys/riscv/include/vmm_dev.h b/sys/riscv/include/vmm_dev.h index 856ff0778b95..4d30d5a1c35b 100644 --- a/sys/riscv/include/vmm_dev.h +++ b/sys/riscv/include/vmm_dev.h @@ -34,6 +34,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +#include <sys/domainset.h> + #include <machine/vmm.h> struct vm_memmap { @@ -56,6 +58,9 @@ struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; + domainset_t *ds_mask; + size_t ds_mask_size; + int ds_policy; }; struct vm_register { diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c index 91fb96f44397..7b531946488a 100644 --- a/sys/rpc/authunix_prot.c +++ b/sys/rpc/authunix_prot.c @@ -93,9 +93,10 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (!xdr_uint32_t(xdrs, &cred->cr_uid)) return (FALSE); - if (!xdr_uint32_t(xdrs, &cred->cr_groups[0])) + if (!xdr_uint32_t(xdrs, &cred->cr_gid)) return (FALSE); + /* XXXKE Fix this if cr_gid gets separated out. */ if (xdrs->x_op == XDR_ENCODE) { ngroups = cred->cr_ngroups - 1; if (ngroups > NGRPS) @@ -105,7 +106,7 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (!xdr_uint32_t(xdrs, &ngroups)) return (FALSE); for (i = 0; i < ngroups; i++) { - if (i + 1 < ngroups_max + 1) { + if (i < ngroups_max) { if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1])) return (FALSE); } else { @@ -115,7 +116,7 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred) if (xdrs->x_op == XDR_DECODE) { - if (ngroups + 1 > ngroups_max + 1) + if (ngroups > ngroups_max) cred->cr_ngroups = ngroups_max + 1; else cred->cr_ngroups = ngroups + 1;
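For orientation, the authunix_prot.c change above and the svc_auth_unix.c counterpart below both walk the standard AUTH_UNIX credential body (RFC 5531, Appendix A), now mapped onto the split cr_gid/cr_groups[] layout. A sketch of the wire format and the mapping, as comments only:

/*
 * AUTH_UNIX credential body on the wire:
 *	uint32 stamp;
 *	string machinename<255>;
 *	uint32 uid;		-> cr_uid
 *	uint32 gid;		-> cr_gid (previously cr_groups[0])
 *	uint32 gids<16>;	-> cr_groups[1..], capped to what the
 *				   credential can hold (ngroups_max here,
 *				   XU_NGROUPS for the xucred variant);
 *				   the wire allows at most NGRPS == 16
 */

The rewritten bounds are arithmetically identical to the old ones (i + 1 < ngroups_max + 1 is exactly i < ngroups_max); the change just makes the capping legible.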
diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c index 5d6402a05006..b10ef33be704 100644 --- a/sys/rpc/svc_auth_unix.c +++ b/sys/rpc/svc_auth_unix.c @@ -83,12 +83,13 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg) str_len = RNDUP(str_len); buf += str_len / sizeof (int32_t); xcr->cr_uid = IXDR_GET_UINT32(buf); - xcr->cr_groups[0] = IXDR_GET_UINT32(buf); + xcr->cr_gid = IXDR_GET_UINT32(buf); gid_len = (size_t)IXDR_GET_UINT32(buf); if (gid_len > NGRPS) { stat = AUTH_BADCRED; goto done; } + /* XXXKE Fix this if cr_gid gets separated out. */ for (i = 0; i < gid_len; i++) { if (i + 1 < XU_NGROUPS) xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf); diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c index 05928f1c33e8..7ec50d990d4e 100644 --- a/sys/security/audit/audit.c +++ b/sys/security/audit/audit.c @@ -279,7 +279,7 @@ audit_record_ctor(void *mem, int size, void *arg, int flags) cru2x(cred, &ar->k_ar.ar_subj_cred); ar->k_ar.ar_subj_ruid = cred->cr_ruid; ar->k_ar.ar_subj_rgid = cred->cr_rgid; - ar->k_ar.ar_subj_egid = cred->cr_groups[0]; + ar->k_ar.ar_subj_egid = cred->cr_gid; ar->k_ar.ar_subj_auid = cred->cr_audit.ai_auid; ar->k_ar.ar_subj_asid = cred->cr_audit.ai_asid; ar->k_ar.ar_subj_pid = td->td_proc->p_pid; diff --git a/sys/security/audit/audit_arg.c b/sys/security/audit/audit_arg.c index c667d3968817..3ea645373dbe 100644 --- a/sys/security/audit/audit_arg.c +++ b/sys/security/audit/audit_arg.c @@ -408,7 +408,7 @@ audit_arg_process(struct proc *p) cred = p->p_ucred; ar->k_ar.ar_arg_auid = cred->cr_audit.ai_auid; ar->k_ar.ar_arg_euid = cred->cr_uid; - ar->k_ar.ar_arg_egid = cred->cr_groups[0]; + ar->k_ar.ar_arg_egid = cred->cr_gid; ar->k_ar.ar_arg_ruid = cred->cr_ruid; ar->k_ar.ar_arg_rgid = cred->cr_rgid; ar->k_ar.ar_arg_asid = cred->cr_audit.ai_asid; diff --git a/sys/sys/compressor.h b/sys/sys/compressor.h index cad9080b46ff..e59eeabec2cd 100644 --- a/sys/sys/compressor.h +++ b/sys/sys/compressor.h @@ -42,6 +42,7 @@ struct compressor; bool compressor_avail(int format); struct compressor *compressor_init(compressor_cb_t cb, int format, size_t maxiosize, int level, void *arg); +int compressor_format(const struct compressor *stream); void compressor_reset(struct compressor *stream); int compressor_write(struct compressor *stream, void *data, size_t len); diff --git a/sys/sys/domainset.h b/sys/sys/domainset.h index f98b175e9bc8..f3dc92ec6383 100644 --- a/sys/sys/domainset.h +++ b/sys/sys/domainset.h @@ -113,6 +113,20 @@ void domainset_zero(void); * returned value will not match the key pointer. */ struct domainset *domainset_create(const struct domainset *); + +/* + * Remove empty domains from a given domainset. + * Returns 'false' if the domainset consists entirely of empty domains. + */ +bool domainset_empty_vm(struct domainset *domain); + +/* + * Validate and populate a domainset structure according to the specified + * policy and mask. 
+ */ +int domainset_populate(struct domainset *domain, const domainset_t *mask, int policy, + size_t mask_size); + #ifdef _SYS_SYSCTL_H_ int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS); #endif
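The two new interfaces above pair naturally with the ds_mask/ds_policy fields added to struct vm_memseg earlier in this diff: userland supplies a raw mask and policy, and the kernel validates, trims, and canonicalizes them before use. A hedged sketch of the expected call pattern (function and variable names invented, error handling abbreviated):

static int
example_set_policy(const domainset_t *mask, size_t mask_size, int policy)
{
	struct domainset dset;
	int error;

	/* Validate the user-supplied policy/mask pair. */
	error = domainset_populate(&dset, mask, policy, mask_size);
	if (error != 0)
		return (error);
	/* Per the header comment, false means every domain was empty. */
	if (!domainset_empty_vm(&dset))
		return (EINVAL);
	/* dset may now be interned via domainset_create(). */
	return (0);
}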
diff --git a/sys/sys/efi.h b/sys/sys/efi.h index 95a433a950db..89c8b15519de 100644 --- a/sys/sys/efi.h +++ b/sys/sys/efi.h @@ -42,6 +42,8 @@ {0xb122a263,0x3661,0x4f68,{0x99,0x29,0x78,0xf8,0xb0,0xd6,0x21,0x80}} #define EFI_PROPERTIES_TABLE \ {0x880aaca3,0x4adc,0x4a04,{0x90,0x79,0xb7,0x47,0x34,0x08,0x25,0xe5}} +#define EFI_MEMORY_ATTRIBUTES_TABLE \ + {0xdcfa911d,0x26eb,0x469f,{0xa2,0x20,0x38,0xb7,0xdc,0x46,0x12,0x20}} #define LINUX_EFI_MEMRESERVE_TABLE \ {0x888eb0c6,0x8ede,0x4ff5,{0xa8,0xf0,0x9a,0xee,0x5c,0xb9,0x77,0xc2}} @@ -166,6 +168,22 @@ struct efi_prop_table { uint64_t memory_protection_attribute; }; +struct efi_memory_descriptor { + uint32_t type; + caddr_t phy_addr; + caddr_t virt_addr; + uint64_t pages; + uint64_t attrs; +}; + +struct efi_memory_attribute_table { + uint32_t version; + uint32_t num_ents; + uint32_t descriptor_size; + uint32_t flags; + struct efi_memory_descriptor tables[]; +}; + #ifdef _KERNEL #ifdef EFIABI_ATTR diff --git a/sys/sys/exec.h b/sys/sys/exec.h index 4bf114a7c698..580a5372c4db 100644 --- a/sys/sys/exec.h +++ b/sys/sys/exec.h @@ -57,16 +57,6 @@ struct ps_strings { unsigned int ps_nenvstr; /* the number of environment strings */ }; -/* Coredump output parameters. */ -struct coredump_params { - off_t offset; - struct ucred *active_cred; - struct ucred *file_cred; - struct thread *td; - struct vnode *vp; - struct compressor *comp; -}; - struct image_params; struct execsw { @@ -105,16 +95,6 @@ int exec_unregister(const struct execsw *); enum uio_seg; -#define CORE_BUF_SIZE (16 * 1024) - -int core_write(struct coredump_params *, const void *, size_t, off_t, - enum uio_seg, size_t *); -int core_output(char *, size_t, off_t, struct coredump_params *, void *); -int sbuf_drain_core_output(void *, const char *, int); - -extern int coredump_pack_fileinfo; -extern int coredump_pack_vmmapinfo; - /* * note: name##_mod cannot be const storage because the * linker_file_sysinit() function modifies _file in the diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h index cab94ac511a5..80cff53b3576 100644 --- a/sys/sys/exterr_cat.h +++ b/sys/sys/exterr_cat.h @@ -18,6 +18,8 @@ #define EXTERR_CAT_FUSE 4 #define EXTERR_CAT_INOTIFY 5 #define EXTERR_CAT_GENIO 6 +#define EXTERR_CAT_BRIDGE 7 +#define EXTERR_CAT_SWAP 8 #endif diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index c9444e5aec41..2845a9dbc1e2 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -45,6 +45,7 @@ {(pos)->a_type = (id); (pos)->a_un.a_ptr = (ptr); (pos)++;} #endif +struct coredump_writer; struct image_params; struct thread; struct vnode; @@ -114,7 +115,7 @@ bool __elfN(brand_inuse)(Elf_Brandinfo *entry); int __elfN(insert_brand_entry)(Elf_Brandinfo *entry); int __elfN(remove_brand_entry)(Elf_Brandinfo *entry); int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *); -int __elfN(coredump)(struct thread *, struct vnode *, off_t, int); +int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int); size_t __elfN(populate_note)(int, void *, void *, size_t, void **); int __elfN(freebsd_copyout_auxargs)(struct image_params *, uintptr_t); void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t, int); diff --git a/sys/sys/jail.h b/sys/sys/jail.h index 08caa9f49270..24c420e2c976 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -435,7 +435,7 @@ void prison0_init(void); bool prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); bool prison_check_nfsd(struct ucred *cred); -bool prison_owns_vnet(struct ucred *); +bool prison_owns_vnet(struct prison *pr); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index c75094aea450..304bd019c9fc 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -1391,6 +1391,7 @@ extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ #define PACKET_TAG_PF_REASSEMBLED 31 #define PACKET_TAG_IPSEC_ACCEL_OUT 32 /* IPSEC accel out */ #define PACKET_TAG_IPSEC_ACCEL_IN 33 /* IPSEC accel in */ +#define PACKET_TAG_OVPN 34 /* if_ovpn */ /* Specific cookies and tags. */ diff --git a/sys/sys/param.h b/sys/sys/param.h index f941f021a423..33d61e8a1619 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -74,7 +74,7 @@ * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1500054 +#define __FreeBSD_version 1500055 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index 23e8426b26ee..8f181b7beee6 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -403,6 +403,7 @@ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **); void sigfastblock_clear(struct thread *td); void sigfastblock_fetch(struct thread *td); int sig_intr(void); +bool sig_do_core(int); void siginit(struct proc *p); void signotify(struct thread *td); void sigqueue_delete(struct sigqueue *queue, int sig); diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 7f6234ade6f4..b4593f38f592 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -40,7 +40,7 @@ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ -/* was SB_NOINTR 0x40 */ +#define SB_AUTOLOWAT 0x40 /* sendfile(2) may autotune sb_lowat */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ @@ -210,8 +210,6 @@ typedef enum { SO_RCV, SO_SND } sb_which; * Socket buffer private mbuf(9) flags. 
*/ #define M_NOTREADY M_PROTO1 /* m_data not populated yet */ -#define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ -#define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index fd183ffbc7a4..8237165b84ce 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -60,6 +60,7 @@ struct rusage; struct sched_param; struct sembuf; union semun; +struct shmfd; struct sockaddr; struct spacectl_range; struct stat; @@ -337,7 +338,7 @@ int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps); int kern_shm_open2(struct thread *td, const char *path, int flags, mode_t mode, int shmflags, struct filecaps *fcaps, - const char *name); + const char *name, struct shmfd *shmfd); int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg); int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h index 4ddfc8516053..1714fa5a7416 100644 --- a/sys/sys/sysent.h +++ b/sys/sys/sysent.h @@ -90,6 +90,7 @@ struct sysent { /* system call table */ #define SY_THR_STATIC_KLD SY_THR_STATIC #endif +struct coredump_writer; struct image_params; struct proc; struct __sigset; @@ -108,7 +109,8 @@ struct sysentvec { int *sv_szsigcode; /* size of sigtramp code */ int sv_sigcodeoff; char *sv_name; /* name of binary type */ - int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); + int (*sv_coredump)(struct thread *, struct coredump_writer *, + off_t, int); /* function to dump core, or NULL */ int sv_elf_core_osabi; const char *sv_elf_core_abi_vendor; diff --git a/sys/sys/ucoredump.h b/sys/sys/ucoredump.h new file mode 100644 index 000000000000..0a51ee7f50c8 --- /dev/null +++ b/sys/sys/ucoredump.h @@ -0,0 +1,99 @@ +/* + * + * Copyright (c) 2015 Mark Johnston <markj@FreeBSD.org> + * Copyright (c) 2025 Kyle Evans <kevans@FreeBSD.org> + * + * SPDX-License-Identifier: BSD-2-Clause + * + */ + +#ifndef _SYS_UCOREDUMP_H_ +#define _SYS_UCOREDUMP_H_ + +#ifdef _KERNEL + +#include <sys/_uio.h> +#include <sys/blockcount.h> +#include <sys/queue.h> + +/* Coredump output parameters. 
*/ +struct coredump_params; +struct coredump_writer; +struct thread; +struct ucred; + +typedef int coredump_init_fn(const struct coredump_writer *, + const struct coredump_params *); +typedef int coredump_write_fn(const struct coredump_writer *, const void *, size_t, + off_t, enum uio_seg, struct ucred *, size_t *, struct thread *); +typedef int coredump_extend_fn(const struct coredump_writer *, off_t, + struct ucred *); + +struct coredump_vnode_ctx { + struct vnode *vp; + struct ucred *fcred; +}; + +coredump_write_fn core_vn_write; +coredump_extend_fn core_vn_extend; + +struct coredump_writer { + void *ctx; + coredump_init_fn *init_fn; + coredump_write_fn *write_fn; + coredump_extend_fn *extend_fn; +}; + +struct coredump_params { + off_t offset; + struct ucred *active_cred; + struct thread *td; + const struct coredump_writer *cdw; + struct compressor *comp; +}; + +#define CORE_BUF_SIZE (16 * 1024) + +int core_write(struct coredump_params *, const void *, size_t, off_t, + enum uio_seg, size_t *); +int core_output(char *, size_t, off_t, struct coredump_params *, void *); +int sbuf_drain_core_output(void *, const char *, int); + +extern int coredump_pack_fileinfo; +extern int coredump_pack_vmmapinfo; + +extern int compress_user_cores; +extern int compress_user_cores_level; + +typedef int coredumper_probe_fn(struct thread *); + +/* + * Some arbitrary values for coredumper probes to return. The highest priority + * we can find wins. It's somewhat expected that a coredumper may want to bid + * differently based on the process in question. Note that probe functions will + * be called with the proc lock held, so they must not sleep. + */ +#define COREDUMPER_NOMATCH (-1) /* Decline to touch it */ +#define COREDUMPER_GENERIC (0) /* I handle coredumps */ +#define COREDUMPER_SPECIAL (50) /* Special handler */ +#define COREDUMPER_HIGH_PRIORITY (100) /* High-priority handler */ + +/* + * The handle functions will be called with the proc lock held, and should + * return with the proc lock dropped. 
+ */ +typedef int coredumper_handle_fn(struct thread *, off_t); + +struct coredumper { + SLIST_ENTRY(coredumper) cd_entry; + const char *cd_name; + coredumper_probe_fn *cd_probe; + coredumper_handle_fn *cd_handle; + blockcount_t cd_refcount; +}; + +void coredumper_register(struct coredumper *); +void coredumper_unregister(struct coredumper *); + +#endif /* _KERNEL */ +#endif /* _SYS_UCOREDUMP_H_ */
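The new ucoredump.h interface above admits pluggable core-dump sinks: each registrant bids via its probe function and the highest bid wins. A minimal sketch of a registrant under the stated locking rules (all names invented; a real handler would produce the dump before unlocking, and registration would typically happen from a MOD_LOAD event):

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/ucoredump.h>

static int
exampledumper_probe(struct thread *td)
{
	/* Proc lock held; must not sleep. */
	return (COREDUMPER_GENERIC);
}

static int
exampledumper_handle(struct thread *td, off_t limit)
{
	/* Proc lock held on entry; must return with it dropped. */
	PROC_UNLOCK(td->td_proc);
	return (EOPNOTSUPP);	/* placeholder: no dump written */
}

static struct coredumper exampledumper = {
	.cd_name = "exampledumper",
	.cd_probe = exampledumper_probe,
	.cd_handle = exampledumper_handle,
};

/* e.g., coredumper_register(&exampledumper) at MOD_LOAD time; */
/* coredumper_unregister() before unload, which presumably waits */
/* on cd_refcount for in-flight dumps to drain. */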
diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h index c291c1dc2b95..85ed93fd359d 100644 --- a/sys/sys/unistd.h +++ b/sys/sys/unistd.h @@ -156,6 +156,8 @@ #define _PC_DEALLOC_PRESENT 65 #define _PC_NAMEDATTR_ENABLED 66 #define _PC_HAS_NAMEDATTR 67 +#define _PC_XATTR_ENABLED _PC_NAMEDATTR_ENABLED /* Solaris Compatible */ +#define _PC_XATTR_EXISTS _PC_HAS_NAMEDATTR /* Solaris Compatible */ #define _PC_HAS_HIDDENSYSTEM 68 #endif diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 2c6947103c94..a416fddcddc3 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -939,7 +939,6 @@ void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_read_pgcache_post(void *ap, int rc); -void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); @@ -1015,7 +1014,36 @@ void vop_rename_fail(struct vop_rename_args *ap); _error; \ }) -#define VOP_WRITE_PRE(ap) \ +#ifdef INVARIANTS +#define vop_readdir_pre_assert(ap) \ + ssize_t nresid, oresid; \ + \ + oresid = (ap)->a_uio->uio_resid; +#define vop_readdir_post_assert(ap, ret) \ + nresid = (ap)->a_uio->uio_resid; \ + if ((ret) == 0 && (ap)->a_eofflag != NULL) { \ + VNASSERT(oresid == 0 || nresid != oresid || \ + *(ap)->a_eofflag == 1, \ + (ap)->a_vp, ("VOP_READDIR: eofflag not set")); \ + } +#else +#define vop_readdir_pre_assert(ap) +#define vop_readdir_post_assert(ap, ret) +#endif + +#define vop_readdir_pre(ap) do { \ + vop_readdir_pre_assert(ap) + +#define vop_readdir_post(ap, ret) \ + vop_readdir_post_assert(ap, ret); \ + if ((ret) == 0) { \ + VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_READ); \ + INOTIFY((ap)->a_vp, IN_ACCESS); \ + } \ +} while (0) + +#define vop_write_pre(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ @@ -1029,7 +1057,7 @@ void vop_rename_fail(struct vop_rename_args *ap); osize = (off_t)va.va_size; \ } -#define VOP_WRITE_POST(ap, ret) \ +#define vop_write_post(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset) { \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 53fac4b0665e..b7453db9013c 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1268,7 +1268,8 @@ ufs_rename( struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; off_t endoff; - int doingdirectory, newparent; + int doingdirectory; + u_int newparent; int error = 0; struct mount *mp; ino_t ino; @@ -1475,7 +1476,7 @@ relock: * the user must have write permission in the source so * as to be able to change "..". */ - if (doingdirectory && newparent) { + if (doingdirectory && newparent != 0) { error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, curthread); if (error) goto unlockout; @@ -1538,7 +1539,7 @@ relock: if (tip == NULL) { if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); - if (doingdirectory && newparent) { + if (doingdirectory && newparent != 0) { /* * Account for ".." in new directory. * When source and destination have the same @@ -1631,7 +1632,7 @@ relock: goto bad; } if (doingdirectory) { - if (!newparent) { + if (newparent == 0) { tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); @@ -1641,11 +1642,11 @@ relock: softdep_change_linkcnt(tip); } error = ufs_dirrewrite(tdp, tip, fip->i_number, - IFTODT(fip->i_mode), - (doingdirectory && newparent) ? newparent : doingdirectory); + IFTODT(fip->i_mode), (doingdirectory && newparent != 0) ? + newparent : doingdirectory); if (error) { if (doingdirectory) { - if (!newparent) { + if (newparent == 0) { tdp->i_effnlink++; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); @@ -1668,7 +1669,7 @@ relock: * disk, so when running with that code we avoid doing * them now. */ - if (!newparent) { + if (newparent == 0) { tdp->i_nlink--; DIP_SET_NLINK(tdp, tdp->i_nlink); UFS_INODE_SET_FLAG(tdp, IN_CHANGE); @@ -1697,7 +1698,7 @@ relock: * parent directory must be decremented * and ".." set to point to the new parent. */ - if (doingdirectory && newparent) { + if (doingdirectory && newparent != 0) { /* * Set the directory depth based on its new parent. */ @@ -2064,9 +2065,13 @@ ufs_mkdir( */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; + + /* + * XXXKE Fix this if cr_gid gets separated out + */ ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; - ucred.cr_groups[0] = dp->i_gid; + ucred.cr_gid = ucred_group = dp->i_gid; ucp = &ucred; } #endif @@ -2823,9 +2828,13 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; + + /* + * XXXKE Fix this if cr_gid gets separated out + */ ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; - ucred.cr_groups[0] = pdir->i_gid; + ucred.cr_gid = ucred_group = pdir->i_gid; ucp = &ucred; #endif } else {
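The cr_groups[0]-to-cr_gid substitutions running through this diff (netsmb, RPC, audit, UFS) all encode the same rule: the effective GID now has a dedicated field, and cr_groups[0] carries it only as an alias until the XXXKE follow-ups separate the two for good. A sketch of the resulting reader-side convention (an illustrative membership check, not the kernel's groupmember()):

static bool
cred_in_group_example(const struct ucred *cred, gid_t gid)
{
	int i;

	if (cred->cr_gid == gid)	/* effective GID; was cr_groups[0] */
		return (true);
	/* Supplementary groups follow at cr_groups[1..cr_ngroups-1]. */
	for (i = 1; i < cred->cr_ngroups; i++)
		if (cred->cr_groups[i] == gid)
			return (true);
	return (false);
}

This is also why the UFS quota stubs above assign ucred.cr_gid = ucred_group = dp->i_gid: while the alias persists, both views of the effective GID must stay coherent.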
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index d6bd06226d04..327cac661044 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -65,9 +65,9 @@ * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ */ -#include <sys/cdefs.h> #include "opt_vm.h" +#define EXTERR_CATEGORY EXTERR_CAT_SWAP #include <sys/param.h> #include <sys/bio.h> #include <sys/blist.h> @@ -76,6 +76,7 @@ #include <sys/disk.h> #include <sys/disklabel.h> #include <sys/eventhandler.h> +#include <sys/exterrvar.h> #include <sys/fcntl.h> #include <sys/limits.h> #include <sys/lock.h> @@ -2686,7 +2687,7 @@ swapon_check_swzone(void) } } -static void +static int swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { @@ -2701,6 +2702,8 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); + if (nblks == 0) + return (EXTERROR(EINVAL, "swap device too small")); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_blist = blist_create(nblks, M_WAITOK); @@ -2742,6 +2745,8 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); + + return (0); } /* @@ -3286,10 +3291,10 @@ swapongeom_locked(struct cdev *dev, struct vnode *vp) return (error); } nblks = pp->mediasize / DEV_BSIZE; - swaponsomething(vp, cp, nblks, swapgeom_strategy, + error = swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); - return (0); + return (error); } static int @@ -3378,9 +3383,9 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks) if (error != 0) return (error); - swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, + error = swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); - return (0); + return (error); } static int diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index bbae55895c2c..b239a6ffb4ce 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -396,7 +396,7 @@ vm_page_blacklist_load(char **list, char **end) } *list = ptr; if (ptr != NULL) - *end = ptr + len - 1; else *end = NULL; return;
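The swap_pager.c hunks above are also the first consumer of the new EXTERR_CAT_SWAP category from exterr_cat.h: defining EXTERR_CATEGORY before including <sys/exterrvar.h> lets EXTERROR() attach a category and human-readable message to the errno, which the swaponsomething() callers now propagate instead of discarding. The pattern, as a sketch for any file adopting one of the new categories (swap_size_check() is an invented wrapper; in the diff the check sits inline in swaponsomething(), right after the rounding that can reduce a tiny device to zero blocks):

#define	EXTERR_CATEGORY	EXTERR_CAT_SWAP	/* must precede the include */
#include <sys/exterrvar.h>

static int
swap_size_check(u_long nblks)
{
	/* nblks has already been rounded down to a page boundary. */
	if (nblks == 0)
		return (EXTERROR(EINVAL, "swap device too small"));
	return (0);
}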