Diffstat (limited to 'sys')
370 files changed, 14679 insertions, 6052 deletions
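The EFI runtime-services change in this commit keys each preserved UEFI memory-descriptor type off one bit in a new machdep.efirt.regs tunable (see EFI_MAP_BOOTTYPE_ALLOWED() in sys/amd64/include/efi.h below). A minimal standalone C sketch of that bit test follows; the numeric type values here are illustrative assumptions for the example, not the kernel's EFI_MD_TYPE_* definitions:

	/*
	 * Sketch of the bitmask test behind EFI_MAP_BOOTTYPE_ALLOWED().
	 * MD_TYPE_* values below are assumed for illustration only.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define MD_TYPE_BS_CODE	3	/* assumed stand-in value */
	#define MD_TYPE_BS_DATA	4	/* assumed stand-in value */

	static uint32_t map_regs;	/* kernel reads machdep.efirt.regs */

	static int
	boottype_allowed(unsigned type)
	{
		/* Bit N of the mask allows descriptor type N. */
		return (((map_regs >> type) & 1) != 0);
	}

	int
	main(void)
	{
		map_regs = 1u << MD_TYPE_BS_CODE;	/* allow BS_CODE only */
		printf("BS_CODE allowed: %d\n", boottype_allowed(MD_TYPE_BS_CODE));
		printf("BS_DATA allowed: %d\n", boottype_allowed(MD_TYPE_BS_DATA));
		return (0);
	}

Packing the allow-list into one integer keeps the loader-settable knob a single value while still covering every descriptor type.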
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
 #include <x86/apicreg.h>
 #include <x86/apicvar.h>
-#ifdef SMP
 #include <machine/smp.h>
 #include <machine/vmparam.h>
-#endif
 
 #include <contrib/dev/acpica/include/acpi.h>
@@ -73,19 +71,13 @@
 extern int	acpi_resume_beep;
 extern int	acpi_reset_video;
 extern int	acpi_susp_bounce;
 
-#ifdef SMP
 extern struct susppcb	**susppcbs;
 static cpuset_t		suspcpus;
-#else
-static struct susppcb	**susppcbs;
-#endif
 
 static void	acpi_stop_beep(void *);
 
-#ifdef SMP
 static int	acpi_wakeup_ap(struct acpi_softc *, int);
 static void	acpi_wakeup_cpus(struct acpi_softc *);
-#endif
 
 #define	ACPI_WAKEPT_PAGES	7
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
 		timer_spkr_release();
 }
 
-#ifdef SMP
 static int
 acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
 {
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
 		outb(CMOS_DATA, mpbiosreason);
 	}
 }
-#endif
 
 int
 acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
 	if (sc->acpi_wakeaddr == 0ul)
 		return (-1);	/* couldn't alloc wake memory */
 
-#ifdef SMP
 	suspcpus = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
 
 	if (acpi_resume_beep != 0)
 		timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
 	pcb = &susppcbs[0]->sp_pcb;
 	if (savectx(pcb)) {
 		fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
 		if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
 			device_printf(sc->acpi_dev, "Failed to suspend APs\n");
 			return (0);	/* couldn't sleep */
 		}
-#endif
 		hw_ibrs_ibpb_active = 0;
 		hw_ssb_active = 0;
 		cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
 		lapic_xapic_mode();
-#ifdef SMP
 		if (!CPU_EMPTY(&suspcpus))
 			acpi_wakeup_cpus(sc);
-#endif
 	}
 
-#ifdef SMP
 	if (!CPU_EMPTY(&suspcpus))
 		resume_cpus(suspcpus);
-#endif
 
 	/*
	 * Re-read cpu_stdext_feature3, which was zeroed-out
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#ifdef SMP
-#define LK	lock ;
-#else
-#define LK
-#endif
-
 	.text
 	SUPERALIGN_TEXT
 /* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
 	jmp	doreti
 #endif
 
-#ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
 	INTR_HANDLER justreturn1
 	call	as_lapic_eoi
 	jmp	doreti
-
-#endif /* SMP */
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
 	movq	%r15,TD_LOCK(%r13)		/* Release the old thread */
 sw1:
 	leaq	TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
 	movq	$blocked_lock, %rdx
 	movq	TD_LOCK(%r12),%rcx
 	cmpq	%rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
 END(resumectx)
 
 /* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
 sw1wait:
 1:
 	pause
diff --git a/sys/amd64/amd64/efirt_machdep.c b/sys/amd64/amd64/efirt_machdep.c
index 81a28ebe97ee..fe5d60c978dd 100644
--- a/sys/amd64/amd64/efirt_machdep.c
+++ b/sys/amd64/amd64/efirt_machdep.c
@@ -56,6 +56,13 @@
 #include <vm/vm_pager.h>
 #include <vm/vm_radix.h>
 
+/* The EFI regions we're allowed to map. */
+#define EFI_ALLOWED_TYPES_MASK ( \
+	1u << EFI_MD_TYPE_BS_CODE | 1u << EFI_MD_TYPE_BS_DATA | \
+	1u << EFI_MD_TYPE_RT_CODE | 1u << EFI_MD_TYPE_RT_DATA | \
+	1u << EFI_MD_TYPE_FIRMWARE \
+)
+
 static pml5_entry_t *efi_pml5;
 static pml4_entry_t *efi_pml4;
 static vm_object_t obj_1t1_pt;
@@ -181,6 +188,7 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
 	vm_offset_t va;
 	uint64_t idx;
 	int bits, i, mode;
+	bool map_pz = true;
 
 	obj_1t1_pt = vm_pager_allocate(OBJT_PHYS, NULL, ptoa(1 +
 	    NPML4EPG + NPML4EPG * NPDPEPG + NPML4EPG * NPDPEPG * NPDEPG),
@@ -198,9 +206,16 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
 		pmap_pinit_pml4(efi_pmltop_page);
 	}
 
+	if ((efi_map_regs & ~EFI_ALLOWED_TYPES_MASK) != 0) {
+		printf("Ignoring the following runtime EFI regions: %#x\n",
+		    efi_map_regs & ~EFI_ALLOWED_TYPES_MASK);
+		efi_map_regs &= EFI_ALLOWED_TYPES_MASK;
+	}
+
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, descsz)) {
-		if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
+		if ((p->md_attr & EFI_MD_ATTR_RT) == 0 &&
+		    !EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
 			continue;
 		if (p->md_virt != 0 && p->md_virt != p->md_phys) {
 			if (bootverbose)
@@ -256,6 +271,22 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
 			}
 		}
 		VM_OBJECT_WUNLOCK(obj_1t1_pt);
+		if (p->md_phys == 0)
+			map_pz = false;
+	}
+
+	/*
+	 * Some BIOSes tend to access phys 0 during efirt calls,
+	 * so map it if we haven't yet.
	 */
+	if (map_pz) {
+		VM_OBJECT_WLOCK(obj_1t1_pt);
+		pte = efi_1t1_pte(0);
+		/* Assume Write-Back */
+		bits = pmap_cache_bits(kernel_pmap, VM_MEMATTR_WRITE_BACK,
+		    false) | X86_PG_RW | X86_PG_V;
+		pte_store(pte, bits);
+		VM_OBJECT_WUNLOCK(obj_1t1_pt);
 	}
 
 	return (true);
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
 #include <sys/reg.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
-#ifdef SMP
 #include <sys/smp.h>
-#endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 05e482f7783b..7f317674907e 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -325,6 +325,10 @@ initializecpu(void)
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
 	}
+	if ((amd_feature2 & AMDID2_TCE) != 0) {
+		msr = rdmsr(MSR_EFER) | EFER_TCE;
+		wrmsr(MSR_EFER, msr);
+	}
 	hw_ibrs_recalculate(false);
 	hw_ssb_recalculate(false);
 	amd64_syscall_ret_flush_l1d_recalc();
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 032a134bbd4b..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/cdefs.h>
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
@@ -82,9 +81,7 @@
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
-#ifdef SMP
 #include <sys/smp.h>
-#endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -132,9 +129,7 @@
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
@@ -149,6 +144,10 @@
 #include <isa/rtc.h>
 #include <x86/init.h>
 
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
@@ -188,6 +187,12 @@
  */
 vm_paddr_t efi_systbl_phys;
 
+/*
+ * Bitmap of extra EFI memory region types that should be preserved and mapped
+ * during runtime services calls.
+ */
+uint32_t efi_map_regs;
+
 /* Intel ICH registers */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	ICH_PMBASE + 0x30
@@ -645,7 +650,7 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
	 * NB: physmap_idx points to the next free slot.
	 */
 	insert_idx = physmap_idx;
-	for (i = 0; i <= physmap_idx; i += 2) {
+	for (i = 0; i < physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
@@ -659,7 +664,7 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
 	}
 
 	/* See if we can prepend to the next entry. */
-	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
+	if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
@@ -670,8 +675,6 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
 		return (1);
 	}
 
-	physmap_idx += 2;
-	*physmap_idxp = physmap_idx;
 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
 		printf(
		"Too many segments in the physical address map, giving up\n");
@@ -682,11 +685,14 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
-	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
+	for (i = physmap_idx; i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
+	physmap_idx += 2;
+	*physmap_idxp = physmap_idx;
+
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
@@ -757,6 +763,7 @@ add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
+	TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
@@ -794,10 +801,13 @@ add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
 		}
 
 		switch (p->md_type) {
-		case EFI_MD_TYPE_CODE:
-		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_BS_CODE:
 		case EFI_MD_TYPE_BS_DATA:
+			if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
+				continue;
+			/* FALLTHROUGH */
+		case EFI_MD_TYPE_CODE:
+		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_FREE:
 			/*
			 * We're allowed to use any entry with these types.
diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c
index 413b7c74890e..851f2df0e6e1 100644
--- a/sys/amd64/amd64/mem.c
+++ b/sys/amd64/amd64/mem.c
@@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags)
			 * PAGE_SIZE, the uiomove() call does not
			 * access past the end of the direct map.
			 */
-			if (v >= DMAP_MIN_ADDRESS &&
-			    v < DMAP_MIN_ADDRESS + dmaplimit) {
+			if (v >= kva_layout.dmap_low &&
+			    v < kva_layout.dmap_high) {
 				error = uiomove((void *)v, c, uio);
 				break;
 			}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 6d0917e16099..43bf81a991bf 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
	 * tables, so care must be taken to read each entry only once.
	 */
 	pmapsize = 0;
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
+	for (va = kva_layout.km_low; va < kva_end; ) {
 		/*
		 * We always write a page, even if it is zero.
Each
		 * page written corresponds to 1GB of space
@@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	mdhdr.msgbufsize = mbp->msg_size;
 	mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages));
 	mdhdr.pmapsize = pmapsize;
-	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
-	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
-	mdhdr.dmapend = DMAP_MAX_ADDRESS;
+	mdhdr.kernbase = kva_layout.km_low;
+	mdhdr.dmapbase = kva_layout.dmap_low;
+	mdhdr.dmapend = kva_layout.dmap_high;
 	mdhdr.dumpavailsize = round_page(sizeof(dump_avail));
 
 	dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
@@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	/* Dump kernel page directory pages */
 	bzero(fakepd, sizeof(fakepd));
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
+	for (va = kva_layout.km_low; va < kva_end; va += NBPDP) {
 		ii = pmap_pml4e_index(va);
 		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
 		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2ab8c3b17e22..d1d80afccdc7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
 #include <machine/msan.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #include <machine/sysarch.h>
 #include <machine/tss.h>
@@ -415,7 +413,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
 static int ndmpdp;
 vm_paddr_t dmaplimit;
-vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
+vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48;
 pt_entry_t pg_nx;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -475,11 +473,56 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
 static pml4_entry_t	*kernel_pml4;
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
+static u_int64_t	DMPML4phys;	/* ... level 4, for la57 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
 vm_paddr_t kernphys;	/* phys addr of start of bootstrap data */
 vm_paddr_t KERNend;	/* and the end */
 
+struct kva_layout_s kva_layout = {
+	.kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
+	.dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
+	.dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
+	.lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
+	.lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+	.km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+	.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
+	.rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+	    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+	    0, 0, 0),
+};
+
+struct kva_layout_s kva_layout_la57 = {
+	.kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0),	/* == rec_pt */
+	.kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
+	.dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
+	.dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
+	.lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+	.lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
+	.km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+	.km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1),
+	.rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+	.kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+	.kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+	.kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+	.kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+	    0, 0, 0),
+	.kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+	.kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+	    0, 0, 0),
+};
+
 /*
  * pmap_mapdev support pre initialization (i.e.
 console)
  */
@@ -549,8 +592,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
 static vmem_t *large_vmem;
 static u_int lm_ents;
-#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= LARGEMAP_MIN_ADDRESS && \
-	(va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
+#define	PMAP_ADDRESS_IN_LARGEMAP(va)	((va) >= kva_layout.lm_low && \
+	(va) < kva_layout.lm_high)
 
 int pmap_pcid_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -1301,8 +1344,10 @@ static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
 static bool	pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static bool	pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
     vm_offset_t va, struct rwlock **lockp);
+static bool	pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde,
+    vm_offset_t va, struct rwlock **lockp, vm_page_t mpte);
 static bool	pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
-    vm_offset_t va);
+    vm_offset_t va, vm_page_t m);
 static int	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, struct rwlock **lockp);
 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
@@ -1334,7 +1379,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    struct spglist *free, struct rwlock **lockp);
+    bool demote_kpde, struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
@@ -1720,7 +1765,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 {
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
-	pml4_entry_t *p4_p;
+	pml4_entry_t *p4_p, *p4d_p;
 	pml5_entry_t *p5_p;
 	uint64_t DMPDkernphys;
 	vm_paddr_t pax;
@@ -1730,7 +1775,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 	vm_offset_t kasankernbase;
 	int kasankpdpi, kasankpdi, nkasanpte;
 #endif
-	int i, j, ndm1g, nkpdpe, nkdmpde;
+	int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys;
 
 	TSENTER();
 	/* Allocate page table pages for the direct map */
@@ -1738,15 +1783,30 @@ create_pagetables(vm_paddr_t *firstaddr)
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
-	if (ndmpdpphys > NDMPML4E) {
-		/*
-		 * Each NDMPML4E allows 512 GB, so limit to that,
-		 * and then readjust ndmpdp and ndmpdpphys.
-		 */
-		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
-		Maxmem = atop(NDMPML4E * NBPML4);
-		ndmpdpphys = NDMPML4E;
-		ndmpdp = NDMPML4E * NPDEPG;
+	if (la57) {
+		ndmpml4phys = howmany(ndmpdpphys, NPML4EPG);
+		if (ndmpml4phys > NDMPML5E) {
+			printf("NDMPML5E limits system to %ld GB\n",
+			    (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024);
+			Maxmem = atop(NDMPML5E * NBPML5);
+			ndmpml4phys = NDMPML5E;
+			ndmpdpphys = ndmpml4phys * NPML4EPG;
+			ndmpdp = ndmpdpphys * NPDEPG;
+		}
+		DMPML4phys = allocpages(firstaddr, ndmpml4phys);
+	} else {
+		if (ndmpdpphys > NDMPML4E) {
+			/*
+			 * Each NDMPML4E allows 512 GB, so limit to
+			 * that, and then readjust ndmpdp and
+			 * ndmpdpphys.
+			 */
+			printf("NDMPML4E limits system to %d GB\n",
+			    NDMPML4E * 512);
+			Maxmem = atop(NDMPML4E * NBPML4);
+			ndmpdpphys = NDMPML4E;
+			ndmpdp = NDMPML4E * NPDEPG;
+		}
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
@@ -1771,7 +1831,13 @@ create_pagetables(vm_paddr_t *firstaddr)
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 
 	/* Allocate pages. */
+	if (la57) {
+		KPML5phys = allocpages(firstaddr, 1);
+		p5_p = (pml5_entry_t *)KPML5phys;
+	}
 	KPML4phys = allocpages(firstaddr, 1);
+	p4_p = (pml4_entry_t *)KPML4phys;
+
 	KPDPphys = allocpages(firstaddr, NKPML4E);
 #ifdef KASAN
 	KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
@@ -1891,6 +1957,16 @@ create_pagetables(vm_paddr_t *firstaddr)
 	}
 
 	/*
+	 * Connect the Direct Map slots up to the PML4.
+	 * pml5 entries for DMAP are handled below in global pml5 loop.
+	 */
+	p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I];
+	for (i = 0; i < ndmpdpphys; i++) {
+		p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+		    pg_nx;
+	}
+
+	/*
	 * Instead of using a 1G page for the memory containing the kernel,
	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
	 * pages, this will partially overwrite the PDPEs above.)
@@ -1909,11 +1985,6 @@ create_pagetables(vm_paddr_t *firstaddr)
 		}
 	}
 
-	/* And recursively map PML4 to itself in order to get PTmap */
-	p4_p = (pml4_entry_t *)KPML4phys;
-	p4_p[PML4PML4I] = KPML4phys;
-	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
-
 #ifdef KASAN
 	/* Connect the KASAN shadow map slots up to the PML4. */
 	for (i = 0; i < NKASANPML4E; i++) {
@@ -1936,25 +2007,15 @@ create_pagetables(vm_paddr_t *firstaddr)
 	}
 #endif
 
-	/* Connect the Direct Map slots up to the PML4. */
-	for (i = 0; i < ndmpdpphys; i++) {
-		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
-		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
-	}
-
 	/* Connect the KVA slots up to the PML4 */
 	for (i = 0; i < NKPML4E; i++) {
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 	}
 
-	kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-
 	if (la57) {
-		/* XXXKIB bootstrap KPML5phys page is lost */
-		KPML5phys = allocpages(firstaddr, 1);
-		for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG;
-		    i++) {
+		for (i = 0; i < NPML5EPG; i++) {
 			if (i == PML5PML5I) {
 				/*
				 * Recursively map PML5 to itself in
@@ -1962,6 +2023,10 @@ create_pagetables(vm_paddr_t *firstaddr)
				 */
 				p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
 				    X86_PG_M | X86_PG_V | pg_nx;
+			} else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
+				/* Connect DMAP pml4 pages to PML5. */
+				p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
+				    X86_PG_RW | X86_PG_V | pg_nx;
 			} else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) {
 				p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A |
 				    X86_PG_M | X86_PG_V;
@@ -1969,6 +2034,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 				p5_p[i] = 0;
 			}
 		}
+	} else {
+		/* Recursively map PML4 to itself in order to get PTmap */
+		p4_p[PML4PML4I] = KPML4phys;
+		p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
 	}
 	TSEXIT();
 }
@@ -2022,7 +2091,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
	 */
 	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
 	    (vm_paddr_t)kernphys);
-	virtual_end = VM_MAX_KERNEL_ADDRESS;
+	virtual_end = kva_layout.km_high;
 
 	/*
	 * Enable PG_G global pages, then switch to the kernel page
@@ -2044,9 +2113,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
	 * Initialize the kernel pmap (which is statically allocated).
	 * Count bootstrap data as being resident in case any of this data is
	 * later unmapped (using pmap_remove()) and freed.
+	 *
+	 * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after
+	 * kva_layout is fixed.
	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	if (la57) {
+		kva_layout = kva_layout_la57;
 		vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
 		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
 		PTmap = (vm_offset_t)P5Tmap;
@@ -2057,6 +2130,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 		kernel_pmap->pm_cr3 = KPML5phys;
 		pmap_pt_page_count_adj(kernel_pmap, 1);	/* top-level page */
 	} else {
+		kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 		kernel_pmap->pm_pmltop = kernel_pml4;
 		kernel_pmap->pm_cr3 = KPML4phys;
 	}
@@ -2418,6 +2492,8 @@ pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
+	pml4_entry_t *pml4e;
+	unsigned long lm_max;
 	int error, i, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
@@ -2543,10 +2619,15 @@ pmap_init(void)
 		lm_ents = 8;
 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
-	if (lm_ents > LMEPML4I - LMSPML4I + 1)
-		lm_ents = LMEPML4I - LMSPML4I + 1;
+	lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+	if (lm_ents > lm_max) {
+		printf(
+	    "pmap: shrinking large map from requested %d slots to %ld slots\n",
+		    lm_ents, lm_max);
+		lm_ents = lm_max;
+	}
 #ifdef KMSAN
-	if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+	if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
 		printf(
	    "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
 		    lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2557,18 +2638,27 @@ pmap_init(void)
 		printf("pmap: large map %u PML4 slots (%lu GB)\n",
 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 	if (lm_ents != 0) {
-		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
+		large_vmem = vmem_create("large", kva_layout.lm_low,
 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 		if (large_vmem == NULL) {
 			printf("pmap: cannot create large map\n");
 			lm_ents = 0;
 		}
+		if (la57) {
+			for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+			    lm_ents, NBPML5); i++) {
+				m = pmap_large_map_getptp_unlocked();
+				kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+				    X86_PG_RW | X86_PG_A | X86_PG_M |
+				    pg_nx | VM_PAGE_TO_PHYS(m);
+			}
+		}
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
-			/* XXXKIB la57 */
-			kernel_pml4[LMSPML4I + i] = X86_PG_V |
-			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
-			    VM_PAGE_TO_PHYS(m);
+			pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
+			    (u_long)i * NBPML4);
+			*pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M |
+			    pg_nx | VM_PAGE_TO_PHYS(m);
 		}
 	}
 }
@@ -2973,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
  * XXX TODO
  */
 
-#ifdef SMP
 /*
  * Interrupt the cpus that are executing in the guest context.
  * This will force the vcpu to exit and the cached EPT mappings
@@ -3431,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 	}
 	sched_unpin();
 }
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
-		invlpg(va);
-		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
-		    pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = va;
-				invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
-			}
-			critical_exit();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	vm_offset_t addr;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
-		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
-		    pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = sva;
-				for (; d.addr < eva; d.addr += PAGE_SIZE)
-					invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
-			}
-			critical_exit();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
-	struct invpcid_descr d;
-	struct pmap_pcid *pcidp;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
-
-	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
-		pmap->pm_eptgen++;
-		return;
-	}
-	KASSERT(pmap->pm_type == PT_X86,
-	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
-	if (pmap == kernel_pmap) {
-		if (pmap_pcid_enabled && invpcid_works) {
-			bzero(&d, sizeof(d));
-			invpcid(&d, INVPCID_CTXGLOB);
-		} else {
-			invltlb_glob();
-		}
-	} else if (pmap == PCPU_GET(curpmap)) {
-		if (pmap_pcid_enabled) {
-			critical_enter();
-			pcid = pmap_get_pcid(pmap);
-			if (invpcid_works) {
-				d.pcid = pcid;
-				d.pad = 0;
-				d.addr = 0;
-				invpcid(&d, INVPCID_CTX);
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					d.pcid |= PMAP_PCID_USER_PT;
-					invpcid(&d, INVPCID_CTX);
-				}
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid;
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					ucr3 = pmap->pm_ucr3 | pcid |
-					    PMAP_PCID_USER_PT;
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				} else
-					load_cr3(kcr3);
-			}
-			critical_exit();
-		} else {
-			invltlb();
-		}
-	} else if (pmap_pcid_enabled) {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
-	wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
-	struct pmap_pcid *pcidp;
-
-	pmap_update_pde_store(pmap, pde, newpde);
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
-		pmap_update_pde_invalidate(pmap, va, newpde);
-	else {
-		pcidp = zpcpu_get(pmap->pm_pcidp);
-		pcidp->pm_gen = 0;
-	}
-}
-#endif /* !SMP */
 
 static void
 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -3897,7 +3824,7 @@ pmap_kextract(vm_offset_t va)
 	pd_entry_t pde;
 	vm_paddr_t pa;
 
-	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
+	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) {
 		pa = DMAP_TO_PHYS(va);
 	} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
 		pa = pmap_large_map_kextract(va);
@@ -4038,7 +3965,7 @@ pmap_qremove(vm_offset_t sva, int count)
		 * enough to one of those pmap_enter() calls for it to
		 * be caught up in a promotion.
		 */
-		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
+		KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va));
 		KASSERT((*vtopde(va) & X86_PG_PS) == 0,
 		    ("pmap_qremove on promoted va %#lx", va));
 
@@ -4326,21 +4253,13 @@ void
 pmap_pinit_pml5(vm_page_t pml5pg)
 {
 	pml5_entry_t *pm_pml5;
+	int i;
 
 	pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
-
-	/*
-	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
-	 * entering all existing kernel mappings into level 5 table.
-	 */
-	pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
-	    X86_PG_RW | X86_PG_A | X86_PG_M;
-
-	/*
-	 * Install self-referential address mapping entry.
-	 */
-	pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
-	    X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A;
+	for (i = 0; i < NPML5EPG / 2; i++)
+		pm_pml5[i] = 0;
+	for (; i < NPML5EPG; i++)
+		pm_pml5[i] = kernel_pmap->pm_pmltop[i];
 }
 
 static void
@@ -4897,8 +4816,8 @@ pmap_release(pmap_t pmap)
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
 
 	if (pmap_is_la57(pmap)) {
-		pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
-		pmap->pm_pmltop[PML5PML5I] = 0;
+		for (i = NPML5EPG / 2; i < NPML5EPG; i++)
+			pmap->pm_pmltop[i] = 0;
 	} else {
 		for (i = 0; i < NKPML4E; i++)	/* KVA */
 			pmap->pm_pmltop[KPML4BASE + i] = 0;
@@ -4940,7 +4859,7 @@ pmap_release(pmap_t pmap)
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
-	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+	unsigned long ksize = kva_layout.km_high - kva_layout.km_low;
 
 	return sysctl_handle_long(oidp, &ksize, 0, req);
 }
@@ -4951,7 +4870,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
-	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+	unsigned long kfree = kva_layout.km_high - kernel_vm_end;
 
 	return sysctl_handle_long(oidp, &kfree, 0, req);
 }
@@ -5029,7 +4948,7 @@ pmap_page_array_startup(long pages)
 
 	vm_page_array_size = pages;
 
-	start = VM_MIN_KERNEL_ADDRESS;
+	start = kva_layout.km_low;
 	end = start + pages * sizeof(struct vm_page);
 	for (va = start; va < end; va += NBPDR) {
 		pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -5999,7 +5918,7 @@ pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 
 	SLIST_INIT(&free);
 	sva = trunc_2mpage(va);
-	pmap_remove_pde(pmap, pde, sva, &free, lockp);
+	pmap_remove_pde(pmap, pde, sva, true, &free, lockp);
 	if ((oldpde & pmap_global_bit(pmap)) == 0)
 		pmap_invalidate_pde_page(pmap, sva, oldpde);
 	vm_page_free_pages_toq(&free, true);
@@ -6011,11 +5930,17 @@ static bool
 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
+	return (pmap_demote_pde_mpte(pmap, pde, va, lockp, NULL));
+}
+
+static bool
+pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp, vm_page_t mpte)
+{
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 	vm_paddr_t mptepa;
-	vm_page_t mpte;
 	int PG_PTE_CACHE;
 	bool in_kernel;
@@ -6028,61 +5953,65 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	in_kernel = va >= VM_MAXUSER_ADDRESS;
 	oldpde = *pde;
 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
-
-	/*
-	 * Invalidate the 2MB page mapping and return "failure" if the
-	 * mapping was never accessed.
-	 */
-	if ((oldpde & PG_A) == 0) {
-		KASSERT((oldpde & PG_W) == 0,
-		    ("pmap_demote_pde: a wired mapping is missing PG_A"));
-		pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
-		return (false);
-	}
-
-	mpte = pmap_remove_pt_page(pmap, va);
+	KASSERT((oldpde & PG_MANAGED) == 0 || lockp != NULL,
+	    ("pmap_demote_pde: lockp for a managed mapping is NULL"));
+	in_kernel = va >= VM_MAXUSER_ADDRESS;
 	if (mpte == NULL) {
-		KASSERT((oldpde & PG_W) == 0,
-		    ("pmap_demote_pde: page table page for a wired mapping"
-		    " is missing"));
-
-		/*
-		 * If the page table page is missing and the mapping
-		 * is for a kernel address, the mapping must belong to
-		 * the direct map.  Page table pages are preallocated
-		 * for every other part of the kernel address space,
-		 * so the direct map region is the only part of the
-		 * kernel address space that must be handled here.
+		/*
+		 * Invalidate the 2MB page mapping and return "failure" if the
+		 * mapping was never accessed.
		 */
-		KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
-		    va < DMAP_MAX_ADDRESS),
-		    ("pmap_demote_pde: No saved mpte for va %#lx", va));
-
-		/*
-		 * If the 2MB page mapping belongs to the direct map
-		 * region of the kernel's address space, then the page
-		 * allocation request specifies the highest possible
-		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
-		 * priority is normal.
-		 */
-		mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
-		    (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED);
-
-		/*
-		 * If the allocation of the new page table page fails,
-		 * invalidate the 2MB page mapping and return "failure".
-		 */
-		if (mpte == NULL) {
+		if ((oldpde & PG_A) == 0) {
+			KASSERT((oldpde & PG_W) == 0,
+			    ("pmap_demote_pde: a wired mapping is missing PG_A"));
 			pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 			return (false);
 		}
 
-		if (!in_kernel)
-			mpte->ref_count = NPTEPG;
+		mpte = pmap_remove_pt_page(pmap, va);
+		if (mpte == NULL) {
+			KASSERT((oldpde & PG_W) == 0,
+    ("pmap_demote_pde: page table page for a wired mapping is missing"));
+
+			/*
+			 * If the page table page is missing and the mapping
+			 * is for a kernel address, the mapping must belong to
+			 * the direct map.  Page table pages are preallocated
+			 * for every other part of the kernel address space,
+			 * so the direct map region is the only part of the
+			 * kernel address space that must be handled here.
+			 */
+			KASSERT(!in_kernel || (va >= kva_layout.dmap_low &&
+			    va < kva_layout.dmap_high),
+			    ("pmap_demote_pde: No saved mpte for va %#lx", va));
+
+			/*
+			 * If the 2MB page mapping belongs to the direct map
+			 * region of the kernel's address space, then the page
+			 * allocation request specifies the highest possible
+			 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
+			 * priority is normal.
+			 */
+			mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
+			    (in_kernel ? VM_ALLOC_INTERRUPT : 0) |
+			    VM_ALLOC_WIRED);
+
+			/*
+			 * If the allocation of the new page table page fails,
+			 * invalidate the 2MB page mapping and return "failure".
+			 */
+			if (mpte == NULL) {
+				pmap_demote_pde_abort(pmap, va, pde, oldpde,
+				    lockp);
+				return (false);
+			}
+
+			if (!in_kernel)
+				mpte->ref_count = NPTEPG;
+		}
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
@@ -6162,8 +6091,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mpte = pmap_remove_pt_page(pmap, va);
-	if (mpte == NULL)
-		panic("pmap_remove_kernel_pde: Missing pt page.");
+	KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page"));
 
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
@@ -6193,7 +6121,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  * pmap_remove_pde: do the things to unmap a superpage in a process
  */
 static int
-pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
+pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde,
     struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
@@ -6233,9 +6161,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 				pmap_delayed_invl_page(m);
 		}
 	}
-	if (pmap == kernel_pmap) {
-		pmap_remove_kernel_pde(pmap, pdq, sva);
-	} else {
+	if (pmap != kernel_pmap) {
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			KASSERT(vm_page_any_valid(mpte),
@@ -6246,6 +6172,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 			mpte->ref_count = 0;
 			pmap_add_delayed_free_list(mpte, free, false);
 		}
+	} else if (demote_kpde) {
+		pmap_remove_kernel_pde(pmap, pdq, sva);
+	} else {
+		mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva));
+		if (vm_page_any_valid(mpte)) {
+			mpte->valid = 0;
+			pmap_zero_page(mpte);
+		}
 	}
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
@@ -6476,7 +6410,8 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
-				pmap_remove_pde(pmap, pde, sva, &free, &lock);
+				pmap_remove_pde(pmap, pde, sva, true, &free,
+				    &lock);
 				continue;
 			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
 			    &lock)) {
@@ -7166,7 +7101,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	PG_RW = pmap_rw_bit(pmap);
 
 	va = trunc_page(va);
-	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
+	KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig"));
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
@@ -7495,6 +7430,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 	PG_RW = pmap_rw_bit(pmap);
 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
+	KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+	    PMAP_ENTER_NORECLAIM,
+	    ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7552,13 +7490,35 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 			/*
			 * The reference to the PD page that was acquired by
			 * pmap_alloc_pde() ensures that it won't be freed.
			 * However, if the PDE resulted from a promotion, and
			 * the mapping is not from kernel_pmap, then
			 * a reserved PT page could be freed.
			 */
-			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
+			(void)pmap_remove_pde(pmap, pde, va, false, &free,
+			    lockp);
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
+			if (va >= VM_MAXUSER_ADDRESS) {
+				/*
+				 * Try to save the ptp in the trie
+				 * before any changes to mappings are
+				 * made.  Abort on failure.
+				 */
+				mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
+				if (pmap_insert_pt_page(pmap, mt, false,
+				    false)) {
+					CTR1(KTR_PMAP,
+			    "pmap_enter_pde: cannot ins kern ptp va %#lx",
+					    va);
+					return (KERN_RESOURCE_SHORTAGE);
+				}
+				/*
+				 * Both pmap_remove_pde() and
+				 * pmap_remove_ptes() will zero-fill
+				 * the kernel page table page.
+				 */
+			}
 			pmap_delayed_invl_start();
 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
 			    lockp))
@@ -7572,14 +7532,6 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 		} else {
 			KASSERT(SLIST_EMPTY(&free),
 			    ("pmap_enter_pde: freed kernel page table page"));
-
-			/*
-			 * Both pmap_remove_pde() and pmap_remove_ptes() will
-			 * leave the kernel page table page zero filled.
-			 */
-			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
-			if (pmap_insert_pt_page(pmap, mt, false, false))
-				panic("pmap_enter_pde: trie insert failed");
 		}
 	}
@@ -7609,6 +7561,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 			if (pdpg != NULL)
 				pmap_abort_ptp(pmap, va, pdpg);
+			else {
+				KASSERT(va >= VM_MAXUSER_ADDRESS &&
+				    (*pde & (PG_PS | PG_V)) == PG_V,
+				    ("pmap_enter_pde: invalid kernel PDE"));
+				mt = pmap_remove_pt_page(pmap, va);
+				KASSERT(mt != NULL,
+				    ("pmap_enter_pde: missing kernel PTP"));
+			}
 			if (uwptpg != NULL) {
 				mt = pmap_remove_pt_page(pmap, va);
 				KASSERT(mt == uwptpg,
@@ -9518,7 +9478,7 @@ pmap_unmapdev(void *p, vm_size_t size)
 	va = (vm_offset_t)p;
 	/* If we gave a direct map region in pmap_mapdev, do nothing */
-	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
+	if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high)
 		return;
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
@@ -9547,7 +9507,7 @@ pmap_unmapdev(void *p, vm_size_t size)
 * Tries to demote a 1GB page mapping.
 */
static bool
-pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
+pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
{
 	pdp_entry_t newpdpe, oldpdpe;
 	pd_entry_t *firstpde, newpde, *pde;
@@ -9564,12 +9524,19 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
 	oldpdpe = *pdpe;
 	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
-	pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
-	    VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT);
-	if (pdpg == NULL) {
-		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
-		    " in pmap %p", va, pmap);
-		return (false);
+	if (m == NULL) {
+		pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
+		    VM_ALLOC_WIRED);
+		if (pdpg == NULL) {
+			CTR2(KTR_PMAP,
+			    "pmap_demote_pdpe: failure for va %#lx in pmap %p",
+			    va, pmap);
+			return (false);
+		}
+	} else {
+		pdpg = m;
+		pdpg->pindex = va >> PDPSHIFT;
+		pmap_pt_page_count_adj(pmap, 1);
 	}
 	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
 	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
@@ -9610,6 +9577,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
+	if (m->md.pat_mode == ma)
+		return;
 
 	m->md.pat_mode = ma;
@@ -9629,6 +9598,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
 {
 	int error;
 
+	if (m->md.pat_mode == ma)
+		return;
+
 	m->md.pat_mode = ma;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
@@ -9685,7 +9657,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
 	int error;
 
 	/* Only supported within the kernel map. */
-	if (va < VM_MIN_KERNEL_ADDRESS)
+	if (va < kva_layout.km_low)
 		return (EINVAL);
 
 	PMAP_LOCK(kernel_pmap);
@@ -9716,7 +9688,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
	 * Only supported on kernel virtual addresses, including the direct
	 * map but excluding the recursive map.
	 */
-	if (base < DMAP_MIN_ADDRESS)
+	if (base < kva_layout.dmap_low)
 		return (EINVAL);
 
 	/*
@@ -9739,7 +9711,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 		pte_bits |= X86_PG_RW;
 	}
 	if ((prot & VM_PROT_EXECUTE) == 0 ||
-	    va < VM_MIN_KERNEL_ADDRESS) {
+	    va < kva_layout.km_low) {
 		pde_bits |= pg_nx;
 		pte_bits |= pg_nx;
 	}
@@ -9779,7 +9751,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				tmpva += NBPDP;
 				continue;
 			}
-			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
+			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva, NULL))
 				return (ENOMEM);
 		}
 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
@@ -9835,7 +9807,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pdpe, pde_bits, pde_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
@@ -9865,7 +9837,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pde, pde_bits, pde_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pde & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
@@ -9893,7 +9865,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				pmap_pte_props(pte, pte_bits, pte_mask);
 				changed = true;
 			}
-			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+			if (tmpva >= kva_layout.km_low &&
 			    (*pte & PG_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run.
					 */
@@ -9937,11 +9909,13 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 }
 
 /*
- * Demotes any mapping within the direct map region that covers more than the
- * specified range of physical addresses.  This range's size must be a power
- * of two and its starting address must be a multiple of its size.  Since the
- * demotion does not change any attributes of the mapping, a TLB invalidation
- * is not mandatory.  The caller may, however, request a TLB invalidation.
+ * Demotes any mapping within the direct map region that covers more
+ * than the specified range of physical addresses.  This range's size
+ * must be a power of two and its starting address must be a multiple
+ * of its size, which means that any pdp from the mapping is fully
+ * covered by the range if len > NBPDP.  Since the demotion does not
+ * change any attributes of the mapping, a TLB invalidation is not
+ * mandatory.  The caller may, however, request a TLB invalidation.
 */
void
pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate)
@@ -9949,38 +9923,67 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate)
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	vm_offset_t va;
-	bool changed;
+	vm_page_t m, mpte;
+	bool changed, rv __diagused;
 
 	if (len == 0)
 		return;
 	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 	KASSERT((base & (len - 1)) == 0,
 	    ("pmap_demote_DMAP: base is not a multiple of len"));
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "pmap_demote_DMAP");
+
 	if (len < NBPDP && base < dmaplimit) {
 		va = PHYS_TO_DMAP(base);
 		changed = false;
+
+		/*
+		 * Assume that it is fine to sleep there.
+		 * The only existing caller of pmap_demote_DMAP() is the
+		 * x86_mr_split_dmap() function.
+		 */
+		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
+		if (len < NBPDR) {
+			mpte = vm_page_alloc_noobj(VM_ALLOC_WIRED |
+			    VM_ALLOC_WAITOK);
+		} else
+			mpte = NULL;
+
 		PMAP_LOCK(kernel_pmap);
 		pdpe = pmap_pdpe(kernel_pmap, va);
 		if ((*pdpe & X86_PG_V) == 0)
 			panic("pmap_demote_DMAP: invalid PDPE");
 		if ((*pdpe & PG_PS) != 0) {
-			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
-				panic("pmap_demote_DMAP: PDPE failed");
+			rv = pmap_demote_pdpe(kernel_pmap, pdpe, va, m);
+			KASSERT(rv, ("pmap_demote_DMAP: PDPE failed"));
 			changed = true;
+			m = NULL;
 		}
 		if (len < NBPDR) {
 			pde = pmap_pdpe_to_pde(pdpe, va);
 			if ((*pde & X86_PG_V) == 0)
 				panic("pmap_demote_DMAP: invalid PDE");
 			if ((*pde & PG_PS) != 0) {
-				if (!pmap_demote_pde(kernel_pmap, pde, va))
-					panic("pmap_demote_DMAP: PDE failed");
+				mpte->pindex = pmap_pde_pindex(va);
+				pmap_pt_page_count_adj(kernel_pmap, 1);
+				rv = pmap_demote_pde_mpte(kernel_pmap, pde, va,
+				    NULL, mpte);
+				KASSERT(rv, ("pmap_demote_DMAP: PDE failed"));
 				changed = true;
+				mpte = NULL;
 			}
 		}
 		if (changed && invalidate)
 			pmap_invalidate_page(kernel_pmap, va);
 		PMAP_UNLOCK(kernel_pmap);
+		if (m != NULL) {
+			vm_page_unwire_noq(m);
+			vm_page_free(m);
+		}
+		if (mpte != NULL) {
+			vm_page_unwire_noq(mpte);
+			vm_page_free(mpte);
+		}
 	}
}
@@ -10210,17 +10213,9 @@ pmap_activate_sw(struct thread *td)
 		return;
 	}
 	cpuid = PCPU_GET(cpuid);
-#ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
-	CPU_SET(cpuid, &pmap->pm_active);
-#endif
 	pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
-	CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
}

void
@@ -10261,11 +10256,7 @@ pmap_activate_boot(pmap_t pmap)
 	MPASS(pmap != kernel_pmap);
 
 	cpuid = PCPU_GET(cpuid);
-#ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
-	CPU_SET(cpuid,
-	    &pmap->pm_active);
-#endif
 	PCPU_SET(curpmap, pmap);
 	if (pti) {
 		kcr3 = pmap->pm_cr3;
@@ -10629,19 +10620,28 @@ pmap_large_map_getptp(void)
 static pdp_entry_t *
 pmap_large_map_pdpe(vm_offset_t va)
 {
+	pml4_entry_t *pml4;
 	vm_pindex_t pml4_idx;
 	vm_paddr_t mphys;
 
-	pml4_idx = pmap_pml4e_index(va);
-	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
-	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
-	    "%#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
-	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
-	    "LMSPML4I %#jx lm_ents %d",
-	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+	    (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+	if (la57) {
+		pml4 = pmap_pml4e(kernel_pmap, va);
+		mphys = *pml4 & PG_FRAME;
+	} else {
+		pml4_idx = pmap_pml4e_index(va);
+
+		KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+		    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+		    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+		    "LMSPML4I %#jx lm_ents %d",
+		    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+		mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+	}
 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
 }
@@ -10834,8 +10834,8 @@ pmap_large_unmap(void *svaa, vm_size_t len)
 	struct spglist spgf;
 
 	sva = (vm_offset_t)svaa;
-	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
-	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
+	if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low &&
+	    sva + len < kva_layout.dmap_high))
 		return;
 
 	SLIST_INIT(&spgf);
@@ -11081,11 +11081,10 @@ pmap_large_map_wb(void *svap, vm_size_t len)
 	sva = (vm_offset_t)svap;
 	eva = sva + len;
 	pmap_large_map_wb_fence();
-	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
+	if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) {
 		pmap_large_map_flush_range(sva, len);
 	} else {
-		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
-		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
+		KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high,
 		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
 		pmap_large_map_wb_large(sva, eva);
 	}
@@ -11126,8 +11125,8 @@ pmap_pti_init(void)
 	VM_OBJECT_WLOCK(pti_obj);
 	pml4_pg = pmap_pti_alloc_page();
 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
-	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
-	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+	for (va = kva_layout.km_low; va <= kva_layout.km_high &&
+	    va >= kva_layout.km_low && va > NBPML4; va += NBPML4) {
 		pdpe = pmap_pti_pdpe(va);
 		pmap_pti_wire_pte(pdpe);
 	}
@@ -11901,9 +11900,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
 	    mode, range->pdpes, range->pdes, range->ptes);
 
 	/* Reset to sentinel value. */
-	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range->sva = kva_layout.kva_max;
 }

/*
@@ -11944,12 +11941,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
-    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
-    pt_entry_t pte)
+    vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+    pd_entry_t pde, pt_entry_t pte)
{
 	pt_entry_t attrs;
 
-	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	if (la57) {
+		attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+		attrs |= pml4e & pg_nx;
+		attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+	} else {
+		attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+	}
 
 	attrs |= pdpe & pg_nx;
 	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -11982,13 +11985,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
 	struct pmap_kernel_map_range range;
 	struct sbuf sbuf, *sb;
+	pml5_entry_t pml5e;
 	pml4_entry_t pml4e;
 	pdp_entry_t *pdp, pdpe;
 	pd_entry_t *pd, pde;
 	pt_entry_t *pt, pte;
 	vm_offset_t sva;
 	vm_paddr_t pa;
-	int error, i, j, k, l;
+	int error, j, k, l;
+	bool first;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
@@ -11997,9 +12002,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
 
 	/* Sentinel value. */
-	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
-	    NPDEPG - 1, NPTEPG - 1);
+	range.sva = kva_layout.kva_max;
+	pml5e = 0;	/* no UB for la48 */
 
 	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
@@ -12008,41 +12012,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
-	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
-		switch (i) {
-		case PML4PML4I:
+	for (first = true, sva = 0; sva != 0 || first; first = false) {
+		if (sva == kva_layout.rec_pt)
 			sbuf_printf(sb, "\nRecursive map:\n");
-			break;
-		case DMPML4I:
+		else if (sva == kva_layout.dmap_low)
 			sbuf_printf(sb, "\nDirect map:\n");
-			break;
#ifdef KASAN
-		case KASANPML4I:
+		else if (sva == kva_layout.kasan_shadow_low)
 			sbuf_printf(sb, "\nKASAN shadow map:\n");
-			break;
#endif
#ifdef KMSAN
-		case KMSANSHADPML4I:
+		else if (sva == kva_layout.kmsan_shadow_low)
 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
-			break;
-		case KMSANORIGPML4I:
+		else if (sva == kva_layout.kmsan_origin_low)
 			sbuf_printf(sb, "\nKMSAN origin map:\n");
-			break;
#endif
-		case KPML4BASE:
+		else if (sva == kva_layout.km_low)
 			sbuf_printf(sb, "\nKernel map:\n");
-			break;
-		case LMSPML4I:
+		else if (sva == kva_layout.lm_low)
 			sbuf_printf(sb, "\nLarge map:\n");
-			break;
-		}
 
 		/* Convert to canonical form.
		 */
-		if (sva == 1ul << 47)
-			sva |= -1ul << 48;
+		if (la57) {
+			if (sva == 1ul << 56) {
+				sva |= -1ul << 57;
+				continue;
+			}
+		} else {
+			if (sva == 1ul << 47) {
+				sva |= -1ul << 48;
+				continue;
+			}
+		}
restart:
-		pml4e = kernel_pml4[i];
+		if (la57) {
+			pml5e = *pmap_pml5e(kernel_pmap, sva);
+			if ((pml5e & X86_PG_V) == 0) {
+				sva = rounddown2(sva, NBPML5);
+				sysctl_kmaps_dump(sb, &range, sva);
+				sva += NBPML5;
+				continue;
+			}
+		}
+		pml4e = *pmap_pml4e(kernel_pmap, sva);
 		if ((pml4e & X86_PG_V) == 0) {
 			sva = rounddown2(sva, NBPML4);
 			sysctl_kmaps_dump(sb, &range, sva);
@@ -12063,8 +12076,8 @@ restart:
 			pa = pdpe & PG_FRAME;
 			if ((pdpe & PG_PS) != 0) {
 				sva = rounddown2(sva, NBPDP);
-				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
-				    0, 0);
+				sysctl_kmaps_check(sb, &range, sva, pml5e,
+				    pml4e, pdpe, 0, 0);
 				range.pdpes++;
 				sva += NBPDP;
 				continue;
@@ -12076,6 +12089,7 @@ restart:
				 * freed.  Validate the next-level address
				 * before descending.
				 */
+				sva += NBPDP;
 				goto restart;
 			}
 			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12092,7 +12106,7 @@ restart:
 				if ((pde & PG_PS) != 0) {
 					sva = rounddown2(sva, NBPDR);
 					sysctl_kmaps_check(sb, &range, sva,
-					    pml4e, pdpe, pde, 0);
+					    pml5e, pml4e, pdpe, pde, 0);
 					range.pdes++;
 					sva += NBPDR;
 					continue;
@@ -12104,6 +12118,7 @@ restart:
					 * may be freed.  Validate the
					 * next-level address before descending.
					 */
+					sva += NBPDR;
 					goto restart;
 				}
 				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12117,7 +12132,7 @@ restart:
 					continue;
 				}
 				sysctl_kmaps_check(sb, &range, sva,
-				    pml4e, pdpe, pde, pte);
+				    pml5e, pml4e, pdpe, pde, pte);
 				range.ptes++;
 			}
 		}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
-#ifdef SMP
-	lock
-#endif
-	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
+	lock cmpxchgl %ecx,(%rdi)		/* new = %ecx */
 	setne	%cl
 
 	/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
 
 	movl	%esi,%eax			/* old */
 	stac
-#ifdef SMP
-	lock
-#endif
-	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
+	lock cmpxchgl %ecx,(%rdi)		/* new = %ecx */
 	clac
 	setne	%cl
 
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
-#ifdef SMP
-	lock
-#endif
-	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
+	lock cmpxchgq %rcx,(%rdi)		/* new = %rcx */
 	setne	%cl
 
 	/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
 
 	movq	%rsi,%rax			/* old */
 	stac
-#ifdef SMP
-	lock
-#endif
-	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
+	lock cmpxchgq %rcx,(%rdi)		/* new = %rcx */
 	clac
 	setne	%cl
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 09ac0a67dbef..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
 * SUCH DAMAGE.
 */

-#include <sys/cdefs.h>
/*
 * AMD64 Trap and System call handling
 */
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
-#ifdef SMP
 #include <machine/smp.h>
-#endif
 #include <machine/stack.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
@@ -769,7 +766,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode)
 			return (-1);
 		}
 	}
-	if (eva >= VM_MIN_KERNEL_ADDRESS) {
+	if (eva >= kva_layout.km_low) {
 		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type,
 	    type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
 	    TRAPF_USERMODE(frame) ?
"user" : "kernel"); -#ifdef SMP - /* two separate prints in case of a trap on an unmapped page */ - printf("cpuid = %d; ", PCPU_GET(cpuid)); - printf("apic id = %02x\n", PCPU_GET(apic_id)); -#endif + /* Print these separately in case pcpu accesses trap. */ + printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid), + PCPU_GET(apic_id)); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s%s, %s\n", @@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame) frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, frame->tf_fs, frame->tf_gs, rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE)); -#ifdef SMP - /* two separate prints in case of a trap on an unmapped page */ - printf("cpuid = %d; ", PCPU_GET(cpuid)); - printf("apic id = %02x\n", PCPU_GET(apic_id)); -#endif + /* Print these separately in case pcpu accesses trap. */ + printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid), + PCPU_GET(apic_id)); panic("double fault"); } diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP deleted file mode 100644 index 0dbddbe5b341..000000000000 --- a/sys/amd64/conf/MINIMALUP +++ /dev/null @@ -1,4 +0,0 @@ -include MINIMAL -ident MINIMALUP -nooptions SMP -nooptions NUMA diff --git a/sys/amd64/include/efi.h b/sys/amd64/include/efi.h index b47c4aa27ac7..439f2f0b317d 100644 --- a/sys/amd64/include/efi.h +++ b/sys/amd64/include/efi.h @@ -53,6 +53,10 @@ #define EFI_TIME_OWNED() mtx_assert(&atrtc_time_lock, MA_OWNED) #define EFI_RT_HANDLE_FAULTS_DEFAULT 1 + +#define EFI_MAP_BOOTTYPE_ALLOWED(type) (((efi_map_regs >> (type)) & 1) != 0) + +extern uint32_t efi_map_regs; #endif struct efirt_callinfo { diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index 8db314fa034d..5a9c3162e14c 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -146,11 +146,10 @@ #define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT) -#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \ - || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS)) +#define INKERNEL(va) \ + (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \ + ((va) >= kva_layout.km_low && (va) < kva_layout.km_high)) -#ifdef SMP #define SC_TABLESIZE 1024 /* Must be power of 2. */ -#endif #endif /* !_AMD64_INCLUDE_PARAM_H_ */ diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 7d3e91bcd9b9..e2f97442c10f 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -169,11 +169,12 @@ * the recursive page table map. */ #define NDMPML4E 8 +#define NDMPML5E 32 /* - * These values control the layout of virtual memory. The starting address - * of the direct map, which is controlled by DMPML4I, must be a multiple of - * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * These values control the layout of virtual memory. The starting + * address of the direct map is controlled by DMPML4I on LA48 and + * DMPML5I on LA57. 
* * Note: KPML4I is the index of the (single) level 4 page that maps * the KVA that holds KERNBASE, while KPML4BASE is the index of the @@ -191,6 +192,7 @@ #define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ #define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ +#define DMPML5I (NPML5EPG / 2 + 1) #define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ @@ -200,9 +202,14 @@ #define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E) #define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E) -/* Large map: index of the first and max last pml4 entry */ +/* + * Large map: index of the first and max last pml4/la48 and pml5/la57 + * entry. + */ #define LMSPML4I (PML4PML4I + 1) #define LMEPML4I (KASANPML4I - 1) +#define LMSPML5I (DMPML5I + NDMPML5E) +#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */ /* * XXX doesn't really belong here I guess... @@ -548,6 +555,25 @@ pmap_pml5e_index(vm_offset_t va) return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1)); } +struct kva_layout_s { + vm_offset_t kva_min; + vm_offset_t kva_max; + vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */ + vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */ + vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */ + vm_offset_t lm_high; /* LARGEMAP_MAX_ADDRESS */ + vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */ + vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */ + vm_offset_t rec_pt; + vm_offset_t kasan_shadow_low; /* KASAN_MIN_ADDRESS */ + vm_offset_t kasan_shadow_high; /* KASAN_MAX_ADDRESS */ + vm_offset_t kmsan_shadow_low; /* KMSAN_SHAD_MIN_ADDRESS */ + vm_offset_t kmsan_shadow_high; /* KMSAN_SHAD_MAX_ADDRESS */ + vm_offset_t kmsan_origin_low; /* KMSAN_ORIG_MIN_ADDRESS */ + vm_offset_t kmsan_origin_high; /* KMSAN_ORIG_MAX_ADDRESS */ +}; +extern struct kva_layout_s kva_layout; + #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 26eb227211da..bff92570ff82 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -13,8 +13,6 @@ #ifdef _KERNEL -#ifdef SMP - #ifndef LOCORE #include <x86/x86_smp.h> @@ -39,7 +37,6 @@ void invlop_handler(void); int start_all_aps(void); #endif /* !LOCORE */ -#endif /* SMP */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 0cd9bb4fa7a4..d2ac3c6648b2 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -163,6 +163,7 @@ * Virtual addresses of things. Derived from the page directory and * page table indexes from pmap.h for precision. 
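The point of the new kva_layout structure is that kernel address-range checks, which used to compare against compile-time constants, now load bounds chosen at boot for LA48 or LA57. A minimal sketch of the pattern, mirroring the new INKERNEL() and VIRT_IN_DMAP() definitions (the helper name is hypothetical, not part of the patch):

static __inline bool
va_in_dmap(vm_offset_t va)
{
	/* dmap_low/dmap_high are initialized at boot for LA48 or LA57. */
	return (va >= kva_layout.dmap_low && va < kva_layout.dmap_high);
}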
* + * LA48: * 0x0000000000000000 - 0x00007fffffffffff user map * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole) * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot) @@ -175,32 +176,38 @@ * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map * + * LA57: + * 0x0000000000000000 - 0x00ffffffffffffff user map + * 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole) + * 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot) + * 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots) + * 0xff21000000000000 - 0xff40ffffffffffff large map + * 0xff41000000000000 - 0xffff7fffffffffff unused + * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry) + * 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional + * 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional + * 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused + * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional + * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map + * * Within the kernel map: * * 0xfffffe0000000000 vm_page_array * 0xffffffff80000000 KERNBASE */ -#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0) -#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \ - NPDPEPG-1, NPDEPG-1, NPTEPG-1) - -#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0) -#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0) - -#define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0) -#define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS_LA48 KV4ADDR(KPML4BASE, 0, 0, 0) +#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low +#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high -#define KMSAN_SHAD_MIN_ADDRESS KV4ADDR(KMSANSHADPML4I, 0, 0, 0) -#define KMSAN_SHAD_MAX_ADDRESS KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \ - 0, 0, 0) +#define KASAN_MIN_ADDRESS (kva_layout.kasan_shadow_low) +#define KASAN_MAX_ADDRESS (kva_layout.kasan_shadow_high) -#define KMSAN_ORIG_MIN_ADDRESS KV4ADDR(KMSANORIGPML4I, 0, 0, 0) -#define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \ - 0, 0, 0) +#define KMSAN_SHAD_MIN_ADDRESS (kva_layout.kmsan_shadow_low) +#define KMSAN_SHAD_MAX_ADDRESS (kva_layout.kmsan_shadow_high) -#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) -#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) +#define KMSAN_ORIG_MIN_ADDRESS (kva_layout.kmsan_origin_low) +#define KMSAN_ORIG_MAX_ADDRESS (kva_layout.kmsan_origin_high) /* * Formally kernel mapping starts at KERNBASE, but kernel linker @@ -239,21 +246,21 @@ * vt fb startup needs to be reworked. 
*/ #define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit) -#define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \ - (va) < (DMAP_MIN_ADDRESS + dmaplimit)) +#define VIRT_IN_DMAP(va) \ + ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit) #define PMAP_HAS_DMAP 1 -#define PHYS_TO_DMAP(x) ({ \ +#define PHYS_TO_DMAP(x) __extension__ ({ \ KASSERT(PHYS_IN_DMAP(x), \ ("physical address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) | DMAP_MIN_ADDRESS; }) + (x) + kva_layout.dmap_low; }) -#define DMAP_TO_PHYS(x) ({ \ +#define DMAP_TO_PHYS(x) __extension__ ({ \ KASSERT(VIRT_IN_DMAP(x), \ ("virtual address %#jx not covered by the DMAP", \ (uintmax_t)x)); \ - (x) & ~DMAP_MIN_ADDRESS; }) + (x) - kva_layout.dmap_low; }) /* * amd64 maps the page array into KVA so that it can be more easily @@ -274,7 +281,7 @@ */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \ - VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5) + kva_layout.km_low + 1) * 3 / 5) #endif /* initial pagein size of beginning of executable file */ diff --git a/sys/amd64/linux/linux_proto.h b/sys/amd64/linux/linux_proto.h index 15e1dfc1a444..f1d9c96a78d7 100644 --- a/sys/amd64/linux/linux_proto.h +++ b/sys/amd64/linux/linux_proto.h @@ -914,10 +914,13 @@ struct linux_inotify_init_args { syscallarg_t dummy; }; struct linux_inotify_add_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)]; + char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)]; }; struct linux_inotify_rm_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)]; }; struct linux_migrate_pages_args { syscallarg_t dummy; diff --git a/sys/amd64/linux/linux_sysent.c b/sys/amd64/linux/linux_sysent.c index 8413d2723551..62b50cf68a32 100644 --- a/sys/amd64/linux/linux_sysent.c +++ b/sys/amd64/linux/linux_sysent.c @@ -268,8 +268,8 @@ struct sysent linux_sysent[] = { { .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 251 = linux_ioprio_set */ { .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 252 = linux_ioprio_get */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 253 = linux_inotify_init */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 254 = linux_inotify_add_watch */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 255 = linux_inotify_rm_watch */ + { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 254 = linux_inotify_add_watch */ + { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 255 = linux_inotify_rm_watch */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 256 = 
linux_migrate_pages */ { .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 257 = linux_openat */ { .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 258 = linux_mkdirat */ diff --git a/sys/amd64/linux/linux_systrace_args.c b/sys/amd64/linux/linux_systrace_args.c index 20322f7a8660..1dc4de019080 100644 --- a/sys/amd64/linux/linux_systrace_args.c +++ b/sys/amd64/linux/linux_systrace_args.c @@ -1918,12 +1918,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_add_watch */ case 254: { - *n_args = 0; + struct linux_inotify_add_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = (intptr_t)p->pathname; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 3; break; } /* linux_inotify_rm_watch */ case 255: { - *n_args = 0; + struct linux_inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = p->wd; /* uint32_t */ + *n_args = 2; break; } /* linux_migrate_pages */ @@ -5860,9 +5867,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_add_watch */ case 254: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_inotify_rm_watch */ case 255: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_migrate_pages */ case 256: @@ -8353,8 +8383,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 253: /* linux_inotify_add_watch */ case 254: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_inotify_rm_watch */ case 255: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_migrate_pages */ case 256: /* linux_openat */ diff --git a/sys/amd64/linux/syscalls.master b/sys/amd64/linux/syscalls.master index fd08c9b0279d..5e1394751ef6 100644 --- a/sys/amd64/linux/syscalls.master +++ b/sys/amd64/linux/syscalls.master @@ -1476,10 +1476,17 @@ int linux_inotify_init(void); } 254 AUE_NULL STD { - int linux_inotify_add_watch(void); + int linux_inotify_add_watch( + l_int fd, + const char *pathname, + uint32_t mask + ); } 255 AUE_NULL STD { - int linux_inotify_rm_watch(void); + int linux_inotify_rm_watch( + l_int fd, + uint32_t wd + ); } 256 AUE_NULL STD { int linux_migrate_pages(void); diff --git a/sys/amd64/linux32/linux32_proto.h b/sys/amd64/linux32/linux32_proto.h index ab0edd99df42..57a303271f1c 100644 --- a/sys/amd64/linux32/linux32_proto.h +++ b/sys/amd64/linux32/linux32_proto.h @@ -983,10 +983,13 @@ struct linux_inotify_init_args { syscallarg_t dummy; }; struct linux_inotify_add_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)]; + char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)]; }; struct linux_inotify_rm_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)]; }; struct linux_migrate_pages_args { syscallarg_t dummy; @@ -1184,7 +1187,7 @@ struct linux_pipe2_args { char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; }; 
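With the argument structures above filled in, a Linux binary running under the emulator can exercise these syscalls through the usual inotify API. A minimal usage sketch (standard Linux calls, not part of the patch):

#include <unistd.h>
#include <sys/inotify.h>

int
main(void)
{
	char buf[4096];
	int fd, wd;

	fd = inotify_init1(0);			/* linux_inotify_init1 */
	wd = inotify_add_watch(fd, "/tmp",	/* linux_inotify_add_watch */
	    IN_CREATE | IN_DELETE);
	(void)read(fd, buf, sizeof(buf));	/* blocks until an event arrives */
	(void)inotify_rm_watch(fd, wd);		/* linux_inotify_rm_watch */
	close(fd);
	return (0);
}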
struct linux_inotify_init1_args { - syscallarg_t dummy; + char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; }; struct linux_preadv_args { char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)]; diff --git a/sys/amd64/linux32/linux32_sysent.c b/sys/amd64/linux32/linux32_sysent.c index add9844254ce..1bc8841badf3 100644 --- a/sys/amd64/linux32/linux32_sysent.c +++ b/sys/amd64/linux32/linux32_sysent.c @@ -307,8 +307,8 @@ struct sysent linux32_sysent[] = { { .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 289 = linux_ioprio_set */ { .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 290 = linux_ioprio_get */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 291 = linux_inotify_init */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */ + { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */ + { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 294 = linux_migrate_pages */ { .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 295 = linux_openat */ { .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 296 = linux_mkdirat */ @@ -347,7 +347,7 @@ struct sysent linux32_sysent[] = { { .sy_narg = AS(linux_epoll_create1_args), .sy_call = (sy_call_t *)linux_epoll_create1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 329 = linux_epoll_create1 */ { .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 330 = linux_dup3 */ { .sy_narg = AS(linux_pipe2_args), .sy_call = (sy_call_t *)linux_pipe2, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 331 = linux_pipe2 */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */ + { .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */ { .sy_narg = AS(linux_preadv_args), .sy_call = (sy_call_t *)linux_preadv, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 333 = linux_preadv */ { .sy_narg = AS(linux_pwritev_args), .sy_call = (sy_call_t *)linux_pwritev, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 334 = linux_pwritev */ { .sy_narg = 
AS(linux_rt_tgsigqueueinfo_args), .sy_call = (sy_call_t *)linux_rt_tgsigqueueinfo, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 335 = linux_rt_tgsigqueueinfo */ diff --git a/sys/amd64/linux32/linux32_systrace_args.c b/sys/amd64/linux32/linux32_systrace_args.c index 7793124e6935..cbd1641c2a34 100644 --- a/sys/amd64/linux32/linux32_systrace_args.c +++ b/sys/amd64/linux32/linux32_systrace_args.c @@ -2036,12 +2036,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_add_watch */ case 292: { - *n_args = 0; + struct linux_inotify_add_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = (intptr_t)p->pathname; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 3; break; } /* linux_inotify_rm_watch */ case 293: { - *n_args = 0; + struct linux_inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = p->wd; /* uint32_t */ + *n_args = 2; break; } /* linux_migrate_pages */ @@ -2379,7 +2386,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_init1 */ case 332: { - *n_args = 0; + struct linux_inotify_init1_args *p = params; + iarg[a++] = p->flags; /* l_int */ + *n_args = 1; break; } /* linux_preadv */ @@ -6536,9 +6545,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_add_watch */ case 292: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_inotify_rm_watch */ case 293: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_migrate_pages */ case 294: @@ -7116,6 +7148,13 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_init1 */ case 332: + switch (ndx) { + case 0: + p = "l_int"; + break; + default: + break; + }; break; /* linux_preadv */ case 333: @@ -9809,8 +9848,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 291: /* linux_inotify_add_watch */ case 292: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_inotify_rm_watch */ case 293: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_migrate_pages */ case 294: /* linux_openat */ @@ -9982,6 +10027,9 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_init1 */ case 332: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_preadv */ case 333: if (ndx == 0 || ndx == 1) diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master index 92d5f09c423f..7bd522a598e8 100644 --- a/sys/amd64/linux32/syscalls.master +++ b/sys/amd64/linux32/syscalls.master @@ -1589,10 +1589,17 @@ int linux_inotify_init(void); } 292 AUE_NULL STD { - int linux_inotify_add_watch(void); + int linux_inotify_add_watch( + l_int fd, + const char *pathname, + uint32_t mask + ); } 293 AUE_NULL STD { - int linux_inotify_rm_watch(void); + int linux_inotify_rm_watch( + l_int fd, + uint32_t wd + ); } ; Linux 2.6.16: 294 AUE_NULL STD { @@ -1860,7 +1867,9 @@ ); } 332 AUE_NULL STD { - int linux_inotify_init1(void); + int linux_inotify_init1( + l_int flags + ); } ; Linux 2.6.30: 333 AUE_NULL STD { diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c new file mode 100644 index 000000000000..c7b75767680a --- /dev/null +++ b/sys/amd64/pt/pt.c @@ -0,0 +1,978 @@ +/* + * Copyright (c) 2025 Bojan Novković 
<bnovkov@freebsd.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * hwt(4) Intel Processor Trace (PT) backend + * + * Driver Design Overview + * + * - Since PT is configured on a per-core basis, the driver uses + * 'smp_rendezvous' to start and disable tracing on each target core. + * - PT-specific resources are stored in a 'struct pt_ctx' context structure for + * each traced CPU core or thread. Upon initialization, a ToPA configuration + * is generated for each 'pt_ctx' structure using the HWT tracing buffers. + * The HWT tracing buffer is split into 4K ToPA entries. Currently, each + * 4K ToPA entry is configured to trigger an interrupt after it is filled. + * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all + * relevant PT registers. Every time a traced thread is switched + * out or in, its state will be saved to or loaded from its corresponding + * 'pt_ctx' context. + * - When tracing starts, the PT hardware will start writing data into the + * tracing buffer. When a TOPA_INT entry is filled, it will trigger an + * interrupt before continuing. The interrupt handler will then fetch the + * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record. + * The driver is currently configured to use the NMI interrupt line. + * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records + * and uses the offsets to decode data from the tracing buffer. + * + * Future improvements and limitations + * + * - We currently configure the PT hardware to trigger an interrupt whenever + * a 4K ToPA entry is filled. While this is fine when tracing smaller + * functions or infrequent code paths, this will generate too much interrupt + * traffic when tracing hotter functions. A proper solution for this issue + * should estimate the amount of data generated by the current configuration + * and use it to determine interrupt frequency. + * + * - Support for more tracing options and PT features. + * + */ + +#include <sys/systm.h> +#include <sys/hwt.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sdt.h> +#include <sys/smp.h> +#include <sys/taskqueue.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <machine/fpu.h> +#include <machine/smp.h> +#include <machine/specialreg.h> + +#include <x86/apicvar.h> +#include <x86/x86_var.h> + +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_vm.h> +#include <dev/hwt/hwt_backend.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_cpu.h> +#include <dev/hwt/hwt_record.h> +#include <dev/hwt/hwt_thread.h> + +#include <amd64/pt/pt.h> + +#ifdef PT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) 
+#endif +#define PT_SUPPORTED_FLAGS \ + (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \ + RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN) +#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE) +#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT) +#define PT_MAX_IP_RANGES 2 + +#define PT_TOPA_MASK_PTRS 0x7f +#define PT_TOPA_PAGE_MASK 0xffffff80 +#define PT_TOPA_PAGE_SHIFT 7 + +#define CPUID_PT_LEAF 0x14 + +MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); + +SDT_PROVIDER_DEFINE(pt); +SDT_PROBE_DEFINE(pt, , , topa__intr); + +TASKQUEUE_FAST_DEFINE_THREAD(pt); + +static void pt_send_buffer_record(void *arg, int pending __unused); +static int pt_topa_intr(struct trapframe *tf); + +/* + * Intel Processor Trace XSAVE-managed state. + */ +struct pt_ext_area { + uint64_t rtit_ctl; + uint64_t rtit_output_base; + uint64_t rtit_output_mask_ptrs; + uint64_t rtit_status; + uint64_t rtit_cr3_match; + uint64_t rtit_addr0_a; + uint64_t rtit_addr0_b; + uint64_t rtit_addr1_a; + uint64_t rtit_addr1_b; +}; + +struct pt_buffer { + uint64_t *topa_hw; /* ToPA table entries. */ + size_t size; + struct mtx lock; /* Lock for fields below. */ + vm_offset_t offset; + uint64_t wrap_count; + int curpage; +}; + +struct pt_ctx { + int id; + struct pt_buffer buf; /* ToPA buffer metadata */ + struct task task; /* ToPA buffer notification task */ + struct hwt_context *hwt_ctx; + uint8_t *save_area; /* PT XSAVE area */ +}; +/* PT tracing contexts used for CPU mode. */ +static struct pt_ctx *pt_pcpu_ctx; + +enum pt_cpu_state { + PT_DISABLED = 0, + PT_STOPPED, + PT_ACTIVE +}; + +static struct pt_cpu { + struct pt_ctx *ctx; /* active PT tracing context */ + enum pt_cpu_state state; /* used as part of trace stop protocol */ +} *pt_pcpu; + +/* + * PT-related CPUID bits. + */ +static struct pt_cpu_info { + uint32_t l0_eax; + uint32_t l0_ebx; + uint32_t l0_ecx; + uint32_t l1_eax; + uint32_t l1_ebx; + size_t xsave_area_size; + size_t xstate_hdr_offset; + size_t pt_xsave_offset; +} pt_info __read_mostly; + +static bool initialized = false; +static int cpu_mode_ctr = 0; + +static __inline enum pt_cpu_state +pt_cpu_get_state(int cpu_id) +{ + return (atomic_load_int(&pt_pcpu[cpu_id].state)); +} + +static __inline void +pt_cpu_set_state(int cpu_id, enum pt_cpu_state state) +{ + atomic_store_int(&pt_pcpu[cpu_id].state, state); +} + +static __inline struct xstate_hdr * +pt_ctx_get_xstate_hdr(struct pt_ctx *ctx) +{ + return ((struct xstate_hdr *)(ctx->save_area + + pt_info.xstate_hdr_offset)); +} + + +static __inline struct pt_ext_area * +pt_ctx_get_ext_area(struct pt_ctx *ctx) +{ + return ((struct pt_ext_area *)(ctx->save_area + + pt_info.pt_xsave_offset)); +} + +/* + * Updates current trace buffer offset from the + * ToPA MSRs. Records if the trace buffer wrapped. + */ +static __inline void +pt_update_buffer(struct pt_buffer *buf) +{ + uint64_t reg; + int curpage; + + /* Update buffer offset. */ + reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); + curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; + mtx_lock_spin(&buf->lock); + /* Check if the output wrapped. 
*/ + if (buf->curpage > curpage) + buf->wrap_count++; + buf->curpage = curpage; + buf->offset = reg >> 32; + mtx_unlock_spin(&buf->lock); + + dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, + buf->wrap_count, buf->curpage, buf->offset); +} + +static __inline void +pt_fill_buffer_record(int id, struct pt_buffer *buf, + struct hwt_record_entry *rec) +{ + rec->record_type = HWT_RECORD_BUFFER; + rec->buf_id = id; + rec->curpage = buf->curpage; + rec->offset = buf->offset + (buf->wrap_count * buf->size); +} + +/* + * Enables or disables tracing on curcpu + * using the XSAVE/XRSTOR PT extensions. + */ +static void +pt_cpu_toggle_local(uint8_t *save_area, bool enable) +{ + u_long xcr0, cr0; + u_long xss; + + cr0 = rcr0(); + if (cr0 & CR0_TS) + clts(); + xcr0 = rxcr(XCR0); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0 | PT_XSAVE_MASK); + xss = rdmsr(MSR_IA32_XSS); + wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT); + + if (!enable) { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0, + ("%s: PT is disabled", __func__)); + xsaves(save_area, XFEATURE_ENABLED_PT); + } else { + KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0, + ("%s: PT is enabled", __func__)); + xrstors(save_area, XFEATURE_ENABLED_PT); + } + wrmsr(MSR_IA32_XSS, xss); + if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK) + load_xcr(XCR0, xcr0); + if (cr0 & CR0_TS) + load_cr0(cr0); +} + +/* + * Starts PT tracing on 'curcpu'. + */ +static void +pt_cpu_start(void *dummy) +{ + struct pt_cpu *cpu; + + cpu = &pt_pcpu[curcpu]; + MPASS(cpu->ctx != NULL); + + dprintf("%s: curcpu %d\n", __func__, curcpu); + load_cr4(rcr4() | CR4_XSAVE); + wrmsr(MSR_IA32_RTIT_STATUS, 0); + pt_cpu_set_state(curcpu, PT_ACTIVE); + pt_cpu_toggle_local(cpu->ctx->save_area, true); +} + +/* + * Stops PT tracing on 'curcpu'. + * Updates trace buffer offset to ensure + * any data generated between the last interrupt + * and the trace stop gets picked up by userspace. + */ +static void +pt_cpu_stop(void *dummy) +{ + struct pt_cpu *cpu; + struct pt_ctx *ctx; + + /* Shutdown may occur before PT gets properly configured. */ + if (pt_cpu_get_state(curcpu) == PT_DISABLED) + return; + + cpu = &pt_pcpu[curcpu]; + ctx = cpu->ctx; + MPASS(ctx != NULL); + dprintf("%s: curcpu %d\n", __func__, curcpu); + + pt_cpu_set_state(curcpu, PT_STOPPED); + pt_cpu_toggle_local(cpu->ctx->save_area, false); + pt_update_buffer(&ctx->buf); +} + +/* + * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'. + * The HWT trace buffer is split into 4K ToPA table entries and used + * as a circular buffer, meaning that the last ToPA entry points to + * the first ToPA entry. Each entry is configured to raise an + * interrupt after being filled. + */ +static int +pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm) +{ + struct pt_buffer *buf; + size_t topa_size; + int i; + + topa_size = TOPA_SIZE_4K; + buf = &ctx->buf; + + KASSERT(buf->topa_hw == NULL, + ("%s: ToPA info already exists", __func__)); + buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT, + M_ZERO | M_WAITOK); + dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw); + buf->size = vm->npages * PAGE_SIZE; + for (i = 0; i < vm->npages; i++) { + buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size; + /* + * XXX: TOPA_INT should ideally be set according to + * expected amount of incoming trace data. Too few TOPA_INT + * entries will not trigger interrupts often enough when tracing + * smaller functions. 
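As an illustration of the loop above (hypothetical 4-page buffer; phys() stands for VM_PAGE_TO_PHYS()), pt_topa_prepare() ends up building a circular table of the form

	topa_hw[0] = phys(page 0) | TOPA_SIZE_4K | TOPA_INT;
	...
	topa_hw[3] = phys(page 3) | TOPA_SIZE_4K | TOPA_INT;
	topa_hw[4] = vtophys(topa_hw) | TOPA_END;	/* chains back to entry 0 */

so the hardware raises a PMI after filling each 4 KiB page and wraps around when it reaches the TOPA_END entry.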
+ */ + buf->topa_hw[i] |= TOPA_INT; + } + buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END; + + return (0); +} + +/* + * Configures IP filtering for trace generation. + * A maximum of 2 ranges can be specified due to + * limitations imposed by the XSAVE/XRSTOR PT extensions. + */ +static int +pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg) +{ + struct pt_ext_area *pt_ext; + int nranges_supp, n, error = 0; + + pt_ext = pt_ctx_get_ext_area(ctx); + if (pt_info.l0_ebx & CPUPT_IPF) { + nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >> + CPUPT_NADDR_S; + + if (nranges_supp > PT_IP_FILTER_MAX_RANGES) + nranges_supp = PT_IP_FILTER_MAX_RANGES; + n = cfg->nranges; + if (n > nranges_supp) { + printf("%s: %d IP filtering ranges requested, CPU " + "supports %d, truncating\n", + __func__, n, nranges_supp); + n = nranges_supp; + } + + switch (n) { + case 2: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1)); + pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start; + pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end; + case 1: + pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0)); + pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start; + pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end; + break; + default: + error = (EINVAL); + break; + }; + } else + error = (ENXIO); + + return (error); +} + +static int +pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) +{ + + dprintf("%s: ctx id %d\n", __func__, ctx_id); + + KASSERT(pt_ctx->buf.topa_hw == NULL, + ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx)); + + memset(pt_ctx, 0, sizeof(struct pt_ctx)); + mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN); + pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64, + M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx->save_area == NULL) + return (ENOMEM); + dprintf("%s: preparing ToPA buffer\n", __func__); + if (pt_topa_prepare(pt_ctx, vm) != 0) { + dprintf("%s: failed to prepare ToPA buffer\n", __func__); + free(pt_ctx->save_area, M_PT); + return (ENOMEM); + } + + pt_ctx->id = ctx_id; + TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); + + return (0); +} + +static void +pt_deinit_ctx(struct pt_ctx *pt_ctx) +{ + + if (pt_ctx->buf.topa_hw != NULL) + free(pt_ctx->buf.topa_hw, M_PT); + if (pt_ctx->save_area != NULL) + free(pt_ctx->save_area, M_PT); + memset(pt_ctx, 0, sizeof(*pt_ctx)); + pt_ctx->buf.topa_hw = NULL; +} + +/* + * HWT backend configuration method. + * + * Checks and translates the user-defined configuration to a + * set of PT tracing features. Uses the feature set to initialize + * the tracing context for the target CPU or thread. + */ +static int +pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) +{ + struct hwt_cpu *hwt_cpu; + struct hwt_thread *thr; + struct pt_ctx *pt_ctx; + struct pt_cpu_config *cfg; + struct pt_ext_area *pt_ext; + struct xstate_hdr *hdr; + int error; + + dprintf("%s\n", __func__); + + cfg = (struct pt_cpu_config *)ctx->config; + pt_ctx = NULL; + + /* Clear any flags we don't support yet. 
*/ + cfg->rtit_ctl &= PT_SUPPORTED_FLAGS; + if (cfg->rtit_ctl & RTIT_CTL_MTCEN) { + if ((pt_info.l0_ebx & CPUPT_MTC) == 0) { + printf("%s: CPU does not support generating MTC " + "packets\n", __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) { + if ((pt_info.l0_ebx & CPUPT_CR3) == 0) { + printf("%s: CPU does not support CR3 filtering\n", + __func__); + return (ENXIO); + } + } + + if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) { + if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) { + printf("%s: CPU does not support TNT\n", __func__); + return (ENXIO); + } + } + /* TODO: support for more config bits. */ + + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + if (hwt_cpu->cpu_id != cpu_id) + continue; + pt_ctx = &pt_pcpu_ctx[cpu_id]; + break; + } + } else { + TAILQ_FOREACH(thr, &ctx->threads, next) { + if (thr->thread_id != thread_id) + continue; + KASSERT(thr->private != NULL, + ("%s: hwt thread private" + " not set, thr %p", + __func__, thr)); + pt_ctx = (struct pt_ctx *)thr->private; + break; + } + } + if (pt_ctx == NULL) + return (ENOENT); + + dprintf("%s: preparing MSRs\n", __func__); + pt_ext = pt_ctx_get_ext_area(pt_ctx); + hdr = pt_ctx_get_xstate_hdr(pt_ctx); + + pt_ext->rtit_ctl |= cfg->rtit_ctl; + if (cfg->nranges != 0) { + dprintf("%s: preparing IPF ranges\n", __func__); + if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0) + return (error); + } + pt_ctx->hwt_ctx = ctx; + pt_ext->rtit_ctl |= RTIT_CTL_TOPA; + pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw); + pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS; + hdr->xstate_bv = XFEATURE_ENABLED_PT; + hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT | + XSTATE_XCOMP_BV_COMPACT; + pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; + pt_pcpu[cpu_id].ctx = pt_ctx; + pt_cpu_set_state(cpu_id, PT_STOPPED); + + return (0); +} + +/* + * hwt backend trace start operation. CPU affine. + */ +static void +pt_backend_enable(struct hwt_context *ctx, int cpu_id) +{ + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to start PT on another cpu", __func__)); + pt_cpu_start(NULL); + CPU_SET(cpu_id, &ctx->cpu_map); +} + +/* + * hwt backend trace stop operation. CPU affine. + */ +static void +pt_backend_disable(struct hwt_context *ctx, int cpu_id) +{ + struct pt_cpu *cpu; + + if (ctx->mode == HWT_MODE_CPU) + return; + + KASSERT(curcpu == cpu_id, + ("%s: attempting to disable PT on another cpu", __func__)); + pt_cpu_stop(NULL); + CPU_CLR(cpu_id, &ctx->cpu_map); + cpu = &pt_pcpu[cpu_id]; + cpu->ctx = NULL; +} + +/* + * hwt backend trace start operation for remote CPUs. + */ +static int +pt_backend_enable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 1) != 0) + return (-1); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); + + return (0); +} + +/* + * hwt backend trace stop operation for remote CPUs. + */ +static int +pt_backend_disable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU && + atomic_swap_32(&cpu_mode_ctr, 0) == 0) + return (-1); + + if (CPU_EMPTY(&ctx->cpu_map)) { + dprintf("%s: empty cpu map\n", __func__); + return (-1); + } + smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); + + return (0); +} + +/* + * HWT backend initialization method. 
+ * + * Installs the ToPA interrupt handler and initializes + * the tracing contexts used for HWT_MODE_CPU. + */ +static int +pt_backend_init(struct hwt_context *ctx) +{ + struct hwt_cpu *hwt_cpu; + int error; + + dprintf("%s\n", __func__); + if (ctx->mode == HWT_MODE_CPU) { + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], + hwt_cpu->vm, hwt_cpu->cpu_id); + if (error) + return (error); + } + } + + return (0); +} + +/* + * HWT backend teardown method. + * + * Removes the ToPA interrupt handler, stops tracing on all active CPUs, + * and releases all previously allocated ToPA metadata. + */ +static int +pt_backend_deinit(struct hwt_context *ctx) +{ + struct pt_ctx *pt_ctx; + struct hwt_thread *thr; + int cpu_id; + + dprintf("%s\n", __func__); + + pt_backend_disable_smp(ctx); + if (ctx->mode == HWT_MODE_THREAD) { + TAILQ_FOREACH(thr, &ctx->threads, next) { + KASSERT(thr->private != NULL, + ("%s: thr->private not set", __func__)); + pt_ctx = (struct pt_ctx *)thr->private; + pt_deinit_ctx(pt_ctx); + } + } else { + CPU_FOREACH(cpu_id) { + if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + continue; + if (pt_pcpu[cpu_id].ctx != NULL) { + KASSERT(pt_pcpu[cpu_id].ctx == + &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT " + "context active", + __func__)); + pt_pcpu[cpu_id].ctx = NULL; + } + pt_ctx = &pt_pcpu_ctx[cpu_id]; + pt_deinit_ctx(pt_ctx); + memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + } + } + + return (0); +} + +/* + * Fetches current offset into the tracing buffer. + */ +static int +pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, + uint64_t *data) +{ + struct pt_buffer *buf; + + if (vm->ctx->mode == HWT_MODE_THREAD) + buf = &((struct pt_ctx *)vm->thr->private)->buf; + else + buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; + mtx_lock_spin(&buf->lock); + *curpage = buf->curpage; + *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); + mtx_unlock_spin(&buf->lock); + + return (0); +} + +/* + * HWT thread creation hook. + * Allocates and associates a 'struct pt_ctx' for a given hwt thread. + */ +static int +pt_backend_alloc_thread(struct hwt_thread *thr) +{ + struct pt_ctx *pt_ctx; + int error; + + /* Omit M_WAITOK since this might get invoked in a non-sleepable context */ + pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO); + if (pt_ctx == NULL) + return (ENOMEM); + + error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id); + if (error) { + free(pt_ctx, M_PT); + return (error); + } + + thr->private = pt_ctx; + return (0); +} +/* + * HWT thread teardown hook.
+ */ +static void +pt_backend_free_thread(struct hwt_thread *thr) +{ + struct pt_ctx *ctx; + + ctx = (struct pt_ctx *)thr->private; + + pt_deinit_ctx(ctx); + free(ctx, M_PT); +} + +static void +pt_backend_dump(int cpu_id) +{ +} + +static struct hwt_backend_ops pt_ops = { + .hwt_backend_init = pt_backend_init, + .hwt_backend_deinit = pt_backend_deinit, + + .hwt_backend_configure = pt_backend_configure, + + .hwt_backend_enable = pt_backend_enable, + .hwt_backend_disable = pt_backend_disable, + +#ifdef SMP + .hwt_backend_enable_smp = pt_backend_enable_smp, + .hwt_backend_disable_smp = pt_backend_disable_smp, +#endif + + .hwt_backend_read = pt_backend_read, + .hwt_backend_dump = pt_backend_dump, + + .hwt_backend_thread_alloc = pt_backend_alloc_thread, + .hwt_backend_thread_free = pt_backend_free_thread, +}; + +static struct hwt_backend backend = { + .ops = &pt_ops, + .name = "pt", + .kva_req = 1, +}; + +/* + * Reads the latest valid trace buffer offset and enqueues + * a HWT_RECORD_BUFFER record. + * Used as a taskqueue routine from the ToPA interrupt handler. + */ +static void +pt_send_buffer_record(void *arg, int pending __unused) +{ + struct hwt_record_entry record; + struct pt_ctx *ctx = (struct pt_ctx *)arg; + + /* Prepare buffer record. */ + mtx_lock_spin(&ctx->buf.lock); + pt_fill_buffer_record(ctx->id, &ctx->buf, &record); + mtx_unlock_spin(&ctx->buf.lock); + hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); +} +static void +pt_topa_status_clear(void) +{ + uint64_t reg; + + reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET); + reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI; + reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI; + wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg); +} + +/* + * ToPA PMI handler. + * + * Invoked every time a ToPA entry marked with TOPA_INT is filled. + * Uses taskqueue to enqueue a buffer record for userspace. + * Re-enables the PC interrupt line as long as tracing is active. + */ +static int +pt_topa_intr(struct trapframe *tf) +{ + struct pt_buffer *buf; + struct pt_ctx *ctx; + uint64_t reg; + + SDT_PROBE0(pt, , , topa__intr); + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { + return (0); + } + reg = rdmsr(MSR_IA_GLOBAL_STATUS); + if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { + /* ACK spurious or leftover interrupt. */ + pt_topa_status_clear(); + return (1); + } + + ctx = pt_pcpu[curcpu].ctx; + buf = &ctx->buf; + KASSERT(buf->topa_hw != NULL, + ("%s: ToPA PMI interrupt with invalid buffer", __func__)); + + pt_cpu_toggle_local(ctx->save_area, false); + pt_update_buffer(buf); + pt_topa_status_clear(); + taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, + TASKQUEUE_FAIL_IF_PENDING); + + if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + pt_cpu_toggle_local(ctx->save_area, true); + lapic_reenable_pcint(); + } + return (1); +} + +/* + * Module initialization. + * + * Saves all PT-related cpuid info, registers itself as a HWT backend, + * and allocates metadata required to keep track of tracing operations + * on each CPU. 
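For a sense of the offset arithmetic carried by these buffer records: pt_fill_buffer_record() reports rec->offset = buf->offset + (buf->wrap_count * buf->size), so with hypothetical values for a 64 KiB (16-page) buffer that has wrapped twice and a current in-page offset of 0x1800, userspace sees 0x1800 + 2 * 0x10000 = 0x21800 and combines it with rec->curpage to locate the newest trace data.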
+ */ +static int +pt_init(void) +{ + u_int cp[4]; + int error; + + dprintf("pt: Enumerating part 1\n"); + cpuid_count(CPUID_PT_LEAF, 0, cp); + dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + dprintf("pt: ecx %x\n", cp[2]); + + pt_info.l0_eax = cp[0]; + pt_info.l0_ebx = cp[1]; + pt_info.l0_ecx = cp[2]; + + dprintf("pt: Enumerating part 2\n"); + cpuid_count(CPUID_PT_LEAF, 1, cp); + dprintf("pt: eax %x\n", cp[0]); + dprintf("pt: ebx %x\n", cp[1]); + + pt_info.l1_eax = cp[0]; + pt_info.l1_ebx = cp[1]; + + error = hwt_backend_register(&backend); + if (error != 0) { + printf("pt: unable to register hwt backend, error %d\n", error); + return (error); + } + pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT, + M_ZERO | M_WAITOK); + pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, + M_ZERO | M_WAITOK); + + nmi_register_handler(pt_topa_intr); + if (!lapic_enable_pcint()) { + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; + printf("pt: failed to setup interrupt line\n"); + return (ENXIO); + } + initialized = true; + + return (0); +} + +/* + * Checks whether the CPU supports Intel PT and + * initializes XSAVE area info. + * + * The driver relies on XSAVE/XRSTOR PT extensions, + * Table of Physical Addresses (ToPA) support, and + * support for multiple ToPA entries. + */ +static bool +pt_supported(void) +{ + u_int cp[4]; + + if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) { + printf("pt: CPU does not support Intel Processor Trace\n"); + return (false); + } + if ((cpu_feature2 & CPUID2_XSAVE) == 0) { + printf("pt: XSAVE is not supported\n"); + return (false); + } + if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) { + printf("pt: CPU does not support managing PT state using XSAVE\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) { + printf("pt: XSAVE compaction is not supported\n"); + return (false); + } + if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) { + printf("pt: CPU does not support XSAVES/XRSTORS\n"); + return (false); + } + + /* Require ToPA support.
*/ + cpuid_count(CPUID_PT_LEAF, 0, cp); + if ((cp[2] & CPUPT_TOPA) == 0) { + printf("pt: ToPA is not supported\n"); + return (false); + } + if ((cp[2] & CPUPT_TOPA_MULTI) == 0) { + printf("pt: multiple ToPA outputs are not supported\n"); + return (false); + } + + pt_info.xstate_hdr_offset = xsave_area_hdr_offset(); + pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true); + pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV, + XFEATURE_ENABLED_PT, true, true); + + return (true); +} + +static void +pt_deinit(void) +{ + if (!initialized) + return; + nmi_remove_handler(pt_topa_intr); + lapic_disable_pcint(); + hwt_backend_unregister(&backend); + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + initialized = false; +} + +static int +pt_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + if (!pt_supported() || pt_init() != 0) { + return (ENXIO); + } + break; + case MOD_UNLOAD: + pt_deinit(); + break; + default: + break; + } + + return (0); +} + +static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL }; + +DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); +MODULE_DEPEND(intel_pt, hwt, 1, 1, 1); +MODULE_VERSION(intel_pt, 1); diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h new file mode 100644 index 000000000000..2423afdf22e9 --- /dev/null +++ b/sys/amd64/pt/pt.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_PT_PT_H_ +#define _AMD64_PT_PT_H_ + +#include <sys/types.h> + +#include <x86/include/specialreg.h> + +#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 
3C, 33-29 */ + +struct pt_cpu_config { + uint64_t rtit_ctl; + register_t cr3_filter; + int nranges; + struct ipf_range { + vm_offset_t start; + vm_offset_t end; + } ip_ranges[PT_IP_FILTER_MAX_RANGES]; + uint32_t mtc_freq; + uint32_t cyc_thresh; + uint32_t psb_freq; +}; +#endif /* !_AMD64_PT_PT_H_ */ diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index f393f160b101..130130b64541 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -32,12 +32,6 @@ #include "vmx_assym.h" -#ifdef SMP -#define LK lock ; -#else -#define LK -#endif - /* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ #define VENTER push %rbp ; mov %rsp,%rbp #define VLEAVE pop %rbp diff --git a/sys/arm/allwinner/aw_gpio.c b/sys/arm/allwinner/aw_gpio.c index 18b47bab12d9..2061e38a155f 100644 --- a/sys/arm/allwinner/aw_gpio.c +++ b/sys/arm/allwinner/aw_gpio.c @@ -1154,10 +1154,6 @@ aw_gpio_attach(device_t dev) aw_gpio_register_isrcs(sc); intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev))); - sc->sc_busdev = gpiobus_attach_bus(dev); - if (sc->sc_busdev == NULL) - goto fail; - /* * Register as a pinctrl device */ @@ -1166,6 +1162,10 @@ aw_gpio_attach(device_t dev) fdt_pinctrl_register(dev, "allwinner,pins"); fdt_pinctrl_configure_tree(dev); + sc->sc_busdev = gpiobus_attach_bus(dev); + if (sc->sc_busdev == NULL) + goto fail; + config_intrhook_oneshot(aw_gpio_enable_bank_supply, sc); return (0); diff --git a/sys/arm/allwinner/aw_mmc.c b/sys/arm/allwinner/aw_mmc.c index 6bebf5e5fb5e..a8add957dc74 100644 --- a/sys/arm/allwinner/aw_mmc.c +++ b/sys/arm/allwinner/aw_mmc.c @@ -84,21 +84,26 @@ struct aw_mmc_conf { uint32_t dma_xferlen; + uint32_t dma_desc_shift; bool mask_data0; bool can_calibrate; bool new_timing; + bool zero_is_skip; }; static const struct aw_mmc_conf a10_mmc_conf = { .dma_xferlen = 0x2000, + .dma_desc_shift = 0, }; static const struct aw_mmc_conf a13_mmc_conf = { .dma_xferlen = 0x10000, + .dma_desc_shift = 0, }; static const struct aw_mmc_conf a64_mmc_conf = { .dma_xferlen = 0x10000, + .dma_desc_shift = 0, .mask_data0 = true, .can_calibrate = true, .new_timing = true, @@ -106,13 +111,24 @@ static const struct aw_mmc_conf a64_mmc_conf = { static const struct aw_mmc_conf a64_emmc_conf = { .dma_xferlen = 0x2000, + .dma_desc_shift = 0, .can_calibrate = true, }; +static const struct aw_mmc_conf d1_mmc_conf = { + .dma_xferlen = 0x1000, + .dma_desc_shift = 2, + .mask_data0 = true, + .can_calibrate = true, + .new_timing = true, + .zero_is_skip = true, +}; + static struct ofw_compat_data compat_data[] = { {"allwinner,sun4i-a10-mmc", (uintptr_t)&a10_mmc_conf}, {"allwinner,sun5i-a13-mmc", (uintptr_t)&a13_mmc_conf}, {"allwinner,sun7i-a20-mmc", (uintptr_t)&a13_mmc_conf}, + {"allwinner,sun20i-d1-mmc", (uintptr_t)&d1_mmc_conf}, {"allwinner,sun50i-a64-mmc", (uintptr_t)&a64_mmc_conf}, {"allwinner,sun50i-a64-emmc", (uintptr_t)&a64_emmc_conf}, {NULL, 0} @@ -607,16 +623,18 @@ aw_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err) dma_desc = sc->aw_dma_desc; for (i = 0; i < nsegs; i++) { - if (segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) + if ((segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) && + !sc->aw_mmc_conf->zero_is_skip) dma_desc[i].buf_size = 0; /* Size of 0 indicate max len */ else dma_desc[i].buf_size = segs[i].ds_len; - dma_desc[i].buf_addr = segs[i].ds_addr; + dma_desc[i].buf_addr = segs[i].ds_addr >> + sc->aw_mmc_conf->dma_desc_shift; dma_desc[i].config = AW_MMC_DMA_CONFIG_CH | - AW_MMC_DMA_CONFIG_OWN | 
AW_MMC_DMA_CONFIG_DIC; - - dma_desc[i].next = sc->aw_dma_desc_phys + - ((i + 1) * sizeof(struct aw_mmc_dma_desc)); + AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC; + dma_desc[i].next = (sc->aw_dma_desc_phys + + (i + 1) * sizeof(struct aw_mmc_dma_desc)) >> + sc->aw_mmc_conf->dma_desc_shift; } dma_desc[0].config |= AW_MMC_DMA_CONFIG_FD; @@ -678,7 +696,8 @@ aw_mmc_prepare_dma(struct aw_mmc_softc *sc) AW_MMC_WRITE_4(sc, AW_MMC_IDIE, val); /* Set DMA descriptor list address */ - AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys); + AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys >> + sc->aw_mmc_conf->dma_desc_shift); /* FIFO trigger level */ AW_MMC_WRITE_4(sc, AW_MMC_FWLR, AW_MMC_DMA_FTRGLEVEL); diff --git a/sys/arm/allwinner/aw_rtc.c b/sys/arm/allwinner/aw_rtc.c index 9938601f17ce..4af57ab879e8 100644 --- a/sys/arm/allwinner/aw_rtc.c +++ b/sys/arm/allwinner/aw_rtc.c @@ -134,6 +134,7 @@ static struct ofw_compat_data compat_data[] = { { "allwinner,sun7i-a20-rtc", (uintptr_t) &a20_conf }, { "allwinner,sun6i-a31-rtc", (uintptr_t) &a31_conf }, { "allwinner,sun8i-h3-rtc", (uintptr_t) &h3_conf }, + { "allwinner,sun20i-d1-rtc", (uintptr_t) &h3_conf }, { "allwinner,sun50i-h5-rtc", (uintptr_t) &h3_conf }, { "allwinner,sun50i-h6-rtc", (uintptr_t) &h3_conf }, { NULL, 0 } @@ -147,11 +148,13 @@ struct aw_rtc_softc { static struct clk_fixed_def aw_rtc_osc32k = { .clkdef.id = 0, + .clkdef.name = "osc32k", .freq = 32768, }; static struct clk_fixed_def aw_rtc_iosc = { .clkdef.id = 2, + .clkdef.name = "iosc", }; static void aw_rtc_install_clocks(struct aw_rtc_softc *sc, device_t dev); @@ -250,23 +253,33 @@ aw_rtc_install_clocks(struct aw_rtc_softc *sc, device_t dev) { int nclocks; node = ofw_bus_get_node(dev); - nclocks = ofw_bus_string_list_to_array(node, "clock-output-names", &clknames); - /* No clocks to export */ - if (nclocks <= 0) - return; - if (nclocks != 3) { - device_printf(dev, "Having only %d clocks instead of 3, aborting\n", nclocks); + /* Nothing to do. */ + if (!OF_hasprop(node, "clocks")) return; + + /* + * If the device tree gives us specific output names for the clocks, + * use them. + */ + nclocks = ofw_bus_string_list_to_array(node, "clock-output-names", &clknames); + if (nclocks > 0) { + if (nclocks != 3) { + device_printf(dev, + "Found %d clock names instead of 3, aborting\n", + nclocks); + return; + } + + aw_rtc_osc32k.clkdef.name = clknames[0]; + aw_rtc_iosc.clkdef.name = clknames[2]; } clkdom = clkdom_create(dev); - aw_rtc_osc32k.clkdef.name = clknames[0]; if (clknode_fixed_register(clkdom, &aw_rtc_osc32k) != 0) device_printf(dev, "Cannot register osc32k clock\n"); - aw_rtc_iosc.clkdef.name = clknames[2]; aw_rtc_iosc.freq = sc->conf->iosc_freq; if (clknode_fixed_register(clkdom, &aw_rtc_iosc) != 0) device_printf(dev, "Cannot register iosc clock\n"); diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 92eb0589f80b..78883296c5b7 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -5767,7 +5767,7 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); - if ((m->flags & PG_FICTITIOUS) != 0) + if (ma == oma || (m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* @@ -5784,22 +5784,20 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) * If page is not mapped by sf buffer, map the page * transient and do invalidation.
*/ - if (ma != oma) { - pa = VM_PAGE_TO_PHYS(m); - sched_pin(); - pc = get_pcpu(); - cmap2_pte2p = pc->pc_cmap2_pte2p; - mtx_lock(&pc->pc_cmap_lock); - if (pte2_load(cmap2_pte2p) != 0) - panic("%s: CMAP2 busy", __func__); - pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, - vm_memattr_to_pte2(ma))); - dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); - pte2_clear(cmap2_pte2p); - tlb_flush((vm_offset_t)pc->pc_cmap2_addr); - sched_unpin(); - mtx_unlock(&pc->pc_cmap_lock); - } + pa = VM_PAGE_TO_PHYS(m); + sched_pin(); + pc = get_pcpu(); + cmap2_pte2p = pc->pc_cmap2_pte2p; + mtx_lock(&pc->pc_cmap_lock); + if (pte2_load(cmap2_pte2p) != 0) + panic("%s: CMAP2 busy", __func__); + pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, + vm_memattr_to_pte2(ma))); + dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); + pte2_clear(cmap2_pte2p); + tlb_flush((vm_offset_t)pc->pc_cmap2_addr); + sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } /* diff --git a/sys/arm/broadcom/bcm2835/bcm2835_gpio.c b/sys/arm/broadcom/bcm2835/bcm2835_gpio.c index e4fc57b79ba5..48d1d2af5abc 100644 --- a/sys/arm/broadcom/bcm2835/bcm2835_gpio.c +++ b/sys/arm/broadcom/bcm2835/bcm2835_gpio.c @@ -837,12 +837,12 @@ bcm_gpio_attach(device_t dev) } sc->sc_gpio_npins = i; bcm_gpio_sysctl_init(sc); - sc->sc_busdev = gpiobus_attach_bus(dev); - if (sc->sc_busdev == NULL) - goto fail; fdt_pinctrl_register(dev, "brcm,pins"); fdt_pinctrl_configure_tree(dev); + sc->sc_busdev = gpiobus_attach_bus(dev); + if (sc->sc_busdev == NULL) + goto fail; return (0); diff --git a/sys/arm/mv/mvebu_gpio.c b/sys/arm/mv/mvebu_gpio.c index 681cf20f7f9f..7acdfff539dc 100644 --- a/sys/arm/mv/mvebu_gpio.c +++ b/sys/arm/mv/mvebu_gpio.c @@ -810,7 +810,6 @@ mvebu_gpio_attach(device_t dev) return (ENXIO); } - bus_attach_children(dev); return (0); } diff --git a/sys/arm/nvidia/as3722_gpio.c b/sys/arm/nvidia/as3722_gpio.c index 073d057884c9..f7b3d4d43bab 100644 --- a/sys/arm/nvidia/as3722_gpio.c +++ b/sys/arm/nvidia/as3722_gpio.c @@ -544,7 +544,7 @@ as3722_gpio_attach(struct as3722_softc *sc, phandle_t node) sc->gpio_pins = malloc(sizeof(struct as3722_gpio_pin *) * sc->gpio_npins, M_AS3722_GPIO, M_WAITOK | M_ZERO); - sc->gpio_busdev = gpiobus_attach_bus(sc->dev); + sc->gpio_busdev = gpiobus_add_bus(sc->dev); if (sc->gpio_busdev == NULL) return (ENXIO); for (i = 0; i < sc->gpio_npins; i++) { diff --git a/sys/arm/nvidia/tegra_gpio.c b/sys/arm/nvidia/tegra_gpio.c index 16e1ef94d6a9..e37fd69a121e 100644 --- a/sys/arm/nvidia/tegra_gpio.c +++ b/sys/arm/nvidia/tegra_gpio.c @@ -824,7 +824,6 @@ tegra_gpio_attach(device_t dev) return (ENXIO); } - bus_attach_children(dev); return (0); } diff --git a/sys/arm64/apple/apple_pinctrl.c b/sys/arm64/apple/apple_pinctrl.c index ec2dd5907024..ebaaccea1d99 100644 --- a/sys/arm64/apple/apple_pinctrl.c +++ b/sys/arm64/apple/apple_pinctrl.c @@ -161,22 +161,22 @@ apple_pinctrl_attach(device_t dev) goto error; } + fdt_pinctrl_register(dev, "pinmux"); + fdt_pinctrl_configure_tree(dev); + + if (OF_hasprop(node, "interrupt-controller")) { + sc->sc_irqs = mallocarray(sc->sc_ngpios, + sizeof(*sc->sc_irqs), M_DEVBUF, M_ZERO | M_WAITOK); + intr_pic_register(dev, + OF_xref_from_node(ofw_bus_get_node(dev))); + } + sc->sc_busdev = gpiobus_attach_bus(dev); if (sc->sc_busdev == NULL) { device_printf(dev, "failed to attach gpiobus\n"); goto error; } - fdt_pinctrl_register(dev, "pinmux"); - fdt_pinctrl_configure_tree(dev); - - if (!OF_hasprop(node, "interrupt-controller")) - return (0); - - sc->sc_irqs = 
mallocarray(sc->sc_ngpios, - sizeof(*sc->sc_irqs), M_DEVBUF, M_ZERO | M_WAITOK); - intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev))); - return (0); error: mtx_destroy(&sc->sc_mtx); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index d2e56a270f54..459cc8ebe505 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -497,7 +497,8 @@ static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, - pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); + pd_entry_t l1e, bool demote_kl2e, struct spglist *free, + struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, @@ -3847,8 +3848,7 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); - if (ml3 == NULL) - panic("pmap_remove_kernel_l2: Missing pt page"); + KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page")); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; @@ -3873,8 +3873,8 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int -pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, - pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) +pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, + bool demote_kl2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; @@ -3910,9 +3910,7 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, vm_page_aflag_clear(mt, PGA_WRITEABLE); } } - if (pmap == kernel_pmap) { - pmap_remove_kernel_l2(pmap, l2, sva); - } else { + if (pmap != kernel_pmap) { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(vm_page_any_valid(ml3), @@ -3923,6 +3921,14 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, free, false); } + } else if (demote_kl2e) { + pmap_remove_kernel_l2(pmap, l2, sva); + } else { + ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva)); + if (vm_page_any_valid(ml3)) { + ml3->valid = 0; + pmap_zero_page(ml3); + } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } @@ -4232,7 +4238,7 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), - &free, &lock); + true, &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) @@ -5703,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); + KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) != + PMAP_ENTER_NORECLAIM, + ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE")); if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 
NULL : lockp)) == NULL) { @@ -5747,33 +5756,51 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, } } SLIST_INIT(&free); - if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) + if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { (void)pmap_remove_l2(pmap, l2, va, - pmap_load(pmap_l1(pmap, va)), &free, lockp); - else + pmap_load(pmap_l1(pmap, va)), false, &free, lockp); + } else { + if (ADDR_IS_KERNEL(va)) { + /* + * Try to save the ptp in the trie + * before any changes to mappings are + * made. Abort on failure. + */ + mt = PTE_TO_VM_PAGE(old_l2); + if (pmap_insert_pt_page(pmap, mt, false, + false)) { + CTR1(KTR_PMAP, + "pmap_enter_l2: cannot ins kern ptp va %#lx", + va); + return (KERN_RESOURCE_SHORTAGE); + } + /* + * Both pmap_remove_l2() and + * pmap_remove_l3_range() will zero fill + * the L3 kernel page table page. + */ + } pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); + if (ADDR_IS_KERNEL(va)) { + /* + * The TLB could have an intermediate + * entry for the L3 kernel page table + * page, so request an invalidation at + * all levels after clearing the + * L2_TABLE entry. + */ + pmap_clear(l2); + pmap_s1_invalidate_page(pmap, va, false); + } + } + KASSERT(pmap_load(l2) == 0, + ("pmap_enter_l2: non-zero L2 entry %p", l2)); if (!ADDR_IS_KERNEL(va)) { vm_page_free_pages_toq(&free, true); - KASSERT(pmap_load(l2) == 0, - ("pmap_enter_l2: non-zero L2 entry %p", l2)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_l2: freed kernel page table page")); - - /* - * Both pmap_remove_l2() and pmap_remove_l3_range() - * will leave the kernel page table page zero filled. - * Nonetheless, the TLB could have an intermediate - * entry for the kernel page table page, so request - * an invalidation at all levels after clearing - * the L2_TABLE entry. - */ - mt = PTE_TO_VM_PAGE(pmap_load(l2)); - if (pmap_insert_pt_page(pmap, mt, false, false)) - panic("pmap_enter_l2: trie insert failed"); - pmap_clear(l2); - pmap_s1_invalidate_page(pmap, va, false); } } @@ -5804,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { if (l2pg != NULL) pmap_abort_ptp(pmap, va, l2pg); + else { + KASSERT(ADDR_IS_KERNEL(va) && + (pmap_load(l2) & ATTR_DESCR_MASK) == + L2_TABLE, + ("pmap_enter_l2: invalid kernel L2E")); + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt != NULL, + ("pmap_enter_l2: missing kernel PTP")); + } if (uwptpg != NULL) { mt = pmap_remove_pt_page(pmap, va); KASSERT(mt == uwptpg, @@ -8045,6 +8081,8 @@ pmap_unmapbios(void *p, vm_size_t size) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pv_memattr == ma) + return; m->md.pv_memattr = ma; @@ -8424,8 +8462,8 @@ pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct spglist free; SLIST_INIT(&free); - (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, - lockp); + (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true, + &free, lockp); vm_page_free_pages_toq(&free, true); } diff --git a/sys/arm64/broadcom/genet/if_genet.c b/sys/arm64/broadcom/genet/if_genet.c index 0602f076b257..182b5582fb7c 100644 --- a/sys/arm64/broadcom/genet/if_genet.c +++ b/sys/arm64/broadcom/genet/if_genet.c @@ -349,7 +349,7 @@ gen_attach(device_t dev) } /* If address was not found, create one based on the hostid and name. 
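The pmap_enter_l2() rework above follows a reserve-then-mutate discipline: the one step that can fail, saving the kernel page-table page in the trie via pmap_insert_pt_page(), happens before any existing mapping is torn down, so on failure the function can return KERN_RESOURCE_SHORTAGE with nothing to roll back. A schematic of that ordering, with hypothetical helper names, not the pmap code itself:

#include <errno.h>

struct table;
struct entry;

/* Hypothetical stand-ins for the trie insert and the teardown steps. */
extern int  reserve_slot(struct table *, struct entry *);
extern void tear_down(struct entry *);
extern void install(struct table *, struct entry *);

int
replace_mapping(struct table *t, struct entry *old, struct entry *new)
{
        /* 1. Reserve the only resource that can fail... */
        if (reserve_slot(t, old) != 0)
                return (ENOMEM);        /* ...so failure needs no rollback. */

        /* 2. Destructive steps run only after the reservation succeeded. */
        tear_down(old);
        install(t, new);
        return (0);
}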
*/ - if (eaddr_found == 0) + if (!eaddr_found) ether_gen_addr(sc->ifp, &eaddr); /* Attach ethernet interface */ ether_ifattach(sc->ifp, eaddr.octet); @@ -653,7 +653,7 @@ gen_bus_dma_teardown(struct gen_softc *sc) error); } - if (sc->tx_buf_tag != NULL) { + if (sc->rx_buf_tag != NULL) { for (i = 0; i < RX_DESC_COUNT; i++) { error = bus_dmamap_destroy(sc->rx_buf_tag, sc->rx_ring_ent[i].map); diff --git a/sys/arm64/linux/linux_proto.h b/sys/arm64/linux/linux_proto.h index ae3d8569df58..82f57f77ffae 100644 --- a/sys/arm64/linux/linux_proto.h +++ b/sys/arm64/linux/linux_proto.h @@ -141,10 +141,13 @@ struct linux_inotify_init1_args { char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; }; struct linux_inotify_add_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)]; + char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)]; }; struct linux_inotify_rm_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)]; }; struct linux_ioctl_args { char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)]; diff --git a/sys/arm64/linux/linux_sysent.c b/sys/arm64/linux/linux_sysent.c index 722ada465730..e54a76cfd55e 100644 --- a/sys/arm64/linux/linux_sysent.c +++ b/sys/arm64/linux/linux_sysent.c @@ -41,8 +41,8 @@ struct sysent linux_sysent[] = { { .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 24 = linux_dup3 */ { .sy_narg = AS(linux_fcntl_args), .sy_call = (sy_call_t *)linux_fcntl, .sy_auevent = AUE_FCNTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 25 = linux_fcntl */ { .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 26 = linux_inotify_init1 */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 27 = linux_inotify_add_watch */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 28 = linux_inotify_rm_watch */ + { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 27 = linux_inotify_add_watch */ + { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 28 = linux_inotify_rm_watch */ { .sy_narg = AS(linux_ioctl_args), .sy_call = (sy_call_t *)linux_ioctl, .sy_auevent = AUE_IOCTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 29 = linux_ioctl */ { .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 30 = linux_ioprio_set */ { .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 31 = linux_ioprio_get */ diff --git a/sys/arm64/linux/linux_systrace_args.c b/sys/arm64/linux/linux_systrace_args.c index 54e4dd82355d..1b946a9406a5 100644 --- a/sys/arm64/linux/linux_systrace_args.c +++ 
b/sys/arm64/linux/linux_systrace_args.c @@ -210,12 +210,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_add_watch */ case 27: { - *n_args = 0; + struct linux_inotify_add_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = (intptr_t)p->pathname; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 3; break; } /* linux_inotify_rm_watch */ case 28: { - *n_args = 0; + struct linux_inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = p->wd; /* uint32_t */ + *n_args = 2; break; } /* linux_ioctl */ @@ -2780,9 +2787,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_add_watch */ case 27: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_inotify_rm_watch */ case 28: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_ioctl */ case 29: @@ -6455,8 +6485,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_add_watch */ case 27: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_inotify_rm_watch */ case 28: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_ioctl */ case 29: if (ndx == 0 || ndx == 1) diff --git a/sys/arm64/linux/syscalls.master b/sys/arm64/linux/syscalls.master index 79c04c398e00..2babdcaf03bf 100644 --- a/sys/arm64/linux/syscalls.master +++ b/sys/arm64/linux/syscalls.master @@ -170,10 +170,17 @@ ); } 27 AUE_NULL STD { - int linux_inotify_add_watch(void); + int linux_inotify_add_watch( + l_int fd, + const char *pathname, + uint32_t mask + ); } 28 AUE_NULL STD { - int linux_inotify_rm_watch(void); + int linux_inotify_rm_watch( + l_int fd, + uint32_t wd + ); } 29 AUE_IOCTL STD { int linux_ioctl( diff --git a/sys/arm64/nvidia/tegra210/max77620_gpio.c b/sys/arm64/nvidia/tegra210/max77620_gpio.c index 8dcf98099dac..5d91e23324c7 100644 --- a/sys/arm64/nvidia/tegra210/max77620_gpio.c +++ b/sys/arm64/nvidia/tegra210/max77620_gpio.c @@ -672,7 +672,7 @@ max77620_gpio_attach(struct max77620_softc *sc, phandle_t node) sx_init(&sc->gpio_lock, "MAX77620 GPIO lock"); - sc->gpio_busdev = gpiobus_attach_bus(sc->dev); + sc->gpio_busdev = gpiobus_add_bus(sc->dev); if (sc->gpio_busdev == NULL) return (ENXIO); diff --git a/sys/arm64/rockchip/rk_gpio.c b/sys/arm64/rockchip/rk_gpio.c index a86392f16624..847bc7394dd0 100644 --- a/sys/arm64/rockchip/rk_gpio.c +++ b/sys/arm64/rockchip/rk_gpio.c @@ -362,12 +362,6 @@ rk_gpio_attach(device_t dev) return (ENXIO); } - sc->sc_busdev = gpiobus_attach_bus(dev); - if (sc->sc_busdev == NULL) { - rk_gpio_detach(dev); - return (ENXIO); - } - /* Set the cached value to unknown */ for (i = 0; i < RK_GPIO_MAX_PINS; i++) sc->pin_cached[i].is_gpio = 2; @@ -377,6 +371,12 @@ rk_gpio_attach(device_t dev) sc->swporta_ddr = rk_gpio_read_4(sc, RK_GPIO_SWPORTA_DDR); RK_GPIO_UNLOCK(sc); + sc->sc_busdev = gpiobus_attach_bus(dev); + if (sc->sc_busdev == NULL) { + rk_gpio_detach(dev); + return (ENXIO); + } + return (0); } diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h index 0f110d5f9ddd..9381396f247c 100644 --- a/sys/bsm/audit_kevents.h +++ b/sys/bsm/audit_kevents.h @@ -663,6 +663,7 @@ #define AUE_FSPACECTL 43269 /* FreeBSD-specific. */ #define AUE_TIMERFD 43270 /* FreeBSD/Linux. */ #define AUE_SETCRED 43271 /* FreeBSD-specific. 
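With the argument structures, sysent entries, and syscalls.master signatures above (plus the AUE_INOTIFY audit event just below), the arm64 Linux ABI now routes inotify(7) calls to the native FreeBSD implementation. A minimal Linux-side consumer, using only the standard Linux API (nothing here is FreeBSD-specific), exercises all three entry points:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/inotify.h>

int
main(void)
{
        char buf[4096];
        ssize_t n;
        int fd, wd;

        fd = inotify_init1(IN_CLOEXEC);         /* -> linux_inotify_init1() */
        if (fd == -1) {
                perror("inotify_init1");
                return (1);
        }
        wd = inotify_add_watch(fd, "/tmp",      /* -> linux_inotify_add_watch() */
            IN_CREATE | IN_DELETE);
        if (wd == -1) {
                perror("inotify_add_watch");
                return (1);
        }
        n = read(fd, buf, sizeof(buf));         /* the event ABI is shared, so
                                                   no translation is needed */
        if (n > 0)
                printf("read %zd bytes of events\n", n);
        (void)inotify_rm_watch(fd, wd);         /* -> linux_inotify_rm_watch() */
        close(fd);
        return (0);
}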
*/ +#define AUE_INOTIFY 43272 /* FreeBSD/Linux. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index ae7cf14c8f8e..1facab47473c 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -1359,10 +1359,7 @@ adaasync(void *callback_arg, uint32_t code, case AC_GETDEV_CHANGED: { softc = (struct ada_softc *)periph->softc; - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, periph->path); /* * Update our information based on the new Identify data. diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index 833df6cfb99b..730656684e2a 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -767,27 +767,28 @@ camperiphfree(struct cam_periph *periph) CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n")); if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) { - union ccb ccb; - void *arg; - - memset(&ccb, 0, sizeof(ccb)); switch (periph->deferred_ac) { - case AC_FOUND_DEVICE: - ccb.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL); - xpt_action(&ccb); - arg = &ccb; + case AC_FOUND_DEVICE: { + struct ccb_getdev cgd; + + xpt_gdev_type(&cgd, periph->path); + periph->deferred_callback(NULL, periph->deferred_ac, + periph->path, &cgd); break; - case AC_PATH_REGISTERED: - xpt_path_inq(&ccb.cpi, periph->path); - arg = &ccb; + } + case AC_PATH_REGISTERED: { + struct ccb_pathinq cpi; + + xpt_path_inq(&cpi, periph->path); + periph->deferred_callback(NULL, periph->deferred_ac, + periph->path, &cpi); break; + } default: - arg = NULL; + periph->deferred_callback(NULL, periph->deferred_ac, + periph->path, NULL); break; } - periph->deferred_callback(NULL, periph->deferred_ac, - periph->path, arg); } xpt_free_path(periph->path); free(periph, M_CAMPERIPH); @@ -1682,10 +1683,7 @@ camperiphscsisenseerror(union ccb *ccb, union ccb **orig, /* * Grab the inquiry data for this device. 
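The camperiphfree() restructuring above trades one stack-allocated union ccb, which is sized for the largest CCB in the union, for a per-case local of exactly the type each deferred callback needs, at the cost of repeating the callback invocation in each branch. The shape of the transformation, reduced to toy types (the real ones live in cam_ccb.h):

/* Toy stand-ins; sizes chosen only to make the point about stack use. */
struct ccb_getdev  { char body[256]; };
struct ccb_pathinq { char body[320]; };

extern void fill_getdev(struct ccb_getdev *);
extern void fill_pathinq(struct ccb_pathinq *);
extern void deliver(void *arg);

enum deferred_ac { FOUND_DEVICE, PATH_REGISTERED, OTHER };

static void
deliver_deferred(enum deferred_ac ac)
{
        switch (ac) {
        case FOUND_DEVICE: {
                struct ccb_getdev cgd;          /* only this case pays for it */
                fill_getdev(&cgd);
                deliver(&cgd);
                break;
        }
        case PATH_REGISTERED: {
                struct ccb_pathinq cpi;
                fill_pathinq(&cpi);
                deliver(&cpi);
                break;
        }
        default:
                deliver(NULL);                  /* no payload for other codes */
                break;
        }
}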
*/ - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, ccb->ccb_h.path); err_action = scsi_error_action(&ccb->csio, &cgd.inq_data, sense_flags); @@ -2133,11 +2131,7 @@ cam_periph_devctl_notify(union ccb *ccb) sbuf_cat(&sb, "serial=\""); if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) { - xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path, - CAM_PRIORITY_NORMAL); - cgd->ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)cgd); - + xpt_gdev_type(cgd, ccb->ccb_h.path); if (cgd->ccb_h.status == CAM_REQ_CMP) sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len); xpt_free_ccb((union ccb *)cgd); diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c index 38bc82c69aad..cae29226d13c 100644 --- a/sys/cam/cam_xpt.c +++ b/sys/cam/cam_xpt.c @@ -2471,15 +2471,12 @@ xptsetasyncfunc(struct cam_ed *device, void *arg) if ((device->flags & CAM_DEV_UNCONFIGURED) != 0) return (1); - memset(&cgd, 0, sizeof(cgd)); xpt_compile_path(&path, NULL, device->target->bus->path_id, device->target->target_id, device->lun_id); - xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, &path); csa->callback(csa->callback_arg, AC_FOUND_DEVICE, &path, &cgd); @@ -2518,6 +2515,15 @@ xpt_action(union ccb *start_ccb) ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code))); + /* + * Either it isn't queued, or it has a real priority. There are still + * too many places that reuse CCBs with a real priority for immediate + * queries to assert the other side of this. + */ + KASSERT((start_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 || + start_ccb->ccb_h.pinfo.priority != CAM_PRIORITY_NONE, + ("%s: queued ccb and CAM_PRIORITY_NONE illegal.", __func__)); + start_ccb->ccb_h.status = CAM_REQ_INPROG; (*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb); } diff --git a/sys/cam/cam_xpt.h b/sys/cam/cam_xpt.h index 06ef52580120..efa6c823245a 100644 --- a/sys/cam/cam_xpt.h +++ b/sys/cam/cam_xpt.h @@ -145,19 +145,31 @@ uint32_t xpt_poll_setup(union ccb *start_ccb); void xpt_sim_poll(struct cam_sim *sim); /* - * Perform a path inquiry at the request priority. The bzero may be - * unnecessary. + * Perform a path inquiry. bzero may be redundant for allocated CCBs, but for + * the on-stack CCBs it's required. */ static inline void xpt_path_inq(struct ccb_pathinq *cpi, struct cam_path *path) { bzero(cpi, sizeof(*cpi)); - xpt_setup_ccb(&cpi->ccb_h, path, CAM_PRIORITY_NORMAL); + xpt_setup_ccb(&cpi->ccb_h, path, CAM_PRIORITY_NONE); cpi->ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)cpi); } +/* + * Perform a get device type query. bzero may be redundant for allocated CCBs, + * but for the on-stack CCBs it's required.
+ */ +static inline void +xpt_gdev_type(struct ccb_getdev *cgd, struct cam_path *path) +{ + bzero(cgd, sizeof(*cgd)); + xpt_setup_ccb(&cgd->ccb_h, path, CAM_PRIORITY_NONE); + cgd->ccb_h.func_code = XPT_GDEV_TYPE; + xpt_action((union ccb *)cgd); +} + #endif /* _KERNEL */ #endif /* _CAM_CAM_XPT_H */ diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c index 1c455e1951d7..322141a72707 100644 --- a/sys/cam/mmc/mmc_da.c +++ b/sys/cam/mmc/mmc_da.c @@ -692,10 +692,7 @@ sddaasync(void *callback_arg, uint32_t code, case AC_GETDEV_CHANGED: { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> AC_GETDEV_CHANGED\n")); - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, periph->path); cam_periph_async(periph, code, path, arg); break; } @@ -789,7 +786,8 @@ sddaregister(struct cam_periph *periph, void *arg) static int mmc_exec_app_cmd(struct cam_periph *periph, union ccb *ccb, - struct mmc_command *cmd) { + struct mmc_command *cmd) +{ int err; /* Send APP_CMD first */ @@ -843,7 +841,8 @@ mmc_exec_app_cmd(struct cam_periph *periph, union ccb *ccb, } static int -mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) { +mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) +{ int err; struct mmc_command cmd; struct mmc_data d; @@ -869,7 +868,8 @@ mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) { static int mmc_send_ext_csd(struct cam_periph *periph, union ccb *ccb, - uint8_t *rawextcsd, size_t buf_len) { + uint8_t *rawextcsd, size_t buf_len) +{ int err; struct mmc_data d; @@ -966,14 +966,16 @@ mmc_switch(struct cam_periph *periph, union ccb *ccb, } static uint32_t -mmc_get_spec_vers(struct cam_periph *periph) { +mmc_get_spec_vers(struct cam_periph *periph) +{ struct sdda_softc *softc = (struct sdda_softc *)periph->softc; return (softc->csd.spec_vers); } static uint64_t -mmc_get_media_size(struct cam_periph *periph) { +mmc_get_media_size(struct cam_periph *periph) +{ struct sdda_softc *softc = (struct sdda_softc *)periph->softc; return (softc->mediasize); @@ -992,7 +994,8 @@ mmc_get_cmd6_timeout(struct cam_periph *periph) static int mmc_sd_switch(struct cam_periph *periph, union ccb *ccb, uint8_t mode, uint8_t grp, uint8_t value, - uint8_t *res) { + uint8_t *res) +{ struct mmc_data mmc_d; uint32_t arg; int err; @@ -1069,7 +1072,8 @@ mmc_set_timing(struct cam_periph *periph, } static void -sdda_start_init_task(void *context, int pending) { +sdda_start_init_task(void *context, int pending) +{ union ccb *new_ccb; struct cam_periph *periph; @@ -1077,7 +1081,7 @@ sdda_start_init_task(void *context, int pending) { CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n")); new_ccb = xpt_alloc_ccb(); xpt_setup_ccb(&new_ccb->ccb_h, periph->path, - CAM_PRIORITY_NONE); + CAM_PRIORITY_NORMAL); cam_periph_lock(periph); cam_periph_hold(periph, PRIBIO|PCATCH); @@ -1088,7 +1092,8 @@ sdda_start_init_task(void *context, int pending) { } static void -sdda_set_bus_width(struct cam_periph *periph, union ccb *ccb, int width) { +sdda_set_bus_width(struct cam_periph *periph, union ccb *ccb, int width) +{ struct sdda_softc *softc = (struct sdda_softc *)periph->softc; struct mmc_params *mmcp = &periph->path->device->mmc_ident_data; int err; @@ -1198,27 +1203,6 @@ sdda_get_host_caps(struct cam_periph *periph, union ccb *ccb) return (cts->host_caps); } -static uint32_t -sdda_get_max_data(struct cam_periph *periph, union ccb 
*ccb) -{ - struct ccb_trans_settings_mmc *cts; - - cts = &ccb->cts.proto_specific.mmc; - memset(cts, 0, sizeof(struct ccb_trans_settings_mmc)); - - ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS; - ccb->ccb_h.flags = CAM_DIR_NONE; - ccb->ccb_h.retry_count = 0; - ccb->ccb_h.timeout = 100; - ccb->ccb_h.cbfcnp = NULL; - xpt_action(ccb); - - if (ccb->ccb_h.status != CAM_REQ_CMP) - panic("Cannot get host max data"); - KASSERT(cts->host_max_data != 0, ("host_max_data == 0?!")); - return (cts->host_max_data); -} - static void sdda_start_init(void *context, union ccb *start_ccb) { @@ -1544,10 +1528,7 @@ sdda_add_part(struct cam_periph *periph, u_int type, const char *name, bioq_init(&part->bio_queue); - bzero(&cpi, sizeof(cpi)); - xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NONE); - cpi.ccb_h.func_code = XPT_PATH_INQ; - xpt_action((union ccb *)&cpi); + xpt_path_inq(&cpi, periph->path); /* * Register this media as a disk diff --git a/sys/cam/mmc/mmc_xpt.c b/sys/cam/mmc/mmc_xpt.c index 4fce03004994..f5f66f5214a8 100644 --- a/sys/cam/mmc/mmc_xpt.c +++ b/sys/cam/mmc/mmc_xpt.c @@ -610,7 +610,6 @@ mmcprobe_start(struct cam_periph *periph, union ccb *start_ccb) CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_RESET\n")); /* FALLTHROUGH */ case PROBE_IDENTIFY: - xpt_path_inq(&start_ccb->cpi, periph->path); CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_IDENTIFY\n")); init_standard_ccb(start_ccb, XPT_MMC_GET_TRAN_SETTINGS); break; diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c index 13a376ebb6e3..b518f84454ad 100644 --- a/sys/cam/scsi/scsi_all.c +++ b/sys/cam/scsi/scsi_all.c @@ -3708,11 +3708,7 @@ scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio, /* * Get the device information. */ - xpt_setup_ccb(&cgd->ccb_h, - csio->ccb_h.path, - CAM_PRIORITY_NORMAL); - cgd->ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)cgd); + xpt_gdev_type(cgd, csio->ccb_h.path); /* * If the device is unconfigured, just pretend that it is a hard @@ -5144,11 +5140,7 @@ scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, /* * Get the device information. */ - xpt_setup_ccb(&cgd->ccb_h, - csio->ccb_h.path, - CAM_PRIORITY_NORMAL); - cgd->ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)cgd); + xpt_gdev_type(cgd, csio->ccb_h.path); /* * If the device is unconfigured, just pretend that it is a hard diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c index 00a417f65052..e622a96ec77e 100644 --- a/sys/cam/scsi/scsi_cd.c +++ b/sys/cam/scsi/scsi_cd.c @@ -1240,13 +1240,7 @@ cddone(struct cam_periph *periph, union ccb *done_ccb) /*getcount_only*/0); status = done_ccb->ccb_h.status; - - bzero(&cgd, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, - done_ccb->ccb_h.path, - CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, done_ccb->ccb_h.path); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) diff --git a/sys/cam/scsi/scsi_ch.c b/sys/cam/scsi/scsi_ch.c index 89a817c1b488..3da22ba61392 100644 --- a/sys/cam/scsi/scsi_ch.c +++ b/sys/cam/scsi/scsi_ch.c @@ -1705,11 +1705,7 @@ chscsiversion(struct cam_periph *periph) /* * Get the device information. 
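A thread running through these CAM hunks is the priority split for CCBs: immediate CCBs, which xpt_action() executes synchronously and never enqueues, now carry CAM_PRIORITY_NONE (as the xpt_path_inq() and xpt_gdev_type() helpers do), while CCBs that really are queued, such as the one sdda_start_init_task() allocates, must carry a real priority like CAM_PRIORITY_NORMAL, which is what the new KASSERT in xpt_action() enforces. Schematically, assuming the usual CAM headers and a valid path:

static void
priority_examples(struct cam_path *path)
{
        struct ccb_pathinq cpi;
        union ccb *ccb;

        /*
         * Immediate query: executed inline, never queued, so the helper
         * sets CAM_PRIORITY_NONE on the on-stack CCB.
         */
        xpt_path_inq(&cpi, path);

        /*
         * Queued request: must have a real priority, or the new KASSERT
         * fires; this is the bug sdda_start_init_task() fixes above.
         */
        ccb = xpt_alloc_ccb();
        xpt_setup_ccb(&ccb->ccb_h, path, CAM_PRIORITY_NORMAL);
        /* ... set a queued func_code and dispatch it ... */
}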
*/ - xpt_setup_ccb(&cgd->ccb_h, - periph->path, - CAM_PRIORITY_NORMAL); - cgd->ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)cgd); + xpt_gdev_type(cgd, periph->path); if (cgd->ccb_h.status != CAM_REQ_CMP) { xpt_free_ccb((union ccb *)cgd); diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 0a2389cd9b5d..d02750aaacaf 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -5035,11 +5035,7 @@ dadone_proberc(struct cam_periph *periph, union ccb *done_ccb) /*timeout*/0, /*getcount_only*/0); - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path, - CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, done_ccb->ccb_h.path); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) @@ -5077,6 +5073,18 @@ dadone_proberc(struct cam_periph *periph, union ccb *done_ccb) * behind a SATL translation that's fallen into a * terminally fatal state. * + * 4/2 happens on some HGST drives that are quite + * ill. We've already sent the start unit command (for + * which we ignore a 44/0 asc/ascq, which I'm hesitant + * to change since it's so basic and there are other + * error conditions to the START UNIT we should ignore). + * So to require initialization at this point when it + * should be fine implies to me, at least, that we should + * invalidate. Since we do read capacity in geom tasting + * a lot, and since this timeout is long, this leads to + * up to a 10 minute delay in booting. + * + * 4/2: LOGICAL UNIT NOT READY, INITIALIZING COMMAND REQUIRED * 25/0: LOGICAL UNIT NOT SUPPORTED * 44/0: INTERNAL TARGET FAILURE * 44/1: PERSISTENT RESERVATION INFORMATION LOST @@ -5084,6 +5092,7 @@ */ if ((have_sense) && (asc != 0x25) && (asc != 0x44) + && !(asc == 0x04 && ascq == 0x02) && (error_code == SSD_CURRENT_ERROR || error_code == SSD_DESC_CURRENT_ERROR)) { const char *sense_key_desc; diff --git a/sys/cam/scsi/scsi_enc_ses.c b/sys/cam/scsi/scsi_enc_ses.c index c429e820a1fd..435874a9874a 100644 --- a/sys/cam/scsi/scsi_enc_ses.c +++ b/sys/cam/scsi/scsi_enc_ses.c @@ -979,10 +979,7 @@ ses_paths_iter(enc_softc_t *enc, enc_element_t *elm, != CAM_REQ_CMP) return; - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, path, CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, path); if (cam_ccb_success((union ccb *)&cgd)) callback(enc, elm, path, callback_arg); diff --git a/sys/cam/scsi/scsi_sa.c b/sys/cam/scsi/scsi_sa.c index cfd48c98f30e..88147393192f 100644 --- a/sys/cam/scsi/scsi_sa.c +++ b/sys/cam/scsi/scsi_sa.c @@ -4731,12 +4731,7 @@ saextget(struct cdev *dev, struct cam_periph *periph, struct sbuf *sb, SASBADDVARSTR(sb, indent, periph->periph_name, %s, periph_name, strlen(periph->periph_name) + 1); SASBADDUINT(sb, indent, periph->unit_number, %u, unit_number); - memset(&cgd, 0, sizeof(cgd)); - xpt_setup_ccb(&cgd.ccb_h, - periph->path, - CAM_PRIORITY_NORMAL); - cgd.ccb_h.func_code = XPT_GDEV_TYPE; - xpt_action((union ccb *)&cgd); + xpt_gdev_type(&cgd, periph->path); if ((cgd.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { g->status = MT_EXT_GET_ERROR; snprintf(g->error_str, sizeof(g->error_str), diff --git a/sys/cam/scsi/scsi_xpt.c b/sys/cam/scsi/scsi_xpt.c index 439dd2050a95..bef35243af98 100644 --- a/sys/cam/scsi/scsi_xpt.c +++ b/sys/cam/scsi/scsi_xpt.c @@ -1915,6 +1915,15 @@ typedef struct { int lunindex[0]; } scsi_scan_bus_info; +static
void +free_scan_info(scsi_scan_bus_info *scan_info) +{ + KASSERT(scan_info->cpi != NULL, + ("scan_info (%p) missing its ccb_pathinq CCB\n", scan_info)); + xpt_free_ccb((union ccb *)scan_info->cpi); + free(scan_info, M_CAMXPT); +} + /* * To start a scan, request_ccb is an XPT_SCAN_BUS ccb. * As the scan progresses, scsi_scan_bus is used as the @@ -1945,10 +1954,7 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb) xpt_done(request_ccb); return; } - xpt_setup_ccb(&work_ccb->ccb_h, request_ccb->ccb_h.path, - request_ccb->ccb_h.pinfo.priority); - work_ccb->ccb_h.func_code = XPT_PATH_INQ; - xpt_action(work_ccb); + xpt_path_inq(&work_ccb->cpi, request_ccb->ccb_h.path); if (work_ccb->ccb_h.status != CAM_REQ_CMP) { request_ccb->ccb_h.status = work_ccb->ccb_h.status; xpt_free_ccb(work_ccb); @@ -2037,16 +2043,14 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb) printf( "scsi_scan_bus: xpt_create_path failed with status %#x, bus scan halted\n", status); - free(scan_info, M_CAMXPT); + free_scan_info(scan_info); request_ccb->ccb_h.status = status; - xpt_free_ccb(work_ccb); xpt_done(request_ccb); break; } work_ccb = xpt_alloc_ccb_nowait(); if (work_ccb == NULL) { - xpt_free_ccb((union ccb *)scan_info->cpi); - free(scan_info, M_CAMXPT); + free_scan_info(scan_info); xpt_free_path(path); request_ccb->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(request_ccb); @@ -2179,16 +2183,16 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb) * Check to see if we scan any further luns. */ if (next_target) { - int done; + bool done; /* * Free the current request path- we're done with it. */ xpt_free_path(oldpath); hop_again: - done = 0; + done = false; if (scan_info->request_ccb->ccb_h.func_code == XPT_SCAN_TGT) { - done = 1; + done = true; } else if (scan_info->cpi->hba_misc & PIM_SEQSCAN) { scan_info->counter++; if (scan_info->counter == @@ -2197,23 +2201,22 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb) } if (scan_info->counter >= scan_info->cpi->max_target+1) { - done = 1; + done = true; } } else { scan_info->counter--; if (scan_info->counter == 0) { - done = 1; + done = true; } } if (done) { mtx_unlock(mtx); xpt_free_ccb(request_ccb); - xpt_free_ccb((union ccb *)scan_info->cpi); request_ccb = scan_info->request_ccb; CAM_DEBUG(request_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("SCAN done for %p\n", scan_info)); - free(scan_info, M_CAMXPT); + free_scan_info(scan_info); request_ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(request_ccb); break; @@ -2233,9 +2236,8 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb) "scsi_scan_bus: xpt_create_path failed with status %#x, bus scan halted\n", status); xpt_free_ccb(request_ccb); - xpt_free_ccb((union ccb *)scan_info->cpi); request_ccb = scan_info->request_ccb; - free(scan_info, M_CAMXPT); + free_scan_info(scan_info); request_ccb->ccb_h.status = status; xpt_done(request_ccb); break; @@ -2294,10 +2296,7 @@ scsi_scan_lun(struct cam_periph *periph, struct cam_path *path, CAM_DEBUG(path, CAM_DEBUG_TRACE, ("scsi_scan_lun\n")); - memset(&cpi, 0, sizeof(cpi)); - xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE); - cpi.ccb_h.func_code = XPT_PATH_INQ; - xpt_action((union ccb *)&cpi); + xpt_path_inq(&cpi, path); if (cpi.ccb_h.status != CAM_REQ_CMP) { if (request_ccb != NULL) { @@ -2421,10 +2420,7 @@ scsi_devise_transport(struct cam_path *path) struct scsi_inquiry_data *inq_buf; /* Get transport information from the SIM */ - memset(&cpi, 0, sizeof(cpi)); - xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE); - 
cpi.ccb_h.func_code = XPT_PATH_INQ; - xpt_action((union ccb *)&cpi); + xpt_path_inq(&cpi, path); inq_buf = NULL; if ((path->device->flags & CAM_DEV_INQUIRY_DATA_VALID) != 0) @@ -2732,10 +2728,7 @@ scsi_set_transfer_settings(struct ccb_trans_settings *cts, struct cam_path *path inq_data = &device->inq_data; scsi = &cts->proto_specific.scsi; - memset(&cpi, 0, sizeof(cpi)); - xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE); - cpi.ccb_h.func_code = XPT_PATH_INQ; - xpt_action((union ccb *)&cpi); + xpt_path_inq(&cpi, path); /* SCSI specific sanity checking */ if ((cpi.hba_inquiry & PI_TAG_ABLE) == 0 @@ -3046,10 +3039,7 @@ _scsi_announce_periph(struct cam_periph *periph, u_int *speed, u_int *freq, stru return; /* Ask the SIM for its base transfer speed */ - memset(&cpi, 0, sizeof(cpi)); - xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NORMAL); - cpi.ccb_h.func_code = XPT_PATH_INQ; - xpt_action((union ccb *)&cpi); + xpt_path_inq(&cpi, path); /* Report connection speed */ *speed = cpi.base_transfer_speed; diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index 0ce38384abbf..83d964360343 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -2019,6 +2019,7 @@ typedef struct vdev { vdev_list_t v_children; /* children of this vdev */ const char *v_name; /* vdev name */ uint64_t v_guid; /* vdev guid */ + uint64_t v_txg; /* most recent transaction */ uint64_t v_id; /* index in parent */ uint64_t v_psize; /* physical device capacity */ int v_ashift; /* offset to block shift */ @@ -2048,7 +2049,6 @@ typedef struct spa { STAILQ_ENTRY(spa) spa_link; /* link in global pool list */ char *spa_name; /* pool name */ uint64_t spa_guid; /* pool guid */ - uint64_t spa_txg; /* most recent transaction */ struct uberblock *spa_uberblock; /* best uberblock so far */ vdev_t *spa_root_vdev; /* toplevel vdev container */ objset_phys_t *spa_mos; /* MOS for this pool */ diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index eaa086188b5f..8d2748098c00 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -511,4 +511,6 @@ #define FREEBSD32_SYS_fchroot 590 #define FREEBSD32_SYS_freebsd32_setcred 591 #define FREEBSD32_SYS_exterrctl 592 -#define FREEBSD32_SYS_MAXSYSCALL 593 +#define FREEBSD32_SYS_inotify_add_watch_at 593 +#define FREEBSD32_SYS_inotify_rm_watch 594 +#define FREEBSD32_SYS_MAXSYSCALL 595 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index 989f32a5c6f0..bda373268cc5 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -598,4 +598,6 @@ const char *freebsd32_syscallnames[] = { "fchroot", /* 590 = fchroot */ "freebsd32_setcred", /* 591 = freebsd32_setcred */ "exterrctl", /* 592 = exterrctl */ + "inotify_add_watch_at", /* 593 = inotify_add_watch_at */ + "inotify_rm_watch", /* 594 = inotify_rm_watch */ }; diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 476fe2ac3f80..ef0aff8bf852 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -659,5 +659,7 @@ struct sysent freebsd32_sysent[] = { { .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */ { .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, 
.sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */ { .sy_narg = AS(freebsd32_setcred_args), .sy_call = (sy_call_t *)freebsd32_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = freebsd32_setcred */ - { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */ + { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ }; diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c index cf08938cd5de..37564a737a62 100644 --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3395,6 +3395,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } + /* inotify_add_watch_at */ + case 593: { + struct inotify_add_watch_at_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] = p->dfd; /* int */ + uarg[a++] = (intptr_t)p->path; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 4; + break; + } + /* inotify_rm_watch */ + case 594: { + struct inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] = p->wd; /* int */ + *n_args = 2; + break; + } default: *n_args = 0; break; @@ -9172,6 +9190,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* inotify_add_watch_at */ + case 593: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + case 2: + p = "userland const char *"; + break; + case 3: + p = "uint32_t"; + break; + default: + break; + }; + break; + /* inotify_rm_watch */ + case 594: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11070,6 +11120,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* inotify_add_watch_at */ + case 593: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* inotify_rm_watch */ + case 594: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/compat/linux/linux_dummy.c b/sys/compat/linux/linux_dummy.c index 35d6debe0da9..19cd55849f65 100644 --- a/sys/compat/linux/linux_dummy.c +++ b/sys/compat/linux/linux_dummy.c @@ -74,9 +74,6 @@ DUMMY(kexec_load); DUMMY(add_key); DUMMY(request_key); DUMMY(keyctl); -/* Linux 2.6.13: */ -DUMMY(inotify_add_watch); -DUMMY(inotify_rm_watch); /* Linux 2.6.16: */ DUMMY(migrate_pages); DUMMY(unshare); @@ -87,7 +84,6 @@ DUMMY(vmsplice); DUMMY(move_pages); /* Linux 2.6.27: */ DUMMY(signalfd4); -DUMMY(inotify_init1); /* Linux 2.6.31: */ DUMMY(perf_event_open); /* Linux 2.6.36: */ diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c index 246bc26d85d4..86834a7ecea8 100644 --- a/sys/compat/linux/linux_file.c +++ b/sys/compat/linux/linux_file.c @@ -32,11 +32,13 @@ #include <sys/fcntl.h> #include 
<sys/file.h> #include <sys/filedesc.h> +#include <sys/inotify.h> #include <sys/lock.h> #include <sys/mman.h> #include <sys/selinfo.h> #include <sys/pipe.h> #include <sys/proc.h> +#include <sys/specialfd.h> #include <sys/stat.h> #include <sys/sx.h> #include <sys/syscallsubr.h> @@ -1877,3 +1879,122 @@ linux_writev(struct thread *td, struct linux_writev_args *args) freeuio(auio); return (linux_enobufs2eagain(td, args->fd, error)); } + +static int +linux_inotify_init_flags(int l_flags) +{ + int bsd_flags; + + if ((l_flags & ~(LINUX_IN_CLOEXEC | LINUX_IN_NONBLOCK)) != 0) + linux_msg(NULL, "inotify_init1 unsupported flags 0x%x", + l_flags); + + bsd_flags = 0; + if ((l_flags & LINUX_IN_CLOEXEC) != 0) + bsd_flags |= O_CLOEXEC; + if ((l_flags & LINUX_IN_NONBLOCK) != 0) + bsd_flags |= O_NONBLOCK; + return (bsd_flags); +} + +static int +inotify_init_common(struct thread *td, int flags) +{ + struct specialfd_inotify si; + + si.flags = linux_inotify_init_flags(flags); + return (kern_specialfd(td, SPECIALFD_INOTIFY, &si)); +} + +#if defined(__i386__) || defined(__amd64__) +int +linux_inotify_init(struct thread *td, struct linux_inotify_init_args *args) +{ + return (inotify_init_common(td, 0)); +} +#endif + +int +linux_inotify_init1(struct thread *td, struct linux_inotify_init1_args *args) +{ + return (inotify_init_common(td, args->flags)); +} + +/* + * The native implementation uses the same values for inotify events as + * libinotify, which gives us binary compatibility with Linux. This simplifies + * the shim implementation a lot, as otherwise we would have to handle read(2) + * calls on inotify descriptors and translate events to Linux's ABI. + */ +_Static_assert(LINUX_IN_ACCESS == IN_ACCESS, + "IN_ACCESS mismatch"); +_Static_assert(LINUX_IN_MODIFY == IN_MODIFY, + "IN_MODIFY mismatch"); +_Static_assert(LINUX_IN_ATTRIB == IN_ATTRIB, + "IN_ATTRIB mismatch"); +_Static_assert(LINUX_IN_CLOSE_WRITE == IN_CLOSE_WRITE, + "IN_CLOSE_WRITE mismatch"); +_Static_assert(LINUX_IN_CLOSE_NOWRITE == IN_CLOSE_NOWRITE, + "IN_CLOSE_NOWRITE mismatch"); +_Static_assert(LINUX_IN_OPEN == IN_OPEN, + "IN_OPEN mismatch"); +_Static_assert(LINUX_IN_MOVED_FROM == IN_MOVED_FROM, + "IN_MOVED_FROM mismatch"); +_Static_assert(LINUX_IN_MOVED_TO == IN_MOVED_TO, + "IN_MOVED_TO mismatch"); +_Static_assert(LINUX_IN_CREATE == IN_CREATE, + "IN_CREATE mismatch"); +_Static_assert(LINUX_IN_DELETE == IN_DELETE, + "IN_DELETE mismatch"); +_Static_assert(LINUX_IN_DELETE_SELF == IN_DELETE_SELF, + "IN_DELETE_SELF mismatch"); +_Static_assert(LINUX_IN_MOVE_SELF == IN_MOVE_SELF, + "IN_MOVE_SELF mismatch"); + +_Static_assert(LINUX_IN_UNMOUNT == IN_UNMOUNT, + "IN_UNMOUNT mismatch"); +_Static_assert(LINUX_IN_Q_OVERFLOW == IN_Q_OVERFLOW, + "IN_Q_OVERFLOW mismatch"); +_Static_assert(LINUX_IN_IGNORED == IN_IGNORED, + "IN_IGNORED mismatch"); + +_Static_assert(LINUX_IN_ISDIR == IN_ISDIR, + "IN_ISDIR mismatch"); +_Static_assert(LINUX_IN_ONLYDIR == IN_ONLYDIR, + "IN_ONLYDIR mismatch"); +_Static_assert(LINUX_IN_DONT_FOLLOW == IN_DONT_FOLLOW, + "IN_DONT_FOLLOW mismatch"); +_Static_assert(LINUX_IN_MASK_CREATE == IN_MASK_CREATE, + "IN_MASK_CREATE mismatch"); +_Static_assert(LINUX_IN_MASK_ADD == IN_MASK_ADD, + "IN_MASK_ADD mismatch"); +_Static_assert(LINUX_IN_ONESHOT == IN_ONESHOT, + "IN_ONESHOT mismatch"); +_Static_assert(LINUX_IN_EXCL_UNLINK == IN_EXCL_UNLINK, + "IN_EXCL_UNLINK mismatch"); + +static int +linux_inotify_watch_flags(int l_flags) +{ + if ((l_flags & ~(LINUX_IN_ALL_EVENTS | LINUX_IN_ALL_FLAGS)) != 0) { + linux_msg(NULL, "inotify_add_watch unsupported 
flags 0x%x", + l_flags); + } + + return (l_flags); +} + +int +linux_inotify_add_watch(struct thread *td, + struct linux_inotify_add_watch_args *args) +{ + return (kern_inotify_add_watch(args->fd, AT_FDCWD, args->pathname, + linux_inotify_watch_flags(args->mask), td)); +} + +int +linux_inotify_rm_watch(struct thread *td, + struct linux_inotify_rm_watch_args *args) +{ + return (kern_inotify_rm_watch(args->fd, args->wd, td)); +} diff --git a/sys/compat/linux/linux_file.h b/sys/compat/linux/linux_file.h index 2e56942b0f40..7448dc597230 100644 --- a/sys/compat/linux/linux_file.h +++ b/sys/compat/linux/linux_file.h @@ -189,6 +189,38 @@ #define LINUX_HUGETLB_FLAG_ENCODE_2GB (31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) #define LINUX_HUGETLB_FLAG_ENCODE_16GB (34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +/* inotify flags */ +#define LINUX_IN_ACCESS 0x00000001 +#define LINUX_IN_MODIFY 0x00000002 +#define LINUX_IN_ATTRIB 0x00000004 +#define LINUX_IN_CLOSE_WRITE 0x00000008 +#define LINUX_IN_CLOSE_NOWRITE 0x00000010 +#define LINUX_IN_OPEN 0x00000020 +#define LINUX_IN_MOVED_FROM 0x00000040 +#define LINUX_IN_MOVED_TO 0x00000080 +#define LINUX_IN_CREATE 0x00000100 +#define LINUX_IN_DELETE 0x00000200 +#define LINUX_IN_DELETE_SELF 0x00000400 +#define LINUX_IN_MOVE_SELF 0x00000800 + +#define LINUX_IN_UNMOUNT 0x00002000 +#define LINUX_IN_Q_OVERFLOW 0x00004000 +#define LINUX_IN_IGNORED 0x00008000 + +#define LINUX_IN_ONLYDIR 0x01000000 +#define LINUX_IN_DONT_FOLLOW 0x02000000 +#define LINUX_IN_EXCL_UNLINK 0x04000000 +#define LINUX_IN_MASK_CREATE 0x10000000 +#define LINUX_IN_MASK_ADD 0x20000000 +#define LINUX_IN_ISDIR 0x40000000 +#define LINUX_IN_ONESHOT 0x80000000 + +#define LINUX_IN_ALL_EVENTS 0x00000fff +#define LINUX_IN_ALL_FLAGS 0xf700e000 + +#define LINUX_IN_NONBLOCK 0x00000800 +#define LINUX_IN_CLOEXEC 0x00080000 + #if defined(_KERNEL) struct l_file_handle { l_uint handle_bytes; diff --git a/sys/compat/linuxkpi/common/include/acpi/acpi.h b/sys/compat/linuxkpi/common/include/acpi/acpi.h index e0218bdde12e..1e398d05ba20 100644 --- a/sys/compat/linuxkpi/common/include/acpi/acpi.h +++ b/sys/compat/linuxkpi/common/include/acpi/acpi.h @@ -3,6 +3,10 @@ * * Copyright (c) 2017 Mark Johnston <markj@FreeBSD.org> * Copyright (c) 2020 Vladimir Kondratyev <wulf@FreeBSD.org> + * Copyright (c) 2025 The FreeBSD Foundation + * + * Portions of this software were developed by Björn Zeeb + * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -31,6 +35,13 @@ #define _LINUXKPI_ACPI_ACPI_H_ /* + * LINUXKPI_WANT_LINUX_ACPI is a temporary workaround to allow drm-kmod + * to update all needed branches without breaking builds. + * Once that has happened and checks are implemented based on + * __FreeBSD_version, we will remove these conditions again. + */ + +/* * FreeBSD import of ACPICA has a typedef for BOOLEAN which conflicts with * amdgpu driver. Workaround it on preprocessor level.
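The header below goes on to define union linuxkpi_acpi_object, a member-for-member mirror of Linux's union acpi_object built on ACPICA's types, so code ported from Linux can keep the lowercase spellings (obj->integer.value, obj->buffer.pointer) instead of ACPICA's obj->Integer.Value. A sketch of the kind of consumer this enables, assuming LinuxKPI headers and the LINUXKPI_WANT_LINUX_ACPI aliases are in effect:

static int
parse_dsm_result(union linuxkpi_acpi_object *obj)
{
        if (obj == NULL)
                return (-ENOENT);
        if (obj->type == ACPI_TYPE_INTEGER)             /* Linux spelling, */
                return ((int)obj->integer.value);       /* ACPICA storage  */
        if (obj->type == ACPI_TYPE_BUFFER && obj->buffer.length >= 1)
                return (obj->buffer.pointer[0]);
        return (-EINVAL);
}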
*/ @@ -46,8 +57,8 @@ typedef int64_t INT64; #include <contrib/dev/acpica/include/acpi.h> #undef BOOLEAN +typedef ACPI_IO_ADDRESS acpi_io_address; typedef ACPI_HANDLE acpi_handle; -typedef ACPI_OBJECT acpi_object; typedef ACPI_OBJECT_HANDLER acpi_object_handler; typedef ACPI_OBJECT_TYPE acpi_object_type; typedef ACPI_STATUS acpi_status; @@ -55,12 +66,62 @@ typedef ACPI_STRING acpi_string; typedef ACPI_SIZE acpi_size; typedef ACPI_WALK_CALLBACK acpi_walk_callback; +union linuxkpi_acpi_object { + acpi_object_type type; + struct { + acpi_object_type type; + UINT64 value; + } integer; + struct { + acpi_object_type type; + UINT32 length; + char *pointer; + } string; + struct { + acpi_object_type type; + UINT32 length; + UINT8 *pointer; + } buffer; + struct { + acpi_object_type type; + UINT32 count; + union linuxkpi_acpi_object *elements; + } package; + struct { + acpi_object_type type; + acpi_object_type actual_type; + acpi_handle handle; + } reference; + struct { + acpi_object_type type; + UINT32 proc_id; + acpi_io_address pblk_address; + UINT32 pblk_length; + } processor; + struct { + acpi_object_type type; + UINT32 system_level; + UINT32 resource_order; + } power_resource; +}; + +#ifdef LINUXKPI_WANT_LINUX_ACPI +struct linuxkpi_acpi_buffer { + acpi_size length; /* Length in bytes of the buffer */ + void *pointer; /* pointer to buffer */ +}; + +typedef struct linuxkpi_acpi_buffer lkpi_acpi_buffer_t; +#else +typedef ACPI_BUFFER lkpi_acpi_buffer_t; +#endif + static inline ACPI_STATUS acpi_evaluate_object(ACPI_HANDLE Object, ACPI_STRING Pathname, - ACPI_OBJECT_LIST *ParameterObjects, ACPI_BUFFER *ReturnObjectBuffer) + ACPI_OBJECT_LIST *ParameterObjects, lkpi_acpi_buffer_t *ReturnObjectBuffer) { return (AcpiEvaluateObject( - Object, Pathname, ParameterObjects, ReturnObjectBuffer)); + Object, Pathname, ParameterObjects, (ACPI_BUFFER *)ReturnObjectBuffer)); } static inline const char * @@ -83,9 +144,9 @@ acpi_get_data(ACPI_HANDLE ObjHandle, ACPI_OBJECT_HANDLER Handler, void **Data) } static inline ACPI_STATUS -acpi_get_name(ACPI_HANDLE Object, UINT32 NameType, ACPI_BUFFER *RetPathPtr) +acpi_get_name(ACPI_HANDLE Object, UINT32 NameType, lkpi_acpi_buffer_t *RetPathPtr) { - return (AcpiGetName(Object, NameType, RetPathPtr)); + return (AcpiGetName(Object, NameType, (ACPI_BUFFER *)RetPathPtr)); } static inline ACPI_STATUS @@ -101,4 +162,9 @@ acpi_put_table(ACPI_TABLE_HEADER *Table) AcpiPutTable(Table); } +#ifdef LINUXKPI_WANT_LINUX_ACPI +#define acpi_object linuxkpi_acpi_object +#define acpi_buffer linuxkpi_acpi_buffer +#endif + #endif /* _LINUXKPI_ACPI_ACPI_H_ */ diff --git a/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h b/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h index 65bcbe7f1bdd..47195e7d66a6 100644 --- a/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h +++ b/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h @@ -58,4 +58,10 @@ bool lkpi_acpi_dev_present(const char *hid, const char *uid, struct acpi_device *lkpi_acpi_dev_get_first_match_dev(const char *hid, const char *uid, int64_t hrv); +union linuxkpi_acpi_object; + +union linuxkpi_acpi_object * +acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid, + UINT64 rev, UINT64 func, union linuxkpi_acpi_object *arg); + #endif /* _LINUXKPI_ACPI_ACPI_BUS_H_ */ diff --git a/sys/compat/linuxkpi/common/include/linux/device.h b/sys/compat/linuxkpi/common/include/linux/device.h index a5f6874a07f6..2556b0c45e49 100644 --- a/sys/compat/linuxkpi/common/include/linux/device.h +++ b/sys/compat/linuxkpi/common/include/linux/device.h @@ -90,6 
+90,8 @@ struct dev_pm_ops { struct device_driver { const char *name; const struct dev_pm_ops *pm; + + void (*shutdown) (struct device *); }; struct device_type { diff --git a/sys/compat/linuxkpi/common/include/linux/pci.h b/sys/compat/linuxkpi/common/include/linux/pci.h index 174015ba7a58..af19829f1cbb 100644 --- a/sys/compat/linuxkpi/common/include/linux/pci.h +++ b/sys/compat/linuxkpi/common/include/linux/pci.h @@ -72,6 +72,10 @@ struct pci_device_id { uintptr_t driver_data; }; +#define MODULE_DEVICE_TABLE_BUS_pci(_bus, _table) \ +MODULE_PNP_INFO("U32:vendor;U32:device;V32:subvendor;V32:subdevice", \ + _bus, lkpi_ ## _table, _table, nitems(_table) - 1) + /* Linux has an empty element at the end of the ID table -> nitems() - 1. */ #define MODULE_DEVICE_TABLE(_bus, _table) \ \ @@ -85,11 +89,10 @@ static driver_t _ ## _bus ## _ ## _table ## _driver = { \ 0 \ }; \ \ -DRIVER_MODULE(lkpi_ ## _table, pci, _ ## _bus ## _ ## _table ## _driver,\ +DRIVER_MODULE(lkpi_ ## _table, _bus, _ ## _bus ## _ ## _table ## _driver,\ 0, 0); \ \ -MODULE_PNP_INFO("U32:vendor;U32:device;V32:subvendor;V32:subdevice", \ - _bus, lkpi_ ## _table, _table, nitems(_table) - 1) +MODULE_DEVICE_TABLE_BUS_ ## _bus(_bus, _table) #define PCI_ANY_ID -1U diff --git a/sys/compat/linuxkpi/common/include/linux/slab.h b/sys/compat/linuxkpi/common/include/linux/slab.h index f3a840d9bf4b..efa5c8cb67b3 100644 --- a/sys/compat/linuxkpi/common/include/linux/slab.h +++ b/sys/compat/linuxkpi/common/include/linux/slab.h @@ -45,7 +45,7 @@ MALLOC_DECLARE(M_KMALLOC); -#define kvzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO) +#define kvzalloc(size, flags) kvmalloc(size, (flags) | __GFP_ZERO) #define kvcalloc(n, size, flags) kvmalloc_array(n, size, (flags) | __GFP_ZERO) #define kzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO) #define kzalloc_node(size, flags, node) kmalloc_node(size, (flags) | __GFP_ZERO, node) diff --git a/sys/compat/linuxkpi/common/src/linux_acpi.c b/sys/compat/linuxkpi/common/src/linux_acpi.c index 6a9afb3ddff0..d18c69d9210d 100644 --- a/sys/compat/linuxkpi/common/src/linux_acpi.c +++ b/sys/compat/linuxkpi/common/src/linux_acpi.c @@ -39,6 +39,7 @@ #include <linux/notifier.h> #include <linux/suspend.h> +#include <linux/uuid.h> #include <acpi/acpi_bus.h> #include <acpi/video.h> @@ -99,6 +100,17 @@ acpi_evaluate_dsm_typed(ACPI_HANDLE handle, const char *uuid, int rev, argv4, &buf, type)) ? (ACPI_OBJECT *)buf.Pointer : NULL); } +union linuxkpi_acpi_object * +acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid, + UINT64 rev, UINT64 func, union linuxkpi_acpi_object *pkg) +{ + ACPI_BUFFER buf; + + return (ACPI_SUCCESS(acpi_EvaluateDSM(ObjHandle, (const uint8_t *)guid, + rev, func, (ACPI_OBJECT *)pkg, &buf)) ? 
+ (union linuxkpi_acpi_object *)buf.Pointer : NULL); +} + static void linux_handle_power_suspend_event(void *arg __unused) { @@ -323,6 +335,13 @@ acpi_evaluate_dsm_typed(ACPI_HANDLE handle, const char *uuid, int rev, return (NULL); } +union linuxkpi_acpi_object * +acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid, + UINT64 rev, UINT64 func, union linuxkpi_acpi_object *pkg) +{ + return (NULL); +} + int register_acpi_notifier(struct notifier_block *nb) { diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c index ebb92eacbf9a..628af17df853 100644 --- a/sys/compat/linuxkpi/common/src/linux_page.c +++ b/sys/compat/linuxkpi/common/src/linux_page.c @@ -106,6 +106,7 @@ linux_alloc_pages(gfp_t flags, unsigned int order) if ((flags & M_ZERO) != 0) req |= VM_ALLOC_ZERO; + if (order == 0 && (flags & GFP_DMA32) == 0) { page = vm_page_alloc_noobj(req); if (page == NULL) @@ -113,6 +114,10 @@ linux_alloc_pages(gfp_t flags, unsigned int order) } else { vm_paddr_t pmax = (flags & GFP_DMA32) ? BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR; + + if ((flags & __GFP_NORETRY) != 0) + req |= VM_ALLOC_NORECLAIM; + retry: page = vm_page_alloc_noobj_contig(req, npages, 0, pmax, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); diff --git a/sys/conf/files b/sys/conf/files index 75ee10be5896..dd0d390962f2 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -598,42 +598,24 @@ contrib/dev/acpica/components/utilities/utxface.c optional acpi contrib/dev/acpica/components/utilities/utxferror.c optional acpi contrib/dev/acpica/components/utilities/utxfinit.c optional acpi contrib/dev/acpica/os_specific/service_layers/osgendbg.c optional acpi acpi_debug -netpfil/ipfilter/netinet/fil.c optional ipfilter inet \ - compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_auth.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_frag.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_log.c optional ipfilter inet \ - compile-with "${NORMAL_C} -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_nat.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_proxy.c optional ipfilter inet \ - compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_state.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_lookup.c optional ipfilter inet \ - compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -Wno-error -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_pool.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_htable.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter ${NO_WTAUTOLOGICAL_POINTER_COMPARE}" -netpfil/ipfilter/netinet/ip_sync.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet \ - compile-with "${NORMAL_C} -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_nat6.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_rules.c 
optional ipfilter inet \ - compile-with "${NORMAL_C} -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_scan.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/ip_dstlist.c optional ipfilter inet \ - compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter" -netpfil/ipfilter/netinet/radix_ipf.c optional ipfilter inet \ - compile-with "${NORMAL_C} -I$S/netpfil/ipfilter" +netpfil/ipfilter/netinet/fil.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_auth.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_frag.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_log.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_nat.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_proxy.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_state.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_lookup.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_pool.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_htable.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_sync.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_nat6.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_rules.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_scan.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/ip_dstlist.c optional ipfilter inet compile-with "${IPFILTER_C}" +netpfil/ipfilter/netinet/radix_ipf.c optional ipfilter inet compile-with "${IPFILTER_C}" contrib/libfdt/fdt.c optional fdt contrib/libfdt/fdt_ro.c optional fdt contrib/libfdt/fdt_rw.c optional fdt @@ -1776,6 +1758,19 @@ dev/hwpmc/hwpmc_soft.c optional hwpmc dev/hwreset/hwreset.c optional hwreset dev/hwreset/hwreset_array.c optional hwreset dev/hwreset/hwreset_if.m optional hwreset +dev/hwt/hwt.c optional hwt +dev/hwt/hwt_backend.c optional hwt +dev/hwt/hwt_config.c optional hwt +dev/hwt/hwt_context.c optional hwt +dev/hwt/hwt_contexthash.c optional hwt +dev/hwt/hwt_cpu.c optional hwt +dev/hwt/hwt_hook.c optional hwt +dev/hwt/hwt_ioctl.c optional hwt +dev/hwt/hwt_owner.c optional hwt +dev/hwt/hwt_ownerhash.c optional hwt +dev/hwt/hwt_record.c optional hwt +dev/hwt/hwt_thread.c optional hwt +dev/hwt/hwt_vm.c optional hwt dev/ichiic/ig4_acpi.c optional ig4 acpi iicbus dev/ichiic/ig4_iic.c optional ig4 iicbus dev/ichiic/ig4_pci.c optional ig4 pci iicbus @@ -3160,8 +3155,6 @@ dev/sound/midi/midi.c optional sound dev/sound/midi/mpu401.c optional sound dev/sound/midi/mpu_if.m optional sound dev/sound/midi/mpufoi_if.m optional sound -dev/sound/midi/sequencer.c optional sound -dev/sound/midi/synth_if.m optional sound dev/spibus/acpi_spibus.c optional acpi spibus dev/spibus/ofw_spibus.c optional fdt spibus dev/spibus/spibus.c optional spibus \ @@ -3234,6 +3227,19 @@ dev/uart/uart_if.m optional uart dev/uart/uart_subr.c optional uart dev/uart/uart_tty.c optional uart # +# Universal Flash Storage Host Controller Interface drivers +# +dev/ufshci/ufshci.c optional ufshci 
+dev/ufshci/ufshci_ctrlr.c optional ufshci +dev/ufshci/ufshci_ctrlr_cmd.c optional ufshci +dev/ufshci/ufshci_dev.c optional ufshci +dev/ufshci/ufshci_pci.c optional ufshci +dev/ufshci/ufshci_req_queue.c optional ufshci +dev/ufshci/ufshci_req_sdb.c optional ufshci +dev/ufshci/ufshci_sim.c optional ufshci +dev/ufshci/ufshci_sysctl.c optional ufshci +dev/ufshci/ufshci_uic_cmd.c optional ufshci +# # USB controller drivers # dev/usb/controller/musb_otg.c optional musb @@ -3979,6 +3985,7 @@ kern/vfs_export.c standard kern/vfs_extattr.c standard kern/vfs_hash.c standard kern/vfs_init.c standard +kern/vfs_inotify.c standard kern/vfs_lookup.c standard kern/vfs_mount.c standard kern/vfs_mountroot.c standard diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 0584fc29d039..80548320c3fc 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -84,8 +84,8 @@ amd64/amd64/xen-locore.S optional xenhvm \ amd64/amd64/machdep.c standard amd64/amd64/mem.c optional mem amd64/amd64/minidump_machdep.c standard -amd64/amd64/mp_machdep.c optional smp -amd64/amd64/mpboot.S optional smp +amd64/amd64/mp_machdep.c standard +amd64/amd64/mpboot.S standard amd64/amd64/pmap.c standard amd64/amd64/ptrace_machdep.c standard amd64/amd64/support.S standard @@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m optional ice pci \ compile-with "${NORMAL_M} -I$S/dev/ice" dev/ice/ice_ddp_common.c optional ice pci \ compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_iov.c optional ice pci pci_iov \ + compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_vf_mbx.c optional ice pci pci_iov \ + compile-with "${NORMAL_C} -I$S/dev/ice" ice_ddp.c optional ice_ddp \ compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \ no-ctfconvert no-implicit-rule before-depend local \ diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index e6e42b33a9b7..78178065e15b 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -290,6 +290,10 @@ BNXT_CFLAGS= -I$S/dev/bnxt/bnxt_en ${OFEDCFLAGS} BNXT_C_NOIMP= ${CC} -c -o ${.TARGET} ${BNXT_CFLAGS} ${WERROR} BNXT_C= ${BNXT_C_NOIMP} ${.IMPSRC} +# IP Filter +IPFILTER_CFLAGS= -I$S/netpfil/ipfilter +IPFILTER_C= ${NORMAL_C} ${IPFILTER_CFLAGS} + GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/} SYSTEM_CFILES= config.c env.c hints.c vnode_if.c SYSTEM_DEP= Makefile ${SYSTEM_OBJS} diff --git a/sys/conf/options b/sys/conf/options index 03e8964e965d..a637b0b74a77 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -885,6 +885,9 @@ DCONS_FORCE_GDB opt_dcons.h HWPMC_DEBUG opt_global.h HWPMC_HOOKS +# Hardware Trace (HWT) framework options +HWT_HOOKS + # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h diff --git a/sys/contrib/dev/iwlwifi/iwl-debug.h b/sys/contrib/dev/iwlwifi/iwl-debug.h index 43288a5a8d74..7b3b402766b4 100644 --- a/sys/contrib/dev/iwlwifi/iwl-debug.h +++ b/sys/contrib/dev/iwlwifi/iwl-debug.h @@ -47,7 +47,7 @@ enum iwl_dl { IWL_DL_DROP = 0x00000010, IWL_DL_EEPROM = 0x00000020, IWL_DL_FW = 0x00000040, - /* = 0x00000080, */ + IWL_DL_DEV_RADIO = 0x00000080, IWL_DL_HC = 0x00000100, IWL_DL_HT = 0x00000200, IWL_DL_INFO = 0x00000400, @@ -195,6 +195,8 @@ void __iwl_dbg(struct device *, u32, bool, const char *, const char *fmt, ...); IWL_DPRINTF(_subsys, IWL_DL_WEP, _fmt, ##__VA_ARGS__) #define IWL_DEBUG_WOWLAN(_subsys, _fmt, ...) \ IWL_DPRINTF(_subsys, IWL_DL_WOWLAN, _fmt, ##__VA_ARGS__) +#define IWL_DEBUG_DEV_RADIO(_dev, _fmt, ...) 
\ + IWL_DPRINTF_DEV((_dev), IWL_DL_DEV_RADIO, _fmt, ##__VA_ARGS__) #define IWL_DEBUG_PCI_RW(_subsys, _fmt, ...) \ IWL_DPRINTF(_subsys, IWL_DL_PCI_RW, _fmt, ##__VA_ARGS__) diff --git a/sys/contrib/dev/rtw89/acpi.c b/sys/contrib/dev/rtw89/acpi.c index 02d4526c1538..f5dedb12c129 100644 --- a/sys/contrib/dev/rtw89/acpi.c +++ b/sys/contrib/dev/rtw89/acpi.c @@ -8,7 +8,6 @@ #include "acpi.h" #include "debug.h" -#if defined(__linux__) static const guid_t rtw89_guid = GUID_INIT(0xD2A8C3E8, 0x4B69, 0x4F00, 0x82, 0xBD, 0xFE, 0x86, 0x07, 0x80, 0x3A, 0xA7); @@ -149,14 +148,6 @@ int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev, ACPI_FREE(obj); return ret; } -#elif defined(__FreeBSD__) -int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev, - enum rtw89_acpi_dsm_func func, - struct rtw89_acpi_dsm_result *res) -{ - return -ENOENT; -} -#endif int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev, struct rtw89_acpi_rtag_result *res) @@ -180,28 +171,15 @@ int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev, if (ACPI_FAILURE(status)) return -EIO; -#if defined(__linux__) obj = buf.pointer; if (obj->type != ACPI_TYPE_BUFFER) { -#elif defined(__FreeBSD__) - obj = buf.Pointer; - if (obj->Type != ACPI_TYPE_BUFFER) { -#endif rtw89_debug(rtwdev, RTW89_DBG_ACPI, -#if defined(__linux__) "acpi: expect buffer but type: %d\n", obj->type); -#elif defined(__FreeBSD__) - "acpi: expect buffer but type: %d\n", obj->Type); -#endif ret = -EINVAL; goto out; } -#if defined(__linux__) buf_len = obj->buffer.length; -#elif defined(__FreeBSD__) - buf_len = obj->Buffer.Length; -#endif if (buf_len != sizeof(*res)) { rtw89_debug(rtwdev, RTW89_DBG_ACPI, "%s: invalid buffer length: %u\n", __func__, buf_len); @@ -209,11 +187,7 @@ int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev, goto out; } -#if defined(__linux__) *res = *(struct rtw89_acpi_rtag_result *)obj->buffer.pointer; -#elif defined(__FreeBSD__) - *res = *(struct rtw89_acpi_rtag_result *)obj->Buffer.Pointer; -#endif rtw89_hex_dump(rtwdev, RTW89_DBG_ACPI, "antenna_gain: ", res, sizeof(*res)); diff --git a/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h b/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h index 1d844a67c928..c4c9e789cf3e 100644 --- a/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h +++ b/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h @@ -39,6 +39,7 @@ struct auxiliary_device_id { char name[AUXILIARY_NAME_SIZE]; uint64_t driver_data; }; +#define MODULE_DEVICE_TABLE_BUS_auxiliary(_bus, _table) struct auxiliary_device { struct device dev; diff --git a/sys/dev/drm2/drm_fb_helper.c b/sys/dev/drm2/drm_fb_helper.c index f67cc9f60d02..1f4abd255690 100644 --- a/sys/dev/drm2/drm_fb_helper.c +++ b/sys/dev/drm2/drm_fb_helper.c @@ -51,7 +51,7 @@ struct vt_kms_softc { struct task fb_mode_task; }; -/* Call restore out of vt(9) locks. */ +/* Call restore out of vt(4) locks. 
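The rtw89 simplification above works because linuxkpi now provides acpi_evaluate_dsm() and the Linux spelling of the ACPI object fields on FreeBSD, so the __FreeBSD__ branches (obj->Type, obj->Buffer.Length) disappear. A minimal sketch of the now-common pattern, assuming buf is a populated ACPI buffer as in rtw89_acpi_evaluate_rtag():

union linuxkpi_acpi_object *obj;

obj = buf.pointer;
if (obj->type == ACPI_TYPE_BUFFER)
	buf_len = obj->buffer.length;	/* formerly obj->Buffer.Length on FreeBSD */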
*/ static void vt_restore_fbdev_mode(void *arg, int pending) { diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c index b0fa33daeca7..b55c1c191077 100644 --- a/sys/dev/efidev/efirt.c +++ b/sys/dev/efidev/efirt.c @@ -107,7 +107,8 @@ static int efi_status2err[25] = { enum efi_table_type { TYPE_ESRT = 0, - TYPE_PROP + TYPE_PROP, + TYPE_MEMORY_ATTR }; static int efi_enter(void); @@ -445,6 +446,42 @@ get_table_length(enum efi_table_type type, size_t *table_len, void **taddr) free(buf, M_TEMP); return (0); } + case TYPE_MEMORY_ATTR: + { + efi_guid_t guid = EFI_MEMORY_ATTRIBUTES_TABLE; + struct efi_memory_attribute_table *tbl_addr, *mem_addr; + int error; + void *buf; + size_t len = sizeof(struct efi_memory_attribute_table); + + error = efi_get_table(&guid, (void **)&tbl_addr); + if (error) + return (error); + + buf = malloc(len, M_TEMP, M_WAITOK); + error = physcopyout((vm_paddr_t)tbl_addr, buf, len); + if (error) { + free(buf, M_TEMP); + return (error); + } + + mem_addr = (struct efi_memory_attribute_table *)buf; + if (mem_addr->version != 2) { + free(buf, M_TEMP); + return (EINVAL); + } + len += mem_addr->descriptor_size * mem_addr->num_ents; + if (len > EFI_TABLE_ALLOC_MAX) { + free(buf, M_TEMP); + return (ENOMEM); + } + + *table_len = len; + if (taddr != NULL) + *taddr = tbl_addr; + free(buf, M_TEMP); + return (0); + } } return (ENOENT); } @@ -457,7 +494,8 @@ copy_table(efi_guid_t *guid, void **buf, size_t buf_len, size_t *table_len) enum efi_table_type type; } tables[] = { { EFI_TABLE_ESRT, TYPE_ESRT }, - { EFI_PROPERTIES_TABLE, TYPE_PROP } + { EFI_PROPERTIES_TABLE, TYPE_PROP }, + { EFI_MEMORY_ATTRIBUTES_TABLE, TYPE_MEMORY_ATTR } }; size_t table_idx; void *taddr; diff --git a/sys/dev/gpio/acpi_gpiobus.c b/sys/dev/gpio/acpi_gpiobus.c index 2987af634866..94f4e5771266 100644 --- a/sys/dev/gpio/acpi_gpiobus.c +++ b/sys/dev/gpio/acpi_gpiobus.c @@ -36,6 +36,7 @@ #include <dev/gpio/gpiobusvar.h> #include <dev/gpio/acpi_gpiobusvar.h> +#include <dev/gpio/gpiobus_internal.h> #include "gpiobus_if.h" @@ -356,7 +357,7 @@ acpi_gpiobus_attach(device_t dev) status = AcpiWalkResources(handle, "_AEI", acpi_gpiobus_enumerate_aei, &ctx); - if (ACPI_FAILURE(status)) + if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) device_printf(dev, "Failed to enumerate AEI resources\n"); return (0); diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c index 2e2618805e7b..764bcb7e6ee8 100644 --- a/sys/dev/gpio/gpiobus.c +++ b/sys/dev/gpio/gpiobus.c @@ -39,6 +39,7 @@ #include <sys/sbuf.h> #include <dev/gpio/gpiobusvar.h> +#include <dev/gpio/gpiobus_internal.h> #include "gpiobus_if.h" @@ -109,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags, res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1, alloc_flags); if (res == NULL) { - intr_free_intr_map_data((struct intr_map_data *)gpio_data); + intr_unmap_irq(irq); return (NULL); } - rman_set_virtual(res, gpio_data); return (res); } #else @@ -213,20 +213,40 @@ gpio_pin_is_active(gpio_pin_t pin, bool *active) return (0); } +/* + * Note that this function should only + * be used in cases where a pre-existing + * gpiobus_pin structure exists. In most + * cases, the gpio_pin_get_by_* functions + * suffice. 
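A usage sketch for the gpio_pin_acquire() interface introduced below (hypothetical consumer; assumes the gpio_pin_t was obtained earlier via one of the gpio_pin_get_by_* functions):

error = gpio_pin_acquire(pin);	/* EBUSY if the pin is already mapped */
if (error != 0)
	return (error);
/* ... gpio_pin_setflags(), gpio_pin_is_active(), etc. ... */
gpio_pin_release(pin);		/* unmaps the pin and frees the handle */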
+ */ +int +gpio_pin_acquire(gpio_pin_t gpio) +{ + device_t busdev; + + KASSERT(gpio != NULL, ("GPIO pin is NULL.")); + KASSERT(gpio->dev != NULL, ("GPIO pin device is NULL.")); + + busdev = GPIO_GET_BUS(gpio->dev); + if (busdev == NULL) + return (ENXIO); + + return (gpiobus_acquire_pin(busdev, gpio->pin)); +} + void gpio_pin_release(gpio_pin_t gpio) { device_t busdev; - if (gpio == NULL) - return; - + KASSERT(gpio != NULL, ("GPIO pin is NULL.")); KASSERT(gpio->dev != NULL, ("GPIO pin device is NULL.")); busdev = GPIO_GET_BUS(gpio->dev); - if (busdev != NULL) - gpiobus_release_pin(busdev, gpio->pin); + KASSERT(busdev != NULL, ("gpiobus dev is NULL.")); + gpiobus_release_pin(busdev, gpio->pin); free(gpio, M_DEVBUF); } @@ -293,7 +313,7 @@ gpiobus_print_pins(struct gpiobus_ivar *devi, struct sbuf *sb) } device_t -gpiobus_attach_bus(device_t dev) +gpiobus_add_bus(device_t dev) { device_t busdev; @@ -307,8 +327,24 @@ gpiobus_attach_bus(device_t dev) #ifdef FDT ofw_gpiobus_register_provider(dev); #endif - bus_attach_children(dev); + return (busdev); +} + +/* + * Attach a gpiobus child. + * Note that the controller is expected + * to be fully initialized at this point. + */ +device_t +gpiobus_attach_bus(device_t dev) +{ + device_t busdev; + + busdev = gpiobus_add_bus(dev); + if (busdev == NULL) + return (NULL); + bus_attach_children(dev); return (busdev); } @@ -385,14 +421,13 @@ gpiobus_acquire_pin(device_t bus, uint32_t pin) sc = device_get_softc(bus); /* Consistency check. */ if (pin >= sc->sc_npins) { - device_printf(bus, - "invalid pin %d, max: %d\n", pin, sc->sc_npins - 1); - return (-1); + panic("%s: invalid pin %d, max: %d", + device_get_nameunit(bus), pin, sc->sc_npins - 1); } /* Mark pin as mapped and give warning if it's already mapped. */ if (sc->sc_pins[pin].mapped) { device_printf(bus, "warning: pin %d is already mapped\n", pin); - return (-1); + return (EBUSY); } sc->sc_pins[pin].mapped = 1; @@ -400,7 +435,7 @@ gpiobus_acquire_pin(device_t bus, uint32_t pin) } /* Release mapped pin */ -int +void gpiobus_release_pin(device_t bus, uint32_t pin) { struct gpiobus_softc *sc; @@ -408,19 +443,15 @@ gpiobus_release_pin(device_t bus, uint32_t pin) sc = device_get_softc(bus); /* Consistency check. 
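The gpiobus_add_bus()/gpiobus_attach_bus() split above lets a controller finish its own initialization between child creation and attachment; a sketch of the intended pattern (hypothetical driver):

busdev = gpiobus_add_bus(dev);
if (busdev == NULL)
	return (ENXIO);
/* ... setup that children depend on, e.g. interrupt controller state ... */
bus_attach_children(dev);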
*/ if (pin >= sc->sc_npins) { - device_printf(bus, - "invalid pin %d, max=%d\n", - pin, sc->sc_npins - 1); - return (-1); + panic("%s: invalid pin %d, max: %d", + device_get_nameunit(bus), pin, sc->sc_npins - 1); } - if (!sc->sc_pins[pin].mapped) { - device_printf(bus, "pin %d is not mapped\n", pin); - return (-1); - } - sc->sc_pins[pin].mapped = 0; + if (!sc->sc_pins[pin].mapped) + panic("%s: pin %d is not mapped", device_get_nameunit(bus), + pin); - return (0); + sc->sc_pins[pin].mapped = 0; } static int @@ -435,8 +466,7 @@ gpiobus_acquire_child_pins(device_t dev, device_t child) device_printf(child, "cannot acquire pin %d\n", devi->pins[i]); while (--i >= 0) { - (void)gpiobus_release_pin(dev, - devi->pins[i]); + gpiobus_release_pin(dev, devi->pins[i]); } gpiobus_free_ivars(devi); return (EBUSY); @@ -835,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid, end, count, flags)); } +static int +gpiobus_release_resource(device_t dev, device_t child, struct resource *r) +{ + int err; +#ifdef INTRNG + u_int irq; + + irq = rman_get_start(r); + MPASS(irq == rman_get_end(r)); +#endif + err = bus_generic_rman_release_resource(dev, child, r); + if (err != 0) + return (err); +#ifdef INTRNG + intr_unmap_irq(irq); +#endif + return (0); +} + static struct resource_list * gpiobus_get_resource_list(device_t bus __unused, device_t child) { @@ -1029,7 +1078,7 @@ static device_method_t gpiobus_methods[] = { DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_alloc_resource, gpiobus_alloc_resource), - DEVMETHOD(bus_release_resource, bus_generic_rman_release_resource), + DEVMETHOD(bus_release_resource, gpiobus_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_rman_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_rman_deactivate_resource), DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list), diff --git a/sys/dev/sound/midi/sequencer.h b/sys/dev/gpio/gpiobus_internal.h index 22ea0ae6c1b6..de3f57663132 100644 --- a/sys/dev/sound/midi/sequencer.h +++ b/sys/dev/gpio/gpiobus_internal.h @@ -1,8 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2003 Mathew Kanner - * Copyright (c) 1999 Seigo Tanimura + * Copyright (c) 2009 Oleksandr Tymoshenko <gonzo@freebsd.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,65 +24,24 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * */ -/* - * Include file for the midi sequence driver. - */ - -#ifndef _SEQUENCER_H_ -#define _SEQUENCER_H_ - -#define NSEQ_MAX 16 +#ifndef __GPIOBUS_INTERNAL_H__ +#define __GPIOBUS_INTERNAL_H__ /* - * many variables should be reduced to a range. Here define a macro + * Functions shared between gpiobus and other bus classes that derive from it; + * these should not be called directly by other drivers. 
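With gpiobus_release_resource() now calling intr_unmap_irq(), the interrupt resource lifecycle is symmetric with gpio_alloc_intr_resource() earlier in this commit; a sketch (hypothetical consumer on an INTRNG kernel):

rid = 0;
ires = gpio_alloc_intr_resource(dev, &rid, RF_ACTIVE, pin,
    GPIO_INTR_EDGE_RISING);
if (ires == NULL)
	return (ENXIO);
/* ... bus_setup_intr(), handle interrupts ... */
bus_release_resource(dev, SYS_RES_IRQ, rid, ires);	/* now also unmaps the IRQ */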
*/ - -#define RANGE(var, low, high) (var) = \ -((var)<(low)?(low) : (var)>(high)?(high) : (var)) - -#ifdef _KERNEL - -void seq_timer(void *arg); - -SYSCTL_DECL(_hw_midi_seq); - -extern int seq_debug; - -#define SEQ_DEBUG(y, x) \ - do { \ - if (seq_debug >= y) { \ - (x); \ - } \ - } while (0) - -SYSCTL_DECL(_hw_midi); - -#endif /* _KERNEL */ - -#define SYNTHPROP_MIDI 1 -#define SYNTHPROP_SYNTH 2 -#define SYNTHPROP_RX 4 -#define SYNTHPROP_TX 8 - -struct _midi_cmdtab { - int cmd; - char *name; -}; -typedef struct _midi_cmdtab midi_cmdtab; -extern midi_cmdtab cmdtab_seqevent[]; -extern midi_cmdtab cmdtab_seqioctl[]; -extern midi_cmdtab cmdtab_timer[]; -extern midi_cmdtab cmdtab_seqcv[]; -extern midi_cmdtab cmdtab_seqccmn[]; - -char *midi_cmdname(int cmd, midi_cmdtab * tab); - -enum { - MORE, - TIMERARMED, - QUEUEFULL -}; - +int gpiobus_attach(device_t); +int gpiobus_detach(device_t); +int gpiobus_init_softc(device_t); +int gpiobus_alloc_ivars(struct gpiobus_ivar *); +void gpiobus_free_ivars(struct gpiobus_ivar *); +int gpiobus_read_ivar(device_t, device_t, int, uintptr_t *); +int gpiobus_acquire_pin(device_t, uint32_t); +void gpiobus_release_pin(device_t, uint32_t); + +extern driver_t gpiobus_driver; #endif diff --git a/sys/dev/gpio/gpiobusvar.h b/sys/dev/gpio/gpiobusvar.h index 74783e112f89..7f504236a774 100644 --- a/sys/dev/gpio/gpiobusvar.h +++ b/sys/dev/gpio/gpiobusvar.h @@ -156,6 +156,8 @@ int gpio_pin_get_by_bus_pinnum(device_t _bus, uint32_t _pinnum, gpio_pin_t *_gp) /* Acquire a pin by child and index (used by direct children of gpiobus). */ int gpio_pin_get_by_child_index(device_t _child, uint32_t _idx, gpio_pin_t *_gp); +/* Acquire a pin from an existing gpio_pin_t. */ +int gpio_pin_acquire(gpio_pin_t gpio); /* Release a pin acquired via any gpio_pin_get_xxx() function. */ void gpio_pin_release(gpio_pin_t gpio); @@ -167,22 +169,9 @@ int gpio_pin_setflags(gpio_pin_t pin, uint32_t flags); struct resource *gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags, gpio_pin_t pin, uint32_t intr_mode); -/* - * Functions shared between gpiobus and other bus classes that derive from it; - * these should not be called directly by other drivers. 
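The prototypes removed here move to the new dev/gpio/gpiobus_internal.h, splitting the consumer-facing API from the bus-internal one; derived bus classes now need both headers, as the ofw_gpiobus.c and acpi_gpiobus.c hunks in this commit show:

#include <dev/gpio/gpiobusvar.h>	/* public pin/consumer API */
#include <dev/gpio/gpiobus_internal.h>	/* gpiobus-derived bus classes only */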
- */ int gpio_check_flags(uint32_t, uint32_t); +device_t gpiobus_add_bus(device_t); device_t gpiobus_attach_bus(device_t); int gpiobus_detach_bus(device_t); -int gpiobus_attach(device_t); -int gpiobus_detach(device_t); -int gpiobus_init_softc(device_t); -int gpiobus_alloc_ivars(struct gpiobus_ivar *); -void gpiobus_free_ivars(struct gpiobus_ivar *); -int gpiobus_read_ivar(device_t, device_t, int, uintptr_t *); -int gpiobus_acquire_pin(device_t, uint32_t); -int gpiobus_release_pin(device_t, uint32_t); - -extern driver_t gpiobus_driver; #endif /* __GPIOBUS_H__ */ diff --git a/sys/dev/gpio/gpiopps.c b/sys/dev/gpio/gpiopps.c index bb8afa5e062c..82620a50a798 100644 --- a/sys/dev/gpio/gpiopps.c +++ b/sys/dev/gpio/gpiopps.c @@ -160,7 +160,7 @@ gpiopps_detach(device_t dev) if (sc->ires != NULL) bus_release_resource(dev, SYS_RES_IRQ, sc->irid, sc->ires); if (sc->gpin != NULL) - gpiobus_release_pin(GPIO_GET_BUS(sc->gpin->dev), sc->gpin->pin); + gpio_pin_release(sc->gpin); return (0); } diff --git a/sys/dev/gpio/ofw_gpiobus.c b/sys/dev/gpio/ofw_gpiobus.c index 32dc5b55e698..fc5fb03d6824 100644 --- a/sys/dev/gpio/ofw_gpiobus.c +++ b/sys/dev/gpio/ofw_gpiobus.c @@ -36,6 +36,7 @@ #include <sys/module.h> #include <dev/gpio/gpiobusvar.h> +#include <dev/gpio/gpiobus_internal.h> #include <dev/ofw/ofw_bus.h> #include "gpiobus_if.h" diff --git a/sys/dev/gpio/pl061.c b/sys/dev/gpio/pl061.c index cc39790322b6..87d4310a6396 100644 --- a/sys/dev/gpio/pl061.c +++ b/sys/dev/gpio/pl061.c @@ -487,14 +487,21 @@ pl061_attach(device_t dev) } } + mtx_init(&sc->sc_mtx, device_get_nameunit(dev), "pl061", MTX_SPIN); + + if (sc->sc_xref != 0 && !intr_pic_register(dev, sc->sc_xref)) { + device_printf(dev, "couldn't register PIC\n"); + PL061_LOCK_DESTROY(sc); + goto free_isrc; + } + sc->sc_busdev = gpiobus_attach_bus(dev); if (sc->sc_busdev == NULL) { device_printf(dev, "couldn't attach gpio bus\n"); + PL061_LOCK_DESTROY(sc); goto free_isrc; } - mtx_init(&sc->sc_mtx, device_get_nameunit(dev), "pl061", MTX_SPIN); - return (0); free_isrc: @@ -503,6 +510,7 @@ free_isrc: * for (irq = 0; irq < PL061_NUM_GPIO; irq++) * intr_isrc_deregister(PIC_INTR_ISRC(sc, irq)); */ + bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_irq_hdlr); bus_release_resource(dev, SYS_RES_IRQ, sc->sc_irq_rid, sc->sc_irq_res); free_pic: diff --git a/sys/dev/gpio/pl061.h b/sys/dev/gpio/pl061.h index 809a1168493d..d9fe23e502b1 100644 --- a/sys/dev/gpio/pl061.h +++ b/sys/dev/gpio/pl061.h @@ -46,6 +46,7 @@ struct pl061_softc { struct resource *sc_mem_res; struct resource *sc_irq_res; void *sc_irq_hdlr; + intptr_t sc_xref; int sc_mem_rid; int sc_irq_rid; struct pl061_pin_irqsrc sc_isrcs[PL061_NUM_GPIO]; diff --git a/sys/dev/gpio/pl061_acpi.c b/sys/dev/gpio/pl061_acpi.c index f5885025083e..8e9921261e4e 100644 --- a/sys/dev/gpio/pl061_acpi.c +++ b/sys/dev/gpio/pl061_acpi.c @@ -67,19 +67,12 @@ pl061_acpi_probe(device_t dev) static int pl061_acpi_attach(device_t dev) { - int error; + struct pl061_softc *sc; - error = pl061_attach(dev); - if (error != 0) - return (error); + sc = device_get_softc(dev); + sc->sc_xref = ACPI_GPIO_XREF; - if (!intr_pic_register(dev, ACPI_GPIO_XREF)) { - device_printf(dev, "couldn't register PIC\n"); - pl061_detach(dev); - error = ENXIO; - } - - return (error); + return (pl061_attach(dev)); } static device_method_t pl061_acpi_methods[] = { diff --git a/sys/dev/gpio/pl061_fdt.c b/sys/dev/gpio/pl061_fdt.c index aa22298b43c6..681b3ccdfdeb 100644 --- a/sys/dev/gpio/pl061_fdt.c +++ b/sys/dev/gpio/pl061_fdt.c @@ -61,19 +61,12 @@ 
pl061_fdt_probe(device_t dev)
 static int
 pl061_fdt_attach(device_t dev)
 {
-	int error;
+	struct pl061_softc *sc;
 
-	error = pl061_attach(dev);
-	if (error != 0)
-		return (error);
+	sc = device_get_softc(dev);
+	sc->sc_xref = OF_xref_from_node(ofw_bus_get_node(dev));
 
-	if (!intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev)))) {
-		device_printf(dev, "couldn't register PIC\n");
-		pl061_detach(dev);
-		error = ENXIO;
-	}
-
-	return (error);
+	return (pl061_attach(dev));
 }
 
 static device_method_t pl061_fdt_methods[] = {
diff --git a/sys/dev/gpio/qoriq_gpio.c b/sys/dev/gpio/qoriq_gpio.c
index 25dfccede29f..8b44cd256c79 100644
--- a/sys/dev/gpio/qoriq_gpio.c
+++ b/sys/dev/gpio/qoriq_gpio.c
@@ -369,11 +369,6 @@ qoriq_gpio_attach(device_t dev)
 	for (i = 0; i <= MAXPIN; i++)
 		sc->sc_pins[i].gp_caps = DEFAULT_CAPS;
 
-	sc->busdev = gpiobus_attach_bus(dev);
-	if (sc->busdev == NULL) {
-		qoriq_gpio_detach(dev);
-		return (ENOMEM);
-	}
 	/*
 	 * Enable the GPIO Input Buffer for all GPIOs.
 	 * This is safe on devices without a GPIBE register, because those
@@ -384,6 +379,12 @@ OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
 
+	sc->busdev = gpiobus_attach_bus(dev);
+	if (sc->busdev == NULL) {
+		qoriq_gpio_detach(dev);
+		return (ENOMEM);
+	}
+
 	return (0);
 }
diff --git a/sys/dev/hwt/hwt.c b/sys/dev/hwt/hwt.c
new file mode 100644
index 000000000000..c476e6031ba8
--- /dev/null
+++ b/sys/dev/hwt/hwt.c
@@ -0,0 +1,242 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Hardware Tracing framework.
+ *
+ * The framework manages hardware tracing units that collect information
+ * about software execution and store it, as events in a highly compressed
+ * format, in DRAM. The events cover control flow changes of a program:
+ * whether branches were taken or not, exceptions taken, timing information,
+ * cycles elapsed and more. This allows the entire program flow of a given
+ * application to be restored without performance impact.
+ *
+ * Design overview.
+ *
+ * The framework provides character devices for the mmap(2) and ioctl(2)
+ * system calls to allow the user to manage CPU (hardware) tracing units.
+ *
+ * /dev/hwt:
+ *	.ioctl:
+ *		hwt_ioctl():
+ *		    a) HWT_IOC_ALLOC
+ *			Allocates a kernel tracing context CTX based on the
+ *			requested mode of operation. Verifies the information
+ *			that comes with the request (pid, cpus) and allocates
+ *			a unique ID for the context. Creates a new character
+ *			device for CTX management.
+ *
+ * /dev/hwt_%d[_%d], ident[, thread_id]
+ *	.mmap
+ *		Maps tracing buffers of the corresponding thread to userspace.
+ *	.ioctl
+ *		hwt_thread_ioctl():
+ *		    a) HWT_IOC_START
+ *			Enables the tracing unit for a given context.
+ *		    b) HWT_IOC_RECORD_GET
+ *			Transfers (small) record entries collected during
+ *			program execution for a given context to userspace,
+ *			such as mapping tables of the executable and dynamic
+ *			libraries, the interpreter, kernel mappings, tids of
+ *			threads created, etc.
+ *		    c) HWT_IOC_SET_CONFIG
+ *			Allows the user to specify a backend-specific
+ *			configuration of the trace unit.
+ *		    d) HWT_IOC_WAKEUP
+ *			Wakes up a thread that is currently sleeping.
+ *		    e) HWT_IOC_BUFPTR_GET
+ *			Transfers the current hardware pointer in the filling
+ *			buffer to userspace.
+ *		    f) HWT_IOC_SVC_BUF
+ *			To avoid data loss, userspace may notify the kernel
+ *			that it has copied out the given buffer, so the kernel
+ *			is free to overwrite it.
+ *
+ * HWT context lifecycle in THREAD mode of operation:
+ * 1. The user invokes the HWT_IOC_ALLOC ioctl with information about the pid
+ *    to trace and the size of the buffers to allocate for the trace data.
+ *    Some architectures may have different tracing units supported, so the
+ *    user also provides a backend name to use for this context, e.g.
+ *    "coresight".
+ * 2. The kernel allocates the context and looks up the proc for the given
+ *    pid. It then creates the first hwt_thread in the context and allocates
+ *    trace buffers for it. Immediately, the kernel initializes the tracing
+ *    backend.
+ *    The kernel creates the character device and returns the unique
+ *    identifier of the trace context to the user.
+ * 3. To manage the new context, the user opens the character device created.
+ *    The user invokes the HWT_IOC_START ioctl; the kernel marks the context
+ *    as RUNNING.
+ *    At this point any HWT hook invocation by the scheduler enables/disables
+ *    tracing for threads associated with the context (threads of the proc).
+ *    Creation of any new thread of the target proc invokes the corresponding
+ *    hooks in the HWT framework, so that a new hwt_thread and its buffers
+ *    are allocated and a character device for mmap(2) is created on the fly.
+ * 4. The user issues the HWT_IOC_RECORD_GET ioctl to fetch information about
+ *    mapping tables and threads created during application startup.
+ * 5. The user mmaps the tracing buffers of each thread to userspace (using
+ *    the /dev/hwt_%d_%d % (ident, thread_id) character devices).
+ * 6. The user can repeat step 4 if an expected thread has not yet been
+ *    created during target application execution.
+ * 7. The user issues the HWT_IOC_BUFPTR_GET ioctl to get the current filling
+ *    level of the hardware buffer of a given thread.
+ * 8. The user invokes a trace decoder library to process the available data
+ *    and see the results in human readable form.
+ * 9. The user repeats step 7 if needed.
+ *
+ * HWT context lifecycle in CPU mode of operation:
+ * 1. The user invokes the HWT_IOC_ALLOC ioctl providing a set of CPUs to
+ *    trace within a single CTX.
+ * 2. The kernel verifies the set of CPUs and allocates a tracing context,
+ *    creating a buffer for each CPU.
+ *    The kernel creates a character device for every CPU provided in the
+ *    request.
+ *    The kernel initializes the tracing backend.
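A hedged userspace sketch of the THREAD-mode lifecycle described above; the ioctl names come from this overview, but every structure and variable name below is illustrative only (the real request layouts live in sys/hwt.h):

int fd, tfd;
char path[64];

fd = open("/dev/hwt", O_RDWR);
ioctl(fd, HWT_IOC_ALLOC, &alloc_req);		/* pid, bufsize, backend name */
snprintf(path, sizeof(path), "/dev/hwt_%d_%d", ident, thread_id);
tfd = open(path, O_RDWR);
ioctl(tfd, HWT_IOC_START, &start_req);		/* context -> RUNNING */
ioctl(tfd, HWT_IOC_RECORD_GET, &rec_req);	/* mapping tables, new tids */
buf = mmap(NULL, bufsize, PROT_READ, MAP_SHARED, tfd, 0);
ioctl(tfd, HWT_IOC_BUFPTR_GET, &bufptr_req);	/* current fill level */
/* decode up to the returned pointer; repeat RECORD_GET/BUFPTR_GET as needed */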
+ * 3. The user opens the character devices of interest to map the buffers to
+ *    userspace.
+ *    The user can start tracing by invoking HWT_IOC_START on any of the
+ *    character devices within the context; the entire context will be marked
+ *    as RUNNING.
+ * 4. The rest is similar to the THREAD mode.
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_ioctl.h>
+#include <dev/hwt/hwt_hook.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static eventhandler_tag hwt_exit_tag;
+static struct cdev *hwt_cdev;
+static struct cdevsw hwt_cdevsw = {
+	.d_version = D_VERSION,
+	.d_name = "hwt",
+	.d_mmap_single = NULL,
+	.d_ioctl = hwt_ioctl
+};
+
+static void
+hwt_process_exit(void *arg __unused, struct proc *p)
+{
+	struct hwt_owner *ho;
+
+	/* Stop HWTs associated with the exiting owner, if any. */
+	ho = hwt_ownerhash_lookup(p);
+	if (ho)
+		hwt_owner_shutdown(ho);
+}
+
+static int
+hwt_load(void)
+{
+	struct make_dev_args args;
+	int error;
+
+	make_dev_args_init(&args);
+	args.mda_devsw = &hwt_cdevsw;
+	args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+	args.mda_uid = UID_ROOT;
+	args.mda_gid = GID_WHEEL;
+	args.mda_mode = 0660;
+	args.mda_si_drv1 = NULL;
+
+	hwt_backend_load();
+	hwt_ctx_load();
+	hwt_contexthash_load();
+	hwt_ownerhash_load();
+	hwt_record_load();
+
+	error = make_dev_s(&args, &hwt_cdev, "hwt");
+	if (error != 0)
+		return (error);
+
+	hwt_exit_tag = EVENTHANDLER_REGISTER(process_exit, hwt_process_exit,
+	    NULL, EVENTHANDLER_PRI_ANY);
+
+	hwt_hook_load();
+
+	return (0);
+}
+
+static int
+hwt_unload(void)
+{
+
+	hwt_hook_unload();
+	EVENTHANDLER_DEREGISTER(process_exit, hwt_exit_tag);
+	destroy_dev(hwt_cdev);
+	hwt_record_unload();
+	hwt_ownerhash_unload();
+	hwt_contexthash_unload();
+	hwt_ctx_unload();
+	hwt_backend_unload();
+
+	return (0);
+}
+
+static int
+hwt_modevent(module_t mod, int type, void *data)
+{
+	int error;
+
+	switch (type) {
+	case MOD_LOAD:
+		error = hwt_load();
+		break;
+	case MOD_UNLOAD:
+		error = hwt_unload();
+		break;
+	default:
+		error = 0;
+		break;
+	}
+
+	return (error);
+}
+
+static moduledata_t hwt_mod = {
+	"hwt",
+	hwt_modevent,
+	NULL
+};
+
+DECLARE_MODULE(hwt, hwt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(hwt, 1);
diff --git a/sys/dev/hwt/hwt_backend.c b/sys/dev/hwt/hwt_backend.c
new file mode 100644
index 000000000000..1ba5db0d3d09
--- /dev/null
+++ b/sys/dev/hwt/hwt_backend.c
@@ -0,0 +1,289 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Hardware Trace (HWT) framework. */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/hwt.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_backend.h> + +#define HWT_BACKEND_DEBUG +#undef HWT_BACKEND_DEBUG + +#ifdef HWT_BACKEND_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif + +static struct mtx hwt_backend_mtx; + +struct hwt_backend_entry { + struct hwt_backend *backend; + LIST_ENTRY(hwt_backend_entry) next; +}; + +static LIST_HEAD(, hwt_backend_entry) hwt_backends; + +static MALLOC_DEFINE(M_HWT_BACKEND, "hwt_backend", "HWT backend"); + +int +hwt_backend_init(struct hwt_context *ctx) +{ + int error; + + dprintf("%s\n", __func__); + + error = ctx->hwt_backend->ops->hwt_backend_init(ctx); + + return (error); +} + +void +hwt_backend_deinit(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_deinit(ctx); +} + +int +hwt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) +{ + int error; + + dprintf("%s\n", __func__); + + error = ctx->hwt_backend->ops->hwt_backend_configure(ctx, cpu_id, + thread_id); + + return (error); +} + +void +hwt_backend_enable(struct hwt_context *ctx, int cpu_id) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_enable(ctx, cpu_id); +} + +void +hwt_backend_disable(struct hwt_context *ctx, int cpu_id) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_disable(ctx, cpu_id); +} + +void +hwt_backend_enable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_enable_smp(ctx); +} + +void +hwt_backend_disable_smp(struct hwt_context *ctx) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_disable_smp(ctx); +} + +void __unused +hwt_backend_dump(struct hwt_context *ctx, int cpu_id) +{ + + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_dump(cpu_id); +} + +int +hwt_backend_read(struct hwt_context *ctx, struct hwt_vm *vm, int *ident, + vm_offset_t *offset, uint64_t *data) +{ + int error; + + dprintf("%s\n", __func__); + + error = ctx->hwt_backend->ops->hwt_backend_read(vm, ident, + offset, data); + + return (error); +} + +struct hwt_backend * +hwt_backend_lookup(const char *name) +{ + struct hwt_backend_entry *entry; + struct hwt_backend *backend; + + HWT_BACKEND_LOCK(); + 
LIST_FOREACH(entry, &hwt_backends, next) { + backend = entry->backend; + if (strcmp(backend->name, name) == 0) { + HWT_BACKEND_UNLOCK(); + return (backend); + } + } + HWT_BACKEND_UNLOCK(); + + return (NULL); +} + +int +hwt_backend_register(struct hwt_backend *backend) +{ + struct hwt_backend_entry *entry; + + if (backend == NULL || + backend->name == NULL || + backend->ops == NULL) + return (EINVAL); + + entry = malloc(sizeof(struct hwt_backend_entry), M_HWT_BACKEND, + M_WAITOK | M_ZERO); + entry->backend = backend; + + HWT_BACKEND_LOCK(); + LIST_INSERT_HEAD(&hwt_backends, entry, next); + HWT_BACKEND_UNLOCK(); + + return (0); +} + +int +hwt_backend_unregister(struct hwt_backend *backend) +{ + struct hwt_backend_entry *entry, *tmp; + + if (backend == NULL) + return (EINVAL); + + /* TODO: check if not in use */ + + HWT_BACKEND_LOCK(); + LIST_FOREACH_SAFE(entry, &hwt_backends, next, tmp) { + if (entry->backend == backend) { + LIST_REMOVE(entry, next); + HWT_BACKEND_UNLOCK(); + free(entry, M_HWT_BACKEND); + return (0); + } + } + HWT_BACKEND_UNLOCK(); + + return (ENOENT); +} + +void +hwt_backend_load(void) +{ + + mtx_init(&hwt_backend_mtx, "hwt backend", NULL, MTX_DEF); + LIST_INIT(&hwt_backends); +} + +void +hwt_backend_unload(void) +{ + + /* TODO: ensure all unregistered */ + + mtx_destroy(&hwt_backend_mtx); +} + +void +hwt_backend_stop(struct hwt_context *ctx) +{ + dprintf("%s\n", __func__); + + ctx->hwt_backend->ops->hwt_backend_stop(ctx); +} + +int +hwt_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size, + int data_version) +{ + int error; + + dprintf("%s\n", __func__); + + error = ctx->hwt_backend->ops->hwt_backend_svc_buf(ctx, data, data_size, + data_version); + + return (error); +} + +int +hwt_backend_thread_alloc(struct hwt_context *ctx, struct hwt_thread *thr) +{ + int error; + + dprintf("%s\n", __func__); + + if (ctx->hwt_backend->ops->hwt_backend_thread_alloc == NULL) + return (0); + KASSERT(thr->private == NULL, + ("%s: thread private data is not NULL\n", __func__)); + error = ctx->hwt_backend->ops->hwt_backend_thread_alloc(thr); + + return (error); +} + +void +hwt_backend_thread_free(struct hwt_thread *thr) +{ + dprintf("%s\n", __func__); + + if (thr->backend->ops->hwt_backend_thread_free == NULL) + return; + KASSERT(thr->private != NULL, + ("%s: thread private data is NULL\n", __func__)); + thr->backend->ops->hwt_backend_thread_free(thr); + + return; +} diff --git a/sys/dev/hwt/hwt_backend.h b/sys/dev/hwt/hwt_backend.h new file mode 100644 index 000000000000..3b6c9442a7a6 --- /dev/null +++ b/sys/dev/hwt/hwt_backend.h @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
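The registration path above is what a tracing-unit driver calls at attach time; a sketch against the hwt_backend_ops/hwt_backend definitions in the header below (the "foo" names are hypothetical):

static struct hwt_backend_ops foo_ops = {
	.hwt_backend_init = foo_init,
	.hwt_backend_deinit = foo_deinit,
	.hwt_backend_configure = foo_configure,
	.hwt_backend_enable = foo_enable,
	.hwt_backend_disable = foo_disable,
	.hwt_backend_read = foo_read,
	.hwt_backend_stop = foo_stop,
};

static struct hwt_backend foo_backend = {
	.name = "foo",
	.ops = &foo_ops,
	.kva_req = true,	/* trace buffers need kernel virtual addresses */
};

error = hwt_backend_register(&foo_backend);	/* EINVAL on NULL name/ops */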
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_BACKEND_H_ +#define _DEV_HWT_HWT_BACKEND_H_ + +struct hwt_backend_ops { + int (*hwt_backend_init)(struct hwt_context *); + int (*hwt_backend_deinit)(struct hwt_context *); + int (*hwt_backend_configure)(struct hwt_context *, int cpu_id, + int thread_id); + int (*hwt_backend_svc_buf)(struct hwt_context *, void *data, + size_t data_size, int data_version); + void (*hwt_backend_enable)(struct hwt_context *, int cpu_id); + void (*hwt_backend_disable)(struct hwt_context *, int cpu_id); + int (*hwt_backend_read)(struct hwt_vm *, int *ident, + vm_offset_t *offset, uint64_t *data); + void (*hwt_backend_stop)(struct hwt_context *); + /* For backends that are tied to local CPU registers */ + int (*hwt_backend_enable_smp)(struct hwt_context *); + int (*hwt_backend_disable_smp)(struct hwt_context *); + /* Allocation and initialization of backend-specific thread data. */ + int (*hwt_backend_thread_alloc)(struct hwt_thread *); + void (*hwt_backend_thread_free)(struct hwt_thread *); + /* Debugging only. 
*/ + void (*hwt_backend_dump)(int cpu_id); +}; + +struct hwt_backend { + const char *name; + struct hwt_backend_ops *ops; + /* buffers require kernel virtual addresses */ + bool kva_req; +}; + +int hwt_backend_init(struct hwt_context *ctx); +void hwt_backend_deinit(struct hwt_context *ctx); +int hwt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id); +void hwt_backend_enable(struct hwt_context *ctx, int cpu_id); +void hwt_backend_disable(struct hwt_context *ctx, int cpu_id); +void hwt_backend_enable_smp(struct hwt_context *ctx); +void hwt_backend_disable_smp(struct hwt_context *ctx); +void hwt_backend_dump(struct hwt_context *ctx, int cpu_id); +int hwt_backend_read(struct hwt_context *ctx, struct hwt_vm *vm, int *ident, + vm_offset_t *offset, uint64_t *data); +int hwt_backend_register(struct hwt_backend *); +int hwt_backend_unregister(struct hwt_backend *); +void hwt_backend_stop(struct hwt_context *); +int hwt_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size, + int data_version); +struct hwt_backend * hwt_backend_lookup(const char *name); +int hwt_backend_thread_alloc(struct hwt_context *ctx, struct hwt_thread *); +void hwt_backend_thread_free(struct hwt_thread *); + +void hwt_backend_load(void); +void hwt_backend_unload(void); + +#define HWT_BACKEND_LOCK() mtx_lock(&hwt_backend_mtx) +#define HWT_BACKEND_UNLOCK() mtx_unlock(&hwt_backend_mtx) + +#endif /* !_DEV_HWT_HWT_BACKEND_H_ */ + diff --git a/sys/dev/hwt/hwt_config.c b/sys/dev/hwt/hwt_config.c new file mode 100644 index 000000000000..30688e7fc76b --- /dev/null +++ b/sys/dev/hwt/hwt_config.c @@ -0,0 +1,108 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/lock.h> +#include <sys/hwt.h> + +#include <vm/vm.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_record.h> + +#define HWT_MAXCONFIGSIZE PAGE_SIZE + +#define HWT_CONFIG_DEBUG +#undef HWT_CONFIG_DEBUG + +#ifdef HWT_CONFIG_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif + +static MALLOC_DEFINE(M_HWT_CONFIG, "hwt_config", "HWT config"); + +int +hwt_config_set(struct thread *td, struct hwt_context *ctx, + struct hwt_set_config *sconf) +{ + size_t config_size; + void *old_config; + void *config; + int error; + + config_size = sconf->config_size; + if (config_size == 0) + return (0); + + if (config_size > HWT_MAXCONFIGSIZE) + return (EFBIG); + + config = malloc(config_size, M_HWT_CONFIG, M_WAITOK | M_ZERO); + + error = copyin(sconf->config, config, config_size); + if (error) { + free(config, M_HWT_CONFIG); + return (error); + } + + HWT_CTX_LOCK(ctx); + old_config = ctx->config; + ctx->config = config; + ctx->config_size = sconf->config_size; + ctx->config_version = sconf->config_version; + HWT_CTX_UNLOCK(ctx); + + if (old_config != NULL) + free(old_config, M_HWT_CONFIG); + + return (error); +} + +void +hwt_config_free(struct hwt_context *ctx) +{ + + if (ctx->config == NULL) + return; + + free(ctx->config, M_HWT_CONFIG); + + ctx->config = NULL; +} diff --git a/sys/dev/hwt/hwt_config.h b/sys/dev/hwt/hwt_config.h new file mode 100644 index 000000000000..47485583063c --- /dev/null +++ b/sys/dev/hwt/hwt_config.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _DEV_HWT_HWT_CONFIG_H_ +#define _DEV_HWT_HWT_CONFIG_H_ + +int hwt_config_set(struct thread *td, struct hwt_context *ctx, + struct hwt_set_config *sconf); +void hwt_config_free(struct hwt_context *ctx); + +#endif /* !_DEV_HWT_HWT_CONFIG_H_ */ diff --git a/sys/dev/hwt/hwt_context.c b/sys/dev/hwt/hwt_context.c new file mode 100644 index 000000000000..9af76cffc928 --- /dev/null +++ b/sys/dev/hwt/hwt_context.c @@ -0,0 +1,201 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/bitstring.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/hwt.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_owner.h> +#include <dev/hwt/hwt_vm.h> +#include <dev/hwt/hwt_cpu.h> + +#define HWT_DEBUG +#undef HWT_DEBUG + +#ifdef HWT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) 
+#endif + +static MALLOC_DEFINE(M_HWT_CTX, "hwt_ctx", "Hardware Trace"); + +static bitstr_t *ident_set; +static int ident_set_size; +static struct mtx ident_set_mutex; + +static int +hwt_ctx_ident_alloc(int *new_ident) +{ + + mtx_lock(&ident_set_mutex); + bit_ffc(ident_set, ident_set_size, new_ident); + if (*new_ident == -1) { + mtx_unlock(&ident_set_mutex); + return (ENOMEM); + } + bit_set(ident_set, *new_ident); + mtx_unlock(&ident_set_mutex); + + return (0); +} + +static void +hwt_ctx_ident_free(int ident) +{ + + mtx_lock(&ident_set_mutex); + bit_clear(ident_set, ident); + mtx_unlock(&ident_set_mutex); +} + +int +hwt_ctx_alloc(struct hwt_context **ctx0) +{ + struct hwt_context *ctx; + int error; + + ctx = malloc(sizeof(struct hwt_context), M_HWT_CTX, M_WAITOK | M_ZERO); + + TAILQ_INIT(&ctx->records); + TAILQ_INIT(&ctx->threads); + TAILQ_INIT(&ctx->cpus); + mtx_init(&ctx->mtx, "ctx", NULL, MTX_SPIN); + mtx_init(&ctx->rec_mtx, "ctx_rec", NULL, MTX_DEF); + refcount_init(&ctx->refcnt, 0); + + error = hwt_ctx_ident_alloc(&ctx->ident); + if (error) { + printf("could not allocate ident bit str\n"); + return (error); + } + + *ctx0 = ctx; + + return (0); +} + +static void +hwt_ctx_free_cpus(struct hwt_context *ctx) +{ + struct hwt_cpu *cpu; + + do { + HWT_CTX_LOCK(ctx); + cpu = TAILQ_FIRST(&ctx->cpus); + if (cpu) + TAILQ_REMOVE(&ctx->cpus, cpu, next); + HWT_CTX_UNLOCK(ctx); + + if (cpu == NULL) + break; + + /* TODO: move vm_free() to cpu_free()? */ + hwt_vm_free(cpu->vm); + hwt_cpu_free(cpu); + } while (1); +} + +static void +hwt_ctx_free_threads(struct hwt_context *ctx) +{ + struct hwt_thread *thr; + + dprintf("%s: remove threads\n", __func__); + + do { + HWT_CTX_LOCK(ctx); + thr = TAILQ_FIRST(&ctx->threads); + if (thr) + TAILQ_REMOVE(&ctx->threads, thr, next); + HWT_CTX_UNLOCK(ctx); + + if (thr == NULL) + break; + + HWT_THR_LOCK(thr); + /* TODO: check if thr is sleeping before waking it up. */ + wakeup(thr); + HWT_THR_UNLOCK(thr); + + if (refcount_release(&thr->refcnt)) + hwt_thread_free(thr); + } while (1); +} + +void +hwt_ctx_free(struct hwt_context *ctx) +{ + + if (ctx->mode == HWT_MODE_CPU) + hwt_ctx_free_cpus(ctx); + else + hwt_ctx_free_threads(ctx); + + hwt_config_free(ctx); + hwt_ctx_ident_free(ctx->ident); + free(ctx, M_HWT_CTX); +} + +void +hwt_ctx_put(struct hwt_context *ctx) +{ + + refcount_release(&ctx->refcnt); +} + +void +hwt_ctx_load(void) +{ + + ident_set_size = (1 << 8); + ident_set = bit_alloc(ident_set_size, M_HWT_CTX, M_WAITOK); + mtx_init(&ident_set_mutex, "ident set", NULL, MTX_DEF); +} + +void +hwt_ctx_unload(void) +{ + + mtx_destroy(&ident_set_mutex); + free(ident_set, M_HWT_CTX); +} diff --git a/sys/dev/hwt/hwt_context.h b/sys/dev/hwt/hwt_context.h new file mode 100644 index 000000000000..cafb197ae348 --- /dev/null +++ b/sys/dev/hwt/hwt_context.h @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
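For reference, the ident allocation in hwt_context.c above is a first-fit scan over a 256-entry bitstring (ident_set_size = 1 << 8); the same sys/bitstring.h primitives in isolation:

bitstr_t *set = bit_alloc(256, M_TEMP, M_WAITOK);
int ident;

bit_ffc(set, 256, &ident);	/* first clear bit; -1 when all in use */
if (ident != -1)
	bit_set(set, ident);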
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_CONTEXT_H_ +#define _DEV_HWT_HWT_CONTEXT_H_ + +enum hwt_ctx_state { + CTX_STATE_STOPPED, + CTX_STATE_RUNNING, +}; + +struct hwt_context { + TAILQ_HEAD(, hwt_record_entry) records; + + LIST_ENTRY(hwt_context) next_hch; /* Entry in contexthash. */ + LIST_ENTRY(hwt_context) next_hwts; /* Entry in ho->hwts. */ + + int mode; + int ident; + + int kqueue_fd; + struct thread *hwt_td; + + /* CPU mode. */ + cpuset_t cpu_map; + TAILQ_HEAD(, hwt_cpu) cpus; + + /* Thread mode. */ + struct proc *proc; /* Target proc. */ + pid_t pid; /* Target pid. */ + TAILQ_HEAD(, hwt_thread) threads; + int thread_counter; + int pause_on_mmap; + + size_t bufsize; /* Trace bufsize for each vm.*/ + + void *config; + size_t config_size; + int config_version; + + struct hwt_owner *hwt_owner; + struct hwt_backend *hwt_backend; + + struct mtx mtx; + struct mtx rec_mtx; + enum hwt_ctx_state state; + int refcnt; +}; + +#define HWT_CTX_LOCK(ctx) mtx_lock_spin(&(ctx)->mtx) +#define HWT_CTX_UNLOCK(ctx) mtx_unlock_spin(&(ctx)->mtx) +#define HWT_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->mtx, MA_OWNED) + +int hwt_ctx_alloc(struct hwt_context **ctx0); +void hwt_ctx_free(struct hwt_context *ctx); +void hwt_ctx_put(struct hwt_context *ctx); + +void hwt_ctx_load(void); +void hwt_ctx_unload(void); + +#endif /* !_DEV_HWT_HWT_CONTEXT_H_ */ diff --git a/sys/dev/hwt/hwt_contexthash.c b/sys/dev/hwt/hwt_contexthash.c new file mode 100644 index 000000000000..5682b7d38e5e --- /dev/null +++ b/sys/dev/hwt/hwt_contexthash.c @@ -0,0 +1,134 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/hwt.h> + +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> + +#define HWT_DEBUG +#undef HWT_DEBUG + +#ifdef HWT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif + +#define HWT_CONTEXTHASH_SIZE 1024 + +static MALLOC_DEFINE(M_HWT_CONTEXTHASH, "hwt_chash", "Hardware Trace"); + +/* + * Hash function. Discard the lower 2 bits of the pointer since + * these are always zero for our uses. The hash multiplier is + * round((2^LONG_BIT) * ((sqrt(5)-1)/2)). + */ + +#define _HWT_HM 11400714819323198486u /* hash multiplier */ +#define HWT_HASH_PTR(P, M) ((((unsigned long) (P) >> 2) * _HWT_HM) & (M)) + +static struct mtx hwt_contexthash_mtx; +static u_long hwt_contexthashmask; +static LIST_HEAD(hwt_contexthash, hwt_context) *hwt_contexthash; + +/* + * To use by hwt_switch_in/out() and hwt_record() only. + * This function returns with refcnt acquired. + */ +struct hwt_context * +hwt_contexthash_lookup(struct proc *p) +{ + struct hwt_contexthash *hch; + struct hwt_context *ctx; + int hindex; + + hindex = HWT_HASH_PTR(p, hwt_contexthashmask); + hch = &hwt_contexthash[hindex]; + + HWT_CTXHASH_LOCK(); + LIST_FOREACH(ctx, hch, next_hch) { + if (ctx->proc == p) { + refcount_acquire(&ctx->refcnt); + HWT_CTXHASH_UNLOCK(); + return (ctx); + } + } + HWT_CTXHASH_UNLOCK(); + + return (NULL); +} + +void +hwt_contexthash_insert(struct hwt_context *ctx) +{ + struct hwt_contexthash *hch; + int hindex; + + hindex = HWT_HASH_PTR(ctx->proc, hwt_contexthashmask); + hch = &hwt_contexthash[hindex]; + + HWT_CTXHASH_LOCK(); + LIST_INSERT_HEAD(hch, ctx, next_hch); + HWT_CTXHASH_UNLOCK(); +} + +void +hwt_contexthash_remove(struct hwt_context *ctx) +{ + + HWT_CTXHASH_LOCK(); + LIST_REMOVE(ctx, next_hch); + HWT_CTXHASH_UNLOCK(); +} + +void +hwt_contexthash_load(void) +{ + + hwt_contexthash = hashinit(HWT_CONTEXTHASH_SIZE, M_HWT_CONTEXTHASH, + &hwt_contexthashmask); + mtx_init(&hwt_contexthash_mtx, "hwt ctx hash", "hwt ctx", MTX_SPIN); +} + +void +hwt_contexthash_unload(void) +{ + + mtx_destroy(&hwt_contexthash_mtx); + hashdestroy(hwt_contexthash, M_HWT_CONTEXTHASH, hwt_contexthashmask); +} diff --git a/sys/dev/hwt/hwt_contexthash.h b/sys/dev/hwt/hwt_contexthash.h new file mode 100644 index 000000000000..c3ab7acd2a74 --- /dev/null +++ b/sys/dev/hwt/hwt_contexthash.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_CONTEXTHASH_H_ +#define _DEV_HWT_HWT_CONTEXTHASH_H_ + +struct hwt_context * hwt_contexthash_lookup(struct proc *p); +void hwt_contexthash_insert(struct hwt_context *ctx); +void hwt_contexthash_remove(struct hwt_context *ctx); + +void hwt_contexthash_load(void); +void hwt_contexthash_unload(void); + +#define HWT_CTXHASH_LOCK() mtx_lock_spin(&hwt_contexthash_mtx) +#define HWT_CTXHASH_UNLOCK() mtx_unlock_spin(&hwt_contexthash_mtx) + +#endif /* !_DEV_HWT_HWT_CONTEXTHASH_H_ */ diff --git a/sys/dev/hwt/hwt_cpu.c b/sys/dev/hwt/hwt_cpu.c new file mode 100644 index 000000000000..7d38eb082e65 --- /dev/null +++ b/sys/dev/hwt/hwt_cpu.c @@ -0,0 +1,115 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
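A usage sketch for the context hash above: hwt_contexthash_lookup() returns with the context refcount held, so every caller must pair it with hwt_ctx_put(). The probe function below is hypothetical, for illustration only:

	static void
	example_probe(struct proc *p)
	{
		struct hwt_context *ctx;

		ctx = hwt_contexthash_lookup(p);	/* refcnt acquired on success */
		if (ctx == NULL)
			return;				/* p is not being traced */
		if (ctx->state == CTX_STATE_RUNNING)
			printf("%s: ctx ident %d is running\n", __func__, ctx->ident);
		hwt_ctx_put(ctx);			/* release the lookup reference */
	}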
+ */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/hwt.h> + +#include <vm/vm.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_record.h> +#include <dev/hwt/hwt_cpu.h> + +#define HWT_CPU_DEBUG +#undef HWT_CPU_DEBUG + +#ifdef HWT_CPU_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif + +static MALLOC_DEFINE(M_HWT_CPU, "hwt_cpu", "HWT cpu"); + +struct hwt_cpu * +hwt_cpu_alloc(void) +{ + struct hwt_cpu *cpu; + + cpu = malloc(sizeof(struct hwt_cpu), M_HWT_CPU, M_WAITOK | M_ZERO); + + return (cpu); +} + +void +hwt_cpu_free(struct hwt_cpu *cpu) +{ + + free(cpu, M_HWT_CPU); +} + +struct hwt_cpu * +hwt_cpu_first(struct hwt_context *ctx) +{ + struct hwt_cpu *cpu; + + HWT_CTX_ASSERT_LOCKED(ctx); + + cpu = TAILQ_FIRST(&ctx->cpus); + + KASSERT(cpu != NULL, ("cpu is NULL")); + + return (cpu); +} + +struct hwt_cpu * +hwt_cpu_get(struct hwt_context *ctx, int cpu_id) +{ + struct hwt_cpu *cpu, *tcpu; + + HWT_CTX_ASSERT_LOCKED(ctx); + + TAILQ_FOREACH_SAFE(cpu, &ctx->cpus, next, tcpu) { + KASSERT(cpu != NULL, ("cpu is NULL")); + if (cpu->cpu_id == cpu_id) { + return cpu; + } + } + + return (NULL); +} + +void +hwt_cpu_insert(struct hwt_context *ctx, struct hwt_cpu *cpu) +{ + + HWT_CTX_ASSERT_LOCKED(ctx); + + TAILQ_INSERT_TAIL(&ctx->cpus, cpu, next); +} diff --git a/sys/dev/hwt/hwt_cpu.h b/sys/dev/hwt/hwt_cpu.h new file mode 100644 index 000000000000..92b89229b6e4 --- /dev/null +++ b/sys/dev/hwt/hwt_cpu.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
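Taken together, the helpers above let CPU-mode setup build the per-context CPU list. A sketch of the pattern (error handling omitted; "vm" is the trace buffer allocated for this CPU, exactly as the ioctl code later in this commit does it):

	cpu = hwt_cpu_alloc();
	cpu->cpu_id = cpu_id;
	cpu->vm = vm;

	HWT_CTX_LOCK(ctx);
	hwt_cpu_insert(ctx, cpu);
	HWT_CTX_UNLOCK(ctx);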
+ */ + +#ifndef _DEV_HWT_HWT_CPU_H_ +#define _DEV_HWT_HWT_CPU_H_ + +struct hwt_cpu { + int cpu_id; + struct hwt_vm *vm; + TAILQ_ENTRY(hwt_cpu) next; +}; + +struct hwt_cpu * hwt_cpu_alloc(void); +void hwt_cpu_free(struct hwt_cpu *cpu); + +struct hwt_cpu * hwt_cpu_first(struct hwt_context *ctx); +struct hwt_cpu * hwt_cpu_get(struct hwt_context *ctx, int cpu_id); +void hwt_cpu_insert(struct hwt_context *ctx, struct hwt_cpu *cpu); + +#endif /* !_DEV_HWT_HWT_CPU_H_ */ diff --git a/sys/dev/hwt/hwt_hook.c b/sys/dev/hwt/hwt_hook.c new file mode 100644 index 000000000000..258279b14f20 --- /dev/null +++ b/sys/dev/hwt/hwt_hook.c @@ -0,0 +1,323 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Hardware Trace (HWT) framework. */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/refcount.h> +#include <sys/hwt.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_owner.h> +#include <dev/hwt/hwt_backend.h> +#include <dev/hwt/hwt_record.h> +#include <dev/hwt/hwt_vm.h> + +#define HWT_DEBUG +#undef HWT_DEBUG + +#ifdef HWT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) 
+#endif + +static void +hwt_switch_in(struct thread *td) +{ + struct hwt_context *ctx; + struct hwt_thread *thr; + struct proc *p; + int cpu_id; + + p = td->td_proc; + + cpu_id = PCPU_GET(cpuid); + + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) + return; + + if (ctx->state != CTX_STATE_RUNNING) { + hwt_ctx_put(ctx); + return; + } + + thr = hwt_thread_lookup(ctx, td); + if (thr == NULL) { + hwt_ctx_put(ctx); + return; + } + + dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr, + thr->thread_id, td->td_tid, cpu_id); + + hwt_backend_configure(ctx, cpu_id, thr->thread_id); + hwt_backend_enable(ctx, cpu_id); + + hwt_ctx_put(ctx); +} + +static void +hwt_switch_out(struct thread *td) +{ + struct hwt_context *ctx; + struct hwt_thread *thr; + struct proc *p; + int cpu_id; + + p = td->td_proc; + + cpu_id = PCPU_GET(cpuid); + + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) + return; + + if (ctx->state != CTX_STATE_RUNNING) { + hwt_ctx_put(ctx); + return; + } + thr = hwt_thread_lookup(ctx, td); + if (thr == NULL) { + hwt_ctx_put(ctx); + return; + } + + dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr, + thr->thread_id, td->td_tid, cpu_id); + + hwt_backend_disable(ctx, cpu_id); + + hwt_ctx_put(ctx); +} + +static void +hwt_hook_thread_exit(struct thread *td) +{ + struct hwt_context *ctx; + struct hwt_thread *thr; + struct proc *p; + int cpu_id; + + p = td->td_proc; + + cpu_id = PCPU_GET(cpuid); + + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) + return; + + thr = hwt_thread_lookup(ctx, td); + if (thr == NULL) { + hwt_ctx_put(ctx); + return; + } + + thr->state = HWT_THREAD_STATE_EXITED; + + dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr, + thr->thread_id, td->td_tid, cpu_id); + + if (ctx->state == CTX_STATE_RUNNING) + hwt_backend_disable(ctx, cpu_id); + + hwt_ctx_put(ctx); +} + +static void +hwt_hook_mmap(struct thread *td) +{ + struct hwt_context *ctx; + struct hwt_thread *thr; + struct proc *p; + int pause; + + p = td->td_proc; + + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) + return; + + /* The ctx state could be any here. */ + + pause = ctx->pause_on_mmap ? 1 : 0; + + thr = hwt_thread_lookup(ctx, td); + if (thr == NULL) { + hwt_ctx_put(ctx); + return; + } + + /* + * msleep(9) atomically releases the mtx lock, so take refcount + * to ensure that thr is not destroyed. + * It could not be destroyed prior to this call as we are holding ctx + * refcnt. + */ + refcount_acquire(&thr->refcnt); + hwt_ctx_put(ctx); + + if (pause) { + HWT_THR_LOCK(thr); + msleep(thr, &thr->mtx, PCATCH, "hwt-mmap", 0); + HWT_THR_UNLOCK(thr); + } + + if (refcount_release(&thr->refcnt)) + hwt_thread_free(thr); +} + +static int +hwt_hook_thread_create(struct thread *td) +{ + struct hwt_record_entry *entry; + struct hwt_context *ctx; + struct hwt_thread *thr; + char path[MAXPATHLEN]; + size_t bufsize; + struct proc *p; + int thread_id, kva_req; + int error; + + p = td->td_proc; + + /* Step 1. Get CTX and collect information needed. */ + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) + return (ENXIO); + thread_id = atomic_fetchadd_int(&ctx->thread_counter, 1); + bufsize = ctx->bufsize; + kva_req = ctx->hwt_backend->kva_req; + sprintf(path, "hwt_%d_%d", ctx->ident, thread_id); + hwt_ctx_put(ctx); + + /* Step 2. Allocate some memory without holding ctx ref. 
*/ + error = hwt_thread_alloc(&thr, path, bufsize, kva_req); + if (error) { + printf("%s: could not allocate thread, error %d\n", + __func__, error); + return (error); + } + + entry = hwt_record_entry_alloc(); + entry->record_type = HWT_RECORD_THREAD_CREATE; + entry->thread_id = thread_id; + + /* Step 3. Get CTX once again. */ + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) { + hwt_record_entry_free(entry); + hwt_thread_free(thr); + /* ctx->thread_counter does not matter. */ + return (ENXIO); + } + /* Allocate backend-specific thread data. */ + error = hwt_backend_thread_alloc(ctx, thr); + if (error != 0) { + dprintf("%s: failed to allocate backend thread data\n", + __func__); + return (error); + } + + thr->vm->ctx = ctx; + thr->ctx = ctx; + thr->backend = ctx->hwt_backend; + thr->thread_id = thread_id; + thr->td = td; + + HWT_CTX_LOCK(ctx); + hwt_thread_insert(ctx, thr, entry); + HWT_CTX_UNLOCK(ctx); + + /* Notify userspace. */ + hwt_record_wakeup(ctx); + + hwt_ctx_put(ctx); + + return (0); +} + +static void +hwt_hook_handler(struct thread *td, int func, void *arg) +{ + struct proc *p; + + p = td->td_proc; + if ((p->p_flag2 & P2_HWT) == 0) + return; + + switch (func) { + case HWT_SWITCH_IN: + hwt_switch_in(td); + break; + case HWT_SWITCH_OUT: + hwt_switch_out(td); + break; + case HWT_THREAD_CREATE: + hwt_hook_thread_create(td); + break; + case HWT_THREAD_SET_NAME: + /* TODO. */ + break; + case HWT_THREAD_EXIT: + hwt_hook_thread_exit(td); + break; + case HWT_EXEC: + case HWT_MMAP: + hwt_record_td(td, arg, M_WAITOK | M_ZERO); + hwt_hook_mmap(td); + break; + case HWT_RECORD: + hwt_record_td(td, arg, M_WAITOK | M_ZERO); + break; + }; +} + +void +hwt_hook_load(void) +{ + + hwt_hook = hwt_hook_handler; +} + +void +hwt_hook_unload(void) +{ + + hwt_hook = NULL; +} diff --git a/sys/dev/hwt/hwt_hook.h b/sys/dev/hwt/hwt_hook.h new file mode 100644 index 000000000000..a8eccba3ec43 --- /dev/null +++ b/sys/dev/hwt/hwt_hook.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
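The three-step layout of hwt_hook_thread_create() above encodes a refcount discipline: hwt_owner_shutdown() (later in this commit) spins until ctx->refcnt drains, so the reference must not be held across sleeping M_WAITOK allocations. Condensed into a sketch:

	ctx = hwt_contexthash_lookup(p);	/* step 1: take ref, copy what we need */
	bufsize = ctx->bufsize;
	hwt_ctx_put(ctx);			/* drop ref before we may sleep */

	error = hwt_thread_alloc(&thr, path, bufsize, kva_req);	/* may sleep */

	ctx = hwt_contexthash_lookup(p);	/* step 3: revalidate; ctx may be gone */
	if (ctx == NULL)
		hwt_thread_free(thr);		/* tracing stopped meanwhile */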
+ */ + +#include <sys/hwt_record.h> + +#ifndef _DEV_HWT_HWT_HOOK_H_ +#define _DEV_HWT_HWT_HOOK_H_ + +#define HWT_SWITCH_IN 0 +#define HWT_SWITCH_OUT 1 +#define HWT_THREAD_EXIT 2 +#define HWT_THREAD_CREATE 3 +#define HWT_THREAD_SET_NAME 4 +#define HWT_RECORD 5 +#define HWT_MMAP 6 +#define HWT_EXEC 7 + +#define HWT_CALL_HOOK(td, func, arg) \ +do { \ + if (hwt_hook != NULL) \ + (hwt_hook)((td), (func), (arg)); \ +} while (0) + +#define HWT_HOOK_INSTALLED (hwt_hook != NULL) + +extern void (*hwt_hook)(struct thread *td, int func, void *arg); + +void hwt_hook_load(void); +void hwt_hook_unload(void); + +#endif /* !_DEV_HWT_HWT_HOOK_H_ */ diff --git a/sys/dev/hwt/hwt_intr.h b/sys/dev/hwt/hwt_intr.h new file mode 100644 index 000000000000..e601969f001c --- /dev/null +++ b/sys/dev/hwt/hwt_intr.h @@ -0,0 +1,33 @@ +/*- + * Copyright (c) 2023-2025 Bojan Novković <bnovkov@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_INTR_H_ +#define _DEV_HWT_HWT_INTR_H_ + +#include <machine/frame.h> + +extern int (*hwt_intr)(struct trapframe *tf); + +#endif /* !_DEV_HWT_HWT_INTR_H_ */ diff --git a/sys/dev/hwt/hwt_ioctl.c b/sys/dev/hwt/hwt_ioctl.c new file mode 100644 index 000000000000..592db4931bb4 --- /dev/null +++ b/sys/dev/hwt/hwt_ioctl.c @@ -0,0 +1,445 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
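hwt_hook.h above is the kernel-side interface: call sites fire events through HWT_CALL_HOOK(), which already checks that a tracer is installed. A hypothetical call site in the context-switch path might read as follows (the real call sites are wired up elsewhere in this commit):

	HWT_CALL_HOOK(oldtd, HWT_SWITCH_OUT, NULL);
	/* ... switch from oldtd to newtd ... */
	HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);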
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Hardware Trace (HWT) framework. */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/ioccom.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_ioctl.h>
+#include <dev/hwt/hwt_vm.h>
+
+#define HWT_IOCTL_DEBUG
+#undef HWT_IOCTL_DEBUG
+
+#ifdef HWT_IOCTL_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+/* No real reason for these limitations, just sanity checks. */
+#define HWT_MAXBUFSIZE (32UL * 1024 * 1024 * 1024) /* 32 GB */
+
+static MALLOC_DEFINE(M_HWT_IOCTL, "hwt_ioctl", "Hardware Trace");
+
+/*
+ * Check if owner process *o can trace target process *t.
+ */
+
+static int
+hwt_priv_check(struct proc *o, struct proc *t)
+{
+	struct ucred *oc, *tc;
+	int error;
+	int i;
+
+	PROC_LOCK(o);
+	oc = o->p_ucred;
+	crhold(oc);
+	PROC_UNLOCK(o);
+
+	PROC_LOCK_ASSERT(t, MA_OWNED);
+	tc = t->p_ucred;
+	crhold(tc);
+
+	error = 0;
+
+	/*
+	 * The effective uid of the HWT owner should match at least one
+	 * of the effective / real / saved uids of the target process.
+	 */
+
+	if (oc->cr_uid != tc->cr_uid &&
+	    oc->cr_uid != tc->cr_svuid &&
+	    oc->cr_uid != tc->cr_ruid) {
+		error = EPERM;
+		goto done;
+	}
+
+	/*
+	 * Every one of the target's group IDs must be in the owner's
+	 * group list.
+	 */
+	for (i = 0; i < tc->cr_ngroups; i++)
+		if (!groupmember(tc->cr_groups[i], oc)) {
+			error = EPERM;
+			goto done;
+		}
+
+	/* Check the real and saved GIDs too. */
+	if (!groupmember(tc->cr_rgid, oc) ||
+	    !groupmember(tc->cr_svgid, oc)) {
+		error = EPERM;
+		goto done;
+	}
+
+done:
+	crfree(tc);
+	crfree(oc);
+
+	return (error);
+}
+
+static int
+hwt_ioctl_alloc_mode_thread(struct thread *td, struct hwt_owner *ho,
+    struct hwt_backend *backend, struct hwt_alloc *halloc)
+{
+	struct thread **threads, *td1;
+	struct hwt_record_entry *entry;
+	struct hwt_context *ctx, *ctx1;
+	struct hwt_thread *thr;
+	char path[MAXPATHLEN];
+	struct proc *p;
+	int thread_id;
+	int error;
+	int cnt;
+	int i;
+
+	/* Check if the owner has this pid configured already. */
+	ctx = hwt_owner_lookup_ctx(ho, halloc->pid);
+	if (ctx)
+		return (EEXIST);
+
+	/* Allocate a new HWT context.
*/ + error = hwt_ctx_alloc(&ctx); + if (error) + return (error); + ctx->bufsize = halloc->bufsize; + ctx->pid = halloc->pid; + ctx->hwt_backend = backend; + ctx->hwt_owner = ho; + ctx->mode = HWT_MODE_THREAD; + ctx->hwt_td = td; + ctx->kqueue_fd = halloc->kqueue_fd; + + error = copyout(&ctx->ident, halloc->ident, sizeof(int)); + if (error) { + hwt_ctx_free(ctx); + return (error); + } + + /* Now get the victim proc. */ + p = pfind(halloc->pid); + if (p == NULL) { + hwt_ctx_free(ctx); + return (ENXIO); + } + + /* Ensure we can trace it. */ + error = hwt_priv_check(td->td_proc, p); + if (error) { + PROC_UNLOCK(p); + hwt_ctx_free(ctx); + return (error); + } + + /* Ensure it is not being traced already. */ + ctx1 = hwt_contexthash_lookup(p); + if (ctx1) { + refcount_release(&ctx1->refcnt); + PROC_UNLOCK(p); + hwt_ctx_free(ctx); + return (EEXIST); + } + + /* Allocate hwt threads and buffers. */ + + cnt = 0; + + FOREACH_THREAD_IN_PROC(p, td1) { + cnt += 1; + } + + KASSERT(cnt > 0, ("no threads")); + + threads = malloc(sizeof(struct thread *) * cnt, M_HWT_IOCTL, + M_NOWAIT | M_ZERO); + if (threads == NULL) { + PROC_UNLOCK(p); + hwt_ctx_free(ctx); + return (ENOMEM); + } + + i = 0; + + FOREACH_THREAD_IN_PROC(p, td1) { + threads[i++] = td1; + } + + ctx->proc = p; + PROC_UNLOCK(p); + + for (i = 0; i < cnt; i++) { + thread_id = atomic_fetchadd_int(&ctx->thread_counter, 1); + sprintf(path, "hwt_%d_%d", ctx->ident, thread_id); + + error = hwt_thread_alloc(&thr, path, ctx->bufsize, + ctx->hwt_backend->kva_req); + if (error) { + free(threads, M_HWT_IOCTL); + hwt_ctx_free(ctx); + return (error); + } + /* Allocate backend-specific thread data. */ + error = hwt_backend_thread_alloc(ctx, thr); + if (error != 0) { + dprintf("%s: failed to allocate thread backend data\n", + __func__); + free(threads, M_HWT_IOCTL); + hwt_ctx_free(ctx); + return (error); + } + + /* + * Insert a THREAD_CREATE record so userspace picks up + * the thread's tracing buffers. + */ + entry = hwt_record_entry_alloc(); + entry->record_type = HWT_RECORD_THREAD_CREATE; + entry->thread_id = thread_id; + + thr->vm->ctx = ctx; + thr->td = threads[i]; + thr->ctx = ctx; + thr->backend = ctx->hwt_backend; + thr->thread_id = thread_id; + + HWT_CTX_LOCK(ctx); + hwt_thread_insert(ctx, thr, entry); + HWT_CTX_UNLOCK(ctx); + } + + free(threads, M_HWT_IOCTL); + + error = hwt_backend_init(ctx); + if (error) { + hwt_ctx_free(ctx); + return (error); + } + + /* hwt_owner_insert_ctx? */ + mtx_lock(&ho->mtx); + LIST_INSERT_HEAD(&ho->hwts, ctx, next_hwts); + mtx_unlock(&ho->mtx); + + /* + * Hooks are now in action after this, but the ctx is not in RUNNING + * state. + */ + hwt_contexthash_insert(ctx); + + p = pfind(halloc->pid); + if (p) { + p->p_flag2 |= P2_HWT; + PROC_UNLOCK(p); + } + + return (0); +} + +static int +hwt_ioctl_alloc_mode_cpu(struct thread *td, struct hwt_owner *ho, + struct hwt_backend *backend, struct hwt_alloc *halloc) +{ + struct hwt_context *ctx; + struct hwt_cpu *cpu; + struct hwt_vm *vm; + char path[MAXPATHLEN]; + size_t cpusetsize; + cpuset_t cpu_map; + int cpu_count = 0; + int cpu_id; + int error; + + CPU_ZERO(&cpu_map); + cpusetsize = min(halloc->cpusetsize, sizeof(cpuset_t)); + error = copyin(halloc->cpu_map, &cpu_map, cpusetsize); + if (error) + return (error); + + CPU_FOREACH_ISSET(cpu_id, &cpu_map) { +#ifdef SMP + /* Ensure CPU is not halted. */ + if (CPU_ISSET(cpu_id, &hlt_cpus_mask)) + return (ENXIO); +#endif +#if 0 + /* TODO: Check if the owner have this cpu configured already. 
*/ + ctx = hwt_owner_lookup_ctx_by_cpu(ho, halloc->cpu); + if (ctx) + return (EEXIST); +#endif + + cpu_count++; + } + + if (cpu_count == 0) + return (ENODEV); + + /* Allocate a new HWT context. */ + error = hwt_ctx_alloc(&ctx); + if (error) + return (error); + ctx->bufsize = halloc->bufsize; + ctx->hwt_backend = backend; + ctx->hwt_owner = ho; + ctx->mode = HWT_MODE_CPU; + ctx->cpu_map = cpu_map; + ctx->hwt_td = td; + ctx->kqueue_fd = halloc->kqueue_fd; + + error = copyout(&ctx->ident, halloc->ident, sizeof(int)); + if (error) { + hwt_ctx_free(ctx); + return (error); + } + + CPU_FOREACH_ISSET(cpu_id, &cpu_map) { + sprintf(path, "hwt_%d_%d", ctx->ident, cpu_id); + error = hwt_vm_alloc(ctx->bufsize, ctx->hwt_backend->kva_req, + path, &vm); + if (error) { + /* TODO: remove all allocated cpus. */ + hwt_ctx_free(ctx); + return (error); + } + + cpu = hwt_cpu_alloc(); + cpu->cpu_id = cpu_id; + cpu->vm = vm; + + vm->cpu = cpu; + vm->ctx = ctx; + + HWT_CTX_LOCK(ctx); + hwt_cpu_insert(ctx, cpu); + HWT_CTX_UNLOCK(ctx); + } + + error = hwt_backend_init(ctx); + if (error) { + /* TODO: remove all allocated cpus. */ + hwt_ctx_free(ctx); + return (error); + } + + /* hwt_owner_insert_ctx? */ + mtx_lock(&ho->mtx); + LIST_INSERT_HEAD(&ho->hwts, ctx, next_hwts); + mtx_unlock(&ho->mtx); + + hwt_record_kernel_objects(ctx); + + return (0); +} + +static int +hwt_ioctl_alloc(struct thread *td, struct hwt_alloc *halloc) +{ + char backend_name[HWT_BACKEND_MAXNAMELEN]; + struct hwt_backend *backend; + struct hwt_owner *ho; + int error; + + if (halloc->bufsize > HWT_MAXBUFSIZE) + return (EINVAL); + if (halloc->bufsize % PAGE_SIZE) + return (EINVAL); + if (halloc->backend_name == NULL) + return (EINVAL); + + error = copyinstr(halloc->backend_name, (void *)backend_name, + HWT_BACKEND_MAXNAMELEN, NULL); + if (error) + return (error); + + backend = hwt_backend_lookup(backend_name); + if (backend == NULL) + return (ENODEV); + + /* First get the owner. */ + ho = hwt_ownerhash_lookup(td->td_proc); + if (ho == NULL) { + /* Create a new owner. */ + ho = hwt_owner_alloc(td->td_proc); + if (ho == NULL) + return (ENOMEM); + hwt_ownerhash_insert(ho); + } + + switch (halloc->mode) { + case HWT_MODE_THREAD: + error = hwt_ioctl_alloc_mode_thread(td, ho, backend, halloc); + break; + case HWT_MODE_CPU: + error = hwt_ioctl_alloc_mode_cpu(td, ho, backend, halloc); + break; + default: + error = ENXIO; + }; + + return (error); +} + +int +hwt_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) +{ + int error; + + switch (cmd) { + case HWT_IOC_ALLOC: + /* Allocate HWT context. */ + error = hwt_ioctl_alloc(td, (struct hwt_alloc *)addr); + return (error); + default: + return (ENXIO); + }; +} diff --git a/sys/dev/hwt/hwt_ioctl.h b/sys/dev/hwt/hwt_ioctl.h new file mode 100644 index 000000000000..ce4270dc0d44 --- /dev/null +++ b/sys/dev/hwt/hwt_ioctl.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_IOCTL_H +#define _DEV_HWT_HWT_IOCTL_H + +int hwt_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td); + +#endif /* !_DEV_HWT_HWT_IOCTL_H */ diff --git a/sys/dev/hwt/hwt_owner.c b/sys/dev/hwt/hwt_owner.c new file mode 100644 index 000000000000..3c82040578de --- /dev/null +++ b/sys/dev/hwt/hwt_owner.c @@ -0,0 +1,157 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
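Seen from userspace, the allocation path above is driven by an ioctl on the HWT device node. A hedged sketch follows (the device path and backend name are illustrative assumptions; the struct fields and the PAGE_SIZE/HWT_MAXBUFSIZE constraints follow the code above; assumes <sys/hwt.h>, <sys/ioctl.h>, <fcntl.h>, <err.h>):

	struct hwt_alloc al = { 0 };
	int fd, ident;

	fd = open("/dev/hwt", O_RDWR);		/* device node name assumed */
	al.mode = HWT_MODE_THREAD;
	al.pid = target_pid;			/* process to trace */
	al.bufsize = 16 * 1024 * 1024;		/* must be a multiple of PAGE_SIZE */
	al.backend_name = "example_backend";	/* hypothetical backend */
	al.ident = &ident;			/* kernel copies the ctx ident out */
	if (ioctl(fd, HWT_IOC_ALLOC, &al) == -1)
		err(1, "HWT_IOC_ALLOC");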
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_OWNER, "hwt_owner", "Hardware Trace");
+
+struct hwt_context *
+hwt_owner_lookup_ctx(struct hwt_owner *ho, pid_t pid)
+{
+	struct hwt_context *ctx;
+
+	mtx_lock(&ho->mtx);
+	LIST_FOREACH(ctx, &ho->hwts, next_hwts) {
+		if (ctx->pid == pid) {
+			mtx_unlock(&ho->mtx);
+			return (ctx);
+		}
+	}
+	mtx_unlock(&ho->mtx);
+
+	return (NULL);
+}
+
+#if 0
+struct hwt_context *
+hwt_owner_lookup_ctx_by_cpu(struct hwt_owner *ho, int cpu)
+{
+	struct hwt_context *ctx;
+
+	mtx_lock(&ho->mtx);
+	LIST_FOREACH(ctx, &ho->hwts, next_hwts) {
+		if (ctx->cpu == cpu) {
+			mtx_unlock(&ho->mtx);
+			return (ctx);
+		}
+	}
+	mtx_unlock(&ho->mtx);
+
+	return (NULL);
+}
+#endif
+
+struct hwt_owner *
+hwt_owner_alloc(struct proc *p)
+{
+	struct hwt_owner *ho;
+
+	ho = malloc(sizeof(struct hwt_owner), M_HWT_OWNER,
+	    M_WAITOK | M_ZERO);
+	ho->p = p;
+
+	LIST_INIT(&ho->hwts);
+	mtx_init(&ho->mtx, "hwts", NULL, MTX_DEF);
+
+	return (ho);
+}
+
+void
+hwt_owner_shutdown(struct hwt_owner *ho)
+{
+	struct hwt_context *ctx;
+
+	dprintf("%s: stopping hwt owner\n", __func__);
+
+	while (1) {
+		mtx_lock(&ho->mtx);
+		ctx = LIST_FIRST(&ho->hwts);
+		if (ctx)
+			LIST_REMOVE(ctx, next_hwts);
+		mtx_unlock(&ho->mtx);
+
+		if (ctx == NULL)
+			break;
+
+		if (ctx->mode == HWT_MODE_THREAD)
+			hwt_contexthash_remove(ctx);
+
+		/*
+		 * A hook could still be dealing with this ctx right here.
+		 */
+
+		HWT_CTX_LOCK(ctx);
+		ctx->state = 0;
+		HWT_CTX_UNLOCK(ctx);
+
+		/* Ensure all hook invocations have completed. */
+		while (refcount_load(&ctx->refcnt) > 0)
+			continue;
+
+		/*
+		 * Note that a thread could still be sleeping in msleep(9).
+		 */
+
+		hwt_backend_deinit(ctx);
+		hwt_record_free_all(ctx);
+		hwt_ctx_free(ctx);
+	}
+
+	hwt_ownerhash_remove(ho);
+	free(ho, M_HWT_OWNER);
+}
diff --git a/sys/dev/hwt/hwt_owner.h b/sys/dev/hwt/hwt_owner.h
new file mode 100644
index 000000000000..2ac569a55050
--- /dev/null
+++ b/sys/dev/hwt/hwt_owner.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_OWNER_H_ +#define _DEV_HWT_HWT_OWNER_H_ + +struct hwt_owner { + struct proc *p; + struct mtx mtx; /* Protects hwts. */ + LIST_HEAD(, hwt_context) hwts; /* Owned HWTs. */ + LIST_ENTRY(hwt_owner) next; /* Entry in hwt owner hash. */ +}; + + +struct hwt_context * hwt_owner_lookup_ctx(struct hwt_owner *ho, pid_t pid); +struct hwt_owner * hwt_owner_alloc(struct proc *p); +void hwt_owner_shutdown(struct hwt_owner *ho); +struct hwt_context * hwt_owner_lookup_ctx_by_cpu(struct hwt_owner *ho, int cpu); + +#endif /* !_DEV_HWT_HWT_OWNER_H_ */ diff --git a/sys/dev/hwt/hwt_ownerhash.c b/sys/dev/hwt/hwt_ownerhash.c new file mode 100644 index 000000000000..7c9e2232bac4 --- /dev/null +++ b/sys/dev/hwt/hwt_ownerhash.c @@ -0,0 +1,141 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/hwt.h> + +#include <dev/hwt/hwt_owner.h> +#include <dev/hwt/hwt_ownerhash.h> + +#define HWT_DEBUG +#undef HWT_DEBUG + +#ifdef HWT_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) 
+#endif + +#define HWT_OWNERHASH_SIZE 1024 + +static MALLOC_DEFINE(M_HWT_OWNERHASH, "hwt_ohash", "Hardware Trace"); + +/* + * Hash function. Discard the lower 2 bits of the pointer since + * these are always zero for our uses. The hash multiplier is + * round((2^LONG_BIT) * ((sqrt(5)-1)/2)). + */ + +#define _HWT_HM 11400714819323198486u /* hash multiplier */ +#define HWT_HASH_PTR(P, M) ((((unsigned long) (P) >> 2) * _HWT_HM) & (M)) + +static struct mtx hwt_ownerhash_mtx; +static u_long hwt_ownerhashmask; +static LIST_HEAD(hwt_ownerhash, hwt_owner) *hwt_ownerhash; + +struct hwt_owner * +hwt_ownerhash_lookup(struct proc *p) +{ + struct hwt_ownerhash *hoh; + struct hwt_owner *ho; + int hindex; + + hindex = HWT_HASH_PTR(p, hwt_ownerhashmask); + hoh = &hwt_ownerhash[hindex]; + + HWT_OWNERHASH_LOCK(); + LIST_FOREACH(ho, hoh, next) { + if (ho->p == p) { + HWT_OWNERHASH_UNLOCK(); + return (ho); + } + } + HWT_OWNERHASH_UNLOCK(); + + return (NULL); +} + +void +hwt_ownerhash_insert(struct hwt_owner *ho) +{ + struct hwt_ownerhash *hoh; + int hindex; + + hindex = HWT_HASH_PTR(ho->p, hwt_ownerhashmask); + hoh = &hwt_ownerhash[hindex]; + + HWT_OWNERHASH_LOCK(); + LIST_INSERT_HEAD(hoh, ho, next); + HWT_OWNERHASH_UNLOCK(); +} + +void +hwt_ownerhash_remove(struct hwt_owner *ho) +{ + + /* Destroy hwt owner. */ + HWT_OWNERHASH_LOCK(); + LIST_REMOVE(ho, next); + HWT_OWNERHASH_UNLOCK(); +} + +void +hwt_ownerhash_load(void) +{ + + hwt_ownerhash = hashinit(HWT_OWNERHASH_SIZE, M_HWT_OWNERHASH, + &hwt_ownerhashmask); + mtx_init(&hwt_ownerhash_mtx, "hwt-owner-hash", "hwt-owner", MTX_DEF); +} + +void +hwt_ownerhash_unload(void) +{ + struct hwt_ownerhash *hoh; + struct hwt_owner *ho, *tmp; + + HWT_OWNERHASH_LOCK(); + for (hoh = hwt_ownerhash; + hoh <= &hwt_ownerhash[hwt_ownerhashmask]; + hoh++) { + LIST_FOREACH_SAFE(ho, hoh, next, tmp) { + /* TODO: module is in use ? */ + } + } + HWT_OWNERHASH_UNLOCK(); + + mtx_destroy(&hwt_ownerhash_mtx); + hashdestroy(hwt_ownerhash, M_HWT_OWNERHASH, hwt_ownerhashmask); +} diff --git a/sys/dev/hwt/hwt_ownerhash.h b/sys/dev/hwt/hwt_ownerhash.h new file mode 100644 index 000000000000..4a7bc958d0f7 --- /dev/null +++ b/sys/dev/hwt/hwt_ownerhash.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
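A note on the hash above (shared with hwt_contexthash.c): with LONG_BIT == 64, round((2^64) * ((sqrt(5)-1)/2)) = 11400714819323198486 is the Fibonacci-hashing constant, i.e. 2^64 divided by the golden ratio, which spreads sequentially allocated pointers evenly across buckets. Masking with hwt_ownerhashmask works because hashinit(9) sizes the table to a power of two.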
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_OWNERHASH_H_ +#define _DEV_HWT_HWT_OWNERHASH_H_ + +struct hwt_owner * hwt_ownerhash_lookup(struct proc *p); +void hwt_ownerhash_insert(struct hwt_owner *ho); +void hwt_ownerhash_remove(struct hwt_owner *ho); + +void hwt_ownerhash_load(void); +void hwt_ownerhash_unload(void); + +#define HWT_OWNERHASH_LOCK() mtx_lock(&hwt_ownerhash_mtx) +#define HWT_OWNERHASH_UNLOCK() mtx_unlock(&hwt_ownerhash_mtx) + +#endif /* !_DEV_HWT_HWT_OWNERHASH_H_ */ diff --git a/sys/dev/hwt/hwt_record.c b/sys/dev/hwt/hwt_record.c new file mode 100644 index 000000000000..850ea6f8c5be --- /dev/null +++ b/sys/dev/hwt/hwt_record.c @@ -0,0 +1,302 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/hwt.h> +#include <sys/linker.h> +#include <sys/pmckern.h> /* linker_hwpmc_list_objects */ + +#include <vm/vm.h> +#include <vm/uma.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_record.h> + +#define HWT_RECORD_DEBUG +#undef HWT_RECORD_DEBUG + +#ifdef HWT_RECORD_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) 
+#endif + +static MALLOC_DEFINE(M_HWT_RECORD, "hwt_record", "Hardware Trace"); +static uma_zone_t record_zone = NULL; + +static struct hwt_record_entry * +hwt_record_clone(struct hwt_record_entry *ent, int flags) +{ + struct hwt_record_entry *entry; + + entry = uma_zalloc(record_zone, flags); + if (entry == NULL) + return (NULL); + memcpy(entry, ent, sizeof(struct hwt_record_entry)); + switch (ent->record_type) { + case HWT_RECORD_MMAP: + case HWT_RECORD_EXECUTABLE: + case HWT_RECORD_KERNEL: + entry->fullpath = strdup(ent->fullpath, M_HWT_RECORD); + break; + default: + break; + } + + return (entry); +} + +static void +hwt_record_to_user(struct hwt_record_entry *ent, + struct hwt_record_user_entry *usr) +{ + usr->record_type = ent->record_type; + switch (ent->record_type) { + case HWT_RECORD_MMAP: + case HWT_RECORD_EXECUTABLE: + case HWT_RECORD_KERNEL: + usr->addr = ent->addr; + usr->baseaddr = ent->baseaddr; + strncpy(usr->fullpath, ent->fullpath, MAXPATHLEN); + break; + case HWT_RECORD_BUFFER: + usr->buf_id = ent->buf_id; + usr->curpage = ent->curpage; + usr->offset = ent->offset; + break; + case HWT_RECORD_THREAD_CREATE: + case HWT_RECORD_THREAD_SET_NAME: + usr->thread_id = ent->thread_id; + break; + default: + break; + } +} + +void +hwt_record_load(void) +{ + record_zone = uma_zcreate("HWT records", + sizeof(struct hwt_record_entry), NULL, NULL, NULL, NULL, 0, 0); +} + +void +hwt_record_unload(void) +{ + uma_zdestroy(record_zone); +} + +void +hwt_record_ctx(struct hwt_context *ctx, struct hwt_record_entry *ent, int flags) +{ + struct hwt_record_entry *entry; + + KASSERT(ent != NULL, ("ent is NULL")); + entry = hwt_record_clone(ent, flags); + if (entry == NULL) { + /* XXX: Not sure what to do here other than logging an error. */ + return; + } + + HWT_CTX_LOCK(ctx); + TAILQ_INSERT_TAIL(&ctx->records, entry, next); + HWT_CTX_UNLOCK(ctx); + hwt_record_wakeup(ctx); +} + +void +hwt_record_td(struct thread *td, struct hwt_record_entry *ent, int flags) +{ + struct hwt_record_entry *entry; + struct hwt_context *ctx; + struct proc *p; + + p = td->td_proc; + + KASSERT(ent != NULL, ("ent is NULL")); + entry = hwt_record_clone(ent, flags); + if (entry == NULL) { + /* XXX: Not sure what to do here other than logging an error. */ + return; + } + ctx = hwt_contexthash_lookup(p); + if (ctx == NULL) { + hwt_record_entry_free(entry); + return; + } + HWT_CTX_LOCK(ctx); + TAILQ_INSERT_TAIL(&ctx->records, entry, next); + HWT_CTX_UNLOCK(ctx); + hwt_record_wakeup(ctx); + + hwt_ctx_put(ctx); +} + +struct hwt_record_entry * +hwt_record_entry_alloc(void) +{ + return (uma_zalloc(record_zone, M_WAITOK | M_ZERO)); +} + +void +hwt_record_entry_free(struct hwt_record_entry *entry) +{ + + switch (entry->record_type) { + case HWT_RECORD_MMAP: + case HWT_RECORD_EXECUTABLE: + case HWT_RECORD_KERNEL: + free(entry->fullpath, M_HWT_RECORD); + break; + default: + break; + } + + uma_zfree(record_zone, entry); +} + +static int +hwt_record_grab(struct hwt_context *ctx, + struct hwt_record_user_entry *user_entry, int nitems_req, int wait) +{ + struct hwt_record_entry *entry; + int i; + + if (wait) { + mtx_lock(&ctx->rec_mtx); + if (TAILQ_FIRST(&ctx->records) == NULL) { + /* Wait until we have new records. 
*/ + msleep(ctx, &ctx->rec_mtx, PCATCH, "recsnd", 0); + } + mtx_unlock(&ctx->rec_mtx); + } + + for (i = 0; i < nitems_req; i++) { + HWT_CTX_LOCK(ctx); + entry = TAILQ_FIRST(&ctx->records); + if (entry) + TAILQ_REMOVE_HEAD(&ctx->records, next); + HWT_CTX_UNLOCK(ctx); + + if (entry == NULL) + break; + hwt_record_to_user(entry, &user_entry[i]); + hwt_record_entry_free(entry); + } + + return (i); +} + +void +hwt_record_free_all(struct hwt_context *ctx) +{ + struct hwt_record_entry *entry; + + while (1) { + HWT_CTX_LOCK(ctx); + entry = TAILQ_FIRST(&ctx->records); + if (entry) + TAILQ_REMOVE_HEAD(&ctx->records, next); + HWT_CTX_UNLOCK(ctx); + + if (entry == NULL) + break; + + hwt_record_entry_free(entry); + } +} + +int +hwt_record_send(struct hwt_context *ctx, struct hwt_record_get *record_get) +{ + struct hwt_record_user_entry *user_entry; + int nitems_req; + int error; + int i; + + nitems_req = 0; + + error = copyin(record_get->nentries, &nitems_req, sizeof(int)); + if (error) + return (error); + + if (nitems_req < 1 || nitems_req > 1024) + return (ENXIO); + + user_entry = malloc(sizeof(struct hwt_record_user_entry) * nitems_req, + M_HWT_RECORD, M_WAITOK | M_ZERO); + + i = hwt_record_grab(ctx, user_entry, nitems_req, record_get->wait); + if (i > 0) + error = copyout(user_entry, record_get->records, + sizeof(struct hwt_record_user_entry) * i); + + if (error == 0) + error = copyout(&i, record_get->nentries, sizeof(int)); + + free(user_entry, M_HWT_RECORD); + + return (error); +} + +void +hwt_record_kernel_objects(struct hwt_context *ctx) +{ + struct hwt_record_entry *entry; + struct pmckern_map_in *kobase; + int i; + + kobase = linker_hwpmc_list_objects(); + for (i = 0; kobase[i].pm_file != NULL; i++) { + entry = hwt_record_entry_alloc(); + entry->record_type = HWT_RECORD_KERNEL; + entry->fullpath = strdup(kobase[i].pm_file, M_HWT_RECORD); + entry->addr = kobase[i].pm_address; + + HWT_CTX_LOCK(ctx); + TAILQ_INSERT_HEAD(&ctx->records, entry, next); + HWT_CTX_UNLOCK(ctx); + } + free(kobase, M_LINKER); +} + +void +hwt_record_wakeup(struct hwt_context *ctx) +{ + wakeup(ctx); +} diff --git a/sys/dev/hwt/hwt_record.h b/sys/dev/hwt/hwt_record.h new file mode 100644 index 000000000000..3f347ca67d54 --- /dev/null +++ b/sys/dev/hwt/hwt_record.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
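On the producer side of the queue above, a backend or hook hands records in with hwt_record_ctx()/hwt_record_td(), which clone the entry before queueing it. A sketch of emitting a buffer-progress record (field values are illustrative; with M_NOWAIT the record is silently dropped if the zone allocation fails):

	struct hwt_record_entry ent = { 0 };

	ent.record_type = HWT_RECORD_BUFFER;
	ent.buf_id = 0;
	ent.curpage = curpage;
	ent.offset = offset;
	hwt_record_ctx(ctx, &ent, M_NOWAIT);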
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_RECORD_H_ +#define _DEV_HWT_HWT_RECORD_H_ + +struct hwt_record_get; + +void hwt_record_load(void); +void hwt_record_unload(void); + +int hwt_record_send(struct hwt_context *ctx, struct hwt_record_get *record_get); +void hwt_record_td(struct thread *td, struct hwt_record_entry *ent, int flags); +void hwt_record_ctx(struct hwt_context *ctx, struct hwt_record_entry *ent, + int flags); +struct hwt_record_entry * hwt_record_entry_alloc(void); +void hwt_record_entry_free(struct hwt_record_entry *entry); +void hwt_record_kernel_objects(struct hwt_context *ctx); +void hwt_record_free_all(struct hwt_context *ctx); +void hwt_record_wakeup(struct hwt_context *ctx); + +#endif /* !_DEV_HWT_HWT_RECORD_H_ */ diff --git a/sys/dev/hwt/hwt_thread.c b/sys/dev/hwt/hwt_thread.c new file mode 100644 index 000000000000..827c068a681f --- /dev/null +++ b/sys/dev/hwt/hwt_thread.c @@ -0,0 +1,162 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
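hwt_thread objects are born with one reference (refcount_init(&thr->refcnt, 1) below), and hwt_hook_mmap() earlier shows the pairing used to keep a thread alive across a sleep. The general pattern, as a sketch:

	refcount_acquire(&thr->refcnt);		/* pin thr before blocking */
	/* ... msleep(9) or other work that may outlive the ctx ... */
	if (refcount_release(&thr->refcnt))
		hwt_thread_free(thr);		/* we dropped the last reference */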
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_THREAD_DEBUG
+#undef HWT_THREAD_DEBUG
+
+#ifdef HWT_THREAD_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_THREAD, "hwt_thread", "Hardware Trace");
+
+struct hwt_thread *
+hwt_thread_first(struct hwt_context *ctx)
+{
+	struct hwt_thread *thr;
+
+	HWT_CTX_ASSERT_LOCKED(ctx);
+
+	thr = TAILQ_FIRST(&ctx->threads);
+
+	KASSERT(thr != NULL, ("thr is NULL"));
+
+	return (thr);
+}
+
+/*
+ * For use by hwt_switch_in/out() only.
+ */
+struct hwt_thread *
+hwt_thread_lookup(struct hwt_context *ctx, struct thread *td)
+{
+	struct hwt_thread *thr;
+
+	/* Caller of this func holds ctx refcnt right here. */
+
+	HWT_CTX_LOCK(ctx);
+	TAILQ_FOREACH(thr, &ctx->threads, next) {
+		if (thr->td == td) {
+			HWT_CTX_UNLOCK(ctx);
+			return (thr);
+		}
+	}
+	HWT_CTX_UNLOCK(ctx);
+
+	/*
+	 * We are here because the hook on thread creation failed to allocate
+	 * a thread.
+	 */
+
+	return (NULL);
+}
+
+int
+hwt_thread_alloc(struct hwt_thread **thr0, char *path, size_t bufsize,
+    int kva_req)
+{
+	struct hwt_thread *thr;
+	struct hwt_vm *vm;
+	int error;
+
+	error = hwt_vm_alloc(bufsize, kva_req, path, &vm);
+	if (error)
+		return (error);
+
+	thr = malloc(sizeof(struct hwt_thread), M_HWT_THREAD,
+	    M_WAITOK | M_ZERO);
+	thr->vm = vm;
+
+	mtx_init(&thr->mtx, "thr", NULL, MTX_DEF);
+
+	refcount_init(&thr->refcnt, 1);
+
+	vm->thr = thr;
+
+	*thr0 = thr;
+
+	return (0);
+}
+
+void
+hwt_thread_free(struct hwt_thread *thr)
+{
+
+	hwt_vm_free(thr->vm);
+	/* Free private backend data, if any. */
+	if (thr->private != NULL)
+		hwt_backend_thread_free(thr);
+	free(thr, M_HWT_THREAD);
+}
+
+/*
+ * Inserts a new thread and a thread creation record into the context,
+ * and notifies userspace about the newly created thread.
+ */
+void
+hwt_thread_insert(struct hwt_context *ctx, struct hwt_thread *thr,
+    struct hwt_record_entry *entry)
+{
+
+	HWT_CTX_ASSERT_LOCKED(ctx);
+	TAILQ_INSERT_TAIL(&ctx->threads, thr, next);
+	TAILQ_INSERT_TAIL(&ctx->records, entry, next);
+}
diff --git a/sys/dev/hwt/hwt_thread.h b/sys/dev/hwt/hwt_thread.h
new file mode 100644
index 000000000000..ccc29aeb3494
--- /dev/null
+++ b/sys/dev/hwt/hwt_thread.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_THREAD_H_ +#define _DEV_HWT_HWT_THREAD_H_ + +struct hwt_record_entry; + +struct hwt_thread { + struct hwt_vm *vm; + struct hwt_context *ctx; + struct hwt_backend *backend; + struct thread *td; + TAILQ_ENTRY(hwt_thread) next; + int thread_id; + int state; +#define HWT_THREAD_STATE_EXITED (1 << 0) + struct mtx mtx; + u_int refcnt; + int cpu_id; /* last cpu_id */ + void *private; /* backend-specific private data */ +}; + +/* Thread allocation. */ +int hwt_thread_alloc(struct hwt_thread **thr0, char *path, size_t bufsize, + int kva_req); +void hwt_thread_free(struct hwt_thread *thr); + +/* Thread list mgt. */ +void hwt_thread_insert(struct hwt_context *ctx, struct hwt_thread *thr, struct hwt_record_entry *entry); +struct hwt_thread * hwt_thread_first(struct hwt_context *ctx); +struct hwt_thread * hwt_thread_lookup(struct hwt_context *ctx, + struct thread *td); + +#define HWT_THR_LOCK(thr) mtx_lock(&(thr)->mtx) +#define HWT_THR_UNLOCK(thr) mtx_unlock(&(thr)->mtx) +#define HWT_THR_ASSERT_LOCKED(thr) mtx_assert(&(thr)->mtx, MA_OWNED) + +#endif /* !_DEV_HWT_HWT_THREAD_H_ */ diff --git a/sys/dev/hwt/hwt_vm.c b/sys/dev/hwt/hwt_vm.c new file mode 100644 index 000000000000..6c55e218dcec --- /dev/null +++ b/sys/dev/hwt/hwt_vm.c @@ -0,0 +1,503 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/ioccom.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/hwt.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_pageout.h> +#include <vm/vm_phys.h> + +#include <dev/hwt/hwt_hook.h> +#include <dev/hwt/hwt_context.h> +#include <dev/hwt/hwt_contexthash.h> +#include <dev/hwt/hwt_config.h> +#include <dev/hwt/hwt_cpu.h> +#include <dev/hwt/hwt_owner.h> +#include <dev/hwt/hwt_ownerhash.h> +#include <dev/hwt/hwt_thread.h> +#include <dev/hwt/hwt_backend.h> +#include <dev/hwt/hwt_vm.h> +#include <dev/hwt/hwt_record.h> + +#define HWT_THREAD_DEBUG +#undef HWT_THREAD_DEBUG + +#ifdef HWT_THREAD_DEBUG +#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__) +#else +#define dprintf(fmt, ...) +#endif + +static MALLOC_DEFINE(M_HWT_VM, "hwt_vm", "Hardware Trace"); + +static int +hwt_vm_fault(vm_object_t vm_obj, vm_ooffset_t offset, + int prot, vm_page_t *mres) +{ + + return (0); +} + +static int +hwt_vm_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + + *color = 0; + + return (0); +} + +static void +hwt_vm_dtor(void *handle) +{ + +} + +static struct cdev_pager_ops hwt_vm_pager_ops = { + .cdev_pg_fault = hwt_vm_fault, + .cdev_pg_ctor = hwt_vm_ctor, + .cdev_pg_dtor = hwt_vm_dtor +}; + +static int +hwt_vm_alloc_pages(struct hwt_vm *vm, int kva_req) +{ + vm_paddr_t low, high, boundary; + vm_memattr_t memattr; +#ifdef __aarch64__ + uintptr_t va; +#endif + int alignment; + vm_page_t m; + int pflags; + int tries; + int i; + + alignment = PAGE_SIZE; + low = 0; + high = -1UL; + boundary = 0; + pflags = VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO; + memattr = VM_MEMATTR_DEVICE; + + if (kva_req) { + vm->kvaddr = kva_alloc(vm->npages * PAGE_SIZE); + if (!vm->kvaddr) + return (ENOMEM); + } + + vm->obj = cdev_pager_allocate(vm, OBJT_MGTDEVICE, + &hwt_vm_pager_ops, vm->npages * PAGE_SIZE, PROT_READ, 0, + curthread->td_ucred); + + for (i = 0; i < vm->npages; i++) { + tries = 0; +retry: + m = vm_page_alloc_noobj_contig(pflags, 1, low, high, + alignment, boundary, memattr); + if (m == NULL) { + if (tries < 3) { + if (!vm_page_reclaim_contig(pflags, 1, low, + high, alignment, boundary)) + vm_wait(NULL); + tries++; + goto retry; + } + + return (ENOMEM); + } + +#if 0 + /* TODO: could not clean device memory on arm64. 
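+	 * (An assumption as to why: these pages are mapped +	 * VM_MEMATTR_DEVICE, while pmap_zero_page() would write through +	 * the cacheable direct map, so the zeroes might never become +	 * visible through the device-attribute mapping; the explicit +	 * cpu_dcache_wb_range() just below points the same way.)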
*/ + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); +#endif + +#ifdef __aarch64__ + va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + cpu_dcache_wb_range((void *)va, PAGE_SIZE); +#endif + + m->valid = VM_PAGE_BITS_ALL; + m->oflags &= ~VPO_UNMANAGED; + m->flags |= PG_FICTITIOUS; + vm->pages[i] = m; + + VM_OBJECT_WLOCK(vm->obj); + vm_page_insert(m, vm->obj, i); + if (kva_req) + pmap_qenter(vm->kvaddr + i * PAGE_SIZE, &m, 1); + VM_OBJECT_WUNLOCK(vm->obj); + } + + return (0); +} + +static int +hwt_vm_open(struct cdev *cdev, int oflags, int devtype, struct thread *td) +{ + + dprintf("%s\n", __func__); + + return (0); +} + +static int +hwt_vm_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, + vm_size_t mapsize, struct vm_object **objp, int nprot) +{ + struct hwt_vm *vm; + + vm = cdev->si_drv1; + + if (nprot != PROT_READ || *offset != 0) + return (ENXIO); + + vm_object_reference(vm->obj); + *objp = vm->obj; + + return (0); +} + +static void +hwt_vm_start_cpu_mode(struct hwt_context *ctx) +{ + cpuset_t enable_cpus; + int cpu_id; + + CPU_ZERO(&enable_cpus); + + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { +#ifdef SMP + /* Ensure CPU is not halted. */ + if (CPU_ISSET(cpu_id, &hlt_cpus_mask)) + continue; +#endif + + hwt_backend_configure(ctx, cpu_id, cpu_id); + + CPU_SET(cpu_id, &enable_cpus); + } + + if (ctx->hwt_backend->ops->hwt_backend_enable_smp == NULL) { + CPU_FOREACH_ISSET(cpu_id, &enable_cpus) + hwt_backend_enable(ctx, cpu_id); + } else { + /* Some backends require enabling all CPUs at once. */ + hwt_backend_enable_smp(ctx); + } +} + +static int +hwt_vm_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) +{ + struct hwt_record_get *rget; + struct hwt_set_config *sconf; + struct hwt_bufptr_get *ptr_get; + struct hwt_svc_buf *sbuf; + + struct hwt_context *ctx; + struct hwt_vm *vm; + struct hwt_owner *ho; + + vm_offset_t offset; + int ident; + int error; + uint64_t data = 0; + void *data2; + size_t data_size; + int data_version; + + vm = dev->si_drv1; + KASSERT(vm != NULL, ("si_drv1 is NULL")); + + ctx = vm->ctx; + + /* Ensure process is registered owner of this HWT. */ + ho = hwt_ownerhash_lookup(td->td_proc); + if (ho == NULL) + return (ENXIO); + + if (ctx->hwt_owner != ho) + return (EPERM); + + switch (cmd) { + case HWT_IOC_START: + dprintf("%s: start tracing\n", __func__); + + HWT_CTX_LOCK(ctx); + if (ctx->state == CTX_STATE_RUNNING) { + /* Already running ? */ + HWT_CTX_UNLOCK(ctx); + return (ENXIO); + } + ctx->state = CTX_STATE_RUNNING; + HWT_CTX_UNLOCK(ctx); + + if (ctx->mode == HWT_MODE_CPU) + hwt_vm_start_cpu_mode(ctx); + else { + /* + * Tracing backend will be configured and enabled + * during hook invocation. See hwt_hook.c. + */ + } + + break; + + case HWT_IOC_STOP: + if (ctx->state == CTX_STATE_STOPPED) + return (ENXIO); + hwt_backend_stop(ctx); + ctx->state = CTX_STATE_STOPPED; + break; + + case HWT_IOC_RECORD_GET: + rget = (struct hwt_record_get *)addr; + error = hwt_record_send(ctx, rget); + if (error) + return (error); + break; + + case HWT_IOC_SET_CONFIG: + if (ctx->state == CTX_STATE_RUNNING) { + return (ENXIO); + } + sconf = (struct hwt_set_config *)addr; + error = hwt_config_set(td, ctx, sconf); + if (error) + return (error); + ctx->pause_on_mmap = sconf->pause_on_mmap ? 
1 : 0; + break; + + case HWT_IOC_WAKEUP: + + if (ctx->mode == HWT_MODE_CPU) + return (ENXIO); + + KASSERT(vm->thr != NULL, ("thr is NULL")); + + wakeup(vm->thr); + + break; + + case HWT_IOC_BUFPTR_GET: + ptr_get = (struct hwt_bufptr_get *)addr; + + error = hwt_backend_read(ctx, vm, &ident, &offset, &data); + if (error) + return (error); + + if (ptr_get->ident) + error = copyout(&ident, ptr_get->ident, sizeof(int)); + if (error) + return (error); + + if (ptr_get->offset) + error = copyout(&offset, ptr_get->offset, + sizeof(vm_offset_t)); + if (error) + return (error); + + if (ptr_get->data) + error = copyout(&data, ptr_get->data, sizeof(uint64_t)); + if (error) + return (error); + + break; + + case HWT_IOC_SVC_BUF: + if (ctx->state == CTX_STATE_STOPPED) { + return (ENXIO); + } + + sbuf = (struct hwt_svc_buf *)addr; + data_size = sbuf->data_size; + data_version = sbuf->data_version; + + if (data_size == 0 || data_size > PAGE_SIZE) + return (EINVAL); + + data2 = malloc(data_size, M_HWT_VM, M_WAITOK | M_ZERO); + error = copyin(sbuf->data, data2, data_size); + if (error) { + free(data2, M_HWT_VM); + return (error); + } + + error = hwt_backend_svc_buf(ctx, data2, data_size, data_version); + if (error) { + free(data2, M_HWT_VM); + return (error); + } + + free(data2, M_HWT_VM); + break; + + default: + break; + } + + return (0); +} + +static struct cdevsw hwt_vm_cdevsw = { + .d_version = D_VERSION, + .d_name = "hwt", + .d_open = hwt_vm_open, + .d_mmap_single = hwt_vm_mmap_single, + .d_ioctl = hwt_vm_ioctl, +}; + +static int +hwt_vm_create_cdev(struct hwt_vm *vm, char *path) +{ + struct make_dev_args args; + int error; + + dprintf("%s: path %s\n", __func__, path); + + make_dev_args_init(&args); + args.mda_devsw = &hwt_vm_cdevsw; + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_WHEEL; + args.mda_mode = 0660; + args.mda_si_drv1 = vm; + + error = make_dev_s(&args, &vm->cdev, "%s", path); + if (error != 0) + return (error); + + return (0); +} + +static int +hwt_vm_alloc_buffers(struct hwt_vm *vm, int kva_req) +{ + int error; + + vm->pages = malloc(sizeof(struct vm_page *) * vm->npages, + M_HWT_VM, M_WAITOK | M_ZERO); + + error = hwt_vm_alloc_pages(vm, kva_req); + if (error) { + printf("%s: could not alloc pages\n", __func__); + return (error); + } + + return (0); +} + +static void +hwt_vm_destroy_buffers(struct hwt_vm *vm) +{ + vm_page_t m; + int i; + + if (vm->ctx->hwt_backend->kva_req && vm->kvaddr != 0) { + pmap_qremove(vm->kvaddr, vm->npages); + kva_free(vm->kvaddr, vm->npages * PAGE_SIZE); + } + VM_OBJECT_WLOCK(vm->obj); + for (i = 0; i < vm->npages; i++) { + m = vm->pages[i]; + if (m == NULL) + break; + + vm_page_busy_acquire(m, 0); + cdev_pager_free_page(vm->obj, m); + m->flags &= ~PG_FICTITIOUS; + vm_page_unwire_noq(m); + vm_page_free(m); + + } + vm_pager_deallocate(vm->obj); + VM_OBJECT_WUNLOCK(vm->obj); + + free(vm->pages, M_HWT_VM); +} + +void +hwt_vm_free(struct hwt_vm *vm) +{ + + dprintf("%s\n", __func__); + + if (vm->cdev) + destroy_dev_sched(vm->cdev); + hwt_vm_destroy_buffers(vm); + free(vm, M_HWT_VM); +} + +int +hwt_vm_alloc(size_t bufsize, int kva_req, char *path, struct hwt_vm **vm0) +{ + struct hwt_vm *vm; + int error; + + vm = malloc(sizeof(struct hwt_vm), M_HWT_VM, M_WAITOK | M_ZERO); + vm->npages = bufsize / PAGE_SIZE; + + error = hwt_vm_alloc_buffers(vm, kva_req); + if (error) { + free(vm, M_HWT_VM); + return (error); + } + + error = hwt_vm_create_cdev(vm, path); + if (error) { + hwt_vm_free(vm); + return (error); + } + + *vm0 
= vm; + + return (0); +} diff --git a/sys/dev/hwt/hwt_vm.h b/sys/dev/hwt/hwt_vm.h new file mode 100644 index 000000000000..5002bd43e093 --- /dev/null +++ b/sys/dev/hwt/hwt_vm.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEV_HWT_HWT_VM_H_ +#define _DEV_HWT_HWT_VM_H_ + +struct hwt_vm { + vm_page_t *pages; + int npages; + vm_object_t obj; + vm_offset_t kvaddr; + struct cdev *cdev; + + struct hwt_context *ctx; + struct hwt_cpu *cpu; /* cpu mode only. */ + struct hwt_thread *thr; /* thr mode only. */ +}; + +int hwt_vm_alloc(size_t bufsize, int kva_req, char *path, struct hwt_vm **vm0); +void hwt_vm_free(struct hwt_vm *vm); + +#endif /* !_DEV_HWT_HWT_VM_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c index 189a3e66a039..7ea60a499c72 100644 --- a/sys/dev/hyperv/vmbus/vmbus_chan.c +++ b/sys/dev/hyperv/vmbus/vmbus_chan.c @@ -1555,7 +1555,7 @@ vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, continue; flags = atomic_swap_long(&event_flags[f], 0); - chid_base = f << VMBUS_EVTFLAG_SHIFT; + chid_base = f * VMBUS_EVTFLAG_LEN; while ((chid_ofs = ffsl(flags)) != 0) { struct vmbus_channel *chan; @@ -1599,7 +1599,7 @@ vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu) eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) { vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags, - VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT); + VMBUS_CHAN_MAX_COMPAT / VMBUS_EVTFLAG_LEN); } } @@ -1903,7 +1903,7 @@ vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, * Setup event flag. 
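 * As a worked check of the shift-to-division conversion (assuming * LP64, where sizeof(u_long) == 8 and thus VMBUS_EVTFLAG_LEN == 64): * channel id 130 selects word 130 / 64 == 2 and bit 130 & 63 == 2, * exactly what the old 130 >> VMBUS_EVTFLAG_SHIFT (shift of 6) * produced; the division form simply derives the width from u_long * rather than hard-coding it per ABI.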
*/ chan->ch_evtflag = - &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT]; + &sc->vmbus_tx_evtflags[chan->ch_id / VMBUS_EVTFLAG_LEN]; chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK); /* diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h index 4aa729475b5d..76cdca0ebeb2 100644 --- a/sys/dev/hyperv/vmbus/vmbus_reg.h +++ b/sys/dev/hyperv/vmbus/vmbus_reg.h @@ -60,16 +60,10 @@ CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE); * Hyper-V SynIC event flags */ -#ifdef __LP64__ -#define VMBUS_EVTFLAGS_MAX 32 -#define VMBUS_EVTFLAG_SHIFT 6 -#else -#define VMBUS_EVTFLAGS_MAX 64 -#define VMBUS_EVTFLAG_SHIFT 5 -#endif -#define VMBUS_EVTFLAG_LEN (1 << VMBUS_EVTFLAG_SHIFT) +#define VMBUS_EVTFLAG_LEN (sizeof(u_long) * 8) #define VMBUS_EVTFLAG_MASK (VMBUS_EVTFLAG_LEN - 1) #define VMBUS_EVTFLAGS_SIZE 256 +#define VMBUS_EVTFLAGS_MAX (VMBUS_EVTFLAGS_SIZE / sizeof(u_long)) struct vmbus_evtflags { u_long evt_flags[VMBUS_EVTFLAGS_MAX]; diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h index 821abe4806ca..5b23757b1c98 100644 --- a/sys/dev/ice/ice_features.h +++ b/sys/dev/ice/ice_features.h @@ -91,7 +91,9 @@ enum feat_list { static inline void ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap) { +#ifndef PCI_IOV ice_clear_bit(ICE_FEATURE_SRIOV, bitmap); +#endif #ifndef DEV_NETMAP ice_clear_bit(ICE_FEATURE_NETMAP, bitmap); #endif diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h index 3a5dc201189a..e1d5307a9516 100644 --- a/sys/dev/ice/ice_iflib.h +++ b/sys/dev/ice/ice_iflib.h @@ -139,6 +139,9 @@ struct ice_irq_vector { * @tc: traffic class queue belongs to * @q_handle: qidx in tc; used in TXQ enable functions * + * ice_iov.c requires the following parameters (when PCI_IOV is defined): + * @itr_idx: ITR index to use for this queue + * * Other parameters may be iflib driver specific */ struct ice_tx_queue { @@ -153,6 +156,9 @@ struct ice_tx_queue { u32 me; u16 q_handle; u8 tc; +#ifdef PCI_IOV + u8 itr_idx; +#endif /* descriptor writeback status */ qidx_t *tx_rsq; @@ -175,6 +181,9 @@ struct ice_tx_queue { * @stats: queue statistics * @tc: traffic class queue belongs to * + * ice_iov.c requires the following parameters (when PCI_IOV is defined): + * @itr_idx: ITR index to use for this queue + * * Other parameters may be iflib driver specific */ struct ice_rx_queue { @@ -187,6 +196,9 @@ struct ice_rx_queue { struct ice_irq_vector *irqv; u32 me; u8 tc; +#ifdef PCI_IOV + u8 itr_idx; +#endif struct if_irq que_irq; }; @@ -332,6 +344,10 @@ struct ice_softc { ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT); ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT); +#ifdef PCI_IOV + struct ice_vf *vfs; + u16 num_vfs; +#endif struct ice_resmgr os_imgr; /* For mirror interface */ struct ice_mirr_if *mirr_if; diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c new file mode 100644 index 000000000000..c5a3e1060e44 --- /dev/null +++ b/sys/dev/ice/ice_iov.c @@ -0,0 +1,1856 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file ice_iov.c + * @brief Virtualization support functions + * + * Contains functions for enabling and managing PCIe virtual function devices, + * including enabling new VFs, and managing VFs over the virtchnl interface. + */ + +#include "ice_iov.h" + +static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num); +static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf); +static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, + bool trigger_vflr); +static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf); + +static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static bool ice_vc_isvalid_ring_len(u16 ring_len); +static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf); +static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats, + struct virtchnl_eth_stats *vstats); +static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, + u8 *msg_buf); +static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err); +static int 
ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr); + +/** + * ice_iov_attach - Initialize SR-IOV PF host support + * @sc: device softc structure + * + * Initialize SR-IOV PF host support at the end of the driver attach process. + * + * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK) + * + * @returns 0 if successful, or + * - ENOMEM if there is no memory for the PF/VF schemas or iov device + * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV + * version as the kernel + * - ENOENT if the device doesn't have the SR-IOV capability + */ +int +ice_iov_attach(struct ice_softc *sc) +{ + device_t dev = sc->dev; + nvlist_t *pf_schema, *vf_schema; + int error; + + pf_schema = pci_iov_schema_alloc_node(); + vf_schema = pci_iov_schema_alloc_node(); + + pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL); + pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof", + IOV_SCHEMA_HASDEFAULT, TRUE); + pci_iov_schema_add_bool(vf_schema, "allow-set-mac", + IOV_SCHEMA_HASDEFAULT, FALSE); + pci_iov_schema_add_bool(vf_schema, "allow-promisc", + IOV_SCHEMA_HASDEFAULT, FALSE); + pci_iov_schema_add_uint16(vf_schema, "num-queues", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES); + pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi", + IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI); + pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT); + pci_iov_schema_add_uint16(vf_schema, "max-mac-filters", + IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT); + + error = pci_iov_attach(dev, pf_schema, vf_schema); + if (error != 0) { + device_printf(dev, + "pci_iov_attach failed (error=%s)\n", + ice_err_str(error)); + ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); + } else + ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en); + + return (error); +} + +/** + * ice_iov_detach - Teardown SR-IOV PF host support + * @sc: device softc structure + * + * Teardown SR-IOV PF host support at the start of the driver detach process. + * + * @returns 0 if successful or IOV support hasn't been setup, or + * - EBUSY if VFs still exist + */ +int +ice_iov_detach(struct ice_softc *sc) +{ + device_t dev = sc->dev; + int error; + + error = pci_iov_detach(dev); + if (error != 0) { + device_printf(dev, + "pci_iov_detach failed (error=%s)\n", + ice_err_str(error)); + } + + return (error); +} + +/** + * ice_iov_init - Called by the OS before the first VF is created. 
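+ * + * VF creation is normally driven from userspace through iovctl(8). A + * minimal iovctl.conf(5) sketch that would reach this path might look + * like the following (the device name and values are illustrative + * only, not a tested configuration): + * + *	PF { + *		device = "ice0"; + *		num_vfs = 2; + *	} + *	VF-0 { + *		num-queues = 4; + *		allow-set-mac = true; + *	} + *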
+ * @sc: device softc structure + * @num_vfs: number of VFs to setup resources for + * @params: configuration parameters for the PF + * + * @returns 0 if successful or an error code on failure + */ +int +ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params __unused) +{ + /* Allocate array of VFs, for tracking */ + sc->vfs = (struct ice_vf *)malloc(sizeof(struct ice_vf) * num_vfs, M_ICE, M_NOWAIT | + M_ZERO); + if (sc->vfs == NULL) + return (ENOMEM); + + /* Initialize each VF with basic information */ + for (int i = 0; i < num_vfs; i++) + sc->vfs[i].vf_num = i; + + /* Save off number of configured VFs */ + sc->num_vfs = num_vfs; + + return (0); +} + +/** + * ice_iov_get_vf - Get pointer to VF at given index + * @sc: device softc structure + * @vf_num: Index of VF to retrieve + * + * @remark will throw an assertion if vf_num is not in the + * range of allocated VFs + * + * @returns a pointer to the VF structure at the given index + */ +static struct ice_vf * +ice_iov_get_vf(struct ice_softc *sc, int vf_num) +{ + MPASS(vf_num < sc->num_vfs); + + return &sc->vfs[vf_num]; +} + +/** + * ice_iov_add_vf - Called by the OS for each VF to create + * @sc: device softc structure + * @vfnum: index of VF to configure + * @params: configuration parameters for the VF + * + * @returns 0 if successful or an error code on failure + */ +int +ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params) +{ + struct ice_tx_queue *txq; + struct ice_rx_queue *rxq; + device_t dev = sc->dev; + struct ice_vsi *vsi; + struct ice_vf *vf; + int vf_num_queues; + const void *mac; + size_t size; + int error; + int i; + + vf = ice_iov_get_vf(sc, vfnum); + vf->vf_flags = VF_FLAG_ENABLED; + + /* This VF needs at least one VSI */ + vsi = ice_alloc_vsi(sc, ICE_VSI_VF); + if (vsi == NULL) + return (ENOMEM); + vf->vsi = vsi; + vsi->vf_num = vfnum; + + vf_num_queues = nvlist_get_number(params, "num-queues"); + /* Validate and clamp value if invalid */ + if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES) + device_printf(dev, "Invalid num-queues (%d) for VF %d\n", + vf_num_queues, vf->vf_num); + if (vf_num_queues < 1) { + device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num); + vf_num_queues = 1; + } else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) { + device_printf(dev, "Setting VF %d num-queues to %d\n", + vf->vf_num, ICE_MAX_SCATTERED_QUEUES); + vf_num_queues = ICE_MAX_SCATTERED_QUEUES; + } + vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED; + + /* Reserve VF queue allocation from PF queues */ + ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues); + vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues; + + /* Assign Tx queues from PF space */ + error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap, + vsi->num_tx_queues); + if (error) { + device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n", + ice_err_str(error)); + goto release_vsi; + } + + /* Assign Rx queues from PF space */ + error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap, + vsi->num_rx_queues); + if (error) { + device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n", + ice_err_str(error)); + goto release_vsi; + } + + vsi->max_frame_size = ICE_MAX_FRAME_SIZE; + + /* Allocate queue structure memory */ + vsi->tx_queues = (struct ice_tx_queue *) + malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE, + M_NOWAIT | M_ZERO); + if (!vsi->tx_queues) { + device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n", + vfnum); + error = ENOMEM; + goto release_vsi; 
+ } + for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) { + txq->me = i; + txq->vsi = vsi; + } + + /* Allocate queue structure memory */ + vsi->rx_queues = (struct ice_rx_queue *) + malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE, + M_NOWAIT | M_ZERO); + if (!vsi->rx_queues) { + device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n", + vfnum); + error = ENOMEM; + goto free_txqs; + } + for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) { + rxq->me = i; + rxq->vsi = vsi; + } + + /* Allocate space to store the IRQ vector data */ + vf->num_irq_vectors = vf_num_queues + 1; + vf->tx_irqvs = (struct ice_irq_vector *) + malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors), + M_ICE, M_NOWAIT); + if (!vf->tx_irqvs) { + device_printf(sc->dev, + "Unable to allocate TX irqv memory for VF-%d's %d vectors\n", + vfnum, vf->num_irq_vectors); + error = ENOMEM; + goto free_rxqs; + } + vf->rx_irqvs = (struct ice_irq_vector *) + malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors), + M_ICE, M_NOWAIT); + if (!vf->rx_irqvs) { + device_printf(sc->dev, + "Unable to allocate RX irqv memory for VF-%d's %d vectors\n", + vfnum, vf->num_irq_vectors); + error = ENOMEM; + goto free_txirqvs; + } + + /* Assign VF interrupts from PF space */ + if (!(vf->vf_imap = + (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors, + M_ICE, M_NOWAIT))) { + device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum); + error = ENOMEM; + goto free_rxirqvs; + } + error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors); + if (error) { + device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n", + vfnum, ice_err_str(error)); + goto free_imap; + } + + if (nvlist_exists_binary(params, "mac-addr")) { + mac = nvlist_get_binary(params, "mac-addr", &size); + memcpy(vf->mac, mac, ETHER_ADDR_LEN); + + if (nvlist_get_bool(params, "allow-set-mac")) + vf->vf_flags |= VF_FLAG_SET_MAC_CAP; + } else + /* + * If the administrator has not specified a MAC address then + * we must allow the VF to choose one. 
+ */ + vf->vf_flags |= VF_FLAG_SET_MAC_CAP; + + if (nvlist_get_bool(params, "mac-anti-spoof")) + vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF; + + if (nvlist_get_bool(params, "allow-promisc")) + vf->vf_flags |= VF_FLAG_PROMISC_CAP; + + vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi"); + + vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed"); + vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters"); + + vf->vf_flags |= VF_FLAG_VLAN_CAP; + + /* Create and setup VSI in HW */ + error = ice_initialize_vsi(vsi); + if (error) { + device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n", + vfnum, ice_err_str(error)); + goto release_imap; + } + + /* Add the broadcast address */ + error = ice_add_vsi_mac_filter(vsi, broadcastaddr); + if (error) { + device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n", + vfnum, ice_err_str(error)); + goto release_imap; + } + + ice_iov_ready_vf(sc, vf); + + return (0); + +release_imap: + ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap, + vf->num_irq_vectors); +free_imap: + free(vf->vf_imap, M_ICE); + vf->vf_imap = NULL; +free_rxirqvs: + free(vf->rx_irqvs, M_ICE); + vf->rx_irqvs = NULL; +free_txirqvs: + free(vf->tx_irqvs, M_ICE); + vf->tx_irqvs = NULL; +free_rxqs: + free(vsi->rx_queues, M_ICE); + vsi->rx_queues = NULL; +free_txqs: + free(vsi->tx_queues, M_ICE); + vsi->tx_queues = NULL; +release_vsi: + ice_release_vsi(vsi); + vf->vsi = NULL; + return (error); +} + +/** + * ice_iov_uninit - Called by the OS when VFs are destroyed + * @sc: device softc structure + */ +void +ice_iov_uninit(struct ice_softc *sc) +{ + struct ice_vf *vf; + struct ice_vsi *vsi; + + /* Release per-VF resources */ + for (int i = 0; i < sc->num_vfs; i++) { + vf = &sc->vfs[i]; + vsi = vf->vsi; + + /* Free VF interrupt reservation */ + if (vf->vf_imap) { + free(vf->vf_imap, M_ICE); + vf->vf_imap = NULL; + } + + /* Free queue interrupt mapping trackers */ + if (vf->tx_irqvs) { + free(vf->tx_irqvs, M_ICE); + vf->tx_irqvs = NULL; + } + if (vf->rx_irqvs) { + free(vf->rx_irqvs, M_ICE); + vf->rx_irqvs = NULL; + } + + if (!vsi) + continue; + + /* Free VSI queues */ + if (vsi->tx_queues) { + free(vsi->tx_queues, M_ICE); + vsi->tx_queues = NULL; + } + if (vsi->rx_queues) { + free(vsi->rx_queues, M_ICE); + vsi->rx_queues = NULL; + } + + ice_release_vsi(vsi); + vf->vsi = NULL; + } + + /* Release memory used for VF tracking */ + if (sc->vfs) { + free(sc->vfs, M_ICE); + sc->vfs = NULL; + } + sc->num_vfs = 0; +} + +/** + * ice_iov_handle_vflr - Process VFLR event + * @sc: device softc structure + * + * Identifies which VFs have been reset and re-configures + * them. + */ +void +ice_iov_handle_vflr(struct ice_softc *sc) +{ + struct ice_hw *hw = &sc->hw; + struct ice_vf *vf; + u32 reg, reg_idx, bit_idx; + + for (int i = 0; i < sc->num_vfs; i++) { + vf = &sc->vfs[i]; + + reg_idx = (hw->func_caps.vf_base_id + vf->vf_num) / 32; + bit_idx = (hw->func_caps.vf_base_id + vf->vf_num) % 32; + reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx)); + if (reg & BIT(bit_idx)) + ice_reset_vf(sc, vf, false); + } +} + +/** + * ice_iov_ready_vf - Setup VF interrupts and mark it as ready + * @sc: device softc structure + * @vf: driver's VF structure for the VF to update + * + * Clears VF reset triggering bit, sets up the PF<->VF interrupt + * mapping and marks the VF as active in the HW so that the VF + * driver can use it.
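+ * + * @remark The "ready" signal itself is the VIRTCHNL_VFR_VFACTIVE write + * to VFGEN_RSTAT below; guest drivers such as iavf(4) poll that + * register after requesting a reset before they resume virtchnl + * traffic (a general protocol observation, not something enforced in + * this file).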
+ */ +static void +ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf) +{ + struct ice_hw *hw = &sc->hw; + u32 reg; + + /* Clear the triggering bit */ + reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num)); + reg &= ~VPGEN_VFRTRIG_VFSWR_M; + wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg); + + /* Setup VF interrupt allocation and mapping */ + ice_iov_setup_intr_mapping(sc, vf); + + /* Indicate to the VF that reset is done */ + wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE); + + ice_flush(hw); +} + +/** + * ice_reset_vf - Perform a hardware reset (VFR) on a VF + * @sc: device softc structure + * @vf: driver's VF structure for VF to be reset + * @trigger_vflr: trigger a reset or only handle already executed reset + * + * Performs a VFR for the given VF. This function busy waits until the + * reset completes in the HW, notifies the VF that the reset is done + * by setting a bit in a HW register, then returns. + * + * @remark This also sets up the PF<->VF interrupt mapping and allocations in + * the hardware after the hardware reset is finished, via + * ice_iov_setup_intr_mapping() + */ +static void +ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr) +{ + u16 global_vf_num, reg_idx, bit_idx; + struct ice_hw *hw = &sc->hw; + int status; + u32 reg; + int i; + + global_vf_num = vf->vf_num + hw->func_caps.vf_base_id; + + if (trigger_vflr) { + reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num)); + reg |= VPGEN_VFRTRIG_VFSWR_M; + wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg); + } + + /* clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */ + reg_idx = (global_vf_num) / 32; + bit_idx = (global_vf_num) % 32; + wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); + ice_flush(hw); + + /* Wait until there are no pending PCI transactions */ + wr32(hw, PF_PCI_CIAA, + ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S)); + + for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) { + reg = rd32(hw, PF_PCI_CIAD); + if (!(reg & PCIEM_STA_TRANSACTION_PND)) + break; + + DELAY(ICE_PCI_CIAD_WAIT_DELAY_US); + } + if (i == ICE_PCI_CIAD_WAIT_COUNT) + device_printf(sc->dev, + "VF-%d PCI transactions stuck\n", vf->vf_num); + + /* Disable TX queues, which is required during VF reset */ + status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL, + NULL, ICE_VF_RESET, vf->vf_num, NULL); + if (status) + device_printf(sc->dev, + "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n", + __func__, ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + + /* Then check for the VF reset to finish in HW */ + for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) { + reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num)); + if ((reg & VPGEN_VFRSTAT_VFRD_M)) + break; + + DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US); + } + if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT) + device_printf(sc->dev, + "VF-%d Reset is stuck\n", vf->vf_num); + + ice_iov_ready_vf(sc, vf); +} + +/** + * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a message from the VF listing its supported capabilities, and + * replies to the VF with information about what resources the PF has + * allocated for the VF. + * + * @remark This always replies to the VF with a success status; it does not + * fail. It's up to the VF driver to reject or complain about the PF's response. 
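+ * + * @remark The exchange is in effect a capability intersection: the VF + * passes a u32 of requested VIRTCHNL_VF_* flags in msg_buf, and the + * reply advertises VF_BASE_MODE_OFFLOADS plus whichever of the + * optional flags (ADV_LINK_SPEED, WB_ON_ITR) the VF asked for.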
+ */ +static void +ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vf_resource *vf_res; + struct virtchnl_vsi_resource *vsi_res; + u16 vf_res_len; + u32 vf_caps; + + /* XXX: Only support one VSI per VF, so this size doesn't need adjusting */ + vf_res_len = sizeof(struct virtchnl_vf_resource); + vf_res = (struct virtchnl_vf_resource *)malloc(vf_res_len, M_ICE, + M_WAITOK | M_ZERO); + + vf_res->num_vsis = 1; + vf_res->num_queue_pairs = vf->vsi->num_tx_queues; + vf_res->max_vectors = vf_res->num_queue_pairs + 1; + + vf_res->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE; + vf_res->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vf_res->max_mtu = 0; + + vf_res->vf_cap_flags = VF_BASE_MODE_OFFLOADS; + if (msg_buf != NULL) { + vf_caps = *((u32 *)(msg_buf)); + + if (vf_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) + vf_res->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + + if (vf_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) + vf_res->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; + } + + vsi_res = &vf_res->vsi_res[0]; + vsi_res->vsi_id = vf->vsi->idx; + vsi_res->num_queue_pairs = vf->vsi->num_tx_queues; + vsi_res->vsi_type = VIRTCHNL_VSI_SRIOV; + vsi_res->qset_handle = 0; + if (!ETHER_IS_ZERO(vf->mac)) + memcpy(vsi_res->default_mac_addr, vf->mac, ETHER_ADDR_LEN); + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES, + VIRTCHNL_STATUS_SUCCESS, (u8 *)vf_res, vf_res_len, NULL); + + free(vf_res, M_ICE); +} + +/** + * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a version message from the VF, and responds to the VF with + * the version number that the PF will use. + * + * @remark This always replies to the VF with a success status; it does not + * fail. + */ +static void +ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct virtchnl_version_info *recv_vf_version; + struct ice_hw *hw = &sc->hw; + device_t dev = sc->dev; + + recv_vf_version = (struct virtchnl_version_info *)msg_buf; + + /* VFs running the 1.0 API expect to get 1.0 back */ + if (VF_IS_V10(recv_vf_version)) { + vf->version.major = 1; + vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS; + } else { + vf->version.major = VIRTCHNL_VERSION_MAJOR; + vf->version.minor = VIRTCHNL_VERSION_MINOR; + + if ((recv_vf_version->major != VIRTCHNL_VERSION_MAJOR) || + (recv_vf_version->minor != VIRTCHNL_VERSION_MINOR)) + device_printf(dev, + "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n", + __func__, vf->vf_num, + recv_vf_version->major, recv_vf_version->minor, + VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR); + } + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_VERSION, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version), + NULL); +} + +/** + * ice_vf_validate_mac - Validate MAC address before adding it + * @vf: VF tracking structure + * @addr: MAC address to validate + * + * Validate a MAC address before adding it to a VF during the handling + * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if + * the VF is allowed to set its own arbitrary MAC addresses. 
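+ * + * As a concrete illustration (addresses made up): a VF without + * VF_FLAG_SET_MAC_CAP whose assigned MAC is 02:00:00:00:00:01 may + * still install that address or a multicast address such as + * 01:00:5e:00:00:01, but an arbitrary unicast address is rejected + * with EPERM; all-zero and broadcast addresses fail with EINVAL for + * every VF.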
+ * + * Returns 0 if MAC address is valid for the given vf + */ +static int +ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr) +{ + + if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr)) + return (EINVAL); + + /* + * If the VF is not allowed to change its MAC address, don't let it + * set a MAC filter for an address that is not a multicast address and + * is not its assigned MAC. + */ + if (!(vf->vf_flags & VF_FLAG_SET_MAC_CAP) && + !(ETHER_IS_MULTICAST(addr) || !bcmp(addr, vf->mac, ETHER_ADDR_LEN))) + return (EPERM); + + return (0); +} + +/** + * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a list of MAC addresses from the VF and adds those addresses + * to the VSI's filter list. + */ +static void +ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *addr_list; + struct ice_hw *hw = &sc->hw; + u16 added_addr_cnt = 0; + int error = 0; + + addr_list = (struct virtchnl_ether_addr_list *)msg_buf; + + if (addr_list->num_elements > + (vf->mac_filter_limit - vf->mac_filter_cnt)) { + v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto done; + } + + for (int i = 0; i < addr_list->num_elements; i++) { + u8 *addr = addr_list->list[i].addr; + + /* The type flag is currently ignored; every MAC address is + * treated as the LEGACY type + */ + + error = ice_vf_validate_mac(vf, addr); + if (error == EPERM) { + device_printf(sc->dev, + "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } else if (error) { + device_printf(sc->dev, + "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + + error = ice_add_vsi_mac_filter(vf->vsi, addr); + if (error) { + device_printf(sc->dev, + "%s: VF-%d: Error adding MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + /* Don't count VF's MAC against its MAC filter limit */ + if (memcmp(addr, vf->mac, ETHER_ADDR_LEN)) + added_addr_cnt++; + } + + vf->mac_filter_cnt += added_addr_cnt; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF + * @sc: device private structure + * @vf: VF tracking structure + * @msg_buf: raw message buffer from the VF + * + * Receives a list of MAC addresses from the VF and removes those addresses + * from the VSI's filter list. 
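+ * + * @remark If the list would exceed the VF's remaining + * "max-mac-filters" budget, the whole request is refused with + * VIRTCHNL_STATUS_ERR_NO_MEMORY; the VF's own assigned MAC is never + * counted against that budget.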
+ */ +static void +ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *addr_list; + struct ice_hw *hw = &sc->hw; + u16 deleted_addr_cnt = 0; + int error = 0; + + addr_list = (struct virtchnl_ether_addr_list *)msg_buf; + + for (int i = 0; i < addr_list->num_elements; i++) { + error = ice_remove_vsi_mac_filter(vf->vsi, addr_list->list[i].addr); + if (error) { + device_printf(sc->dev, + "%s: VF-%d: Error removing MAC addr for VSI %d\n", + __func__, vf->vf_num, vf->vsi->idx); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + continue; + } + /* Don't count VF's MAC against its MAC filter limit */ + if (memcmp(addr_list->list[i].addr, vf->mac, ETHER_ADDR_LEN)) + deleted_addr_cnt++; + } + + if (deleted_addr_cnt >= vf->mac_filter_cnt) + vf->mac_filter_cnt = 0; + else + vf->mac_filter_cnt -= deleted_addr_cnt; + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the VLANs in msg_buf to the VF's VLAN filter list. + */ +static void +ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vlan_filter_list *vlan_list; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf; + + if (vlan_list->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vlan_list->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (vlan_list->num_elements > (vf->vlan_limit - vf->vlan_cnt)) { + v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto done; + } + + status = ice_add_vlan_hw_filters(vsi, vlan_list->vlan_id, + vlan_list->num_elements); + if (status) { + device_printf(sc->dev, + "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, ice_status_str(status), + ice_aq_str(sc->hw.adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + + vf->vlan_cnt += vlan_list->num_elements; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Removes the VLANs in msg_buf from the VF's VLAN filter list. 
+ */ +static void +ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_vlan_filter_list *vlan_list; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf; + + if (vlan_list->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vlan_list->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + status = ice_remove_vlan_hw_filters(vsi, vlan_list->vlan_id, + vlan_list->num_elements); + if (status) { + device_printf(sc->dev, + "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n", + vf->vf_num, vsi->idx, ice_status_str(status), + ice_aq_str(sc->hw.adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + + if (vlan_list->num_elements >= vf->vlan_cnt) + vf->vlan_cnt = 0; + else + vf->vlan_cnt -= vlan_list->num_elements; + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_isvalid_ring_len - Check to see if a descriptor ring length is valid + * @ring_len: length of ring + * + * Check whether a ring size value is valid. + * + * @returns true if given ring size is valid + */ +static bool +ice_vc_isvalid_ring_len(u16 ring_len) +{ + return (ring_len >= ICE_MIN_DESC_COUNT && + ring_len <= ICE_MAX_DESC_COUNT && + !(ring_len % ICE_DESC_COUNT_INCR)); +} + +/** + * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + */ +static void +ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + device_t dev = sc->dev; + struct ice_hw *hw = &sc->hw; + struct virtchnl_vsi_queue_config_info *vqci; + struct virtchnl_queue_pair_info *vqpi; + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + struct ice_tx_queue *txq; + struct ice_rx_queue *rxq; + int i, error = 0; + + vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf; + + if (vqci->num_queue_pairs > vf->vsi->num_tx_queues && + vqci->num_queue_pairs > vf->vsi->num_rx_queues) { + status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + ice_vsi_disable_tx(vf->vsi); + ice_control_all_rx_queues(vf->vsi, false); + + /* + * Clear the Tx and Rx queue configuration in case the VF + * requests a different number of queues.
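+	 * (Without this, a VF that previously configured, say, four +	 * queue pairs and now configures two would leave stale ring +	 * addresses and descriptor counts behind in the upper queue +	 * structures.)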
+ */ + for (i = 0; i < vsi->num_tx_queues; i++) { + txq = &vsi->tx_queues[i]; + + txq->desc_count = 0; + txq->tx_paddr = 0; + txq->tc = 0; + } + + for (i = 0; i < vsi->num_rx_queues; i++) { + rxq = &vsi->rx_queues[i]; + + rxq->desc_count = 0; + rxq->rx_paddr = 0; + } + + vqpi = vqci->qpair; + for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) { + /* Initial parameter validation */ + if (vqpi->txq.vsi_id != vf->vsi->idx || + vqpi->rxq.vsi_id != vf->vsi->idx || + vqpi->txq.queue_id != vqpi->rxq.queue_id || + vqpi->txq.headwb_enabled || + vqpi->rxq.splithdr_enabled || + vqpi->rxq.crc_disable || + !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) || + !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) { + status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Copy parameters into VF's queue/VSI structs */ + txq = &vsi->tx_queues[vqpi->txq.queue_id]; + + txq->desc_count = vqpi->txq.ring_len; + txq->tx_paddr = vqpi->txq.dma_ring_addr; + txq->q_handle = vqpi->txq.queue_id; + txq->tc = 0; + + rxq = &vsi->rx_queues[vqpi->rxq.queue_id]; + + rxq->desc_count = vqpi->rxq.ring_len; + rxq->rx_paddr = vqpi->rxq.dma_ring_addr; + vsi->mbuf_sz = vqpi->rxq.databuffer_size; + } + + /* Configure TX queues in HW */ + error = ice_cfg_vsi_for_tx(vsi); + if (error) { + device_printf(dev, + "VF-%d: Unable to configure VSI for Tx: %s\n", + vf->vf_num, ice_err_str(error)); + status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto done; + } + + /* Configure RX queues in HW */ + error = ice_cfg_vsi_for_rx(vsi); + if (error) { + device_printf(dev, + "VF-%d: Unable to configure VSI for Rx: %s\n", + vf->vf_num, ice_err_str(error)); + status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + ice_vsi_disable_tx(vsi); + goto done; + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + status, NULL, 0, NULL); +} + +/** + * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Sets the RSS key for the given VF, using the contents of msg_buf. 
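+ * + * @remark Keys from 1 byte up to the combined size of the standard and + * extended key fields of struct ice_aqc_get_set_rss_keys are accepted; + * a shorter key is effectively zero-padded, since the key structure is + * zero-initialized before the copy.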
+ */ +static void +ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_aqc_get_set_rss_keys keydata = + { .standard_rss_key = {0}, .extended_hash_key = {0} }; + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_key *vrk; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + vrk = (struct virtchnl_rss_key *)msg_buf; + + if (vrk->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vrk->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if ((vrk->key_len > + (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + + ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) || + vrk->key_len == 0) { + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + memcpy(&keydata, vrk->key, vrk->key_len); + + status = ice_aq_set_rss_key(hw, vsi->idx, &keydata); + if (status) { + device_printf(sc->dev, + "ice_aq_set_rss_key status %s, error %s\n", + ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + goto done; + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the LUT from the VF in msg_buf to the PF via an admin queue call. + */ +static void +ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_lut *vrl; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aq_get_set_rss_lut_params lut_params = {}; + struct ice_vsi *vsi = vf->vsi; + + vrl = (struct virtchnl_rss_lut *)msg_buf; + + if (vrl->vsi_id != vsi->idx) { + device_printf(sc->dev, + "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + vf->vf_num, vsi->idx, vrl->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (vrl->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) { + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + lut_params.vsi_handle = vsi->idx; + lut_params.lut_size = vsi->rss_table_size; + lut_params.lut_type = vsi->rss_lut_type; + lut_params.lut = vrl->lut; + lut_params.global_lut_id = 0; + + status = ice_aq_set_rss_lut(hw, &lut_params); + if (status) { + device_printf(sc->dev, + "VF-%d: Cannot set RSS lut, err %s aq_err %s\n", + vf->vf_num, ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_status = ice_iov_err_to_virt_err(status); + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow + * type list. 
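+ * + * @remark A hena value of 0 is accepted and simply leaves the VSI with + * no RSS configuration, which is how a VF disables RSS entirely.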
+ */ +static void +ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_rss_hena *vrh; + int status = 0; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + + MPASS(vsi != NULL); + + vrh = (struct virtchnl_rss_hena *)msg_buf; + + /* + * Remove the existing configuration to make sure only the + * requested config is applied and to allow VFs to disable + * RSS completely. + */ + status = ice_rem_vsi_rss_cfg(hw, vsi->idx); + if (vrh->hena) { + /* + * A failure to remove the old configuration is not fatal + * when a new one is requested. Warn about it, but try to + * apply the new config anyway. + */ + if (status) + device_printf(sc->dev, + "ice_rem_vsi_rss_cfg status %s, error %s\n", + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + status = ice_add_avf_rss_cfg(hw, vsi->idx, vrh->hena); + if (status) + device_printf(sc->dev, + "ice_add_avf_rss_cfg status %s, error %s\n", + ice_status_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } + v_status = ice_iov_err_to_virt_err(status); + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Enables VF queues selected in msg_buf for Tx/Rx traffic. + * + * @remark Only actually operates on Rx queues; Tx queues are enabled in + * the CONFIG_VSI_QUEUES message handler. + */ +static void +ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf) +{ + struct ice_hw *hw = &sc->hw; + struct virtchnl_queue_select *vqs; + enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi = vf->vsi; + int bit, error = 0; + + vqs = (struct virtchnl_queue_select *)msg_buf; + + if (vqs->vsi_id != vsi->idx) { + device_printf(sc->dev, + "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n", + __func__, vf->vf_num, vsi->idx, vqs->vsi_id); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + if (!vqs->rx_queues && !vqs->tx_queues) { + device_printf(sc->dev, + "%s: VF-%d: message queue masks are empty\n", + __func__, vf->vf_num); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Validate rx_queue mask */ + bit = fls(vqs->rx_queues); + if (bit > vsi->num_rx_queues) { + device_printf(sc->dev, + "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n", + __func__, vf->vf_num, vqs->rx_queues, bit); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + + /* Tx ring enable is handled in an earlier message. */ + for_each_set_bit(bit, &vqs->rx_queues, 32) { + error = ice_control_rx_queue(vsi, bit, true); + if (error) { + device_printf(sc->dev, + "Unable to enable Rx ring %d for receive: %s\n", + bit, ice_err_str(error)); + v_status = VIRTCHNL_STATUS_ERR_PARAM; + goto done; + } + } + +done: + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES, + v_status, NULL, 0, NULL); +} + +/** + * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg + * @sc: PF's softc structure + * @vf: VF tracking structure + * @msg_buf: message buffer from VF + * + * Disables all VF queues for the VF's VSI.
+
+/**
+ * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Disables all VF queues for the VF's VSI.
+ *
+ * @remark Unlike the ENABLE_QUEUES handler, this operates on both
+ * Tx and Rx queues.
+ */
+static void
+ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+			  u8 *msg_buf __unused)
+{
+	struct ice_hw *hw = &sc->hw;
+	enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	int error = 0;
+
+	error = ice_control_all_rx_queues(vsi, false);
+	if (error) {
+		device_printf(sc->dev,
+		    "Unable to disable Rx rings for receive: %s\n",
+		    ice_err_str(error));
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	error = ice_vsi_disable_tx(vsi);
+	if (error) {
+		/* Already prints an error message */
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+	}
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES,
+	    v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CFG_IRQ_MAP msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the interrupt vectors described in the message in msg_buf. The
+ * VF needs to send this message during init, so that queues can be allowed
+ * to generate interrupts.
+ */
+static void
+ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+#define ICE_VIRTCHNL_QUEUE_MAP_SIZE 16
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_irq_map_info *vimi;
+	struct virtchnl_vector_map *vvm;
+	enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi = vf->vsi;
+	u16 vector;
+
+	vimi = (struct virtchnl_irq_map_info *)msg_buf;
+
+	if (vimi->num_vectors > vf->num_irq_vectors) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n",
+		    __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	vvm = vimi->vecmap;
+	/* Save off information from message */
+	for (int i = 0; i < vimi->num_vectors; i++, vvm++) {
+		struct ice_tx_queue *txq;
+		struct ice_rx_queue *rxq;
+		int bit;
+
+		if (vvm->vsi_id != vf->vsi->idx) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n",
+			    __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i);
+			v_status = VIRTCHNL_STATUS_ERR_PARAM;
+			goto done;
+		}
+
+		/* vvm->vector_id is relative to VF space */
+		vector = vvm->vector_id;
+
+		if (vector >= vf->num_irq_vectors) {
+			device_printf(sc->dev,
+			    "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n",
+			    __func__, vf->vf_num, vector, vf->num_irq_vectors - 1);
+			v_status = VIRTCHNL_STATUS_ERR_PARAM;
+			goto done;
+		}
+
+		/* The Misc/Admin Queue vector doesn't need mapping */
+		if (vector == 0)
+			continue;
+
+		/* coverity[address_of] */
+		for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+			if (bit >= vsi->num_tx_queues) {
+				device_printf(sc->dev,
+				    "%s: VF-%d: txq map has invalid bit set\n",
+				    __func__, vf->vf_num);
+				v_status = VIRTCHNL_STATUS_ERR_PARAM;
+				goto done;
+			}
+
+			vf->tx_irqvs[vector].me = vector;
+
+			txq = &vsi->tx_queues[bit];
+			txq->irqv = &vf->tx_irqvs[vector];
+			txq->itr_idx = vvm->txitr_idx;
+		}
+		/* coverity[address_of] */
+		for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+			if (bit >= vsi->num_rx_queues) {
+				device_printf(sc->dev,
+				    "%s: VF-%d: rxq map has invalid bit set\n",
+				    __func__, vf->vf_num);
+				v_status = VIRTCHNL_STATUS_ERR_PARAM;
+				goto done;
+			}
+			vf->rx_irqvs[vector].me = vector;
+
+			rxq = &vsi->rx_queues[bit];
+			rxq->irqv = &vf->rx_irqvs[vector];
+			rxq->itr_idx = vvm->rxitr_idx;
+		}
+	}
+
+	/* Write to T/RQCTL registers to actually map vectors to queues */
+	for (int i = 0; i < vf->vsi->num_rx_queues; i++)
+		if (vsi->rx_queues[i].irqv != NULL)
+			ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i],
+			    vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx);
+
+	for (int i = 0; i < vf->vsi->num_tx_queues; i++)
+		if (vsi->tx_queues[i].irqv != NULL)
+			ice_configure_txq_interrupt(hw, vsi->tx_qmap[i],
+			    vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx);
+
+	ice_flush(hw);
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+	    v_status, NULL, 0, NULL);
+}
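The IRQ-map handler above walks each vector's txq_map/rxq_map with for_each_set_bit() and points every named queue at that vector. A standalone sketch of the same walk, with a hypothetical 16-bit map (mirroring ICE_VIRTCHNL_QUEUE_MAP_SIZE) and a fixed vector:

    #include <stdint.h>
    #include <stdio.h>

    #define QUEUE_MAP_SIZE 16       /* mirrors ICE_VIRTCHNL_QUEUE_MAP_SIZE */

    int
    main(void)
    {
            uint16_t rxq_map = 0x0005;      /* hypothetical: queues 0 and 2 */
            int vector = 3;

            /* Walk the set bits the way for_each_set_bit() does. */
            for (int q = 0; q < QUEUE_MAP_SIZE; q++)
                    if (rxq_map & (1u << q))
                            printf("rx queue %d -> MSI-X vector %d\n", q, vector);
            return (0);
    }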
+
+/**
+ * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl
+ * @istats: VSI stats from HW to convert
+ * @vstats: stats struct to copy to
+ *
+ * This function copies all known stats in struct virtchnl_eth_stats from the
+ * input struct ice_eth_stats to an output struct virtchnl_eth_stats.
+ *
+ * @remark These two structure types currently have the same definition up to
+ * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change
+ * in the future.
+ */
+static void
+ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+				    struct virtchnl_eth_stats *vstats)
+{
+	vstats->rx_bytes = istats->rx_bytes;
+	vstats->rx_unicast = istats->rx_unicast;
+	vstats->rx_multicast = istats->rx_multicast;
+	vstats->rx_broadcast = istats->rx_broadcast;
+	vstats->rx_discards = istats->rx_discards;
+	vstats->rx_unknown_protocol = istats->rx_unknown_protocol;
+	vstats->tx_bytes = istats->tx_bytes;
+	vstats->tx_unicast = istats->tx_unicast;
+	vstats->tx_multicast = istats->tx_multicast;
+	vstats->tx_broadcast = istats->tx_broadcast;
+	vstats->tx_discards = istats->tx_discards;
+	vstats->tx_errors = istats->tx_errors;
+}
+
+/**
+ * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Updates the VF's VSI stats and sends those stats back to the VF.
+ */
+static void
+ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct virtchnl_queue_select *vqs;
+	struct virtchnl_eth_stats stats;
+	struct ice_vsi *vsi = vf->vsi;
+	struct ice_hw *hw = &sc->hw;
+
+	vqs = (struct virtchnl_queue_select *)msg_buf;
+
+	if (vqs->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n",
+		    __func__, vf->vf_num, vqs->vsi_id, vsi->idx);
+		ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+		    VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+		return;
+	}
+
+	ice_update_vsi_hw_stats(vf->vsi);
+	ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats);
+
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+	    VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats,
+	    sizeof(struct virtchnl_eth_stats), NULL);
+}
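The promiscuous-mode handler that follows computes a delta rather than blindly reprogramming: XOR of the old and requested bitmaps yields the changed bits, AND with the old mask isolates the bits to clear, and AND with the requested mask isolates the bits to set. A minimal sketch of the same mask arithmetic on plain integers (the driver does this with ice_xor_bitmap()/ice_and_bitmap() on multi-word bitmaps):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t old = 0x5, req = 0x6;  /* hypothetical mode bitmaps */
            uint32_t changed, clearm, setm;

            changed = old ^ req;    /* bits that differ */
            clearm = changed & old; /* set before, not requested now */
            setm = changed & req;   /* requested now, not set before */
            printf("clear %#x set %#x\n", clearm, setm);
            return (0);
    }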
+
+/**
+ * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the promiscuous modes for the given VSI in msg_buf.
+ */
+static void
+ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+	struct ice_hw *hw = &sc->hw;
+	struct virtchnl_promisc_info *vpi;
+	enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+	int status = 0;
+	struct ice_vsi *vsi = vf->vsi;
+	ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX);
+	ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+	u16 vid;
+
+	vpi = (struct virtchnl_promisc_info *)msg_buf;
+
+	/* Check to see if VF has permission to configure promiscuous mode */
+	if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) {
+		device_printf(sc->dev,
+		    "VF-%d: attempted to configure promiscuous mode\n",
+		    vf->vf_num);
+		/* Don't reply to VF with an error */
+		goto done;
+	}
+
+	if (vpi->vsi_id != vsi->idx) {
+		device_printf(sc->dev,
+		    "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+		    vf->vf_num, vsi->idx, vpi->vsi_id);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) {
+		device_printf(sc->dev,
+		    "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n",
+		    vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS,
+		    vpi->flags);
+		v_status = VIRTCHNL_STATUS_ERR_PARAM;
+		goto done;
+	}
+
+	ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+	/* Convert virtchnl flags to ice AQ promiscuous mode flags */
+	if (vpi->flags & FLAG_VF_UNICAST_PROMISC) {
+		ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask);
+		ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask);
+	}
+	if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) {
+		ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask);
+		ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask);
+	}
+
+	status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid);
+	if (status) {
+		device_printf(sc->dev,
+		    "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n",
+		    vf->vf_num, vsi->idx,
+		    ice_status_str(status),
+		    ice_aq_str(hw->adminq.sq_last_status));
+		v_status = ice_iov_err_to_virt_err(status);
+		goto done;
+	}
+
+	/* Figure out what got added and what got removed */
+	ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+	ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX);
+	ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX);
+	ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX);
+
+	if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) {
+		status = ice_clear_vsi_promisc(hw, vsi->idx,
+		    clear_promisc_mask, 0);
+		if (status) {
+			device_printf(sc->dev,
+			    "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n",
+			    vf->vf_num, vsi->idx,
+			    ice_status_str(status),
+			    ice_aq_str(hw->adminq.sq_last_status));
+			v_status = ice_iov_err_to_virt_err(status);
+			goto done;
+		}
+	}
+
+	if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) {
+		status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0);
+		if (status) {
+			device_printf(sc->dev,
+			    "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n",
+			    vf->vf_num, vsi->idx,
+			    ice_status_str(status),
+			    ice_aq_str(hw->adminq.sq_last_status));
+			v_status = ice_iov_err_to_virt_err(status);
+			goto done;
+		}
+	}
+
+done:
+	ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+	    v_status, NULL, 0, NULL);
+}
+
+/**
+ *
ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state + * @sc: device private structure + * + * Sends a message to all VFs about the status of the PF's link + * state. For more details, @see ice_vc_notify_vf_link_state. + */ +void +ice_vc_notify_all_vfs_link_state(struct ice_softc *sc) +{ + for (int i = 0; i < sc->num_vfs; i++) + ice_vc_notify_vf_link_state(sc, &sc->vfs[i]); +} + +/** + * ice_vc_notify_vf_link_state - Notify VF of PF link state + * @sc: device private structure + * @vf: VF tracking structure + * + * Sends an event message to the specified VF with information about + * the current link state from the PF's port. This includes whether + * link is up or down, and the link speed in 100Mbps units. + */ +static void +ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf) +{ + struct virtchnl_pf_event event = {}; + struct ice_hw *hw = &sc->hw; + + event.event = VIRTCHNL_EVENT_LINK_CHANGE; + event.severity = PF_EVENT_SEVERITY_INFO; + event.event_data.link_event_adv.link_status = sc->link_up; + event.event_data.link_event_adv.link_speed = + (u32)ice_conv_link_speed_to_virtchnl(true, + hw->port_info->phy.link_info.link_speed); + + ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT, + VIRTCHNL_STATUS_SUCCESS, (u8 *)&event, sizeof(event), NULL); +} + +/** + * ice_vc_handle_vf_msg - Handle a message from a VF + * @sc: device private structure + * @event: event received from the HW MBX queue + * + * Called whenever an event is received from a VF on the HW mailbox queue. + * Responsible for handling these messages as well as responding to the + * VF afterwards, depending on the received message type. + */ +void +ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event) +{ + struct ice_hw *hw = &sc->hw; + device_t dev = sc->dev; + struct ice_vf *vf; + int err = 0; + + u32 v_opcode = event->desc.cookie_high; + u16 v_id = event->desc.retval; + u8 *msg = event->msg_buf; + u16 msglen = event->msg_len; + + if (v_id >= sc->num_vfs) { + device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n", + __func__, v_id, v_opcode, msglen); + return; + } + + vf = &sc->vfs[v_id]; + + /* Perform basic checks on the msg */ + err = virtchnl_vc_validate_vf_msg(&vf->version, v_opcode, msg, msglen); + if (err) { + device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n", + __func__, vf->vf_num, v_opcode, msglen, err); + ice_aq_send_msg_to_vf(hw, v_id, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL); + return; + } + + switch (v_opcode) { + case VIRTCHNL_OP_VERSION: + ice_vc_version_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_RESET_VF: + ice_reset_vf(sc, vf, true); + break; + case VIRTCHNL_OP_GET_VF_RESOURCES: + ice_vc_get_vf_res_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ADD_ETH_ADDR: + ice_vc_add_eth_addr_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_DEL_ETH_ADDR: + ice_vc_del_eth_addr_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ADD_VLAN: + ice_vc_add_vlan_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_DEL_VLAN: + ice_vc_del_vlan_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + ice_vc_cfg_vsi_qs_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + ice_vc_cfg_rss_key_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + ice_vc_cfg_rss_lut_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_SET_RSS_HENA: + ice_vc_set_rss_hena_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_ENABLE_QUEUES: + ice_vc_enable_queues_msg(sc, vf, msg); + ice_vc_notify_vf_link_state(sc, vf); + break; 
+ case VIRTCHNL_OP_DISABLE_QUEUES: + ice_vc_disable_queues_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + ice_vc_cfg_irq_map_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_GET_STATS: + ice_vc_get_stats_msg(sc, vf, msg); + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + ice_vc_cfg_promisc_mode_msg(sc, vf, msg); + break; + default: + device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n", + __func__, vf->vf_num, v_opcode, msglen); + ice_aq_send_msg_to_vf(hw, v_id, v_opcode, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL); + break; + } +} + +/** + * ice_iov_setup_intr_mapping - Setup interrupt config for a VF + * @sc: device softc structure + * @vf: driver's VF structure for VF to be configured + * + * Before a VF can be used, and after a VF reset, the PF must configure + * the VF's interrupt allocation registers. This includes allocating + * interrupts from the PF's interrupt pool to the VF using the + * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors + * to VF vectors in GLINT_VECT2FUNC. + * + * As well, this sets up queue allocation registers and maps the mailbox + * interrupt for the VF. + */ +static void +ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf) +{ + struct ice_hw *hw = &sc->hw; + struct ice_vsi *vsi = vf->vsi; + u16 v; + + /* Calculate indices for register ops below */ + u16 vf_first_irq_idx = vf->vf_imap[0]; + u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1; + u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id + + vf_first_irq_idx; + u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1; + u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id; + + /* Map out VF interrupt allocation in global device space. Both + * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values. + */ + wr32(hw, VPINT_ALLOC(vf->vf_num), + (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) | + ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) | + VPINT_ALLOC_VALID_M)); + wr32(hw, VPINT_ALLOC_PCI(vf->vf_num), + (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) | + ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) | + VPINT_ALLOC_PCI_VALID_M)); + + /* Create inverse mapping of vectors to PF/VF combinations */ + for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++) + { + wr32(hw, GLINT_VECT2FUNC(v), + (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) | + ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M))); + } + + /* Map mailbox interrupt to MSI-X index 0. Disable ITR for it, too. 
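 */

The VPINT_ALLOC, GLINT_VECT2FUNC, and VPLAN_*_QTABLE writes above, and the VPINT_MBX_CTL write just below, all pack fields into a 32-bit register with the usual (value << SHIFT) & MASK idiom. A standalone sketch with hypothetical shift/mask values (the driver's real _S/_M constants come from ice_hw_autogen.h):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical field layout for illustration only. */
    #define ALLOC_FIRST_S   0
    #define ALLOC_FIRST_M   (0x7ffu << ALLOC_FIRST_S)
    #define ALLOC_LAST_S    12
    #define ALLOC_LAST_M    (0x7ffu << ALLOC_LAST_S)
    #define ALLOC_VALID_M   (1u << 31)

    int
    main(void)
    {
            uint32_t first = 5, last = 8;

            /* Shift each value into place, mask off overflow, OR together. */
            uint32_t reg = ((first << ALLOC_FIRST_S) & ALLOC_FIRST_M) |
                ((last << ALLOC_LAST_S) & ALLOC_LAST_M) | ALLOC_VALID_M;

            printf("%#010x\n", reg);
            return (0);
    }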
+	wr32(hw, VPINT_MBX_CTL(abs_vf_num),
+	    ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) |
+	    ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) |
+	    VPINT_MBX_CTL_CAUSE_ENA_M);
+
+	/* Mark the TX queue mapping registers as valid */
+	wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M);
+
+	/* Indicate to HW that VF has scattered queue allocation */
+	wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M);
+	for (int i = 0; i < vsi->num_tx_queues; i++) {
+		wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num),
+		    (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M);
+	}
+
+	/* Mark the RX queue mapping registers as valid */
+	wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M);
+	wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M);
+	for (int i = 0; i < vsi->num_rx_queues; i++) {
+		wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num),
+		    (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M);
+	}
+}
+
+/**
+ * ice_iov_err_to_virt_err - Translate ice errors into virtchnl errors
+ * @ice_err: status returned from ice function
+ */
+static enum virtchnl_status_code
+ice_iov_err_to_virt_err(int ice_err)
+{
+	switch (ice_err) {
+	case 0:
+		return VIRTCHNL_STATUS_SUCCESS;
+	case ICE_ERR_BAD_PTR:
+	case ICE_ERR_INVAL_SIZE:
+	case ICE_ERR_DEVICE_NOT_SUPPORTED:
+	case ICE_ERR_PARAM:
+	case ICE_ERR_CFG:
+		return VIRTCHNL_STATUS_ERR_PARAM;
+	case ICE_ERR_NO_MEMORY:
+		return VIRTCHNL_STATUS_ERR_NO_MEMORY;
+	case ICE_ERR_NOT_READY:
+	case ICE_ERR_RESET_FAILED:
+	case ICE_ERR_FW_API_VER:
+	case ICE_ERR_AQ_ERROR:
+	case ICE_ERR_AQ_TIMEOUT:
+	case ICE_ERR_AQ_FULL:
+	case ICE_ERR_AQ_NO_WORK:
+	case ICE_ERR_AQ_EMPTY:
+		return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+	default:
+		return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+	}
+}
diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h
new file mode 100644
index 000000000000..c4fb3e932e3f
--- /dev/null
+++ b/sys/dev/ice/ice_iov.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.h
+ * @brief header for IOV functionality
+ *
+ * This header includes definitions used to implement device Virtual Functions
+ * for the ice driver.
+ */
+
+#ifndef _ICE_IOV_H_
+#define _ICE_IOV_H_
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/nv.h>
+#include <sys/iov_schema.h>
+#include <sys/param.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/pci/pci_iov.h>
+
+#include "ice_iflib.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * @enum ice_vf_flags
+ * @brief VF state flags
+ *
+ * Used to indicate the status of a PF's VF, as well as indicating what each VF
+ * is capable of. Intended to be modified only using atomic operations, so
+ * they can be read and modified in places that aren't locked.
+ *
+ * Used in struct ice_vf's vf_flags field.
+ */
+enum ice_vf_flags {
+	VF_FLAG_ENABLED			= BIT(0),
+	VF_FLAG_SET_MAC_CAP		= BIT(1),
+	VF_FLAG_VLAN_CAP		= BIT(2),
+	VF_FLAG_PROMISC_CAP		= BIT(3),
+	VF_FLAG_MAC_ANTI_SPOOF		= BIT(4),
+};
+
+/**
+ * @struct ice_vf
+ * @brief PF's VF software context
+ *
+ * Represents the state and options for a VF spawned from a PF.
+ */
+struct ice_vf {
+	struct ice_vsi *vsi;
+	u32 vf_flags;
+
+	u8 mac[ETHER_ADDR_LEN];
+	u16 vf_num;
+	struct virtchnl_version_info version;
+
+	u16 mac_filter_limit;
+	u16 mac_filter_cnt;
+	u16 vlan_limit;
+	u16 vlan_cnt;
+
+	u16 num_irq_vectors;
+	u16 *vf_imap;
+	struct ice_irq_vector *tx_irqvs;
+	struct ice_irq_vector *rx_irqvs;
+};
+
+#define ICE_PCIE_DEV_STATUS			0xAA
+
+#define ICE_PCI_CIAD_WAIT_COUNT			100
+#define ICE_PCI_CIAD_WAIT_DELAY_US		1
+#define ICE_VPGEN_VFRSTAT_WAIT_COUNT		100
+#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US		20
+
+#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS	(FLAG_VF_UNICAST_PROMISC | \
+						 FLAG_VF_MULTICAST_PROMISC)
+
+#define ICE_DEFAULT_VF_VLAN_LIMIT		64
+#define ICE_DEFAULT_VF_FILTER_LIMIT		16
+
+int ice_iov_attach(struct ice_softc *sc);
+int ice_iov_detach(struct ice_softc *sc);
+
+int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params);
+int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params);
+void ice_iov_uninit(struct ice_softc *sc);
+
+void ice_iov_handle_vflr(struct ice_softc *sc);
+
+void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event);
+void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc);
+
+#endif /* _ICE_IOV_H_ */
+
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index d44ae5f37750..442111e5ffaf 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -42,6 +42,9 @@
 #include "ice_lib.h"
 #include "ice_iflib.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <machine/resource.h>
@@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi)
 	case ICE_VSI_VMDQ2:
 		ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2;
 		break;
+#ifdef PCI_IOV
+	case ICE_VSI_VF:
+		ctx.flags = ICE_AQ_VSI_TYPE_VF;
+		ctx.vf_num = vsi->vf_num;
+		break;
+#endif
 	default:
 		return (ENODEV);
 	}
@@ -1607,6
+1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf case ICE_VSI_VMDQ2: tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; break; +#ifdef PCI_IOV + case ICE_VSI_VF: + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; + tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num; + break; +#endif default: return (ENODEV); } @@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi) struct ice_tlan_ctx tlan_ctx = { 0 }; struct ice_tx_queue *txq = &vsi->tx_queues[i]; + /* Last configured queue */ + if (txq->desc_count == 0) + break; + pf_q = vsi->tx_qmap[txq->me]; qg->txqs[0].txq_id = htole16(pf_q); @@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi) for (i = 0; i < vsi->num_rx_queues; i++) { MPASS(vsi->mbuf_sz > 0); + /* Last configured queue */ + if (vsi->rx_queues[i].desc_count == 0) + break; + err = ice_setup_rx_ctx(&vsi->rx_queues[i]); if (err) return err; @@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname, case ice_aqc_opc_get_link_status: ice_process_link_event(sc, event); break; +#ifdef PCI_IOV + case ice_mbx_opc_send_msg_to_pf: + ice_vc_handle_vf_msg(sc, event); + break; +#endif case ice_aqc_opc_fw_logs_event: ice_handle_fw_log_event(sc, &event->desc, event->msg_buf); break; diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h index b6b23ec82161..308b2bda2790 100644 --- a/sys/dev/ice/ice_lib.h +++ b/sys/dev/ice/ice_lib.h @@ -611,6 +611,10 @@ struct ice_vsi { u16 mirror_src_vsi; u16 rule_mir_ingress; u16 rule_mir_egress; + +#ifdef PCI_IOV + u8 vf_num; /* Index of owning VF, if applicable */ +#endif }; /** diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c new file mode 100644 index 000000000000..387a1c6739a6 --- /dev/null +++ b/sys/dev/ice/ice_vf_mbx.c @@ -0,0 +1,471 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "ice_common.h" +#include "ice_hw_autogen.h" +#include "ice_vf_mbx.h" + +/** + * ice_aq_send_msg_to_vf + * @hw: pointer to the hardware structure + * @vfid: VF ID to send msg + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cd: pointer to command details + * + * Send message to VF driver (0x0802) using mailbox + * queue and asynchronously sending message via + * ice_sq_send_cmd() function + */ +int +ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, + u8 *msg, u16 msglen, struct ice_sq_cd *cd) +{ + struct ice_aqc_pf_vf_msg *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf); + + cmd = &desc.params.virt; + cmd->id = CPU_TO_LE32(vfid); + + desc.cookie_high = CPU_TO_LE32(v_opcode); + desc.cookie_low = CPU_TO_LE32(v_retval); + + if (msglen) + desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD); + + return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); +} + +/** + * ice_aq_send_msg_to_pf + * @hw: pointer to the hardware structure + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cd: pointer to command details + * + * Send message to PF driver using mailbox queue. By default, this + * message is sent asynchronously, i.e. ice_sq_send_cmd() + * does not wait for completion before returning. + */ +int +ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode, + int v_retval, u8 *msg, u16 msglen, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_pf); + desc.cookie_high = CPU_TO_LE32(v_opcode); + desc.cookie_low = CPU_TO_LE32(v_retval); + + if (msglen) + desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD); + + return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); +} + +static const u32 ice_legacy_aq_to_vc_speed[] = { + VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */ + VIRTCHNL_LINK_SPEED_100MB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_1GB, + VIRTCHNL_LINK_SPEED_10GB, + VIRTCHNL_LINK_SPEED_20GB, + VIRTCHNL_LINK_SPEED_25GB, + VIRTCHNL_LINK_SPEED_40GB, + VIRTCHNL_LINK_SPEED_40GB, + VIRTCHNL_LINK_SPEED_40GB, +}; + +/** + * ice_conv_link_speed_to_virtchnl + * @adv_link_support: determines the format of the returned link speed + * @link_speed: variable containing the link_speed to be converted + * + * Convert link speed supported by HW to link speed supported by virtchnl. + * If adv_link_support is true, then return link speed in Mbps. Else return + * link speed as a VIRTCHNL_LINK_SPEED_* casted to a u32. Note that the caller + * needs to cast back to an enum virtchnl_link_speed in the case where + * adv_link_support is false, but when adv_link_support is true the caller can + * expect the speed in Mbps. + */ +u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) +{ + /* convert a BIT() value into an array index */ + u16 index = (u16)(ice_fls(link_speed) - 1); + + if (adv_link_support) + return ice_get_link_speed(index); + else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed)) + /* Virtchnl speeds are not defined for every speed supported in + * the hardware. 
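The fls()-based conversion in ice_conv_link_speed_to_virtchnl() turns a BIT(n)-style AQ speed value into a table index: fls() of a single-bit value is n + 1, so subtracting one recovers n. A standalone sketch with a hypothetical condensed speed table (fls() is in <strings.h> on FreeBSD):

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>        /* fls() */

    /* Hypothetical condensed version of the legacy speed table above. */
    static const char *vc_speed[] = {
            "100M", "100M", "1G", "1G", "1G", "10G",
            "20G", "25G", "40G", "40G", "40G",
    };

    int
    main(void)
    {
            uint16_t link_speed = 1u << 5;          /* a BIT(5)-style AQ value */
            int idx = fls(link_speed) - 1;          /* BIT(n) -> index n */

            if (idx >= 0 && idx < (int)(sizeof(vc_speed) / sizeof(vc_speed[0])))
                    printf("%s\n", vc_speed[idx]);
            else
                    printf("unknown\n");
            return (0);
    }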
To maintain compatibility with older AVF
+	 * drivers, the new speed values are resolved to the closest
+	 * known virtchnl speeds when reporting.
+	 */
+		return ice_legacy_aq_to_vc_speed[index];
+
+	return VIRTCHNL_LINK_SPEED_UNKNOWN;
+}
+
+/* The mailbox overflow detection algorithm helps to check if there
+ * is a possibility of a malicious VF transmitting too many MBX messages to the
+ * PF.
+ * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during
+ * driver initialization in ice_init_hw() using ice_mbx_init_snapshot().
+ * The struct ice_mbx_snapshot helps to track and traverse a static window of
+ * messages within the mailbox queue while looking for a malicious VF.
+ *
+ * 2. When the caller starts processing its mailbox queue in response to an
+ * interrupt, the structure ice_mbx_snapshot is expected to be cleared before
+ * the algorithm can be run for the first time for that interrupt. This
+ * requires calling ice_mbx_reset_snapshot() as well as calling
+ * ice_mbx_reset_vf_info() for each VF tracking structure.
+ *
+ * 3. For every message read by the caller from the MBX Queue, the caller must
+ * call the detection algorithm's entry function ice_mbx_vf_state_handler().
+ * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is
+ * filled as it is required to be passed to the algorithm.
+ *
+ * 4. Every time a message is read from the MBX queue, a tracking structure
+ * for the VF must be passed to the state handler. The boolean output
+ * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the
+ * caller whether it must report this VF as malicious or not.
+ *
+ * 5. When a VF is identified to be malicious, the caller can send a message
+ * to the system administrator.
+ *
+ * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info
+ * structure for each VF. The PF should clear the VF tracking structure if the
+ * VF is reset. When a VF is shut down and brought back up, we will then
+ * assume that the new VF is not malicious and may report it again if we
+ * detect it again.
+ *
+ * 7. The function ice_mbx_reset_snapshot() is called to reset the information
+ * in ice_mbx_snapshot for every new mailbox interrupt handled.
+ */
+#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M)
+/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that
+ * the max messages check must be ignored in the algorithm
+ */
+#define ICE_IGNORE_MAX_MSG_CNT	0xFFFF
+
+/**
+ * ice_mbx_reset_snapshot - Initialize mailbox snapshot structure
+ * @snap: pointer to the mailbox snapshot
+ */
+static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap)
+{
+	struct ice_mbx_vf_info *vf_info;
+
+	/* Clear mbx_buf in the mailbox snapshot structure and set the
+	 * mailbox snapshot state to a new capture.
+	 */
+	ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM);
+	snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+
+	/* Reset message counts for all VFs to zero */
+	LIST_FOR_EACH_ENTRY(vf_info, &snap->mbx_vf, ice_mbx_vf_info, list_entry)
+		vf_info->msg_count = 0;
+}
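Underneath the state machine described above, the detector's core is just a per-VF message counter compared against ICE_ASYNC_VF_MSG_THRESHOLD, with a report emitted only on the first crossing. A compressed standalone sketch of that bookkeeping:

    #include <stdbool.h>
    #include <stdio.h>

    #define MSG_THRESHOLD 63        /* mirrors ICE_ASYNC_VF_MSG_THRESHOLD */

    struct vf_info {
            int msg_count;
            bool malicious;
    };

    /* Count one message against a VF; flag it only the first time the
     * threshold is crossed, as the snapshot code does. */
    static bool
    account_msg(struct vf_info *vf)
    {
            if (++vf->msg_count >= MSG_THRESHOLD && !vf->malicious) {
                    vf->malicious = true;
                    return (true);  /* report once */
            }
            return (false);
    }

    int
    main(void)
    {
            struct vf_info vf = { 0, false };

            for (int i = 0; i < 100; i++)
                    if (account_msg(&vf))
                            printf("report at message %d\n", i + 1);
            return (0);
    }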
+
+/**
+ * ice_mbx_traverse - Pass through mailbox snapshot
+ * @hw: pointer to the HW struct
+ * @new_state: new algorithm state
+ *
+ * Traverse the mailbox static snapshot without checking
+ * for malicious VFs.
+ */
+static void
+ice_mbx_traverse(struct ice_hw *hw,
+		 enum ice_mbx_snapshot_state *new_state)
+{
+	struct ice_mbx_snap_buffer_data *snap_buf;
+	u32 num_iterations;
+
+	snap_buf = &hw->mbx_snapshot.mbx_buf;
+
+	/* As the mailbox buffer is circular, apply a mask
+	 * on the incremented iteration count.
+	 */
+	num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations);
+
+	/* Check either of the below conditions to exit snapshot traversal:
+	 * Condition-1: If the number of iterations in the mailbox is equal to
+	 * the mailbox head, we have reached the end of the static snapshot.
+	 * Condition-2: If the maximum messages serviced in the mailbox for a
+	 * given interrupt is the highest possible value, then there is no need
+	 * to check if the number of messages processed is equal to it. If not,
+	 * check if the number of messages processed is greater than or equal
+	 * to the maximum number of mailbox entries serviced in the current
+	 * work item.
+	 */
+	if (num_iterations == snap_buf->head ||
+	    (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT &&
+	     ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx))
+		*new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_detect_malvf - Detect malicious VF in snapshot
+ * @hw: pointer to the HW struct
+ * @vf_info: mailbox tracking structure for a VF
+ * @new_state: new algorithm state
+ * @is_malvf: boolean output to indicate if VF is malicious
+ *
+ * This function tracks the number of asynchronous messages
+ * sent per VF and marks the VF as malicious if it exceeds
+ * the permissible number of messages to send.
+ */
+static int
+ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
+		     enum ice_mbx_snapshot_state *new_state,
+		     bool *is_malvf)
+{
+	/* increment the message count for this VF */
+	vf_info->msg_count++;
+
+	if (vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD)
+		*is_malvf = true;
+
+	/* continue to iterate through the mailbox snapshot */
+	ice_mbx_traverse(hw, new_state);
+
+	return 0;
+}
+
+/**
+ * ice_e830_mbx_vf_dec_trig - Decrement the VF mailbox queue counter
+ * @hw: pointer to the HW struct
+ * @event: pointer to the control queue receive event
+ *
+ * This function triggers a decrement of the counter
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes
+ * the buffers at the PF mailbox queue.
+ */
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+			      struct ice_rq_event_info *event)
+{
+	u16 vfid = LE16_TO_CPU(event->desc.retval);
+
+	wr32(hw, E830_MBX_VF_DEC_TRIG(vfid), 1);
+}
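ice_mbx_traverse() above walks a circular queue by masking the incremented iteration count and stopping when it wraps around to the captured head. A standalone sketch of that wrap-around walk, with a hypothetical 10-bit ring mask (the driver masks with PF_MBX_ARQH_ARQH_M):

    #include <stdint.h>
    #include <stdio.h>

    #define RING_MASK 0x3ffu        /* hypothetical ring-size mask */

    int
    main(void)
    {
            uint32_t head = 5, iter = 1020, steps = 0;

            /* Keep incrementing under the mask until we wrap to the head. */
            while ((iter = (iter + 1) & RING_MASK) != head)
                    steps++;
            printf("wrapped to head after %u steps\n", steps);
            return (0);
    }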
+
+/**
+ * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count
+ * @hw: pointer to the HW struct
+ * @vf_id: VF ID in the PF space
+ *
+ * This function clears the counter MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT, and should
+ * be called when a VF is created and on VF reset.
+ */
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id)
+{
+	u32 reg = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id));
+
+	wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), reg);
+}
+
+/**
+ * ice_mbx_vf_state_handler - Handle states of the overflow algorithm
+ * @hw: pointer to the HW struct
+ * @mbx_data: pointer to structure containing mailbox data
+ * @vf_info: mailbox tracking structure for the VF in question
+ * @report_malvf: boolean output to indicate whether VF should be reported
+ *
+ * The function serves as an entry point for the malicious VF
+ * detection algorithm by handling the different states and state
+ * transitions of the algorithm:
+ * New snapshot: This state is entered when creating a new static
+ * snapshot. The data from any previous mailbox snapshot is
+ * cleared and a new capture of the mailbox head and tail is
+ * logged. This will be the new static snapshot to detect
+ * asynchronous messages sent by VFs. On capturing the snapshot
+ * and depending on whether the number of pending messages in that
+ * snapshot exceed the watermark value, the state machine enters
+ * traverse or detect states.
+ * Traverse: If pending message count is below watermark then iterate
+ * through the snapshot without any action on VF.
+ * Detect: If pending message count exceeds watermark traverse
+ * the static snapshot and look for a malicious VF.
+ */
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+			 struct ice_mbx_vf_info *vf_info, bool *report_malvf)
+{
+	struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+	struct ice_mbx_snap_buffer_data *snap_buf;
+	struct ice_ctl_q_info *cq = &hw->mailboxq;
+	enum ice_mbx_snapshot_state new_state;
+	int status = 0;
+	bool is_malvf = false;
+
+	if (!report_malvf || !mbx_data || !vf_info)
+		return ICE_ERR_BAD_PTR;
+
+	*report_malvf = false;
+
+	/* When entering the mailbox state machine assume that the VF
+	 * is not malicious until detected.
+	 */
+	/* Check that the number of messages allowed to be processed while
+	 * servicing the current interrupt is not less than the defined AVF
+	 * message threshold.
+	 */
+	if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD)
+		return ICE_ERR_INVAL_SIZE;
+
+	/* The watermark value should not be less than the threshold limit
+	 * set for the number of asynchronous messages a VF can send to the
+	 * mailbox, nor should it be greater than the maximum number of
+	 * messages in the mailbox serviced in the current interrupt.
+	 */
+	if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD ||
+	    mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx)
+		return ICE_ERR_PARAM;
+
+	new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+	snap_buf = &snap->mbx_buf;
+
+	switch (snap_buf->state) {
+	case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
+		/* Clear any previously held data in mailbox snapshot structure. */
+		ice_mbx_reset_snapshot(snap);
+
+		/* Collect the pending ARQ count, number of messages processed and
+		 * the maximum number of messages allowed to be processed from the
+		 * Mailbox for current interrupt.
+		 */
+		snap_buf->num_pending_arq = mbx_data->num_pending_arq;
+		snap_buf->num_msg_proc = mbx_data->num_msg_proc;
+		snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx;
+
+		/* Capture a new static snapshot of the mailbox by logging the
+		 * head and tail of snapshot and set num_iterations to the tail
+		 * value to mark the start of the iteration through the snapshot.
+ */ + snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean + + mbx_data->num_pending_arq); + snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1); + snap_buf->num_iterations = snap_buf->tail; + + /* Pending ARQ messages returned by ice_clean_rq_elem + * is the difference between the head and tail of the + * mailbox queue. Comparing this value against the watermark + * helps to check if we potentially have malicious VFs. + */ + if (snap_buf->num_pending_arq >= + mbx_data->async_watermark_val) { + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf); + } else { + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + } + break; + + case ICE_MAL_VF_DETECT_STATE_TRAVERSE: + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + break; + + case ICE_MAL_VF_DETECT_STATE_DETECT: + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf); + break; + + default: + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + status = ICE_ERR_CFG; + } + + snap_buf->state = new_state; + + /* Only report VFs as malicious the first time we detect it */ + if (is_malvf && !vf_info->malicious) { + vf_info->malicious = 1; + *report_malvf = true; + } + + return status; +} + +/** + * ice_mbx_clear_malvf - Clear VF mailbox info + * @vf_info: the mailbox tracking structure for a VF + * + * In case of a VF reset, this function shall be called to clear the VF's + * current mailbox tracking state. + */ +void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info) +{ + vf_info->malicious = 0; + vf_info->msg_count = 0; +} + +/** + * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info + * @hw: pointer to the hardware structure + * @vf_info: the mailbox tracking info structure for a VF + * + * Initialize a VF mailbox tracking info structure and insert it into the + * snapshot list. + * + * If you remove the VF, you must also delete the associated VF info structure + * from the linked list. + */ +void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + ice_mbx_clear_malvf(vf_info); + LIST_ADD(&vf_info->list_entry, &snap->mbx_vf); +} + +/** + * ice_mbx_init_snapshot - Initialize mailbox snapshot data + * @hw: pointer to the hardware structure + * + * Clear the mailbox snapshot structure and initialize the VF mailbox list. + */ +void ice_mbx_init_snapshot(struct ice_hw *hw) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + INIT_LIST_HEAD(&snap->mbx_vf); + ice_mbx_reset_snapshot(snap); +} diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h new file mode 100644 index 000000000000..3b185ac89c11 --- /dev/null +++ b/sys/dev/ice/ice_vf_mbx.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2025, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ICE_VF_MBX_H_
+#define _ICE_VF_MBX_H_
+
+#include "ice_type.h"
+#include "ice_controlq.h"
+
+/* Defining the mailbox message threshold as 63 asynchronous
+ * pending messages. Normal VF functionality does not require
+ * sending more than 63 asynchronous pending messages.
+ */
+
+/* The threshold value should be used to initialize the
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register.
+ */
+#define ICE_ASYNC_VF_MSG_THRESHOLD	63
+
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+		      int v_retval, u8 *msg, u16 msglen,
+		      struct ice_sq_cd *cd);
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+		      u8 *msg, u16 msglen, struct ice_sq_cd *cd);
+
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed);
+
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+			      struct ice_rq_event_info *event);
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id);
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+			 struct ice_mbx_vf_info *vf_info, bool *report_malvf);
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_snapshot(struct ice_hw *hw);
+#endif /* _ICE_VF_MBX_H_ */
diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c
index e60ee0f1c5c3..1469d2916465 100644
--- a/sys/dev/ice/if_ice_iflib.c
+++ b/sys/dev/ice/if_ice_iflib.c
@@ -42,6 +42,9 @@
 #include "ice_drv_info.h"
 #include "ice_switch.h"
 #include "ice_sched.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
 
 #include <sys/module.h>
 #include <sys/sockio.h>
@@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx);
 static int ice_if_resume(if_ctx_t ctx);
 static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event);
 static void ice_init_link(struct ice_softc *sc);
+#ifdef PCI_IOV
+static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params);
+static void ice_if_iov_uninit(if_ctx_t ctx);
+static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params);
+static void ice_if_vflr_handle(if_ctx_t ctx);
+#endif
 static int ice_setup_mirror_vsi(struct ice_mirr_if *mif);
 static int ice_wire_mirror_intrs(struct ice_mirr_if *mif);
 static void ice_free_irqvs_subif(struct ice_mirr_if *mif);
@@ -158,6 +167,11 @@ static device_method_t ice_methods[] = {
 	DEVMETHOD(device_shutdown, iflib_device_shutdown),
 	DEVMETHOD(device_suspend, iflib_device_suspend),
 	DEVMETHOD(device_resume, iflib_device_resume),
+#ifdef PCI_IOV + DEVMETHOD(pci_iov_init, iflib_device_iov_init), + DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit), + DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf), +#endif DEVMETHOD_END }; @@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = { DEVMETHOD(ifdi_suspend, ice_if_suspend), DEVMETHOD(ifdi_resume, ice_if_resume), DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart), +#ifdef PCI_IOV + DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add), + DEVMETHOD(ifdi_iov_init, ice_if_iov_init), + DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit), + DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle), +#endif DEVMETHOD_END }; @@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media) iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0); ice_rdma_link_change(sc, LINK_STATE_DOWN, 0); } +#ifdef PCI_IOV + ice_vc_notify_all_vfs_link_state(sc); +#endif update_media = true; } @@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx) ice_add_device_sysctls(sc); +#ifdef PCI_IOV + if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) { + err = ice_iov_attach(sc); + if (err == ENOMEM) + return (err); + } +#endif /* PCI_IOV */ + /* Get DCBX/LLDP state and start DCBX agent */ ice_init_dcb_setup(sc); @@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx) ice_destroy_mirror_interface(sc); ice_rdma_pf_detach(sc); +#ifdef PCI_IOV + if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) + ice_iov_detach(sc); +#endif /* PCI_IOV */ + /* Free allocated media types */ ifmedia_removeall(sc->media); @@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix) /* For future interrupt assignments */ sc->last_rid = rid + sc->irdma_vectors; +#ifdef PCI_IOV + /* Create soft IRQ for handling VF resets */ + iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov"); +#endif + return (0); fail: for (; i >= 0; i--, vector--) @@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc) ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); +#ifdef PCI_IOV + if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en)) + ice_iov_detach(sc); +#else ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); +#endif /* PCI_IOV */ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_vsi_del_txqs_ctx(vsi); @@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc) ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); +#ifdef PCI_IOV + if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en)) + ice_iov_detach(sc); +#else ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); +#endif /* PCI_IOV */ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap); @@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx) /* Check and update link status */ ice_update_link_status(sc, false); +#ifdef PCI_IOV + /* + * Schedule VFs' reset handler after global resets + * and other events were processed. + */ + if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING)) + iflib_iov_intr_deferred(ctx); +#endif + /* * If there are still messages to process, we need to reschedule * ourselves. Otherwise, we can just re-enable the interrupt. We'll be @@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc) } +#ifdef PCI_IOV +/** + * ice_if_iov_init - iov init handler for iflib + * @ctx: iflib context pointer + * @num_vfs: number of VFs to create + * @params: configuration parameters for the PF + * + * Configure the driver for SR-IOV mode. Used to setup things like memory + * before any VFs are created. 
+ *
+ * @remark This is a wrapper for ice_iov_init
+ */
+static int
+ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params)
+{
+	struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+	return ice_iov_init(sc, num_vfs, params);
+}
+
+/**
+ * ice_if_iov_uninit - iov uninit handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Destroys VFs and frees their memory and resources.
+ *
+ * @remark This is a wrapper for ice_iov_uninit
+ */
+static void
+ice_if_iov_uninit(if_ctx_t ctx)
+{
+	struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+	ice_iov_uninit(sc);
+}
+
+/**
+ * ice_if_iov_vf_add - iov add vf handler for iflib
+ * @ctx: iflib context pointer
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * Sets up the VF given by the vfnum index. This is called by the OS
+ * for each VF created by the PF driver after it is spawned.
+ *
+ * @remark This is a wrapper for ice_iov_vf_add
+ */
+static int
+ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params)
+{
+	struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+	return ice_iov_add_vf(sc, vfnum, params);
+}
+
+/**
+ * ice_if_vflr_handle - iov VFLR handler
+ * @ctx: iflib context pointer
+ *
+ * Performs the necessary teardown or setup required for a VF after
+ * a VFLR is initiated.
+ *
+ * @remark This is a wrapper for ice_iov_handle_vflr
+ */
+static void
+ice_if_vflr_handle(if_ctx_t ctx)
+{
+	struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+	ice_iov_handle_vflr(sc);
+}
+#endif /* PCI_IOV */
+
 extern struct if_txrx ice_subif_txrx;
 
 /**
diff --git a/sys/dev/ichiic/ig4_pci.c b/sys/dev/ichiic/ig4_pci.c
index 0195466150eb..3a49e220e335 100644
--- a/sys/dev/ichiic/ig4_pci.c
+++ b/sys/dev/ichiic/ig4_pci.c
@@ -186,6 +186,12 @@ static int ig4iic_pci_detach(device_t dev);
 #define PCI_CHIP_METEORLAKE_M_I2C_3	0x7e518086
 #define PCI_CHIP_METEORLAKE_M_I2C_4	0x7e7a8086
 #define PCI_CHIP_METEORLAKE_M_I2C_5	0x7e7b8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_0	0x77788086
+#define PCI_CHIP_ARROWLAKE_U_I2C_1	0x77798086
+#define PCI_CHIP_ARROWLAKE_U_I2C_2	0x777a8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_3	0x777b8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_4	0x77508086
+#define PCI_CHIP_ARROWLAKE_U_I2C_5	0x77518086
 
 struct ig4iic_pci_device {
 	uint32_t	devid;
@@ -316,6 +322,12 @@ static struct ig4iic_pci_device ig4iic_pci_devices[] = {
 	{ PCI_CHIP_METEORLAKE_M_I2C_3, "Intel Meteor Lake-M I2C Controller-3", IG4_TIGERLAKE},
 	{ PCI_CHIP_METEORLAKE_M_I2C_4, "Intel Meteor Lake-M I2C Controller-4", IG4_TIGERLAKE},
 	{ PCI_CHIP_METEORLAKE_M_I2C_5, "Intel Meteor Lake-M I2C Controller-5", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_0, "Intel Arrow Lake-H/U I2C Controller-0", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_1, "Intel Arrow Lake-H/U I2C Controller-1", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_2, "Intel Arrow Lake-H/U I2C Controller-2", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_3, "Intel Arrow Lake-H/U I2C Controller-3", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_4, "Intel Arrow Lake-H/U I2C Controller-4", IG4_TIGERLAKE},
+	{ PCI_CHIP_ARROWLAKE_U_I2C_5, "Intel Arrow Lake-H/U I2C Controller-5", IG4_TIGERLAKE},
 };
 
 static int
diff --git a/sys/dev/iicbus/gpio/tca64xx.c b/sys/dev/iicbus/gpio/tca64xx.c
index 3b3bca9936f1..cd011ae9be75 100644
--- a/sys/dev/iicbus/gpio/tca64xx.c
+++ b/sys/dev/iicbus/gpio/tca64xx.c
@@ -261,14 +261,13 @@ tca64xx_attach(device_t dev)
 	sc->addr = iicbus_get_addr(dev);
 	mtx_init(&sc->mtx, "tca64xx gpio", "gpio", MTX_DEF);
+
OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev); sc->busdev = gpiobus_attach_bus(dev); if (sc->busdev == NULL) { device_printf(dev, "Could not create busdev child\n"); return (ENXIO); } - OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev); - #ifdef DEBUG switch (sc->chip) { case TCA6416_TYPE: diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index b842d4f2fd8e..29dc0c880e3a 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -11,9 +11,9 @@ */ /*- - * The following functions are based on the vn(4) driver: mdstart_swap(), - * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), - * and as such under the following copyright: + * The following functions are based on the historical vn(4) driver: + * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() + * and mddestroy(), and as such under the following copyright: * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 @@ -1559,19 +1559,26 @@ mddestroy(struct md_s *sc, struct thread *td) mtx_destroy(&sc->queue_mtx); switch (sc->type) { case MD_VNODE: - vn_lock(sc->s_vnode.vnode, LK_EXCLUSIVE | LK_RETRY); - sc->s_vnode.vnode->v_vflag &= ~VV_MD; - VOP_UNLOCK(sc->s_vnode.vnode); - (void)vn_close(sc->s_vnode.vnode, sc->flags & MD_READONLY ? - FREAD : (FREAD|FWRITE), sc->cred, td); - kva_free(sc->s_vnode.kva, maxphys + PAGE_SIZE); + if (sc->s_vnode.vnode != NULL) { + vn_lock(sc->s_vnode.vnode, LK_EXCLUSIVE | LK_RETRY); + sc->s_vnode.vnode->v_vflag &= ~VV_MD; + VOP_UNLOCK(sc->s_vnode.vnode); + (void)vn_close(sc->s_vnode.vnode, + sc->flags & MD_READONLY ? FREAD : (FREAD|FWRITE), + sc->cred, td); + } + if (sc->s_vnode.kva != 0) + kva_free(sc->s_vnode.kva, maxphys + PAGE_SIZE); break; case MD_SWAP: - vm_object_deallocate(sc->s_swap.object); + if (sc->s_swap.object != NULL) + vm_object_deallocate(sc->s_swap.object); break; case MD_MALLOC: - destroy_indir(sc, sc->s_malloc.indir); - uma_zdestroy(sc->s_malloc.uma); + if (sc->s_malloc.indir != NULL) + destroy_indir(sc, sc->s_malloc.indir); + if (sc->s_malloc.uma != NULL) + uma_zdestroy(sc->s_malloc.uma); break; case MD_PRELOAD: case MD_NULL: diff --git a/sys/dev/mem/memutil.c b/sys/dev/mem/memutil.c index cf9714d6ec8f..20ce337df0ab 100644 --- a/sys/dev/mem/memutil.c +++ b/sys/dev/mem/memutil.c @@ -26,15 +26,14 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
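The memutil.c hunks below swap the mem_range lock from rwlock(9) to sx(9). An sx lock may be held across sleeps, presumably so the mr_op handlers can block while the lock is held; an rwlock cannot. A minimal kernel-style sketch of the sx(9) pattern used, with hypothetical names:

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/sx.h>

    static struct sx ex_lock;       /* hypothetical lock, mirroring mr_lock */

    static void
    ex_init(void)
    {
            sx_init(&ex_lock, "example");
    }

    static void
    ex_read(int *out, const int *shared_state)
    {
            sx_slock(&ex_lock);     /* shared: many readers may hold it */
            *out = *shared_state;
            sx_sunlock(&ex_lock);
    }

    static void
    ex_write(int *shared_state, int v)
    {
            sx_xlock(&ex_lock);     /* exclusive; sleeping is legal here */
            *shared_state = v;
            sx_xunlock(&ex_lock);
    }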
*/ -#include <sys/param.h> +#include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/memrange.h> -#include <sys/rwlock.h> -#include <sys/systm.h> +#include <sys/sx.h> -static struct rwlock mr_lock; +static struct sx mr_lock; /* * Implementation-neutral, kernel-callable functions for manipulating @@ -46,7 +45,7 @@ mem_range_init(void) if (mem_range_softc.mr_op == NULL) return; - rw_init(&mr_lock, "memrange"); + sx_init(&mr_lock, "memrange"); mem_range_softc.mr_op->init(&mem_range_softc); } @@ -56,7 +55,7 @@ mem_range_destroy(void) if (mem_range_softc.mr_op == NULL) return; - rw_destroy(&mr_lock); + sx_destroy(&mr_lock); } int @@ -67,12 +66,12 @@ mem_range_attr_get(struct mem_range_desc *mrd, int *arg) if (mem_range_softc.mr_op == NULL) return (EOPNOTSUPP); nd = *arg; - rw_rlock(&mr_lock); + sx_slock(&mr_lock); if (nd == 0) *arg = mem_range_softc.mr_ndesc; else bcopy(mem_range_softc.mr_desc, mrd, nd * sizeof(*mrd)); - rw_runlock(&mr_lock); + sx_sunlock(&mr_lock); return (0); } @@ -83,8 +82,8 @@ mem_range_attr_set(struct mem_range_desc *mrd, int *arg) if (mem_range_softc.mr_op == NULL) return (EOPNOTSUPP); - rw_wlock(&mr_lock); + sx_xlock(&mr_lock); ret = mem_range_softc.mr_op->set(&mem_range_softc, mrd, arg); - rw_wunlock(&mr_lock); + sx_xunlock(&mr_lock); return (ret); } diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c index 1240d0f84415..409f34167df0 100644 --- a/sys/dev/mgb/if_mgb.c +++ b/sys/dev/mgb/if_mgb.c @@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc) /* Stop MAC */ CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL); - CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL); + CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL); if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0))) return (err); if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0))) diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h index 361b9f72d873..c3f3a2372482 100644 --- a/sys/dev/mlx5/mlx5_accel/ipsec.h +++ b/sys/dev/mlx5/mlx5_accel/ipsec.h @@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv); void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv); int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv); int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr); -void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, - struct mlx5e_rq_mbuf *mr); +void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb, + struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr); static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe) { @@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe) } static inline void -mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe, +mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr) { u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata); if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data)) - mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr); + mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr); } #endif /* __MLX5_ACCEL_IPSEC_H__ */ diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c index 0883cfb2d510..5dccb8bc2b87 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c @@ -24,11 +24,14 @@ * */ +#include "opt_ipsec.h" + #include <sys/mbuf.h> #include <sys/socket.h> #include <netinet/in.h> #include <netipsec/keydb.h> 
#include <netipsec/ipsec_offload.h> +#include <netipsec/xform.h> #include <dev/mlx5/qp.h> #include <dev/mlx5/mlx5_en/en.h> #include <dev/mlx5/mlx5_accel/ipsec.h> @@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr) return (0); mtag = (struct ipsec_accel_in_tag *)m_tag_get( - PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT); + PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) - + __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT); if (mtag == NULL) return (-ENOMEM); mr->ipsec_mtag = mtag; @@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr) } void -mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, - struct mlx5e_rq_mbuf *mr) +mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb, + struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr) { struct ipsec_accel_in_tag *mtag; u32 drv_spi; @@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata)); mtag = mr->ipsec_mtag; WARN_ON(mtag == NULL); - mr->ipsec_mtag = NULL; if (mtag != NULL) { mtag->drv_spi = drv_spi; - m_tag_prepend(mb, &mtag->tag); + if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) { + m_tag_prepend(mb, &mtag->tag); + mr->ipsec_mtag = NULL; + } } } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c index 8b8f2e570245..89d2010656c5 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c @@ -42,13 +42,30 @@ static if_snd_tag_free_t mlx5e_tls_rx_snd_tag_free; static if_snd_tag_modify_t mlx5e_tls_rx_snd_tag_modify; +static if_snd_tag_status_str_t mlx5e_tls_rx_snd_tag_status_str; static const struct if_snd_tag_sw mlx5e_tls_rx_snd_tag_sw = { .snd_tag_modify = mlx5e_tls_rx_snd_tag_modify, .snd_tag_free = mlx5e_tls_rx_snd_tag_free, + .snd_tag_status_str = mlx5e_tls_rx_snd_tag_status_str, .type = IF_SND_TAG_TYPE_TLS_RX }; +static const char *mlx5e_tls_rx_progress_params_auth_state_str[] = { + [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD] = "no_offload", + [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_OFFLOAD] = "offload", + [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION] = + "authentication", +}; + +static const char *mlx5e_tls_rx_progress_params_record_tracker_state_str[] = { + [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START] = "start", + [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING] = + "tracking", + [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING] = + "searching", +}; + MALLOC_DEFINE(M_MLX5E_TLS_RX, "MLX5E_TLS_RX", "MLX5 ethernet HW TLS RX"); /* software TLS RX context */ @@ -250,7 +267,8 @@ mlx5e_tls_rx_send_progress_parameters_sync(struct mlx5e_iq *iq, mtx_unlock(&iq->lock); while (1) { - if (wait_for_completion_timeout(&ptag->progress_complete, hz) != 0) + if (wait_for_completion_timeout(&ptag->progress_complete, + msecs_to_jiffies(1000)) != 0) break; priv = container_of(iq, struct mlx5e_channel, iq)->priv; if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || @@ -331,7 +349,8 @@ done: * Zero is returned upon success, else some error happened. 
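
The m_tag_get() call above requests only the tail of struct ipsec_accel_in_tag, from the xh member onward, rather than sizeof(*mtag). A standalone illustration of that offsetof arithmetic; the types below are stand-ins, the real layout lives in netipsec/ipsec_offload.h:

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-ins for the real netipsec structures. */
    struct xform_history { unsigned proto, mode; };
    struct in_tag {
    	char	tag[16];		/* stand-in for struct m_tag */
    	struct xform_history xh;	/* first payload member */
    	unsigned drv_spi;
    };

    int
    main(void)
    {
    	/* Length passed to m_tag_get(): everything from xh onward. */
    	size_t len = sizeof(struct in_tag) - offsetof(struct in_tag, xh);

    	printf("tag payload length: %zu bytes\n", len);
    	return (0);
    }
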
*/ static int -mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag) +mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, + struct mlx5e_tls_rx_tag *ptag, mlx5e_iq_callback_t *cb) { struct mlx5e_get_tls_progress_params_wqe *wqe; const u32 ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); @@ -367,7 +386,7 @@ mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_r memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - iq->data[pi].callback = &mlx5e_tls_rx_receive_progress_parameters_cb; + iq->data[pi].callback = cb; iq->data[pi].arg = ptag; m_snd_tag_ref(&ptag->tag); @@ -640,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p return (EINVAL); MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he); - MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he); + MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0); + MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he); return (0); } @@ -819,6 +839,7 @@ mlx5e_tls_rx_snd_tag_alloc(if_t ifp, } ptag->flow_rule = flow_rule; + init_completion(&ptag->progress_complete); return (0); @@ -968,7 +989,8 @@ mlx5e_tls_rx_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_param params->tls_rx.tls_rec_length, params->tls_rx.tls_seq_number) && ptag->tcp_resync_pending == 0) { - err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag); + err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag, + &mlx5e_tls_rx_receive_progress_parameters_cb); if (err != 0) { MLX5E_TLS_RX_STAT_INC(ptag, rx_resync_err, 1); } else { @@ -1001,6 +1023,74 @@ mlx5e_tls_rx_snd_tag_free(struct m_snd_tag *pmt) queue_work(priv->tls_rx.wq, &ptag->work); } +static void +mlx5e_tls_rx_str_status_cb(void *arg) +{ + struct mlx5e_tls_rx_tag *ptag; + + ptag = (struct mlx5e_tls_rx_tag *)arg; + complete_all(&ptag->progress_complete); + m_snd_tag_rele(&ptag->tag); +} + +static int +mlx5e_tls_rx_snd_tag_status_str(struct m_snd_tag *pmt, char *buf, size_t *sz) +{ + int err, out_size; + struct mlx5e_iq *iq; + void *buffer; + uint32_t tracker_state_val; + uint32_t auth_state_val; + struct mlx5e_priv *priv; + struct mlx5e_tls_rx_tag *ptag = + container_of(pmt, struct mlx5e_tls_rx_tag, tag); + + if (buf == NULL) + return (0); + + MLX5E_TLS_RX_TAG_LOCK(ptag); + priv = container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx); + iq = mlx5e_tls_rx_get_iq(priv, ptag->flowid, ptag->flowtype); + reinit_completion(&ptag->progress_complete); + err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag, + &mlx5e_tls_rx_str_status_cb); + MLX5E_TLS_RX_TAG_UNLOCK(ptag); + if (err != 0) + return (err); + + for (;;) { + if (wait_for_completion_timeout(&ptag->progress_complete, + msecs_to_jiffies(1000)) != 0) + break; + if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + pci_channel_offline(priv->mdev->pdev) != 0) + return (ENXIO); + } + buffer = mlx5e_tls_rx_get_progress_buffer(ptag); + tracker_state_val = MLX5_GET(tls_progress_params, buffer, + record_tracker_state); + auth_state_val = MLX5_GET(tls_progress_params, buffer, auth_state); + + /* Validate tracker state value is in range */ + if (tracker_state_val > + MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING) + return (EINVAL); + + /* Validate auth state value is in range */ + if (auth_state_val > + MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION) + return (EINVAL); + + out_size = snprintf(buf, 
*sz, "tracker_state: %s, auth_state: %s", + mlx5e_tls_rx_progress_params_record_tracker_state_str[ + tracker_state_val], + mlx5e_tls_rx_progress_params_auth_state_str[auth_state_val]); + + if (out_size <= *sz) + *sz = out_size; + return (0); +} + #else int diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c index 6b53db6fea23..eb569488631a 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, break; } - mlx5e_accel_ipsec_handle_rx(mb, cqe, mr); + mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr); } static inline void diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index 73a7cee4aad0..fd7f00ced14b 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -48,7 +48,7 @@ #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, - struct nvme_async_event_request *aer); + struct nvme_async_event_request *aer); static void nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags) @@ -680,96 +680,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, } static void -nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) -{ - struct nvme_async_event_request *aer = arg; - struct nvme_health_information_page *health_info; - struct nvme_ns_list *nsl; - struct nvme_error_information_entry *err; - int i; - - /* - * If the log page fetch for some reason completed with an error, - * don't pass log page data to the consumers. In practice, this case - * should never happen. - */ - if (nvme_completion_is_error(cpl)) - nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, - aer->log_page_id, NULL, 0); - else { - /* Convert data to host endian */ - switch (aer->log_page_id) { - case NVME_LOG_ERROR: - err = (struct nvme_error_information_entry *)aer->log_page_buffer; - for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) - nvme_error_information_entry_swapbytes(err++); - break; - case NVME_LOG_HEALTH_INFORMATION: - nvme_health_information_page_swapbytes( - (struct nvme_health_information_page *)aer->log_page_buffer); - break; - case NVME_LOG_CHANGED_NAMESPACE: - nvme_ns_list_swapbytes( - (struct nvme_ns_list *)aer->log_page_buffer); - break; - case NVME_LOG_COMMAND_EFFECT: - nvme_command_effects_page_swapbytes( - (struct nvme_command_effects_page *)aer->log_page_buffer); - break; - case NVME_LOG_RES_NOTIFICATION: - nvme_res_notification_page_swapbytes( - (struct nvme_res_notification_page *)aer->log_page_buffer); - break; - case NVME_LOG_SANITIZE_STATUS: - nvme_sanitize_status_page_swapbytes( - (struct nvme_sanitize_status_page *)aer->log_page_buffer); - break; - default: - break; - } - - if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { - health_info = (struct nvme_health_information_page *) - aer->log_page_buffer; - nvme_ctrlr_log_critical_warnings(aer->ctrlr, - health_info->critical_warning); - /* - * Critical warnings reported through the - * SMART/health log page are persistent, so - * clear the associated bits in the async event - * config so that we do not receive repeated - * notifications for the same event. 
- */ - aer->ctrlr->async_event_config &= - ~health_info->critical_warning; - nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, - aer->ctrlr->async_event_config, NULL, NULL); - } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE && - !nvme_use_nvd) { - nsl = (struct nvme_ns_list *)aer->log_page_buffer; - for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { - if (nsl->ns[i] > NVME_MAX_NAMESPACES) - break; - nvme_notify_ns(aer->ctrlr, nsl->ns[i]); - } - } - - /* - * Pass the cpl data from the original async event completion, - * not the log page fetch. - */ - nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, - aer->log_page_id, aer->log_page_buffer, aer->log_page_size); - } - - /* - * Repost another asynchronous event request to replace the one - * that just completed. - */ - nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); -} - -static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; @@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) return; } - /* Associated log page is in bits 23:16 of completion entry dw0. */ + /* + * Save the completion status; the associated log page is in bits 23:16 + * of completion entry dw0. Print a message and queue it for further + * processing. + */ + memcpy(&aer->cpl, cpl, sizeof(*cpl)); aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0); - nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x," " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0), NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0), aer->log_page_id); - - if (is_log_page_id_valid(aer->log_page_id)) { - aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, - aer->log_page_id); - memcpy(&aer->cpl, cpl, sizeof(*cpl)); - nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, - NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, - aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, - aer); - /* Wait to notify consumers until after log page is fetched. */ - } else { - nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, - NULL, 0); - - /* - * Repost another asynchronous event request to replace the one - * that just completed. - */ - nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); - } + taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task); } static void @@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, { struct nvme_request *req; - aer->ctrlr = ctrlr; /* - * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable - * callback context. AER completions should be handled on a dedicated - * thread. + * We're racing the reset thread, so let that process submit this again. + * XXX does this really solve that race? And is that race even possible + * since we only reset when we've not heard from the card in a long + * time. Why would we get an AER in the middle of that just before we + * kick off the reset?
*/ - req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb, + if (ctrlr->is_resetting) + return; + + aer->ctrlr = ctrlr; + req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb, aer); aer->req = req; + aer->log_page_id = 0; /* Not a valid page */ /* * Disable timeout here, since asynchronous event requests should by @@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending) atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); } +static void +nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl) +{ + struct nvme_async_event_request *aer = arg; + + mtx_lock(&aer->mtx); + if (nvme_completion_is_error(cpl)) + aer->log_page_size = (uint32_t)-1; + else + aer->log_page_size = nvme_ctrlr_get_log_page_size( + aer->ctrlr, aer->log_page_id); + wakeup(aer); + mtx_unlock(&aer->mtx); +} + +static void +nvme_ctrlr_aer_task(void *arg, int pending) +{ + struct nvme_async_event_request *aer = arg; + struct nvme_controller *ctrlr = aer->ctrlr; + uint32_t len; + + /* + * We're resetting, so just punt. + */ + if (ctrlr->is_resetting) + return; + + if (!is_log_page_id_valid(aer->log_page_id)) { + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ + nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id, + NULL, 0); + goto out; + } + + aer->log_page_size = 0; + len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); + nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, + NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len, + nvme_ctrlr_aer_done, aer); + mtx_lock(&aer->mtx); + while (aer->log_page_size == 0) + mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0); + mtx_unlock(&aer->mtx); + + if (aer->log_page_size == (uint32_t)-1) { + /* + * If the log page fetch for some reason completed with an + * error, don't pass log page data to the consumers. In + * practice, this case should never happen. + */ + nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, + aer->log_page_id, NULL, 0); + goto out; + } + + /* Convert data to host endian */ + switch (aer->log_page_id) { + case NVME_LOG_ERROR: { + struct nvme_error_information_entry *err = + (struct nvme_error_information_entry *)aer->log_page_buffer; + for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) + nvme_error_information_entry_swapbytes(err++); + break; + } + case NVME_LOG_HEALTH_INFORMATION: + nvme_health_information_page_swapbytes( + (struct nvme_health_information_page *)aer->log_page_buffer); + break; + case NVME_LOG_CHANGED_NAMESPACE: + nvme_ns_list_swapbytes( + (struct nvme_ns_list *)aer->log_page_buffer); + break; + case NVME_LOG_COMMAND_EFFECT: + nvme_command_effects_page_swapbytes( + (struct nvme_command_effects_page *)aer->log_page_buffer); + break; + case NVME_LOG_RES_NOTIFICATION: + nvme_res_notification_page_swapbytes( + (struct nvme_res_notification_page *)aer->log_page_buffer); + break; + case NVME_LOG_SANITIZE_STATUS: + nvme_sanitize_status_page_swapbytes( + (struct nvme_sanitize_status_page *)aer->log_page_buffer); + break; + default: + break; + } + + if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { + struct nvme_health_information_page *health_info = + (struct nvme_health_information_page *)aer->log_page_buffer; + + /* + * Critical warnings reported through the SMART/health log page + * are persistent, so clear the associated bits in the async + * event config so that we do not receive repeated notifications + * for the same event.
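
The reworked AER path above is an async-to-sync bridge: nvme_ctrlr_aer_done() runs in completion context, records a size (0 meaning still in flight, (uint32_t)-1 meaning error), and wakes the task sleeping in mtx_sleep(). A runnable userspace model of that handoff, with a mutex and condition variable standing in for mtx_sleep()/wakeup(); all names are illustrative:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static uint32_t log_page_size;	/* 0: in flight, -1: error, else: length */

    /* Completion side, like nvme_ctrlr_aer_done(): record and wake. */
    static void *
    done_cb(void *arg)
    {
    	pthread_mutex_lock(&mtx);
    	log_page_size = 512;	/* would be (uint32_t)-1 on error */
    	pthread_cond_signal(&cv);
    	pthread_mutex_unlock(&mtx);
    	return (NULL);
    }

    int
    main(void)
    {
    	pthread_t t;

    	log_page_size = 0;	/* mark the fetch as in flight */
    	pthread_create(&t, NULL, done_cb, NULL);

    	/* Task side, like nvme_ctrlr_aer_task(): sleep for the result. */
    	pthread_mutex_lock(&mtx);
    	while (log_page_size == 0)
    		pthread_cond_wait(&cv, &mtx);
    	pthread_mutex_unlock(&mtx);

    	if (log_page_size == (uint32_t)-1)
    		printf("fetch failed; notify consumers with no payload\n");
    	else
    		printf("fetched %u bytes\n", log_page_size);
    	pthread_join(t, NULL);
    	return (0);
    }
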
+ */ + nvme_ctrlr_log_critical_warnings(aer->ctrlr, + health_info->critical_warning); + aer->ctrlr->async_event_config &= + ~health_info->critical_warning; + nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, + aer->ctrlr->async_event_config, NULL, NULL); + } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) { + struct nvme_ns_list *nsl = + (struct nvme_ns_list *)aer->log_page_buffer; + for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { + if (nsl->ns[i] > NVME_MAX_NAMESPACES) + break; + nvme_notify_ns(aer->ctrlr, nsl->ns[i]); + } + } + + /* + * Pass the cpl data from the original async event completion, not the + * log page fetch. + */ + nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, + aer->log_page_id, aer->log_page_buffer, aer->log_page_size); + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ +out: + nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); +} + /* * Poll all the queues enabled on the device for completion. */ @@ -1574,13 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) /* * Create 2 threads for the taskqueue. The reset thread will block when * it detects that the controller has failed until all I/O has been - * failed up the stack. The fail_req task needs to be able to run in - * this case to finish the request failure for some cases. - * - * We could partially solve this race by draining the failed requeust - * queue before proceding to free the sim, though nothing would stop - * new I/O from coming in after we do that drain, but before we reach - * cam_sim_free, so this big hammer is used instead. + * failed up the stack. The second thread is used for AER events, which + * can block, but only briefly for memory and log page fetching. */ ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); @@ -1590,7 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) ctrlr->is_initialized = false; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); - STAILQ_INIT(&ctrlr->fail_req); + for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) { + struct nvme_async_event_request *aer = &ctrlr->aer[i]; + + TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer); + mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF); + } ctrlr->is_failed = false; make_dev_args_init(&md_args); @@ -1678,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) } noadminq: - if (ctrlr->taskqueue) + if (ctrlr->taskqueue) { taskqueue_free(ctrlr->taskqueue); + for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) { + struct nvme_async_event_request *aer = &ctrlr->aer[i]; + + mtx_destroy(&aer->mtx); + } + } if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 949e69ec9290..36f00fedc48e 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -123,6 +123,8 @@ struct nvme_request { struct nvme_async_event_request { struct nvme_controller *ctrlr; struct nvme_request *req; + struct task task; + struct mtx mtx; struct nvme_completion cpl; uint32_t log_page_id; uint32_t log_page_size; @@ -307,8 +309,6 @@ struct nvme_controller { bool isr_warned; bool is_initialized; - STAILQ_HEAD(, nvme_request) fail_req; - /* Host Memory Buffer */ int hmb_nchunks; size_t hmb_chunk; diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c index dbdd4568bdf1..1ac0d142443b 100644 --- a/sys/dev/nvmf/host/nvmf.c 
+++ b/sys/dev/nvmf/host/nvmf.c @@ -27,6 +27,7 @@ #include <dev/nvmf/host/nvmf_var.h> static struct cdevsw nvmf_cdevsw; +static struct taskqueue *nvmf_tq; bool nvmf_fail_disconnect = false; SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, @@ -34,7 +35,10 @@ SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); +static void nvmf_controller_loss_task(void *arg, int pending); static void nvmf_disconnect_task(void *arg, int pending); +static void nvmf_request_reconnect(struct nvmf_softc *sc); +static void nvmf_request_reconnect_task(void *arg, int pending); static void nvmf_shutdown_pre_sync(void *arg, int howto); static void nvmf_shutdown_post_sync(void *arg, int howto); @@ -294,6 +298,9 @@ nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl) admin = nvlist_get_nvlist(nvl, "admin"); io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); kato = dnvlist_get_number(nvl, "kato", 0); + sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0); + sc->controller_loss_timeout = dnvlist_get_number(nvl, + "controller_loss_timeout", 0); /* Setup the admin queue. */ sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0); @@ -504,6 +511,10 @@ nvmf_attach(device_t dev) callout_init(&sc->ka_tx_timer, 1); sx_init(&sc->connection_lock, "nvmf connection"); TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); + TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0, + nvmf_controller_loss_task, sc); + TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0, + nvmf_request_reconnect_task, sc); oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq", @@ -603,7 +614,9 @@ out: nvmf_destroy_aer(sc); - taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task); + taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task); + taskqueue_drain(nvmf_tq, &sc->disconnect_task); sx_destroy(&sc->connection_lock); nvlist_destroy(sc->rparams); free(sc->cdata, M_NVMF); @@ -613,7 +626,7 @@ out: void nvmf_disconnect(struct nvmf_softc *sc) { - taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); + taskqueue_enqueue(nvmf_tq, &sc->disconnect_task); } static void @@ -676,6 +689,74 @@ nvmf_disconnect_task(void *arg, int pending __unused) nvmf_destroy_qp(sc->admin); sc->admin = NULL; + if (sc->reconnect_delay != 0) + nvmf_request_reconnect(sc); + if (sc->controller_loss_timeout != 0) + taskqueue_enqueue_timeout(nvmf_tq, + &sc->controller_loss_task, sc->controller_loss_timeout * + hz); + + sx_xunlock(&sc->connection_lock); +} + +static void +nvmf_controller_loss_task(void *arg, int pending) +{ + struct nvmf_softc *sc = arg; + device_t dev; + int error; + + bus_topo_lock(); + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching) { + /* Reconnected or already detaching. */ + sx_xunlock(&sc->connection_lock); + bus_topo_unlock(); + return; + } + + sc->controller_timedout = true; + sx_xunlock(&sc->connection_lock); + + /* + * XXX: Doing this from here is a bit ugly. We don't have an + * extra reference on `dev` but bus_topo_lock should block any + * concurrent device_delete_child invocations. 
+ */ + dev = sc->dev; + error = device_delete_child(root_bus, dev); + if (error != 0) + device_printf(dev, + "failed to detach after controller loss: %d\n", error); + bus_topo_unlock(); +} + +static void +nvmf_request_reconnect(struct nvmf_softc *sc) +{ + char buf[64]; + + sx_assert(&sc->connection_lock, SX_LOCKED); + + snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev)); + devctl_notify("nvme", "controller", "RECONNECT", buf); + taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task, + sc->reconnect_delay * hz); +} + +static void +nvmf_request_reconnect_task(void *arg, int pending) +{ + struct nvmf_softc *sc = arg; + + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching || sc->controller_timedout) { + /* Reconnected or already detaching. */ + sx_xunlock(&sc->connection_lock); + return; + } + + nvmf_request_reconnect(sc); sx_xunlock(&sc->connection_lock); } @@ -699,7 +780,7 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) } sx_xlock(&sc->connection_lock); - if (sc->admin != NULL || sc->detaching) { + if (sc->admin != NULL || sc->detaching || sc->controller_timedout) { error = EBUSY; goto out; } @@ -745,6 +826,9 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) nvmf_reconnect_sim(sc); nvmf_rescan_all_ns(sc); + + taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL); + taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL); out: sx_xunlock(&sc->connection_lock); nvlist_destroy(nvl); @@ -852,7 +936,21 @@ nvmf_detach(device_t dev) } free(sc->io, M_NVMF); - taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + taskqueue_drain(nvmf_tq, &sc->disconnect_task); + if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, + NULL) != 0) + taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task); + + /* + * Don't cancel/drain the controller loss task if that task + * has fired and is triggering the detach. 
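
The detach path here relies on the usual cancel-then-drain idiom for timeout tasks: taskqueue_cancel_timeout() returns non-zero when the task can no longer be cancelled (typically because it is already running), and the caller must then drain it before freeing anything the task touches. A kernel-style sketch with an illustrative softc:

    #include <sys/param.h>
    #include <sys/taskqueue.h>

    struct example_softc {
    	struct taskqueue *tq;
    	struct timeout_task loss_task;	/* illustrative */
    };

    static void
    example_detach(struct example_softc *sc)
    {
    	/*
    	 * Cancel if still pending; non-zero means the task may already
    	 * be running, so drain before its state is freed.
    	 */
    	if (taskqueue_cancel_timeout(sc->tq, &sc->loss_task, NULL) != 0)
    		taskqueue_drain_timeout(sc->tq, &sc->loss_task);
    }
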
+ */ + if (!sc->controller_timedout) { + if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, + NULL) != 0) + taskqueue_drain_timeout(nvmf_tq, + &sc->controller_loss_task); + } if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); @@ -1154,14 +1252,25 @@ static struct cdevsw nvmf_cdevsw = { static int nvmf_modevent(module_t mod, int what, void *arg) { + int error; + switch (what) { case MOD_LOAD: - return (nvmf_ctl_load()); + error = nvmf_ctl_load(); + if (error != 0) + return (error); + + nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO, + taskqueue_thread_enqueue, &nvmf_tq); + taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq"); + return (0); case MOD_QUIESCE: return (0); case MOD_UNLOAD: nvmf_ctl_unload(); destroy_dev_drain(&nvmf_cdevsw); + if (nvmf_tq != NULL) + taskqueue_free(nvmf_tq); return (0); default: return (EOPNOTSUPP); diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h index e45a31f413a4..606245b3969c 100644 --- a/sys/dev/nvmf/host/nvmf_var.h +++ b/sys/dev/nvmf/host/nvmf_var.h @@ -75,9 +75,15 @@ struct nvmf_softc { struct callout ka_rx_timer; sbintime_t ka_rx_sbt; + struct timeout_task request_reconnect_task; + struct timeout_task controller_loss_task; + uint32_t reconnect_delay; + uint32_t controller_loss_timeout; + struct sx connection_lock; struct task disconnect_task; bool detaching; + bool controller_timedout; u_int num_aer; struct nvmf_aer *aer; diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h index d4e7b1511e9d..9b2b4c1dea40 100644 --- a/sys/dev/nvmf/nvmf.h +++ b/sys/dev/nvmf/nvmf.h @@ -27,6 +27,13 @@ #define NVMF_NN (1024) /* + * Default timeouts for Fabrics hosts. These match values used by + * Linux. + */ +#define NVMF_DEFAULT_RECONNECT_DELAY 10 +#define NVMF_DEFAULT_CONTROLLER_LOSS 600 + +/* * (data, size) is the userspace buffer for a packed nvlist. 
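
The handoff nvlist gains two optional numbers, and dnvlist_get_number() supplies a default when a key is absent, which is how nvmf_establish_connection() above falls back to 0. A small illustrative program against userland libnv (build with -lnv); the 10 and 600 defaults mirror NVMF_DEFAULT_RECONNECT_DELAY and NVMF_DEFAULT_CONTROLLER_LOSS:

    #include <sys/nv.h>
    #include <sys/dnv.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	nvlist_t *nvl = nvlist_create(0);

    	nvlist_add_number(nvl, "reconnect_delay", 5);
    	/* "controller_loss_timeout" is deliberately left unset. */

    	printf("reconnect_delay=%ju\n", (uintmax_t)dnvlist_get_number(nvl,
    	    "reconnect_delay", 10));
    	printf("controller_loss_timeout=%ju\n",
    	    (uintmax_t)dnvlist_get_number(nvl, "controller_loss_timeout",
    	    600));
    	nvlist_destroy(nvl);
    	return (0);
    }
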
* * For requests that copyout an nvlist, len is the amount of data @@ -68,6 +75,8 @@ struct nvmf_ioc_nv { * * number trtype * number kato (optional) + * number reconnect_delay (optional) + * number controller_loss_timeout (optional) * qpair handoff nvlist admin * qpair handoff nvlist array io * binary cdata struct nvme_controller_data @@ -81,6 +90,8 @@ struct nvmf_ioc_nv { * string hostnqn * number num_io_queues * number kato (optional) + * number reconnect_delay (optional) + * number controller_loss_timeout (optional) * number io_qsize * bool sq_flow_control * diff --git a/sys/dev/ofw/ofw_bus_subr.c b/sys/dev/ofw/ofw_bus_subr.c index 4d0479dfb957..b99d784929bc 100644 --- a/sys/dev/ofw/ofw_bus_subr.c +++ b/sys/dev/ofw/ofw_bus_subr.c @@ -634,11 +634,89 @@ ofw_bus_find_iparent(phandle_t node) return (iparent); } +static phandle_t +ofw_bus_search_iparent(phandle_t node) +{ + phandle_t iparent; + + do { + if (OF_getencprop(node, "interrupt-parent", &iparent, + sizeof(iparent)) > 0) { + node = OF_node_from_xref(iparent); + } else { + node = OF_parent(node); + } + if (node == 0) + return (0); + } while (!OF_hasprop(node, "#interrupt-cells")); + + return (OF_xref_from_node(node)); +} + +static int +ofw_bus_traverse_imap(phandle_t inode, phandle_t node, uint32_t *intr, + int intrsz, pcell_t *res, int ressz, phandle_t *iparentp) +{ + struct ofw_bus_iinfo ii; + void *reg; + uint32_t *intrp; + phandle_t iparent; + int rv = 0; + + /* We already have an interrupt controller */ + if (OF_hasprop(node, "interrupt-controller")) + return (0); + + intrp = malloc(intrsz, M_OFWPROP, M_WAITOK); + memcpy(intrp, intr, intrsz); + + while (true) { + /* There is no interrupt-map to follow */ + if (!OF_hasprop(inode, "interrupt-map")) { + free(intrp, M_OFWPROP); + return (0); + } + + memset(&ii, 0, sizeof(ii)); + ofw_bus_setup_iinfo(inode, &ii, sizeof(cell_t)); + + reg = NULL; + if (ii.opi_addrc > 0) + reg = malloc(ii.opi_addrc, M_OFWPROP, M_WAITOK); + + rv = ofw_bus_lookup_imap(node, &ii, reg, ii.opi_addrc, intrp, + intrsz, res, ressz, &iparent); + + free(reg, M_OFWPROP); + free(ii.opi_imap, M_OFWPROP); + free(ii.opi_imapmsk, M_OFWPROP); + free(intrp, M_OFWPROP); + + if (rv == 0) + return (0); + + node = inode; + inode = OF_node_from_xref(iparent); + + /* Stop when we have an interrupt controller */ + if (OF_hasprop(inode, "interrupt-controller")) { + *iparentp = iparent; + return (rv); + } + + intrsz = rv * sizeof(pcell_t); + intrp = malloc(intrsz, M_OFWPROP, M_WAITOK); + memcpy(intrp, res, intrsz); + } +} + int ofw_bus_intr_to_rl(device_t dev, phandle_t node, struct resource_list *rl, int *rlen) { - phandle_t iparent; + phandle_t iparent, iparent_node; + uint32_t result[16]; + uint32_t intrpcells, *intrp; uint32_t icells, *intr; int err, i, irqnum, nintr, rid; bool extended; @@ -646,15 +724,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node, nintr = OF_getencprop_alloc_multi(node, "interrupts", sizeof(*intr), (void **)&intr); if (nintr > 0) { - iparent = ofw_bus_find_iparent(node); + iparent = ofw_bus_search_iparent(node); if (iparent == 0) { device_printf(dev, "No interrupt-parent found, " "assuming direct parent\n"); iparent = OF_parent(node); iparent = OF_xref_from_node(iparent); } - if (OF_searchencprop(OF_node_from_xref(iparent), - "#interrupt-cells", &icells, sizeof(icells)) == -1) { + iparent_node = OF_node_from_xref(iparent); + if (OF_searchencprop(iparent_node, "#interrupt-cells", &icells, + sizeof(icells)) == -1) { device_printf(dev, "Missing #interrupt-cells " "property, assuming <1>\n"); icells = 1; 
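
For context, a hedged sketch of a typical caller of the interrupt code above: a bus driver gathering a child's interrupts into a resource list at attach time, which is where the interrupt-map traversal runs. The driver shape is hypothetical; only ofw_bus_intr_to_rl() and its signature come from the code shown:

    static int
    mybus_add_child_intrs(device_t bus, device_t child)
    {
    	struct resource_list rl;
    	phandle_t node;
    	int error, nirq;

    	node = ofw_bus_get_node(child);
    	resource_list_init(&rl);

    	/*
    	 * Decodes "interrupts"/"interrupts-extended", walking any
    	 * interrupt-map properties, and adds one SYS_RES_IRQ entry
    	 * per decoded interrupt.
    	 */
    	error = ofw_bus_intr_to_rl(child, node, &rl, &nirq);
    	if (error == 0)
    		device_printf(bus, "added %d IRQ entries\n", nirq);
    	return (error);
    }
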
@@ -677,7 +756,8 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node, for (i = 0; i < nintr; i += icells) { if (extended) { iparent = intr[i++]; - if (OF_searchencprop(OF_node_from_xref(iparent), + iparent_node = OF_node_from_xref(iparent); + if (OF_searchencprop(iparent_node, "#interrupt-cells", &icells, sizeof(icells)) == -1) { device_printf(dev, "Missing #interrupt-cells " "property\n"); @@ -691,7 +771,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node, break; } } - irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]); + + intrp = &intr[i]; + intrpcells = ofw_bus_traverse_imap(iparent_node, node, intrp, + icells * sizeof(intr[0]), result, sizeof(result), &iparent); + if (intrpcells > 0) + intrp = result; + else + intrpcells = icells; + + irqnum = ofw_bus_map_intr(dev, iparent, intrpcells, intrp); resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1); } if (rlen != NULL) diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c index 05ec69a70dfe..4ad190374f87 100644 --- a/sys/dev/qlnx/qlnxe/qlnx_os.c +++ b/sys/dev/qlnx/qlnxe/qlnx_os.c @@ -30,6 +30,8 @@ * Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131. */ +#include "opt_inet.h" + #include <sys/cdefs.h> #include "qlnx_os.h" #include "bcm_osal.h" @@ -2306,8 +2308,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) else if (device_id == QLOGIC_PCI_DEVICE_ID_1644) if_setbaudrate(ifp, IF_Gbps(100)); - if_setcapabilities(ifp, IFCAP_LINKSTATE); - if_setinitfn(ifp, qlnx_init); if_setsoftc(ifp, ha); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); @@ -2341,7 +2341,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) if_setcapabilities(ifp, IFCAP_HWCSUM); if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0); - if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0); @@ -2350,6 +2349,8 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); if_setcapabilitiesbit(ifp, IFCAP_LRO, 0); + if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0); + if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0); if_sethwtsomax(ifp, QLNX_MAX_TSO_FRAME_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); @@ -2778,7 +2779,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data) if (!p_ptt) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); - ret = -1; + ret = ERESTART; break; } @@ -2789,7 +2790,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data) ecore_ptt_release(p_hwfn, p_ptt); if (ret) { - ret = -1; + ret = ENODEV; break; } diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c index c4282c723a44..8363de99a60a 100644 --- a/sys/dev/random/fortuna.c +++ b/sys/dev/random/fortuna.c @@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event) u_int pl; RANDOM_RESEED_LOCK(); + /* + * Run SP 800-90B health tests on the source if so configured. + */ + if (!random_harvest_healthtest(event)) { + RANDOM_RESEED_UNLOCK(); + return; + } /*- * FS&K - P_i = P_i|<harvested stuff> * Accumulate the event into the appropriate pool diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c index 379b64ac15f1..c7762967c4fb 100644 --- a/sys/dev/random/random_harvestq.c +++ b/sys/dev/random/random_harvestq.c @@ -88,6 +88,8 @@ static void random_sources_feed(void); static __read_mostly bool epoch_inited; static __read_mostly epoch_t rs_epoch; +static const char *random_source_descr[ENTROPYSOURCE]; + /* * How many events to queue up. 
We create this many items in * an 'empty' queue, then transfer them to the 'harvest' queue with @@ -131,36 +133,25 @@ static struct harvest_context { /* The context of the kernel thread processing harvested entropy */ struct proc *hc_kthread_proc; /* - * Lockless ring buffer holding entropy events - * If ring.in == ring.out, - * the buffer is empty. - * If ring.in != ring.out, - * the buffer contains harvested entropy. - * If (ring.in + 1) == ring.out (mod RANDOM_RING_MAX), - * the buffer is full. - * - * NOTE: ring.in points to the last added element, - * and ring.out points to the last consumed element. - * - * The ring.in variable needs locking as there are multiple - * sources to the ring. Only the sources may change ring.in, - * but the consumer may examine it. - * - * The ring.out variable does not need locking as there is - * only one consumer. Only the consumer may change ring.out, - * but the sources may examine it. + * A pair of buffers for queued events. New events are added to the + * active queue while the kthread processes the other one in parallel. */ - struct entropy_ring { + struct entropy_buffer { struct harvest_event ring[RANDOM_RING_MAX]; - volatile u_int in; - volatile u_int out; - } hc_entropy_ring; + u_int pos; + } hc_entropy_buf[2]; + u_int hc_active_buf; struct fast_entropy_accumulator { volatile u_int pos; uint32_t buf[RANDOM_ACCUM_MAX]; } hc_entropy_fast_accumulator; } harvest_context; +#define RANDOM_HARVEST_INIT_LOCK() mtx_init(&harvest_context.hc_mtx, \ + "entropy harvest mutex", NULL, MTX_SPIN) +#define RANDOM_HARVEST_LOCK() mtx_lock_spin(&harvest_context.hc_mtx) +#define RANDOM_HARVEST_UNLOCK() mtx_unlock_spin(&harvest_context.hc_mtx) + static struct kproc_desc random_proc_kp = { "rand_harvestq", random_kthread, @@ -178,43 +169,48 @@ random_harvestq_fast_process_event(struct harvest_event *event) static void random_kthread(void) { - u_int maxloop, ring_out, i; + struct harvest_context *hc; - /* - * Locking is not needed as this is the only place we modify ring.out, and - * we only examine ring.in without changing it. Both of these are volatile, - * and this is a unique thread. - */ + hc = &harvest_context; for (random_kthread_control = 1; random_kthread_control;) { - /* Deal with events, if any. Restrict the number we do in one go. */ - maxloop = RANDOM_RING_MAX; - while (harvest_context.hc_entropy_ring.out != harvest_context.hc_entropy_ring.in) { - ring_out = (harvest_context.hc_entropy_ring.out + 1)%RANDOM_RING_MAX; - random_harvestq_fast_process_event(harvest_context.hc_entropy_ring.ring + ring_out); - harvest_context.hc_entropy_ring.out = ring_out; - if (!--maxloop) - break; - } + struct entropy_buffer *buf; + u_int entries; + + /* Deal with queued events. */ + RANDOM_HARVEST_LOCK(); + buf = &hc->hc_entropy_buf[hc->hc_active_buf]; + entries = buf->pos; + buf->pos = 0; + hc->hc_active_buf = (hc->hc_active_buf + 1) % + nitems(hc->hc_entropy_buf); + RANDOM_HARVEST_UNLOCK(); + for (u_int i = 0; i < entries; i++) + random_harvestq_fast_process_event(&buf->ring[i]); + + /* Poll sources of noise. */ random_sources_feed(); + /* XXX: FIX!! Increase the high-performance data rate? Need some measurements first. 
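
The lockless ring gives way to two batch buffers: producers append to the active buffer under a spin mutex, and the kthread flips the active index so it can drain a whole batch with the lock dropped. A runnable, single-threaded model of the flip (the real code performs the swap under RANDOM_HARVEST_LOCK()):

    #include <stdio.h>

    #define RING_MAX 8

    static struct buffer {
    	int ring[RING_MAX];
    	unsigned pos;
    } bufs[2];
    static unsigned active;

    /* Producer: runs with the spin lock held in the real code. */
    static void
    enqueue(int v)
    {
    	struct buffer *b = &bufs[active];

    	if (b->pos < RING_MAX)	/* otherwise the event is dropped */
    		b->ring[b->pos++] = v;
    }

    /* Consumer: swap buffers under the lock, then drain without it. */
    static void
    drain(void)
    {
    	struct buffer *b = &bufs[active];
    	unsigned n = b->pos;

    	b->pos = 0;
    	active = (active + 1) % 2;	/* producers now fill the other buffer */
    	for (unsigned i = 0; i < n; i++)
    		printf("process %d\n", b->ring[i]);
    }

    int
    main(void)
    {
    	for (int i = 0; i < 5; i++)
    		enqueue(i);
    	drain();
    	return (0);
    }
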
*/ - for (i = 0; i < RANDOM_ACCUM_MAX; i++) { - if (harvest_context.hc_entropy_fast_accumulator.buf[i]) { - random_harvest_direct(harvest_context.hc_entropy_fast_accumulator.buf + i, sizeof(harvest_context.hc_entropy_fast_accumulator.buf[0]), RANDOM_UMA); - harvest_context.hc_entropy_fast_accumulator.buf[i] = 0; + for (u_int i = 0; i < RANDOM_ACCUM_MAX; i++) { + if (hc->hc_entropy_fast_accumulator.buf[i]) { + random_harvest_direct(&hc->hc_entropy_fast_accumulator.buf[i], + sizeof(hc->hc_entropy_fast_accumulator.buf[0]), RANDOM_UMA); + hc->hc_entropy_fast_accumulator.buf[i] = 0; } } /* XXX: FIX!! This is a *great* place to pass hardware/live entropy to random(9) */ - tsleep_sbt(&harvest_context.hc_kthread_proc, 0, "-", + tsleep_sbt(&hc->hc_kthread_proc, 0, "-", SBT_1S/RANDOM_KTHREAD_HZ, 0, C_PREL(1)); } random_kthread_control = -1; - wakeup(&harvest_context.hc_kthread_proc); + wakeup(&hc->hc_kthread_proc); kproc_exit(0); /* NOTREACHED */ } -/* This happens well after SI_SUB_RANDOM */ SYSINIT(random_device_h_proc, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, kproc_start, &random_proc_kp); +_Static_assert(SI_SUB_KICK_SCHEDULER > SI_SUB_RANDOM, + "random kthread starting before subsystem initialization"); static void rs_epoch_init(void *dummy __unused) @@ -305,7 +301,230 @@ random_sources_feed(void) explicit_bzero(entropy, sizeof(entropy)); } -/* ARGSUSED */ +/* + * State used for conducting NIST SP 800-90B health tests on entropy sources. + */ +static struct health_test_softc { + uint32_t ht_rct_value[HARVESTSIZE + 1]; + u_int ht_rct_count; /* number of samples with the same value */ + u_int ht_rct_limit; /* constant after init */ + + uint32_t ht_apt_value[HARVESTSIZE + 1]; + u_int ht_apt_count; /* number of samples with the same value */ + u_int ht_apt_seq; /* sequence number of the last sample */ + u_int ht_apt_cutoff; /* constant after init */ + + uint64_t ht_total_samples; + bool ondemand; /* Set to true to restart the state machine */ + enum { + INIT = 0, /* initial state */ + DISABLED, /* health checking is disabled */ + STARTUP, /* doing startup tests, samples are discarded */ + STEADY, /* steady-state operation */ + FAILED, /* health check failed, discard samples */ + } ht_state; +} healthtest[ENTROPYSOURCE]; + +#define RANDOM_SELFTEST_STARTUP_SAMPLES 1024 /* 4.3, requirement 4 */ +#define RANDOM_SELFTEST_APT_WINDOW 512 /* 4.4.2 */ + +static void +copy_event(uint32_t dst[static HARVESTSIZE + 1], + const struct harvest_event *event) +{ + memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1)); + memcpy(dst, event->he_entropy, event->he_size); + dst[HARVESTSIZE] = event->he_somecounter; +} + +static void +random_healthtest_rct_init(struct health_test_softc *ht, + const struct harvest_event *event) +{ + ht->ht_rct_count = 1; + copy_event(ht->ht_rct_value, event); +} + +/* + * Apply the repetition count test to a sample. + * + * Return false if the test failed, i.e., we observed >= C consecutive samples + * with the same value, and true otherwise.
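
The repetition count test is nothing more than a run-length counter with a cutoff. A self-contained userspace version over plain 32-bit samples; the cutoff of 35 matches the ht_rct_limit chosen later for H = 1 bit of min-entropy and a 2^-34 false-positive rate:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RCT_LIMIT 35	/* C = 1 + ceil(34 / H) with H = 1 */

    static uint32_t rct_value;
    static unsigned rct_count;

    /* Returns false once RCT_LIMIT identical samples arrive in a row. */
    static bool
    rct_next(uint32_t sample)
    {
    	if (rct_count == 0 || sample != rct_value) {
    		rct_value = sample;
    		rct_count = 1;
    		return (true);
    	}
    	return (++rct_count < RCT_LIMIT);
    }

    int
    main(void)
    {
    	for (int i = 0; i < 40; i++) {
    		if (!rct_next(0xdeadbeef)) {	/* stuck source */
    			printf("RCT failed at sample %d\n", i + 1);
    			return (1);
    		}
    	}
    	return (0);
    }
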
+ */ +static bool +random_healthtest_rct_next(struct health_test_softc *ht, + const struct harvest_event *event) +{ + uint32_t val[HARVESTSIZE + 1]; + + copy_event(val, event); + if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) { + ht->ht_rct_count = 1; + memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value)); + return (true); + } else { + ht->ht_rct_count++; + return (ht->ht_rct_count < ht->ht_rct_limit); + } +} + +static void +random_healthtest_apt_init(struct health_test_softc *ht, + const struct harvest_event *event) +{ + ht->ht_apt_count = 1; + ht->ht_apt_seq = 1; + copy_event(ht->ht_apt_value, event); +} + +static bool +random_healthtest_apt_next(struct health_test_softc *ht, + const struct harvest_event *event) +{ + uint32_t val[HARVESTSIZE + 1]; + + if (ht->ht_apt_seq == 0) { + random_healthtest_apt_init(ht, event); + return (true); + } + + copy_event(val, event); + if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) { + ht->ht_apt_count++; + if (ht->ht_apt_count >= ht->ht_apt_cutoff) + return (false); + } + + ht->ht_apt_seq++; + if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW) + ht->ht_apt_seq = 0; + + return (true); +} + +/* + * Run the health tests for the given event. This is assumed to be called from + * a serialized context. + */ +bool +random_harvest_healthtest(const struct harvest_event *event) +{ + struct health_test_softc *ht; + + ht = &healthtest[event->he_source]; + + /* + * Was on-demand testing requested? Restart the state machine if so, + * restarting the startup tests. + */ + if (atomic_load_bool(&ht->ondemand)) { + atomic_store_bool(&ht->ondemand, false); + ht->ht_state = INIT; + } + + switch (ht->ht_state) { + case __predict_false(INIT): + /* Store the first sample and initialize test state. */ + random_healthtest_rct_init(ht, event); + random_healthtest_apt_init(ht, event); + ht->ht_total_samples = 0; + ht->ht_state = STARTUP; + return (false); + case DISABLED: + /* No health testing for this source. */ + return (true); + case STEADY: + case STARTUP: + ht->ht_total_samples++; + if (random_healthtest_rct_next(ht, event) && + random_healthtest_apt_next(ht, event)) { + if (ht->ht_state == STARTUP && + ht->ht_total_samples >= + RANDOM_SELFTEST_STARTUP_SAMPLES) { + printf( + "random: health test passed for source %s\n", + random_source_descr[event->he_source]); + ht->ht_state = STEADY; + } + return (ht->ht_state == STEADY); + } + ht->ht_state = FAILED; + printf( + "random: health test failed for source %s, discarding samples\n", + random_source_descr[event->he_source]); + /* FALLTHROUGH */ + case FAILED: + return (false); + } +} + +static bool nist_healthtest_enabled = false; +SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled, + CTLFLAG_RDTUN, &nist_healthtest_enabled, 0, + "Enable NIST SP 800-90B health tests for noise sources"); + +static void +random_healthtest_init(enum random_entropy_source source) +{ + struct health_test_softc *ht; + + ht = &healthtest[source]; + KASSERT(ht->ht_state == INIT, + ("%s: health test state is %d for source %d", + __func__, ht->ht_state, source)); + + /* + * If health-testing is enabled, validate all sources except CACHED and + * VMGENID: they are deterministic sources used only a small, fixed + * number of times, so statistical testing is not applicable. 
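
The adaptive proportion test counts, within a fixed window, how many samples equal the window's first sample. A standalone model using the same 512-sample window and the 330 cutoff set below; the alternating demo input is made up and intentionally stays under the cutoff:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define APT_WINDOW 512	/* SP 800-90B 4.4.2 window size */
    #define APT_CUTOFF 330	/* cutoff for H = 1, alpha = 2^-34 */

    static uint32_t apt_value;
    static unsigned apt_count, apt_seq;

    /* Count repeats of the window's first sample; fail at APT_CUTOFF. */
    static bool
    apt_next(uint32_t sample)
    {
    	if (apt_seq == 0) {	/* first sample opens a new window */
    		apt_value = sample;
    		apt_count = 1;
    		apt_seq = 1;
    		return (true);
    	}
    	if (sample == apt_value && ++apt_count >= APT_CUTOFF)
    		return (false);
    	if (++apt_seq == APT_WINDOW)
    		apt_seq = 0;
    	return (true);
    }

    int
    main(void)
    {
    	/* Alternating samples: about 256 repeats per window, under 330. */
    	for (unsigned i = 0; i < 4 * APT_WINDOW; i++) {
    		if (!apt_next(i & 1)) {
    			printf("APT failed at sample %u\n", i + 1);
    			return (1);
    		}
    	}
    	printf("APT passed\n");
    	return (0);
    }
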
*/ + if (!nist_healthtest_enabled || + source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) { + ht->ht_state = DISABLED; + return; + } + + /* + * Set cutoff values for the two tests, assuming that each sample has + * min-entropy of 1 bit and allowing for an error rate of 1 in 2^34. + * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see a false + * positive once in ~54.5 years. + * + * The RCT limit comes from the formula in section 4.4.1. + * + * The APT cutoff is calculated using the formula in section 4.4.2 + * footnote 10 with the window size changed from 512 to 511, since the + * test as written counts the number of samples equal to the first + * sample in the window, and thus tests W-1 samples. + */ + ht->ht_rct_limit = 35; + ht->ht_apt_cutoff = 330; +} + +static int +random_healthtest_ondemand(SYSCTL_HANDLER_ARGS) +{ + u_int mask, source; + int error; + + mask = 0; + error = sysctl_handle_int(oidp, &mask, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + + while (mask != 0) { + source = ffs(mask) - 1; + if (source < nitems(healthtest)) + atomic_store_bool(&healthtest[source].ondemand, true); + mask &= ~(1u << source); + } + return (0); +} +SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, + random_healthtest_ondemand, "I", + "Re-run NIST SP 800-90B startup health tests for a noise source"); + static int random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS) { @@ -336,7 +555,6 @@ SYSCTL_PROC(_kern_random_harvest, OID_AUTO, mask, random_check_uint_harvestmask, "IU", "Entropy harvesting mask"); -/* ARGSUSED */ static int random_print_harvestmask(SYSCTL_HANDLER_ARGS) { @@ -370,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = { [RANDOM_SWI] = "SWI", [RANDOM_FS_ATIME] = "FS_ATIME", [RANDOM_UMA] = "UMA", - [RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */ + [RANDOM_CALLOUT] = "CALLOUT", + [RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */ [RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */ [RANDOM_PURE_SAFE] = "PURE_SAFE", [RANDOM_PURE_GLXSB] = "PURE_GLXSB", @@ -390,7 +609,6 @@ static const char *random_source_descr[ENTROPYSOURCE] = { /* "ENTROPYSOURCE" */ }; -/* ARGSUSED */ static int random_print_harvestmask_symbolic(SYSCTL_HANDLER_ARGS) { @@ -423,7 +641,6 @@ SYSCTL_PROC(_kern_random_harvest, OID_AUTO, mask_symbolic, random_print_harvestmask_symbolic, "A", "Entropy harvesting mask (symbolic)"); -/* ARGSUSED */ static void random_harvestq_init(void *unused __unused) { @@ -433,7 +650,10 @@ random_harvestq_init(void *unused __unused) hc_source_mask = almost_everything_mask; RANDOM_HARVEST_INIT_LOCK(); - harvest_context.hc_entropy_ring.in = harvest_context.hc_entropy_ring.out = 0; + harvest_context.hc_active_buf = 0; + + for (int i = 0; i < ENTROPYSOURCE; i++) + random_healthtest_init(i); } SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL); @@ -453,7 +673,7 @@ random_early_prime(char *entropy, size_t len) return (0); for (i = 0; i < len; i += sizeof(event.he_entropy)) { - event.he_somecounter = (uint32_t)get_cyclecount(); + event.he_somecounter = random_get_cyclecount(); event.he_size = sizeof(event.he_entropy); event.he_source = RANDOM_CACHED; event.he_destination = @@ -493,7 +713,6 @@ random_prime_loader_file(const char *type) * known to the kernel, and inserting it directly into the hashing * module, currently Fortuna.
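
The RCT limit follows from C = 1 + ceil(-log2(alpha)/H) in SP 800-90B section 4.4.1: with alpha = 2^-34 and H = 1, C = 1 + 34 = 35, matching ht_rct_limit above. The on-demand sysctl takes a bit mask of sources; a small illustrative userspace program that re-triggers the startup tests, assuming only the kern.random.nist_healthtest_ondemand name declared above:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
    	/* Re-test sources 0 and 3; bits map to enum random_entropy_source. */
    	u_int mask = (1u << 0) | (1u << 3);

    	if (sysctlbyname("kern.random.nist_healthtest_ondemand", NULL, NULL,
    	    &mask, sizeof(mask)) != 0) {
    		perror("sysctlbyname");
    		return (1);
    	}
    	return (0);
    }
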
*/ -/* ARGSUSED */ static void random_harvestq_prime(void *unused __unused) { @@ -522,7 +741,6 @@ random_harvestq_prime(void *unused __unused) } SYSINIT(random_device_prime, SI_SUB_RANDOM, SI_ORDER_MIDDLE, random_harvestq_prime, NULL); -/* ARGSUSED */ static void random_harvestq_deinit(void *unused __unused) { @@ -540,9 +758,9 @@ SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_d * This is supposed to be fast; do not do anything slow in here! * It is also illegal (and morally reprehensible) to insert any * high-rate data here. "High-rate" is defined as a data source - * that will usually cause lots of failures of the "Lockless read" - * check a few lines below. This includes the "always-on" sources - * like the Intel "rdrand" or the VIA Nehamiah "xstore" sources. + * that is likely to fill up the buffer in much less than 100ms. + * This includes the "always-on" sources like the Intel "rdrand" + * or the VIA Nehamiah "xstore" sources. */ /* XXXRW: get_cyclecount() is cheap on most modern hardware, where cycle * counters are built in, but on older hardware it will do a real time clock @@ -551,28 +769,29 @@ SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_d void random_harvest_queue_(const void *entropy, u_int size, enum random_entropy_source origin) { + struct harvest_context *hc; + struct entropy_buffer *buf; struct harvest_event *event; - u_int ring_in; - KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE, ("%s: origin %d invalid\n", __func__, origin)); + KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE, + ("%s: origin %d invalid", __func__, origin)); + + hc = &harvest_context; RANDOM_HARVEST_LOCK(); - ring_in = (harvest_context.hc_entropy_ring.in + 1)%RANDOM_RING_MAX; - if (ring_in != harvest_context.hc_entropy_ring.out) { - /* The ring is not full */ - event = harvest_context.hc_entropy_ring.ring + ring_in; - event->he_somecounter = (uint32_t)get_cyclecount(); + buf = &hc->hc_entropy_buf[hc->hc_active_buf]; + if (buf->pos < RANDOM_RING_MAX) { + event = &buf->ring[buf->pos++]; + event->he_somecounter = random_get_cyclecount(); event->he_source = origin; - event->he_destination = harvest_context.hc_destination[origin]++; + event->he_destination = hc->hc_destination[origin]++; if (size <= sizeof(event->he_entropy)) { event->he_size = size; memcpy(event->he_entropy, entropy, size); - } - else { + } else { /* Big event, so squash it */ event->he_size = sizeof(event->he_entropy[0]); event->he_entropy[0] = jenkins_hash(entropy, size, (uint32_t)(uintptr_t)event); } - harvest_context.hc_entropy_ring.in = ring_in; } RANDOM_HARVEST_UNLOCK(); } @@ -589,7 +808,8 @@ random_harvest_fast_(const void *entropy, u_int size) u_int pos; pos = harvest_context.hc_entropy_fast_accumulator.pos; - harvest_context.hc_entropy_fast_accumulator.buf[pos] ^= jenkins_hash(entropy, size, (uint32_t)get_cyclecount()); + harvest_context.hc_entropy_fast_accumulator.buf[pos] ^= + jenkins_hash(entropy, size, random_get_cyclecount()); harvest_context.hc_entropy_fast_accumulator.pos = (pos + 1)%RANDOM_ACCUM_MAX; } @@ -606,7 +826,7 @@ random_harvest_direct_(const void *entropy, u_int size, enum random_entropy_sour KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE, ("%s: origin %d invalid\n", __func__, origin)); size = MIN(size, sizeof(event.he_entropy)); - event.he_somecounter = (uint32_t)get_cyclecount(); + event.he_somecounter = random_get_cyclecount(); event.he_size = size; event.he_source = origin; event.he_destination = 
harvest_context.hc_destination[origin]++; diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h index 69a9dfabd44a..1d462500df85 100644 --- a/sys/dev/random/random_harvestq.h +++ b/sys/dev/random/random_harvestq.h @@ -27,6 +27,9 @@ #ifndef SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED #define SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED +#include <sys/types.h> +#include <machine/cpu.h> + #define HARVESTSIZE 2 /* Max length in words of each harvested entropy unit */ /* These are used to queue harvested packets of entropy. The entropy @@ -40,8 +43,12 @@ struct harvest_event { uint8_t he_source; /* origin of the entropy */ }; -#define RANDOM_HARVEST_INIT_LOCK(x) mtx_init(&harvest_context.hc_mtx, "entropy harvest mutex", NULL, MTX_SPIN) -#define RANDOM_HARVEST_LOCK(x) mtx_lock_spin(&harvest_context.hc_mtx) -#define RANDOM_HARVEST_UNLOCK(x) mtx_unlock_spin(&harvest_context.hc_mtx) +static inline uint32_t +random_get_cyclecount(void) +{ + return ((uint32_t)get_cyclecount()); +} + +bool random_harvest_healthtest(const struct harvest_event *event); #endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */ diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c index 6d637ab5a53e..ced4dd8067d9 100644 --- a/sys/dev/random/randomdev.c +++ b/sys/dev/random/randomdev.c @@ -303,16 +303,16 @@ randomdev_accumulate(uint8_t *buf, u_int count) /* Extra timing here is helpful to scrape scheduler jitter entropy */ randomdev_hash_init(&hash); - timestamp = (uint32_t)get_cyclecount(); + timestamp = random_get_cyclecount(); randomdev_hash_iterate(&hash, ×tamp, sizeof(timestamp)); randomdev_hash_iterate(&hash, buf, count); - timestamp = (uint32_t)get_cyclecount(); + timestamp = random_get_cyclecount(); randomdev_hash_iterate(&hash, ×tamp, sizeof(timestamp)); randomdev_hash_finish(&hash, entropy_data); for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) { - event.he_somecounter = (uint32_t)get_cyclecount(); + event.he_somecounter = random_get_cyclecount(); event.he_size = sizeof(event.he_entropy); - event.he_source = RANDOM_CACHED; + event.he_source = RANDOM_RANDOMDEV; event.he_destination = destination++; /* Harmless cheating */ memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy)); p_random_alg_context->ra_event_processor(&event); diff --git a/sys/dev/regulator/regulator_fixed.c b/sys/dev/regulator/regulator_fixed.c index 0a76da7140a0..55cdb5e4aeae 100644 --- a/sys/dev/regulator/regulator_fixed.c +++ b/sys/dev/regulator/regulator_fixed.c @@ -100,12 +100,8 @@ static struct gpio_entry * regnode_get_gpio_entry(struct gpiobus_pin *gpio_pin) { struct gpio_entry *entry, *tmp; - device_t busdev; int rv; - busdev = GPIO_GET_BUS(gpio_pin->dev); - if (busdev == NULL) - return (NULL); entry = malloc(sizeof(struct gpio_entry), M_FIXEDREGULATOR, M_WAITOK | M_ZERO); @@ -122,8 +118,8 @@ regnode_get_gpio_entry(struct gpiobus_pin *gpio_pin) } /* Reserve pin. */ - /* XXX Can we call gpiobus_acquire_pin() with gpio_list_mtx held? */ - rv = gpiobus_acquire_pin(busdev, gpio_pin->pin); + /* XXX Can we call gpio_pin_acquire() with gpio_list_mtx held? */ + rv = gpio_pin_acquire(gpio_pin); if (rv != 0) { mtx_unlock(&gpio_list_mtx); free(entry, M_FIXEDREGULATOR); diff --git a/sys/dev/sound/midi/midi.c b/sys/dev/sound/midi/midi.c index fbfb69de2913..6753f864ba9c 100644 --- a/sys/dev/sound/midi/midi.c +++ b/sys/dev/sound/midi/midi.c @@ -30,12 +30,6 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ - /* - * Parts of this file started out as NetBSD: midi.c 1.31 - * They are mostly gone. Still the most obvious will be the state - * machine midi_in - */ - #include <sys/param.h> #include <sys/systm.h> #include <sys/queue.h> @@ -66,7 +60,6 @@ #include "mpu_if.h" #include <dev/sound/midi/midiq.h> -#include "synth_if.h" MALLOC_DEFINE(M_MIDI, "midi buffers", "Midi data allocation area"); #ifndef KOBJMETHOD_END @@ -79,17 +72,6 @@ enum midi_states { MIDI_IN_START, MIDI_IN_SYSEX, MIDI_IN_DATA }; -/* - * The MPU interface current has init() uninit() inqsize() outqsize() - * callback() : fiddle with the tx|rx status. - */ - -#include "mpu_if.h" - -/* - * /dev/rmidi Structure definitions - */ - #define MIDI_NAMELEN 16 struct snd_midi { KOBJ_FIELDS; @@ -115,95 +97,13 @@ struct snd_midi { * complete command packets. */ struct proc *async; struct cdev *dev; - struct synth_midi *synth; - int synth_flags; TAILQ_ENTRY(snd_midi) link; }; -struct synth_midi { - KOBJ_FIELDS; - struct snd_midi *m; -}; - -static synth_open_t midisynth_open; -static synth_close_t midisynth_close; -static synth_writeraw_t midisynth_writeraw; -static synth_killnote_t midisynth_killnote; -static synth_startnote_t midisynth_startnote; -static synth_setinstr_t midisynth_setinstr; -static synth_alloc_t midisynth_alloc; -static synth_controller_t midisynth_controller; -static synth_bender_t midisynth_bender; - -static kobj_method_t midisynth_methods[] = { - KOBJMETHOD(synth_open, midisynth_open), - KOBJMETHOD(synth_close, midisynth_close), - KOBJMETHOD(synth_writeraw, midisynth_writeraw), - KOBJMETHOD(synth_setinstr, midisynth_setinstr), - KOBJMETHOD(synth_startnote, midisynth_startnote), - KOBJMETHOD(synth_killnote, midisynth_killnote), - KOBJMETHOD(synth_alloc, midisynth_alloc), - KOBJMETHOD(synth_controller, midisynth_controller), - KOBJMETHOD(synth_bender, midisynth_bender), - KOBJMETHOD_END -}; - -DEFINE_CLASS(midisynth, midisynth_methods, 0); - -/* - * Module Exports & Interface - * - * struct midi_chan *midi_init(MPU_CLASS cls, int unit, int chan, - * void *cookie) - * int midi_uninit(struct snd_midi *) - * - * 0 == no error - * EBUSY or other error - * - * int midi_in(struct snd_midi *, char *buf, int count) - * int midi_out(struct snd_midi *, char *buf, int count) - * - * midi_{in,out} return actual size transfered - * - */ - -/* - * midi_devs tailq, holder of all rmidi instances protected by midistat_lock - */ - TAILQ_HEAD(, snd_midi) midi_devs; -/* - * /dev/midistat variables and declarations, protected by midistat_lock - */ - struct sx mstat_lock; -static int midistat_isopen = 0; -static struct sbuf midistat_sbuf; -static struct cdev *midistat_dev; - -/* - * /dev/midistat dev_t declarations - */ - -static d_open_t midistat_open; -static d_close_t midistat_close; -static d_read_t midistat_read; - -static struct cdevsw midistat_cdevsw = { - .d_version = D_VERSION, - .d_open = midistat_open, - .d_close = midistat_close, - .d_read = midistat_read, - .d_name = "midistat", -}; - -/* - * /dev/rmidi dev_t declarations, struct variable access is protected by - * locks contained within the structure. - */ - static d_open_t midi_open; static d_close_t midi_close; static d_ioctl_t midi_ioctl; @@ -222,41 +122,18 @@ static struct cdevsw midi_cdevsw = { .d_name = "rmidi", }; -/* - * Prototypes of library functions - */ - static int midi_destroy(struct snd_midi *, int); -static int midistat_prepare(struct sbuf * s); static int midi_load(void); static int midi_unload(void); -/* - * Misc declr. 
- */ SYSCTL_NODE(_hw, OID_AUTO, midi, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Midi driver"); -static SYSCTL_NODE(_hw_midi, OID_AUTO, stat, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "Status device"); int midi_debug; /* XXX: should this be moved into debug.midi? */ SYSCTL_INT(_hw_midi, OID_AUTO, debug, CTLFLAG_RW, &midi_debug, 0, ""); -int midi_dumpraw; -SYSCTL_INT(_hw_midi, OID_AUTO, dumpraw, CTLFLAG_RW, &midi_dumpraw, 0, ""); - -int midi_instroff; -SYSCTL_INT(_hw_midi, OID_AUTO, instroff, CTLFLAG_RW, &midi_instroff, 0, ""); - -int midistat_verbose; -SYSCTL_INT(_hw_midi_stat, OID_AUTO, verbose, CTLFLAG_RW, - &midistat_verbose, 0, ""); - #define MIDI_DEBUG(l,a) if(midi_debug>=l) a -/* - * CODE START - */ void midistat_lock(void) @@ -285,9 +162,6 @@ midistat_lockassert(void) * what unit number is used. * * It is an error to call midi_init with an already used unit/channel combo. - * - * Returns NULL on error - * */ struct snd_midi * midi_init(kobj_class_t cls, int unit, int channel, void *cookie) @@ -326,9 +200,6 @@ midi_init(kobj_class_t cls, int unit, int channel, void *cookie) MIDI_DEBUG(1, printf("midiinit #2: unit %d/%d.\n", unit, channel)); m = malloc(sizeof(*m), M_MIDI, M_WAITOK | M_ZERO); - m->synth = malloc(sizeof(*m->synth), M_MIDI, M_WAITOK | M_ZERO); - kobj_init((kobj_t)m->synth, &midisynth_class); - m->synth->m = m; kobj_init((kobj_t)m, cls); inqsize = MPU_INQSIZE(m, cookie); outqsize = MPU_OUTQSIZE(m, cookie); @@ -393,7 +264,6 @@ err2: if (MIDIQ_BUF(m->outq)) free(MIDIQ_BUF(m->outq), M_MIDI); err1: - free(m->synth, M_MIDI); free(m, M_MIDI); err0: midistat_unlock(); @@ -405,9 +275,7 @@ err0: * midi_uninit does not call MIDI_UNINIT, as since this is the implementors * entry point. midi_uninit if fact, does not send any methods. A call to * midi_uninit is a defacto promise that you won't manipulate ch anymore - * */ - int midi_uninit(struct snd_midi *m) { @@ -440,13 +308,6 @@ exit: return err; } -/* - * midi_in: process all data until the queue is full, then discards the rest. - * Since midi_in is a state machine, data discards can cause it to get out of - * whack. Process as much as possible. It calls, wakeup, selnotify and - * psignal at most once. - */ - #ifdef notdef static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0}; @@ -460,6 +321,12 @@ static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0}; #define MIDI_SYSEX_START 0xF0 #define MIDI_SYSEX_END 0xF7 +/* + * midi_in: process all data until the queue is full, then discards the rest. + * Since midi_in is a state machine, data discards can cause it to get out of + * whack. Process as much as possible. It calls, wakeup, selnotify and + * psignal at most once. + */ int midi_in(struct snd_midi *m, uint8_t *buf, int size) { @@ -627,9 +494,6 @@ midi_out(struct snd_midi *m, uint8_t *buf, int size) return used; } -/* - * /dev/rmidi#.# device access functions - */ int midi_open(struct cdev *i_dev, int flags, int mode, struct thread *td) { @@ -934,434 +798,6 @@ midi_poll(struct cdev *i_dev, int events, struct thread *td) } /* - * /dev/midistat device functions - * - */ -static int -midistat_open(struct cdev *i_dev, int flags, int mode, struct thread *td) -{ - int error; - - MIDI_DEBUG(1, printf("midistat_open\n")); - - midistat_lock(); - if (midistat_isopen) { - midistat_unlock(); - return EBUSY; - } - midistat_isopen = 1; - sbuf_new(&midistat_sbuf, NULL, 4096, SBUF_AUTOEXTEND); - error = (midistat_prepare(&midistat_sbuf) > 0) ? 
0 : ENOMEM; - if (error) - midistat_isopen = 0; - midistat_unlock(); - return error; -} - -static int -midistat_close(struct cdev *i_dev, int flags, int mode, struct thread *td) -{ - MIDI_DEBUG(1, printf("midistat_close\n")); - midistat_lock(); - if (!midistat_isopen) { - midistat_unlock(); - return EBADF; - } - sbuf_delete(&midistat_sbuf); - midistat_isopen = 0; - midistat_unlock(); - return 0; -} - -static int -midistat_read(struct cdev *i_dev, struct uio *uio, int flag) -{ - long l; - int err; - - MIDI_DEBUG(4, printf("midistat_read\n")); - midistat_lock(); - if (!midistat_isopen) { - midistat_unlock(); - return EBADF; - } - if (uio->uio_offset < 0 || uio->uio_offset > sbuf_len(&midistat_sbuf)) { - midistat_unlock(); - return EINVAL; - } - err = 0; - l = lmin(uio->uio_resid, sbuf_len(&midistat_sbuf) - uio->uio_offset); - if (l > 0) { - err = uiomove(sbuf_data(&midistat_sbuf) + uio->uio_offset, l, - uio); - } - midistat_unlock(); - return err; -} - -/* - * Module library functions - */ - -static int -midistat_prepare(struct sbuf *s) -{ - struct snd_midi *m; - - midistat_lockassert(); - - sbuf_printf(s, "FreeBSD Midi Driver (midi2)\n"); - if (TAILQ_EMPTY(&midi_devs)) { - sbuf_printf(s, "No devices installed.\n"); - sbuf_finish(s); - return sbuf_len(s); - } - sbuf_printf(s, "Installed devices:\n"); - - TAILQ_FOREACH(m, &midi_devs, link) { - mtx_lock(&m->lock); - sbuf_printf(s, "%s [%d/%d:%s]", m->name, m->unit, m->channel, - MPU_PROVIDER(m, m->cookie)); - sbuf_printf(s, "%s", MPU_DESCR(m, m->cookie, midistat_verbose)); - sbuf_printf(s, "\n"); - mtx_unlock(&m->lock); - } - - sbuf_finish(s); - return sbuf_len(s); -} - -#ifdef notdef -/* - * Convert IOCTL command to string for debugging - */ - -static char * -midi_cmdname(int cmd) -{ - static struct { - int cmd; - char *name; - } *tab, cmdtab_midiioctl[] = { -#define A(x) {x, ## x} - /* - * Once we have some real IOCTLs define, the following will - * be relavant. - * - * A(SNDCTL_MIDI_PRETIME), A(SNDCTL_MIDI_MPUMODE), - * A(SNDCTL_MIDI_MPUCMD), A(SNDCTL_SYNTH_INFO), - * A(SNDCTL_MIDI_INFO), A(SNDCTL_SYNTH_MEMAVL), - * A(SNDCTL_FM_LOAD_INSTR), A(SNDCTL_FM_4OP_ENABLE), - * A(MIOSPASSTHRU), A(MIOGPASSTHRU), A(AIONWRITE), - * A(AIOGSIZE), A(AIOSSIZE), A(AIOGFMT), A(AIOSFMT), - * A(AIOGMIX), A(AIOSMIX), A(AIOSTOP), A(AIOSYNC), - * A(AIOGCAP), - */ -#undef A - { - -1, "unknown" - }, - }; - - for (tab = cmdtab_midiioctl; tab->cmd != cmd && tab->cmd != -1; tab++); - return tab->name; -} - -#endif /* notdef */ - -/* - * midisynth - */ - -int -midisynth_open(void *n, void *arg, int flags) -{ - struct snd_midi *m = ((struct synth_midi *)n)->m; - int retval; - - MIDI_DEBUG(1, printf("midisynth_open %s %s\n", - flags & FREAD ? "M_RX" : "", flags & FWRITE ? 
"M_TX" : "")); - - if (m == NULL) - return ENXIO; - - mtx_lock(&m->lock); - mtx_lock(&m->qlock); - - retval = 0; - - if (flags & FREAD) { - if (MIDIQ_SIZE(m->inq) == 0) - retval = ENXIO; - else if (m->flags & M_RX) - retval = EBUSY; - if (retval) - goto err; - } - if (flags & FWRITE) { - if (MIDIQ_SIZE(m->outq) == 0) - retval = ENXIO; - else if (m->flags & M_TX) - retval = EBUSY; - if (retval) - goto err; - } - m->busy++; - - /* - * TODO: Consider m->async = 0; - */ - - if (flags & FREAD) { - m->flags |= M_RX | M_RXEN; - /* - * Only clear the inq, the outq might still have data to drain - * from a previous session - */ - MIDIQ_CLEAR(m->inq); - m->rchan = 0; - } - - if (flags & FWRITE) { - m->flags |= M_TX; - m->wchan = 0; - } - m->synth_flags = flags & (FREAD | FWRITE); - - MPU_CALLBACK(m, m->cookie, m->flags); - -err: mtx_unlock(&m->qlock); - mtx_unlock(&m->lock); - MIDI_DEBUG(2, printf("midisynth_open: return %d.\n", retval)); - return retval; -} - -int -midisynth_close(void *n) -{ - struct snd_midi *m = ((struct synth_midi *)n)->m; - int retval; - int oldflags; - - MIDI_DEBUG(1, printf("midisynth_close %s %s\n", - m->synth_flags & FREAD ? "M_RX" : "", - m->synth_flags & FWRITE ? "M_TX" : "")); - - if (m == NULL) - return ENXIO; - - mtx_lock(&m->lock); - mtx_lock(&m->qlock); - - if ((m->synth_flags & FREAD && !(m->flags & M_RX)) || - (m->synth_flags & FWRITE && !(m->flags & M_TX))) { - retval = ENXIO; - goto err; - } - m->busy--; - - oldflags = m->flags; - - if (m->synth_flags & FREAD) - m->flags &= ~(M_RX | M_RXEN); - if (m->synth_flags & FWRITE) - m->flags &= ~M_TX; - - if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN))) - MPU_CALLBACK(m, m->cookie, m->flags); - - MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy)); - - mtx_unlock(&m->qlock); - mtx_unlock(&m->lock); - retval = 0; -err: return retval; -} - -/* - * Always blocking. 
- */ - -int -midisynth_writeraw(void *n, uint8_t *buf, size_t len) -{ - struct snd_midi *m = ((struct synth_midi *)n)->m; - int retval; - int used; - int i; - - MIDI_DEBUG(4, printf("midisynth_writeraw\n")); - - retval = 0; - - if (m == NULL) - return ENXIO; - - mtx_lock(&m->lock); - mtx_lock(&m->qlock); - - if (!(m->flags & M_TX)) - goto err1; - - if (midi_dumpraw) - printf("midi dump: "); - - while (len > 0) { - while (MIDIQ_AVAIL(m->outq) == 0) { - if (!(m->flags & M_TXEN)) { - m->flags |= M_TXEN; - MPU_CALLBACK(m, m->cookie, m->flags); - } - mtx_unlock(&m->lock); - m->wchan = 1; - MIDI_DEBUG(3, printf("midisynth_writeraw msleep\n")); - retval = msleep(&m->wchan, &m->qlock, - PCATCH | PDROP, "midi TX", 0); - /* - * We slept, maybe things have changed since last - * dying check - */ - if (retval == EINTR) - goto err0; - - if (retval) - goto err0; - mtx_lock(&m->lock); - mtx_lock(&m->qlock); - m->wchan = 0; - if (!m->busy) - goto err1; - } - - /* - * We are certain than data can be placed on the queue - */ - - used = MIN(MIDIQ_AVAIL(m->outq), len); - used = MIN(used, MIDI_WSIZE); - MIDI_DEBUG(5, - printf("midi_synth: resid %zu len %jd avail %jd\n", - len, (intmax_t)MIDIQ_LEN(m->outq), - (intmax_t)MIDIQ_AVAIL(m->outq))); - - if (midi_dumpraw) - for (i = 0; i < used; i++) - printf("%x ", buf[i]); - - MIDIQ_ENQ(m->outq, buf, used); - len -= used; - - /* - * Inform the bottom half that data can be written - */ - if (!(m->flags & M_TXEN)) { - m->flags |= M_TXEN; - MPU_CALLBACK(m, m->cookie, m->flags); - } - } - /* - * If we Made it here then transfer is good - */ - if (midi_dumpraw) - printf("\n"); - - retval = 0; -err1: mtx_unlock(&m->qlock); - mtx_unlock(&m->lock); -err0: return retval; -} - -static int -midisynth_killnote(void *n, uint8_t chn, uint8_t note, uint8_t vel) -{ - u_char c[3]; - - if (note > 127 || chn > 15) - return (EINVAL); - - if (vel > 127) - vel = 127; - - if (vel == 64) { - c[0] = 0x90 | (chn & 0x0f); /* Note on. */ - c[1] = (u_char)note; - c[2] = 0; - } else { - c[0] = 0x80 | (chn & 0x0f); /* Note off. */ - c[1] = (u_char)note; - c[2] = (u_char)vel; - } - - return midisynth_writeraw(n, c, 3); -} - -static int -midisynth_setinstr(void *n, uint8_t chn, uint16_t instr) -{ - u_char c[2]; - - if (instr > 127 || chn > 15) - return EINVAL; - - c[0] = 0xc0 | (chn & 0x0f); /* Progamme change. */ - c[1] = instr + midi_instroff; - - return midisynth_writeraw(n, c, 2); -} - -static int -midisynth_startnote(void *n, uint8_t chn, uint8_t note, uint8_t vel) -{ - u_char c[3]; - - if (note > 127 || chn > 15) - return EINVAL; - - if (vel > 127) - vel = 127; - - c[0] = 0x90 | (chn & 0x0f); /* Note on. */ - c[1] = (u_char)note; - c[2] = (u_char)vel; - - return midisynth_writeraw(n, c, 3); -} -static int -midisynth_alloc(void *n, uint8_t chan, uint8_t note) -{ - return chan; -} - -static int -midisynth_controller(void *n, uint8_t chn, uint8_t ctrlnum, uint16_t val) -{ - u_char c[3]; - - if (ctrlnum > 127 || chn > 15) - return EINVAL; - - c[0] = 0xb0 | (chn & 0x0f); /* Control Message. */ - c[1] = ctrlnum; - c[2] = val; - return midisynth_writeraw(n, c, 3); -} - -static int -midisynth_bender(void *n, uint8_t chn, uint16_t val) -{ - u_char c[3]; - - if (val > 16383 || chn > 15) - return EINVAL; - - c[0] = 0xe0 | (chn & 0x0f); /* Pitch bend. */ - c[1] = (u_char)val & 0x7f; - c[2] = (u_char)(val >> 7) & 0x7f; - - return midisynth_writeraw(n, c, 3); -} - -/* * Single point of midi destructions. 
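An aside on the midisynth_* helpers removed above: each one packed a MIDI channel message as a status byte (command in the high nibble, channel in the low nibble) followed by 7-bit data bytes. Below is a sketch of the pitch-bend case, whose 14-bit value is split LSB first; pack_pitch_bend() is a hypothetical name, but the byte layout matches the removed midisynth_bender().

/* Hypothetical sketch, not part of this commit. */
static void
pack_pitch_bend(uint8_t chn, uint16_t val, uint8_t out[3])
{
	out[0] = 0xe0 | (chn & 0x0f);	/* pitch bend on channel chn */
	out[1] = val & 0x7f;		/* low 7 bits */
	out[2] = (val >> 7) & 0x7f;	/* high 7 bits */
}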
*/ static int @@ -1381,24 +817,16 @@ midi_destroy(struct snd_midi *m, int midiuninit) free(MIDIQ_BUF(m->outq), M_MIDI); mtx_destroy(&m->qlock); mtx_destroy(&m->lock); - free(m->synth, M_MIDI); free(m, M_MIDI); return 0; } -/* - * Load and unload functions, creates the /dev/midistat device - */ - static int midi_load(void) { sx_init(&mstat_lock, "midistat lock"); TAILQ_INIT(&midi_devs); - midistat_dev = make_dev(&midistat_cdevsw, MIDI_DEV_MIDICTL, UID_ROOT, - GID_WHEEL, 0666, "midistat"); - return 0; } @@ -1411,9 +839,6 @@ midi_unload(void) MIDI_DEBUG(1, printf("midi_unload()\n")); retval = EBUSY; midistat_lock(); - if (midistat_isopen) - goto exit0; - TAILQ_FOREACH_SAFE(m, &midi_devs, link, tmp) { mtx_lock(&m->lock); if (m->busy) @@ -1421,28 +846,21 @@ midi_unload(void) else retval = midi_destroy(m, 1); if (retval) - goto exit1; + goto exit; } midistat_unlock(); - destroy_dev(midistat_dev); - /* - * Made it here then unload is complete - */ sx_destroy(&mstat_lock); return 0; -exit1: +exit: mtx_unlock(&m->lock); -exit0: midistat_unlock(); if (retval) MIDI_DEBUG(2, printf("midi_unload: failed\n")); return retval; } -extern int seq_modevent(module_t mod, int type, void *data); - static int midi_modevent(module_t mod, int type, void *data) { @@ -1453,14 +871,10 @@ midi_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: retval = midi_load(); - if (retval == 0) - retval = seq_modevent(mod, type, data); break; case MOD_UNLOAD: retval = midi_unload(); - if (retval == 0) - retval = seq_modevent(mod, type, data); break; default: @@ -1470,73 +884,5 @@ midi_modevent(module_t mod, int type, void *data) return retval; } -kobj_t -midimapper_addseq(void *arg1, int *unit, void **cookie) -{ - unit = NULL; - - return (kobj_t)arg1; -} - -int -midimapper_open_locked(void *arg1, void **cookie) -{ - int retval = 0; - struct snd_midi *m; - - midistat_lockassert(); - TAILQ_FOREACH(m, &midi_devs, link) { - retval++; - } - - return retval; -} - -int -midimapper_open(void *arg1, void **cookie) -{ - int retval; - - midistat_lock(); - retval = midimapper_open_locked(arg1, cookie); - midistat_unlock(); - - return retval; -} - -int -midimapper_close(void *arg1, void *cookie) -{ - return 0; -} - -kobj_t -midimapper_fetch_synth_locked(void *arg, void *cookie, int unit) -{ - struct snd_midi *m; - int retval = 0; - - midistat_lockassert(); - TAILQ_FOREACH(m, &midi_devs, link) { - if (unit == retval) - return (kobj_t)m->synth; - retval++; - } - - return NULL; -} - -kobj_t -midimapper_fetch_synth(void *arg, void *cookie, int unit) -{ - kobj_t synth; - - midistat_lock(); - synth = midimapper_fetch_synth_locked(arg, cookie, unit); - midistat_unlock(); - - return synth; -} - DEV_MODULE(midi, midi_modevent, NULL); MODULE_VERSION(midi, 1); diff --git a/sys/dev/sound/midi/midi.h b/sys/dev/sound/midi/midi.h index 2254fab690e9..286e84264ef3 100644 --- a/sys/dev/sound/midi/midi.h +++ b/sys/dev/sound/midi/midi.h @@ -51,11 +51,4 @@ int midi_uninit(struct snd_midi *_m); int midi_out(struct snd_midi *_m, uint8_t *_buf, int _size); int midi_in(struct snd_midi *_m, uint8_t *_buf, int _size); -kobj_t midimapper_addseq(void *arg1, int *unit, void **cookie); -int midimapper_open_locked(void *arg1, void **cookie); -int midimapper_open(void *arg1, void **cookie); -int midimapper_close(void *arg1, void *cookie); -kobj_t midimapper_fetch_synth_locked(void *arg, void *cookie, int unit); -kobj_t midimapper_fetch_synth(void *arg, void *cookie, int unit); - #endif diff --git a/sys/dev/sound/midi/mpu401.c 
b/sys/dev/sound/midi/mpu401.c index 2be285bc0040..224ebb1b01f4 100644 --- a/sys/dev/sound/midi/mpu401.c +++ b/sys/dev/sound/midi/mpu401.c @@ -88,8 +88,6 @@ static int mpu401_minqsize(struct snd_midi *, void *); static int mpu401_moutqsize(struct snd_midi *, void *); static void mpu401_mcallback(struct snd_midi *, void *, int); static void mpu401_mcallbackp(struct snd_midi *, void *, int); -static const char *mpu401_mdescr(struct snd_midi *, void *, int); -static const char *mpu401_mprovider(struct snd_midi *, void *); static kobj_method_t mpu401_methods[] = { KOBJMETHOD(mpu_init, mpu401_minit), @@ -98,8 +96,6 @@ static kobj_method_t mpu401_methods[] = { KOBJMETHOD(mpu_outqsize, mpu401_moutqsize), KOBJMETHOD(mpu_callback, mpu401_mcallback), KOBJMETHOD(mpu_callbackp, mpu401_mcallbackp), - KOBJMETHOD(mpu_descr, mpu401_mdescr), - KOBJMETHOD(mpu_provider, mpu401_mprovider), KOBJMETHOD_END }; @@ -122,24 +118,12 @@ mpu401_intr(struct mpu401 *m) int i; int s; -/* - printf("mpu401_intr\n"); -*/ #define RXRDY(m) ( (STATUS(m) & MPU_INPUTBUSY) == 0) #define TXRDY(m) ( (STATUS(m) & MPU_OUTPUTBUSY) == 0) -#if 0 -#define D(x,l) printf("mpu401_intr %d %x %s %s\n",l, x, x&MPU_INPUTBUSY?"RX":"", x&MPU_OUTPUTBUSY?"TX":"") -#else -#define D(x,l) -#endif i = 0; s = STATUS(m); - D(s, 1); while ((s & MPU_INPUTBUSY) == 0 && i < MPU_INTR_BUF) { b[i] = READ(m); -/* - printf("mpu401_intr in i %d d %d\n", i, b[i]); -*/ i++; s = STATUS(m); } @@ -148,15 +132,9 @@ mpu401_intr(struct mpu401 *m) i = 0; while (!(s & MPU_OUTPUTBUSY) && i < MPU_INTR_BUF) { if (midi_out(m->mid, b, 1)) { -/* - printf("mpu401_intr out i %d d %d\n", i, b[0]); -*/ WRITE(m, *b); } else { -/* - printf("mpu401_intr write: no output\n"); -*/ return 0; } i++; @@ -262,13 +240,7 @@ static void mpu401_mcallback(struct snd_midi *sm, void *arg, int flags) { struct mpu401 *m = arg; -#if 0 - printf("mpu401_callback %s %s %s %s\n", - flags & M_RX ? "M_RX" : "", - flags & M_TX ? "M_TX" : "", - flags & M_RXEN ? "M_RXEN" : "", - flags & M_TXEN ? "M_TXEN" : ""); -#endif + if (flags & M_TXEN && m->si) { callout_reset(&m->timer, 1, mpu401_timeout, m); } @@ -278,19 +250,5 @@ mpu401_mcallback(struct snd_midi *sm, void *arg, int flags) static void mpu401_mcallbackp(struct snd_midi *sm, void *arg, int flags) { -/* printf("mpu401_callbackp\n"); */ mpu401_mcallback(sm, arg, flags); } - -static const char * -mpu401_mdescr(struct snd_midi *sm, void *arg, int verbosity) -{ - - return "descr mpu401"; -} - -static const char * -mpu401_mprovider(struct snd_midi *m, void *arg) -{ - return "provider mpu401"; -} diff --git a/sys/dev/sound/midi/mpu_if.m b/sys/dev/sound/midi/mpu_if.m index b7cb586c5dd0..835d887f703a 100644 --- a/sys/dev/sound/midi/mpu_if.m +++ b/sys/dev/sound/midi/mpu_if.m @@ -56,17 +56,6 @@ METHOD void callback { int _flags; }; -METHOD const char * provider { - struct snd_midi *_kobj; - void *_cookie; -}; - -METHOD const char * descr { - struct snd_midi *_kobj; - void *_cookie; - int _verbosity; -}; - METHOD int uninit { struct snd_midi *_kobj; void *_cookie; diff --git a/sys/dev/sound/midi/sequencer.c b/sys/dev/sound/midi/sequencer.c deleted file mode 100644 index 03b71688175c..000000000000 --- a/sys/dev/sound/midi/sequencer.c +++ /dev/null @@ -1,2107 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2003 Mathew Kanner - * Copyright (c) 1993 Hannu Savolainen - * All rights reserved. 
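An aside on the mpu401_intr() cleanup above: with the commented-out tracing gone, the receive path is just a poll of the status register, draining bytes while the input-ready condition holds. Below is a sketch of that loop; mpu_drain_rx() is a hypothetical name, while STATUS(), READ() and MPU_INPUTBUSY are the macros mpu401.c already uses.

/* Hypothetical sketch, not part of this commit. */
static int
mpu_drain_rx(struct mpu401 *m, u_char *buf, int max)
{
	int n = 0;

	/* MPU_INPUTBUSY clear means a byte is waiting, as in RXRDY(). */
	while ((STATUS(m) & MPU_INPUTBUSY) == 0 && n < max)
		buf[n++] = READ(m);
	return (n);
}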
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * The sequencer personality manager. - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/ioccom.h> - -#include <sys/filio.h> -#include <sys/lock.h> -#include <sys/sockio.h> -#include <sys/fcntl.h> -#include <sys/proc.h> -#include <sys/sysctl.h> - -#include <sys/kernel.h> /* for DATA_SET */ - -#include <sys/module.h> -#include <sys/conf.h> -#include <sys/file.h> -#include <sys/uio.h> -#include <sys/syslog.h> -#include <sys/errno.h> -#include <sys/malloc.h> -#include <sys/bus.h> -#include <machine/resource.h> -#include <machine/bus.h> -#include <machine/clock.h> /* for DELAY */ -#include <sys/soundcard.h> -#include <sys/rman.h> -#include <sys/mman.h> -#include <sys/poll.h> -#include <sys/mutex.h> -#include <sys/condvar.h> -#include <sys/kthread.h> -#include <sys/unistd.h> -#include <sys/selinfo.h> -#include <sys/sx.h> - -#ifdef HAVE_KERNEL_OPTION_HEADERS -#include "opt_snd.h" -#endif - -#include <dev/sound/midi/midi.h> -#include <dev/sound/midi/midiq.h> -#include "synth_if.h" - -#include <dev/sound/midi/sequencer.h> - -#define TMR_TIMERBASE 13 - -#define SND_DEV_SEQ 1 /* Sequencer output /dev/sequencer (FM - * synthesizer and MIDI output) */ -#define SND_DEV_MUSIC 8 /* /dev/music, level 2 interface */ - -/* Length of a sequencer event. */ -#define EV_SZ 8 -#define IEV_SZ 8 - -/* Lookup modes */ -#define LOOKUP_EXIST (0) -#define LOOKUP_OPEN (1) -#define LOOKUP_CLOSE (2) - -#define MIDIDEV(y) (dev2unit(y) & 0x0f) - -/* These are the entries to the sequencer driver. 
*/ -static d_open_t mseq_open; -static d_close_t mseq_close; -static d_ioctl_t mseq_ioctl; -static d_read_t mseq_read; -static d_write_t mseq_write; -static d_poll_t mseq_poll; - -static struct cdevsw seq_cdevsw = { - .d_version = D_VERSION, - .d_open = mseq_open, - .d_close = mseq_close, - .d_read = mseq_read, - .d_write = mseq_write, - .d_ioctl = mseq_ioctl, - .d_poll = mseq_poll, - .d_name = "sequencer", -}; - -struct seq_softc { - KOBJ_FIELDS; - - struct mtx seq_lock, q_lock; - struct cv empty_cv, reset_cv, in_cv, out_cv, state_cv, th_cv; - - MIDIQ_HEAD(, u_char) in_q, out_q; - - u_long flags; - /* Flags (protected by flag_mtx of mididev_info) */ - int fflags; /* Access mode */ - int music; - - int out_water; /* Sequence output threshould */ - snd_sync_parm sync_parm; /* AIOSYNC parameter set */ - struct thread *sync_thread; /* AIOSYNCing thread */ - struct selinfo in_sel, out_sel; - int midi_number; - struct cdev *seqdev, *musicdev; - int unit; - int maxunits; - kobj_t *midis; - int *midi_flags; - kobj_t mapper; - void *mapper_cookie; - struct timeval timerstop, timersub; - int timerbase, tempo; - int timerrun; - int done; - int playing; - int recording; - int busy; - int pre_event_timeout; - int waiting; -}; - -/* - * Module specific stuff, including how many sequecers - * we currently own. - */ - -SYSCTL_NODE(_hw_midi, OID_AUTO, seq, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "Midi sequencer"); - -int seq_debug; -/* XXX: should this be moved into debug.midi? */ -SYSCTL_INT(_hw_midi_seq, OID_AUTO, debug, CTLFLAG_RW, &seq_debug, 0, ""); - -midi_cmdtab cmdtab_seqevent[] = { - {SEQ_NOTEOFF, "SEQ_NOTEOFF"}, - {SEQ_NOTEON, "SEQ_NOTEON"}, - {SEQ_WAIT, "SEQ_WAIT"}, - {SEQ_PGMCHANGE, "SEQ_PGMCHANGE"}, - {SEQ_SYNCTIMER, "SEQ_SYNCTIMER"}, - {SEQ_MIDIPUTC, "SEQ_MIDIPUTC"}, - {SEQ_DRUMON, "SEQ_DRUMON"}, - {SEQ_DRUMOFF, "SEQ_DRUMOFF"}, - {SEQ_ECHO, "SEQ_ECHO"}, - {SEQ_AFTERTOUCH, "SEQ_AFTERTOUCH"}, - {SEQ_CONTROLLER, "SEQ_CONTROLLER"}, - {SEQ_BALANCE, "SEQ_BALANCE"}, - {SEQ_VOLMODE, "SEQ_VOLMODE"}, - {SEQ_FULLSIZE, "SEQ_FULLSIZE"}, - {SEQ_PRIVATE, "SEQ_PRIVATE"}, - {SEQ_EXTENDED, "SEQ_EXTENDED"}, - {EV_SEQ_LOCAL, "EV_SEQ_LOCAL"}, - {EV_TIMING, "EV_TIMING"}, - {EV_CHN_COMMON, "EV_CHN_COMMON"}, - {EV_CHN_VOICE, "EV_CHN_VOICE"}, - {EV_SYSEX, "EV_SYSEX"}, - {-1, NULL}, -}; - -midi_cmdtab cmdtab_seqioctl[] = { - {SNDCTL_SEQ_RESET, "SNDCTL_SEQ_RESET"}, - {SNDCTL_SEQ_SYNC, "SNDCTL_SEQ_SYNC"}, - {SNDCTL_SYNTH_INFO, "SNDCTL_SYNTH_INFO"}, - {SNDCTL_SEQ_CTRLRATE, "SNDCTL_SEQ_CTRLRATE"}, - {SNDCTL_SEQ_GETOUTCOUNT, "SNDCTL_SEQ_GETOUTCOUNT"}, - {SNDCTL_SEQ_GETINCOUNT, "SNDCTL_SEQ_GETINCOUNT"}, - {SNDCTL_SEQ_PERCMODE, "SNDCTL_SEQ_PERCMODE"}, - {SNDCTL_FM_LOAD_INSTR, "SNDCTL_FM_LOAD_INSTR"}, - {SNDCTL_SEQ_TESTMIDI, "SNDCTL_SEQ_TESTMIDI"}, - {SNDCTL_SEQ_RESETSAMPLES, "SNDCTL_SEQ_RESETSAMPLES"}, - {SNDCTL_SEQ_NRSYNTHS, "SNDCTL_SEQ_NRSYNTHS"}, - {SNDCTL_SEQ_NRMIDIS, "SNDCTL_SEQ_NRMIDIS"}, - {SNDCTL_SEQ_GETTIME, "SNDCTL_SEQ_GETTIME"}, - {SNDCTL_MIDI_INFO, "SNDCTL_MIDI_INFO"}, - {SNDCTL_SEQ_THRESHOLD, "SNDCTL_SEQ_THRESHOLD"}, - {SNDCTL_SYNTH_MEMAVL, "SNDCTL_SYNTH_MEMAVL"}, - {SNDCTL_FM_4OP_ENABLE, "SNDCTL_FM_4OP_ENABLE"}, - {SNDCTL_PMGR_ACCESS, "SNDCTL_PMGR_ACCESS"}, - {SNDCTL_SEQ_PANIC, "SNDCTL_SEQ_PANIC"}, - {SNDCTL_SEQ_OUTOFBAND, "SNDCTL_SEQ_OUTOFBAND"}, - {SNDCTL_TMR_TIMEBASE, "SNDCTL_TMR_TIMEBASE"}, - {SNDCTL_TMR_START, "SNDCTL_TMR_START"}, - {SNDCTL_TMR_STOP, "SNDCTL_TMR_STOP"}, - {SNDCTL_TMR_CONTINUE, "SNDCTL_TMR_CONTINUE"}, - {SNDCTL_TMR_TEMPO, "SNDCTL_TMR_TEMPO"}, - {SNDCTL_TMR_SOURCE, "SNDCTL_TMR_SOURCE"}, - 
{SNDCTL_TMR_METRONOME, "SNDCTL_TMR_METRONOME"}, - {SNDCTL_TMR_SELECT, "SNDCTL_TMR_SELECT"}, - {SNDCTL_MIDI_PRETIME, "SNDCTL_MIDI_PRETIME"}, - {AIONWRITE, "AIONWRITE"}, - {AIOGSIZE, "AIOGSIZE"}, - {AIOSSIZE, "AIOSSIZE"}, - {AIOGFMT, "AIOGFMT"}, - {AIOSFMT, "AIOSFMT"}, - {AIOGMIX, "AIOGMIX"}, - {AIOSMIX, "AIOSMIX"}, - {AIOSTOP, "AIOSTOP"}, - {AIOSYNC, "AIOSYNC"}, - {AIOGCAP, "AIOGCAP"}, - {-1, NULL}, -}; - -midi_cmdtab cmdtab_timer[] = { - {TMR_WAIT_REL, "TMR_WAIT_REL"}, - {TMR_WAIT_ABS, "TMR_WAIT_ABS"}, - {TMR_STOP, "TMR_STOP"}, - {TMR_START, "TMR_START"}, - {TMR_CONTINUE, "TMR_CONTINUE"}, - {TMR_TEMPO, "TMR_TEMPO"}, - {TMR_ECHO, "TMR_ECHO"}, - {TMR_CLOCK, "TMR_CLOCK"}, - {TMR_SPP, "TMR_SPP"}, - {TMR_TIMESIG, "TMR_TIMESIG"}, - {-1, NULL}, -}; - -midi_cmdtab cmdtab_seqcv[] = { - {MIDI_NOTEOFF, "MIDI_NOTEOFF"}, - {MIDI_NOTEON, "MIDI_NOTEON"}, - {MIDI_KEY_PRESSURE, "MIDI_KEY_PRESSURE"}, - {-1, NULL}, -}; - -midi_cmdtab cmdtab_seqccmn[] = { - {MIDI_CTL_CHANGE, "MIDI_CTL_CHANGE"}, - {MIDI_PGM_CHANGE, "MIDI_PGM_CHANGE"}, - {MIDI_CHN_PRESSURE, "MIDI_CHN_PRESSURE"}, - {MIDI_PITCH_BEND, "MIDI_PITCH_BEND"}, - {MIDI_SYSTEM_PREFIX, "MIDI_SYSTEM_PREFIX"}, - {-1, NULL}, -}; - -#ifndef KOBJMETHOD_END -#define KOBJMETHOD_END { NULL, NULL } -#endif - -/* - * static const char *mpu401_mprovider(kobj_t obj, struct mpu401 *m); - */ - -static kobj_method_t seq_methods[] = { - /* KOBJMETHOD(mpu_provider,mpu401_mprovider), */ - KOBJMETHOD_END -}; - -DEFINE_CLASS(sequencer, seq_methods, 0); - -/* The followings are the local function. */ -static int seq_convertold(u_char *event, u_char *out); - -/* - * static void seq_midiinput(struct seq_softc * scp, void *md); - */ -static void seq_reset(struct seq_softc *scp); -static int seq_sync(struct seq_softc *scp); - -static int seq_processevent(struct seq_softc *scp, u_char *event); - -static int seq_timing(struct seq_softc *scp, u_char *event); -static int seq_local(struct seq_softc *scp, u_char *event); - -static int seq_chnvoice(struct seq_softc *scp, kobj_t md, u_char *event); -static int seq_chncommon(struct seq_softc *scp, kobj_t md, u_char *event); -static int seq_sysex(struct seq_softc *scp, kobj_t md, u_char *event); - -static int seq_fetch_mid(struct seq_softc *scp, int unit, kobj_t *md); -void seq_copytoinput(struct seq_softc *scp, u_char *event, int len); -int seq_modevent(module_t mod, int type, void *data); -struct seq_softc *seqs[10]; -static struct mtx seqinfo_mtx; -static u_long nseq = 0; - -static void timer_start(struct seq_softc *t); -static void timer_stop(struct seq_softc *t); -static void timer_setvals(struct seq_softc *t, int tempo, int timerbase); -static void timer_wait(struct seq_softc *t, int ticks, int wait_abs); -static int timer_now(struct seq_softc *t); - -static void -timer_start(struct seq_softc *t) -{ - t->timerrun = 1; - getmicrotime(&t->timersub); -} - -static void -timer_continue(struct seq_softc *t) -{ - struct timeval now; - - if (t->timerrun == 1) - return; - t->timerrun = 1; - getmicrotime(&now); - timevalsub(&now, &t->timerstop); - timevaladd(&t->timersub, &now); -} - -static void -timer_stop(struct seq_softc *t) -{ - t->timerrun = 0; - getmicrotime(&t->timerstop); -} - -static void -timer_setvals(struct seq_softc *t, int tempo, int timerbase) -{ - t->tempo = tempo; - t->timerbase = timerbase; -} - -static void -timer_wait(struct seq_softc *t, int ticks, int wait_abs) -{ - struct timeval now, when; - int ret; - unsigned long long i; - - while (t->timerrun == 0) { - SEQ_DEBUG(2, printf("Timer wait when timer isn't running\n")); - 
/* - * The old sequencer used timeouts that only increased - * the timer when the timer was running. - * Hence the sequencer would stick (?) if the - * timer was disabled. - */ - cv_wait(&t->reset_cv, &t->seq_lock); - if (t->playing == 0) - return; - } - - i = ticks * 60ull * 1000000ull / (t->tempo * t->timerbase); - - when.tv_sec = i / 1000000; - when.tv_usec = i % 1000000; - -#if 0 - printf("timer_wait tempo %d timerbase %d ticks %d abs %d u_sec %llu\n", - t->tempo, t->timerbase, ticks, wait_abs, i); -#endif - - if (wait_abs != 0) { - getmicrotime(&now); - timevalsub(&now, &t->timersub); - timevalsub(&when, &now); - } - if (when.tv_sec < 0 || when.tv_usec < 0) { - SEQ_DEBUG(3, - printf("seq_timer error negative time %lds.%06lds\n", - (long)when.tv_sec, (long)when.tv_usec)); - return; - } - i = when.tv_sec * 1000000ull; - i += when.tv_usec; - i *= hz; - i /= 1000000ull; -#if 0 - printf("seq_timer usec %llu ticks %llu\n", - when.tv_sec * 1000000ull + when.tv_usec, i); -#endif - t->waiting = 1; - ret = cv_timedwait(&t->reset_cv, &t->seq_lock, i + 1); - t->waiting = 0; - - if (ret != EWOULDBLOCK) - SEQ_DEBUG(3, printf("seq_timer didn't timeout\n")); - -} - -static int -timer_now(struct seq_softc *t) -{ - struct timeval now; - unsigned long long i; - int ret; - - if (t->timerrun == 0) - now = t->timerstop; - else - getmicrotime(&now); - - timevalsub(&now, &t->timersub); - - i = now.tv_sec * 1000000ull; - i += now.tv_usec; - i *= t->timerbase; -/* i /= t->tempo; */ - i /= 1000000ull; - - ret = i; - /* - * printf("timer_now: %llu %d\n", i, ret); - */ - - return ret; -} - -static void -seq_eventthread(void *arg) -{ - struct seq_softc *scp = arg; - u_char event[EV_SZ]; - - mtx_lock(&scp->seq_lock); - SEQ_DEBUG(2, printf("seq_eventthread started\n")); - while (scp->done == 0) { -restart: - while (scp->playing == 0) { - cv_wait(&scp->state_cv, &scp->seq_lock); - if (scp->done) - goto done; - } - - while (MIDIQ_EMPTY(scp->out_q)) { - cv_broadcast(&scp->empty_cv); - cv_wait(&scp->out_cv, &scp->seq_lock); - if (scp->playing == 0) - goto restart; - if (scp->done) - goto done; - } - - MIDIQ_DEQ(scp->out_q, event, EV_SZ); - - if (MIDIQ_AVAIL(scp->out_q) < scp->out_water) { - cv_broadcast(&scp->out_cv); - selwakeup(&scp->out_sel); - } - seq_processevent(scp, event); - } - -done: - cv_broadcast(&scp->th_cv); - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(2, printf("seq_eventthread finished\n")); - kproc_exit(0); -} - -/* - * seq_processevent: This maybe called by the event thread or the IOCTL - * handler for queued and out of band events respectively. 
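An aside on the removed timer code above: timer_wait() turned sequencer ticks into wall-clock time using tempo (beats per minute) and timerbase (ticks per beat), so one tick lasts 60/(tempo * timerbase) seconds. Below is a sketch of just that conversion; seq_ticks_to_usec() is a hypothetical name. With the open-time defaults of tempo 60 and timerbase 100, one tick comes out to 10000 microseconds.

/* Hypothetical sketch, not part of this commit. */
static uint64_t
seq_ticks_to_usec(uint64_t ticks, uint64_t tempo, uint64_t timerbase)
{
	return (ticks * 60ULL * 1000000ULL / (tempo * timerbase));
}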
- */ -static int -seq_processevent(struct seq_softc *scp, u_char *event) -{ - int ret; - kobj_t m; - - ret = 0; - - if (event[0] == EV_SEQ_LOCAL) - ret = seq_local(scp, event); - else if (event[0] == EV_TIMING) - ret = seq_timing(scp, event); - else if (event[0] != EV_CHN_VOICE && - event[0] != EV_CHN_COMMON && - event[0] != EV_SYSEX && - event[0] != SEQ_MIDIPUTC) { - ret = 1; - SEQ_DEBUG(2, printf("seq_processevent not known %d\n", - event[0])); - } else if (seq_fetch_mid(scp, event[1], &m) != 0) { - ret = 1; - SEQ_DEBUG(2, printf("seq_processevent midi unit not found %d\n", - event[1])); - } else - switch (event[0]) { - case EV_CHN_VOICE: - ret = seq_chnvoice(scp, m, event); - break; - case EV_CHN_COMMON: - ret = seq_chncommon(scp, m, event); - break; - case EV_SYSEX: - ret = seq_sysex(scp, m, event); - break; - case SEQ_MIDIPUTC: - mtx_unlock(&scp->seq_lock); - ret = SYNTH_WRITERAW(m, &event[2], 1); - mtx_lock(&scp->seq_lock); - break; - } - return ret; -} - -static int -seq_addunit(void) -{ - struct seq_softc *scp; - int ret; - u_char *buf; - - gone_in(15, "Warning! MIDI sequencer to be removed soon: no longer " - "needed or used\n"); - - /* Allocate the softc. */ - ret = ENOMEM; - scp = malloc(sizeof(*scp), M_DEVBUF, M_NOWAIT | M_ZERO); - if (scp == NULL) { - SEQ_DEBUG(1, printf("seq_addunit: softc allocation failed.\n")); - goto err; - } - kobj_init((kobj_t)scp, &sequencer_class); - - buf = malloc(sizeof(*buf) * EV_SZ * 1024, M_TEMP, M_NOWAIT | M_ZERO); - if (buf == NULL) - goto err; - MIDIQ_INIT(scp->in_q, buf, EV_SZ * 1024); - buf = malloc(sizeof(*buf) * EV_SZ * 1024, M_TEMP, M_NOWAIT | M_ZERO); - if (buf == NULL) - goto err; - MIDIQ_INIT(scp->out_q, buf, EV_SZ * 1024); - ret = EINVAL; - - scp->midis = malloc(sizeof(kobj_t) * 32, M_TEMP, M_NOWAIT | M_ZERO); - scp->midi_flags = malloc(sizeof(*scp->midi_flags) * 32, M_TEMP, - M_NOWAIT | M_ZERO); - - if (scp->midis == NULL || scp->midi_flags == NULL) - goto err; - - scp->flags = 0; - - mtx_init(&scp->seq_lock, "seqflq", NULL, 0); - cv_init(&scp->state_cv, "seqstate"); - cv_init(&scp->empty_cv, "seqempty"); - cv_init(&scp->reset_cv, "seqtimer"); - cv_init(&scp->out_cv, "seqqout"); - cv_init(&scp->in_cv, "seqqin"); - cv_init(&scp->th_cv, "seqstart"); - - /* - * Init the damn timer - */ - - scp->mapper = midimapper_addseq(scp, &scp->unit, &scp->mapper_cookie); - if (scp->mapper == NULL) - goto err; - - scp->seqdev = make_dev(&seq_cdevsw, SND_DEV_SEQ, UID_ROOT, GID_WHEEL, - 0666, "sequencer%d", scp->unit); - - scp->musicdev = make_dev(&seq_cdevsw, SND_DEV_MUSIC, UID_ROOT, - GID_WHEEL, 0666, "music%d", scp->unit); - - if (scp->seqdev == NULL || scp->musicdev == NULL) - goto err; - /* - * TODO: Add to list of sequencers this module provides - */ - - ret = - kproc_create - (seq_eventthread, scp, NULL, RFHIGHPID, 0, - "sequencer %02d", scp->unit); - - if (ret) - goto err; - - scp->seqdev->si_drv1 = scp->musicdev->si_drv1 = scp; - - SEQ_DEBUG(2, printf("sequencer %d created scp %p\n", scp->unit, scp)); - - ret = 0; - - mtx_lock(&seqinfo_mtx); - seqs[nseq++] = scp; - mtx_unlock(&seqinfo_mtx); - - goto ok; - -err: - if (scp != NULL) { - if (scp->seqdev != NULL) - destroy_dev(scp->seqdev); - if (scp->musicdev != NULL) - destroy_dev(scp->musicdev); - /* - * TODO: Destroy mutex and cv - */ - if (scp->midis != NULL) - free(scp->midis, M_TEMP); - if (scp->midi_flags != NULL) - free(scp->midi_flags, M_TEMP); - if (scp->out_q.b) - free(scp->out_q.b, M_TEMP); - if (scp->in_q.b) - free(scp->in_q.b, M_TEMP); - free(scp, M_DEVBUF); - } -ok: - return ret; 
-} - -static int -seq_delunit(int unit) -{ - struct seq_softc *scp = seqs[unit]; - int i; - - //SEQ_DEBUG(4, printf("seq_delunit: %d\n", unit)); - SEQ_DEBUG(1, printf("seq_delunit: 1 \n")); - mtx_lock(&scp->seq_lock); - - scp->playing = 0; - scp->done = 1; - cv_broadcast(&scp->out_cv); - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->reset_cv); - SEQ_DEBUG(1, printf("seq_delunit: 2 \n")); - cv_wait(&scp->th_cv, &scp->seq_lock); - SEQ_DEBUG(1, printf("seq_delunit: 3.0 \n")); - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(1, printf("seq_delunit: 3.1 \n")); - - cv_destroy(&scp->state_cv); - SEQ_DEBUG(1, printf("seq_delunit: 4 \n")); - cv_destroy(&scp->empty_cv); - SEQ_DEBUG(1, printf("seq_delunit: 5 \n")); - cv_destroy(&scp->reset_cv); - SEQ_DEBUG(1, printf("seq_delunit: 6 \n")); - cv_destroy(&scp->out_cv); - SEQ_DEBUG(1, printf("seq_delunit: 7 \n")); - cv_destroy(&scp->in_cv); - SEQ_DEBUG(1, printf("seq_delunit: 8 \n")); - cv_destroy(&scp->th_cv); - - SEQ_DEBUG(1, printf("seq_delunit: 10 \n")); - if (scp->seqdev) - destroy_dev(scp->seqdev); - SEQ_DEBUG(1, printf("seq_delunit: 11 \n")); - if (scp->musicdev) - destroy_dev(scp->musicdev); - SEQ_DEBUG(1, printf("seq_delunit: 12 \n")); - scp->seqdev = scp->musicdev = NULL; - if (scp->midis != NULL) - free(scp->midis, M_TEMP); - SEQ_DEBUG(1, printf("seq_delunit: 13 \n")); - if (scp->midi_flags != NULL) - free(scp->midi_flags, M_TEMP); - SEQ_DEBUG(1, printf("seq_delunit: 14 \n")); - free(scp->out_q.b, M_TEMP); - SEQ_DEBUG(1, printf("seq_delunit: 15 \n")); - free(scp->in_q.b, M_TEMP); - - SEQ_DEBUG(1, printf("seq_delunit: 16 \n")); - - mtx_destroy(&scp->seq_lock); - SEQ_DEBUG(1, printf("seq_delunit: 17 \n")); - free(scp, M_DEVBUF); - - mtx_lock(&seqinfo_mtx); - for (i = unit; i < (nseq - 1); i++) - seqs[i] = seqs[i + 1]; - nseq--; - mtx_unlock(&seqinfo_mtx); - - return 0; -} - -int -seq_modevent(module_t mod, int type, void *data) -{ - int retval, r; - - retval = 0; - - switch (type) { - case MOD_LOAD: - mtx_init(&seqinfo_mtx, "seqmod", NULL, 0); - retval = seq_addunit(); - break; - - case MOD_UNLOAD: - while (nseq) { - r = seq_delunit(nseq - 1); - if (r) { - retval = r; - break; - } - } - if (nseq == 0) { - retval = 0; - mtx_destroy(&seqinfo_mtx); - } - break; - - default: - break; - } - - return retval; -} - -static int -seq_fetch_mid(struct seq_softc *scp, int unit, kobj_t *md) -{ - - if (unit >= scp->midi_number || unit < 0) - return EINVAL; - - *md = scp->midis[unit]; - - return 0; -} - -int -mseq_open(struct cdev *i_dev, int flags, int mode, struct thread *td) -{ - struct seq_softc *scp = i_dev->si_drv1; - int i; - - gone_in(15, "Warning! MIDI sequencer to be removed soon: no longer " - "needed or used\n"); - - if (scp == NULL) - return ENXIO; - - SEQ_DEBUG(3, printf("seq_open: scp %p unit %d, flags 0x%x.\n", - scp, scp->unit, flags)); - - /* - * Mark this device busy. 
- */ - - midistat_lock(); - mtx_lock(&scp->seq_lock); - if (scp->busy) { - mtx_unlock(&scp->seq_lock); - midistat_unlock(); - SEQ_DEBUG(2, printf("seq_open: unit %d is busy.\n", scp->unit)); - return EBUSY; - } - scp->fflags = flags; - /* - if ((scp->fflags & O_NONBLOCK) != 0) - scp->flags |= SEQ_F_NBIO; - */ - scp->music = MIDIDEV(i_dev) == SND_DEV_MUSIC; - - /* - * Enumerate the available midi devices - */ - scp->midi_number = 0; - scp->maxunits = midimapper_open_locked(scp->mapper, &scp->mapper_cookie); - - if (scp->maxunits == 0) - SEQ_DEBUG(2, printf("seq_open: no midi devices\n")); - - for (i = 0; i < scp->maxunits; i++) { - scp->midis[scp->midi_number] = - midimapper_fetch_synth_locked(scp->mapper, - scp->mapper_cookie, i); - if (scp->midis[scp->midi_number]) { - if (SYNTH_OPEN(scp->midis[scp->midi_number], scp, - scp->fflags) != 0) - scp->midis[scp->midi_number] = NULL; - else { - scp->midi_flags[scp->midi_number] = - SYNTH_QUERY(scp->midis[scp->midi_number]); - scp->midi_number++; - } - } - } - midistat_unlock(); - - timer_setvals(scp, 60, 100); - - timer_start(scp); - timer_stop(scp); - /* - * actually, if we're in rdonly mode, we should start the timer - */ - /* - * TODO: Handle recording now - */ - - scp->out_water = MIDIQ_SIZE(scp->out_q) / 2; - - scp->busy = 1; - mtx_unlock(&scp->seq_lock); - - SEQ_DEBUG(2, printf("seq_open: opened, mode %s.\n", - scp->music ? "music" : "sequencer")); - SEQ_DEBUG(2, - printf("Sequencer %d %p opened maxunits %d midi_number %d:\n", - scp->unit, scp, scp->maxunits, scp->midi_number)); - for (i = 0; i < scp->midi_number; i++) - SEQ_DEBUG(3, printf(" midi %d %p\n", i, scp->midis[i])); - - return 0; -} - -/* - * mseq_close - */ -int -mseq_close(struct cdev *i_dev, int flags, int mode, struct thread *td) -{ - int i; - struct seq_softc *scp = i_dev->si_drv1; - int ret; - - if (scp == NULL) - return ENXIO; - - SEQ_DEBUG(2, printf("seq_close: unit %d.\n", scp->unit)); - - mtx_lock(&scp->seq_lock); - - ret = ENXIO; - if (scp->busy == 0) - goto err; - - seq_reset(scp); - seq_sync(scp); - - for (i = 0; i < scp->midi_number; i++) - if (scp->midis[i]) - SYNTH_CLOSE(scp->midis[i]); - - midimapper_close(scp->mapper, scp->mapper_cookie); - - timer_stop(scp); - - scp->busy = 0; - ret = 0; - -err: - SEQ_DEBUG(3, printf("seq_close: closed ret = %d.\n", ret)); - mtx_unlock(&scp->seq_lock); - return ret; -} - -int -mseq_read(struct cdev *i_dev, struct uio *uio, int ioflag) -{ - int retval, used; - struct seq_softc *scp = i_dev->si_drv1; - -#define SEQ_RSIZE 32 - u_char buf[SEQ_RSIZE]; - - if (scp == NULL) - return ENXIO; - - SEQ_DEBUG(7, printf("mseq_read: unit %d, resid %zd.\n", - scp->unit, uio->uio_resid)); - - mtx_lock(&scp->seq_lock); - if ((scp->fflags & FREAD) == 0) { - SEQ_DEBUG(2, printf("mseq_read: unit %d is not for reading.\n", - scp->unit)); - retval = EIO; - goto err1; - } - /* - * Begin recording. - */ - /* - * if ((scp->flags & SEQ_F_READING) == 0) - */ - /* - * TODO, start recording if not alread - */ - - /* - * I think the semantics are to return as soon - * as possible. - * Second thought, it doesn't seem like midimoutain - * expects that at all. 
- * TODO: Look up in some sort of spec - */ - - while (uio->uio_resid > 0) { - while (MIDIQ_EMPTY(scp->in_q)) { - retval = EWOULDBLOCK; - /* - * I wish I knew which one to care about - */ - - if (scp->fflags & O_NONBLOCK) - goto err1; - if (ioflag & O_NONBLOCK) - goto err1; - - retval = cv_wait_sig(&scp->in_cv, &scp->seq_lock); - if (retval != 0) - goto err1; - } - - used = MIN(MIDIQ_LEN(scp->in_q), uio->uio_resid); - used = MIN(used, SEQ_RSIZE); - - SEQ_DEBUG(8, printf("midiread: uiomove cc=%d\n", used)); - MIDIQ_DEQ(scp->in_q, buf, used); - mtx_unlock(&scp->seq_lock); - retval = uiomove(buf, used, uio); - mtx_lock(&scp->seq_lock); - if (retval) - goto err1; - } - - retval = 0; -err1: - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(6, printf("mseq_read: ret %d, resid %zd.\n", - retval, uio->uio_resid)); - - return retval; -} - -int -mseq_write(struct cdev *i_dev, struct uio *uio, int ioflag) -{ - u_char event[EV_SZ], newevent[EV_SZ], ev_code; - struct seq_softc *scp = i_dev->si_drv1; - int retval; - int used; - - SEQ_DEBUG(7, printf("seq_write: unit %d, resid %zd.\n", - scp->unit, uio->uio_resid)); - - if (scp == NULL) - return ENXIO; - - mtx_lock(&scp->seq_lock); - - if ((scp->fflags & FWRITE) == 0) { - SEQ_DEBUG(2, printf("seq_write: unit %d is not for writing.\n", - scp->unit)); - retval = EIO; - goto err0; - } - while (uio->uio_resid > 0) { - while (MIDIQ_AVAIL(scp->out_q) == 0) { - retval = EWOULDBLOCK; - if (scp->fflags & O_NONBLOCK) - goto err0; - if (ioflag & O_NONBLOCK) - goto err0; - SEQ_DEBUG(8, printf("seq_write cvwait\n")); - - scp->playing = 1; - cv_broadcast(&scp->out_cv); - cv_broadcast(&scp->state_cv); - - retval = cv_wait_sig(&scp->out_cv, &scp->seq_lock); - /* - * We slept, maybe things have changed since last - * dying check - */ - if (retval != 0) - goto err0; -#if 0 - /* - * Useless test - */ - if (scp != i_dev->si_drv1) - retval = ENXIO; -#endif - } - - used = MIN(uio->uio_resid, 4); - - SEQ_DEBUG(8, printf("seqout: resid %zd len %jd avail %jd\n", - uio->uio_resid, (intmax_t)MIDIQ_LEN(scp->out_q), - (intmax_t)MIDIQ_AVAIL(scp->out_q))); - - if (used != 4) { - retval = ENXIO; - goto err0; - } - mtx_unlock(&scp->seq_lock); - retval = uiomove(event, used, uio); - mtx_lock(&scp->seq_lock); - if (retval) - goto err0; - - ev_code = event[0]; - SEQ_DEBUG(8, printf("seq_write: unit %d, event %s.\n", - scp->unit, midi_cmdname(ev_code, cmdtab_seqevent))); - - /* Have a look at the event code. */ - if (ev_code == SEQ_FULLSIZE) { - /* - * TODO: restore code for SEQ_FULLSIZE - */ -#if 0 - /* - * A long event, these are the patches/samples for a - * synthesizer. - */ - midiunit = *(u_short *)&event[2]; - mtx_lock(&sd->seq_lock); - ret = lookup_mididev(scp, midiunit, LOOKUP_OPEN, &md); - mtx_unlock(&sd->seq_lock); - if (ret != 0) - return (ret); - - SEQ_DEBUG(printf("seq_write: loading a patch to the unit %d.\n", midiunit)); - - ret = md->synth.loadpatch(md, *(short *)&event[0], buf, - p + 4, count, 0); - return (ret); -#else - /* - * For now, just flush the darn buffer - */ - SEQ_DEBUG(2, - printf("seq_write: SEQ_FULLSIZE flusing buffer.\n")); - while (uio->uio_resid > 0) { - mtx_unlock(&scp->seq_lock); - retval = uiomove(event, MIN(EV_SZ, uio->uio_resid), uio); - mtx_lock(&scp->seq_lock); - if (retval) - goto err0; - } - retval = 0; - goto err0; -#endif - } - retval = EINVAL; - if (ev_code >= 128) { - int error; - - /* - * Some sort of an extended event. The size is eight - * bytes. scoop extra info. 
- */ - if (scp->music && ev_code == SEQ_EXTENDED) { - SEQ_DEBUG(2, printf("seq_write: invalid level two event %x.\n", ev_code)); - goto err0; - } - mtx_unlock(&scp->seq_lock); - if (uio->uio_resid < 4) - error = EINVAL; - else - error = uiomove((caddr_t)&event[4], 4, uio); - mtx_lock(&scp->seq_lock); - if (error) { - SEQ_DEBUG(2, - printf("seq_write: user memory mangled?\n")); - goto err0; - } - } else { - /* - * Size four event. - */ - if (scp->music) { - SEQ_DEBUG(2, printf("seq_write: four byte event in music mode.\n")); - goto err0; - } - } - if (ev_code == SEQ_MIDIPUTC) { - /* - * TODO: event[2] is unit number to receive char. - * Range check it. - */ - } - if (scp->music) { -#ifdef not_ever_ever - if (event[0] == EV_TIMING && - (event[1] == TMR_START || event[1] == TMR_STOP)) { - /* - * For now, try to make midimoutain work by - * forcing these events to be processed - * immediately. - */ - seq_processevent(scp, event); - } else - MIDIQ_ENQ(scp->out_q, event, EV_SZ); -#else - MIDIQ_ENQ(scp->out_q, event, EV_SZ); -#endif - } else { - if (seq_convertold(event, newevent) > 0) - MIDIQ_ENQ(scp->out_q, newevent, EV_SZ); -#if 0 - else - goto err0; -#endif - } - } - - scp->playing = 1; - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->out_cv); - - retval = 0; - -err0: - SEQ_DEBUG(6, - printf("seq_write done: leftover buffer length %zd retval %d\n", - uio->uio_resid, retval)); - mtx_unlock(&scp->seq_lock); - return retval; -} - -int -mseq_ioctl(struct cdev *i_dev, u_long cmd, caddr_t arg, int mode, - struct thread *td) -{ - int midiunit, ret, tmp; - struct seq_softc *scp = i_dev->si_drv1; - struct synth_info *synthinfo; - struct midi_info *midiinfo; - u_char event[EV_SZ]; - u_char newevent[EV_SZ]; - - kobj_t md; - - /* - * struct snd_size *sndsize; - */ - - if (scp == NULL) - return ENXIO; - - SEQ_DEBUG(6, printf("seq_ioctl: unit %d, cmd %s.\n", - scp->unit, midi_cmdname(cmd, cmdtab_seqioctl))); - - ret = 0; - - switch (cmd) { - case SNDCTL_SEQ_GETTIME: - /* - * ioctl needed by libtse - */ - mtx_lock(&scp->seq_lock); - *(int *)arg = timer_now(scp); - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(6, printf("seq_ioctl: gettime %d.\n", *(int *)arg)); - ret = 0; - break; - case SNDCTL_TMR_METRONOME: - /* fallthrough */ - case SNDCTL_TMR_SOURCE: - /* - * Not implemented - */ - ret = 0; - break; - case SNDCTL_TMR_TEMPO: - event[1] = TMR_TEMPO; - event[4] = *(int *)arg & 0xFF; - event[5] = (*(int *)arg >> 8) & 0xFF; - event[6] = (*(int *)arg >> 16) & 0xFF; - event[7] = (*(int *)arg >> 24) & 0xFF; - goto timerevent; - case SNDCTL_TMR_TIMEBASE: - event[1] = TMR_TIMERBASE; - event[4] = *(int *)arg & 0xFF; - event[5] = (*(int *)arg >> 8) & 0xFF; - event[6] = (*(int *)arg >> 16) & 0xFF; - event[7] = (*(int *)arg >> 24) & 0xFF; - goto timerevent; - case SNDCTL_TMR_START: - event[1] = TMR_START; - goto timerevent; - case SNDCTL_TMR_STOP: - event[1] = TMR_STOP; - goto timerevent; - case SNDCTL_TMR_CONTINUE: - event[1] = TMR_CONTINUE; -timerevent: - event[0] = EV_TIMING; - mtx_lock(&scp->seq_lock); - if (!scp->music) { - ret = EINVAL; - mtx_unlock(&scp->seq_lock); - break; - } - seq_processevent(scp, event); - mtx_unlock(&scp->seq_lock); - break; - case SNDCTL_TMR_SELECT: - SEQ_DEBUG(2, - printf("seq_ioctl: SNDCTL_TMR_SELECT not supported\n")); - ret = EINVAL; - break; - case SNDCTL_SEQ_SYNC: - if (mode == O_RDONLY) { - ret = 0; - break; - } - mtx_lock(&scp->seq_lock); - ret = seq_sync(scp); - mtx_unlock(&scp->seq_lock); - break; - case SNDCTL_SEQ_PANIC: - /* fallthrough */ - case SNDCTL_SEQ_RESET: - /* - * 
SNDCTL_SEQ_PANIC == SNDCTL_SEQ_RESET - */ - mtx_lock(&scp->seq_lock); - seq_reset(scp); - mtx_unlock(&scp->seq_lock); - ret = 0; - break; - case SNDCTL_SEQ_TESTMIDI: - mtx_lock(&scp->seq_lock); - /* - * TODO: SNDCTL_SEQ_TESTMIDI now means "can I write to the - * device?". - */ - mtx_unlock(&scp->seq_lock); - break; -#if 0 - case SNDCTL_SEQ_GETINCOUNT: - if (mode == O_WRONLY) - *(int *)arg = 0; - else { - mtx_lock(&scp->seq_lock); - *(int *)arg = scp->in_q.rl; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(printf("seq_ioctl: incount %d.\n", - *(int *)arg)); - } - ret = 0; - break; - case SNDCTL_SEQ_GETOUTCOUNT: - if (mode == O_RDONLY) - *(int *)arg = 0; - else { - mtx_lock(&scp->seq_lock); - *(int *)arg = scp->out_q.fl; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(printf("seq_ioctl: outcount %d.\n", - *(int *)arg)); - } - ret = 0; - break; -#endif - case SNDCTL_SEQ_CTRLRATE: - if (*(int *)arg != 0) { - ret = EINVAL; - break; - } - mtx_lock(&scp->seq_lock); - *(int *)arg = scp->timerbase; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(3, printf("seq_ioctl: ctrlrate %d.\n", *(int *)arg)); - ret = 0; - break; - /* - * TODO: ioctl SNDCTL_SEQ_RESETSAMPLES - */ -#if 0 - case SNDCTL_SEQ_RESETSAMPLES: - mtx_lock(&scp->seq_lock); - ret = lookup_mididev(scp, *(int *)arg, LOOKUP_OPEN, &md); - mtx_unlock(&scp->seq_lock); - if (ret != 0) - break; - ret = midi_ioctl(MIDIMKDEV(major(i_dev), *(int *)arg, - SND_DEV_MIDIN), cmd, arg, mode, td); - break; -#endif - case SNDCTL_SEQ_NRSYNTHS: - mtx_lock(&scp->seq_lock); - *(int *)arg = scp->midi_number; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(3, printf("seq_ioctl: synths %d.\n", *(int *)arg)); - ret = 0; - break; - case SNDCTL_SEQ_NRMIDIS: - mtx_lock(&scp->seq_lock); - if (scp->music) - *(int *)arg = 0; - else { - /* - * TODO: count the numbder of devices that can WRITERAW - */ - *(int *)arg = scp->midi_number; - } - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(3, printf("seq_ioctl: midis %d.\n", *(int *)arg)); - ret = 0; - break; - /* - * TODO: ioctl SNDCTL_SYNTH_MEMAVL - */ -#if 0 - case SNDCTL_SYNTH_MEMAVL: - mtx_lock(&scp->seq_lock); - ret = lookup_mididev(scp, *(int *)arg, LOOKUP_OPEN, &md); - mtx_unlock(&scp->seq_lock); - if (ret != 0) - break; - ret = midi_ioctl(MIDIMKDEV(major(i_dev), *(int *)arg, - SND_DEV_MIDIN), cmd, arg, mode, td); - break; -#endif - case SNDCTL_SEQ_OUTOFBAND: - for (ret = 0; ret < EV_SZ; ret++) - event[ret] = (u_char)arg[0]; - - mtx_lock(&scp->seq_lock); - if (scp->music) - ret = seq_processevent(scp, event); - else { - if (seq_convertold(event, newevent) > 0) - ret = seq_processevent(scp, newevent); - else - ret = EINVAL; - } - mtx_unlock(&scp->seq_lock); - break; - case SNDCTL_SYNTH_INFO: - synthinfo = (struct synth_info *)arg; - midiunit = synthinfo->device; - mtx_lock(&scp->seq_lock); - if (seq_fetch_mid(scp, midiunit, &md) == 0) { - bzero(synthinfo, sizeof(*synthinfo)); - synthinfo->name[0] = 'f'; - synthinfo->name[1] = 'a'; - synthinfo->name[2] = 'k'; - synthinfo->name[3] = 'e'; - synthinfo->name[4] = 's'; - synthinfo->name[5] = 'y'; - synthinfo->name[6] = 'n'; - synthinfo->name[7] = 't'; - synthinfo->name[8] = 'h'; - synthinfo->device = midiunit; - synthinfo->synth_type = SYNTH_TYPE_MIDI; - synthinfo->capabilities = scp->midi_flags[midiunit]; - ret = 0; - } else - ret = EINVAL; - mtx_unlock(&scp->seq_lock); - break; - case SNDCTL_MIDI_INFO: - midiinfo = (struct midi_info *)arg; - midiunit = midiinfo->device; - mtx_lock(&scp->seq_lock); - if (seq_fetch_mid(scp, midiunit, &md) == 0) { - bzero(midiinfo, sizeof(*midiinfo)); - midiinfo->name[0] = 
'f'; - midiinfo->name[1] = 'a'; - midiinfo->name[2] = 'k'; - midiinfo->name[3] = 'e'; - midiinfo->name[4] = 'm'; - midiinfo->name[5] = 'i'; - midiinfo->name[6] = 'd'; - midiinfo->name[7] = 'i'; - midiinfo->device = midiunit; - midiinfo->capabilities = scp->midi_flags[midiunit]; - /* - * TODO: What devtype? - */ - midiinfo->dev_type = 0x01; - ret = 0; - } else - ret = EINVAL; - mtx_unlock(&scp->seq_lock); - break; - case SNDCTL_SEQ_THRESHOLD: - mtx_lock(&scp->seq_lock); - RANGE(*(int *)arg, 1, MIDIQ_SIZE(scp->out_q) - 1); - scp->out_water = *(int *)arg; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(3, printf("seq_ioctl: water %d.\n", *(int *)arg)); - ret = 0; - break; - case SNDCTL_MIDI_PRETIME: - tmp = *(int *)arg; - if (tmp < 0) - tmp = 0; - mtx_lock(&scp->seq_lock); - scp->pre_event_timeout = (hz * tmp) / 10; - *(int *)arg = scp->pre_event_timeout; - mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(3, printf("seq_ioctl: pretime %d.\n", *(int *)arg)); - ret = 0; - break; - case SNDCTL_FM_4OP_ENABLE: - case SNDCTL_PMGR_IFACE: - case SNDCTL_PMGR_ACCESS: - /* - * Patch manager and fm are ded, ded, ded. - */ - /* fallthrough */ - default: - /* - * TODO: Consider ioctl default case. - * Old code used to - * if ((scp->fflags & O_ACCMODE) == FREAD) { - * ret = EIO; - * break; - * } - * Then pass on the ioctl to device 0 - */ - SEQ_DEBUG(2, - printf("seq_ioctl: unsupported IOCTL %ld.\n", cmd)); - ret = EINVAL; - break; - } - - return ret; -} - -int -mseq_poll(struct cdev *i_dev, int events, struct thread *td) -{ - int ret, lim; - struct seq_softc *scp = i_dev->si_drv1; - - SEQ_DEBUG(3, printf("seq_poll: unit %d.\n", scp->unit)); - SEQ_DEBUG(1, printf("seq_poll: unit %d.\n", scp->unit)); - - mtx_lock(&scp->seq_lock); - - ret = 0; - - /* Look up the appropriate queue and select it. */ - if ((events & (POLLOUT | POLLWRNORM)) != 0) { - /* Start playing. */ - scp->playing = 1; - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->out_cv); - - lim = scp->out_water; - - if (MIDIQ_AVAIL(scp->out_q) < lim) - /* No enough space, record select. */ - selrecord(td, &scp->out_sel); - else - /* We can write now. */ - ret |= events & (POLLOUT | POLLWRNORM); - } - if ((events & (POLLIN | POLLRDNORM)) != 0) { - /* TODO: Start recording. */ - - /* Find out the boundary. */ - lim = 1; - if (MIDIQ_LEN(scp->in_q) < lim) - /* No data ready, record select. */ - selrecord(td, &scp->in_sel); - else - /* We can read now. */ - ret |= events & (POLLIN | POLLRDNORM); - } - mtx_unlock(&scp->seq_lock); - - return (ret); -} - -#if 0 -static void -sein_qtr(void *p, void /* mididev_info */ *md) -{ - struct seq_softc *scp; - - scp = (struct seq_softc *)p; - - mtx_lock(&scp->seq_lock); - - /* Restart playing if we have the data to output. */ - if (scp->queueout_pending) - seq_callback(scp, SEQ_CB_START | SEQ_CB_WR); - /* Check the midi device if we are reading. */ - if ((scp->flags & SEQ_F_READING) != 0) - seq_midiinput(scp, md); - - mtx_unlock(&scp->seq_lock); -} - -#endif -/* - * seq_convertold - * Was the old playevent. 
Use this to convert and old - * style /dev/sequencer event to a /dev/music event - */ -static int -seq_convertold(u_char *event, u_char *out) -{ - int used; - u_char dev, chn, note, vel; - - out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = - out[7] = 0; - - dev = 0; - chn = event[1]; - note = event[2]; - vel = event[3]; - - used = 0; - -restart: - /* - * TODO: Debug statement - */ - switch (event[0]) { - case EV_TIMING: - case EV_CHN_VOICE: - case EV_CHN_COMMON: - case EV_SYSEX: - case EV_SEQ_LOCAL: - out[0] = event[0]; - out[1] = event[1]; - out[2] = event[2]; - out[3] = event[3]; - out[4] = event[4]; - out[5] = event[5]; - out[6] = event[6]; - out[7] = event[7]; - used += 8; - break; - case SEQ_NOTEOFF: - out[0] = EV_CHN_VOICE; - out[1] = dev; - out[2] = MIDI_NOTEOFF; - out[3] = chn; - out[4] = note; - out[5] = 255; - used += 4; - break; - - case SEQ_NOTEON: - out[0] = EV_CHN_VOICE; - out[1] = dev; - out[2] = MIDI_NOTEON; - out[3] = chn; - out[4] = note; - out[5] = vel; - used += 4; - break; - - /* - * wait delay = (event[2] << 16) + (event[3] << 8) + event[4] - */ - - case SEQ_PGMCHANGE: - out[0] = EV_CHN_COMMON; - out[1] = dev; - out[2] = MIDI_PGM_CHANGE; - out[3] = chn; - out[4] = note; - out[5] = vel; - used += 4; - break; -/* - out[0] = EV_TIMING; - out[1] = dev; - out[2] = MIDI_PGM_CHANGE; - out[3] = chn; - out[4] = note; - out[5] = vel; - SEQ_DEBUG(4,printf("seq_playevent: synctimer\n")); - break; -*/ - - case SEQ_MIDIPUTC: - SEQ_DEBUG(4, - printf("seq_playevent: put data 0x%02x, unit %d.\n", - event[1], event[2])); - /* - * Pass through to the midi device. - * device = event[2] - * data = event[1] - */ - out[0] = SEQ_MIDIPUTC; - out[1] = dev; - out[2] = chn; - used += 4; - break; -#ifdef notyet - case SEQ_ECHO: - /* - * This isn't handled here yet because I don't know if I can - * just use four bytes events. There might be consequences - * in the _read routing - */ - if (seq_copytoinput(scp, event, 4) == EAGAIN) { - ret = QUEUEFULL; - break; - } - ret = MORE; - break; -#endif - case SEQ_EXTENDED: - switch (event[1]) { - case SEQ_NOTEOFF: - case SEQ_NOTEON: - case SEQ_PGMCHANGE: - event++; - used = 4; - goto restart; - break; - case SEQ_AFTERTOUCH: - /* - * SYNTH_AFTERTOUCH(md, event[3], event[4]) - */ - case SEQ_BALANCE: - /* - * SYNTH_PANNING(md, event[3], (char)event[4]) - */ - case SEQ_CONTROLLER: - /* - * SYNTH_CONTROLLER(md, event[3], event[4], *(short *)&event[5]) - */ - case SEQ_VOLMODE: - /* - * SYNTH_VOLUMEMETHOD(md, event[3]) - */ - default: - SEQ_DEBUG(2, - printf("seq_convertold: SEQ_EXTENDED type %d" - "not handled\n", event[1])); - break; - } - break; - case SEQ_WAIT: - out[0] = EV_TIMING; - out[1] = TMR_WAIT_REL; - out[4] = event[2]; - out[5] = event[3]; - out[6] = event[4]; - - SEQ_DEBUG(5, printf("SEQ_WAIT %d", - event[2] + (event[3] << 8) + (event[4] << 24))); - - used += 4; - break; - - case SEQ_ECHO: - case SEQ_SYNCTIMER: - case SEQ_PRIVATE: - default: - SEQ_DEBUG(2, - printf("seq_convertold: event type %d not handled %d %d %d\n", - event[0], event[1], event[2], event[3])); - break; - } - return used; -} - -/* - * Writting to the sequencer buffer never blocks and drops - * input which cannot be queued - */ -void -seq_copytoinput(struct seq_softc *scp, u_char *event, int len) -{ - - mtx_assert(&scp->seq_lock, MA_OWNED); - - if (MIDIQ_AVAIL(scp->in_q) < len) { - /* - * ENOROOM? EINPUTDROPPED? ETOUGHLUCK? 
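An aside on seq_convertold() above: it widened 4-byte /dev/sequencer events into the 8-byte /dev/music format. Below is a sketch of the note-on case; convert_old_noteon() is a hypothetical name, but the byte placement matches the SEQ_NOTEON branch.

/* Hypothetical sketch, not part of this commit. */
static void
convert_old_noteon(const u_char ev[4], u_char out[8])
{
	memset(out, 0, 8);
	out[0] = EV_CHN_VOICE;	/* new-style channel-voice event */
	out[1] = 0;		/* device 0, as seq_convertold() assumed */
	out[2] = MIDI_NOTEON;	/* command */
	out[3] = ev[1];		/* channel */
	out[4] = ev[2];		/* note */
	out[5] = ev[3];		/* velocity */
}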
- */ - SEQ_DEBUG(2, printf("seq_copytoinput: queue full\n")); - } else { - MIDIQ_ENQ(scp->in_q, event, len); - selwakeup(&scp->in_sel); - cv_broadcast(&scp->in_cv); - } - -} - -static int -seq_chnvoice(struct seq_softc *scp, kobj_t md, u_char *event) -{ - int ret, voice; - u_char cmd, chn, note, parm; - - ret = 0; - cmd = event[2]; - chn = event[3]; - note = event[4]; - parm = event[5]; - - mtx_assert(&scp->seq_lock, MA_OWNED); - - SEQ_DEBUG(5, printf("seq_chnvoice: unit %d, dev %d, cmd %s," - " chn %d, note %d, parm %d.\n", scp->unit, event[1], - midi_cmdname(cmd, cmdtab_seqcv), chn, note, parm)); - - voice = SYNTH_ALLOC(md, chn, note); - - mtx_unlock(&scp->seq_lock); - - switch (cmd) { - case MIDI_NOTEON: - if (note < 128 || note == 255) { -#if 0 - if (scp->music && chn == 9) { - /* - * This channel is a percussion. The note - * number is the patch number. - */ - /* - mtx_unlock(&scp->seq_lock); - if (SYNTH_SETINSTR(md, voice, 128 + note) - == EAGAIN) { - mtx_lock(&scp->seq_lock); - return (QUEUEFULL); - } - mtx_lock(&scp->seq_lock); - */ - note = 60; /* Middle C. */ - } -#endif - if (scp->music) { - /* - mtx_unlock(&scp->seq_lock); - if (SYNTH_SETUPVOICE(md, voice, chn) - == EAGAIN) { - mtx_lock(&scp->seq_lock); - return (QUEUEFULL); - } - mtx_lock(&scp->seq_lock); - */ - } - SYNTH_STARTNOTE(md, voice, note, parm); - } - break; - case MIDI_NOTEOFF: - SYNTH_KILLNOTE(md, voice, note, parm); - break; - case MIDI_KEY_PRESSURE: - SYNTH_AFTERTOUCH(md, voice, parm); - break; - default: - ret = 1; - SEQ_DEBUG(2, printf("seq_chnvoice event type %d not handled\n", - event[1])); - break; - } - - mtx_lock(&scp->seq_lock); - return ret; -} - -static int -seq_chncommon(struct seq_softc *scp, kobj_t md, u_char *event) -{ - int ret; - u_short w14; - u_char cmd, chn, p1; - - ret = 0; - cmd = event[2]; - chn = event[3]; - p1 = event[4]; - w14 = *(u_short *)&event[6]; - - SEQ_DEBUG(5, printf("seq_chncommon: unit %d, dev %d, cmd %s, chn %d," - " p1 %d, w14 %d.\n", scp->unit, event[1], - midi_cmdname(cmd, cmdtab_seqccmn), chn, p1, w14)); - mtx_unlock(&scp->seq_lock); - switch (cmd) { - case MIDI_PGM_CHANGE: - SEQ_DEBUG(4, printf("seq_chncommon pgmchn chn %d pg %d\n", - chn, p1)); - SYNTH_SETINSTR(md, chn, p1); - break; - case MIDI_CTL_CHANGE: - SEQ_DEBUG(4, printf("seq_chncommon ctlch chn %d pg %d %d\n", - chn, p1, w14)); - SYNTH_CONTROLLER(md, chn, p1, w14); - break; - case MIDI_PITCH_BEND: - if (scp->music) { - /* - * TODO: MIDI_PITCH_BEND - */ -#if 0 - mtx_lock(&md->synth.vc_mtx); - md->synth.chn_info[chn].bender_value = w14; - if (md->midiunit >= 0) { - /* - * Handle all of the notes playing on this - * channel. 
- */ - key = ((int)chn << 8); - for (i = 0; i < md->synth.alloc.max_voice; i++) - if ((md->synth.alloc.map[i] & 0xff00) == key) { - mtx_unlock(&md->synth.vc_mtx); - mtx_unlock(&scp->seq_lock); - if (md->synth.bender(md, i, w14) == EAGAIN) { - mtx_lock(&scp->seq_lock); - return (QUEUEFULL); - } - mtx_lock(&scp->seq_lock); - } - } else { - mtx_unlock(&md->synth.vc_mtx); - mtx_unlock(&scp->seq_lock); - if (md->synth.bender(md, chn, w14) == EAGAIN) { - mtx_lock(&scp->seq_lock); - return (QUEUEFULL); - } - mtx_lock(&scp->seq_lock); - } -#endif - } else - SYNTH_BENDER(md, chn, w14); - break; - default: - ret = 1; - SEQ_DEBUG(2, - printf("seq_chncommon event type %d not handled.\n", - event[1])); - break; - } - mtx_lock(&scp->seq_lock); - return ret; -} - -static int -seq_timing(struct seq_softc *scp, u_char *event) -{ - int param; - int ret; - - ret = 0; - param = event[4] + (event[5] << 8) + - (event[6] << 16) + (event[7] << 24); - - SEQ_DEBUG(5, printf("seq_timing: unit %d, cmd %d, param %d.\n", - scp->unit, event[1], param)); - switch (event[1]) { - case TMR_WAIT_REL: - timer_wait(scp, param, 0); - break; - case TMR_WAIT_ABS: - timer_wait(scp, param, 1); - break; - case TMR_START: - timer_start(scp); - cv_broadcast(&scp->reset_cv); - break; - case TMR_STOP: - timer_stop(scp); - /* - * The following cv_broadcast isn't needed since we only - * wait for 0->1 transitions. It probably won't hurt - */ - cv_broadcast(&scp->reset_cv); - break; - case TMR_CONTINUE: - timer_continue(scp); - cv_broadcast(&scp->reset_cv); - break; - case TMR_TEMPO: - if (param < 8) - param = 8; - if (param > 360) - param = 360; - SEQ_DEBUG(4, printf("Timer set tempo %d\n", param)); - timer_setvals(scp, param, scp->timerbase); - break; - case TMR_TIMERBASE: - if (param < 1) - param = 1; - if (param > 1000) - param = 1000; - SEQ_DEBUG(4, printf("Timer set timerbase %d\n", param)); - timer_setvals(scp, scp->tempo, param); - break; - case TMR_ECHO: - /* - * TODO: Consider making 4-byte events for /dev/sequencer - * PRO: Maybe needed by legacy apps - * CON: soundcard.h has been warning for a while many years - * to expect 8 byte events. 
- */ -#if 0 - if (scp->music) - seq_copytoinput(scp, event, 8); - else { - param = (param << 8 | SEQ_ECHO); - seq_copytoinput(scp, (u_char *)¶m, 4); - } -#else - seq_copytoinput(scp, event, 8); -#endif - break; - default: - SEQ_DEBUG(2, printf("seq_timing event type %d not handled.\n", - event[1])); - ret = 1; - break; - } - return ret; -} - -static int -seq_local(struct seq_softc *scp, u_char *event) -{ - int ret; - - ret = 0; - mtx_assert(&scp->seq_lock, MA_OWNED); - - SEQ_DEBUG(5, printf("seq_local: unit %d, cmd %d\n", scp->unit, - event[1])); - switch (event[1]) { - default: - SEQ_DEBUG(1, printf("seq_local event type %d not handled\n", - event[1])); - ret = 1; - break; - } - return ret; -} - -static int -seq_sysex(struct seq_softc *scp, kobj_t md, u_char *event) -{ - int i, l; - - mtx_assert(&scp->seq_lock, MA_OWNED); - SEQ_DEBUG(5, printf("seq_sysex: unit %d device %d\n", scp->unit, - event[1])); - l = 0; - for (i = 0; i < 6 && event[i + 2] != 0xff; i++) - l = i + 1; - if (l > 0) { - mtx_unlock(&scp->seq_lock); - if (SYNTH_SENDSYSEX(md, &event[2], l) == EAGAIN) { - mtx_lock(&scp->seq_lock); - return 1; - } - mtx_lock(&scp->seq_lock); - } - return 0; -} - -/* - * Reset no longer closes the raw devices nor seq_sync's - * Callers are IOCTL and seq_close - */ -static void -seq_reset(struct seq_softc *scp) -{ - int chn, i; - kobj_t m; - - mtx_assert(&scp->seq_lock, MA_OWNED); - - SEQ_DEBUG(5, printf("seq_reset: unit %d.\n", scp->unit)); - - /* - * Stop reading and writing. - */ - - /* scp->recording = 0; */ - scp->playing = 0; - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->out_cv); - cv_broadcast(&scp->reset_cv); - - /* - * For now, don't reset the timers. - */ - MIDIQ_CLEAR(scp->in_q); - MIDIQ_CLEAR(scp->out_q); - - for (i = 0; i < scp->midi_number; i++) { - m = scp->midis[i]; - mtx_unlock(&scp->seq_lock); - SYNTH_RESET(m); - for (chn = 0; chn < 16; chn++) { - SYNTH_CONTROLLER(m, chn, 123, 0); - SYNTH_CONTROLLER(m, chn, 121, 0); - SYNTH_BENDER(m, chn, 1 << 13); - } - mtx_lock(&scp->seq_lock); - } -} - -/* - * seq_sync - * *really* flush the output queue - * flush the event queue, then flush the synthsisers. - * Callers are IOCTL and close - */ - -#define SEQ_SYNC_TIMEOUT 8 -static int -seq_sync(struct seq_softc *scp) -{ - int i, rl, sync[16], done; - - mtx_assert(&scp->seq_lock, MA_OWNED); - - SEQ_DEBUG(4, printf("seq_sync: unit %d.\n", scp->unit)); - - /* - * Wait until output queue is empty. Check every so often to see if - * the queue is moving along. If it isn't just abort. - */ - while (!MIDIQ_EMPTY(scp->out_q)) { - if (!scp->playing) { - scp->playing = 1; - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->out_cv); - } - rl = MIDIQ_LEN(scp->out_q); - - i = cv_timedwait_sig(&scp->out_cv, - &scp->seq_lock, SEQ_SYNC_TIMEOUT * hz); - - if (i == EINTR || i == ERESTART) { - if (i == EINTR) { - /* - * XXX: I don't know why we stop playing - */ - scp->playing = 0; - cv_broadcast(&scp->out_cv); - } - return i; - } - if (i == EWOULDBLOCK && rl == MIDIQ_LEN(scp->out_q) && - scp->waiting == 0) { - /* - * A queue seems to be stuck up. Give up and clear - * queues. - */ - MIDIQ_CLEAR(scp->out_q); - scp->playing = 0; - cv_broadcast(&scp->state_cv); - cv_broadcast(&scp->out_cv); - cv_broadcast(&scp->reset_cv); - - /* - * TODO: Consider if the raw devices need to be flushed - */ - - SEQ_DEBUG(1, printf("seq_sync queue stuck, aborting\n")); - - return i; - } - } - - scp->playing = 0; - /* - * Since syncing a midi device might block, unlock scp->seq_lock. 
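The drain loop just above is a stock cv(9) pattern: sleep with a timeout, and treat a timeout that arrives with no change in queue length as a stuck queue. A minimal sketch of that pattern, with hypothetical names (struct myq, q_mtx, q_cv, q_len) standing in for the sequencer's fields:

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/condvar.h>

    struct myq {
            struct mtx      q_mtx;  /* protects q_len */
            struct cv       q_cv;   /* signalled when q_len drops */
            int             q_len;  /* items still queued */
    };

    /* Wait for the queue to empty; give up after a quiet timeout. */
    static int
    myq_drain(struct myq *q)
    {
            int error, before;

            error = 0;
            mtx_lock(&q->q_mtx);
            while (q->q_len > 0) {
                    before = q->q_len;
                    error = cv_timedwait_sig(&q->q_cv, &q->q_mtx, 8 * hz);
                    if (error == EINTR || error == ERESTART)
                            break;  /* interrupted by a signal */
                    if (error == EWOULDBLOCK && q->q_len == before)
                            break;  /* a full timeout with no progress */
                    error = 0;      /* progress was made; keep waiting */
            }
            mtx_unlock(&q->q_mtx);
            return (error);
    }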
- */ - - mtx_unlock(&scp->seq_lock); - for (i = 0; i < scp->midi_number; i++) - sync[i] = 1; - - do { - done = 1; - for (i = 0; i < scp->midi_number; i++) - if (sync[i]) { - if (SYNTH_INSYNC(scp->midis[i]) == 0) - sync[i] = 0; - else - done = 0; - } - if (!done) - DELAY(5000); - - } while (!done); - - mtx_lock(&scp->seq_lock); - return 0; -} - -char * -midi_cmdname(int cmd, midi_cmdtab *tab) -{ - while (tab->name != NULL) { - if (cmd == tab->cmd) - return (tab->name); - tab++; - } - - return ("unknown"); -} diff --git a/sys/dev/sound/midi/synth_if.m b/sys/dev/sound/midi/synth_if.m deleted file mode 100644 index a763b3422bc6..000000000000 --- a/sys/dev/sound/midi/synth_if.m +++ /dev/null @@ -1,312 +0,0 @@ -#- -# Copyright (c) 2003 Mathew Kanner -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -# SUCH DAMAGE. 
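synth_if.m, removed here in its entirety, is a makeobjops(8) interface definition: each METHOD entry that follows compiles into a typed kobj dispatch wrapper, and the no-op functions in the CODE block serve as DEFAULT implementations for drivers that leave a slot empty. The SYNTH_*() calls in the sequencer code above resolve through this table. As a rough sketch (simplified, not the literal generated header), a declaration such as "METHOD int startnote { ... } DEFAULT nostartnote;" expands to roughly:

    #include <sys/param.h>
    #include <sys/kobj.h>

    /*
     * Simplified sketch of the wrapper makeobjops(8) would emit; the
     * real generated header also declares the kobjop_desc used for
     * the method-table lookup.
     */
    static __inline int
    SYNTH_STARTNOTE(kobj_t obj, uint8_t voice, uint8_t note, uint8_t parm)
    {
            kobjop_t _m;

            KOBJOPLOOKUP(obj->ops, synth_startnote);
            return (((synth_startnote_t *)_m)(obj, voice, note, parm));
    }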
-# -# - -INTERFACE synth; - -#include <sys/systm.h> - -CODE { - -synth_killnote_t nokillnote; -synth_startnote_t nostartnote; -synth_setinstr_t nosetinstr; -synth_hwcontrol_t nohwcontrol; -synth_aftertouch_t noaftertouch; -synth_panning_t nopanning; -synth_controller_t nocontroller; -synth_volumemethod_t novolumemethod; -synth_bender_t nobender; -synth_setupvoice_t nosetupvoice; -synth_sendsysex_t nosendsysex; -synth_allocvoice_t noallocvoice; -synth_writeraw_t nowriteraw; -synth_reset_t noreset; -synth_shortname_t noshortname; -synth_open_t noopen; -synth_close_t noclose; -synth_query_t noquery; -synth_insync_t noinsync; -synth_alloc_t noalloc; - - int - nokillnote(void *_kobj, uint8_t _chn, uint8_t _note, uint8_t _vel) - { - printf("nokillnote\n"); - return 0; - } - - int - noopen(void *_kobj, void *_arg, int mode) - { - printf("noopen\n"); - return 0; - } - - int - noquery(void *_kboj) - { - printf("noquery\n"); - return 0; - } - - int - nostartnote(void *_kb, uint8_t _voice, uint8_t _note, uint8_t _parm) - { - printf("nostartnote\n"); - return 0; - } - - int - nosetinstr(void *_kb, uint8_t _chn, uint16_t _patchno) - { - printf("nosetinstr\n"); - return 0; - } - - int - nohwcontrol(void *_kb, uint8_t *_event) - { - printf("nohwcontrol\n"); - return 0; - } - - int - noaftertouch ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2) - { - printf("noaftertouch\n"); - return 0; - } - - int - nopanning ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2) - { - printf("nopanning\n"); - return 0; - } - - int - nocontroller ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2, uint16_t _x3) - { - printf("nocontroller\n"); - return 0; - } - - int - novolumemethod ( - void /* X */ * _kobj, - uint8_t _x1) - { - printf("novolumemethod\n"); - return 0; - } - - int - nobender ( void /* X */ * _kobj, uint8_t _voice, uint16_t _bend) - { - printf("nobender\n"); - return 0; - } - - int - nosetupvoice ( void /* X */ * _kobj, uint8_t _voice, uint8_t _chn) - { - - printf("nosetupvoice\n"); - return 0; - } - - int - nosendsysex ( void /* X */ * _kobj, void * _buf, size_t _len) - { - printf("nosendsysex\n"); - return 0; - } - - int - noallocvoice ( void /* X */ * _kobj, uint8_t _chn, uint8_t _note, void *_x) - { - printf("noallocvoice\n"); - return 0; - } - - int - nowriteraw ( void /* X */ * _kobjt, uint8_t * _buf, size_t _len) - { - printf("nowriteraw\n"); - return 1; - } - - int - noreset ( void /* X */ * _kobjt) - { - - printf("noreset\n"); - return 0; - } - - char * - noshortname (void /* X */ * _kobjt) - { - printf("noshortname\n"); - return "noshortname"; - } - - int - noclose ( void /* X */ * _kobjt) - { - - printf("noclose\n"); - return 0; - } - - int - noinsync (void /* X */ * _kobjt) - { - - printf("noinsync\n"); - return 0; - } - - int - noalloc ( void /* x */ * _kbojt, uint8_t _chn, uint8_t _note) - { - printf("noalloc\n"); - return 0; - } -} - -METHOD int killnote { - void /* X */ *_kobj; - uint8_t _chan; - uint8_t _note; - uint8_t _vel; -} DEFAULT nokillnote; - -METHOD int startnote { - void /* X */ *_kobj; - uint8_t _voice; - uint8_t _note; - uint8_t _parm; -} DEFAULT nostartnote; - -METHOD int setinstr { - void /* X */ *_kobj; - uint8_t _chn; - uint16_t _patchno; -} DEFAULT nosetinstr; - -METHOD int hwcontrol { - void /* X */ *_kobj; - uint8_t *_event; -} DEFAULT nohwcontrol; - -METHOD int aftertouch { - void /* X */ *_kobj; - uint8_t _x1; - uint8_t _x2; -} DEFAULT noaftertouch; - -METHOD int panning { - void /* X */ *_kobj; - uint8_t _x1; - uint8_t _x2; -} DEFAULT nopanning; - -METHOD int controller { 
- void /* X */ *_kobj; - uint8_t _x1; - uint8_t _x2; - uint16_t _x3; -} DEFAULT nocontroller; - -METHOD int volumemethod { - void /* X */ *_kobj; - uint8_t _x1; -} DEFAULT novolumemethod; - -METHOD int bender { - void /* X */ *_kobj; - uint8_t _voice; - uint16_t _bend; -} DEFAULT nobender; - -METHOD int setupvoice { - void /* X */ *_kobj; - uint8_t _voice; - uint8_t _chn; -} DEFAULT nosetupvoice; - -METHOD int sendsysex { - void /* X */ *_kobj; - void *_buf; - size_t _len; -} DEFAULT nosendsysex; - -METHOD int allocvoice { - void /* X */ *_kobj; - uint8_t _chn; - uint8_t _note; - void *_x; -} DEFAULT noallocvoice; - -METHOD int writeraw { - void /* X */ *_kobjt; - uint8_t *_buf; - size_t _len; -} DEFAULT nowriteraw; - -METHOD int reset { - void /* X */ *_kobjt; -} DEFAULT noreset; - -METHOD char * shortname { - void /* X */ *_kobjt; -} DEFAULT noshortname; - -METHOD int open { - void /* X */ *_kobjt; - void *_sythn; - int _mode; -} DEFAULT noopen; - -METHOD int close { - void /* X */ *_kobjt; -} DEFAULT noclose; - -METHOD int query { - void /* X */ *_kobjt; -} DEFAULT noquery; - -METHOD int insync { - void /* X */ *_kobjt; -} DEFAULT noinsync; - -METHOD int alloc { - void /* x */ *_kbojt; - uint8_t _chn; - uint8_t _note; -} DEFAULT noalloc; diff --git a/sys/dev/sound/pcm/mixer.c b/sys/dev/sound/pcm/mixer.c index 092af3298f0e..f281dff36248 100644 --- a/sys/dev/sound/pcm/mixer.c +++ b/sys/dev/sound/pcm/mixer.c @@ -750,8 +750,8 @@ mixer_init(device_t dev, kobj_class_t cls, void *devinfo) mixer_setrecsrc(m, 0); /* Set default input. */ - pdev = make_dev(&mixer_cdevsw, SND_DEV_CTL, UID_ROOT, GID_WHEEL, 0666, - "mixer%d", unit); + pdev = make_dev(&mixer_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "mixer%d", + unit); pdev->si_drv1 = m; snddev->mixer_dev = pdev; diff --git a/sys/dev/sound/pcm/sndstat.c b/sys/dev/sound/pcm/sndstat.c index 509a35c5a038..51d0fb3bb686 100644 --- a/sys/dev/sound/pcm/sndstat.c +++ b/sys/dev/sound/pcm/sndstat.c @@ -52,7 +52,6 @@ #define SS_TYPE_PCM 1 #define SS_TYPE_MIDI 2 -#define SS_TYPE_SEQUENCER 3 static d_open_t sndstat_open; static void sndstat_close(void *); @@ -1165,8 +1164,6 @@ sndstat_register(device_t dev, char *str) type = SS_TYPE_PCM; else if (!strcmp(devtype, "midi")) type = SS_TYPE_MIDI; - else if (!strcmp(devtype, "sequencer")) - type = SS_TYPE_SEQUENCER; else return (EINVAL); @@ -1441,8 +1438,8 @@ static void sndstat_sysinit(void *p) { sx_init(&sndstat_lock, "sndstat lock"); - sndstat_dev = make_dev(&sndstat_cdevsw, SND_DEV_STATUS, - UID_ROOT, GID_WHEEL, 0644, "sndstat"); + sndstat_dev = make_dev(&sndstat_cdevsw, 0, UID_ROOT, GID_WHEEL, 0644, + "sndstat"); } SYSINIT(sndstat_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, sndstat_sysinit, NULL); diff --git a/sys/dev/sound/pcm/sound.h b/sys/dev/sound/pcm/sound.h index 315452e294d1..6bd435d0ea25 100644 --- a/sys/dev/sound/pcm/sound.h +++ b/sys/dev/sound/pcm/sound.h @@ -148,14 +148,6 @@ struct snd_mixer; #define RANGE(var, low, high) (var) = \ (((var)<(low))? (low) : ((var)>(high))? 
(high) : (var)) -enum { - SND_DEV_CTL = 0, /* Control port /dev/mixer */ - SND_DEV_SEQ, /* Sequencer /dev/sequencer */ - SND_DEV_MIDIN, /* Raw midi access */ - SND_DEV_DSP, /* Digitized voice /dev/dsp */ - SND_DEV_STATUS, /* /dev/sndstat */ -}; - #define DSP_DEFAULT_SPEED 8000 extern int snd_unit; diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h index cac743884ee6..ac58d44102a0 100644 --- a/sys/dev/ufshci/ufshci_private.h +++ b/sys/dev/ufshci/ufshci_private.h @@ -149,6 +149,8 @@ struct ufshci_hw_queue { bus_dmamap_t queuemem_map; bus_addr_t req_queue_addr; + bus_addr_t *ucd_bus_addr; + uint32_t num_entries; uint32_t num_trackers; @@ -198,8 +200,6 @@ struct ufshci_req_queue { bus_dma_tag_t dma_tag_payload; bus_dmamap_t ucdmem_map; - - bus_addr_t ucd_addr; }; struct ufshci_device { diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c index 4670281d367a..b1f303afaef5 100644 --- a/sys/dev/ufshci/ufshci_req_sdb.c +++ b/sys/dev/ufshci/ufshci_req_sdb.c @@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue) } } +static void +ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) +{ + struct ufshci_hw_queue *hwq = arg; + int i; + + if (error != 0) { + printf("ufshci: Failed to map UCD, error = %d\n", error); + return; + } + + if (hwq->num_trackers != nseg) { + printf( + "ufshci: Failed to map UCD, num_trackers = %d, nseg = %d\n", + hwq->num_trackers, nseg); + return; + } + + for (i = 0; i < nseg; i++) { + hwq->ucd_bus_addr[i] = seg[i].ds_addr; + } +} + static int ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, uint32_t num_entries, struct ufshci_controller *ctrlr) @@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q]; struct ufshci_tracker *tr; size_t ucd_allocsz, payload_allocsz; - uint64_t ucdmem_phys; uint8_t *ucdmem; int i, error; @@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, * Allocate physical memory for UTP Command Descriptor (UCD) * Note: UFSHCI UCD format is restricted to 128-byte alignment. 
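ufshci_ucd_map() above is a standard bus_dma(9) load callback: bus_dmamap_load() builds the mapping and then hands the callback one bus_dma_segment_t per segment. The retuned tag below caps each segment at sizeof(struct ufshci_utp_cmd_desc), so every 128-byte-aligned command descriptor lands in its own segment and the callback can record one bus address per tracker. A minimal sketch of the calling convention, with hypothetical tag/map/buffer names:

    #include <sys/param.h>
    #include <sys/bus.h>
    #include <machine/bus.h>

    /* Stash one bus address per DMA segment; arg points at the array. */
    static void
    example_map_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
    {
            bus_addr_t *addrs = arg;
            int i;

            if (error != 0)
                    return;         /* load failed; record nothing */
            for (i = 0; i < nseg; i++)
                    addrs[i] = segs[i].ds_addr;
    }

    /* With BUS_DMA_NOWAIT the callback runs synchronously or not at all. */
    static int
    example_load(bus_dma_tag_t tag, bus_dmamap_t map, void *vaddr,
        bus_size_t size, bus_addr_t *addrs)
    {
            return (bus_dmamap_load(tag, map, vaddr, size, example_map_cb,
                addrs, BUS_DMA_NOWAIT));
    }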
*/ - error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, - ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, - ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size), - ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd); + error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0, + BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz, + howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)), + sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL, + &req_queue->dma_tag_ucd); if (error != 0) { ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n", error); @@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, } if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map, - ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) { + ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) { ufshci_printf(ctrlr, "failed to load cmd desc memory\n"); bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd, req_queue->ucdmem_map); @@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, } req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem; - req_queue->ucd_addr = ucdmem_phys; /* * Allocate physical memory for PRDT @@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue, tr->slot_state = UFSHCI_SLOT_STATE_FREE; tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem; - tr->ucd_bus_addr = ucdmem_phys; + tr->ucd_bus_addr = hwq->ucd_bus_addr[i]; ucdmem += sizeof(struct ufshci_utp_cmd_desc); - ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc); hwq->act_tr[i] = tr; } @@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr, req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI, M_ZERO | M_NOWAIT); hwq = &req_queue->hwq[UFSHCI_SDB_Q]; + hwq->num_entries = req_queue->num_entries; + hwq->num_trackers = req_queue->num_trackers; + req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) * + req_queue->num_trackers, + M_UFSHCI, M_ZERO | M_NOWAIT); mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF); @@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr, if (mtx_initialized(&hwq->qlock)) mtx_destroy(&hwq->qlock); + free(req_queue->hwq->ucd_bus_addr, M_UFSHCI); free(req_queue->hwq, M_UFSHCI); } diff --git a/sys/dev/usb/controller/xhci_pci.c b/sys/dev/usb/controller/xhci_pci.c index b50e33ea36ce..d5cfd228a429 100644 --- a/sys/dev/usb/controller/xhci_pci.c +++ b/sys/dev/usb/controller/xhci_pci.c @@ -99,6 +99,11 @@ xhci_pci_match(device_t self) return ("AMD Starship USB 3.0 controller"); case 0x149c1022: return ("AMD Matisse USB 3.0 controller"); + case 0x15b61022: + case 0x15b71022: + return ("AMD Raphael/Granite Ridge USB 3.1 controller"); + case 0x15b81022: + return ("AMD Raphael/Granite Ridge USB 2.0 controller"); case 0x15e01022: case 0x15e11022: return ("AMD Raven USB 3.1 controller"); @@ -109,6 +114,8 @@ xhci_pci_match(device_t self) return ("AMD 300 Series USB 3.1 controller"); case 0x43d51022: return ("AMD 400 Series USB 3.1 controller"); + case 0x43f71022: + return ("AMD 600 Series USB 3.2 controller"); case 0x78121022: case 0x78141022: case 0x79141022: diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 1ffa15dd157b..819debadd1ac 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -351,6 +351,7 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_INJECT_EXCEPTION, 
VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_STAT_DESC, 0), #if defined(__amd64__) && defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c index 64039575c0ad..675c0573bd7e 100644 --- a/sys/dev/vt/hw/vga/vt_vga.c +++ b/sys/dev/vt/hw/vga/vt_vga.c @@ -1347,7 +1347,7 @@ vga_postswitch(struct vt_device *vd) /* Reinit VGA mode, to restore view after app which change mode. */ vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE)); - /* Ask vt(9) to update chars on visible area. */ + /* Ask vt(4) to update chars on visible area. */ vd->vd_flags |= VDF_INVALID; } diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index b0f58b38a6f1..b51ef6766de4 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -125,10 +125,10 @@ static const struct terminal_class vt_termclass = { (vw)->vw_number) static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "vt(9) parameters"); + "vt(4) parameters"); static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)"); static VT_SYSCTL_INT(enable_bell, 0, "Enable bell"); -static VT_SYSCTL_INT(debug, 0, "vt(9) debug level"); +static VT_SYSCTL_INT(debug, 0, "vt(4) debug level"); static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode"); static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend"); diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c index 676ea5de12b8..58a22b8bdc50 100644 --- a/sys/fs/fdescfs/fdesc_vnops.c +++ b/sys/fs/fdescfs/fdesc_vnops.c @@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap) fmp = VFSTOFDESC(ap->a_vp->v_mount); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || @@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap) fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); - while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { + while (uio->uio_resid >= UIO_MX) { + if (i >= fdp->fd_nfiles + 2) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + break; + } bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' 
*/ diff --git a/sys/fs/fuse/fuse_file.h b/sys/fs/fuse/fuse_file.h index 2a90e66d1b23..232132473953 100644 --- a/sys/fs/fuse/fuse_file.h +++ b/sys/fs/fuse/fuse_file.h @@ -139,7 +139,7 @@ struct fuse_filehandle { /* * flags returned by FUSE_OPEN - * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE + * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE, FOPEN_NOFLUSH * Unsupported: * FOPEN_NONSEEKABLE: Adding support would require a new per-file * or per-vnode attribute, which would have to be checked by diff --git a/sys/fs/fuse/fuse_kernel.h b/sys/fs/fuse/fuse_kernel.h index c95caf898ad8..942448b47365 100644 --- a/sys/fs/fuse/fuse_kernel.h +++ b/sys/fs/fuse/fuse_kernel.h @@ -182,6 +182,12 @@ * - add FUSE_OPEN_KILL_SUIDGID * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT * - add FUSE_SETXATTR_ACL_KILL_SGID + * + * 7.34 + * - add FUSE_SYNCFS + * + * 7.35 + * - add FOPEN_NOFLUSH */ #ifndef _FUSE_FUSE_KERNEL_H @@ -217,7 +223,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 33 +#define FUSE_KERNEL_MINOR_VERSION 35 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -288,12 +294,14 @@ struct fuse_file_lock { * FOPEN_NONSEEKABLE: the file is not seekable * FOPEN_CACHE_DIR: allow caching this directory * FOPEN_STREAM: the file is stream-like (no file position at all) + * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) #define FOPEN_NONSEEKABLE (1 << 2) #define FOPEN_CACHE_DIR (1 << 3) #define FOPEN_STREAM (1 << 4) +#define FOPEN_NOFLUSH (1 << 5) /** * INIT request/reply flags @@ -518,6 +526,7 @@ enum fuse_opcode { FUSE_COPY_FILE_RANGE = 47, FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, + FUSE_SYNCFS = 50, #ifdef linux /* CUSE specific operations */ @@ -939,7 +948,8 @@ struct fuse_notify_retrieve_in { }; /* Device ioctls: */ -#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) +#define FUSE_DEV_IOC_MAGIC 229 +#define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) struct fuse_lseek_in { uint64_t fh; @@ -992,4 +1002,8 @@ struct fuse_removemapping_one { #define FUSE_REMOVEMAPPING_MAX_ENTRY \ (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) +struct fuse_syncfs_in { + uint64_t padding; +}; + #endif /* _FUSE_FUSE_KERNEL_H */ diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 107e6db299e0..ae28617537fd 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -89,6 +89,8 @@ #include <sys/buf.h> #include <sys/sysctl.h> #include <sys/vmmeter.h> +#define EXTERR_CATEGORY EXTERR_CAT_FUSE +#include <sys/exterrvar.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -289,6 +291,10 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) if (err) return err; + if (fufh->fuse_open_flags & FOPEN_NOFLUSH && + (!fsess_opt_writeback(vnode_mount(vp)))) + return (0); + fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; @@ -435,7 +441,8 @@ fuse_vnop_access(struct vop_access_args *ap) if (vnode_isvroot(vp)) { return 0; } - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { @@ -444,7 +451,8 @@ fuse_vnop_access(struct vop_access_args *ap) return 0; } } - return EBADF; + return (EXTERROR(EBADF, "Access denied until FUSE session " + "is initialized")); } if (vnode_islnk(vp)) { return 0; @@ -485,7 +493,8 @@ fuse_vnop_advlock(struct 
vop_advlock_args *ap) dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } switch(ap->a_op) { @@ -502,7 +511,7 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) op = FUSE_SETLK; break; default: - return EINVAL; + return (EXTERROR(EINVAL, "Unsupported lock flags")); } if (!(dataflags & FSESS_POSIX_LOCKS)) @@ -530,14 +539,14 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) size = vattr.va_size; if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) { - err = EOVERFLOW; + err = EXTERROR(EOVERFLOW, "Offset is too large"); goto out; } start = size + fl->l_start; break; default: - return (EINVAL); + return (EXTERROR(EINVAL, "Unsupported offset type")); } err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); @@ -599,15 +608,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); switch (vp->v_type) { case VFIFO: return (ESPIPE); case VLNK: case VREG: - if (vfs_isrdonly(mp)) - return (EROFS); break; default: return (ENODEV); @@ -617,7 +625,8 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) - return (EINVAL); + return (EXTERROR(EINVAL, "This server does not implement " + "FUSE_FALLOCATE")); io.uio_offset = *offset; io.uio_resid = *len; @@ -647,13 +656,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); - err = EINVAL; + err = EXTERROR(EINVAL, "This server does not implement " + "FUSE_ALLOCATE"); } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ - err = EINVAL; + err = EXTERROR(EINVAL, "This file can't be pre-allocated"); } else if (!err) { *offset += *len; *len = 0; @@ -699,7 +709,8 @@ fuse_vnop_bmap(struct vop_bmap_args *ap) int maxrun; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } mp = vnode_mount(vp); @@ -866,19 +877,21 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) pid_t pid; int err; - err = ENOSYS; if (mp == NULL || mp != vnode_mount(outvp)) - goto fallback; + return (EXTERROR(ENOSYS, "Mount points do not match")); if (incred->cr_uid != outcred->cr_uid) - goto fallback; + return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " + "support different credentials for infd and outfd")); if (incred->cr_groups[0] != outcred->cr_groups[0]) - goto fallback; + return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " + "support different credentials for infd and outfd")); /* Caller busied mp, mnt_data can be safely accessed. 
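The error-return conversions in these fuse_vnops.c hunks all follow one pattern, enabled by the two lines added at the top of the file: define EXTERR_CATEGORY before including <sys/exterrvar.h>, after which EXTERROR(error, "description") still evaluates to the plain errno value but records a human-readable description that an opted-in process can retrieve. Existing "return (errno)" call sites therefore convert mechanically; a minimal sketch of the convention:

    #include <sys/param.h>

    #define EXTERR_CATEGORY EXTERR_CAT_FUSE /* must precede the include */
    #include <sys/exterrvar.h>

    static int
    example_open_check(bool session_dead)
    {
            if (session_dead)
                    return (EXTERROR(ENXIO, "This FUSE session is about "
                        "to be closed"));
            return (0);
    }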
*/ if (fsess_not_impl(mp, FUSE_COPY_FILE_RANGE)) - goto fallback; + return (EXTERROR(ENOSYS, "This daemon does not " + "implement COPY_FILE_RANGE")); if (ap->a_fsizetd == NULL) td = curthread; @@ -888,7 +901,7 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); if (invp->v_data == NULL || outvp->v_data == NULL) { - err = EBADF; + err = EXTERROR(EBADF, "vnode got reclaimed"); goto unlock; } @@ -952,7 +965,6 @@ unlock: if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_COPY_FILE_RANGE); -fallback: /* * No need to call vn_rlimit_fsizex_res before return, since the uio is @@ -1020,7 +1032,8 @@ fuse_vnop_create(struct vop_create_args *ap) int flags; if (fuse_isdeadfs(dvp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) @@ -1036,7 +1049,7 @@ fuse_vnop_create(struct vop_create_args *ap) bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) - return (EINVAL); + return (EXTERROR(EINVAL, "Only regular files can be created")); if (fsess_not_impl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ @@ -1217,8 +1230,8 @@ fuse_vnop_getattr(struct vop_getattr_args *ap) if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); - err = ENOTCONN; - return err; + return (EXTERROR(ENOTCONN, "FUSE daemon is not " + "initialized")); } else { goto fake; } @@ -1347,10 +1360,11 @@ fuse_vnop_link(struct vop_link_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (vnode_mount(tdvp) != vnode_mount(vp)) { - return EXDEV; + return (EXDEV); } /* @@ -1360,7 +1374,7 @@ fuse_vnop_link(struct vop_link_args *ap) * validating that nlink does not overflow. */ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) - return EMLINK; + return (EMLINK); fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); @@ -1372,12 +1386,13 @@ fuse_vnop_link(struct vop_link_args *ap) feo = fdi.answ; if (fli.oldnodeid != feo->nodeid) { + static const char exterr[] = "Server assigned wrong inode " + "for a hard link."; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); - fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, - "Assigned wrong inode for a hard link."); + fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, exterr); fuse_vnode_clear_attr_cache(vp); fuse_vnode_clear_attr_cache(tdvp); - err = EIO; + err = EXTERROR(EIO, exterr); goto out; } @@ -1454,7 +1469,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) if (fuse_isdeadfs(dvp)) { *vpp = NULL; - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!vnode_isdir(dvp)) return ENOTDIR; @@ -1474,7 +1490,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) * Since the file system doesn't support ".." lookups, * we have no way to find this entry. */ - return ESTALE; + return (EXTERROR(ESTALE, "This server does not support " + "'..' 
lookups")); } nid = VTOFUD(dvp)->parent_nid; if (nid == 0) @@ -1597,11 +1614,11 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) vref(dvp); *vpp = dvp; } else { + static const char exterr[] = "Server assigned " + "same inode to both parent and child."; fuse_warn(fuse_get_mpdata(mp), - FSESS_WARN_ILLEGAL_INODE, - "Assigned same inode to both parent and " - "child."); - err = EIO; + FSESS_WARN_ILLEGAL_INODE, exterr); + err = EXTERROR(EIO, exterr); } } else { @@ -1689,7 +1706,8 @@ fuse_vnop_mkdir(struct vop_mkdir_args *ap) struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_pd->pd_cmask; @@ -1716,7 +1734,8 @@ fuse_vnop_mknod(struct vop_mknod_args *ap) struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); return fuse_internal_mknod(dvp, vpp, cnp, vap); } @@ -1740,11 +1759,13 @@ fuse_vnop_open(struct vop_open_args *ap) pid_t pid = td->td_proc->p_pid; if (fuse_isdeadfs(vp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) - return (EOPNOTSUPP); + return (EXTERROR(EOPNOTSUPP, "Unsupported vnode type", + vp->v_type)); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) - return EINVAL; + return (EXTERROR(EINVAL, "Illegal mode", a_mode)); if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); @@ -1826,7 +1847,8 @@ fuse_vnop_pathconf(struct vop_pathconf_args *ap) return (0); } else if (fsess_not_impl(mp, FUSE_LSEEK)) { /* FUSE_LSEEK is not implemented */ - return (EINVAL); + return (EXTERROR(EINVAL, "This server does not " + "implement FUSE_LSEEK")); } else { return (err); } @@ -1860,7 +1882,8 @@ fuse_vnop_read(struct vop_read_args *ap) MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp)->flag & FN_DIRECTIO) { @@ -1937,10 +1960,11 @@ fuse_vnop_readdir(struct vop_readdir_args *ap) if (ap->a_eofflag) *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (uio_resid(uio) < sizeof(struct dirent)) - return EINVAL; + return (EXTERROR(EINVAL, "Buffer is too small")); tresid = uio->uio_resid; err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); @@ -2010,7 +2034,8 @@ fuse_vnop_readlink(struct vop_readlink_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!vnode_islnk(vp)) { return EINVAL; @@ -2021,10 +2046,11 @@ fuse_vnop_readlink(struct vop_readlink_args *ap) goto out; } if (strnlen(fdi.answ, fdi.iosize) + 1 < fdi.iosize) { + static const char exterr[] = "Server returned an embedded NUL " + "from FUSE_READLINK."; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); - fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, - "Returned an embedded NUL from FUSE_READLINK."); - err = EIO; + fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, exterr); + err = EXTERROR(EIO, exterr); goto out; } if (((char *)fdi.answ)[0] == '/' && @@ -2108,10 +2134,11 @@ fuse_vnop_remove(struct vop_remove_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be 
closed")); } if (vnode_isdir(vp)) { - return EPERM; + return (EXTERROR(EPERM, "vnode is a directory")); } err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); @@ -2144,12 +2171,13 @@ fuse_vnop_rename(struct vop_rename_args *ap) int err = 0; if (fuse_isdeadfs(fdvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); - err = EXDEV; + err = EXTERROR(EXDEV, "Cross-device rename"); goto out; } cache_purge(fvp); @@ -2220,10 +2248,12 @@ fuse_vnop_rmdir(struct vop_rmdir_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp) == VTOFUD(dvp)) { - return EINVAL; + return (EXTERROR(EINVAL, "Directory to be removed " + "contains itself")); } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); @@ -2260,7 +2290,8 @@ fuse_vnop_setattr(struct vop_setattr_args *ap) checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (vap->va_uid != (uid_t)VNOVAL) { @@ -2425,7 +2456,8 @@ fuse_vnop_symlink(struct vop_symlink_args *ap) size_t len; if (fuse_isdeadfs(dvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } /* * Unlike the other creator type calls, here we have to create a message @@ -2471,7 +2503,8 @@ fuse_vnop_write(struct vop_write_args *ap) MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp)->flag & FN_DIRECTIO) { @@ -2624,10 +2657,12 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_GETXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "extended attributes")); err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) @@ -2665,7 +2700,8 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap) if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); - err = EOPNOTSUPP; + err = (EXTERROR(EOPNOTSUPP, "This server does not " + "implement extended attributes")); } goto out; } @@ -2711,10 +2747,12 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_SETXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "setting extended attributes")); if (vfs_isrdonly(mp)) return EROFS; @@ -2726,9 +2764,11 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) * return EOPNOTSUPP. 
*/ if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) - return (EOPNOTSUPP); + return (EXTERROR(EOPNOTSUPP, "This server does not " + "implement removing extended attributess")); else - return (EINVAL); + return (EXTERROR(EINVAL, "DELETEEXTATTR should be used " + "to remove extattrs")); } err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, @@ -2774,7 +2814,8 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_SETXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not implement " + "setting extended attributes"); } if (err == ERESTART) { /* Can't restart after calling uiomove */ @@ -2885,10 +2926,12 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_LISTXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "extended attributes")); err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) @@ -2916,7 +2959,8 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap) if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not " + "implement extended attributes"); } goto out; } @@ -3016,7 +3060,8 @@ fuse_vnop_deallocate(struct vop_deallocate_args *ap) bool closefufh = false; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (vfs_isrdonly(mp)) return (EROFS); @@ -3122,10 +3167,12 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "removing extended attributes")); if (vfs_isrdonly(mp)) return EROFS; @@ -3154,7 +3201,8 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not implement " + "removing extended attributes"); } fdisp_destroy(&fdi); @@ -3208,7 +3256,8 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap) /* NFS requires lookups for "." and ".." 
*/ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH without FUSE_EXPORT_SUPPORT"); - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server is " + "missing FUSE_EXPORT_SUPPORT")); } if ((mp->mnt_flag & MNT_EXPORTED) && fsess_is_impl(mp, FUSE_OPENDIR)) @@ -3226,7 +3275,8 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap) */ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH with FUSE_OPENDIR"); - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server implements " + "FUSE_OPENDIR so is not compatible with getfh")); } err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); @@ -3240,6 +3290,7 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap) if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else - return EOVERFLOW; + return (EXTERROR(EOVERFLOW, "inode generation " + "number overflow")); return (0); } diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c index da4848169173..208b64930e61 100644 --- a/sys/fs/msdosfs/msdosfs_conv.c +++ b/sys/fs/msdosfs/msdosfs_conv.c @@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag, static u_char * dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp) { - u_char c, *outp; - size_t len, olen; + u_char c, *outp, *outp1; + size_t i, len, olen; outp = outbuf; if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { olen = len = 4; + outp1 = outp; if (lower & (LCASE_BASE | LCASE_EXT)) msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen, KICONV_LOWER); else msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen); + for (i = 0; i < outp - outp1; i++) { + if (outp1[i] == '/') + outp1[i] = '?'; + } len -= olen; /* @@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc c = dos2unix[c]; if (lower & (LCASE_BASE | LCASE_EXT)) c = u2l[c]; + if (c == '/') + c = '?'; *outp++ = c; outbuf[1] = '\0'; } diff --git a/sys/fs/msdosfs/msdosfs_lookup.c b/sys/fs/msdosfs/msdosfs_lookup.c index e799a5ce05f6..8ab6d35a2685 100644 --- a/sys/fs/msdosfs/msdosfs_lookup.c +++ b/sys/fs/msdosfs/msdosfs_lookup.c @@ -845,7 +845,6 @@ doscheckpath(struct denode *source, struct denode *target, daddr_t *wait_scn) *wait_scn = 0; pmp = target->de_pmp; - lockmgr_assert(&pmp->pm_checkpath_lock, KA_XLOCKED); KASSERT(pmp == source->de_pmp, ("doscheckpath: source and target on different filesystems")); diff --git a/sys/fs/msdosfs/msdosfs_vfsops.c b/sys/fs/msdosfs/msdosfs_vfsops.c index adcffe45df82..4431d36c8a8e 100644 --- a/sys/fs/msdosfs/msdosfs_vfsops.c +++ b/sys/fs/msdosfs/msdosfs_vfsops.c @@ -575,7 +575,6 @@ mountmsdosfs(struct vnode *odevvp, struct mount *mp) pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); - lockinit(&pmp->pm_checkpath_lock, 0, "msdoscp", 0, 0); TASK_INIT(&pmp->pm_rw2ro_task, 0, msdosfs_remount_ro, pmp); @@ -871,7 +870,6 @@ error_exit: } if (pmp != NULL) { lockdestroy(&pmp->pm_fatlock); - lockdestroy(&pmp->pm_checkpath_lock); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; @@ -971,7 +969,6 @@ msdosfs_unmount(struct mount *mp, int mntflags) dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); lockdestroy(&pmp->pm_fatlock); - lockdestroy(&pmp->pm_checkpath_lock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; return (error); diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index 6417b7dac16b..33e0d94954d7 100644 --- 
a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -945,7 +945,7 @@ msdosfs_rename(struct vop_rename_args *ap) struct denode *fdip, *fip, *tdip, *tip, *nip; u_char toname[12], oldname[11]; u_long to_diroffset; - bool checkpath_locked, doingdirectory, newparent; + bool doingdirectory, newparent; int error; u_long cn, pcl, blkoff; daddr_t bn, wait_scn, scn; @@ -986,8 +986,6 @@ msdosfs_rename(struct vop_rename_args *ap) if (tvp != NULL && tvp != tdvp) VOP_UNLOCK(tvp); - checkpath_locked = false; - relock: doingdirectory = newparent = false; @@ -1108,12 +1106,8 @@ relock: if (doingdirectory && newparent) { if (error != 0) /* write access check above */ goto unlock; - lockmgr(&pmp->pm_checkpath_lock, LK_EXCLUSIVE, NULL); - checkpath_locked = true; error = doscheckpath(fip, tdip, &wait_scn); if (wait_scn != 0) { - lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); - checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); @@ -1276,8 +1270,6 @@ relock: cache_purge(fvp); unlock: - if (checkpath_locked) - lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); vput(fdvp); vput(fvp); if (tvp != NULL) { @@ -1289,7 +1281,6 @@ unlock: vput(tdvp); return (error); releout: - MPASS(!checkpath_locked); vrele(tdvp); if (tvp != NULL) vrele(tvp); @@ -1530,6 +1521,9 @@ msdosfs_readdir(struct vop_readdir_args *ap) ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can @@ -1623,8 +1617,11 @@ msdosfs_readdir(struct vop_readdir_args *ap) on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); - if (diff <= 0) - break; + if (diff <= 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + goto out; + } n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) @@ -1655,6 +1652,8 @@ msdosfs_readdir(struct vop_readdir_args *ap) */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; goto out; } /* @@ -1752,15 +1751,6 @@ out: uio->uio_offset = off; - /* - * Set the eofflag (NFS uses it) - */ - if (ap->a_eofflag) { - if (dep->de_FileSize - (offset - bias) <= 0) - *ap->a_eofflag = 1; - else - *ap->a_eofflag = 0; - } return (error); } @@ -1951,6 +1941,9 @@ msdosfs_pathconf(struct vop_pathconf_args *ap) case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); + case _PC_HAS_HIDDENSYSTEM: + *ap->a_retval = 1; + return (0); default: return (vop_stdpathconf(ap)); } diff --git a/sys/fs/msdosfs/msdosfsmount.h b/sys/fs/msdosfs/msdosfsmount.h index fcaac544a74d..04e6b75bea2a 100644 --- a/sys/fs/msdosfs/msdosfsmount.h +++ b/sys/fs/msdosfs/msdosfsmount.h @@ -118,7 +118,6 @@ struct msdosfsmount { void *pm_u2d; /* Unicode->DOS iconv handle */ void *pm_d2u; /* DOS->Local iconv handle */ struct lock pm_fatlock; /* lockmgr protecting allocations */ - struct lock pm_checkpath_lock; /* protects doscheckpath result */ struct task pm_rw2ro_task; /* context for emergency remount ro */ }; diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index f46b0d282861..a957315aaa12 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -630,6 +630,10 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP); if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, 
NFSATTRBIT_SIZE); + if ((flags & NFSSATTR_FULL) && vap->va_flags != VNOVAL) { + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } if (vap->va_atime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET); if (vap->va_mtime.tv_sec != VNOVAL) @@ -643,7 +647,8 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, NFSATTRBIT_TIMECREATE)) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); (void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, + false, false, false); break; } } @@ -1314,6 +1319,7 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, u_int32_t freenum = 0, tuint; u_int64_t uquad = 0, thyp, thyp2; uint16_t tui16; + long has_pathconf; #ifdef QUOTA struct dqblk dqb; uid_t savuid; @@ -1421,6 +1427,16 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL); NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT); } + /* Some filesystems do not support uf_hidden */ + if (vp == NULL || VOP_PATHCONF(vp, + _PC_HAS_HIDDENSYSTEM, &has_pathconf) != 0) + has_pathconf = 0; + if (has_pathconf == 0) { + NFSCLRBIT_ATTRBIT(&checkattrbits, + NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&checkattrbits, + NFSATTRBIT_SYSTEM); + } if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits) || retnotsup) *retcmpp = NFSERR_NOTSAME; @@ -1521,15 +1537,13 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { - long has_named_attr; - if (vp == NULL || VOP_PATHCONF(vp, - _PC_HAS_NAMEDATTR, &has_named_attr) + _PC_HAS_NAMEDATTR, &has_pathconf) != 0) - has_named_attr = 0; - if ((has_named_attr != 0 && + has_pathconf = 0; + if ((has_pathconf != 0 && *tl != newnfs_true) || - (has_named_attr == 0 && + (has_pathconf == 0 && *tl != newnfs_false)) *retcmpp = NFSERR_NOTSAME; } @@ -1792,9 +1806,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, free(cp2, M_NFSSTRING); break; case NFSATTRBIT_HIDDEN: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (compare && !(*retcmpp)) - *retcmpp = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (compare) { + if (!(*retcmpp) && ((*tl == newnfs_true && + (nap->na_flags & UF_HIDDEN) == 0) || + (*tl == newnfs_false && + (nap->na_flags & UF_HIDDEN) != 0))) + *retcmpp = NFSERR_NOTSAME; + } else if (nap != NULL) { + if (*tl == newnfs_true) + nap->na_flags |= UF_HIDDEN; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HOMOGENEOUS: @@ -2166,9 +2188,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, attrsum += NFSX_HYPER; break; case NFSATTRBIT_SYSTEM: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (compare && !(*retcmpp)) - *retcmpp = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (compare) { + if (!(*retcmpp) && ((*tl == newnfs_true && + (nap->na_flags & UF_SYSTEM) == 0) || + (*tl == newnfs_false && + (nap->na_flags & UF_SYSTEM) != 0))) + *retcmpp = NFSERR_NOTSAME; + } else if (nap != NULL) { + if (*tl == newnfs_true) + nap->na_flags |= UF_SYSTEM; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESS: @@ -2617,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, 
uint64_t mounted_on_fileno, - struct statfs *pnfssf) + struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem, + bool has_namedattr) { int bitpos, retnum = 0; u_int32_t *tl; @@ -2631,10 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, struct nfsfsinfo fsinf; struct timespec temptime; NFSACL_T *aclp, *naclp = NULL; - size_t atsiz; - bool xattrsupp; short irflag; - long has_named_attr; #ifdef QUOTA struct dqblk dqb; uid_t savuid; @@ -2718,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, } } - /* Check to see if Extended Attributes are supported. */ - xattrsupp = false; - if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) { - if (NFSVOPLOCK(vp, LK_SHARED) == 0) { - error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, - "xxx", NULL, &atsiz, cred, p); - NFSVOPUNLOCK(vp); - if (error != EOPNOTSUPP) - xattrsupp = true; - } - } - /* * Put out the attribute bitmap for the ones being filled in * and get the field for the number of attributes returned. @@ -2751,6 +2767,10 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT); NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL); } + if (!has_hiddensystem) { + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } retnum += nfsrv_putattrbit(nd, &attrbits); break; case NFSATTRBIT_TYPE: @@ -2791,10 +2811,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, break; case NFSATTRBIT_NAMEDATTR: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); - if (VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, &has_named_attr) - != 0) - has_named_attr = 0; - if (has_named_attr != 0) + if (has_namedattr) *tl = newnfs_true; else *tl = newnfs_false; @@ -2899,6 +2916,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, *tl = 0; retnum += 2 * NFSX_UNSIGNED; break; + case NFSATTRBIT_HIDDEN: + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + if ((vap->va_flags & UF_HIDDEN) != 0) + *tl = newnfs_true; + else + *tl = newnfs_false; + retnum += NFSX_UNSIGNED; + break; case NFSATTRBIT_HOMOGENEOUS: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS) @@ -3088,6 +3113,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, txdr_hyper(vap->va_bytes, tl); retnum += NFSX_HYPER; break; + case NFSATTRBIT_SYSTEM: + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + if ((vap->va_flags & UF_SYSTEM) != 0) + *tl = newnfs_true; + else + *tl = newnfs_false; + retnum += NFSX_UNSIGNED; + break; case NFSATTRBIT_TIMEACCESS: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_atime, tl); diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 3b6c1ec90c06..54f60a753c50 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -395,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *); void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int, struct nfsvattr *); int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *, - struct vattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *); + struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *, + NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool, + bool); void nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *); struct mbuf *nfsrv_adj(struct mbuf *, int, int); void nfsrv_postopattr(struct nfsrv_descript *, 
int, struct nfsvattr *); @@ -735,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *, NFSPROC_T *); int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t); + struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool, + bool); int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, NFSACL_T *, NFSPROC_T *); int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h index eff53e1a384e..cb5a80e8df73 100644 --- a/sys/fs/nfs/nfsproto.h +++ b/sys/fs/nfs/nfsproto.h @@ -1142,6 +1142,7 @@ struct nfsv3_sattr { NFSATTRBM_FILESFREE | \ NFSATTRBM_FILESTOTAL | \ NFSATTRBM_FSLOCATIONS | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_HOMOGENEOUS | \ NFSATTRBM_MAXFILESIZE | \ NFSATTRBM_MAXLINK | \ @@ -1163,6 +1164,7 @@ struct nfsv3_sattr { NFSATTRBM_SPACEFREE | \ NFSATTRBM_SPACETOTAL | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEDELTA | \ @@ -1210,11 +1212,13 @@ struct nfsv3_sattr { */ #define NFSATTRBIT_SETABLE0 \ (NFSATTRBM_SIZE | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_ACL) #define NFSATTRBIT_SETABLE1 \ (NFSATTRBM_MODE | \ NFSATTRBM_OWNER | \ NFSATTRBM_OWNERGROUP | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEACCESSSET | \ NFSATTRBM_TIMEMODIFYSET) @@ -1254,6 +1258,7 @@ struct nfsv3_sattr { NFSATTRBM_SIZE | \ NFSATTRBM_FSID | \ NFSATTRBM_FILEID | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_MAXREAD) /* @@ -1266,6 +1271,7 @@ struct nfsv3_sattr { NFSATTRBM_OWNERGROUP | \ NFSATTRBM_RAWDEV | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEMETADATA | \ @@ -1288,6 +1294,7 @@ struct nfsv3_sattr { NFSATTRBM_SIZE | \ NFSATTRBM_FSID | \ NFSATTRBM_FILEID | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_MAXREAD) /* @@ -1298,6 +1305,7 @@ struct nfsv3_sattr { NFSATTRBM_NUMLINKS | \ NFSATTRBM_RAWDEV | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEMETADATA | \ diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index c07da6f9275f..2f3c59b68518 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -4158,6 +4158,13 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_TIMECREATE)) NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); + if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, + NFSATTRBIT_HIDDEN) || + !NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, + NFSATTRBIT_SYSTEM)) { + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } } /* @@ -5429,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p, NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false, + false); error = nfscl_request(nd, vp, p, cred); if (error) return (error); diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index 1ae5ed1a75ca..99a781640c53 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -3701,7 +3701,7 @@ 
nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p) if (!error) (void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va, NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0, - (uint64_t)0, NULL); + (uint64_t)0, NULL, false, false, false); break; case NFSV4OP_CBRECALL: NFSCL_DEBUG(4, "cbrecall\n"); diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 0049d7edca33..fa451887e73e 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -1074,21 +1074,29 @@ nfs_setattr(struct vop_setattr_args *ap) int error = 0; u_quad_t tsize; struct timespec ts; + struct nfsmount *nmp; #ifndef nolint tsize = (u_quad_t)0; #endif /* - * Setting of flags and marking of atimes are not supported. + * Only setting of UF_HIDDEN and UF_SYSTEM is supported, and + * only for NFSv4 servers that support them. */ - if (vap->va_flags != VNOVAL) + nmp = VFSTONFS(vp->v_mount); + if (vap->va_flags != VNOVAL && (!NFSHASNFSV4(nmp) || + (vap->va_flags & ~(UF_HIDDEN | UF_SYSTEM)) != 0 || + ((vap->va_flags & UF_HIDDEN) != 0 && + !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_HIDDEN)) || + ((vap->va_flags & UF_SYSTEM) != 0 && + !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_SYSTEM)))) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ - if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || + if ((vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL || @@ -4754,6 +4762,15 @@ nfs_pathconf(struct vop_pathconf_args *ap) else *ap->a_retval = 0; break; + case _PC_HAS_HIDDENSYSTEM: + if (NFS_ISV4(vp) && NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_HIDDEN) && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_SYSTEM)) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + break; default: error = vop_stdpathconf(ap); diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 3bf54d82b959..4f0d5946d6b9 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -449,6 +449,7 @@ nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, } nvap->na_bsdflags = 0; + nvap->na_flags = 0; error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred); if (lockedit != 0) NFSVOPUNLOCK(vp); @@ -1651,10 +1652,11 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, } if (fvp == tvp) { /* - * If source and destination are the same, there is nothing to - * do. Set error to -1 to indicate this. + * If source and destination are the same, there is + * nothing to do. Set error to EJUSTRETURN to indicate + * this.
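
The nfs_clvnops.c hunk above is the client-side contract for the new flag support: chflags(2) is forwarded only on NFSv4 mounts whose server advertises the hidden/system attributes, and the new _PC_HAS_HIDDENSYSTEM pathconf name (defined elsewhere in this series) lets applications probe before trying. A minimal userland sketch of that probe-then-set flow; the path and helper name are illustrative, not from the patch:

    #include <sys/stat.h>
    #include <errno.h>
    #include <unistd.h>

    /* Set UF_HIDDEN only where the filesystem reports hidden/system
     * flag support; on an NFSv4 mount this reflects the server's
     * supported-attribute bitmap per the nfs_pathconf() case above. */
    static int
    set_hidden(const char *path)
    {
            struct stat sb;
            long has;

            errno = 0;
            has = pathconf(path, _PC_HAS_HIDDENSYSTEM);
            if (has <= 0) {
                    if (errno == 0)
                            errno = EOPNOTSUPP;     /* fs says: not here */
                    return (-1);
            }
            if (stat(path, &sb) == -1)
                    return (-1);
            /* chflags(2) replaces the whole flag word; preserve old bits. */
            return (chflags(path, sb.st_flags | UF_HIDDEN));
    }

Skipping the probe would simply earn EOPNOTSUPP from the nfs_setattr() check above.
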
*/ - error = -1; + error = EJUSTRETURN; goto out; } if (nd->nd_flag & ND_NFSV4) { @@ -1696,10 +1698,26 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, " dsdvp=%p\n", dsdvp[0]); } out: - if (!error) { + mp = NULL; + if (error == 0) { + error = VOP_GETWRITEMOUNT(tondp->ni_dvp, &mp); + if (error == 0) { + if (mp == NULL) { + error = ENOENT; + } else { + error = lockmgr(&mp->mnt_renamelock, + LK_EXCLUSIVE | LK_NOWAIT, NULL); + if (error != 0) + error = ERELOOKUP; + } + } + } + if (error == 0) { error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp, &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp, &tondp->ni_cnd); + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(mp); } else { if (tdvp == tvp) vrele(tdvp); @@ -1709,8 +1727,13 @@ out: vput(tvp); vrele(fromndp->ni_dvp); vrele(fvp); - if (error == -1) + if (error == EJUSTRETURN) { error = 0; + } else if (error == ERELOOKUP && mp != NULL) { + lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0); + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(mp); + } } /* @@ -2089,7 +2112,8 @@ int nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, int isdgram, int reterr, - int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno) + int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, + bool xattrsupp, bool has_hiddensystem, bool has_namedattr) { struct statfs *sf; int error; @@ -2108,7 +2132,7 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, } error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror, attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root, - mounted_on_fileno, sf); + mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr); free(sf, M_TEMP); NFSEXITCODE2(0, nd); return (error); @@ -2425,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, struct nfsvattr nva, at, *nvap = &nva; struct mbuf *mb0, *mb1; struct nfsreferral *refp; - int nlen, r, error = 0, getret = 1, usevget = 1; + int nlen, r, error = 0, getret = 1, ret, usevget = 1; int siz, cnt, fullsiz, eofflag, ncookies, entrycnt; caddr_t bpos0, bpos1; u_int64_t off, toff, verf __unused; @@ -2439,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno; struct thread *p = curthread; int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); @@ -2913,9 +2940,32 @@ again: *tl++ = newnfs_true; txdr_hyper(*cookiep, tl); dirlen += nfsm_strtom(nd, dp->d_name, nlen); + xattrsupp = false; + has_hiddensystem = false; + has_namedattr = false; if (nvp != NULL) { supports_nfsv4acls = nfs_supportsnfsv4acls(nvp); + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(nvp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, + nd->nd_cred, p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(nvp, + _PC_HAS_HIDDENSYSTEM, &pathval) != + 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; NFSVOPUNLOCK(nvp); } else supports_nfsv4acls = 0; @@ -2935,13 +2985,15 @@ again: nvp, nvap, &nfh, r, &rderrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - 
mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } else { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &attrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } if (nvp != NULL) vrele(nvp); @@ -3127,6 +3179,9 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, bitpos = NFSATTRBIT_MAX; } else { bitpos = 0; + if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_HIDDEN) || + NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SYSTEM)) + nvap->na_flags = 0; } moderet = 0; for (; bitpos < NFSATTRBIT_MAX; bitpos++) { @@ -3163,9 +3218,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HIDDEN: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (!nd->nd_repstat) - nd->nd_repstat = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (nd->nd_repstat == 0) { + if (*tl == newnfs_true) + nvap->na_flags |= UF_HIDDEN; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MIMETYPE: @@ -3240,9 +3297,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_SYSTEM: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (!nd->nd_repstat) - nd->nd_repstat = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (nd->nd_repstat == 0) { + if (*tl == newnfs_true) + nvap->na_flags |= UF_SYSTEM; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESSSET: @@ -6326,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, * the same type (VREG). */ nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL, - NULL, 0, 0, 0, 0, 0, NULL); + NULL, 0, 0, 0, 0, 0, NULL, false, false, false); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index 4e15d55eb312..9eebcda548c6 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -241,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, { struct nfsvattr nva; fhandle_t fh; - int at_root = 0, error = 0, supports_nfsv4acls; + int at_root = 0, error = 0, ret, supports_nfsv4acls; struct nfsreferral *refp; nfsattrbit_t attrbits, tmpbits; struct mount *mp; @@ -250,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno = 0; accmode_t accmode; struct thread *p = curthread; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) goto out; @@ -307,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, &nva, &attrbits, p); if (nd->nd_repstat == 0) { supports_nfsv4acls = nfs_supportsnfsv4acls(vp); + xattrsupp = false; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(vp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, nd->nd_cred, + p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM, + &pathval) != 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; mp = vp->v_mount; if (nfsrv_enable_crossmntpt != 0 && vp->v_type == VDIR && @@ -340,7 +363,9 @@ 
nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, (void)nfsvno_fillattr(nd, mp, vp, &nva, &fh, 0, &attrbits, nd->nd_cred, p, isdgram, 1, supports_nfsv4acls, - at_root, mounted_on_fileno); + at_root, mounted_on_fileno, + xattrsupp, has_hiddensystem, + has_namedattr); vfs_unbusy(mp); } vrele(vp); @@ -403,8 +428,10 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, if (error) goto nfsmout; - /* For NFSv4, only va_uid is used from nva2. */ + /* For NFSv4, only va_uid and va_flags are used from nva2. */ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNER); + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN); + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM); preat_ret = nfsvno_getattr(vp, &nva2, nd, p, 1, &retbits); if (!nd->nd_repstat) nd->nd_repstat = preat_ret; @@ -463,6 +490,9 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, &nva, &attrbits, exp, p); if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) { + u_long oldflags; + + oldflags = nva2.na_flags; /* * For V4, try setting the attributes in sets, so that the * reply bitmap will be correct for an error case. @@ -532,6 +562,32 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_MODESETMASKED); } } + if (!nd->nd_repstat && + (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN) || + NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM))) { + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN)) { + if ((nva.na_flags & UF_HIDDEN) != 0) + oldflags |= UF_HIDDEN; + else + oldflags &= ~UF_HIDDEN; + } + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM)) { + if ((nva.na_flags & UF_SYSTEM) != 0) + oldflags |= UF_SYSTEM; + else + oldflags &= ~UF_SYSTEM; + } + NFSVNO_ATTRINIT(&nva2); + NFSVNO_SETATTRVAL(&nva2, flags, oldflags); + nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, + exp); + if (!nd->nd_repstat) { + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN)) + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN); + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM)) + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM); + } + } #ifdef NFS4_ACL_EXTATTR_NAME if (!nd->nd_repstat && aclp->acl_cnt > 0 && @@ -4322,9 +4378,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, int error = 0; NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN | - NOFOLLOW); + NOFOLLOW | LOCKLEAF); cn.cn_nameptr = "."; cn.cn_namelen = 1; + cn.cn_lkflags = LK_SHARED; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) cn.cn_flags |= CREATENAMED; @@ -4343,6 +4400,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, if (nd->nd_repstat == ENOATTR) nd->nd_repstat = NFSERR_NOENT; } + if (nd->nd_repstat == 0) + NFSVOPUNLOCK(*vpp); vput(dp); NFSEXITCODE2(0, nd); diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c index 0356877eaf05..7dcc83880bb9 100644 --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -245,6 +245,10 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) vp->v_object = lowervp->v_object; vn_irflag_set(vp, VIRF_PGREAD); } + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0) + vn_irflag_set(vp, VIRF_INOTIFY); + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0) + vn_irflag_set(vp, VIRF_INOTIFY_PARENT); if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp) vp->v_vflag |= VV_ROOT; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index 8608216e10e5..74c1a8f3acb6 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -190,6 +190,26 @@
SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); /* + * Synchronize inotify flags with the lower vnode: + * - If the upper vnode has the flag set and the lower does not, then the lower + * vnode is unwatched and the upper vnode does not need to go through + * VOP_INOTIFY. + * - If the lower vnode is watched, then the upper vnode should go through + * VOP_INOTIFY, so copy the flag up. + */ +static void +null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag) +{ + if ((vn_irflag_read(vp) & flag) != 0) { + if (__predict_false((vn_irflag_read(lvp) & flag) == 0)) + vn_irflag_unset(vp, flag); + } else if ((vn_irflag_read(lvp) & flag) != 0) { + if (__predict_false((vn_irflag_read(vp) & flag) == 0)) + vn_irflag_set(vp, flag); + } +} + +/* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some * safety checks. It should still always work, but it's not as @@ -305,7 +325,10 @@ null_bypass(struct vop_generic_args *ap) lvp = *(vps_p[i]); /* - * Get rid of the transient hold on lvp. + * Get rid of the transient hold on lvp. Copy inotify + * flags up in case something is watching the lower + * layer. + * * If lowervp was unlocked during VOP * operation, nullfs upper vnode could have * been reclaimed, which changes its v_vnlock @@ -314,6 +337,10 @@ null_bypass(struct vop_generic_args *ap) * upper (reclaimed) vnode. */ if (lvp != NULLVP) { + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY); + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY_PARENT); if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE && old_vps[i]->v_vnlock != lvp->v_vnlock) { VOP_UNLOCK(lvp); diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c index 56bf766ef801..227e2b93883e 100644 --- a/sys/fs/p9fs/p9fs_vnops.c +++ b/sys/fs/p9fs/p9fs_vnops.c @@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap) return (EBADF); } + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK); /* We haven't reached the end yet. read more. 
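
The p9fs_readdir() change above (and several like it across this commit range) pins down the VOP_READDIR eofflag contract: initialize *a_eofflag to 0 up front, and set it to 1 only once the backend yields no further entries. A toy sketch of the pattern, with fetch_entries() as a hypothetical stand-in for p9_client_readdir():

    /* Toy VOP_READDIR illustrating the a_eofflag contract; not code
     * from this patch.  fetch_entries() is a hypothetical helper. */
    static int fetch_entries(struct vop_readdir_args *ap);

    static int
    toy_readdir(struct vop_readdir_args *ap)
    {
            int nread;

            if (ap->a_eofflag != NULL)
                    *ap->a_eofflag = 0;     /* default: more remains */
            while ((nread = fetch_entries(ap)) > 0)
                    continue;               /* keep copying out entries */
            if (nread < 0)
                    return (EIO);
            if (ap->a_eofflag != NULL)
                    *ap->a_eofflag = 1;     /* zero entries: genuine EOF */
            return (0);
    }

A real implementation also returns early when the caller's uio fills up before EOF, which is exactly the case the pre-cleared flag covers.
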
*/ @@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap) count = p9_client_readdir(vofid, (char *)io_buffer, diroffset, count); - if (count == 0) + if (count == 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; break; + } if (count < 0) { error = EIO; diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c index c30995508c00..5d412cabadb8 100644 --- a/sys/fs/smbfs/smbfs_vnops.c +++ b/sys/fs/smbfs/smbfs_vnops.c @@ -810,6 +810,9 @@ smbfs_pathconf(struct vop_pathconf_args *ap) case _PC_NO_TRUNC: *retval = 1; break; + case _PC_HAS_HIDDENSYSTEM: + *retval = 1; + break; default: error = vop_stdpathconf(ap); } diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index c99d0732be50..9d2a587b177a 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -1691,6 +1691,10 @@ tmpfs_pathconf(struct vop_pathconf_args *v) *retval = PAGE_SIZE; break; + case _PC_HAS_HIDDENSYSTEM: + *retval = 1; + break; + default: error = vop_stdpathconf(v); } diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h index 839bbec08254..19e114763cac 100644 --- a/sys/fs/udf/ecma167-udf.h +++ b/sys/fs/udf/ecma167-udf.h @@ -243,7 +243,7 @@ struct part_map_spare { uint8_t n_st; /* Number of Sparing Tables */ uint8_t reserved1; uint32_t st_size; - uint32_t st_loc[1]; + uint32_t st_loc[]; } __packed; union udf_pmap { @@ -266,7 +266,7 @@ struct udf_sparing_table { uint16_t rt_l; /* Relocation Table len */ uint8_t reserved[2]; uint32_t seq_num; - struct spare_map_entry entries[1]; + struct spare_map_entry entries[]; } __packed; /* Partition Descriptor [3/10.5] */ diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c index c7438147c0a0..c5ef1f686093 100644 --- a/sys/fs/udf/udf_vfsops.c +++ b/sys/fs/udf/udf_vfsops.c @@ -81,6 +81,7 @@ #include <sys/fcntl.h> #include <sys/iconv.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/namei.h> @@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) struct ifid *ifhp; struct vnode *nvp; struct udf_node *np; - off_t fsize; + uint64_t fsize; int error; ifhp = (struct ifid *)fhp; @@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) np = VTON(nvp); fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) { + *vpp = NULLVP; + return (EIO); + } *vpp = nvp; vnode_create_vobject(*vpp, fsize, curthread); diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index 88bf4917a851..37889241e8c3 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -39,6 +39,7 @@ #include <sys/conf.h> #include <sys/buf.h> #include <sys/iconv.h> +#include <sys/limits.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/dirent.h> @@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a) } static int -udf_open(struct vop_open_args *ap) { +udf_open(struct vop_open_args *ap) +{ struct udf_node *np = VTON(ap->a_vp); - off_t fsize; + uint64_t fsize; fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) + return (EIO); vnode_create_vobject(ap->a_vp, fsize, ap->a_td); return 0; } @@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a) * that directories consume at least one logical block, * make it appear so. 
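
The udf hunks here share one theme: fentry->inf_len is a 64-bit quantity read straight from the medium, so it must be range-checked before being narrowed to off_t (udf_fhtovp(), udf_open()) or int (udf_readdir(), udf_lookup(), below). A condensed sketch of the check, assuming only what the hunks themselves show; the helper name is illustrative:

    #include <sys/limits.h>

    /* Reject on-disk lengths the kernel cannot represent; this is the
     * EIO policy the udf_open()/udf_fhtovp() hunks adopt, while the
     * directory paths below clamp to INT_MAX instead of failing. */
    static int
    udf_checked_size(uint64_t inf_len, off_t *outp)
    {
            if (inf_len > OFF_MAX)
                    return (EIO);   /* corrupt or hostile metadata */
            *outp = (off_t)inf_len;
            return (0);
    }
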
*/ - if (fentry->logblks_rec != 0) { - vap->va_size = - le64toh(fentry->logblks_rec) * node->udfmp->bsize; - } else { + vap->va_size = le64toh(fentry->logblks_rec); + if (vap->va_size == 0) vap->va_size = node->udfmp->bsize; - } + else if (vap->va_size > UINT64_MAX / node->udfmp->bsize) + vap->va_size = UINT64_MAX; + else + vap->va_size *= node->udfmp->bsize; } else { vap->va_size = le64toh(fentry->inf_len); } @@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap) struct buf *bp; uint8_t *data; daddr_t lbn, rablock; + uint64_t len; off_t diff, fsize; ssize_t n; int error = 0; @@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap) return (error); } - fsize = le64toh(node->fentry->inf_len); + len = le64toh(node->fentry->inf_len); + if (len > OFF_MAX) { + /* too big, just cap to the requested length */ + len = uio->uio_resid; + } + fsize = len; udfmp = node->udfmp; do { lbn = lblkno(udfmp, uio->uio_offset); @@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a) struct udf_uiodir uiodir; struct udf_dirstream *ds; uint64_t *cookies = NULL; + uint64_t len; int ncookies; int error = 0; @@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a) * Iterate through the file id descriptors. Give the parent dir * entry special attention. */ - ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len), - node->udfmp); + len = le64toh(node->fentry->inf_len); + if (len > INT_MAX) { + /* too big, just cap to INT_MAX */ + len = INT_MAX; + } + ds = udf_opendir(node, uio->uio_offset, len, node->udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ @@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap) struct udf_node *node; void *buf; char *cp; - int error, len, root; + uint64_t len; + int error, root; /* * A symbolic link in UDF is a list of variable-length path @@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap) vp = ap->a_vp; node = VTON(vp); len = le64toh(node->fentry->inf_len); + if (len > MAXPATHLEN) + return (EIO); buf = malloc(len, M_DEVBUF, M_WAITOK); iov[0].iov_len = len; iov[0].iov_base = buf; @@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a) struct udf_mnt *udfmp; struct fileid_desc *fid = NULL; struct udf_dirstream *ds; + uint64_t fsize; u_long nameiop; u_long flags; char *nameptr; long namelen; ino_t id = 0; int offset, error = 0; - int fsize, lkflags, ltype, numdirpasses; + int lkflags, ltype, numdirpasses; dvp = a->a_dvp; node = VTON(dvp); @@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a) nameptr = a->a_cnp->cn_nameptr; namelen = a->a_cnp->cn_namelen; fsize = le64toh(node->fentry->inf_len); + if (fsize > INT_MAX) { + /* too big, just cap to INT_MAX */ + fsize = INT_MAX; + } /* * If this is a LOOKUP and we've already partially searched through diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index e7d460af21d4..f577cd07ac7c 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -17,6 +17,8 @@ # in NOTES. # +#NO_UNIVERSE + cpu I486_CPU cpu I586_CPU cpu I686_CPU diff --git a/sys/i386/conf/GENERIC-NODEBUG b/sys/i386/conf/GENERIC-NODEBUG index ea07613a796f..a93304481b5f 100644 --- a/sys/i386/conf/GENERIC-NODEBUG +++ b/sys/i386/conf/GENERIC-NODEBUG @@ -25,6 +25,8 @@ # in NOTES. 
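
Stepping back to the ecma167-udf.h hunk earlier: replacing the one-element trailing arrays (st_loc[1], entries[1]) with C99 flexible array members changes sizeof and lets compilers and bounds checkers police accesses properly. A small illustration with made-up structure names:

    #include <stdlib.h>

    struct entry { unsigned int loc; };
    struct table {
            unsigned short n;
            struct entry entries[]; /* was: entries[1], the old struct hack */
    };

    /* With a flexible array member, sizeof(struct table) counts only
     * the header, so the allocation size is computed without the
     * off-by-one slack the [1] placeholder used to introduce. */
    static struct table *
    table_alloc(size_t n)
    {
            return (malloc(sizeof(struct table) + n * sizeof(struct entry)));
    }
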
# +#NO_UNIVERSE + include GENERIC include "std.nodebug" diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT index 41207eb63cb9..2e947202f723 100644 --- a/sys/i386/conf/LINT +++ b/sys/i386/conf/LINT @@ -1,3 +1,4 @@ +#NO_UNIVERSE include "../../conf/NOTES" include "../../x86/conf/NOTES" diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL index 2a06eb84bff8..8019617ca4d4 100644 --- a/sys/i386/conf/MINIMAL +++ b/sys/i386/conf/MINIMAL @@ -31,6 +31,8 @@ # in NOTES. # +#NO_UNIVERSE + cpu I486_CPU cpu I586_CPU cpu I686_CPU diff --git a/sys/i386/conf/PAE b/sys/i386/conf/PAE index a39d32d77106..72af9e9a9eec 100644 --- a/sys/i386/conf/PAE +++ b/sys/i386/conf/PAE @@ -2,6 +2,8 @@ # PAE -- Generic kernel configuration file for FreeBSD/i386 PAE # +#NO_UNIVERSE + include GENERIC ident PAE-GENERIC diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 465b4d0f365b..b44f5e08bbcf 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void) #ifdef PMAP_PAE_COMP static void * -pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, - int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp, + int flags) { /* Inform UMA that this allocator uses kernel_map/object. */ - *flags = UMA_SLAB_KERNEL; + *sflagsp = UMA_SLAB_KERNEL; + /* contig allocations cannot be NEVERFREED */ + flags &= ~M_NEVERFREED; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), - bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); + bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif @@ -5617,6 +5619,8 @@ __CONCAT(PMTYPE, unmapdev)(void *p, vm_size_t size) static void __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) { + if (m->md.pat_mode == ma) + return; m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) diff --git a/sys/i386/linux/linux_proto.h b/sys/i386/linux/linux_proto.h index aa2dfbb68745..49f002a633d2 100644 --- a/sys/i386/linux/linux_proto.h +++ b/sys/i386/linux/linux_proto.h @@ -981,10 +981,13 @@ struct linux_inotify_init_args { syscallarg_t dummy; }; struct linux_inotify_add_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)]; + char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)]; }; struct linux_inotify_rm_watch_args { - syscallarg_t dummy; + char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)]; + char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)]; }; struct linux_migrate_pages_args { syscallarg_t dummy; @@ -1178,7 +1181,7 @@ struct linux_pipe2_args { char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; }; struct linux_inotify_init1_args { - syscallarg_t dummy; + char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; }; struct linux_preadv_args { char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)]; diff --git a/sys/i386/linux/linux_sysent.c b/sys/i386/linux/linux_sysent.c index 7be646f34144..b8893008944b 100644 --- a/sys/i386/linux/linux_sysent.c +++ b/sys/i386/linux/linux_sysent.c @@ -306,8 +306,8 @@ struct sysent linux_sysent[] = { { .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 289 = linux_ioprio_set */ { .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t 
*)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 290 = linux_ioprio_get */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 291 = linux_inotify_init */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */ + { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */ + { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */ { .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 294 = linux_migrate_pages */ { .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 295 = linux_openat */ { .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 296 = linux_mkdirat */ @@ -346,7 +346,7 @@ struct sysent linux_sysent[] = { { .sy_narg = AS(linux_epoll_create1_args), .sy_call = (sy_call_t *)linux_epoll_create1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 329 = linux_epoll_create1 */ { .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 330 = linux_dup3 */ { .sy_narg = AS(linux_pipe2_args), .sy_call = (sy_call_t *)linux_pipe2, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 331 = linux_pipe2 */ - { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */ + { .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */ { .sy_narg = AS(linux_preadv_args), .sy_call = (sy_call_t *)linux_preadv, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 333 = linux_preadv */ { .sy_narg = AS(linux_pwritev_args), .sy_call = (sy_call_t *)linux_pwritev, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 334 = linux_pwritev */ { .sy_narg = AS(linux_rt_tgsigqueueinfo_args), .sy_call = (sy_call_t *)linux_rt_tgsigqueueinfo, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 335 = linux_rt_tgsigqueueinfo */ diff --git a/sys/i386/linux/linux_systrace_args.c b/sys/i386/linux/linux_systrace_args.c index f3e3c32a2bbf..563d1a795ae1 100644 --- a/sys/i386/linux/linux_systrace_args.c +++ b/sys/i386/linux/linux_systrace_args.c @@ -2071,12 +2071,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_add_watch */ case 292: { - *n_args = 0; + struct linux_inotify_add_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = (intptr_t)p->pathname; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + 
*n_args = 3; break; } /* linux_inotify_rm_watch */ case 293: { - *n_args = 0; + struct linux_inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* l_int */ + uarg[a++] = p->wd; /* uint32_t */ + *n_args = 2; break; } /* linux_migrate_pages */ @@ -2410,7 +2417,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) } /* linux_inotify_init1 */ case 332: { - *n_args = 0; + struct linux_inotify_init1_args *p = params; + iarg[a++] = p->flags; /* l_int */ + *n_args = 1; break; } /* linux_preadv */ @@ -6604,9 +6613,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_add_watch */ case 292: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_inotify_rm_watch */ case 293: + switch (ndx) { + case 0: + p = "l_int"; + break; + case 1: + p = "uint32_t"; + break; + default: + break; + }; break; /* linux_migrate_pages */ case 294: @@ -7172,6 +7204,13 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_init1 */ case 332: + switch (ndx) { + case 0: + p = "l_int"; + break; + default: + break; + }; break; /* linux_preadv */ case 333: @@ -9889,8 +9928,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) case 291: /* linux_inotify_add_watch */ case 292: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_inotify_rm_watch */ case 293: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_migrate_pages */ case 294: /* linux_openat */ @@ -10062,6 +10107,9 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; /* linux_inotify_init1 */ case 332: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* linux_preadv */ case 333: if (ndx == 0 || ndx == 1) diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master index 958336be0f08..2113ea51ac5d 100644 --- a/sys/i386/linux/syscalls.master +++ b/sys/i386/linux/syscalls.master @@ -1605,10 +1605,17 @@ int linux_inotify_init(void); } 292 AUE_NULL STD { - int linux_inotify_add_watch(void); + int linux_inotify_add_watch( + l_int fd, + const char *pathname, + uint32_t mask + ); } 293 AUE_NULL STD { - int linux_inotify_rm_watch(void); + int linux_inotify_rm_watch( + l_int fd, + uint32_t wd + ); } ; Linux 2.6.16: 294 AUE_NULL STD { @@ -1872,7 +1879,9 @@ ); } 332 AUE_NULL STD { - int linux_inotify_init1(void); + int linux_inotify_init1( + l_int flags + ); } ; Linux 2.6.30: 333 AUE_NULL STD { diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index a48a513aa3b5..91792430d24c 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -658,5 +658,7 @@ struct sysent sysent[] = { { .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */ { .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */ { .sy_narg = AS(setcred_args), .sy_call = (sy_call_t *)sys_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = setcred */ - { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent 
= AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */ + { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ }; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index ac4b6ac3f457..a27ab33b34da 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -38,9 +38,11 @@ #include "opt_ddb.h" #include "opt_ktrace.h" +#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC #include <sys/systm.h> #include <sys/capsicum.h> #include <sys/conf.h> +#include <sys/exterrvar.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> @@ -478,6 +480,92 @@ kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg) return (error); } +struct flags_trans_elem { + u_int f; + u_int t; +}; + +static u_int +flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags) +{ + u_int res; + int i; + + res = 0; + for (i = 0; i < nitems; i++) { + if ((from_flags & ftes[i].f) != 0) + res |= ftes[i].t; + } + return (res); +} + +static uint8_t +fd_to_fde_flags(int fd_flags) +{ + static const struct flags_trans_elem fd_to_fde_flags_s[] = { + { .f = FD_CLOEXEC, .t = UF_EXCLOSE }, + { .f = FD_CLOFORK, .t = UF_FOCLOSE }, + { .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH }, + }; + + return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s), + fd_flags)); +} + +static int +fde_to_fd_flags(uint8_t fde_flags) +{ + static const struct flags_trans_elem fde_to_fd_flags_s[] = { + { .f = UF_EXCLOSE, .t = FD_CLOEXEC }, + { .f = UF_FOCLOSE, .t = FD_CLOFORK }, + { .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH }, + }; + + return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s), + fde_flags)); +} + +static uint8_t +fddup_to_fde_flags(int fddup_flags) +{ + static const struct flags_trans_elem fddup_to_fde_flags_s[] = { + { .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE }, + { .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE }, + }; + + return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s), + fddup_flags)); +} + +static uint8_t +close_range_to_fde_flags(int close_range_flags) +{ + static const struct flags_trans_elem close_range_to_fde_flags_s[] = { + { .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE }, + { .f = CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE }, + }; + + return (flags_trans(close_range_to_fde_flags_s, + nitems(close_range_to_fde_flags_s), close_range_flags)); +} + +static uint8_t +open_to_fde_flags(int open_flags, bool sticky_orb) +{ + static const struct flags_trans_elem open_to_fde_flags_s[] = { + { .f = O_CLOEXEC, .t = UF_EXCLOSE }, + { .f = O_CLOFORK, .t = UF_FOCLOSE }, + { .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH }, + }; +#if defined(__clang__) && __clang_major__ >= 19 + _Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f == + O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb"); +#endif + + return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) - + (sticky_orb ? 
0 : 1), open_flags)); +} + int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { @@ -492,6 +580,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) int error, flg, kif_sz, seals, tmp, got_set, got_cleared; uint64_t bsize; off_t foffset; + int flags; error = 0; flg = F_POSIX; @@ -511,6 +600,11 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; + case F_DUPFD_CLOFORK: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp); + break; + case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); @@ -526,10 +620,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) FILEDESC_SLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { - td->td_retval[0] = - ((fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0) | - ((fde->fde_flags & UF_RESOLVE_BENEATH) ? - FD_RESOLVE_BENEATH : 0); + td->td_retval[0] = fde_to_fd_flags(fde->fde_flags); error = 0; } FILEDESC_SUNLOCK(fdp); @@ -543,10 +634,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) /* * UF_RESOLVE_BENEATH is sticky and cannot be cleared. */ - fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | - ((arg & FD_CLOEXEC) != 0 ? UF_EXCLOSE : 0) | - ((arg & FD_RESOLVE_BENEATH) != 0 ? - UF_RESOLVE_BENEATH : 0); + fde->fde_flags = (fde->fde_flags & + ~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg); error = 0; } FILEDESC_XUNLOCK(fdp); @@ -916,7 +1005,17 @@ revert_f_setfl: break; default: - error = EINVAL; + if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD) + return (EXTERROR(EINVAL, "invalid fcntl cmd")); + /* Handle F_DUP3FD */ + flags = (cmd >> F_DUP3FD_SHIFT); + if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0) + return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD")); + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, + ((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) | + ((flags & FD_CLOFORK) != 0 ? 
FDDUP_FLAG_CLOFORK : 0), + fd, tmp); break; } return (error); @@ -946,7 +1045,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) fdp = p->p_fd; oioctls = NULL; - MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); + MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); @@ -971,8 +1070,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; - if (flags & FDDUP_FLAG_CLOEXEC) - fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; + fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags); error = 0; goto unlock; } @@ -1047,10 +1145,8 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); - if ((flags & FDDUP_FLAG_CLOEXEC) != 0) - newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; - else - newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; + newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) | + fddup_to_fde_flags(flags); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif @@ -1416,13 +1512,14 @@ kern_close(struct thread *td, int fd) } static int -close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) +close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags) { struct filedesc *fdp; struct fdescenttbl *fdt; struct filedescent *fde; - int fd; + int fd, fde_flags; + fde_flags = close_range_to_fde_flags(flags); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); @@ -1434,7 +1531,7 @@ close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) for (; fd <= highfd; fd++) { fde = &fdt->fdt_ofiles[fd]; if (fde->fde_file != NULL) - fde->fde_flags |= UF_EXCLOSE; + fde->fde_flags |= fde_flags; } out_locked: FILEDESC_XUNLOCK(fdp); @@ -1492,8 +1589,8 @@ kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd) return (EINVAL); } - if ((flags & CLOSE_RANGE_CLOEXEC) != 0) - return (close_range_cloexec(td, lowfd, highfd)); + if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0) + return (close_range_flags(td, lowfd, highfd, flags)); return (close_range_impl(td, lowfd, highfd)); } @@ -1513,7 +1610,7 @@ sys_close_range(struct thread *td, struct close_range_args *uap) AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); - if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0) + if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0) return (EINVAL); return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd)); } @@ -2171,8 +2268,7 @@ _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; - fde->fde_flags = ((flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0) | - ((flags & O_RESOLVE_BENEATH) != 0 ? 
UF_RESOLVE_BENEATH : 0); + fde->fde_flags = open_to_fde_flags(flags, true); if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else @@ -2432,6 +2528,7 @@ fdcopy(struct filedesc *fdp) newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + (ofde->fde_flags & UF_FOCLOSE) != 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) newfdp->fd_freefile = i; @@ -2729,6 +2826,12 @@ fdcloseexec(struct thread *td) fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); + } else if (fde->fde_flags & UF_FOCLOSE) { + /* + * https://austingroupbugs.net/view.php?id=1851 + * FD_CLOFORK should not be preserved across exec + */ + fde->fde_flags &= ~UF_FOCLOSE; } } } diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index cf067527237e..03268365891e 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -29,6 +29,7 @@ #include <sys/cdefs.h> #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" @@ -90,6 +91,10 @@ #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif + #include <security/audit/audit.h> #include <security/mac/mac_framework.h> @@ -936,6 +941,20 @@ interpret: } #endif +#ifdef HWT_HOOKS + if ((td->td_proc->p_flag2 & P2_HWT) != 0) { + struct hwt_record_entry ent; + + VOP_UNLOCK(imgp->vp); + ent.fullpath = imgp->execpath; + ent.addr = imgp->et_dyn_addr; + ent.baseaddr = imgp->reloc_base; + ent.record_type = HWT_RECORD_EXECUTABLE; + HWT_CALL_HOOK(td, HWT_EXEC, &ent); + vn_lock(imgp->vp, LK_SHARED | LK_RETRY); + } +#endif + /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, stack_base); diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index f388ac8a583a..d566bc01bc5e 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -30,6 +30,7 @@ #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include <sys/param.h> #include <sys/systm.h> @@ -64,7 +65,7 @@ #include "linker_if.h" -#ifdef HWPMC_HOOKS +#if defined(HWPMC_HOOKS) || defined(HWT_HOOKS) #include <sys/pmckern.h> #endif @@ -2184,7 +2185,7 @@ linker_basename(const char *path) return (filename); } -#ifdef HWPMC_HOOKS +#if defined(HWPMC_HOOKS) || defined(HWT_HOOKS) /* * Inform hwpmc about the set of kernel modules currently loaded. 
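
Taken together, the kern_descrip.c changes above route every descriptor-flag producer through one set of translation tables: fcntl(F_SETFD/F_GETFD), F_DUPFD_CLOFORK, the packed F_DUP3FD command, open(2) flags, and close_range(2). A hedged userland sketch of the new close-on-fork semantics, using the constants this patch introduces:

    #include <fcntl.h>
    #include <unistd.h>

    static void
    clofork_demo(int fd)
    {
            /* Mark a single descriptor close-on-fork. */
            (void)fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOFORK);

            /* Or mark everything from fd 10 up in one call. */
            (void)close_range(10, ~0U, CLOSE_RANGE_CLOFORK);

            /* After fork(), marked descriptors are absent in the child
             * (fdcopy() skips UF_FOCLOSE entries); after exec(), the
             * flag itself is cleared, per Austin Group issue 1851
             * cited in the fdcloseexec() hunk. */
    }
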
*/ diff --git a/sys/kern/kern_pmc.c b/sys/kern/kern_pmc.c index a3b572976fbf..15afe1a46d07 100644 --- a/sys/kern/kern_pmc.c +++ b/sys/kern/kern_pmc.c @@ -72,6 +72,10 @@ int __read_mostly (*pmc_hook)(struct thread *td, int function, void *arg) = NULL /* Interrupt handler */ int __read_mostly (*pmc_intr)(struct trapframe *tf) = NULL; +/* HWT hooks */ +void __read_mostly (*hwt_hook)(struct thread *td, int func, void *arg) = NULL; +int __read_mostly (*hwt_intr)(struct trapframe *tf) = NULL; + DPCPU_DEFINE(uint8_t, pmc_sampled); /* diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index c8b01afeab4f..dcd38c6e6fbe 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1637,6 +1637,12 @@ uifree(struct uidinfo *uip) if (uip->ui_pipecnt != 0) printf("freeing uidinfo: uid = %d, pipecnt = %ld\n", uip->ui_uid, uip->ui_pipecnt); + if (uip->ui_inotifycnt != 0) + printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n", + uip->ui_uid, uip->ui_inotifycnt); + if (uip->ui_inotifywatchcnt != 0) + printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n", + uip->ui_uid, uip->ui_inotifywatchcnt); free(uip, M_UIDINFO); } @@ -1742,6 +1748,21 @@ chgpipecnt(struct uidinfo *uip, int diff, rlim_t max) return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt")); } +int +chginotifycnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt")); +} + +int +chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max, + "inotifywatchcnt")); +} + static int sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS) { diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index 17b53208157a..35b258e68701 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -27,12 +27,12 @@ * SUCH DAMAGE. 
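
chginotifycnt() and chginotifywatchcnt() above extend the chglimit() family, so inotify descriptors and watches are charged to per-uid counters the same way pipe buffers already are. A sketch of the expected caller pattern, assuming chglimit()'s usual boolean return (nonzero on success) and an illustrative errno choice; the real consumers live in the new inotify code, not in this hunk:

    /* Charge one inotify descriptor to a uid, failing when the
     * per-uid limit is hit; release the charge on teardown. */
    static int
    inotify_charge(struct ucred *cred, rlim_t max)
    {
            if (chginotifycnt(cred->cr_ruidinfo, 1, max) == 0)
                    return (ENFILE);        /* illustrative errno */
            return (0);
    }

    static void
    inotify_uncharge(struct ucred *cred)
    {
            (void)chginotifycnt(cred->cr_ruidinfo, -1, 0);
    }
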
*/ -#include <sys/cdefs.h> #include "opt_kern_tls.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/capsicum.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/ktls.h> @@ -1246,6 +1246,8 @@ out: */ if (error == 0) { td->td_retval[0] = 0; + if (sbytes > 0 && vp != NULL) + INOTIFY(vp, IN_ACCESS); } if (sent != NULL) { (*sent) = sbytes; diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 4565abc4b540..5d51aa675cb7 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1050,8 +1050,7 @@ osigaction(struct thread *td, struct osigaction_args *uap) int osigreturn(struct thread *td, struct osigreturn_args *uap) { - - return (nosys(td, (struct nosys_args *)uap)); + return (kern_nosys(td, 0)); } #endif #endif /* COMPAT_43 */ @@ -4139,7 +4138,7 @@ coredump(struct thread *td) struct flock lf; struct vattr vattr; size_t fullpathsize; - int error, error1, locked; + int error, error1, jid, locked, ppid, sig; char *name; /* name of corefile */ void *rl_cookie; off_t limit; @@ -4168,6 +4167,10 @@ coredump(struct thread *td) PROC_UNLOCK(p); return (EFBIG); } + + ppid = p->p_oppid; + sig = p->p_sig; + jid = p->p_ucred->cr_prison->pr_id; PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, @@ -4253,6 +4256,9 @@ coredump(struct thread *td) } devctl_safe_quote_sb(sb, name); sbuf_putc(sb, '"'); + + sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d", + jid, p->p_pid, ppid, sig); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: @@ -4281,6 +4287,12 @@ struct nosys_args { int nosys(struct thread *td, struct nosys_args *args) { + return (kern_nosys(td, args->dummy)); +} + +int +kern_nosys(struct thread *td, int dummy) +{ struct proc *p; p = td->td_proc; diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c index 24406763a93a..a93d711e7597 100644 --- a/sys/kern/kern_syscalls.c +++ b/sys/kern/kern_syscalls.c @@ -35,6 +35,7 @@ #include <sys/resourcevar.h> #include <sys/sx.h> #include <sys/syscall.h> +#include <sys/syscallsubr.h> #include <sys/sysent.h> #include <sys/sysproto.h> #include <sys/systm.h> @@ -50,14 +51,14 @@ int lkmnosys(struct thread *td, struct nosys_args *args) { - return (nosys(td, args)); + return (kern_nosys(td, 0)); } int lkmressys(struct thread *td, struct nosys_args *args) { - return (nosys(td, args)); + return (kern_nosys(td, 0)); } struct sysent nosys_sysent = { diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 8ad885b42ebe..0e8c2b9f362e 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -29,7 +29,7 @@ #include "opt_ktrace.h" #include "opt_posix.h" #include "opt_hwpmc_hooks.h" - +#include "opt_hwt_hooks.h" #include <sys/systm.h> #include <sys/kernel.h> #ifdef KTRACE @@ -60,6 +60,9 @@ #ifdef HWPMC_HOOKS #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif #include <machine/frame.h> @@ -280,6 +283,10 @@ thread_create(struct thread *td, struct rtprio *rtp, PMC_CALL_HOOK_UNLOCKED(newtd, PMC_FN_THR_CREATE_LOG, NULL); #endif +#ifdef HWT_HOOKS + HWT_CALL_HOOK(newtd, HWT_THREAD_CREATE, NULL); +#endif + tidhash_add(newtd); /* ignore timesharing class */ @@ -613,6 +620,9 @@ sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap) if (PMC_PROC_IS_USING_PMCS(p) || PMC_SYSTEM_SAMPLING_ACTIVE()) PMC_CALL_HOOK_UNLOCKED(ttd, PMC_FN_THR_CREATE_LOG, NULL); #endif +#ifdef HWT_HOOKS + HWT_CALL_HOOK(ttd, HWT_THREAD_SET_NAME, NULL); +#endif #ifdef KTR sched_clear_tdname(ttd); #endif diff --git 
a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 2dff461e932a..f853af193016 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -30,6 +30,7 @@ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include <sys/systm.h> #include <sys/asan.h> @@ -60,6 +61,9 @@ #ifdef HWPMC_HOOKS #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif #include <sys/priv.h> #include <security/audit/audit.h> @@ -1002,6 +1006,11 @@ thread_exit(void) } else if (PMC_SYSTEM_SAMPLING_ACTIVE()) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL); #endif + +#ifdef HWT_HOOKS + HWT_CALL_HOOK(td, HWT_THREAD_EXIT, NULL); +#endif + PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 753494983416..504f9a2338ef 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -36,6 +36,7 @@ #include <sys/cdefs.h> #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include "opt_sched.h" #include <sys/param.h> @@ -63,6 +64,10 @@ #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif + #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> int __read_mostly dtrace_vtime_active; @@ -1075,6 +1080,11 @@ sched_switch(struct thread *td, int flags) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif +#ifdef HWT_HOOKS + HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL); + HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); +#endif + SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); /* I feel sleepy */ @@ -1696,10 +1706,20 @@ sched_idletd(void *dummy) static void sched_throw_tail(struct thread *td) { + struct thread *newtd; mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - cpu_throw(td, choosethread()); /* doesn't return */ + + newtd = choosethread(); + +#ifdef HWT_HOOKS + if (td) + HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL); + HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); +#endif + + cpu_throw(td, newtd); /* doesn't return */ } /* diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 508ec0ab97ec..409439ca34da 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -39,6 +39,7 @@ #include <sys/cdefs.h> #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include "opt_sched.h" #include <sys/param.h> @@ -69,6 +70,10 @@ #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif + #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> int __read_mostly dtrace_vtime_active; @@ -2432,6 +2437,12 @@ sched_switch(struct thread *td, int flags) if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif + +#ifdef HWT_HOOKS + HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL); + HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); +#endif + td->td_oncpu = NOCPU; cpu_switch(td, newtd, mtx); cpuid = td->td_oncpu = PCPU_GET(cpuid); @@ -3252,6 +3263,10 @@ sched_ap_entry(void) newtd = sched_throw_grab(tdq); +#ifdef HWT_HOOKS + HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); +#endif + /* doesn't return */ cpu_throw(NULL, newtd); } @@ -3278,6 +3293,10 @@ sched_throw(struct thread *td) newtd = sched_throw_grab(tdq); +#ifdef HWT_HOOKS + HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); +#endif + /* doesn't return */ cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); } diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c index 0edb631d1475..464efda1e91a 100644 --- a/sys/kern/subr_asan.c +++ b/sys/kern/subr_asan.c @@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code) if 
(__predict_false(!kasan_enabled)) return; - if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS && - (vm_offset_t)addr < DMAP_MAX_ADDRESS) + if (kasan_md_unsupported((vm_offset_t)addr)) return; KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS && diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c index 7cc6fb593697..5ad5b0af1681 100644 --- a/sys/kern/subr_capability.c +++ b/sys/kern/subr_capability.c @@ -74,6 +74,10 @@ const cap_rights_t cap_getsockopt_rights = CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT); const cap_rights_t cap_getsockname_rights = CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME); +const cap_rights_t cap_inotify_add_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD); +const cap_rights_t cap_inotify_rm_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM); const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL); const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN); const cap_rights_t cap_linkat_source_rights = diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c index 3a3548bad52b..bb86c779b936 100644 --- a/sys/kern/subr_pctrie.c +++ b/sys/kern/subr_pctrie.c @@ -691,21 +691,23 @@ _pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node, */ if (node == PCTRIE_NULL || *pctrie_toval(node) < index) { /* Climb the path to find a node with a descendant > index. */ - for (node = parent; node != NULL; node = pctrie_parent(node)) { - slot = pctrie_slot(node, index) + 1; - if ((node->pn_popmap >> slot) != 0) + node = NULL; + while (parent != NULL) { + slot = pctrie_slot(parent, index) + 1; + if ((parent->pn_popmap >> slot) != 0) break; + node = parent; + parent = pctrie_parent(node); } - if (node == NULL) { + if (parent == NULL) { if (parent_out != NULL) - *parent_out = NULL; + *parent_out = node; return (NULL); } /* Step to the least child with a descendant > index. */ - slot += ffs(node->pn_popmap >> slot) - 1; - parent = node; - node = pctrie_node_load(&node->pn_child[slot], NULL, + slot += ffs(parent->pn_popmap >> slot) - 1; + node = pctrie_node_load(&parent->pn_child[slot], NULL, PCTRIE_LOCKED); } /* Descend to the least leaf of the subtrie. */ @@ -785,21 +787,23 @@ _pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node, */ if (node == PCTRIE_NULL || *pctrie_toval(node) > index) { /* Climb the path to find a node with a descendant < index. */ - for (node = parent; node != NULL; node = pctrie_parent(node)) { - slot = pctrie_slot(node, index); - if ((node->pn_popmap & ((1 << slot) - 1)) != 0) + node = NULL; + while (parent != NULL) { + slot = pctrie_slot(parent, index); + if ((parent->pn_popmap & ((1 << slot) - 1)) != 0) break; + node = parent; + parent = pctrie_parent(node); } - if (node == NULL) { + if (parent == NULL) { if (parent_out != NULL) - *parent_out = NULL; + *parent_out = node; return (NULL); } /* Step to the greatest child with a descendant < index. */ - slot = ilog2(node->pn_popmap & ((1 << slot) - 1)); - parent = node; - node = pctrie_node_load(&node->pn_child[slot], NULL, + slot = ilog2(parent->pn_popmap & ((1 << slot) - 1)); + node = pctrie_node_load(&parent->pn_child[slot], NULL, PCTRIE_LOCKED); } /* Descend to the greatest leaf of the subtrie. 
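
The subr_pctrie.c rework above re-rolls the climb loop so that parent leads and node trails one level behind; when the climb runs off the root, the trailing node is exactly the value the caller wants back in *parent_out. A toy binary-trie model of the same climb-then-descend successor search (real pctrie nodes have 16 slots and a popmap, and all locking is omitted):

    struct toy {
            struct toy *parent;
            struct toy *child[2];   /* child[0] = left, child[1] = right */
            int key;                /* meaningful at leaves */
            int is_leaf;
    };

    /* Find the leaf following "leaf": climb until some ancestor was
     * entered from its left side and has a right child, then descend
     * to the leftmost leaf of that right subtree.  Toy model of
     * _pctrie_lookup_ge()'s structure, not the pctrie API. */
    static struct toy *
    toy_successor(struct toy *leaf)
    {
            struct toy *parent, *node;

            node = leaf;
            for (parent = node->parent; parent != NULL;
                parent = parent->parent) {
                    if (parent->child[0] == node && parent->child[1] != NULL)
                            break;  /* unexplored subtree to the right */
                    node = parent;  /* node trails one level behind */
            }
            if (parent == NULL)
                    return (NULL);  /* leaf already held the maximum */
            for (node = parent->child[1]; !node->is_leaf; )
                    node = node->child[0] != NULL ? node->child[0] :
                        node->child[1];
            return (node);
    }
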
*/ diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 18388ae5f232..bac7d0080c71 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor) td->td_ast = 0; } - CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid, - td->td_proc->p_comm); + CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, + td->td_proc == NULL ? -1 : td->td_proc->p_pid, + td->td_proc == NULL ? "" : td->td_proc->p_comm); KASSERT(framep == NULL || TRAPF_USERMODE(framep), ("ast in kernel mode")); diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index d31ff3b939cc..b472aaea89e6 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -37,16 +37,17 @@ #include "opt_capsicum.h" #include "opt_ktrace.h" -#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC +#define EXTERR_CATEGORY EXTERR_CAT_GENIO #include <sys/param.h> #include <sys/systm.h> #include <sys/sysproto.h> #include <sys/capsicum.h> +#include <sys/exterrvar.h> #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/fcntl.h> #include <sys/file.h> -#include <sys/exterrvar.h> +#include <sys/inotify.h> #include <sys/lock.h> #include <sys/proc.h> #include <sys/signalvar.h> @@ -195,7 +196,7 @@ sys_read(struct thread *td, struct read_args *uap) int error; if (uap->nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; @@ -233,7 +234,7 @@ kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) int error; if (nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; @@ -329,7 +330,7 @@ kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) - error = EINVAL; + error = EXTERROR(EINVAL, "neg offset"); else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); @@ -396,7 +397,7 @@ sys_write(struct thread *td, struct write_args *uap) int error; if (uap->nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; @@ -435,7 +436,7 @@ kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, int error; if (nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; @@ -531,7 +532,7 @@ kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) - error = EINVAL; + error = EXTERROR(EINVAL, "neg offset"); else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); @@ -602,14 +603,14 @@ kern_ftruncate(struct thread *td, int fd, off_t length) AUDIT_ARG_FD(fd); if (length < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "negative length")); error = fget(td, fd, &cap_ftruncate_rights, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); - return (EINVAL); + return (EXTERROR(EINVAL, "non-writable")); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); @@ -840,8 +841,10 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) int error; 
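The EINVAL conversions in this file all follow one recipe: the compilation unit defines EXTERR_CATEGORY (here EXTERR_CAT_GENIO) before including <sys/exterrvar.h>, and each bare EINVAL return gains a short description that reaches only threads that opted in via exterrctl(2). A hedged, kernel-context sketch of the pattern; the function and message are hypothetical:

#define	EXTERR_CATEGORY	EXTERR_CAT_GENIO	/* must precede the include */
#include <sys/exterrvar.h>

static int
kern_frob(size_t nbyte)
{
	/* Before: return (EINVAL).  After: the same errno, annotated. */
	if (nbyte > IOSIZE_MAX)
		return (EXTERROR(EINVAL, "length > iosize_max"));
	return (0);
}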
AUDIT_ARG_FD(fd); - if (offset < 0 || len <= 0) - return (EINVAL); + if (offset < 0) + return (EXTERROR(EINVAL, "negative offset")); + if (len <= 0) + return (EXTERROR(EINVAL, "negative length")); /* Check for wrap. */ if (offset > OFF_MAX - len) return (EFBIG); @@ -898,16 +901,21 @@ kern_fspacectl(struct thread *td, int fd, int cmd, AUDIT_ARG_FFLAGS(flags); if (rqsr == NULL) - return (EINVAL); + return (EXTERROR(EINVAL, "no range")); rmsr = *rqsr; if (rmsrp != NULL) *rmsrp = rmsr; - if (cmd != SPACECTL_DEALLOC || - rqsr->r_offset < 0 || rqsr->r_len <= 0 || - rqsr->r_offset > OFF_MAX - rqsr->r_len || - (flags & ~SPACECTL_F_SUPPORTED) != 0) - return (EINVAL); + if (cmd != SPACECTL_DEALLOC) + return (EXTERROR(EINVAL, "cmd", cmd)); + if (rqsr->r_offset < 0) + return (EXTERROR(EINVAL, "neg offset")); + if (rqsr->r_len <= 0) + return (EXTERROR(EINVAL, "neg len")); + if (rqsr->r_offset > OFF_MAX - rqsr->r_len) + return (EXTERROR(EINVAL, "offset too large")); + if ((flags & ~SPACECTL_F_SUPPORTED) != 0) + return (EXTERROR(EINVAL, "reserved flags", flags)); error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error != 0) @@ -939,7 +947,6 @@ int kern_specialfd(struct thread *td, int type, void *arg) { struct file *fp; - struct specialfd_eventfd *ae; int error, fd, fflags; fflags = 0; @@ -948,14 +955,24 @@ kern_specialfd(struct thread *td, int type, void *arg) return (error); switch (type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd *ae; + ae = arg; if ((ae->flags & EFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; error = eventfd_create_file(td, fp, ae->initval, ae->flags); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify *si; + + si = arg; + error = inotify_create_file(td, fp, si->flags, &fflags); + break; + } default: - error = EINVAL; + error = EXTERROR(EINVAL, "invalid type", type); break; } @@ -970,13 +987,14 @@ kern_specialfd(struct thread *td, int type, void *arg) int sys___specialfd(struct thread *td, struct __specialfd_args *args) { - struct specialfd_eventfd ae; int error; switch (args->type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd ae; + if (args->len != sizeof(struct specialfd_eventfd)) { - error = EINVAL; + error = EXTERROR(EINVAL, "eventfd params ABI"); break; } error = copyin(args->req, &ae, sizeof(ae)); @@ -984,13 +1002,27 @@ sys___specialfd(struct thread *td, struct __specialfd_args *args) break; if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) != 0) { - error = EINVAL; + error = EXTERROR(EINVAL, "reserved flag"); break; } error = kern_specialfd(td, args->type, &ae); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify si; + + if (args->len != sizeof(si)) { + error = EINVAL; + break; + } + error = copyin(args->req, &si, sizeof(si)); + if (error != 0) + break; + error = kern_specialfd(td, args->type, &si); + break; + } default: - error = EINVAL; + error = EXTERROR(EINVAL, "unknown type", args->type); break; } return (error); @@ -1166,7 +1198,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, int error, lf, ndu; if (nd < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "negative ndescs")); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_nfiles; @@ -1259,7 +1291,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) { - error = EINVAL; + error = EXTERROR(EINVAL, "invalid timeval"); goto done; } if (!timevalisset(&rtv)) @@ -1491,7 +1523,7 @@ 
sys_poll(struct thread *td, struct poll_args *uap) if (uap->timeout != INFTIM) { if (uap->timeout < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timeout")); ts.tv_sec = uap->timeout / 1000; ts.tv_nsec = (uap->timeout % 1000) * 1000000; tsp = &ts; @@ -1516,7 +1548,7 @@ kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds, precision = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timespec")); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) sbt = 0; else { @@ -1619,7 +1651,7 @@ kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds, int error; if (kern_poll_maxfds(nfds)) - return (EINVAL); + return (EXTERROR(EINVAL, "too large nfds")); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else @@ -1796,7 +1828,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timeval")); if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { @@ -2173,7 +2205,7 @@ kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type, (uintptr_t)p2->p_vmspace); break; default: - error = EINVAL; + error = EXTERROR(EINVAL, "unknown op"); break; } @@ -2277,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap) return (EINVAL); td->td_pflags2 &= ~TDP2_UEXTERR; return (0); + case EXTERRCTL_UD: + /* + * Important: this code must always return EINVAL and never any + * extended error, for testing purposes. + */ + /* FALLTHROUGH */ default: return (EINVAL); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 9340779918a2..ed651da96b14 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -548,7 +548,7 @@ sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; - if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) + if ((uap->flags & ~(O_CLOEXEC | O_CLOFORK | O_NONBLOCK)) != 0) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index fa36cc824078..90a4f3a7dad8 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -598,4 +598,6 @@ const char *syscallnames[] = { "fchroot", /* 590 = fchroot */ "setcred", /* 591 = setcred */ "exterrctl", /* 592 = exterrctl */ + "inotify_add_watch_at", /* 593 = inotify_add_watch_at */ + "inotify_rm_watch", /* 594 = inotify_rm_watch */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 08b557a7a540..90559fab6086 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3349,11 +3349,26 @@ size_t size ); } -592 AUE_NULL STD { +592 AUE_NULL STD|CAPENABLED { int exterrctl( u_int op, u_int flags, _In_reads_bytes_(4) void *ptr ); } +593 AUE_INOTIFY STD|CAPENABLED { + int inotify_add_watch_at( + int fd, + int dfd, + _In_z_ const char *path, + uint32_t mask + ); + } +594 AUE_INOTIFY STD|CAPENABLED { + int inotify_rm_watch( + int fd, + int wd + ); + } + ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 15789d3eb5fa..90b21616a558 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3482,6 +3482,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } + /* inotify_add_watch_at */ + case 593: { + struct inotify_add_watch_at_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] = p->dfd; /* int */ + uarg[a++] = 
(intptr_t)p->path; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 4; + break; + } + /* inotify_rm_watch */ + case 594: { + struct inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] = p->wd; /* int */ + *n_args = 2; + break; + } default: *n_args = 0; break; @@ -9317,6 +9335,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* inotify_add_watch_at */ + case 593: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + case 2: + p = "userland const char *"; + break; + case 3: + p = "uint32_t"; + break; + default: + break; + }; + break; + /* inotify_rm_watch */ + case 594: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11305,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* inotify_add_watch_at */ + case 593: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* inotify_rm_watch */ + case 594: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 11141d197aec..a545a0a54c25 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -1724,7 +1724,7 @@ freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index e399517010fc..a99e1a4de14e 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -1904,7 +1904,7 @@ freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) return (sys_semsys(td, (struct semsys_args *)uap)); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 60e3fe92a4b7..8d1a469127c6 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1474,7 +1474,7 @@ freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) return (EINVAL); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index ad8485028987..133724ac76c5 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -151,6 +151,10 @@ kern_socket(struct thread *td, int domain, int type, int protocol) type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } + if ((type & SOCK_CLOFORK) != 0) { + type &= ~SOCK_CLOFORK; + oflag |= O_CLOFORK; + } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; @@ -352,7 +356,8 @@ kern_accept4(struct thread *td, int s, struct sockaddr *sa, int flags, goto done; #endif error = falloc_caps(td, &nfp, &fd, - (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); + ((flags & SOCK_CLOEXEC) != 0 ? O_CLOEXEC : 0) | + ((flags & SOCK_CLOFORK) != 0 ? 
O_CLOFORK : 0), &fcaps); if (error != 0) goto done; SOCK_LOCK(head); @@ -435,7 +440,7 @@ int sys_accept4(struct thread *td, struct accept4_args *uap) { - if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + if ((uap->flags & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK)) != 0) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); @@ -557,6 +562,10 @@ kern_socketpair(struct thread *td, int domain, int type, int protocol, type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } + if ((type & SOCK_CLOFORK) != 0) { + type &= ~SOCK_CLOFORK; + oflag |= O_CLOFORK; + } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 72bd0246db11..0056dac65c7d 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -3463,7 +3463,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) UNP_LINK_UNLOCK_ASSERT(); - fdflags = (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + fdflags = ((flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0) | + ((flags & MSG_CMSG_CLOFORK) ? O_CLOFORK : 0); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 97dc854c9386..02973146068d 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -301,7 +301,7 @@ static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; static void aio_biocleanup(struct bio *bp); -void aio_init_aioinfo(struct proc *p); +static int aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); @@ -309,7 +309,7 @@ static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); -int aio_aqueue(struct thread *td, struct aiocb *ujob, +static int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); @@ -422,10 +422,11 @@ aio_onceonly(void) * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ -void +static int aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; + int error; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); @@ -451,8 +452,20 @@ aio_init_aioinfo(struct proc *p) uma_zfree(kaio_zone, ki); } - while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) - aio_newproc(NULL); + error = 0; + while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) { + error = aio_newproc(NULL); + if (error != 0) { + /* + * At least one worker is enough to have AIO + * functional. Clear error in that case. + */ + if (num_aio_procs > 0) + error = 0; + break; + } + } + return (error); } static int @@ -1476,7 +1489,7 @@ static struct aiocb_ops aiocb_ops_osigevent = { * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. 
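The O_CLOFORK plumbing above gives kern_socket(), kern_accept4(), kern_socketpair(), and SCM_RIGHTS externalization (MSG_CMSG_CLOFORK) the same treatment CLOEXEC already had. A hedged userspace sketch of the intended effect, assuming the flag is exposed through <sys/socket.h>; error handling elided:

#include <sys/socket.h>
#include <unistd.h>

void
clofork_example(void)
{
	int s;

	/* The descriptor is closed in any child created by fork(). */
	s = socket(AF_LOCAL, SOCK_STREAM | SOCK_CLOFORK, 0);
	if (fork() == 0) {
		/* Here "s" is already closed; it cannot leak. */
		_exit(0);
	}
	close(s);
}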
*/ -int +static int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { @@ -1490,8 +1503,11 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int fd, kqfd; u_short evflags; - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + goto err1; + } ki = p->p_aioinfo; @@ -2213,8 +2229,11 @@ kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + return (error); + } ki = p->p_aioinfo; @@ -2503,8 +2522,11 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, timo = tvtohz(&atv); } - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + return (error); + } ki = p->p_aioinfo; error = 0; diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 883beaf6d1da..89c1d779f04c 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -41,6 +41,7 @@ #include <sys/counter.h> #include <sys/filedesc.h> #include <sys/fnv_hash.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/ktr.h> #include <sys/lock.h> @@ -331,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); -SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); +SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int", + "enum cache_fpl_status"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); @@ -2629,6 +2631,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, atomic_store_ptr(&dvp->v_cache_dd, ncp); } else if (vp != NULL) { /* + * Take the slow path in INOTIFY(). This flag will be lazily + * cleared by cache_vop_inotify() once all directories referring + * to vp are unwatched. + */ + if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) + vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT); + + /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. @@ -4008,6 +4018,56 @@ out: return (error); } +void +cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie) +{ + struct mtx *vlp; + struct namecache *ncp; + int isdir; + bool logged, self; + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 && + (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0); + + if (self) { + int selfevent; + + if (event == _IN_ATTRIB_LINKCOUNT) + selfevent = IN_ATTRIB; + else + selfevent = event; + inotify_log(vp, NULL, 0, selfevent | isdir, cookie); + } + if ((event & IN_ALL_EVENTS) == 0) + return; + + logged = false; + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) + continue; + if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) { + /* + * XXX-MJ if the vnode has two links in the same + * dir, we'll log the same event twice. 
+ */ + inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen, + event | isdir, cookie); + logged = true; + } + } + if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) { + /* + * We didn't find a watched directory that contains this vnode, + * so stop calling VOP_INOTIFY for operations on the vnode. + */ + vn_irflag_unset(vp, VIRF_INOTIFY_PARENT); + } + mtx_unlock(vlp); +} + #ifdef DDB static void db_print_vpath(struct vnode *vp) @@ -6361,15 +6421,11 @@ out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; - if (SDT_PROBES_ENABLED()) { - SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); - if (fpl.status == CACHE_FPL_STATUS_HANDLED) - SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, - ndp); - } - + SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); + SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, + ndp); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index be49c0887609..fd6202a1424c 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -39,6 +39,7 @@ #include <sys/conf.h> #include <sys/event.h> #include <sys/filio.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/lock.h> @@ -119,6 +120,8 @@ struct vop_vector default_vnodeops = { .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, + .vop_inotify = vop_stdinotify, + .vop_inotify_add_watch = vop_stdinotify_add_watch, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, @@ -453,6 +456,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap) case _PC_MAC_PRESENT: case _PC_NAMEDATTR_ENABLED: case _PC_HAS_NAMEDATTR: + case _PC_HAS_HIDDENSYSTEM: *ap->a_retval = 0; return (0); default: @@ -1306,6 +1310,20 @@ vop_stdneed_inactive(struct vop_need_inactive_args *ap) } int +vop_stdinotify(struct vop_inotify_args *ap) +{ + vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie); + return (0); +} + +int +vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap) +{ + return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask, + ap->a_wdp, ap->a_td)); +} + +int vop_stdioctl(struct vop_ioctl_args *ap) { struct vnode *vp; diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c new file mode 100644 index 000000000000..d3cd0d1f9832 --- /dev/null +++ b/sys/kern/vfs_inotify.c @@ -0,0 +1,1011 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. 
+ */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/caprights.h> +#include <sys/counter.h> +#include <sys/dirent.h> +#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY +#include <sys/exterrvar.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/inotify.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/ktrace.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/resourcevar.h> +#include <sys/selinfo.h> +#include <sys/stat.h> +#include <sys/syscallsubr.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/syslimits.h> +#include <sys/sysproto.h> +#include <sys/tree.h> +#include <sys/user.h> +#include <sys/vnode.h> + +uint32_t inotify_rename_cookie; + +static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "inotify configuration"); + +static int inotify_max_queued_events = 16384; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN, + &inotify_max_queued_events, 0, + "Maximum number of events to queue on an inotify descriptor"); + +static int inotify_max_user_instances = 256; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN, + &inotify_max_user_instances, 0, + "Maximum number of inotify descriptors per user"); + +static int inotify_max_user_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN, + &inotify_max_user_watches, 0, + "Maximum number of inotify watches per user"); + +static int inotify_max_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN, + &inotify_max_watches, 0, + "Maximum number of inotify watches system-wide"); + +static int inotify_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD, + &inotify_watches, 0, + "Total number of inotify watches currently in use"); + +static int inotify_coalesce = 1; +SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN, + &inotify_coalesce, 0, + "Coalesce inotify events when possible"); + +static COUNTER_U64_DEFINE_EARLY(inotify_event_drops); +SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD, + &inotify_event_drops, + "Number of inotify events dropped due to limits or allocation failures"); + +static fo_rdwr_t inotify_read; +static fo_ioctl_t inotify_ioctl; +static fo_poll_t inotify_poll; +static fo_kqfilter_t inotify_kqfilter; +static fo_stat_t inotify_stat; +static fo_close_t inotify_close; +static fo_fill_kinfo_t inotify_fill_kinfo; + +static const struct fileops inotifyfdops = { + .fo_read = inotify_read, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = inotify_ioctl, + .fo_poll = inotify_poll, + .fo_kqfilter = inotify_kqfilter, + .fo_stat = inotify_stat, + .fo_close = inotify_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = inotify_fill_kinfo, + .fo_cmp = file_kcmp_generic, + .fo_flags = DFLAG_PASSABLE, +}; + +static void filt_inotifydetach(struct knote *kn); +static int filt_inotifyevent(struct knote *kn, long hint); + +static const struct filterops inotify_rfiltops = { + .f_isfd = 1, + .f_detach = filt_inotifydetach, + .f_event = filt_inotifyevent, +}; + +static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures"); + +struct inotify_record { + STAILQ_ENTRY(inotify_record) link; + struct inotify_event ev; +}; + +static uint64_t inotify_ino = 1; + +/* + * On LP64 systems this occupies 64 bytes, so we 
don't get internal + * fragmentation by allocating watches with malloc(9). If the size changes, + * consider using a UMA zone to improve memory efficiency. + */ +struct inotify_watch { + struct inotify_softc *sc; /* back-pointer */ + int wd; /* unique ID */ + uint32_t mask; /* event mask */ + struct vnode *vp; /* vnode being watched, refed */ + RB_ENTRY(inotify_watch) ilink; /* inotify linkage */ + TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */ +}; + +static void +inotify_init(void *arg __unused) +{ + /* Don't let a user hold too many vnodes. */ + inotify_max_user_watches = desiredvnodes / 3; + /* Don't let the system hold too many vnodes. */ + inotify_max_watches = desiredvnodes / 2; +} +SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL); + +static int +inotify_watch_cmp(const struct inotify_watch *a, + const struct inotify_watch *b) +{ + if (a->wd < b->wd) + return (-1); + else if (a->wd > b->wd) + return (1); + else + return (0); +} +RB_HEAD(inotify_watch_tree, inotify_watch); +RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp); + +struct inotify_softc { + struct mtx lock; /* serialize all softc writes */ + STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */ + struct inotify_record overflow; /* preallocated record */ + int nextwatch; /* next watch ID to try */ + int npending; /* number of pending events */ + size_t nbpending; /* bytes available to read */ + uint64_t ino; /* unique identifier */ + struct inotify_watch_tree watches; /* active watches */ + struct selinfo sel; /* select/poll/kevent info */ + struct ucred *cred; /* credential ref */ +}; + +static struct inotify_record * +inotify_dequeue(struct inotify_softc *sc) +{ + struct inotify_record *rec; + + mtx_assert(&sc->lock, MA_OWNED); + KASSERT(!STAILQ_EMPTY(&sc->pending), + ("%s: queue for %p is empty", __func__, sc)); + + rec = STAILQ_FIRST(&sc->pending); + STAILQ_REMOVE_HEAD(&sc->pending, link); + sc->npending--; + sc->nbpending -= sizeof(rec->ev) + rec->ev.len; + return (rec); +} + +static void +inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head) +{ + mtx_assert(&sc->lock, MA_OWNED); + + if (head) + STAILQ_INSERT_HEAD(&sc->pending, rec, link); + else + STAILQ_INSERT_TAIL(&sc->pending, rec, link); + sc->npending++; + sc->nbpending += sizeof(rec->ev) + rec->ev.len; +} + +static int +inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, + struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + int error; + bool first; + + sc = fp->f_data; + error = 0; + + mtx_lock(&sc->lock); + while (STAILQ_EMPTY(&sc->pending)) { + if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) { + mtx_unlock(&sc->lock); + return (EWOULDBLOCK); + } + error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0); + if (error != 0) { + mtx_unlock(&sc->lock); + return (error); + } + } + for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) { + size_t len; + + rec = inotify_dequeue(sc); + len = sizeof(rec->ev) + rec->ev.len; + if (uio->uio_resid < (ssize_t)len) { + inotify_enqueue(sc, rec, true); + if (first) { + error = EXTERROR(EINVAL, + "read buffer is too small"); + } + break; + } + mtx_unlock(&sc->lock); + error = uiomove(&rec->ev, len, uio); +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstruct("inotify", &rec->ev, len); +#endif + mtx_lock(&sc->lock); + if (error != 0) { + inotify_enqueue(sc, rec, true); + mtx_unlock(&sc->lock); + return (error); + } + if (rec == 
&sc->overflow) { + /* + * Signal to inotify_queue_record() that the overflow + * record can be reused. + */ + memset(rec, 0, sizeof(*rec)); + } else { + free(rec, M_INOTIFY); + } + } + mtx_unlock(&sc->lock); + return (error); +} + +static int +inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred, + struct thread *td) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + switch (com) { + case FIONREAD: + *(int *)data = (int)sc->nbpending; + return (0); + case FIONBIO: + case FIOASYNC: + return (0); + default: + return (ENOTTY); + } + + return (0); +} + +static int +inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct inotify_softc *sc; + int revents; + + sc = fp->f_data; + revents = 0; + + mtx_lock(&sc->lock); + if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &sc->sel); + mtx_unlock(&sc->lock); + return (revents); +} + +static void +filt_inotifydetach(struct knote *kn) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + knlist_remove(&sc->sel.si_note, kn, 0); +} + +static int +filt_inotifyevent(struct knote *kn, long hint) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + mtx_assert(&sc->lock, MA_OWNED); + kn->kn_data = sc->nbpending; + return (kn->kn_data > 0); +} + +static int +inotify_kqfilter(struct file *fp, struct knote *kn) +{ + struct inotify_softc *sc; + + if (kn->kn_filter != EVFILT_READ) + return (EINVAL); + sc = fp->f_data; + kn->kn_fop = &inotify_rfiltops; + kn->kn_hook = sc; + knlist_add(&sc->sel.si_note, kn, 0); + return (0); +} + +static int +inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + memset(sb, 0, sizeof(*sb)); + sb->st_mode = S_IFREG | S_IRUSR; + sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX); + mtx_lock(&sc->lock); + sb->st_size = sc->nbpending; + sb->st_blocks = sc->npending; + sb->st_uid = sc->cred->cr_ruid; + sb->st_gid = sc->cred->cr_rgid; + sb->st_ino = sc->ino; + mtx_unlock(&sc->lock); + return (0); +} + +static void +inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch) +{ + struct vnode *vp; + + vp = watch->vp; + mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED); + + atomic_subtract_int(&inotify_watches, 1); + (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + + TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink); + if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify)) + vn_irflag_unset(vp, VIRF_INOTIFY); +} + +/* + * Assumes that the watch has already been removed from its softc. 
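Because the descriptor implements fo_poll and an EVFILT_READ filter whose kn_data reports nbpending, it multiplexes like any other file. A hedged sketch of the kevent(2) side; descriptor setup and error handling elided:

#include <sys/event.h>

void
wait_for_inotify(int kq, int ifd)
{
	struct kevent ev;

	EV_SET(&ev, ifd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &ev, 1, NULL, 0, NULL);	/* register */
	(void)kevent(kq, NULL, 0, &ev, 1, NULL);	/* wait */
	/* ev.data is now the byte count reported by filt_inotifyevent(). */
}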
+ */ +static void +inotify_remove_watch(struct inotify_watch *watch) +{ + struct inotify_softc *sc; + struct vnode *vp; + + sc = watch->sc; + + vp = watch->vp; + mtx_lock(&vp->v_pollinfo->vpi_lock); + inotify_unlink_watch_locked(sc, watch); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + vrele(vp); + free(watch, M_INOTIFY); +} + +static int +inotify_close(struct file *fp, struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + struct inotify_watch *watch; + + sc = fp->f_data; + + mtx_lock(&sc->lock); + (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0); + while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + mtx_unlock(&sc->lock); + inotify_remove_watch(watch); + mtx_lock(&sc->lock); + } + while (!STAILQ_EMPTY(&sc->pending)) { + rec = inotify_dequeue(sc); + if (rec != &sc->overflow) + free(rec, M_INOTIFY); + } + mtx_unlock(&sc->lock); + seldrain(&sc->sel); + knlist_destroy(&sc->sel.si_note); + mtx_destroy(&sc->lock); + crfree(sc->cred); + free(sc, M_INOTIFY); + return (0); +} + +static int +inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + kif->kf_type = KF_TYPE_INOTIFY; + kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending; + kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending; + return (0); +} + +int +inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp) +{ + struct inotify_softc *sc; + int fflags; + + if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0) + return (EINVAL); + + if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1, + inotify_max_user_instances)) + return (EMFILE); + + sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO); + sc->nextwatch = 1; /* Required for compatibility. */ + STAILQ_INIT(&sc->pending); + RB_INIT(&sc->watches); + mtx_init(&sc->lock, "inotify", NULL, MTX_DEF); + knlist_init_mtx(&sc->sel.si_note, &sc->lock); + sc->cred = crhold(td->td_ucred); + sc->ino = atomic_fetchadd_64(&inotify_ino, 1); + + fflags = FREAD; + if ((flags & IN_NONBLOCK) != 0) + fflags |= FNONBLOCK; + if ((flags & IN_CLOEXEC) != 0) + *fflagsp |= O_CLOEXEC; + finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops); + + return (0); +} + +static struct inotify_record * +inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event, + uint32_t cookie, int waitok) +{ + struct inotify_event *evp; + struct inotify_record *rec; + + rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY, + waitok | M_ZERO); + if (rec == NULL) + return (NULL); + evp = &rec->ev; + evp->wd = wd; + evp->mask = event; + evp->cookie = cookie; + evp->len = _IN_NAMESIZE(namelen); + if (name != NULL) + memcpy(evp->name, name, namelen); + return (rec); +} + +static bool +inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp) +{ + struct inotify_record *prev; + + mtx_assert(&sc->lock, MA_OWNED); + + prev = STAILQ_LAST(&sc->pending, inotify_record, link); + return (prev != NULL && prev->ev.mask == evp->mask && + prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie && + prev->ev.len == evp->len && + memcmp(prev->ev.name, evp->name, evp->len) == 0); +} + +static void +inotify_overflow_event(struct inotify_event *evp) +{ + evp->mask = IN_Q_OVERFLOW; + evp->wd = -1; + evp->cookie = 0; + evp->len = 0; +} + +/* + * Put an event record on the queue for an inotify descriptor. Return false if + * the record was not enqueued for some reason, true otherwise.
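On the consumer side of the overflow protocol implemented by inotify_queue_record() below, a full queue or an allocation failure surfaces as a single IN_Q_OVERFLOW record with wd == -1, after which an unknown number of events may have been lost. A hedged sketch; rescan() is a hypothetical application-specific recovery step:

#include <sys/inotify.h>

static void
rescan(void)
{
	/* Application-specific: re-stat watched trees, rebuild state. */
}

static void
consume(const struct inotify_event *ev)
{
	if ((ev->mask & IN_Q_OVERFLOW) != 0) {
		rescan();
		return;
	}
	/* ... normal dispatch on ev->wd and ev->mask ... */
}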
+ */ +static bool +inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec) +{ + struct inotify_event *evp; + + mtx_assert(&sc->lock, MA_OWNED); + + evp = &rec->ev; + if (__predict_false(rec == &sc->overflow)) { + /* + * Is the overflow record already in the queue? If so, there's + * not much else we can do: we're here because a kernel memory + * shortage prevented new record allocations. + */ + counter_u64_add(inotify_event_drops, 1); + if (evp->mask == IN_Q_OVERFLOW) + return (false); + inotify_overflow_event(evp); + } else { + /* Try to coalesce duplicate events. */ + if (inotify_coalesce && inotify_can_coalesce(sc, evp)) + return (false); + + /* + * Would this one overflow the queue? If so, convert it to an + * overflow event and try again to coalesce. + */ + if (sc->npending >= inotify_max_queued_events) { + counter_u64_add(inotify_event_drops, 1); + inotify_overflow_event(evp); + if (inotify_can_coalesce(sc, evp)) + return (false); + } + } + inotify_enqueue(sc, rec, false); + selwakeup(&sc->sel); + KNOTE_LOCKED(&sc->sel.si_note, 0); + wakeup(&sc->pending); + return (true); +} + +static int +inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen, + int event, uint32_t cookie) +{ + struct inotify_watch key; + struct inotify_softc *sc; + struct inotify_record *rec; + int relecount; + bool allocfail; + + relecount = 0; + + sc = watch->sc; + rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie, + M_NOWAIT); + if (rec == NULL) { + rec = &sc->overflow; + allocfail = true; + } else { + allocfail = false; + } + + mtx_lock(&sc->lock); + if (!inotify_queue_record(sc, rec) && rec != &sc->overflow) + free(rec, M_INOTIFY); + if ((watch->mask & IN_ONESHOT) != 0 || + (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) { + if (!allocfail) { + rec = inotify_alloc_record(watch->wd, NULL, 0, + IN_IGNORED, 0, M_NOWAIT); + if (rec == NULL) + rec = &sc->overflow; + if (!inotify_queue_record(sc, rec) && + rec != &sc->overflow) + free(rec, M_INOTIFY); + } + + /* + * Remove the watch, taking care to handle races with + * inotify_close(). + */ + key.wd = watch->wd; + if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + inotify_unlink_watch_locked(sc, watch); + free(watch, M_INOTIFY); + + /* Defer vrele() until locks are dropped. */ + relecount++; + } + } + mtx_unlock(&sc->lock); + return (relecount); +} + +void +inotify_log(struct vnode *vp, const char *name, size_t namelen, int event, + uint32_t cookie) +{ + struct inotify_watch *watch, *tmp; + int relecount; + + KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0, + ("inotify_log: invalid event %#x", event)); + + relecount = 0; + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) { + KASSERT(watch->vp == vp, + ("inotify_log: watch %p vp != vp", watch)); + if ((watch->mask & event) != 0 || event == IN_UNMOUNT) { + relecount += inotify_log_one(watch, name, namelen, event, + cookie); + } + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + for (int i = 0; i < relecount; i++) + vrele(vp); +} + +/* + * An inotify event occurred on a watched vnode. + */ +void +vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp, + int event, uint32_t cookie) +{ + int isdir; + + VNPASS(vp->v_holdcnt > 0, vp); + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + + if (dvp != NULL) { + VNPASS(dvp->v_holdcnt > 0, dvp); + + /* + * Should we log an event for the vnode itself?
+ */ + if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) { + int selfevent; + + switch (event) { + case _IN_MOVE_DELETE: + case IN_DELETE: + /* + * IN_DELETE_SELF is only generated when the + * last hard link of a file is removed. + */ + selfevent = IN_DELETE_SELF; + if (vp->v_type != VDIR) { + struct vattr va; + int error; + + error = VOP_GETATTR(vp, &va, + cnp->cn_cred); + if (error == 0 && va.va_nlink != 0) + selfevent = 0; + } + break; + case IN_MOVED_FROM: + cookie = 0; + selfevent = IN_MOVE_SELF; + break; + case _IN_ATTRIB_LINKCOUNT: + selfevent = IN_ATTRIB; + break; + default: + selfevent = event; + break; + } + + if ((selfevent & ~_IN_DIR_EVENTS) != 0) { + inotify_log(vp, NULL, 0, selfevent | isdir, + cookie); + } + } + + /* + * Something is watching the directory through which this vnode + * was referenced, so we may need to log the event. + */ + if ((event & IN_ALL_EVENTS) != 0 && + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) { + inotify_log(dvp, cnp->cn_nameptr, + cnp->cn_namelen, event | isdir, cookie); + } + } else { + /* + * We don't know which watched directory might contain the + * vnode, so we have to fall back to searching the name cache. + */ + cache_vop_inotify(vp, event, cookie); + } +} + +int +vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask, + uint32_t *wdp, struct thread *td) +{ + struct inotify_watch *watch, *watch1; + uint32_t wd; + + /* + * If this is a directory, make sure all of its entries are present in + * the name cache so that we're able to look them up if an event occurs. + * The persistent reference on the directory prevents the outgoing name + * cache entries from being reclaimed. + */ + if (vp->v_type == VDIR) { + struct dirent *dp; + char *buf; + off_t off; + size_t buflen, len; + int eof, error; + + buflen = 128 * sizeof(struct dirent); + buf = malloc(buflen, M_TEMP, M_WAITOK); + + error = 0; + len = off = eof = 0; + for (;;) { + struct nameidata nd; + + error = vn_dir_next_dirent(vp, td, buf, buflen, &dp, + &len, &off, &eof); + if (error != 0) + break; + if (len == 0) + /* Finished reading. */ + break; + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + + /* + * namei() consumes a reference on the starting + * directory if it's specified as a vnode. + */ + vrefact(vp); + VOP_UNLOCK(vp); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, + dp->d_name, vp); + error = namei(&nd); + vn_lock(vp, LK_SHARED | LK_RETRY); + if (error != 0) + break; + vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT); + vrele(nd.ni_vp); + } + free(buf, M_TEMP); + if (error != 0) + return (error); + } + + /* + * The vnode referenced in kern_inotify_add_watch() might be different + * than this one if nullfs is in the picture. + */ + vrefact(vp); + watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO); + watch->sc = sc; + watch->vp = vp; + watch->mask = mask; + + /* + * Are we updating an existing watch? Search the vnode's list rather + * than that of the softc, as the former is likely to be shorter. + */ + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) { + if (watch1->sc == sc) + break; + } + mtx_lock(&sc->lock); + if (watch1 != NULL) { + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + /* + * We found an existing watch, update it based on our flags. 
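The update semantics implemented just below mirror the Linux API: a plain call on an already-watched vnode replaces the mask, IN_MASK_ADD ORs into it, and IN_MASK_CREATE refuses with EEXIST; in the first two cases the existing watch descriptor is returned. A hedged userspace sketch, with a hypothetical path:

#include <sys/inotify.h>
#include <fcntl.h>

void
update_watch(int ifd)
{
	int wd1, wd2;

	wd1 = inotify_add_watch_at(ifd, AT_FDCWD, "/tmp", IN_CREATE);
	wd2 = inotify_add_watch_at(ifd, AT_FDCWD, "/tmp",
	    IN_DELETE | IN_MASK_ADD);
	/* wd2 == wd1; the watch mask is now IN_CREATE | IN_DELETE. */
	(void)wd1;
	(void)wd2;
}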
+ */ + if ((mask & IN_MASK_CREATE) != 0) { + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EEXIST); + } + if ((mask & IN_MASK_ADD) != 0) + watch1->mask |= mask; + else + watch1->mask = mask; + *wdp = watch1->wd; + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EJUSTRETURN); + } + + /* + * We're creating a new watch. Add it to the softc and vnode watch + * lists. + */ + do { + struct inotify_watch key; + + /* + * Search for the next available watch descriptor. This is + * implemented so as to avoid reusing watch descriptors for as + * long as possible. + */ + key.wd = wd = sc->nextwatch++; + watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key); + } while (watch1 != NULL || wd == 0); + watch->wd = wd; + RB_INSERT(inotify_watch_tree, &sc->watches, watch); + TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink); + mtx_unlock(&sc->lock); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + vn_irflag_set_cond(vp, VIRF_INOTIFY); + + *wdp = wd; + + return (0); +} + +void +vn_inotify_revoke(struct vnode *vp) +{ + if (vp->v_pollinfo == NULL) { + /* This is a nullfs vnode which shadows a watched vnode. */ + return; + } + inotify_log(vp, NULL, 0, IN_UNMOUNT, 0); +} + +static int +fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp, + struct file **fpp) +{ + struct file *fp; + int error; + + error = fget(td, fd, needrightsp, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_INOTIFY) { + fdrop(fp, td); + return (EINVAL); + } + *fpp = fp; + return (0); +} + +int +kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask, + struct thread *td) +{ + struct nameidata nd; + struct file *fp; + struct inotify_softc *sc; + struct vnode *vp; + uint32_t wd; + int count, error; + + fp = NULL; + vp = NULL; + + if ((mask & IN_ALL_EVENTS) == 0) + return (EXTERROR(EINVAL, "no events specified")); + if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) == + (IN_MASK_ADD | IN_MASK_CREATE)) + return (EXTERROR(EINVAL, + "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive")); + if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0) + return (EXTERROR(EINVAL, "unrecognized flag")); + + error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp); + if (error != 0) + return (error); + sc = fp->f_data; + + NDINIT_AT(&nd, LOOKUP, + ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF | + LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd); + error = namei(&nd); + if (error != 0) + goto out; + NDFREE_PNBUF(&nd); + vp = nd.ni_vp; + + error = VOP_ACCESS(vp, VREAD, td->td_ucred, td); + if (error != 0) + goto out; + + if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + count = atomic_fetchadd_int(&inotify_watches, 1); + if (count > inotify_max_watches) { + atomic_subtract_int(&inotify_watches, 1); + error = ENOSPC; + goto out; + } + if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1, + inotify_max_user_watches)) { + atomic_subtract_int(&inotify_watches, 1); + error = ENOSPC; + goto out; + } + error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td); + if (error != 0) { + atomic_subtract_int(&inotify_watches, 1); + (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + if (error == EJUSTRETURN) { + /* We updated an existing watch, everything is ok. 
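Putting the new system calls together: inotify_add_watch_at(2) and inotify_rm_watch(2) are defined in syscalls.master above, while inotify_init1() is assumed here to be the libc wrapper over __specialfd(SPECIALFD_INOTIFY, ...) and is therefore hypothetical. The buffer is sized for one maximal event because inotify_read() fails a too-small first read with EINVAL. A hedged sketch, error handling elided:

#include <sys/inotify.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

void
watch_tmp(void)
{
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
	struct inotify_event *ev;
	ssize_t n;
	char *p;
	int ifd, wd;

	ifd = inotify_init1(IN_CLOEXEC);	/* assumed wrapper */
	wd = inotify_add_watch_at(ifd, AT_FDCWD, "/tmp",
	    IN_CREATE | IN_DELETE);
	n = read(ifd, buf, sizeof(buf));	/* blocks for >= 1 event */
	for (p = buf; p < buf + n; p += sizeof(*ev) + ev->len) {
		ev = (struct inotify_event *)p;
		printf("wd %d mask %#x name %s\n", ev->wd, ev->mask,
		    ev->len > 0 ? ev->name : "");
	}
	(void)inotify_rm_watch(ifd, wd);
	close(ifd);
}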
*/ + error = 0; + } else { + goto out; + } + } + td->td_retval[0] = wd; + +out: + if (vp != NULL) + vput(vp); + fdrop(fp, td); + return (error); +} + +int +sys_inotify_add_watch_at(struct thread *td, + struct inotify_add_watch_at_args *uap) +{ + return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path, + uap->mask, td)); +} + +int +kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td) +{ + struct file *fp; + struct inotify_softc *sc; + struct inotify_record *rec; + struct inotify_watch key, *watch; + int error; + + error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp); + if (error != 0) + return (error); + sc = fp->f_data; + + rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK); + + /* + * For compatibility with Linux, we do not remove pending events + * associated with the watch. Watch descriptors are implemented so as + * to avoid being reused for as long as possible, so one hopes that any + * pending events from the removed watch descriptor will be removed + * before the watch descriptor is recycled. + */ + key.wd = wd; + mtx_lock(&sc->lock); + watch = RB_FIND(inotify_watch_tree, &sc->watches, &key); + if (watch == NULL) { + free(rec, M_INOTIFY); + error = EINVAL; + } else { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + if (!inotify_queue_record(sc, rec)) { + free(rec, M_INOTIFY); + error = 0; + } + } + mtx_unlock(&sc->lock); + if (watch != NULL) + inotify_remove_watch(watch); + fdrop(fp, td); + return (error); +} + +int +sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap) +{ + return (kern_inotify_rm_watch(uap->fd, uap->wd, td)); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 86c7bdaa02c0..fb3e6a7a2534 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -75,14 +75,20 @@ static void NDVALIDATE_impl(struct nameidata *, int); #endif /* + * Reset ndp to its original state. + */ +#define NDRESET(ndp) do { \ + NDREINIT_DBG(ndp); \ + ndp->ni_resflags = 0; \ + ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \ +} while (0) +/* * Prepare namei() to restart. Reset components to its original state and set * ISRESTARTED flag which signals the underlying lookup code to change the root * from ABI root to actual root and prevents a further restarts. */ #define NDRESTART(ndp) do { \ - NDREINIT_DBG(ndp); \ - ndp->ni_resflags = 0; \ - ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \ + NDRESET(ndp); \ ndp->ni_cnd.cn_flags |= ISRESTARTED; \ } while (0) @@ -162,8 +168,8 @@ static struct vop_vector crossmp_vnodeops = { */ struct nameicap_tracker { - struct vnode *dp; TAILQ_ENTRY(nameicap_tracker) nm_link; + struct mount *mp; }; /* Zone for cap mode tracker elements used for dotdot capability checks. 
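The IN_IGNORED record queued by the rm_watch and IN_ONESHOT paths above is the consumer's cue to retire a watch descriptor: no further events for that wd will follow until the kernel eventually recycles it. A hedged sketch, with a hypothetical bookkeeping helper:

#include <sys/inotify.h>

static void
watch_forget(int wd)
{
	/* Hypothetical: drop this wd from the application's table. */
	(void)wd;
}

static void
handle_event(const struct inotify_event *ev)
{
	if ((ev->mask & IN_IGNORED) != 0) {
		watch_forget(ev->wd);
		return;
	}
	/* ... dispatch on ev->mask ... */
}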
*/ @@ -192,49 +198,75 @@ SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, "enables \"..\" components in path lookup in capability mode " "on non-local mount"); -static void +static int nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) { struct nameicap_tracker *nt; + struct mount *mp; + int error; if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) - return; + return (0); + mp = NULL; + error = VOP_GETWRITEMOUNT(dp, &mp); + if (error != 0) + return (error); nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head); - if (nt != NULL && nt->dp == dp) - return; + if (nt != NULL && nt->mp == mp) { + vfs_rel(mp); + return (0); + } nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK); - vhold(dp); - nt->dp = dp; - TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); + nt->mp = mp; + error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0); + if (error != 0) { + MPASS(ndp->ni_nctrack_mnt == NULL); + ndp->ni_nctrack_mnt = mp; + free(nt, M_NAMEITRACKER); + error = ERESTART; + } else { + TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); + } + return (error); } static void -nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first) +nameicap_cleanup(struct nameidata *ndp, int error) { struct nameicap_tracker *nt, *nt1; + struct mount *mp; + + KASSERT((ndp->ni_nctrack_mnt == NULL && + TAILQ_EMPTY(&ndp->ni_cap_tracker)) || + (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, + ("tracker active and not strictrelative")); - nt = first; - TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { + TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { + mp = nt->mp; + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(mp); TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); - vdrop(nt->dp); free(nt, M_NAMEITRACKER); } -} -static void -nameicap_cleanup(struct nameidata *ndp) -{ - KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || - (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); - nameicap_cleanup_from(ndp, NULL); + mp = ndp->ni_nctrack_mnt; + if (mp != NULL) { + if (error == ERESTART) { + lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0); + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + } + vfs_rel(mp); + ndp->ni_nctrack_mnt = NULL; + } } /* - * For dotdot lookups in capability mode, only allow the component - * lookup to succeed if the resulting directory was already traversed - * during the operation. This catches situations where already - * traversed directory is moved to different parent, and then we walk - * over it with dotdots. + * For dotdot lookups in capability mode, disallow walking over the + * directory ni_rbeneath_dpp that was used as the starting point of + * the lookup. Since we take the mnt_renamelocks of all mounts we + * ever walked over during lookup, parallel renames are disabled. + * This prevents the situation where we circumvent the walk over + * ni_rbeneath_dpp by following dotdots.
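The observable effect, sketched from userspace under the assumption that the vfs.lookup_cap_dotdot knob remains enabled; the paths are hypothetical:

#include <sys/capsicum.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>

void
dotdot_confinement(void)
{
	int dfd, fd;

	dfd = open("/tmp/sandbox", O_DIRECTORY);
	cap_enter();
	/* ".." walking over the lookup's starting directory fails... */
	fd = openat(dfd, "../etc/passwd", O_RDONLY);
	assert(fd == -1 && errno == ENOTCAPABLE);
	/* ...while ".." that stays beneath dfd is still permitted. */
	fd = openat(dfd, "sub/../file", O_RDONLY);
	(void)fd;
}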
* * Also allow to force failure of dotdot lookups for non-local * filesystems, where external agents might assist local lookups to @@ -243,7 +275,6 @@ nameicap_cleanup(struct nameidata *ndp) static int nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) { - struct nameicap_tracker *nt; struct mount *mp; if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf & @@ -253,22 +284,16 @@ nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR)) NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf); if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0) - return (ENOTCAPABLE); + goto violation; + if (dp == ndp->ni_rbeneath_dpp) + goto violation; mp = dp->v_mount; if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0) - goto capfail; - TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, - nm_link) { - if (dp == nt->dp) { - nt = TAILQ_NEXT(nt, nm_link); - if (nt != NULL) - nameicap_cleanup_from(ndp, nt); - return (0); - } - } + goto violation; + return (0); -capfail: +violation: if (__predict_false((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0)) NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf); return (ENOTCAPABLE); @@ -394,6 +419,8 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp) NI_LCF_CAP_DOTDOT; } } + if (error == 0 && (ndp->ni_lcf & NI_LCF_STRICTREL) != 0) + ndp->ni_rbeneath_dpp = *dpp; /* * If we are auditing the kernel pathname, save the user pathname. @@ -631,6 +658,7 @@ restart: error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); return (error); @@ -661,12 +689,12 @@ restart: else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && (cnp->cn_flags & ISRESTARTED) == 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, ERESTART); NDRESTART(ndp); goto restart; } return (error); case CACHE_FPL_STATUS_PARTIAL: - TAILQ_INIT(&ndp->ni_cap_tracker); dp = ndp->ni_startdir; break; case CACHE_FPL_STATUS_DESTROYED: @@ -674,18 +702,21 @@ restart: error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); return (error); } cnp->cn_nameptr = cnp->cn_pnbuf; /* FALLTHROUGH */ case CACHE_FPL_STATUS_ABORTED: - TAILQ_INIT(&ndp->ni_cap_tracker); MPASS(ndp->ni_lcf == 0); if (*cnp->cn_pnbuf == '\0') { if ((cnp->cn_flags & EMPTYPATH) != 0) { - return (namei_emptypath(ndp)); + error = namei_emptypath(ndp); + nameicap_cleanup(ndp, error); + return (error); } namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, ENOENT); SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL, false, ndp); return (ENOENT); @@ -693,6 +724,7 @@ restart: error = namei_setup(ndp, &dp, &pwd); if (error != 0) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); return (error); } break; @@ -705,16 +737,23 @@ restart: ndp->ni_startdir = dp; error = vfs_lookup(ndp); if (error != 0) { - if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && - error == ENOENT && - (cnp->cn_flags & ISRESTARTED) == 0)) { - nameicap_cleanup(ndp); - pwd_drop(pwd); - namei_cleanup_cnp(cnp); - NDRESTART(ndp); - goto restart; - } else + uint64_t was_restarted; + bool abi_restart; + + was_restarted = ndp->ni_cnd.cn_flags & + ISRESTARTED; + abi_restart = pwd->pwd_adir != pwd->pwd_rdir && + error == ENOENT && was_restarted == 0; + if (error != ERESTART && !abi_restart) goto out; + nameicap_cleanup(ndp, error); + pwd_drop(pwd); + namei_cleanup_cnp(cnp); + NDRESET(ndp); + if 
(abi_restart) + was_restarted = ISRESTARTED; + ndp->ni_cnd.cn_flags |= was_restarted; + goto restart; } /* @@ -723,7 +762,7 @@ restart: if ((cnp->cn_flags & ISSYMLINK) == 0) { SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, false, ndp); - nameicap_cleanup(ndp); + nameicap_cleanup(ndp, 0); pwd_drop(pwd); NDVALIDATE(ndp); return (0); @@ -756,10 +795,10 @@ restart: ndp->ni_vp = NULL; vrele(ndp->ni_dvp); out: - MPASS(error != 0); + MPASS(error != 0 && error != ERESTART); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); namei_cleanup_cnp(cnp); - nameicap_cleanup(ndp); + nameicap_cleanup(ndp, error); pwd_drop(pwd); return (error); } @@ -1185,7 +1224,9 @@ dirloop: } } - nameicap_tracker_add(ndp, dp); + error = nameicap_tracker_add(ndp, dp); + if (error != 0) + goto bad; /* * Make sure degenerate names don't get here, their handling was @@ -1210,9 +1251,7 @@ dirloop: * the jail or chroot, don't let them out. * 5. If doing a capability lookup and lookup_cap_dotdot is * enabled, return ENOTCAPABLE if the lookup would escape - * from the initial file descriptor directory. Checks are - * done by ensuring that namei() already traversed the - * result of dotdot lookup. + * from the initial file descriptor directory. */ if (cnp->cn_flags & ISDOTDOT) { if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR | @@ -1238,7 +1277,7 @@ dirloop: NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf); if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0) { error = ENOTCAPABLE; - goto capdotdot; + goto bad; } } if (isroot || ((dp->v_vflag & VV_ROOT) != 0 && @@ -1261,11 +1300,6 @@ dirloop: vn_lock(dp, enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); - error = nameicap_check_dotdot(ndp, dp); - if (error != 0) { -capdotdot: - goto bad; - } } } @@ -1314,7 +1348,9 @@ unionlookup: vn_lock(dp, enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); - nameicap_tracker_add(ndp, dp); + error = nameicap_tracker_add(ndp, dp); + if (error != 0) + goto bad; goto unionlookup; } @@ -1415,7 +1451,7 @@ nextname: goto dirloop; } if (cnp->cn_flags & ISDOTDOT) { - error = nameicap_check_dotdot(ndp, ndp->ni_vp); + error = nameicap_check_dotdot(ndp, ndp->ni_dvp); if (error != 0) goto bad2; } @@ -1485,8 +1521,11 @@ success: } success_right_lock: if (ndp->ni_vp != NULL) { - if ((cnp->cn_flags & ISDOTDOT) == 0) - nameicap_tracker_add(ndp, ndp->ni_vp); + if ((cnp->cn_flags & ISDOTDOT) == 0) { + error = nameicap_tracker_add(ndp, ndp->ni_vp); + if (error != 0) + goto bad2; + } if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS) return (vfs_lookup_failifexists(ndp)); } diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index cb18468d28bc..8e64a7fe966b 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags) mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); + lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0); mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; @@ -170,6 +171,7 @@ mount_fini(void *mem, int size) mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu); + lockdestroy(&mp->mnt_renamelock); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index dc2fb59fb81c..918b256e6c59 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -38,7 +38,6 @@ * 
External virtual filesystem routines */ -#include <sys/cdefs.h> #include "opt_ddb.h" #include "opt_watchdog.h" @@ -57,6 +56,7 @@ #include <sys/extattr.h> #include <sys/file.h> #include <sys/fcntl.h> +#include <sys/inotify.h> #include <sys/jail.h> #include <sys/kdb.h> #include <sys/kernel.h> @@ -5246,7 +5246,8 @@ destroy_vpollinfo_free(struct vpollinfo *vi) static void destroy_vpollinfo(struct vpollinfo *vi) { - + KASSERT(TAILQ_EMPTY(&vi->vpi_inotify), + ("%s: pollinfo %p has lingering watches", __func__, vi)); knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); @@ -5260,12 +5261,13 @@ v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; - if (vp->v_pollinfo != NULL) + if (atomic_load_ptr(&vp->v_pollinfo) != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); + TAILQ_INIT(&vi->vpi_inotify); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); @@ -5851,6 +5853,8 @@ vop_rename_pre(void *ap) struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS + struct mount *tmp; + if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); @@ -5868,6 +5872,11 @@ vop_rename_pre(void *ap) if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); + + tmp = NULL; + VOP_GETWRITEMOUNT(a->a_tdvp, &tmp); + lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED); + vfs_rel(tmp); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and @@ -6057,6 +6066,28 @@ vop_need_inactive_debugpost(void *ap, int rc) #endif void +vop_allocate_post(void *ap, int rc) +{ + struct vop_allocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); +} + +void +vop_copy_file_range_post(void *ap, int rc) +{ + struct vop_copy_file_range_args *a; + + a = ap; + if (rc == 0) { + INOTIFY(a->a_invp, IN_ACCESS); + INOTIFY(a->a_outvp, IN_MODIFY); + } +} + +void vop_create_pre(void *ap) { struct vop_create_args *a; @@ -6076,8 +6107,20 @@ vop_create_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } +} + +void +vop_deallocate_post(void *ap, int rc) +{ + struct vop_deallocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); } void @@ -6122,8 +6165,10 @@ vop_deleteextattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6153,6 +6198,8 @@ vop_link_post(void *ap, int rc) if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); + INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE); } } @@ -6176,8 +6223,10 @@ vop_mkdir_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } #ifdef DEBUG_VFS_LOCKS @@ -6212,8 +6261,10 @@ vop_mknod_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6225,8 +6276,10 @@ vop_reclaim_post(void *ap, int rc) a = ap; vp = a->a_vp; 
ASSERT_VOP_IN_SEQC(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); + INOTIFY_REVOKE(vp); + } } void @@ -6257,6 +6310,8 @@ vop_remove_post(void *ap, int rc) if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6288,6 +6343,8 @@ vop_rename_post(void *ap, int rc) VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); + INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp, + a->a_tdvp, a->a_tcnp); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); @@ -6327,6 +6384,7 @@ vop_rmdir_post(void *ap, int rc) vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6350,8 +6408,10 @@ vop_setattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6396,8 +6456,10 @@ vop_setextattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6420,8 +6482,10 @@ vop_symlink_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6429,8 +6493,10 @@ vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); + INOTIFY(a->a_vp, IN_OPEN); + } } void @@ -6442,6 +6508,8 @@ vop_close_post(void *ap, int rc) !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); + INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE); } } @@ -6450,8 +6518,10 @@ vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } void @@ -6468,8 +6538,10 @@ vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } static struct knlist fs_knlist; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index c236f241bf20..c71e0d9ee569 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -3766,7 +3766,7 @@ int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { - struct mount *mp = NULL; + struct mount *mp, *tmp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; uint64_t tondflags; @@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, short irflag; again: + tmp = mp = NULL; bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { @@ -3809,6 +3810,7 @@ again: tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { +again1: NDFREE_PNBUF(&fromnd); NDFREE_PNBUF(&tond); if (tvp != NULL) @@ -3819,11 +3821,25 @@ again: vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); + if (tmp != NULL) { + lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL); + lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL); + vfs_rel(tmp); + tmp = NULL; + } error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH); if (error != 0) return (error); goto again; } + error = VOP_GETWRITEMOUNT(tdvp, &tmp); + if (error != 0 || tmp == NULL) + goto again1; + error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL); + if (error != 0) { + vn_finished_write(mp); + goto again1; + } irflag = vn_irflag_read(fvp); if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) || (irflag & VIRF_NAMEDDIR) != 0) { @@ -3884,6 +3900,8 @@ out: vrele(fromnd.ni_dvp); vrele(fvp); } + lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(tmp); vn_finished_write(mp); out1: if (error == ERESTART) @@ -4296,10 +4314,6 @@ kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: - if (vp->v_type != VDIR) { - error = EINVAL; - goto fail; - } if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) { error = ENOENT; goto fail; @@ -4312,6 +4326,19 @@ unionread: auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); + /* + * We want to return ENOTDIR for anything that is not VDIR, but + * not for VBAD, and we can't check for VBAD while the vnode is + * unlocked. 
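A note on the kern_renameat() hunk above: the new per-mount mnt_renamelock is taken with LK_NOWAIT while the vnode locks from the two lookups are still held, presumably to avoid sleeping on the rename lock under vnode locks (a lock-ordering hazard). On contention everything is unwound first, and only then does the thread park on the lock before restarting. A condensed sketch of just that control flow, built only from calls visible in the hunk (error handling elided; not the exact code):

	/* Both namei() lookups succeeded; tdvp and friends are locked. */
	error = VOP_GETWRITEMOUNT(tdvp, &tmp);
	if (error != 0 || tmp == NULL)
		goto unwind;
	if (lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		vn_finished_write(mp);
		goto unwind;		/* the "again1" label in the hunk */
	}
	/* ... VOP_RENAME() runs with mnt_renamelock held ... */

unwind:
	/* Drop all vnode locks and references (elided); only now is it
	 * safe to sleep waiting for the current rename to finish. */
	if (tmp != NULL) {
		lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL);
		lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL);
		vfs_rel(tmp);
	}
	goto again;			/* redo both lookups from scratch */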
+ */ + if (vp->v_type != VDIR) { + if (vp->v_type == VBAD) + error = EBADF; + else + error = ENOTDIR; + VOP_UNLOCK(vp); + goto fail; + } AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index b29286654f60..6451c9e07a60 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -41,6 +41,7 @@ */ #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include <sys/param.h> #include <sys/systm.h> @@ -51,6 +52,7 @@ #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filio.h> +#include <sys/inotify.h> #include <sys/ktr.h> #include <sys/ktrace.h> #include <sys/limits.h> @@ -86,6 +88,10 @@ #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif + static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; @@ -303,7 +309,8 @@ restart: NDREINIT(ndp); goto restart; } - if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) + if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 || + (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, @@ -479,6 +486,7 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; + INOTIFY(vp, IN_OPEN); return (0); } @@ -1741,6 +1749,8 @@ vn_truncate_locked(struct vnode *vp, off_t length, bool sync, vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + if (error == 0) + INOTIFY(vp, IN_MODIFY); } return (error); } @@ -3005,6 +3015,24 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, } } #endif + +#ifdef HWT_HOOKS + if (HWT_HOOK_INSTALLED && (prot & VM_PROT_EXECUTE) != 0 && + error == 0) { + struct hwt_record_entry ent; + char *fullpath; + char *freepath; + + if (vn_fullpath(vp, &fullpath, &freepath) == 0) { + ent.fullpath = fullpath; + ent.addr = (uintptr_t) *addr; + ent.record_type = HWT_RECORD_MMAP; + HWT_CALL_HOOK(td, HWT_MMAP, &ent); + free(freepath, M_TEMP); + } + } +#endif + return (error); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index a2b6a7c8ff9f..38138a4af921 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -702,6 +702,7 @@ vop_vptocnp { %% allocate vp E E E +%! allocate post vop_allocate_post vop_allocate { IN struct vnode *vp; @@ -786,6 +787,7 @@ vop_fdatasync { %% copy_file_range invp U U U %% copy_file_range outvp U U U +%! copy_file_range post vop_copy_file_range_post vop_copy_file_range { IN struct vnode *invp; @@ -810,6 +812,7 @@ vop_vput_pair { %% deallocate vp L L L +%! deallocate post vop_deallocate_post vop_deallocate { IN struct vnode *vp; @@ -821,6 +824,27 @@ vop_deallocate { }; +%% inotify vp - - - + +vop_inotify { + IN struct vnode *vp; + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int event; + IN uint32_t cookie; +}; + + +%% inotify_add_watch vp L L L + +vop_inotify_add_watch { + IN struct vnode *vp; + IN struct inotify_softc *sc; + IN uint32_t mask; + OUT uint32_t *wdp; + IN struct thread *td; +}; + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. 
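For orientation, the two vnode_if.src entries above generate VOP_INOTIFY() and VOP_INOTIFY_ADD_WATCH() wrappers with the arguments in declared order, so event-posting sites end up calling:

	VOP_INOTIFY(vp, dvp, cnp, event, cookie);

The INOTIFY()/INOTIFY_NAME() macros sprinkled through vfs_subr.c are presumably thin wrappers over this VOP. A hypothetical expansion, for illustration only: the real definitions live in sys/inotify.h, and the cheap flag test is an assumption based on the VIRF_INOTIFY check visible in the vn_open changes above:

	#define INOTIFY(vp, ev) do {					\
		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0)		\
			VOP_INOTIFY((vp), NULL, NULL, (ev), 0);		\
	} while (0)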
When merging a new VOP to a stable branch, diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 35cf17be109f..7cb6e2124326 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -141,6 +141,7 @@ SUBDIR= \ ${_hptnr} \ ${_hptrr} \ hwpmc \ + ${_hwt} \ ${_hyperv} \ i2c \ ${_iavf} \ @@ -325,6 +326,7 @@ SUBDIR= \ proto \ pseudofs \ ${_pst} \ + ${_pt} \ pty \ puc \ pwm \ @@ -841,6 +843,7 @@ _iwx= iwx _ixl= ixl _nvdimm= nvdimm _pms= pms +_pt= pt _qat= qat .if ${MK_SOURCELESS_UCODE} != "no" _qatfw= qatfw @@ -859,6 +862,10 @@ _smartpqi= smartpqi _p2sb= p2sb .endif +.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" +_hwt= hwt +.endif + .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "riscv" .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) diff --git a/sys/modules/efirt/Makefile b/sys/modules/efirt/Makefile index 4738996fd4e6..c46484465b68 100644 --- a/sys/modules/efirt/Makefile +++ b/sys/modules/efirt/Makefile @@ -9,7 +9,7 @@ SRCS+= device_if.h bus_if.h clock_if.h DPSRCS+= assym.inc .if ${MACHINE_CPUARCH} == "amd64" -SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h +SRCS+= opt_acpi.h opt_hwpmc_hooks.h opt_kstack_pages.h .endif efirt_support.o: efirt_support.S assym.inc diff --git a/sys/modules/hwt/Makefile b/sys/modules/hwt/Makefile new file mode 100644 index 000000000000..6704e22422d1 --- /dev/null +++ b/sys/modules/hwt/Makefile @@ -0,0 +1,21 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/dev/hwt + +KMOD = hwt +SRCS = \ + hwt.c \ + hwt_backend.c \ + hwt_config.c \ + hwt_context.c \ + hwt_contexthash.c \ + hwt_cpu.c \ + hwt_hook.c \ + hwt_ioctl.c \ + hwt_owner.c \ + hwt_ownerhash.c \ + hwt_record.c \ + hwt_thread.c \ + hwt_vm.c + +.include <bsd.kmod.mk> diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile index 91f20193d878..9f9c9f602cda 100644 --- a/sys/modules/ice/Makefile +++ b/sys/modules/ice/Makefile @@ -13,6 +13,7 @@ SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c SRCS += ice_fw_logging.c ice_ddp_common.c +SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c # RDMA Client interface # TODO: Is this the right way to compile this? diff --git a/sys/modules/iwlwifi/Makefile b/sys/modules/iwlwifi/Makefile index 6e0fea6efc3a..9774c3da61ee 100644 --- a/sys/modules/iwlwifi/Makefile +++ b/sys/modules/iwlwifi/Makefile @@ -4,6 +4,7 @@ DEVIWLWIFIDIR= ${SRCTOP}/sys/contrib/dev/iwlwifi WITH_CONFIG_PM= 0 WITH_DEBUGFS= 1 +WITH_CONFIG_ACPI= 1 KMOD= if_iwlwifi @@ -40,6 +41,12 @@ CFLAGS+= -DCONFIG_PM CFLAGS+= -DCONFIG_PM_SLEEP .endif +.if defined(WITH_CONFIG_ACPI) && ${WITH_CONFIG_ACPI} > 0 +SRCS+= fw/acpi.c +CFLAGS+= -DCONFIG_ACPI +CFLAGS+= -DLINUXKPI_WANT_LINUX_ACPI +.endif + SRCS+= iwl-devtrace.c # Other @@ -56,7 +63,6 @@ CFLAGS+= -DCONFIG_IWLMVM=1 # Helpful after fresh imports. #CFLAGS+= -ferror-limit=0 -#CFLAGS+= -DCONFIG_ACPI=1 #CFLAGS+= -DCONFIG_INET=1 # Need LKPI TSO implementation. 
#CFLAGS+= -DCONFIG_IPV6=1 CFLAGS+= -DCONFIG_IWLWIFI_DEBUG=1 diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile new file mode 100644 index 000000000000..416b072face9 --- /dev/null +++ b/sys/modules/pt/Makefile @@ -0,0 +1,8 @@ + +.PATH: ${SRCTOP}/sys/amd64/pt + +KMOD= pt +SRCS= pt.c pt.h device_if.h bus_if.h +SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h + +.include <bsd.kmod.mk> diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile index 3d8415cf0e57..2a44ae6ddde5 100644 --- a/sys/modules/qlnx/qlnxe/Makefile +++ b/sys/modules/qlnx/qlnxe/Makefile @@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c SRCS+=qlnx_ioctl.c SRCS+=qlnx_os.c +SRCS+=opt_inet.h SRCS+= ${LINUXKPI_GENSRCS} diff --git a/sys/modules/rtw89/Makefile b/sys/modules/rtw89/Makefile index 73945591826c..09580f288c62 100644 --- a/sys/modules/rtw89/Makefile +++ b/sys/modules/rtw89/Makefile @@ -39,6 +39,7 @@ SRCS+= ${LINUXKPI_GENSRCS} SRCS+= opt_wlan.h opt_inet6.h opt_inet.h opt_acpi.h CFLAGS+= -DKBUILD_MODNAME='"rtw89"' +CFLAGS+= -DLINUXKPI_WANT_LINUX_ACPI CFLAGS+= -I${DEVRTW89DIR} CFLAGS+= ${LINUXKPI_INCLUDES} diff --git a/sys/modules/sound/sound/Makefile b/sys/modules/sound/sound/Makefile index d2cfed2f4b6a..f3978e9bd9cc 100644 --- a/sys/modules/sound/sound/Makefile +++ b/sys/modules/sound/sound/Makefile @@ -13,11 +13,11 @@ SRCS+= feeder.c feeder_rate.c feeder_volume.c SRCS+= feeder_chain.c feeder_eq.c feeder_format.c SRCS+= feeder_matrix.c feeder_mixer.c SRCS+= feeder_eq_gen.h feeder_rate_gen.h snd_fxdiv_gen.h -SRCS+= mpu_if.h mpufoi_if.h synth_if.h -SRCS+= mpu_if.c mpufoi_if.c synth_if.c +SRCS+= mpu_if.h mpufoi_if.h +SRCS+= mpu_if.c mpufoi_if.c SRCS+= ac97.c buffer.c channel.c dsp.c SRCS+= mixer.c sndstat.c sound.c vchan.c -SRCS+= midi.c mpu401.c sequencer.c +SRCS+= midi.c mpu401.c feeder_eq_gen.h: ${SYSDIR}/tools/sound/feeder_eq_mkfilter.awk ${AWK} -f ${SYSDIR}/tools/sound/feeder_eq_mkfilter.awk -- ${FEEDER_EQ_PRESETS} > ${.TARGET} diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h index 6eefedba8775..01485cf26e06 100644 --- a/sys/net/ethernet.h +++ b/sys/net/ethernet.h @@ -62,6 +62,8 @@ struct ether_header { u_char ether_shost[ETHER_ADDR_LEN]; u_short ether_type; } __packed; +_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN, + "size of struct ether_header is wrong"); /* * Structure of a 48-bit Ethernet address. @@ -69,6 +71,8 @@ struct ether_header { struct ether_addr { u_char octet[ETHER_ADDR_LEN]; } __packed; +_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN, + "size of struct ether_addr is wrong"); #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ #define ETHER_IS_IPV6_MULTICAST(addr) \ @@ -81,6 +85,23 @@ struct ether_addr { (addr)[3] | (addr)[4] | (addr)[5]) == 0x00) /* + * 802.1q VID constants from IEEE 802.1Q-2014, table 9-2. + */ + +/* Null VID: The tag contains only PCP (priority) and DEI information. */ +#define DOT1Q_VID_NULL 0x0 +/* The default PVID for a bridge port. NB: bridge(4) does not honor this. */ +#define DOT1Q_VID_DEF_PVID 0x1 +/* The default SR_PVID for SRP Stream related traffic. */ +#define DOT1Q_VID_DEF_SR_PVID 0x2 +/* A VID reserved for implementation use, not permitted on the wire. */ +#define DOT1Q_VID_RSVD_IMPL 0xfff +/* The lowest valid VID. */ +#define DOT1Q_VID_MIN 0x1 +/* The highest valid VID. */ +#define DOT1Q_VID_MAX 0xffe + +/* * This is the type of the VLAN ID inside the tag, not the tag itself. 
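Since the DOT1Q_VID_* constants are moving from if_vlan_var.h into ethernet.h here, a minimal standalone illustration of the range they encode may help; the constants are duplicated locally so the snippet compiles outside the tree, and the check matches the rejection of DOT1Q_VID_NULL and DOT1Q_VID_RSVD_IMPL in the bridge ioctl handlers later in this diff:

	#include <stdbool.h>
	#include <stdint.h>

	#define DOT1Q_VID_NULL		0x0	/* priority-only tag */
	#define DOT1Q_VID_RSVD_IMPL	0xfff	/* never valid on the wire */
	#define DOT1Q_VID_MIN		0x1
	#define DOT1Q_VID_MAX		0xffe

	/* True iff vid can identify an actual VLAN on the wire. */
	static bool
	dot1q_vid_valid(uint16_t vid)
	{
		return (vid >= DOT1Q_VID_MIN && vid <= DOT1Q_VID_MAX);
	}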
*/ typedef uint16_t ether_vlanid_t; @@ -95,6 +116,8 @@ struct ether_vlan_header { uint16_t evl_tag; uint16_t evl_proto; } __packed; +_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN, + "size of struct ether_vlan_header is wrong"); #define EVL_VLID_MASK 0x0FFF #define EVL_PRI_MASK 0xE000 diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index bc421a8e156d..5b3ee740d75e 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -254,6 +254,8 @@ struct bridge_iflist { uint32_t bif_addrcnt; /* cur. # of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ struct epoch_context bif_epoch_ctx; + ether_vlanid_t bif_untagged; /* untagged vlan id */ + ifbvlan_set_t bif_vlan_set; /* allowed tagged vlans */ }; /* @@ -331,13 +333,12 @@ static void bridge_inject(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int bridge_enqueue(struct bridge_softc *, struct ifnet *, - struct mbuf *); + struct mbuf *, struct bridge_iflist *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, struct mbuf *m); static bool bridge_member_ifaddrs(void); - static void bridge_timer(void *); static void bridge_broadcast(struct bridge_softc *, struct ifnet *, @@ -353,6 +354,9 @@ static void bridge_rtage(struct bridge_softc *); static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, ether_vlanid_t); +static bool bridge_vfilter_in(const struct bridge_iflist *, struct mbuf *); +static bool bridge_vfilter_out(const struct bridge_iflist *, + const struct mbuf *); static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); @@ -400,6 +404,9 @@ static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); +static int bridge_ioctl_sifuntagged(struct bridge_softc *, void *); +static int bridge_ioctl_sifvlanset(struct bridge_softc *, void *); +static int bridge_ioctl_gifvlanset(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); static int bridge_ioctl_delspan(struct bridge_softc *, void *); static int bridge_ioctl_gbparam(struct bridge_softc *, void *); @@ -618,6 +625,14 @@ static const struct bridge_control bridge_control_table[] = { { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_sifuntagged, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifvlanset, sizeof(struct ifbif_vlan_req), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifvlanset, sizeof(struct ifbif_vlan_req), + BC_F_COPYIN|BC_F_COPYOUT }, }; static const int bridge_control_table_size = nitems(bridge_control_table); @@ -832,6 +847,7 @@ bridge_clone_create(struct if_clone *ifc, char *name, size_t len, ifp->if_softc = sc; if_initname(ifp, bridge_name, ifd->unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_capabilities = ifp->if_capenable = IFCAP_VLAN_HWTAGGING; ifp->if_ioctl = bridge_ioctl; #ifdef ALTQ ifp->if_start = bridge_altq_start; @@ -954,6 +970,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifbaconf ifbaconf; struct ifbrparam ifbrparam; struct ifbropreq ifbropreq; + 
struct ifbif_vlan_req ifvlanreq; } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; @@ -1495,6 +1512,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; + req->ifbr_untagged = bif->bif_untagged; /* Copy STP state options as flags */ if (bp->bp_operedge) @@ -1873,6 +1891,84 @@ bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) } static int +bridge_ioctl_sifuntagged(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + if (req->ifbr_untagged > DOT1Q_VID_MAX) + return (EINVAL); + + if (req->ifbr_untagged != DOT1Q_VID_NULL) + bif->bif_flags |= IFBIF_VLANFILTER; + bif->bif_untagged = req->ifbr_untagged; + return (0); +} + +static int +bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg) +{ + struct ifbif_vlan_req *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->bv_ifname); + if (bif == NULL) + return (ENOENT); + + /* Reject invalid VIDs. */ + if (BRVLAN_TEST(&req->bv_set, DOT1Q_VID_NULL) || + BRVLAN_TEST(&req->bv_set, DOT1Q_VID_RSVD_IMPL)) + return (EINVAL); + + switch (req->bv_op) { + /* Replace the existing vlan set with the new set */ + case BRDG_VLAN_OP_SET: + BIT_COPY(BRVLAN_SETSIZE, &req->bv_set, &bif->bif_vlan_set); + break; + + /* Modify the existing vlan set to add the given vlans */ + case BRDG_VLAN_OP_ADD: + BIT_OR(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set); + break; + + /* Modify the existing vlan set to remove the given vlans */ + case BRDG_VLAN_OP_DEL: + BIT_ANDNOT(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set); + break; + + /* Invalid or unknown operation */ + default: + return (EINVAL); + } + + /* + * The only reason to modify the VLAN access list is to use VLAN + * filtering on this interface, so enable it automatically. + */ + bif->bif_flags |= IFBIF_VLANFILTER; + + return (0); +} + +static int +bridge_ioctl_gifvlanset(struct bridge_softc *sc, void *arg) +{ + struct ifbif_vlan_req *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->bv_ifname); + if (bif == NULL) + return (ENOENT); + + BIT_COPY(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set); + return (0); +} + +static int bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; @@ -2150,12 +2246,25 @@ bridge_stop(struct ifnet *ifp, int disable) * */ static int -bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) +bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m, + struct bridge_iflist *bif) { int len, err = 0; short mflags; struct mbuf *m0; + /* + * Find the bridge member port this packet is being sent on, if the + * caller didn't already provide it. + */ + if (bif == NULL) + bif = bridge_lookup_member_if(sc, dst_ifp); + if (bif == NULL) { + /* Perhaps the interface was removed from the bridge */ + m_freem(m); + return (EINVAL); + } + /* We may be sending a fragment so traverse the mbuf */ for (; m; m = m0) { m0 = m->m_nextpkt; @@ -2164,6 +2273,18 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) mflags = m->m_flags; /* + * If VLAN filtering is enabled, and the native VLAN ID of the + * outgoing interface matches the VLAN ID of the frame, remove + * the VLAN header. 
+ */ + if ((bif->bif_flags & IFBIF_VLANFILTER) && + bif->bif_untagged != DOT1Q_VID_NULL && + VLANTAGOF(m) == bif->bif_untagged) { + m->m_flags &= ~M_VLANTAG; + m->m_pkthdr.ether_vtag = 0; + } + + /* * If underlying interface can not do VLAN tag insertion itself * then attach a packet tag that holds it. */ @@ -2234,7 +2355,7 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) return; } - bridge_enqueue(sc, ifp, m); + bridge_enqueue(sc, ifp, m, NULL); } /* @@ -2329,7 +2450,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, } } - bridge_enqueue(sc, dst_if, mc); + bridge_enqueue(sc, dst_if, mc, bif); } if (used == 0) m_freem(m); @@ -2347,7 +2468,7 @@ sendunicast: return (0); } - bridge_enqueue(sc, dst_if, m); + bridge_enqueue(sc, dst_if, m, NULL); return (0); } @@ -2364,17 +2485,18 @@ bridge_transmit(struct ifnet *ifp, struct mbuf *m) struct ether_header *eh; struct ifnet *dst_if; int error = 0; + ether_vlanid_t vlan; sc = ifp->if_softc; ETHER_BPF_MTAP(ifp, m); eh = mtod(m, struct ether_header *); + vlan = VLANTAGOF(m); if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) && - (dst_if = bridge_rtlookup(sc, eh->ether_dhost, DOT1Q_VID_NULL)) != - NULL) { - error = bridge_enqueue(sc, dst_if, m); + (dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan)) != NULL) { + error = bridge_enqueue(sc, dst_if, m, NULL); } else bridge_broadcast(sc, ifp, m, 0); @@ -2435,18 +2557,18 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, struct bridge_iflist *dbif; struct ifnet *src_if, *dst_if, *ifp; struct ether_header *eh; - uint16_t vlan; uint8_t *dst; int error; + ether_vlanid_t vlan; NET_EPOCH_ASSERT(); src_if = m->m_pkthdr.rcvif; ifp = sc->sc_ifp; + vlan = VLANTAGOF(m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); - vlan = VLANTAGOF(m); if ((sbif->bif_flags & IFBIF_STP) && sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) @@ -2555,6 +2677,10 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE) goto drop; + /* Do VLAN filtering. */ + if (!bridge_vfilter_out(dbif, m)) + goto drop; + if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) goto drop; @@ -2566,7 +2692,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, return; } - bridge_enqueue(sc, dst_if, m); + bridge_enqueue(sc, dst_if, m, dbif); return; drop: @@ -2636,6 +2762,15 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) return (NULL); } + /* Do VLAN filtering. */ + if (!bridge_vfilter_in(bif, m)) { + if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + return (NULL); + } + /* bridge_vfilter_in() may add a tag */ + vlan = VLANTAGOF(m); + bridge_span(sc, m); if (m->m_flags & (M_BCAST|M_MCAST)) { @@ -2761,6 +2896,15 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) } \ if ((iface) != bifp) \ ETHER_BPF_MTAP(iface, m); \ + /* Pass tagged packets to if_vlan, if it's loaded */ \ + if (VLANTAGOF(m) != 0) { \ + if (bifp->if_vlantrunk == NULL) { \ + m_freem(m); \ + return (NULL); \ + } \ + (*vlan_input_p)(bifp, m); \ + return (NULL); \ + } \ return (m); \ } \ \ @@ -2817,6 +2961,30 @@ bridge_inject(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc; + if (ifp->if_type == IFT_L2VLAN) { + /* + * vlan(4) gives us the vlan ifnet, so we need to get the + * bridge softc to get a pointer to ether_input to send the + * packet to. 
+ */ + struct ifnet *bifp = NULL; + + if (vlan_trunkdev_p == NULL) { + m_freem(m); + return; + } + + bifp = vlan_trunkdev_p(ifp); + if (bifp == NULL) { + m_freem(m); + return; + } + + sc = if_getsoftc(bifp); + sc->sc_if_input(ifp, m); + return; + } + KASSERT((if_getcapenable(ifp) & IFCAP_NETMAP) != 0, ("%s: iface %s is not running in netmap mode", __func__, if_name(ifp))); @@ -2867,6 +3035,10 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) continue; + /* Do VLAN filtering. */ + if (!bridge_vfilter_out(dbif, m)) + continue; + if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; @@ -2910,7 +3082,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, continue; } - bridge_enqueue(sc, dst_if, mc); + bridge_enqueue(sc, dst_if, mc, dbif); } if (used == 0) m_freem(m); @@ -2946,11 +3118,116 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m) continue; } - bridge_enqueue(sc, dst_if, mc); + bridge_enqueue(sc, dst_if, mc, bif); } } /* + * Incoming VLAN filtering. Given a frame and the member interface it was + * received on, decide whether the port configuration allows it. + */ +static bool +bridge_vfilter_in(const struct bridge_iflist *sbif, struct mbuf *m) +{ + ether_vlanid_t vlan; + + vlan = VLANTAGOF(m); + /* Make sure the vlan id is reasonable. */ + if (vlan > DOT1Q_VID_MAX) + return (false); + + /* If VLAN filtering isn't enabled, pass everything. */ + if ((sbif->bif_flags & IFBIF_VLANFILTER) == 0) + return (true); + + if (vlan == DOT1Q_VID_NULL) { + /* + * The frame doesn't have a tag. If the interface does not + * have an untagged vlan configured, drop the frame. + */ + if (sbif->bif_untagged == DOT1Q_VID_NULL) + return (false); + + /* + * Otherwise, insert a new tag based on the interface's + * untagged vlan id. + */ + m->m_pkthdr.ether_vtag = sbif->bif_untagged; + m->m_flags |= M_VLANTAG; + } else { + /* + * The frame has a tag, so check it matches the interface's + * vlan access list. We explicitly do not accept tagged + * frames for the untagged vlan id here (unless it's also + * in the access list). + */ + if (!BRVLAN_TEST(&sbif->bif_vlan_set, vlan)) + return (false); + } + + /* Accept the frame. */ + return (true); +} + +/* + * Outgoing VLAN filtering. Given a frame, its vlan, and the member interface + * we intend to send it to, decide whether the port configuration allows it to + * be sent. + */ +static bool +bridge_vfilter_out(const struct bridge_iflist *dbif, const struct mbuf *m) +{ + struct ether_header *eh; + ether_vlanid_t vlan; + + NET_EPOCH_ASSERT(); + + /* If VLAN filtering isn't enabled, pass everything. */ + if ((dbif->bif_flags & IFBIF_VLANFILTER) == 0) + return (true); + + vlan = VLANTAGOF(m); + + /* + * Always allow untagged 802.1D STP frames, even if they would + * otherwise be dropped. This is required for STP to work on + * a filtering bridge. + * + * Tagged STP (Cisco PVST+) is a non-standard extension, so + * handle those frames via the normal filtering path. + */ + eh = mtod(m, struct ether_header *); + if (vlan == DOT1Q_VID_NULL && + memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) + return (true); + + /* + * If the frame wasn't assigned to a vlan at ingress, drop it. + * We can't forward these frames to filtering ports because we + * don't know what VLAN they're supposed to be in. 
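The ingress rules shown above in bridge_vfilter_in() are compact but easy to misread. Here is a standalone model of its classification, as a reasoning aid only: the kernel version additionally rewrites m_pkthdr.ether_vtag and sets M_VLANTAG when it assigns the untagged VID, which a pure function cannot express.

	#include <stdbool.h>
	#include <stdint.h>

	/* Returns -1 to drop, otherwise the VID the frame now belongs to
	 * (0 = still untagged, only possible with filtering disabled).
	 * vid_in == 0 means the frame arrived untagged; in_set() stands in
	 * for BRVLAN_TEST on the port's tagged access list. */
	static int
	vfilter_in_model(bool filter_on, uint16_t untagged_vid, uint16_t vid_in,
	    bool (*in_set)(uint16_t))
	{
		if (vid_in > 0xffe)		/* DOT1Q_VID_MAX */
			return (-1);		/* unreasonable tag: drop */
		if (!filter_on)
			return (vid_in);	/* filtering disabled: pass */
		if (vid_in == 0)		/* arrived untagged */
			return (untagged_vid != 0 ? untagged_vid : -1);
		return (in_set(vid_in) ? (int)vid_in : -1);
	}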
+ */ + if (vlan == DOT1Q_VID_NULL) + return (false); + + /* + * If the frame's vlan matches the interfaces's untagged vlan, + * allow it. + */ + if (vlan == dbif->bif_untagged) + return (true); + + /* + * If the frame's vlan is on the interface's tagged access list, + * allow it. + */ + if (BRVLAN_TEST(&dbif->bif_vlan_set, vlan)) + return (true); + + /* The frame was not permitted, so drop it. */ + return (false); +} + +/* * bridge_rtupdate: * * Add a bridge routing entry. diff --git a/sys/net/if_bridgevar.h b/sys/net/if_bridgevar.h index 90beb6c96d82..97b63e3d4416 100644 --- a/sys/net/if_bridgevar.h +++ b/sys/net/if_bridgevar.h @@ -78,6 +78,8 @@ #define _NET_IF_BRIDGEVAR_H_ #include <sys/types.h> +#include <sys/_bitset.h> +#include <sys/bitset.h> #include <sys/callout.h> #include <sys/queue.h> #include <sys/condvar.h> @@ -122,6 +124,9 @@ #define BRDGSPROTO 28 /* set protocol (ifbrparam) */ #define BRDGSTXHC 29 /* set tx hold count (ifbrparam) */ #define BRDGSIFAMAX 30 /* set max interface addrs (ifbreq) */ +#define BRDGSIFUNTAGGED 31 /* set if untagged vlan */ +#define BRDGSIFVLANSET 32 /* set if vlan set */ +#define BRDGGIFVLANSET 33 /* get if vlan set */ /* * Generic bridge control request. @@ -139,6 +144,7 @@ struct ifbreq { uint32_t ifbr_addrcnt; /* member if addr number */ uint32_t ifbr_addrmax; /* member if addr max */ uint32_t ifbr_addrexceeded; /* member if addr violations */ + ether_vlanid_t ifbr_untagged; /* member if untagged vlan */ uint8_t pad[32]; }; @@ -155,10 +161,11 @@ struct ifbreq { #define IFBIF_BSTP_ADMEDGE 0x0200 /* member stp admin edge enabled */ #define IFBIF_BSTP_ADMCOST 0x0400 /* member stp admin path cost */ #define IFBIF_PRIVATE 0x0800 /* if is a private segment */ +#define IFBIF_VLANFILTER 0x1000 /* if does vlan filtering */ #define IFBIFBITS "\020\001LEARNING\002DISCOVER\003STP\004SPAN" \ "\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \ - "\011AUTOPTP" + "\011AUTOPTP\015VLANFILTER" #define IFBIFMASK ~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \ IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \ IFBIF_BSTP_ADMCOST) /* not saved */ @@ -304,6 +311,26 @@ struct ifbpstpconf { eaddr[5] = pv >> 0; \ } while (0) +/* + * Bridge VLAN access request. 
+ */ +#define BRVLAN_SETSIZE 4096 +typedef __BITSET_DEFINE(ifbvlan_set, BRVLAN_SETSIZE) ifbvlan_set_t; + +#define BRVLAN_SET(set, bit) __BIT_SET(BRVLAN_SETSIZE, (bit), set) +#define BRVLAN_CLR(set, bit) __BIT_CLR(BRVLAN_SETSIZE, (bit), set) +#define BRVLAN_TEST(set, bit) __BIT_ISSET(BRVLAN_SETSIZE, (bit), set) + +#define BRDG_VLAN_OP_SET 1 /* replace current vlan set */ +#define BRDG_VLAN_OP_ADD 2 /* add vlans to current set */ +#define BRDG_VLAN_OP_DEL 3 /* remove vlans from current set */ + +struct ifbif_vlan_req { + char bv_ifname[IFNAMSIZ]; + uint8_t bv_op; + ifbvlan_set_t bv_set; +}; + #ifdef _KERNEL #define BRIDGE_INPUT(_ifp, _m) do { \ diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index cf697089708c..3ae0c01c0efc 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -92,11 +92,6 @@ #include <crypto/sha1.h> -#ifdef CTASSERT -CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); -CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); -#endif - VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ @@ -1510,9 +1505,7 @@ ether_gen_addr_byname(const char *nameunit, struct ether_addr *hwaddr) SHA1Final(digest, &ctx); free(buf, M_TEMP); - addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & - OUI_FREEBSD_GENERATED_MASK; - addr = OUI_FREEBSD(addr); + addr = (digest[0] << 8) | digest[1] | OUI_FREEBSD_GENERATED_LOW; for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h index 3c1846b8f82a..c6692d3dd6bc 100644 --- a/sys/net/if_gif.h +++ b/sys/net/if_gif.h @@ -120,7 +120,8 @@ int in6_gif_setopts(struct gif_softc *, u_int); #define GIFGOPTS _IOWR('i', 150, struct ifreq) #define GIFSOPTS _IOW('i', 151, struct ifreq) +#define GIF_NOCLAMP 0x0001 #define GIF_IGNORE_SOURCE 0x0002 -#define GIF_OPTMASK (GIF_IGNORE_SOURCE) +#define GIF_OPTMASK (GIF_NOCLAMP|GIF_IGNORE_SOURCE) #endif /* _NET_IF_GIF_H_ */ diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 9867a718e148..5b52bfa80e3b 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -718,6 +718,7 @@ lagg_capabilities(struct lagg_softc *sc) sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_capenable2 = ena2; sc->sc_ifp->if_hwassist = hwa; + (void)if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax); getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index e9e1c82cb688..22fcb7bf7c64 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1673,6 +1673,7 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid, */ if (p->if_type != IFT_ETHER && p->if_type != IFT_L2VLAN && + p->if_type != IFT_BRIDGE && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) diff --git a/sys/net/if_vlan_var.h b/sys/net/if_vlan_var.h index f0b09445d04b..695bb81f77b3 100644 --- a/sys/net/if_vlan_var.h +++ b/sys/net/if_vlan_var.h @@ -126,13 +126,6 @@ struct vlanreq { #define VLAN_PCP_MAX 7 -#define DOT1Q_VID_NULL 0x0 -#define DOT1Q_VID_DEF_PVID 0x1 -#define DOT1Q_VID_DEF_SR_PVID 0x2 -#define DOT1Q_VID_RSVD_IMPL 0xfff -#define DOT1Q_VID_MIN 1 /* minimum valid vlan id */ -#define DOT1Q_VID_MAX 4094 /* maximum valid vlan id */ - /* * 802.1q full tag. Proto and vid are stored in host byte order. 
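The structures and macros above are enough to drive the new BRDGSIFVLANSET ioctl from userspace. A hypothetical sketch follows (error handling elided; the delivery mechanism is modeled on how ifconfig(8) issues bridge control requests through SIOCSDRVSPEC, so treat that as an assumption rather than documented API):

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/sockio.h>
	#include <net/if.h>
	#include <net/ethernet.h>
	#include <net/if_bridgevar.h>
	#include <string.h>

	/* Allow tagged VLANs 10..19 on member em0 of bridge0. Per the
	 * bridge_ioctl_sifvlanset() handler earlier in the diff, this also
	 * turns on IFBIF_VLANFILTER for the port as a side effect. */
	static void
	set_member_vlans(int s)	/* s: any dgram socket */
	{
		struct ifbif_vlan_req req;
		struct ifdrv ifd;

		memset(&req, 0, sizeof(req));
		strlcpy(req.bv_ifname, "em0", sizeof(req.bv_ifname));
		req.bv_op = BRDG_VLAN_OP_SET;		/* replace current set */
		for (int vid = 10; vid < 20; vid++)
			BRVLAN_SET(&req.bv_set, vid);

		memset(&ifd, 0, sizeof(ifd));
		strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name));
		ifd.ifd_cmd = BRDGSIFVLANSET;
		ifd.ifd_len = sizeof(req);
		ifd.ifd_data = &req;
		ioctl(s, SIOCSDRVSPEC, &ifd);
	}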
*/ diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 71cb1862aabf..452a8eb4024b 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -508,18 +508,6 @@ extern struct sx pf_end_lock; (c == AF_INET6 && !(a)->addr32[0] && !(a)->addr32[1] && \ !(a)->addr32[2] && !(a)->addr32[3] )) \ -#define PF_MATCHA(n, a, m, b, f) \ - pf_match_addr(n, a, m, b, f) - -#define PF_ACPY(a, b, f) \ - pf_addrcpy(a, b, f) - -#define PF_AINC(a, f) \ - pf_addr_inc(a, f) - -#define PF_POOLMASK(a, b, c, d, f) \ - pf_poolmask(a, b, c, d, f) - #else /* Just IPv6 */ @@ -544,18 +532,6 @@ extern struct sx pf_end_lock; !(a)->addr32[2] && \ !(a)->addr32[3] ) \ -#define PF_MATCHA(n, a, m, b, f) \ - pf_match_addr(n, a, m, b, f) - -#define PF_ACPY(a, b, f) \ - pf_addrcpy(a, b, f) - -#define PF_AINC(a, f) \ - pf_addr_inc(a, f) - -#define PF_POOLMASK(a, b, c, d, f) \ - pf_poolmask(a, b, c, d, f) - #else /* Just IPv4 */ @@ -570,29 +546,14 @@ extern struct sx pf_end_lock; #define PF_AZERO(a, c) \ (!(a)->addr32[0]) -#define PF_MATCHA(n, a, m, b, f) \ - pf_match_addr(n, a, m, b, f) - -#define PF_ACPY(a, b, f) \ - (a)->v4.s_addr = (b)->v4.s_addr - -#define PF_AINC(a, f) \ - do { \ - (a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \ - } while (0) - -#define PF_POOLMASK(a, b, c, d, f) \ - do { \ - (a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \ - (((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \ - } while (0) - #endif /* PF_INET_ONLY */ #endif /* PF_INET6_ONLY */ #endif /* PF_INET_INET6 */ #ifdef _KERNEL -#ifdef INET6 + +void unhandled_af(int) __dead2; + static void inline pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af) { @@ -602,12 +563,15 @@ pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af) memcpy(&dst->v4, &src->v4, sizeof(dst->v4)); break; #endif /* INET */ +#ifdef INET6 case AF_INET6: memcpy(&dst->v6, &src->v6, sizeof(dst->v6)); break; +#endif /* INET6 */ + default: + unhandled_af(af); } } -#endif /* INET6 */ #endif /* @@ -629,7 +593,7 @@ pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af) &(aw)->v.a.mask, (x), (af))) || \ ((aw)->type == PF_ADDR_ADDRMASK && \ !PF_AZERO(&(aw)->v.a.mask, (af)) && \ - !PF_MATCHA(0, &(aw)->v.a.addr, \ + !pf_match_addr(0, &(aw)->v.a.addr, \ &(aw)->v.a.mask, (x), (af))))) != \ (neg) \ ) @@ -1406,7 +1370,6 @@ struct pf_kruleset { struct pf_krulequeue queues[2]; struct { struct pf_krulequeue *ptr; - struct pf_krule **ptr_array; u_int32_t rcount; u_int32_t ticket; int open; @@ -2341,7 +2304,6 @@ VNET_DECLARE(struct pf_krule *, pf_rulemarker); #define V_pf_rulemarker VNET(pf_rulemarker) #endif -void unhandled_af(int) __dead2; int pf_start(void); int pf_stop(void); void pf_initialize(void); @@ -2477,11 +2439,11 @@ int pf_test(sa_family_t, int, int, struct ifnet *, struct mbuf **, struct inpcb int pf_normalize_ip(u_short *, struct pf_pdesc *); #endif /* INET */ -#ifdef INET6 -int pf_normalize_ip6(int, u_short *, struct pf_pdesc *); void pf_poolmask(struct pf_addr *, struct pf_addr*, struct pf_addr *, struct pf_addr *, sa_family_t); void pf_addr_inc(struct pf_addr *, sa_family_t); +#ifdef INET6 +int pf_normalize_ip6(int, u_short *, struct pf_pdesc *); int pf_max_frag_size(struct mbuf *); int pf_refragment6(struct ifnet *, struct mbuf **, struct m_tag *, struct ifnet *, bool); @@ -2537,7 +2499,7 @@ int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, u_int64_t, int, int, int); int pfr_pool_get(struct pfr_ktable *, int *, struct 
pf_addr *, sa_family_t, - pf_addr_filter_func_t); + pf_addr_filter_func_t, bool); void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); struct pfr_ktable * pfr_attach_table(struct pf_kruleset *, char *); @@ -2571,6 +2533,8 @@ int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, int *, u_int32_t, int); +struct pfr_ktable + *pfr_ktable_select_active(struct pfr_ktable *); MALLOC_DECLARE(PFI_MTYPE); VNET_DECLARE(struct pfi_kkif *, pfi_all); @@ -2674,11 +2638,10 @@ int pf_kanchor_copyout(const struct pf_kruleset *, const struct pf_krule *, char *, size_t); int pf_kanchor_nvcopyout(const struct pf_kruleset *, const struct pf_krule *, nvlist_t *); -void pf_kanchor_remove(struct pf_krule *); +void pf_remove_kanchor(struct pf_krule *); void pf_remove_if_empty_kruleset(struct pf_kruleset *); struct pf_kruleset *pf_find_kruleset(const char *); struct pf_kruleset *pf_get_leaf_kruleset(char *, char **); -struct pf_kanchor *pf_create_kanchor(struct pf_kanchor *, const char *); struct pf_kruleset *pf_find_or_create_kruleset(const char *); void pf_rs_initialize(void); @@ -2712,6 +2675,7 @@ int pf_ioctl_get_addrs(struct pf_nl_pooladdr *); int pf_ioctl_get_addr(struct pf_nl_pooladdr *); int pf_ioctl_get_rulesets(struct pfioc_ruleset *); int pf_ioctl_get_ruleset(struct pfioc_ruleset *); +int pf_ioctl_natlook(struct pfioc_natlook *); void pf_krule_free(struct pf_krule *); void pf_krule_clear_counters(struct pf_krule *); @@ -2749,7 +2713,6 @@ u_short pf_map_addr(u_int8_t, struct pf_krule *, u_short pf_map_addr_sn(u_int8_t, struct pf_krule *, struct pf_addr *, struct pf_addr *, struct pfi_kkif **nkif, struct pf_addr *, - struct pf_ksrc_node **, struct pf_srchash **, struct pf_kpool *, pf_sn_types_t); int pf_get_transaddr_af(struct pf_krule *, struct pf_pdesc *); diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c index c5a478533313..9074878e17e4 100644 --- a/sys/net80211/ieee80211_hostap.c +++ b/sys/net80211/ieee80211_hostap.c @@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, /* VHT */ if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) && - vhtcap != NULL && - vhtinfo != NULL) { - /* XXX TODO; see below */ - net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__); + vhtcap != NULL) { ieee80211_vht_node_init(ni); - ieee80211_vht_update_cap(ni, vhtcap, vhtinfo); + ieee80211_vht_update_cap(ni, vhtcap); } else if (ni->ni_flags & IEEE80211_NODE_VHT) ieee80211_vht_node_cleanup(ni); diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index 5ec80e3646b8..c28f124648a1 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -1952,6 +1952,11 @@ do { \ _RETURN_CHAN_BITS(0); /* + * TODO: should we bail out if there's no htinfo? + * Or just treat it as if we can't do the HT20/HT40 check? + */ + + /* * The original code was based on * 802.11ac-2013, Table 8-183x-VHT Operation Information subfields. 
* 802.11-2020, Table 9-274-VHT Operation Information subfields @@ -1962,8 +1967,12 @@ do { \ */ htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie; - ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) == - IEEE80211_HTINFO_TXWIDTH_2040); + if (htinfo != NULL) + ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) == + IEEE80211_HTINFO_TXWIDTH_2040); + else + ht40 = false; + can_vht160 = can_vht80p80 = can_vht80 = false; /* 20 Mhz */ diff --git a/sys/net80211/ieee80211_node.c b/sys/net80211/ieee80211_node.c index ad17af6778a1..a201d1b278f0 100644 --- a/sys/net80211/ieee80211_node.c +++ b/sys/net80211/ieee80211_node.c @@ -3138,6 +3138,36 @@ ieee80211_getsignal(struct ieee80211vap *vap, int8_t *rssi, int8_t *noise) } /** + * @brief Increment the given TID TX sequence, return the current one. + * + * @param ni ieee80211_node to operate on + * @param tid TID, or IEEE80211_NONQOS_TID + * @returns sequence number, from 0 .. 4095 inclusive, post increments + */ +ieee80211_seq ieee80211_tx_seqno_fetch_incr(struct ieee80211_node *ni, + uint8_t tid) +{ + ieee80211_seq seq; + + seq = ni->ni_txseqs[tid]; + ni->ni_txseqs[tid] = (ni->ni_txseqs[tid] + 1) % IEEE80211_SEQ_RANGE; + return (seq); +} + +/** + * @brief Return the current sequence number for the given TID + * + * @param ni ieee80211_node to operate on + * @param tid TID, or IEEE80211_NONQOS_TID + * @returns sequence number, from 0 .. 4095 inclusive + */ +ieee80211_seq ieee80211_tx_seqno_fetch(const struct ieee80211_node *ni, + uint8_t tid) +{ + return (ni->ni_txseqs[tid]); +} + +/** * @brief return a dot11rate / ratecode representing the current transmit rate * * This is the API call for legacy / 802.11n drivers and rate control APIs diff --git a/sys/net80211/ieee80211_node.h b/sys/net80211/ieee80211_node.h index c83eee04a8dc..ef25fa0d7fdd 100644 --- a/sys/net80211/ieee80211_node.h +++ b/sys/net80211/ieee80211_node.h @@ -531,6 +531,12 @@ void ieee80211_node_leave(struct ieee80211_node *); int8_t ieee80211_getrssi(struct ieee80211vap *); void ieee80211_getsignal(struct ieee80211vap *, int8_t *, int8_t *); +/* TX sequence space related routines */ +ieee80211_seq ieee80211_tx_seqno_fetch_incr(struct ieee80211_node *, + uint8_t); +ieee80211_seq ieee80211_tx_seqno_fetch(const struct ieee80211_node *, + uint8_t); + /* * Node transmit rate specific manipulation. * diff --git a/sys/net80211/ieee80211_output.c b/sys/net80211/ieee80211_output.c index a4151f807882..afe83ea0805c 100644 --- a/sys/net80211/ieee80211_output.c +++ b/sys/net80211/ieee80211_output.c @@ -4195,17 +4195,15 @@ ieee80211_tx_complete(struct ieee80211_node *ni, struct mbuf *m, int status) * Check the frame type and TID and assign a suitable sequence number * from the correct sequence number space. * + * This implements the components of 802.11-2020 10.3.2.14.2 + * (Transmitter Requirements) that net80211 currently supports. + * * It assumes the mbuf has been encapsulated, and has the TID assigned * if it is a QoS frame. * * Note this also clears any existing fragment ID in the header, so it * must be called first before assigning fragment IDs. * - * For now this implements parts of 802.11-2012; it doesn't do all of - * the needed checks for full compliance (notably QoS-Data NULL frames). 
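The two helpers added above formalize the modulo-4096 sequence space (IEEE80211_SEQ_RANGE). A standalone restatement of the post-increment-with-wrap behaviour, runnable outside the tree:

	#include <stdint.h>
	#include <stdio.h>

	#define SEQ_RANGE 4096	/* stands in for IEEE80211_SEQ_RANGE */

	static uint16_t
	seqno_fetch_incr(uint16_t *slot)
	{
		uint16_t seq = *slot;

		*slot = (*slot + 1) % SEQ_RANGE;
		return (seq);		/* old value, like the kernel helper */
	}

	int
	main(void)
	{
		uint16_t s = 4095;
		uint16_t got = seqno_fetch_incr(&s);

		printf("%d %d\n", (int)got, (int)s);	/* prints "4095 0" */
		return (0);
	}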
- * - * TODO: update to 802.11-2020 10.3.2.14.2 (Transmitter Requirements) - * * @param ni ieee80211_node this frame will be transmitted to * @param arg_tid A temporary check, existing callers may set * this to a TID variable they were using, and this routine @@ -4239,16 +4237,30 @@ ieee80211_output_seqno_assign(struct ieee80211_node *ni, int arg_tid, "%s: called; TID mismatch; tid=%u, arg_tid=%d\n", __func__, tid, arg_tid); - if (IEEE80211_HAS_SEQ(type, subtype)) { - /* - * 802.11-2012 9.3.2.10 - QoS multicast frames - * come out of a different seqno space. - */ - if (IEEE80211_IS_MULTICAST(wh->i_addr1)) - seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; - else - seqno = ni->ni_txseqs[tid]++; - } else + + /* 802.11-2020 10.3.2.14.2 (Transmitter Requirements) sections */ + + /* SNS7 - unicast PV1 management frame */ + + /* SNS6 - unicast PV1 data frame */ + + /* SNS5 - QoS NULL frames */ + if (IEEE80211_QOS_HAS_SEQ(wh) && IEEE80211_IS_QOS_NULL(wh)) + seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID); + + /* SNS4 - QMF STA transmitting a QMF */ + + /* SNS3 - QoS STA; Time Priority Management frame */ + + /* SNS2 - unicast QoS STA, data frame, excluding SNS5 */ + else if (IEEE80211_QOS_HAS_SEQ(wh) && + !IEEE80211_IS_MULTICAST(wh->i_addr1)) + seqno = ieee80211_tx_seqno_fetch_incr(ni, tid); + + /* SNS1 - Baseline (everything else) */ + else if (IEEE80211_HAS_SEQ(type, subtype)) + seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID); + else seqno = 0; /* @@ -4276,7 +4288,7 @@ ieee80211_output_beacon_seqno_assign(struct ieee80211_node *ni, struct mbuf *m) wh = mtod(m, struct ieee80211_frame *); - seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; + seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID); *(uint16_t *)&wh->i_seq[0] = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c index e91977f1ef98..de0b691d4d2a 100644 --- a/sys/net80211/ieee80211_vht.c +++ b/sys/net80211/ieee80211_vht.c @@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni) } void -ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie, - const uint8_t *vhtop_ie) +ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie) { ieee80211_parse_vhtcap(ni, vhtcap_ie); - ieee80211_parse_vhtopmode(ni, vhtop_ie); } static struct ieee80211_channel * diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h index 2964de63c343..a1529df4a85b 100644 --- a/sys/net80211/ieee80211_vht.h +++ b/sys/net80211/ieee80211_vht.h @@ -52,8 +52,7 @@ uint8_t * ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *); uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *, struct ieee80211_channel *); -void ieee80211_vht_update_cap(struct ieee80211_node *, - const uint8_t *, const uint8_t *); +void ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *); struct ieee80211_channel * ieee80211_vht_adjust_channel(struct ieee80211com *, diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index b1f2b0ebf911..d6b75e482e35 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -104,11 +104,10 @@ extern int badport_bandlim(int); #define BANDLIM_ICMP_UNREACH 0 #define BANDLIM_ICMP_ECHO 1 #define BANDLIM_ICMP_TSTAMP 2 -#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ -#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ -#define BANDLIM_ICMP6_UNREACH 5 -#define BANDLIM_SCTP_OOTB 6 -#define 
BANDLIM_MAX 7 +#define BANDLIM_TCP_RST 3 +#define BANDLIM_ICMP6_UNREACH 4 +#define BANDLIM_SCTP_OOTB 5 +#define BANDLIM_MAX 6 #endif #endif diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index bccd4b84561a..dbe48242381d 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1745,6 +1745,23 @@ in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) } /* + * Dereference and rlock inp, for which the caller must own the + * reference. Returns true if inp no longer usable, false otherwise. + */ +bool +in_pcbrele_rlock(struct inpcb *inp) +{ + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (true); + if ((inp->inp_flags & INP_FREED) != 0) { + INP_RUNLOCK(inp); + return (true); + } + return (false); +} + +/* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 57cf15ca37fc..9e0618e87601 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -681,6 +681,7 @@ void in_pcbref(struct inpcb *); bool in_pcbrele(struct inpcb *, inp_lookup_t); bool in_pcbrele_rlocked(struct inpcb *); bool in_pcbrele_wlocked(struct inpcb *); +bool in_pcbrele_rlock(struct inpcb *inp); typedef bool inp_match_t(const struct inpcb *, void *); struct inpcb_iterator { diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index cb4b6df57c57..71b75d18efd0 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -1097,8 +1097,7 @@ static const char *icmp_rate_descrs[BANDLIM_MAX] = { [BANDLIM_ICMP_UNREACH] = "icmp unreach", [BANDLIM_ICMP_ECHO] = "icmp ping", [BANDLIM_ICMP_TSTAMP] = "icmp tstamp", - [BANDLIM_RST_CLOSEDPORT] = "closed port RST", - [BANDLIM_RST_OPENPORT] = "open port RST", + [BANDLIM_TCP_RST] = "tcp reset", [BANDLIM_ICMP6_UNREACH] = "icmp6 unreach", [BANDLIM_SCTP_OOTB] = "sctp ootb", }; diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 91f8251589e4..b60cdf45af52 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -433,38 +433,40 @@ static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, bool from_callout) { - union tcp_log_stackspecific log; - /* - * Unused logs are - * 64 bit - delRate, rttProp, bw_inuse - * 16 bit - cwnd_gain - * 8 bit - bbr_state, bbr_substate, inhpts; - */ - memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = hpts->p_nxt_slot; - log.u_bbr.flex2 = hpts->p_cur_slot; - log.u_bbr.flex3 = hpts->p_prev_slot; - log.u_bbr.flex4 = idx; - log.u_bbr.flex5 = hpts->p_curtick; - log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.flex7 = hpts->p_cpu; - log.u_bbr.flex8 = (uint8_t)from_callout; - log.u_bbr.inflight = slots_to_run; - log.u_bbr.applimited = hpts->overidden_sleep; - log.u_bbr.delivered = hpts->saved_curtick; - log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); - log.u_bbr.epoch = hpts->saved_curslot; - log.u_bbr.lt_epoch = hpts->saved_prev_slot; - log.u_bbr.pkts_out = hpts->p_delayed_by; - log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.pacing_gain = hpts->p_cpu; - log.u_bbr.pkt_epoch = hpts->p_runningslot; - log.u_bbr.use_lt_bw = 1; - TCP_LOG_EVENTP(tp, NULL, - &tptosocket(tp)->so_rcv, - &tptosocket(tp)->so_snd, - BBR_LOG_HPTSDIAG, 0, - 0, &log, false, tv); + if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + 
* 8 bit - bbr_state, bbr_substate, inhpts; + */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); + } } static void @@ -1353,10 +1355,7 @@ again: } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ - if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { - tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, - from_callout); - } + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); @@ -1487,7 +1486,7 @@ no_run: } void -__tcp_set_hpts(struct tcpcb *tp, int32_t line) +tcp_set_hpts(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; int failed; diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index b097a2b98db9..f5856ed8e688 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, #define tcp_hpts_insert(inp, slot) \ tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) -void __tcp_set_hpts(struct tcpcb *tp, int32_t line); -#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) +void tcp_set_hpts(struct tcpcb *tp); void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); @@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time; * The following functions should also be available * to userspace as well. 
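On the HPTS tick converters that follow (where the __inline to inline sweep is purely cosmetic): from tcp_tv_to_hptstick()'s formula, one second contributes 100000 slots, so a slot is 10 microseconds and HPTS_TICKS_PER_SLOT must be 10. A worked example, with the constant restated locally so it compiles standalone:

	#include <stdint.h>
	#include <sys/time.h>

	#define TICKS_PER_SLOT 10	/* 10 usec per HPTS slot, derived above */

	static uint32_t
	tv_to_hptstick(const struct timeval *sv)
	{
		return ((sv->tv_sec * 100000) + (sv->tv_usec / TICKS_PER_SLOT));
	}

	/* { .tv_sec = 1, .tv_usec = 250 } -> 100000 + 25 = 100025 slots. */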
*/ -static __inline uint32_t +static inline uint32_t tcp_tv_to_hptstick(const struct timeval *sv) { return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_usectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } -static __inline uint32_t +static inline uint32_t tcp_tv_to_mssectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } -static __inline uint64_t +static inline uint64_t tcp_tv_to_lusectick(const struct timeval *sv) { return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); @@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void) return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT); } -static __inline uint32_t +static inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; @@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv) return (tcp_tv_to_hptstick(sv)); } -static __inline uint64_t +static inline uint64_t tcp_get_u64_usecs(struct timeval *tv) { struct timeval tvd; @@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv) return (tcp_tv_to_lusectick(tv)); } -static __inline uint32_t +static inline uint32_t tcp_get_usecs(struct timeval *tv) { struct timeval tvd; diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 7c032e13f37a..de428ae1af6f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -621,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ + bool closed_port = false; /* segment is hitting a closed port */ NET_EPOCH_ASSERT(); @@ -907,7 +908,8 @@ findpcb: log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } INP_LOCK_ASSERT(inp); @@ -998,12 +1000,14 @@ findpcb: * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { - rstreason = BANDLIM_RST_CLOSEDPORT; + rstreason = BANDLIM_TCP_RST; + closed_port = true; goto dropwithreset; } @@ -1055,6 +1059,8 @@ findpcb: * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + int result; + /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1064,8 +1070,8 @@ findpcb: /* * NB: syncache_expand() doesn't unlock inp. */ - rstreason = syncache_expand(&inc, &to, th, &so, m, port); - if (rstreason < 0) { + result = syncache_expand(&inc, &to, th, &so, m, port); + if (result < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped @@ -1073,7 +1079,7 @@ findpcb: * to the sender. */ goto dropunlock; - } else if (rstreason == 0) { + } else if (result == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double @@ -1092,7 +1098,7 @@ findpcb: * of the failure cause. */ INP_WUNLOCK(inp); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } @@ -1183,7 +1189,7 @@ tfo_socket_result: s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! 
*/ TCPSTAT_INC(tcps_badsyn); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } /* @@ -1259,7 +1265,7 @@ tfo_socket_result: "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; goto dropwithreset; } } @@ -1380,9 +1386,10 @@ dropwithreset: * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ - if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) || - (rstreason == BANDLIM_RST_CLOSEDPORT && - ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && + if (rstreason == BANDLIM_TCP_RST && + ((!closed_port && V_blackhole == 3) || + (closed_port && + ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && (V_blackhole_local || ( #ifdef INET6 isipv6 ? !in6_localip(&ip6->ip6_src) : @@ -1515,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, struct tcpopt to; int tfo_syn; u_int maxseg = 0; + bool no_data; + no_data = (tlen == 0); thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; @@ -1754,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tp->ts_recent = to.to_tsval; } - if (tlen == 0) { + if (no_data) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && @@ -1963,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -1976,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { @@ -2244,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, * for the "LAND" DoS attack. 
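The dropwithreset rework above replaces the two BANDLIM_RST_* reasons with a single BANDLIM_TCP_RST plus a closed_port flag, so the blackhole decision is spelled out directly. A sketch of just that decision table (the address and V_blackhole_local checks are omitted here):

#include <stdbool.h>
#include <stdio.h>

static bool
blackhole_segment(int blackhole, bool closed_port, bool syn)
{
    if (!closed_port)
        return (blackhole == 3);  /* open port: mode 3 only */
    /* closed port: mode 1 swallows SYNs, mode >= 2 swallows everything */
    return ((blackhole == 1 && syn) || blackhole > 1);
}

int
main(void)
{
    printf("mode 1, closed, SYN: %d\n", blackhole_segment(1, true, true));   /* 1 */
    printf("mode 1, closed, ACK: %d\n", blackhole_segment(1, true, false));  /* 0 */
    printf("mode 2, closed, ACK: %d\n", blackhole_segment(2, true, false));  /* 1 */
    printf("mode 3, open,   ACK: %d\n", blackhole_segment(3, false, false)); /* 1 */
    return (0);
}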
*/ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } @@ -2557,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); - if (tlen == 0 && + if (no_data && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* @@ -3113,8 +3122,7 @@ step6: (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (tlen == 0 && - tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; @@ -3424,7 +3432,7 @@ dropafterack: if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { - rstreason = BANDLIM_RST_OPENPORT; + rstreason = BANDLIM_TCP_RST; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index 75d693bc019b..e24790ece43d 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -2878,7 +2878,7 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags) /* double check log state now that we have the lock */ if (inp->inp_flags & INP_DROPPED) goto done; - if (tp->_t_logstate != TCP_LOG_STATE_OFF) { + if (tcp_bblogging_on(tp)) { struct timeval tv; tcp_log_eventspecific_t log; diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index fef32e16b2e4..3e7eef8a1cda 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -539,12 +539,12 @@ struct tcpcb; NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ +/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). 
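Per the new comment, TCP_LOG_EVENTP stops re-testing the predicate and instead KASSERTs that the caller already checked tcp_bblogging_on(tp). A userspace sketch of that contract, with assert(3) standing in for KASSERT and hypothetical names:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool bblogging_on = true;

/* The macro asserts the precondition instead of silently re-checking it. */
#define LOG_EVENT(msg) do { \
    assert(bblogging_on && "bblogging is off"); \
    printf("event: %s\n", (msg)); \
} while (0)

int
main(void)
{
    if (bblogging_on)  /* caller-side check, as the macro now requires */
        LOG_EVENT("connection established");
    return (0);
}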
*/ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ - if (tcp_bblogging_on(tp)) \ - tcp_log_event(tp, th, rxbuf, txbuf, eventid, \ - errornum, len, stackinfo, th_hostorder, \ - NULL, NULL, 0, tv); \ + KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \ + tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \ + stackinfo, th_hostorder, NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index e2cfec5c9275..d2636f01714e 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, 
tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tiwin > bbr->r_ctl.rc_high_rwnd) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 8e05498863b9..834e1347a152 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -40,7 +40,6 @@ #endif #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/lock.h> #include <sys/mutex.h> #include <sys/mbuf.h> #include <sys/proc.h> /* for proc0 declaration */ @@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0; static uint32_t rack_pcm_is_enabled = 1; static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ -static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */ +static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */ static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */ @@ -938,7 +937,7 @@ rack_init_sysctls(void) SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "time_between", CTLFLAG_RW, - & rack_time_between_probertt, 96000000, + &rack_time_between_probertt, 96000000, "How many useconds between the lowest rtt falling must past before we enter probertt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), @@ -3480,9 +3479,9 @@ static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { - if (rack->r_ctl.rc_app_limited_cnt > 0) { - rack->r_ctl.rc_app_limited_cnt--; - } + KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), + ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); + rack->r_ctl.rc_app_limited_cnt--; } if (rsm->r_limit_type) { /* currently there is only one limit type */ @@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) * earlier. * * So lets calculate the BDP with the "known" b/w using - * the SRTT has our rtt and then multiply it by the - * goal. + * the SRTT as our rtt and then multiply it by the goal. 
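Two hunks above keep rc_app_limited_cnt honest: rack_clone_rsm() now bumps the counter when the new half inherits RACK_APP_LIMITED, and rack_free() asserts it is strictly positive before decrementing instead of papering over an underflow. A reduced model of that invariant (types and names are stand-ins):

#include <assert.h>
#include <stdio.h>

#define F_APP_LIMITED 0x1

struct seg {
    unsigned flags;
};

static unsigned app_limited_cnt;

/* Splitting a flagged segment must count the new half too. */
static void
seg_clone(const struct seg *src, struct seg *dst)
{
    dst->flags = src->flags;
    if (dst->flags & F_APP_LIMITED)
        app_limited_cnt++;
}

/* Freeing a flagged segment must find a strictly positive count. */
static void
seg_free(struct seg *s)
{
    if (s->flags & F_APP_LIMITED) {
        assert(app_limited_cnt > 0);
        app_limited_cnt--;
    }
}

int
main(void)
{
    struct seg a = { F_APP_LIMITED }, b;

    app_limited_cnt = 1;  /* a is already counted */
    seg_clone(&a, &b);
    seg_free(&a);
    seg_free(&b);
    printf("count: %u\n", app_limited_cnt);  /* 0 */
    return (0);
}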
*/ bw = rack_get_bw(rack); srtt = (uint64_t)tp->t_srtt; @@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) tp->t_badrxtwin = 0; break; } - if ((CC_ALGO(tp)->cong_signal != NULL) && + if ((CC_ALGO(tp)->cong_signal != NULL) && (type != CC_RTO)){ tp->t_ccv.curack = ack; CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); @@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as - * passed we no longer consider reordering has occuring. + * passed we no longer consider reordering as occurring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro @@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, /* Push bit must go to the right edge as well */ if (rsm->r_flags & RACK_HAD_PUSH) rsm->r_flags &= ~RACK_HAD_PUSH; + /* Update the count if app limited */ + if (nrsm->r_flags & RACK_APP_LIMITED) + rack->r_ctl.rc_app_limited_cnt++; /* Clone over the state of the hw_tls flag */ nrsm->r_hw_tls = rsm->r_hw_tls; /* @@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack, l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; - if ((r_rsm->r_flags & RACK_APP_LIMITED) && + if ((r_rsm->r_flags & RACK_APP_LIMITED) && ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { /* * If both are app-limited then let the @@ -8137,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, * remove the lost desgination and reduce the * bytes considered lost. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -8832,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; - if ((rack->in_probe_rtt == 0) && + if ((rack->in_probe_rtt == 0) && (rack->rc_skip_timely == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); @@ -10369,7 +10370,7 @@ more: * and yet before retransmitting we get an ack * which can happen due to reordering. */ - rsm->r_flags &= ~RACK_WAS_LOST; + rsm->r_flags &= ~RACK_WAS_LOST; KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) @@ -11065,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) * We need to skip anything already set * to be retransmitted. 
*/ - if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & RACK_MUST_RXT)) { rsm = TAILQ_NEXT(rsm, r_tnext); continue; @@ -12875,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -13089,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (tp->t_flags & TF_FASTOPEN) { @@ -13102,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -13136,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { @@ -13399,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13495,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13645,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13746,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13848,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - 
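The LAND-attack check above, SEQ_LT(th->th_seq, tp->irs), relies on the wraparound-safe sequence macros. A sketch matching their standard definitions from <netinet/tcp_seq.h>:

#include <stdint.h>
#include <stdio.h>

/* Modulo-2^32 comparisons: the signed cast makes wraparound work. */
#define SEQ_LT(a, b) ((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b) ((int32_t)((a) - (b)) > 0)

int
main(void)
{
    uint32_t irs = 0xfffffff0u; /* initial receive sequence */
    uint32_t seq = 0x00000010u; /* numerically smaller, but "after" irs */

    /* Wraparound-safe: 0x10 is 0x20 past 0xfffffff0. */
    printf("SEQ_LT(seq, irs) = %d\n", SEQ_LT(seq, irs)); /* 0 */
    printf("SEQ_GT(seq, irs) = %d\n", SEQ_GT(seq, irs)); /* 1 */
    return (0);
}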
ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -13952,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); - ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen); return (1); } } @@ -16655,7 +16656,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); #ifdef TCP_ACCOUNTING sched_unpin(); #endif @@ -16919,7 +16920,7 @@ do_output_now: } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { goto do_output_now; } else if ((no_output == 1) && - (nxt_pkt == 0) && + (nxt_pkt == 0) && (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use @@ -17546,7 +17547,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rack->r_ctl.rc_last_us_rtt, 88, __LINE__, NULL, gain); } - if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && + if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && (rack->use_fixed_rate == 0)) { /* * No way yet to make a b/w estimate or @@ -17986,7 +17987,7 @@ start_set: tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); rack->r_ctl.rc_gp_cumack_ts = 0; if ((rack->r_ctl.cleared_app_ack == 1) && - (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) { + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) { /* * We just cleared an application limited period * so the next seq out needs to skip the first @@ -20043,7 +20044,7 @@ again: rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; } } - if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { + if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { uint32_t rw_avail, cwa; if (tp->snd_wnd > ctf_outstanding(tp)) @@ -21031,7 +21032,7 @@ just_return_nolock: } else log = 1; } - /* Mark the last packet has app limited */ + /* Mark the last packet as app limited */ rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index da26b8cb1f9b..d1c4ba58bf55 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -672,7 +672,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen); return; } else *ret_val = 0; diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index cd42a67294a6..db415f6bdf03 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2720,9 +2720,15 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) ksr->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; - ksr->snd_tag->sw->snd_tag_status_str( + in_pcbref(inp); + 
INP_RUNLOCK(inp); + error = ksr->snd_tag->sw-> + snd_tag_status_str( ksr->snd_tag, NULL, &sz); - len += sz; + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; } } kss = so->so_snd.sb_tls_info; @@ -2739,9 +2745,15 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) kss->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; - kss->snd_tag->sw->snd_tag_status_str( + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw-> + snd_tag_status_str( kss->snd_tag, NULL, &sz); - len += sz; + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) + len += sz; } } if (p) { @@ -2811,9 +2823,16 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) if (ksr->snd_tag != NULL && ksr->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; - ksr->snd_tag->sw->snd_tag_status_str( + in_pcbref(inp); + INP_RUNLOCK(inp); + error = ksr->snd_tag->sw->snd_tag_status_str( ksr->snd_tag, buf + len, &sz); - len += sz; + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->rcv.drv_st_len = sz; + len += sz; + } } } if (kss != NULL && kss->gen == xig.xig_gen) { @@ -2828,9 +2847,16 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) if (kss->snd_tag != NULL && kss->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; - kss->snd_tag->sw->snd_tag_status_str( + in_pcbref(inp); + INP_RUNLOCK(inp); + error = kss->snd_tag->sw->snd_tag_status_str( kss->snd_tag, buf + len, &sz); - len += sz; + if (in_pcbrele_rlock(inp)) + return (EDEADLK); + if (error == 0) { + xktls->snd.drv_st_len = sz; + len += sz; + } } } len = roundup2(len, __alignof(*xktls)); @@ -2858,12 +2884,23 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) static int tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys) { - int res; - - sx_xlock(&ktlslist_lock); - res = tcp_ktlslist_locked(oidp, arg1, arg2, req, export_keys); - sx_xunlock(&ktlslist_lock); - return (res); + int repeats, error; + + for (repeats = 0; repeats < 100; repeats++) { + if (sx_xlock_sig(&ktlslist_lock)) + return (EINTR); + error = tcp_ktlslist_locked(oidp, arg1, arg2, req, + export_keys); + sx_xunlock(&ktlslist_lock); + if (error != EDEADLK) + break; + if (sig_intr() != 0) { + error = EINTR; + break; + } + req->oldidx = 0; + } + return (error); } static int diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 3ea561e63503..687b0d538666 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1520,7 +1520,8 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td) INP_WLOCK_ASSERT(inp); if (__predict_false((so->so_state & - (SS_ISCONNECTING | SS_ISCONNECTED)) != 0)) + (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | + SS_ISDISCONNECTED)) != 0)) return (EISCONN); if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0)) return (EOPNOTSUPP); diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index d476829e8e3b..2bab1c57ce2a 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -194,6 +194,11 @@ in6_gif_setopts(struct gif_softc *sc, u_int options) sc->gif_options = options; in6_gif_attach(sc); } + + if ((options & GIF_NOCLAMP) != + (sc->gif_options & GIF_NOCLAMP)) { + sc->gif_options = options; + } return (0); } @@ -289,6 +294,7 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; + u_long mtu; /* prepend new IP header */ NET_EPOCH_ASSERT(); @@ -304,11 +310,15 @@ 
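The ktls listing changes above pin the inpcb with in_pcbref(), drop the lock across the possibly-sleeping snd_tag_status_str() call, and return EDEADLK when in_pcbrele_rlock() reports the pcb died, letting tcp_ktlslist1() restart the whole scan (bounded at 100 repeats). A minimal pthreads model of that drop-lock-and-retry shape; every name here is a hypothetical stand-in:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct obj {
    pthread_mutex_t lock;
    int refs;
    int dead;
};

/* Re-take the lock and drop the pin; tell the caller if the object died. */
static int
rele_relock(struct obj *o)
{
    pthread_mutex_lock(&o->lock);
    o->refs--;
    if (o->dead) {
        pthread_mutex_unlock(&o->lock);
        return (1);
    }
    return (0);  /* still alive, lock held again */
}

/* Called with the lock held; returns with it held unless EDEADLK. */
static int
query_unlocked(struct obj *o)
{
    o->refs++;                       /* pin */
    pthread_mutex_unlock(&o->lock);  /* drop across the sleeping call */
    /* ... potentially sleeping driver call would go here ... */
    if (rele_relock(o))
        return (EDEADLK);            /* object vanished: restart scan */
    return (0);
}

int
main(void)
{
    struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
    int error, tries;

    for (tries = 0; tries < 100; tries++) {  /* bounded, like the patch */
        pthread_mutex_lock(&o.lock);
        error = query_unlocked(&o);
        if (error == 0)
            pthread_mutex_unlock(&o.lock);
        if (error != EDEADLK)
            break;
    }
    printf("error=%d tries=%d\n", error, tries);
    return (0);
}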
in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; /* - * force fragmentation to minimum MTU, to avoid path MTU discovery. - * it is too painful to ask for resend of inner packet, to achieve - * path MTU discovery for encapsulated packets. + * Enforce fragmentation to minimum MTU, even if the interface MTU + * is larger, to avoid path MTU discovery when NOCLAMP is not + * set (default). IPv6 does not allow fragmentation on intermediate + * router nodes, so it is too painful to ask for resend of inner + * packet, to achieve path MTU discovery for encapsulated packets. */ - return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL)); + mtu = ((sc->gif_options & GIF_NOCLAMP) == 0) ? IPV6_MINMTU : 0; + + return (ip6_output(m, 0, NULL, mtu, 0, NULL, NULL)); } static int diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 06fe9e8820c9..a825658bd9ee 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -234,17 +234,20 @@ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); -static int mld_v1enable = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, - &mld_v1enable, 0, "Enable fallback to MLDv1"); +VNET_DEFINE_STATIC(bool, mld_v1enable) = true; +#define V_mld_v1enable VNET(mld_v1enable) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_v1enable), 0, "Enable fallback to MLDv1"); -static int mld_v2enable = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN, - &mld_v2enable, 0, "Enable MLDv2"); +VNET_DEFINE_STATIC(bool, mld_v2enable) = true; +#define V_mld_v2enable VNET(mld_v2enable) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_v2enable), 0, "Enable MLDv2"); -static int mld_use_allow = 1; -SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, - &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); +VNET_DEFINE_STATIC(bool, mld_use_allow) = true; +#define V_mld_use_allow VNET(mld_use_allow) +SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_VNET | CTLFLAG_RWTUN, + &VNET_NAME(mld_use_allow), 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. 
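The MLD hunks above convert three global int knobs into per-VNET bools, so each virtualized network stack can tune them independently. A reduced model of the change, moving a global flag into per-instance state (struct and field names are illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct vnet_mld {
    bool v1enable;
    bool v2enable;
    bool use_allow;
};

static void
input_query(const struct vnet_mld *v, int version)
{
    /* Mirrors the V_mld_v1enable check in mld_v1_input_query(). */
    if (version == 1 && !v->v1enable) {
        printf("ignore v1 query\n");
        return;
    }
    printf("process v%d query\n", version);
}

int
main(void)
{
    struct vnet_mld host = { true, true, true };
    struct vnet_mld jail = { false, true, true }; /* v1 off in this vnet only */

    input_query(&host, 1);
    input_query(&jail, 1);
    return (0);
}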
@@ -481,7 +484,7 @@ mld_domifattach(struct ifnet *ifp) mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); if ((ifp->if_flags & IFF_MULTICAST) == 0) mli->mli_flags |= MLIF_SILENT; - if (mld_use_allow) + if (V_mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_LOCK(); @@ -614,7 +617,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, is_general_query = 0; - if (!mld_v1enable) { + if (!V_mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); @@ -790,7 +793,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, NET_EPOCH_ASSERT(); - if (!mld_v2enable) { + if (!V_mld_v2enable) { CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); @@ -1076,7 +1079,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, NET_EPOCH_ASSERT(); - if (!mld_v1enable) { + if (!V_mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 0379ef7c789a..c90a1213bd66 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -765,8 +765,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + (IN6_IFF_NOTREADY|IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c index 6bacc68b7441..92d0201b398a 100644 --- a/sys/netipsec/ipsec.c +++ b/sys/netipsec/ipsec.c @@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp) #ifdef IPSEC_OFFLOAD tag = ipsec_accel_input_tag_lookup(m); - if (tag != NULL) - return (0); + if (tag != NULL) { + tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE; + __DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED; + } #endif if (ip1 == NULL) { diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c index 467d5ded1d7a..8a09d5f37b4a 100644 --- a/sys/netipsec/ipsec_offload.c +++ b/sys/netipsec/ipsec_offload.c @@ -94,6 +94,7 @@ struct ifp_handle_sav { size_t hdr_ext_size; uint64_t cnt_octets; uint64_t cnt_allocs; + struct xform_history xfh; }; #define IFP_HS_HANDLED 0x00000001 @@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav, static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp); +static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); static void ipsec_accel_init(void *arg) @@ -185,6 +188,7 @@ ipsec_accel_init(void *arg) ipsec_accel_drv_sa_lifetime_update_impl; ipsec_accel_drv_sa_lifetime_fetch_p = ipsec_accel_drv_sa_lifetime_fetch_impl; + ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl; pctrie_init(&drv_spi_pctrie); ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER( ifnet_departure_event, ipsec_accel_ifdetach_event, NULL, @@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg) ipsec_accel_on_ifdown_p = NULL; ipsec_accel_drv_sa_lifetime_update_p = NULL; ipsec_accel_drv_sa_lifetime_fetch_p = NULL; + ipsec_accel_fill_xh_p = NULL; ipsec_accel_sync_imp(); clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */ clear_unrhdr(drv_spi_unr); @@ -412,6 +417,10 @@ 
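The comment above, "Must be first to mimic IPSEC_IN_DONE", uses struct-prefix aliasing: placing the xform_history first in ipsec_accel_in_tag lets code that expects the plain IPSEC_IN_DONE tag payload read the same bytes. A standalone sketch of the trick, with stand-in types:

#include <stdio.h>

/* Stand-in for struct xform_history. */
struct hist {
    unsigned spi;
    unsigned proto;
};

/*
 * The accel tag places the history first, so a consumer expecting the
 * bare struct hist payload can read it through a prefix cast (a pointer
 * to a structure points to its initial member in C).
 */
struct accel_tag {
    struct hist xh;        /* must stay first */
    unsigned short drv_spi;
};

int
main(void)
{
    struct accel_tag t = { { 0x1001, 50 }, 7 };
    struct hist *xh = (struct hist *)&t;  /* prefix alias */

    printf("spi=%#x proto=%u\n", xh->spi, xh->proto);
    return (0);
}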
ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp, ihs->ifdata = priv; ihs->flags = flags; ihs->hdr_ext_size = esp_hdrsiz(sav); + memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst)); + ihs->xfh.spi = sav->spi; + ihs->xfh.proto = sav->sah->saidx.proto; + ihs->xfh.mode = sav->sah->saidx.mode; mtx_lock(&ipsec_accel_sav_tmp); CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) { if (i->ifp == ifp) { @@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav) return (m); } +static bool +ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh) +{ + struct ifp_handle_sav *i; + + if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN || + drv_spi > IPSEC_ACCEL_DRV_SPI_MAX) + return (false); + + i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi); + if (i == NULL) + return (false); + memcpy(xh, &i->xfh, sizeof(*xh)); + return (true); +} + #endif /* IPSEC_OFFLOAD */ diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h index 904fe6252396..ae60eaa8ae78 100644 --- a/sys/netipsec/ipsec_offload.h +++ b/sys/netipsec/ipsec_offload.h @@ -30,6 +30,7 @@ #include <sys/errno.h> #include <net/if.h> #include <net/if_var.h> +#include <netipsec/xform.h> struct secpolicy; struct secasvar; @@ -42,6 +43,7 @@ struct ipsec_accel_out_tag { struct ipsec_accel_in_tag { struct m_tag tag; + struct xform_history xh; /* Must be first to mimic IPSEC_IN_DONE */ uint16_t drv_spi; }; @@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); +extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); #ifdef IPSEC_OFFLOAD /* @@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav) return (NULL); } +static inline bool +ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh) +{ + bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh); + + p = atomic_load_ptr(&ipsec_accel_fill_xh_p); + if (p != NULL) + return (p(ifp, drv_spi, xh)); + return (false); +} #else #define ipsec_accel_sa_newkey(a) @@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav) #define ipsec_accel_sync() #define ipsec_accel_is_accel_sav(a) #define ipsec_accel_key_setaccelif(a) +#define ipsec_accel_fill_xh(a, b, c) (false) #endif void ipsec_accel_forget_sav_impl(struct secasvar *sav); @@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m, struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af, int mtu, int *hwassist); void ipsec_accel_forget_sav(struct secasvar *sav); +struct xform_history; #else #define ipsec_accel_input(a, b, c) (ENXIO) #define ipsec_accel_output(a, b, c, d, e, f, g, h) ({ \ diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index ae67d83c6d13..4ba1b49c24f0 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); +bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi, + struct xform_history *xh); #endif #define FULLMASK 0xff diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h index 8492ecb3021b..720317ed74f3 100644 --- 
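The inline ipsec_accel_fill_xh() wrapper pairs the implementation above with an atomic_load_ptr() of a hook pointer that is only non-NULL while the offload module is loaded. A C11 userspace sketch of that optional-hook pattern (names hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef bool (*fill_fn)(unsigned spi, unsigned *out);

static _Atomic fill_fn fill_hook;  /* NULL until the "module" loads */

static bool
module_fill(unsigned spi, unsigned *out)
{
    *out = spi + 1;
    return (true);
}

static bool
fill(unsigned spi, unsigned *out)
{
    fill_fn p = atomic_load(&fill_hook);

    if (p != NULL)
        return (p(spi, out));
    return (false);  /* module not loaded: graceful fallback */
}

int
main(void)
{
    unsigned v;
    bool ok;

    printf("before load: %d\n", fill(5, &v));
    atomic_store(&fill_hook, module_fill);
    ok = fill(5, &v);
    printf("after load:  %d (v=%u)\n", ok, v);
    return (0);
}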
a/sys/netlink/netlink_message_parser.h +++ b/sys/netlink/netlink_message_parser.h @@ -209,7 +209,8 @@ int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target); -bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...); +bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) + __printflike(2, 3); #define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \ nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \ diff --git a/sys/netpfil/ipfilter/netinet/fil.c b/sys/netpfil/ipfilter/netinet/fil.c index 2a75190a3ec7..2fcea433295f 100644 --- a/sys/netpfil/ipfilter/netinet/fil.c +++ b/sys/netpfil/ipfilter/netinet/fil.c @@ -437,7 +437,7 @@ static inline void ipf_pr_ipv6hdr(fr_info_t *fin) { ip6_t *ip6 = (ip6_t *)fin->fin_ip; - int p, go = 1, i, hdrcount; + int p, go = 1, i; fr_ip_t *fi = &fin->fin_fi; fin->fin_off = 0; @@ -464,7 +464,6 @@ ipf_pr_ipv6hdr(fr_info_t *fin) if (IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6)) fin->fin_flx |= FI_MULTICAST|FI_MBCAST; - hdrcount = 0; while (go && !(fin->fin_flx & FI_SHORT)) { switch (p) { @@ -542,7 +541,6 @@ ipf_pr_ipv6hdr(fr_info_t *fin) go = 0; break; } - hdrcount++; /* * It is important to note that at this point, for the @@ -2590,14 +2588,13 @@ ipf_scanlist(fr_info_t *fin, u_32_t pass) /* functions called from the IPFilter "mainline" in ipf_check(). */ /* ------------------------------------------------------------------------ */ frentry_t * -ipf_acctpkt(fr_info_t *fin, u_32_t *passp) +ipf_acctpkt(fr_info_t *fin, u_32_t *passp __unused) { ipf_main_softc_t *softc = fin->fin_main_soft; char group[FR_GROUPLEN]; frentry_t *fr, *frsave; u_32_t pass, rulen; - passp = passp; fr = softc->ipf_acct[fin->fin_out][softc->ipf_active]; if (fr != NULL) { @@ -4200,7 +4197,7 @@ ipf_getstat(ipf_main_softc_t *softc, friostat_t *fiop, int rev) (rev / 10000) % 100, (rev / 100) % 100); #else - rev = rev; + (void)rev; /* UNUSED */ (void) strncpy(fiop->f_version, ipfilter_version, sizeof(fiop->f_version)); #endif @@ -4408,13 +4405,12 @@ frrequest(ipf_main_softc_t *softc, int unit, ioctlcmd_t req, caddr_t data, OP_ZERO /* zero statistics and counters */ } addrem = OP_ADD; frentry_t frd, *fp, *f, **fprev, **ftail; - void *ptr, *uptr, *cptr; + void *ptr, *uptr; u_int *p, *pp; frgroup_t *fg; char *group; ptr = NULL; - cptr = NULL; fg = NULL; fp = &frd; if (makecopy != 0) { @@ -4532,7 +4528,6 @@ frrequest(ipf_main_softc_t *softc, int unit, ioctlcmd_t req, caddr_t data, } ptr = NULL; - cptr = NULL; if (FR_ISACCOUNT(fp->fr_flags)) unit = IPL_LOGCOUNT; @@ -7314,11 +7309,10 @@ ipf_resolvedest(ipf_main_softc_t *softc, char *base, frdest_t *fdp, int v) /* for both IPv4 and IPv6 on the same physical NIC. 
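The netlink hunk above adds __printflike(2, 3) so callers of nlmsg_report_err_msg() get compile-time format-string checking. Its conventional expansion, shown on a hypothetical helper:

#include <stdarg.h>
#include <stdio.h>

/* __printflike(2, 3): argument 2 is the format, varargs start at 3. */
static void report(int sev, const char *fmt, ...)
    __attribute__((__format__(__printf__, 2, 3)));

static void
report(int sev, const char *fmt, ...)
{
    va_list ap;

    fprintf(stderr, "[%d] ", sev);
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fputc('\n', stderr);
}

int
main(void)
{
    report(1, "bad attribute %s (len %d)", "IFLA_MTU", 4);
    /* report(1, "bad %s", 42);  <- would now draw a compiler warning */
    return (0);
}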
*/ /* ------------------------------------------------------------------------ */ void * -ipf_resolvenic(ipf_main_softc_t *softc, char *name, int v) +ipf_resolvenic(ipf_main_softc_t *softc __unused, char *name, int v) { void *nic; - softc = softc; /* gcc -Wextra */ if (name[0] == '\0') return (NULL); @@ -7455,6 +7449,10 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr) { ipftoken_t *it, *new; + KMALLOC(new, ipftoken_t *); + if (new != NULL) + bzero((char *)new, sizeof(*new)); + WRITE_ENTER(&softc->ipf_tokens); for (it = softc->ipf_token_head; it != NULL; it = it->ipt_next) { if ((ptr == it->ipt_ctx) && (type == it->ipt_type) && @@ -7463,10 +7461,6 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr) } if (it == NULL) { - KMALLOC(new, ipftoken_t *); - if (new != NULL) - bzero((char *)new, sizeof(*new)); - it = new; new = NULL; if (it == NULL) { @@ -7478,6 +7472,11 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr) it->ipt_type = type; it->ipt_ref = 1; } else { + if (new != NULL) { + KFREE(new); + new = NULL; + } + if (it->ipt_complete > 0) it = NULL; else diff --git a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c index 04850549db98..6eb6cf2a7a47 100644 --- a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c @@ -463,13 +463,14 @@ ipf_send_ip(fr_info_t *fin, mb_t *m) int ipf_send_icmp_err(int type, fr_info_t *fin, int dst) { - int err, hlen, xtra, iclen, ohlen, avail, code; + int err, hlen, xtra, iclen, ohlen, avail; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 + int code; ip6_t *ip6; #endif ip_t *ip, *ip2; @@ -477,8 +478,8 @@ ipf_send_icmp_err(int type, fr_info_t *fin, int dst) if ((type < 0) || (type >= ICMP_MAXTYPE)) return (-1); - code = fin->fin_icode; #ifdef USE_INET6 + code = fin->fin_icode; /* See NetBSD ip_fil_netbsd.c r1.4: */ if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return (-1); diff --git a/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c index 482e0b456ae5..8c9317c38326 100644 --- a/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c @@ -219,7 +219,7 @@ ipf_p_ftp_soft_destroy(ipf_main_softc_t *softc, void *arg) int -ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { ftpinfo_t *ftp; ftpside_t *f; @@ -228,8 +228,6 @@ ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) if (ftp == NULL) return (-1); - nat = nat; /* LINT */ - aps->aps_data = ftp; aps->aps_psiz = sizeof(ftpinfo_t); aps->aps_sport = htons(fin->fin_sport); @@ -1715,7 +1713,9 @@ ipf_p_ftp_eprt4(ipf_ftp_softc_t *softf, fr_info_t *fin, ip_t *ip, nat_t *nat, return (0); if (c != delim) return (0); - addr |= addr; +#if 0 + addr |= (addr << 0); +#endif /* * Get the port number diff --git a/sys/netpfil/ipfilter/netinet/ip_htable.c b/sys/netpfil/ipfilter/netinet/ip_htable.c index 22d427b87a71..91b375f80db1 100644 --- a/sys/netpfil/ipfilter/netinet/ip_htable.c +++ b/sys/netpfil/ipfilter/netinet/ip_htable.c @@ -343,6 +343,7 @@ ipf_htable_create(ipf_main_softc_t *softc, void *arg, iplookupop_t *op) iph->iph_ref = 1; iph->iph_list = NULL; iph->iph_tail = &iph->iph_list; + iph->iph_unit = unit; iph->iph_next = softh->ipf_htables[unit + 1]; iph->iph_pnext = &softh->ipf_htables[unit + 1]; if 
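The ipf_token_find() change above allocates the spare token before taking the write lock and frees it afterwards if an existing entry was found, keeping the sleeping allocation out of the critical section. A pthreads model of that preallocation pattern (names and locking primitives are stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct token {
    int type;
    struct token *next;
};

static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;
static struct token *head;

static struct token *
token_find(int type)
{
    struct token *t, *new;

    new = calloc(1, sizeof(*new));  /* may block: done before locking */

    pthread_rwlock_wrlock(&lk);
    for (t = head; t != NULL; t = t->next)
        if (t->type == type)
            break;
    if (t == NULL && new != NULL) {
        new->type = type;
        new->next = head;
        head = t = new;
        new = NULL;  /* consumed */
    }
    pthread_rwlock_unlock(&lk);

    free(new);  /* unused spare (possibly NULL) */
    return (t);
}

int
main(void)
{
    printf("first:  %p\n", (void *)token_find(3));
    printf("second: %p\n", (void *)token_find(3)); /* same entry, spare freed */
    return (0);
}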
(softh->ipf_htables[unit + 1] != NULL) @@ -603,7 +604,7 @@ ipf_htent_remove(ipf_main_softc_t *softc, void *arg, iphtable_t *iph, switch (iph->iph_type & ~IPHASH_ANON) { case IPHASH_GROUPMAP : - if (ipe->ipe_group != NULL) + if (ipe->ipe_ptr != NULL) ipf_group_del(softc, ipe->ipe_ptr, NULL); break; @@ -973,7 +974,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo) { iphtent_t ipe, *ent; u_int hv; - int bits; bcopy((char *)ipeo, (char *)&ipe, sizeof(ipe)); ipe.ipe_addr.i6[0] &= ipe.ipe_mask.i6[0]; @@ -981,7 +981,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo) ipe.ipe_addr.i6[2] &= ipe.ipe_mask.i6[2]; ipe.ipe_addr.i6[3] &= ipe.ipe_mask.i6[3]; if (ipe.ipe_family == AF_INET) { - bits = count4bits(ipe.ipe_mask.in4_addr); ipe.ipe_addr.i6[1] = 0; ipe.ipe_addr.i6[2] = 0; ipe.ipe_addr.i6[3] = 0; @@ -993,7 +992,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo) } else #ifdef USE_INET6 if (ipe.ipe_family == AF_INET6) { - bits = count6bits(ipe.ipe_mask.i6); hv = IPE_V6_HASH_FN(ipe.ipe_addr.i6, ipe.ipe_mask.i6, iph->iph_size); } else diff --git a/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c b/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c index c6e4be17e22e..d5103c2944dc 100644 --- a/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c @@ -341,15 +341,13 @@ ipf_p_ipsec_inout(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) * UDP/TCP port numbers). */ int -ipf_p_ipsec_match(fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_ipsec_match(fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { ipsec_pxy_t *ipsec; u_32_t cookies[4]; mb_t *m; int off; - nat = nat; /* LINT */ - if ((fin->fin_dlen < sizeof(cookies)) || (fin->fin_flx & FI_FRAG)) return (-1); diff --git a/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c b/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c index 026459299efd..aa9e84be19ed 100644 --- a/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c @@ -221,7 +221,7 @@ ipf_p_irc_complete(ircinfo_t *ircp, char *buf, size_t len) int -ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { ircinfo_t *irc; @@ -232,8 +232,6 @@ ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) if (irc == NULL) return (-1); - nat = nat; /* LINT */ - aps->aps_data = irc; aps->aps_psiz = sizeof(ircinfo_t); @@ -422,8 +420,7 @@ ipf_p_irc_send(fr_info_t *fin, nat_t *nat) int -ipf_p_irc_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_irc_out(void *arg, fr_info_t *fin, ap_session_t *aps __unused, nat_t *nat) { - aps = aps; /* LINT */ return (ipf_p_irc_send(fin, nat)); } diff --git a/sys/netpfil/ipfilter/netinet/ip_lookup.c b/sys/netpfil/ipfilter/netinet/ip_lookup.c index b46d1b875003..a52dbef00166 100644 --- a/sys/netpfil/ipfilter/netinet/ip_lookup.c +++ b/sys/netpfil/ipfilter/netinet/ip_lookup.c @@ -230,13 +230,11 @@ ipf_lookup_soft_destroy(ipf_main_softc_t *softc, void *arg) /* ------------------------------------------------------------------------ */ int ipf_lookup_ioctl(ipf_main_softc_t *softc, caddr_t data, ioctlcmd_t cmd, - int mode, int uid, void *ctx) + int mode __unused, int uid, void *ctx) { int err; SPL_INT(s); - mode = mode; /* LINT */ - SPL_NET(s); switch (cmd) diff --git a/sys/netpfil/ipfilter/netinet/ip_nat.c b/sys/netpfil/ipfilter/netinet/ip_nat.c index a13c6129a287..972511f43bd5 100644 --- a/sys/netpfil/ipfilter/netinet/ip_nat.c +++ b/sys/netpfil/ipfilter/netinet/ip_nat.c @@ -3224,13 
+3224,10 @@ ipf_nat_finalise(fr_info_t *fin, nat_t *nat) ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_32_t sum1, sum2, sumd; frentry_t *fr; - u_32_t flags; #if SOLARIS && defined(_KERNEL) && defined(ICK_M_CTL_MAGIC) qpktinfo_t *qpi = fin->fin_qpi; #endif - flags = nat->nat_flags; - switch (nat->nat_pr[0]) { case IPPROTO_ICMP : @@ -3538,8 +3535,8 @@ ipf_nat_icmperrorlookup(fr_info_t *fin, int dir) { ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; - int flags = 0, type, minlen; - icmphdr_t *icmp, *orgicmp; + int flags = 0, minlen; + icmphdr_t *orgicmp; nat_stat_side_t *nside; tcphdr_t *tcp = NULL; u_short data[2]; @@ -3547,8 +3544,6 @@ ipf_nat_icmperrorlookup(fr_info_t *fin, int dir) ip_t *oip; u_int p; - icmp = fin->fin_dp; - type = icmp->icmp_type; nside = &softn->ipf_nat_stats.ns_side[fin->fin_out]; /* * Does it at least have the return (basic) IP header ? @@ -3999,9 +3994,7 @@ ipf_nat_inlookup(fr_info_t *fin, u_int flags, u_int p, ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_short sport, dport; - grehdr_t *gre; ipnat_t *ipn; - u_int sflags; nat_t *nat; int nflags; u_32_t dst; @@ -4009,9 +4002,7 @@ ipf_nat_inlookup(fr_info_t *fin, u_int flags, u_int p, u_int hv, rhv; ifp = fin->fin_ifp; - gre = NULL; dst = mapdst.s_addr; - sflags = flags & NAT_TCPUDPICMP; switch (p) { @@ -4330,14 +4321,12 @@ ipf_nat_outlookup(fr_info_t *fin, u_int flags, u_int p, ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_short sport, dport; - u_int sflags; ipnat_t *ipn; nat_t *nat; void *ifp; u_int hv; ifp = fin->fin_ifp; - sflags = flags & IPN_TCPUDPICMP; switch (p) { @@ -4756,7 +4745,6 @@ ipf_nat_checkout(fr_info_t *fin, u_32_t *passp) struct ifnet *ifp, *sifp; ipf_main_softc_t *softc; ipf_nat_softc_t *softn; - icmphdr_t *icmp = NULL; tcphdr_t *tcp = NULL; int rval, natfailed; u_int nflags = 0; @@ -4802,8 +4790,6 @@ ipf_nat_checkout(fr_info_t *fin, u_32_t *passp) nflags = IPN_UDP; break; case IPPROTO_ICMP : - icmp = fin->fin_dp; - /* * This is an incoming packet, so the destination is * the icmp_id and the source port equals 0 @@ -5463,7 +5449,10 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags) { ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; - u_32_t sumd, ipsumd, sum1, sum2; + u_32_t sumd, sum1, sum2; +#if !defined(_KERNEL) || SOLARIS + u_32_t ipsumd; +#endif icmphdr_t *icmp; tcphdr_t *tcp; ipnat_t *np; @@ -5499,7 +5488,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags) ipf_sync_update(softc, SMC_NAT, fin, nat->nat_sync); +#if !defined(_KERNEL) || SOLARIS ipsumd = nat->nat_ipsumd; +#endif /* * Fix up checksums, not by recalculating them, but * simply computing adjustments. 
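The comment above describes fixing checksums "by computing adjustments" rather than recalculating, and the CALC_SUMD() uses in the following hunks compute such deltas from the old and new addresses. A self-checking sketch in the same spirit (the folding logic here is my own RFC 1624-style version, not the ipfilter macro):

#include <stdint.h>
#include <stdio.h>

static uint16_t
fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return ((uint16_t)sum);
}

/* Adjust an existing checksum for rewriting oaddr to naddr. */
static uint16_t
cksum_adjust(uint16_t cksum, uint32_t oaddr, uint32_t naddr)
{
    uint32_t sum;

    sum = (uint16_t)~cksum; /* recover the one's-complement sum */
    /* subtract the old address words, add the new ones */
    sum += (uint16_t)~(oaddr >> 16) + (uint16_t)~(oaddr & 0xffff);
    sum += (naddr >> 16) + (naddr & 0xffff);
    return ((uint16_t)~fold(sum));
}

int
main(void)
{
    /* Toy "header" whose sum covers one address word, then rewrite it. */
    uint32_t oaddr = 0x0a000001, naddr = 0xc0a80001;
    uint16_t ck = ~fold((oaddr >> 16) + (oaddr & 0xffff));

    ck = cksum_adjust(ck, oaddr, naddr);
    printf("adjusted ok: %d\n",
        ck == (uint16_t)~fold((naddr >> 16) + (naddr & 0xffff))); /* 1 */
    return (0);
}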
@@ -5521,7 +5512,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags) sum1 = nat->nat_osrcaddr; sum2 = nat->nat_nsrcaddr; CALC_SUMD(sum1, sum2, sumd); +#if !defined(_KERNEL) || SOLARIS ipsumd -= sumd; +#endif } fin->fin_ip->ip_dst = nat->nat_ndstip; fin->fin_daddr = nat->nat_ndstaddr; @@ -5538,7 +5531,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags) sum1 = nat->nat_odstaddr; sum2 = nat->nat_ndstaddr; CALC_SUMD(sum1, sum2, sumd); +#if !defined(_KERNEL) || SOLARIS ipsumd -= sumd; +#endif } fin->fin_ip->ip_dst = nat->nat_osrcip; fin->fin_daddr = nat->nat_osrcaddr; @@ -7352,30 +7347,18 @@ ipf_nat_nextaddr(fr_info_t *fin, nat_addr_t *na, u_32_t *old, u_32_t *dst) { ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; - u_32_t amin, amax, new; + u_32_t new; i6addr_t newip; int error; new = 0; - amin = na->na_addr[0].in4.s_addr; switch (na->na_atype) { case FRI_RANGE : - amax = na->na_addr[1].in4.s_addr; - break; - case FRI_NETMASKED : case FRI_DYNAMIC : case FRI_NORMAL : - /* - * Compute the maximum address by adding the inverse of the - * netmask to the minimum address. - */ - amax = ~na->na_addr[1].in4.s_addr; - amax |= amin; - break; - case FRI_LOOKUP : break; diff --git a/sys/netpfil/ipfilter/netinet/ip_nat6.c b/sys/netpfil/ipfilter/netinet/ip_nat6.c index dbe19c40c2f2..6d5913177b90 100644 --- a/sys/netpfil/ipfilter/netinet/ip_nat6.c +++ b/sys/netpfil/ipfilter/netinet/ip_nat6.c @@ -1130,9 +1130,6 @@ ipf_nat6_finalise(fr_info_t *fin, nat_t *nat) ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_32_t sum1, sum2, sumd; frentry_t *fr; - u_32_t flags; - - flags = nat->nat_flags; switch (fin->fin_p) { @@ -1355,8 +1352,8 @@ ipf_nat6_icmperrorlookup(fr_info_t *fin, int dir) { ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; - struct icmp6_hdr *icmp6, *orgicmp; - int flags = 0, type, minlen; + struct icmp6_hdr *orgicmp; + int flags = 0, minlen; nat_stat_side_t *nside; tcphdr_t *tcp = NULL; u_short data[2]; @@ -1365,8 +1362,6 @@ ipf_nat6_icmperrorlookup(fr_info_t *fin, int dir) u_int p; minlen = 40; - icmp6 = fin->fin_dp; - type = icmp6->icmp6_type; nside = &softn->ipf_nat_stats.ns_side6[fin->fin_out]; /* * Does it at least have the return (basic) IP header ? 
@@ -1500,9 +1495,8 @@ ipf_nat6_ip6subtract(i6addr_t *ip1, i6addr_t *ip2) i6addr_t l1, l2, d; u_short *s1, *s2, *ds; u_32_t r; - int i, neg; + int i; - neg = 0; l1 = *ip1; l2 = *ip2; s1 = (u_short *)&l1; @@ -1519,7 +1513,6 @@ ipf_nat6_ip6subtract(i6addr_t *ip1, i6addr_t *ip2) } if (s2[0] > s1[0]) { ds[0] = s2[0] + 0x10000 - s1[0]; - neg = 1; } else { ds[0] = s2[0] - s1[0]; } @@ -1869,9 +1862,9 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p, ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_short sport, dport; - grehdr_t *gre; +#ifdef IPF_V6_PROXIES ipnat_t *ipn; - u_int sflags; +#endif nat_t *nat; int nflags; i6addr_t dst; @@ -1881,10 +1874,7 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p, ifp = fin->fin_ifp; sport = 0; dport = 0; - gre = NULL; dst.in6 = *mapdst; - sflags = flags & NAT_TCPUDPICMP; - switch (p) { case IPPROTO_TCP : @@ -1962,8 +1952,8 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p, if ((nat->nat_flags & IPN_TCPUDP) != 0) { - ipn = nat->nat_ptr; #ifdef IPF_V6_PROXIES + ipn = nat->nat_ptr; if ((ipn != NULL) && (nat->nat_aps != NULL)) if (appr_match(fin, nat) != 0) continue; @@ -2192,14 +2182,14 @@ ipf_nat6_outlookup(fr_info_t *fin, u_int flags, u_int p, ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; u_short sport, dport; - u_int sflags; +#ifdef IPF_V6_PROXIES ipnat_t *ipn; +#endif nat_t *nat; void *ifp; u_int hv; ifp = fin->fin_ifp; - sflags = flags & IPN_TCPUDPICMP; sport = 0; dport = 0; @@ -2280,8 +2270,8 @@ ipf_nat6_outlookup(fr_info_t *fin, u_int flags, u_int p, break; } - ipn = nat->nat_ptr; #ifdef IPF_V6_PROXIES + ipn = nat->nat_ptr; if ((ipn != NULL) && (nat->nat_aps != NULL)) if (appr_match(fin, nat) != 0) continue; @@ -2568,7 +2558,6 @@ ipf_nat6_checkout(fr_info_t *fin, u_32_t *passp) ipf_nat_softc_t *softn = softc->ipf_nat_soft; struct icmp6_hdr *icmp6 = NULL; struct ifnet *ifp, *sifp; - tcphdr_t *tcp = NULL; int rval, natfailed; ipnat_t *np = NULL; u_int nflags = 0; @@ -2621,9 +2610,6 @@ ipf_nat6_checkout(fr_info_t *fin, u_32_t *passp) default : break; } - - if ((nflags & IPN_TCPUDP)) - tcp = fin->fin_dp; } ipa = fin->fin_src6; @@ -2965,7 +2951,9 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp) int rval, natfailed; struct ifnet *ifp; i6addr_t ipa, iph; - tcphdr_t *tcp; +#ifdef IPF_V6_PROXIES + tcphdr_t *tcp = NULL; +#endif u_short dport; ipnat_t *np; nat_t *nat; @@ -2973,7 +2961,6 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp) if (softn->ipf_nat_stats.ns_rules == 0 || softn->ipf_nat_lock != 0) return (0); - tcp = NULL; icmp6 = NULL; dport = 0; natadd = 1; @@ -3014,7 +3001,9 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp) } if ((nflags & IPN_TCPUDP)) { +#ifdef IPF_V6_PROXIES tcp = fin->fin_dp; +#endif dport = fin->fin_data[1]; } } @@ -3802,32 +3791,19 @@ ipf_nat6_nextaddr(fr_info_t *fin, nat_addr_t *na, i6addr_t *old, i6addr_t *dst) ipf_main_softc_t *softc = fin->fin_main_soft; ipf_nat_softc_t *softn = softc->ipf_nat_soft; i6addr_t newip, new; - u_32_t amin, amax; int error; new.i6[0] = 0; new.i6[1] = 0; new.i6[2] = 0; new.i6[3] = 0; - amin = na->na_addr[0].in4.s_addr; switch (na->na_atype) { case FRI_RANGE : - amax = na->na_addr[1].in4.s_addr; - break; - case FRI_NETMASKED : case FRI_DYNAMIC : case FRI_NORMAL : - /* - * Compute the maximum address by adding the inverse of the - * netmask to the minimum address. 
- */ - amax = ~na->na_addr[1].in4.s_addr; - amax |= amin; - break; - case FRI_LOOKUP : break; diff --git a/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c b/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c index 2ad642adfbcd..f9c1ab50b8a2 100644 --- a/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c @@ -67,7 +67,7 @@ ipf_p_netbios_main_unload(void) int -ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps __unused, nat_t *nat __unused) { char dgmbuf[6]; int off, dlen; @@ -75,9 +75,6 @@ ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) ip_t *ip; mb_t *m; - aps = aps; /* LINT */ - nat = nat; /* LINT */ - m = fin->fin_m; dlen = fin->fin_dlen - sizeof(*udp); /* diff --git a/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c index 0ac19b067d2d..dc4c67dc14f0 100644 --- a/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c @@ -281,7 +281,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev) tcphdr_t *tcp; int dlen, off; u_short len; - char *msg; tcp = fin->fin_dp; dlen = fin->fin_dlen - (TCP_OFF(tcp) << 2); @@ -310,8 +309,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev) return (-1); } - msg = (char *)fin->fin_dp + (TCP_OFF(tcp) << 2); - while (dlen > 0) { off += pptps->pptps_bytes; if (pptps->pptps_gothdr == 0) { @@ -337,7 +334,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev) } } dlen -= len; - msg += len; off += len; pptps->pptps_gothdr = 1; @@ -381,7 +377,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev) pptps->pptps_len = 0; start += len; - msg += len; dlen -= len; } diff --git a/sys/netpfil/ipfilter/netinet/ip_proxy.c b/sys/netpfil/ipfilter/netinet/ip_proxy.c index 9785fc37d3da..9fb6dbd2a9e1 100644 --- a/sys/netpfil/ipfilter/netinet/ip_proxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_proxy.c @@ -679,14 +679,12 @@ ipf_proxy_ok(fr_info_t *fin, tcphdr_t *tcp, ipnat_t *np) /* ------------------------------------------------------------------------ */ int ipf_proxy_ioctl(ipf_main_softc_t *softc, caddr_t data, ioctlcmd_t cmd, - int mode, void *ctx) + int mode __unused, void *ctx) { ap_ctl_t ctl; caddr_t ptr; int error; - mode = mode; /* LINT */ - switch (cmd) { case SIOCPROXY : diff --git a/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c b/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c index 2cfaaa58200f..94f0e3ada707 100644 --- a/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c @@ -49,12 +49,10 @@ ipf_p_raudio_main_unload(void) * Setup for a new proxy to handle Real Audio. 
*/ int -ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { raudio_t *rap; - nat = nat; /* LINT */ - if (fin->fin_v != 4) return (-1); @@ -72,7 +70,7 @@ ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) int -ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { raudio_t *rap = aps->aps_data; unsigned char membuf[512 + 1], *s; @@ -82,8 +80,6 @@ ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) int len = 0; mb_t *m; - nat = nat; /* LINT */ - /* * If we've already processed the start messages, then nothing left * for the proxy to do. diff --git a/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c b/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c index 778f14f442de..b85794e75499 100644 --- a/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c @@ -63,18 +63,12 @@ ipf_p_rcmd_main_unload(void) * Setup for a new RCMD proxy. */ int -ipf_p_rcmd_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_rcmd_new(void *arg, fr_info_t *fin __unused, ap_session_t *aps, nat_t *nat) { tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; rcmdinfo_t *rc; ipnat_t *ipn; - ipnat_t *np; - int size; - fin = fin; /* LINT */ - - np = nat->nat_ptr; - size = np->in_size; KMALLOC(rc, rcmdinfo_t *); if (rc == NULL) { #ifdef IP_RCMD_PROXY_DEBUG diff --git a/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c b/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c index f8f4d2d325e1..c608f84d7b3b 100644 --- a/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c @@ -144,12 +144,10 @@ ipf_p_rpcb_main_unload(void) /* Allocate resources for per-session proxy structures. */ /* -------------------------------------------------------------------- */ int -ipf_p_rpcb_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_rpcb_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused) { rpcb_session_t *rs; - nat = nat; /* LINT */ - if (fin->fin_v != 4) return (-1); @@ -1023,10 +1021,8 @@ ipf_p_rpcb_lookup(rpcb_session_t *rs, u_32_t xid) /* Free the RPCB transaction record rx from the chain of entries. */ /* -------------------------------------------------------------------- */ static void -ipf_p_rpcb_deref(rpcb_session_t *rs, rpcb_xact_t *rx) +ipf_p_rpcb_deref(rpcb_session_t *rs __unused, rpcb_xact_t *rx) { - rs = rs; /* LINT */ - if (rx == NULL) return; diff --git a/sys/netpfil/ipfilter/netinet/ip_state.c b/sys/netpfil/ipfilter/netinet/ip_state.c index 8fe11e3f1215..36fdf23cd062 100644 --- a/sys/netpfil/ipfilter/netinet/ip_state.c +++ b/sys/netpfil/ipfilter/netinet/ip_state.c @@ -883,7 +883,7 @@ ipf_state_putent(ipf_main_softc_t *softc, ipf_state_softc_t *softs, { ipstate_t *is, *isn; ipstate_save_t ips; - int error, out, i; + int error, i; frentry_t *fr; char *name; @@ -929,7 +929,6 @@ ipf_state_putent(ipf_main_softc_t *softc, ipf_state_softc_t *softs, return (ENOMEM); } bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr)); - out = fr->fr_flags & FR_OUTQUE ? 
1 : 0; isn->is_rule = fr; ips.ips_is.is_rule = fr; MUTEX_NUKE(&fr->fr_lock); @@ -2207,20 +2206,6 @@ ipf_state_tcpinwindow(fr_info_t *fin, tcpdata_t *fdata, tcpdata_t *tdata, (ackskew >= -1) && (ackskew <= 1)) { inseq = 1; } else if (!(flags & IS_TCPFSM)) { - int i; - - i = (fin->fin_rev << 1) + fin->fin_out; - -#if 0 - if (is_pkts[i]0 == 0) { - /* - * Picking up a connection in the middle, the "next" - * packet seen from a direction that is new should be - * accepted, even if it appears out of sequence. - */ - inseq = 1; - } else -#endif if (!(fdata->td_winflags & (TCP_WSCALE_SEEN|TCP_WSCALE_FIRST))) { /* @@ -2616,7 +2601,7 @@ ipf_checkicmpmatchingstate(fr_info_t *fin) icmphdr_t *icmp; fr_info_t ofin; tcphdr_t *tcp; - int type, len; + int len; u_char pr; ip_t *oip; u_int hv; @@ -2634,7 +2619,6 @@ ipf_checkicmpmatchingstate(fr_info_t *fin) return (NULL); } ic = fin->fin_dp; - type = ic->icmp_type; oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN); /* @@ -4362,7 +4346,6 @@ ipf_checkicmp6matchingstate(fr_info_t *fin) ip6_t *oip6; u_char pr; u_int hv; - int type; /* * Does it at least have the return (basic) IP header ? @@ -4377,7 +4360,6 @@ ipf_checkicmp6matchingstate(fr_info_t *fin) } ic6 = fin->fin_dp; - type = ic6->icmp6_type; oip6 = (ip6_t *)((char *)ic6 + ICMPERR_ICMPHLEN); if (fin->fin_plen < sizeof(*oip6)) { diff --git a/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c index d81de100120b..3c737b38aacc 100644 --- a/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c +++ b/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c @@ -151,7 +151,7 @@ ipf_p_tftp_in(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) int -ipf_p_tftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) +ipf_p_tftp_new(void *arg, fr_info_t *fin __unused, ap_session_t *aps, nat_t *nat) { udphdr_t *udp; tftpinfo_t *ti; @@ -159,8 +159,6 @@ ipf_p_tftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat) ipnat_t *np; int size; - fin = fin; /* LINT */ - np = nat->nat_ptr; size = np->in_size; diff --git a/sys/netpfil/ipfilter/netinet/ipf_rb.h b/sys/netpfil/ipfilter/netinet/ipf_rb.h index e047c7f44a4a..334311502aab 100644 --- a/sys/netpfil/ipfilter/netinet/ipf_rb.h +++ b/sys/netpfil/ipfilter/netinet/ipf_rb.h @@ -305,13 +305,11 @@ _n##_rb_walktree(struct _n##_rb_head *head, _n##_rb_walker_t func, void *arg)\ _t *prev; \ _t *next; \ _t *node = head->top._f.right; \ - _t *base; \ \ while (node != &_n##_rb_zero) \ node = node->_f.left; \ \ for (;;) { \ - base = node; \ prev = node; \ while ((node->_f.parent->_f.right == node) && \ (node != &_n##_rb_zero)) { \ diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index 923633d76df7..c129c8c49921 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -196,7 +196,7 @@ SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, - "Only do a single pass through ipfw when using dummynet(4)"); + "Only do a single pass through ipfw when using dummynet(4), ipfw_nat or other divert(4)-like interfaces"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c index 6a87ea2471cb..cb96d2fcc44c 100644 --- a/sys/netpfil/pf/if_pflog.c +++ b/sys/netpfil/pf/if_pflog.c @@ -284,12 +284,12 @@ pflog_packet(uint8_t action, u_int8_t 
reason, * state lock, since this leads to unsafe LOR. * These conditions are very very rare, however. */ - if (trigger->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe) + if (trigger->log & PF_LOG_USER && !pd->lookup.done && lookupsafe) pd->lookup.done = pf_socket_lookup(pd); - if (pd->lookup.done > 0) + if (trigger->log & PF_LOG_USER && pd->lookup.done > 0) hdr.uid = pd->lookup.uid; else - hdr.uid = UID_MAX; + hdr.uid = -1; hdr.pid = NO_PID; hdr.rule_uid = rm->cuid; hdr.rule_pid = rm->cpid; diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c index fdedb9424117..4e03584b8f85 100644 --- a/sys/netpfil/pf/if_pfsync.c +++ b/sys/netpfil/pf/if_pfsync.c @@ -532,6 +532,7 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) struct pf_kpooladdr *rpool_first; int error; uint8_t rt = 0; + int n = 0; PF_RULES_RASSERT(); @@ -557,10 +558,12 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) */ if (sp->pfs_1301.rule != htonl(-1) && sp->pfs_1301.anchor == htonl(-1) && (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->pfs_1301.rule) < - pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) - r = pf_main_ruleset.rules[ - PF_RULESET_FILTER].active.ptr_array[ntohl(sp->pfs_1301.rule)]; - else + pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) { + TAILQ_FOREACH(r, pf_main_ruleset.rules[ + PF_RULESET_FILTER].active.ptr, entries) + if (ntohl(sp->pfs_1301.rule) == n++) + break; + } else r = &V_pf_default_rule; /* @@ -763,6 +766,10 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version) __func__, msg_version); } + if (! (st->act.rtableid == -1 || + (st->act.rtableid >= 0 && st->act.rtableid < rt_numfibs))) + goto cleanup; + st->id = sp->pfs_1301.id; st->creatorid = sp->pfs_1301.creatorid; pf_state_peer_ntoh(&sp->pfs_1301.src, &st->src); @@ -1083,7 +1090,7 @@ pfsync_in_ins(struct mbuf *m, int offset, int count, int flags, int action) msg_version = PFSYNC_MSG_VERSION_1400; break; default: - V_pfsyncstats.pfsyncs_badact++; + V_pfsyncstats.pfsyncs_badver++; return (-1); } @@ -1110,9 +1117,8 @@ pfsync_in_ins(struct mbuf *m, int offset, int count, int flags, int action) continue; } - if (pfsync_state_import(sp, flags, msg_version) == ENOMEM) - /* Drop out, but process the rest of the actions. 
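The pfsync_state_import() hunk above replaces indexing into the now-removed ptr_array with a counted walk of the active rule queue: an O(n) lookup, but one that needs no parallel array rebuilt on every ruleset commit. A self-contained sketch of the pattern using <sys/queue.h>; all names here are hypothetical:

    #include <sys/queue.h>
    #include <stddef.h>
    #include <stdint.h>

    struct rule {
            TAILQ_ENTRY(rule)       entries;
    };
    TAILQ_HEAD(rule_head, rule);

    /* Return the nr-th rule in the queue, or NULL if nr is out of range;
     * the counting mirrors the TAILQ_FOREACH added in the hunk above. */
    static struct rule *
    rule_by_number(struct rule_head *head, uint32_t nr)
    {
            struct rule *r;
            uint32_t n = 0;

            TAILQ_FOREACH(r, head, entries)
                    if (n++ == nr)
                            return (r);
            return (NULL);
    }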
*/ - break; + if (pfsync_state_import(sp, flags, msg_version) != 0) + V_pfsyncstats.pfsyncs_badact++; } return (total_len); diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index b24bbe036141..009f7e4d78b1 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -682,7 +682,8 @@ pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk) 0); break; case AF_INET6: - PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); + pf_addrcpy(pd->src, &nk->addr[pd->sidx], + pd->af); break; default: unhandled_af(pd->af); @@ -696,7 +697,8 @@ pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk) 0); break; case AF_INET6: - PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); + pf_addrcpy(pd->dst, &nk->addr[pd->didx], + pd->af); break; default: unhandled_af(pd->af); @@ -1084,9 +1086,9 @@ pf_insert_src_node(struct pf_ksrc_node *sns[PF_SN_MAX], (*sn)->af = af; (*sn)->rule = r_track; - PF_ACPY(&(*sn)->addr, src, af); + pf_addrcpy(&(*sn)->addr, src, af); if (raddr != NULL) - PF_ACPY(&(*sn)->raddr, raddr, af); + pf_addrcpy(&(*sn)->raddr, raddr, af); (*sn)->rkif = rkif; LIST_INSERT_HEAD(&(*sh)->nodes, *sn, entry); (*sn)->creation = time_uptime; @@ -1687,9 +1689,9 @@ pf_state_key_addr_setup(struct pf_pdesc *pd, copy: #endif /* INET6 */ if (saddr) - PF_ACPY(&key->addr[pd->sidx], saddr, pd->af); + pf_addrcpy(&key->addr[pd->sidx], saddr, pd->af); if (daddr) - PF_ACPY(&key->addr[pd->didx], daddr, pd->af); + pf_addrcpy(&key->addr[pd->didx], daddr, pd->af); return (0); } @@ -1734,13 +1736,17 @@ pf_state_key_setup(struct pf_pdesc *pd, u_int16_t sport, u_int16_t dport, bzero(&(*nk)->addr[0], sizeof((*nk)->addr[0])); bzero(&(*nk)->addr[1], sizeof((*nk)->addr[1])); if (pd->dir == PF_IN) { - PF_ACPY(&(*nk)->addr[pd->didx], &pd->nsaddr, pd->naf); - PF_ACPY(&(*nk)->addr[pd->sidx], &pd->ndaddr, pd->naf); + pf_addrcpy(&(*nk)->addr[pd->didx], &pd->nsaddr, + pd->naf); + pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->ndaddr, + pd->naf); (*nk)->port[pd->didx] = pd->nsport; (*nk)->port[pd->sidx] = pd->ndport; } else { - PF_ACPY(&(*nk)->addr[pd->sidx], &pd->nsaddr, pd->naf); - PF_ACPY(&(*nk)->addr[pd->didx], &pd->ndaddr, pd->naf); + pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->nsaddr, + pd->naf); + pf_addrcpy(&(*nk)->addr[pd->didx], &pd->ndaddr, + pd->naf); (*nk)->port[pd->sidx] = pd->nsport; (*nk)->port[pd->didx] = pd->ndport; } @@ -2053,11 +2059,11 @@ pf_udp_mapping_create(sa_family_t af, struct pf_addr *src_addr, uint16_t src_por mapping = uma_zalloc(V_pf_udp_mapping_z, M_NOWAIT | M_ZERO); if (mapping == NULL) return (NULL); - PF_ACPY(&mapping->endpoints[0].addr, src_addr, af); + pf_addrcpy(&mapping->endpoints[0].addr, src_addr, af); mapping->endpoints[0].port = src_port; mapping->endpoints[0].af = af; mapping->endpoints[0].mapping = mapping; - PF_ACPY(&mapping->endpoints[1].addr, nat_addr, af); + pf_addrcpy(&mapping->endpoints[1].addr, nat_addr, af); mapping->endpoints[1].port = nat_port; mapping->endpoints[1].af = af; mapping->endpoints[1].mapping = mapping; @@ -3295,9 +3301,9 @@ pf_change_ap(struct pf_pdesc *pd, struct pf_addr *a, u_int16_t *p, MPASS(pd->ip_sum); } - PF_ACPY(&ao, a, pd->af); + pf_addrcpy(&ao, a, pd->af); if (pd->af == pd->naf) - PF_ACPY(a, an, pd->af); + pf_addrcpy(a, an, pd->af); if (pd->m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) *pd->pcksum = ~*pd->pcksum; @@ -3426,8 +3432,8 @@ pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; - PF_ACPY(&ao, a, AF_INET6); - PF_ACPY(a, an, AF_INET6); + pf_addrcpy(&ao, a, 
AF_INET6); + pf_addrcpy(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( @@ -3450,9 +3456,9 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, { struct pf_addr oia, ooa; - PF_ACPY(&oia, ia, af); + pf_addrcpy(&oia, ia, af); if (oa) - PF_ACPY(&ooa, oa, af); + pf_addrcpy(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { @@ -3469,7 +3475,7 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ - PF_ACPY(ia, na, af); + pf_addrcpy(ia, na, af); switch (af) { #ifdef INET case AF_INET: { @@ -3503,7 +3509,7 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, } /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */ if (oa) { - PF_ACPY(oa, na, af); + pf_addrcpy(oa, na, af); switch (af) { #ifdef INET case AF_INET: @@ -4299,8 +4305,8 @@ pf_undo_nat(struct pf_krule *nr, struct pf_pdesc *pd, uint16_t bip_sum) { /* undo NAT changes, if they have taken place */ if (nr != NULL) { - PF_ACPY(pd->src, &pd->osrc, pd->af); - PF_ACPY(pd->dst, &pd->odst, pd->af); + pf_addrcpy(pd->src, &pd->osrc, pd->af); + pf_addrcpy(pd->dst, &pd->odst, pd->af); if (pd->sport) *pd->sport = pd->osport; if (pd->dport) @@ -4573,7 +4579,7 @@ pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) static int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { - if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + if (u == -1 && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } @@ -4581,7 +4587,7 @@ pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) static int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { - if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + if (g == -1 && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } @@ -4675,6 +4681,13 @@ pf_step_into_anchor(struct pf_test_ctx *ctx, struct pf_krule *r) } } else { rv = pf_match_rule(ctx, &r->anchor->ruleset); + /* + * Unless errors occurred, stop iff any rule matched + * within quick anchors.
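The pf_match_uid()/pf_match_gid() hunks above swap UID_MAX and GID_MAX for a literal -1 as the "no credential known" sentinel, matching what pf_socket_lookup() now stores and what pflog exports. On FreeBSD uid_t and gid_t are unsigned 32-bit types, so the change is behaviorally neutral: (uid_t)-1 wraps to the same value as UID_MAX. A tiny userland check of that equivalence:

    #include <assert.h>
    #include <limits.h>
    #include <sys/types.h>

    int
    main(void)
    {
            uid_t unknown = -1;     /* wraps to 0xffffffff */

            /* UID_MAX is UINT_MAX on FreeBSD, so either spelling of
             * the sentinel compares equal. */
            assert(unknown == UINT_MAX);
            return (0);
    }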
+ */ + if (rv != PF_TEST_FAIL && r->quick == PF_TEST_QUICK && + *ctx->am == r) + rv = PF_TEST_QUICK; } ctx->depth--; @@ -4784,7 +4797,6 @@ pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth, return (quick); } -#ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) @@ -4796,6 +4808,7 @@ pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ +#ifdef INET6 case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); @@ -4806,6 +4819,7 @@ pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; +#endif /* INET6 */ } } @@ -4818,6 +4832,7 @@ pf_addr_inc(struct pf_addr *addr, sa_family_t af) addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ +#ifdef INET6 case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; @@ -4837,9 +4852,9 @@ pf_addr_inc(struct pf_addr *addr, sa_family_t af) addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; +#endif /* INET6 */ } } -#endif /* INET6 */ void pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a) @@ -4899,8 +4914,8 @@ pf_socket_lookup(struct pf_pdesc *pd) struct inpcbinfo *pi; struct inpcb *inp; - pd->lookup.uid = UID_MAX; - pd->lookup.gid = GID_MAX; + pd->lookup.uid = -1; + pd->lookup.gid = -1; switch (pd->proto) { case IPPROTO_TCP: @@ -5738,8 +5753,8 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, ctx.reason = *reason; SLIST_INIT(&ctx.rules); - PF_ACPY(&pd->nsaddr, pd->src, pd->af); - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); if (inp != NULL) { INP_LOCK_ASSERT(inp); @@ -5886,18 +5901,17 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, M_SETFIB(pd->m, pd->act.rtableid); if (r->rt) { - struct pf_ksrc_node *sn = NULL; - struct pf_srchash *snh = NULL; /* * Set act.rt here instead of in pf_rule_to_actions() because * it is applied only from the last pass rule. */ pd->act.rt = r->rt; - /* Don't use REASON_SET, pf_map_addr increases the reason counters */ - ctx.reason = pf_map_addr_sn(pd->af, r, pd->src, &pd->act.rt_addr, - &pd->act.rt_kif, NULL, &sn, &snh, &(r->route), PF_SN_ROUTE); - if (ctx.reason != 0) + if ((transerror = pf_map_addr_sn(pd->af, r, pd->src, + &pd->act.rt_addr, &pd->act.rt_kif, NULL, &(r->route), + PF_SN_ROUTE)) != PFRES_MATCH) { + REASON_SET(&ctx.reason, transerror); goto cleanup; + } } if (pd->virtual_proto != PF_VPROTO_FRAGMENT && @@ -6041,9 +6055,16 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx, /* src node for translation rule */ if (ctx->nr != NULL) { KASSERT(ctx->nat_pool != NULL, ("%s: nat_pool is NULL", __func__)); + /* + * The NAT addresses are chosen during ruleset parsing. + * The new afto code stores post-nat addresses in nsaddr. + * The old nat code (also used for new nat-to rules) creates + * state keys and stores addresses in them. + */ if ((ctx->nat_pool->opts & PF_POOL_STICKYADDR) && (sn_reason = pf_insert_src_node(sns, snhs, ctx->nr, - &ctx->sk->addr[pd->sidx], pd->af, &ctx->nk->addr[1], NULL, + ctx->sk ? &(ctx->sk->addr[pd->sidx]) : pd->src, pd->af, + ctx->nk ? 
&(ctx->nk->addr[1]) : &(pd->nsaddr), NULL, PF_SN_NAT)) != 0 ) { REASON_SET(&ctx->reason, sn_reason); goto csfailed; @@ -6198,7 +6219,7 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx, if (ctx->tag > 0) s->tag = ctx->tag; if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) == - TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { + TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) { pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC); pf_undo_nat(ctx->nr, pd, bip_sum); s->src.seqhi = arc4random(); @@ -6357,7 +6378,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) &nk->addr[pd->sidx], nk->port[pd->sidx]); pd->sport = &th->th_sport; pd->nsport = th->th_sport; - PF_ACPY(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) || @@ -6366,7 +6387,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) &nk->addr[pd->didx], nk->port[pd->didx]); pd->dport = &th->th_dport; pd->ndport = th->th_dport; - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } rewrite++; break; @@ -6379,7 +6400,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) nk->port[pd->sidx]); pd->sport = &pd->hdr.udp.uh_sport; pd->nsport = pd->hdr.udp.uh_sport; - PF_ACPY(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) || @@ -6390,7 +6411,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) nk->port[pd->didx]); pd->dport = &pd->hdr.udp.uh_dport; pd->ndport = pd->hdr.udp.uh_dport; - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } rewrite++; break; @@ -6403,7 +6424,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) nk->port[pd->sidx]); pd->sport = &pd->hdr.sctp.src_port; pd->nsport = pd->hdr.sctp.src_port; - PF_ACPY(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != pd->ndport) { @@ -6413,7 +6434,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) nk->port[pd->didx]); pd->dport = &pd->hdr.sctp.dest_port; pd->ndport = pd->hdr.sctp.dest_port; - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } break; } @@ -6422,13 +6443,13 @@ pf_translate_compat(struct pf_test_ctx *ctx) if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET)) { pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); - PF_ACPY(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET)) { pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } if (ctx->virtual_type == htons(ICMP_ECHO) && @@ -6447,13 +6468,13 @@ pf_translate_compat(struct pf_test_ctx *ctx) if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET6)) { pf_change_a6(pd->src, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->sidx], 0); - PF_ACPY(&pd->nsaddr, pd->src, pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET6)) { pf_change_a6(pd->dst, &pd->hdr.icmp6.icmp6_cksum, &nk->addr[pd->didx], 0); - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } rewrite++; break; @@ -6467,7 +6488,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); - PF_ACPY(&pd->nsaddr, pd->src, 
pd->af); + pf_addrcpy(&pd->nsaddr, pd->src, pd->af); } if (PF_ANEQ(&pd->ndaddr, @@ -6475,7 +6496,7 @@ pf_translate_compat(struct pf_test_ctx *ctx) pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); - PF_ACPY(&pd->ndaddr, pd->dst, pd->af); + pf_addrcpy(&pd->ndaddr, pd->dst, pd->af); } break; #endif /* INET */ @@ -6483,14 +6504,17 @@ pf_translate_compat(struct pf_test_ctx *ctx) case AF_INET6: if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET6)) { - PF_ACPY(&pd->nsaddr, &nk->addr[pd->sidx], pd->af); - PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); + pf_addrcpy(&pd->nsaddr, &nk->addr[pd->sidx], + pd->af); + pf_addrcpy(pd->src, &nk->addr[pd->sidx], pd->af); } if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET6)) { - PF_ACPY(&pd->ndaddr, &nk->addr[pd->didx], pd->af); - PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); + pf_addrcpy(&pd->ndaddr, &nk->addr[pd->didx], + pd->af); + pf_addrcpy(pd->dst, &nk->addr[pd->didx], + pd->af); } break; #endif /* INET6 */ @@ -7009,8 +7033,8 @@ pf_test_state(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason) bzero(&key, sizeof(key)); key.af = pd->af; key.proto = pd->virtual_proto; - PF_ACPY(&key.addr[pd->sidx], pd->src, key.af); - PF_ACPY(&key.addr[pd->didx], pd->dst, key.af); + pf_addrcpy(&key.addr[pd->sidx], pd->src, key.af); + pf_addrcpy(&key.addr[pd->didx], pd->dst, key.af); key.port[pd->sidx] = pd->osport; key.port[pd->didx] = pd->odport; @@ -7201,8 +7225,8 @@ pf_test_state(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason) } if (afto) { - PF_ACPY(&pd->nsaddr, &nk->addr[sidx], nk->af); - PF_ACPY(&pd->ndaddr, &nk->addr[didx], nk->af); + pf_addrcpy(&pd->nsaddr, &nk->addr[sidx], nk->af); + pf_addrcpy(&pd->ndaddr, &nk->addr[didx], nk->af); pd->naf = nk->af; action = PF_AFRT; } @@ -7496,13 +7520,13 @@ again: key.af = j->pd.af; key.proto = IPPROTO_SCTP; if (j->pd.dir == PF_IN) { /* wire side, straight */ - PF_ACPY(&key.addr[0], j->pd.src, key.af); - PF_ACPY(&key.addr[1], j->pd.dst, key.af); + pf_addrcpy(&key.addr[0], j->pd.src, key.af); + pf_addrcpy(&key.addr[1], j->pd.dst, key.af); key.port[0] = j->pd.hdr.sctp.src_port; key.port[1] = j->pd.hdr.sctp.dest_port; } else { /* stack side, reverse */ - PF_ACPY(&key.addr[1], j->pd.src, key.af); - PF_ACPY(&key.addr[0], j->pd.dst, key.af); + pf_addrcpy(&key.addr[1], j->pd.src, key.af); + pf_addrcpy(&key.addr[0], j->pd.dst, key.af); key.port[1] = j->pd.hdr.sctp.src_port; key.port[0] = j->pd.hdr.sctp.dest_port; } @@ -7898,8 +7922,10 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, #endif /* INET6 */ } if (afto) { - PF_ACPY(&pd->nsaddr, &nk->addr[sidx], nk->af); - PF_ACPY(&pd->ndaddr, &nk->addr[didx], nk->af); + pf_addrcpy(&pd->nsaddr, &nk->addr[sidx], + nk->af); + pf_addrcpy(&pd->ndaddr, &nk->addr[didx], + nk->af); pd->naf = nk->af; return (PF_AFRT); } @@ -8031,8 +8057,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, key.af = pd2.af; key.proto = IPPROTO_TCP; - PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); - PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af); + pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = th->th_sport; key.port[pd2.didx] = th->th_dport; @@ -8135,9 +8161,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, &nk->addr[didx], pd->af, nk->af)) return (PF_DROP); - PF_ACPY(&pd->nsaddr, &nk->addr[pd2.sidx], - nk->af); - PF_ACPY(&pd->ndaddr, + pf_addrcpy(&pd->nsaddr, + &nk->addr[pd2.sidx], nk->af); + pf_addrcpy(&pd->ndaddr, 
&nk->addr[pd2.didx], nk->af); if (nk->af == AF_INET) { pd->proto = IPPROTO_ICMP; @@ -8226,8 +8252,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, key.af = pd2.af; key.proto = IPPROTO_UDP; - PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); - PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af); + pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = uh->uh_sport; key.port[pd2.didx] = uh->uh_dport; @@ -8270,9 +8296,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, &nk->addr[didx], pd->af, nk->af)) return (PF_DROP); - PF_ACPY(&pd->nsaddr, + pf_addrcpy(&pd->nsaddr, &nk->addr[pd2.sidx], nk->af); - PF_ACPY(&pd->ndaddr, + pf_addrcpy(&pd->ndaddr, &nk->addr[pd2.didx], nk->af); if (nk->af == AF_INET) { pd->proto = IPPROTO_ICMP; @@ -8358,8 +8384,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, key.af = pd2.af; key.proto = IPPROTO_SCTP; - PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); - PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af); + pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = sh->src_port; key.port[pd2.didx] = sh->dest_port; @@ -8425,9 +8451,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, sh->src_port = nk->port[sidx]; sh->dest_port = nk->port[didx]; m_copyback(pd2.m, pd2.off, sizeof(*sh), (c_caddr_t)sh); - PF_ACPY(&pd->nsaddr, + pf_addrcpy(&pd->nsaddr, &nk->addr[pd2.sidx], nk->af); - PF_ACPY(&pd->ndaddr, + pf_addrcpy(&pd->ndaddr, &nk->addr[pd2.didx], nk->af); if (nk->af == AF_INET) { pd->proto = IPPROTO_ICMP; @@ -8568,9 +8594,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, iih->icmp_id = nk->port[iidx]; m_copyback(pd2.m, pd2.off, ICMP_MINLEN, (c_caddr_t)iih); - PF_ACPY(&pd->nsaddr, + pf_addrcpy(&pd->nsaddr, &nk->addr[pd2.sidx], nk->af); - PF_ACPY(&pd->ndaddr, + pf_addrcpy(&pd->ndaddr, &nk->addr[pd2.didx], nk->af); /* * IPv4 becomes IPv6 so we must copy @@ -8696,9 +8722,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, iih->icmp6_id = nk->port[iidx]; m_copyback(pd2.m, pd2.off, sizeof(struct icmp6_hdr), (c_caddr_t)iih); - PF_ACPY(&pd->nsaddr, + pf_addrcpy(&pd->nsaddr, &nk->addr[pd2.sidx], nk->af); - PF_ACPY(&pd->ndaddr, + pf_addrcpy(&pd->ndaddr, &nk->addr[pd2.didx], nk->af); pd->naf = nk->af; return (PF_AFRT); @@ -8740,8 +8766,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd, key.af = pd2.af; key.proto = pd2.proto; - PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); - PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af); + pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = 0; action = pf_find_state(&pd2, &key, state); @@ -9042,6 +9068,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp, goto bad; } + if (r->rt == PF_DUPTO) + skip_test = true; + if (pd->dir == PF_IN && !skip_test) { if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp, &pd->act) != PF_PASS) { @@ -9277,7 +9306,8 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp, bzero(&dst, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(dst); - PF_ACPY((struct pf_addr *)&dst.sin6_addr, &pd->act.rt_addr, AF_INET6); + pf_addrcpy((struct pf_addr *)&dst.sin6_addr, &pd->act.rt_addr, + AF_INET6); if (pd->dir == PF_IN) { if (ip6->ip6_hlim <= IPV6_HLIMDEC) { @@ -9343,6 +9373,9 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp, goto bad; } + if (r->rt == PF_DUPTO) + skip_test = true; 
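pf_route() above and pf_route6() just below gain the same two lines: when the matching rule's routing action is dup-to (PF_DUPTO), skip_test is forced on, so the duplicated copy is handed straight to the output path rather than being run through pf_test() a second time. Condensed from the IPv4 hunk, with the surrounding context elided:

    /* dup-to copies bypass the second filtering pass. */
    if (r->rt == PF_DUPTO)
            skip_test = true;

    if (pd->dir == PF_IN && !skip_test) {
            if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp,
                &pd->act) != PF_PASS)
                    goto bad;
    }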
+ if (pd->dir == PF_IN && !skip_test) { if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT, ifp, &m0, inp, &pd->act) != PF_PASS) { @@ -10031,6 +10064,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, pd->didx = (dir == PF_IN) ? 1 : 0; pd->af = pd->naf = af; + PF_RULES_ASSERT(); + TAILQ_INIT(&pd->sctp_multihome_jobs); if (default_actions != NULL) memcpy(&pd->act, default_actions, sizeof(pd->act)); @@ -10077,8 +10112,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, pd->src = (struct pf_addr *)&h->ip_src; pd->dst = (struct pf_addr *)&h->ip_dst; - PF_ACPY(&pd->osrc, pd->src, af); - PF_ACPY(&pd->odst, pd->dst, af); + pf_addrcpy(&pd->osrc, pd->src, af); + pf_addrcpy(&pd->odst, pd->dst, af); pd->ip_sum = &h->ip_sum; pd->tos = h->ip_tos & ~IPTOS_ECN_MASK; pd->ttl = h->ip_ttl; @@ -10106,6 +10141,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, } h = mtod(pd->m, struct ip6_hdr *); + if (pd->m->m_pkthdr.len < + sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) { + *action = PF_DROP; + REASON_SET(reason, PFRES_SHORT); + return (-1); + } if (pf_walk_header6(pd, h, reason) != PF_PASS) { *action = PF_DROP; @@ -10115,8 +10156,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0, h = mtod(pd->m, struct ip6_hdr *); pd->src = (struct pf_addr *)&h->ip6_src; pd->dst = (struct pf_addr *)&h->ip6_dst; - PF_ACPY(&pd->osrc, pd->src, af); - PF_ACPY(&pd->odst, pd->dst, af); + pf_addrcpy(&pd->osrc, pd->src, af); + pf_addrcpy(&pd->odst, pd->dst, af); pd->ip_sum = NULL; pd->tos = IPV6_DSCP(h); pd->ttl = h->ip6_hlim; @@ -10444,35 +10485,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 PF_RULES_RLOCK_TRACKER; KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir)); M_ASSERTPKTHDR(*m0); + NET_EPOCH_ASSERT(); if (!V_pf_status.running) return (PF_PASS); - PF_RULES_RLOCK(); - kif = (struct pfi_kkif *)ifp->if_pf_kif; if (__predict_false(kif == NULL)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname)); - PF_RULES_RUNLOCK(); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) { - PF_RULES_RUNLOCK(); return (PF_PASS); } if ((*m0)->m_flags & M_SKIP_FIREWALL) { - PF_RULES_RUNLOCK(); return (PF_PASS); } if (__predict_false(! M_WRITABLE(*m0))) { *m0 = m_unshare(*m0, M_NOWAIT); if (*m0 == NULL) { - PF_RULES_RUNLOCK(); return (PF_DROP); } } @@ -10485,12 +10521,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 ifp = ifnet_byindexgen(pd.pf_mtag->if_index, pd.pf_mtag->if_idxgen); if (ifp == NULL || ifp->if_flags & IFF_DYING) { - PF_RULES_RUNLOCK(); m_freem(*m0); *m0 = NULL; return (PF_PASS); } - PF_RULES_RUNLOCK(); (ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL); *m0 = NULL; return (PF_PASS); @@ -10505,11 +10539,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0 /* But only once. We may see the packet multiple times (e.g. * PFIL_IN/PFIL_OUT). 
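The pf_test() hunk above is a lock-scope change: PF_RULES_RLOCK() was previously taken before the cheap early-out checks (firewall disabled, missing kif, PFI_IFLAG_SKIP, M_SKIP_FIREWALL, tag-driven re-output), each of which then had to drop the lock on its way out. As the following hunk shows, the lock is now taken once, just before pf_setup_pdesc(), and a NET_EPOCH_ASSERT() documents the entry requirement instead. The new shape, heavily condensed and with the body elided:

    int
    pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp,
        struct mbuf **m0, struct inpcb *inp,
        struct pf_rule_actions *default_actions)
    {
            NET_EPOCH_ASSERT();

            if (!V_pf_status.running)
                    return (PF_PASS);       /* no lock needed */
            if ((*m0)->m_flags & M_SKIP_FIREWALL)
                    return (PF_PASS);       /* ditto */
            /* ... remaining early exits, all lock-free ... */

            PF_RULES_RLOCK();               /* only once real work starts */
            /* ... pf_setup_pdesc() and rule evaluation ... */
    }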
*/ pf_dummynet_flag_remove(pd.m, pd.pf_mtag); - PF_RULES_RUNLOCK(); return (PF_PASS); } + PF_RULES_RLOCK(); + if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason, kif, default_actions) == -1) { if (action != PF_PASS) diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h index 2009d2907985..cfff58064922 100644 --- a/sys/netpfil/pf/pf.h +++ b/sys/netpfil/pf/pf.h @@ -140,7 +140,7 @@ enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, #define PF_LOG 0x01 #define PF_LOG_ALL 0x02 -#define PF_LOG_SOCKET_LOOKUP 0x04 +#define PF_LOG_USER 0x04 #define PF_LOG_FORCE 0x08 #define PF_LOG_MATCHES 0x10 @@ -490,6 +490,7 @@ struct pf_osfp_ioctl { #define PF_ANCHOR_NAME_SIZE 64 #define PF_ANCHOR_MAXPATH (MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1) +#define PF_OPTIMIZER_TABLE_PFX "__automatic_" struct pf_rule { struct pf_rule_addr src; diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c index 389b74d09d37..e2200c15c704 100644 --- a/sys/netpfil/pf/pf_if.c +++ b/sys/netpfil/pf/pf_if.c @@ -522,7 +522,7 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) case 0: return (0); case 1: - return (PF_MATCHA(0, &dyn->pfid_addr4, + return (pf_match_addr(0, &dyn->pfid_addr4, &dyn->pfid_mask4, a, AF_INET)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); @@ -535,7 +535,7 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) case 0: return (0); case 1: - return (PF_MATCHA(0, &dyn->pfid_addr6, + return (pf_match_addr(0, &dyn->pfid_addr6, &dyn->pfid_mask6, a, AF_INET6)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c index 05a7e1311ad8..5c69c395c5fc 100644 --- a/sys/netpfil/pf/pf_ioctl.c +++ b/sys/netpfil/pf/pf_ioctl.c @@ -615,7 +615,7 @@ pf_free_rule(struct pf_krule *rule) pfi_kkif_unref(rule->kif); if (rule->rcv_kif) pfi_kkif_unref(rule->rcv_kif); - pf_kanchor_remove(rule); + pf_remove_kanchor(rule); pf_empty_kpool(&rule->rdr.list); pf_empty_kpool(&rule->nat.list); pf_empty_kpool(&rule->route.list); @@ -1274,7 +1274,9 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) PF_MD5_UPD(pfr, addr.iflags); break; case PF_ADDR_TABLE: - PF_MD5_UPD(pfr, addr.v.tblname); + if (strncmp(pfr->addr.v.tblname, PF_OPTIMIZER_TABLE_PFX, + strlen(PF_OPTIMIZER_TABLE_PFX))) + PF_MD5_UPD(pfr, addr.v.tblname); break; case PF_ADDR_ADDRMASK: /* XXX ignore af? */ @@ -1357,7 +1359,7 @@ static int pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) { struct pf_kruleset *rs; - struct pf_krule *rule, **old_array, *old_rule; + struct pf_krule *rule, *old_rule; struct pf_krulequeue *old_rules; struct pf_krule_global *old_tree; int error; @@ -1382,13 +1384,10 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) /* Swap rules, keep the old. 
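pf_hash_rule_addr() above now skips table names carrying the new PF_OPTIMIZER_TABLE_PFX ("__automatic_") prefix when computing the ruleset MD5. Such tables are generated by the ruleset optimizer, so their names are an implementation detail; leaving them out keeps the checksum that pfsync peers compare stable across reloads of functionally identical rulesets. The relevant case, restated outside the diff:

    case PF_ADDR_TABLE:
            /* Optimizer-generated ("__automatic_*") tables do not
             * contribute to the ruleset checksum. */
            if (strncmp(pfr->addr.v.tblname, PF_OPTIMIZER_TABLE_PFX,
                strlen(PF_OPTIMIZER_TABLE_PFX)) != 0)
                    PF_MD5_UPD(pfr, addr.v.tblname);
            break;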
*/ old_rules = rs->rules[rs_num].active.ptr; old_rcount = rs->rules[rs_num].active.rcount; - old_array = rs->rules[rs_num].active.ptr_array; old_tree = rs->rules[rs_num].active.tree; rs->rules[rs_num].active.ptr = rs->rules[rs_num].inactive.ptr; - rs->rules[rs_num].active.ptr_array = - rs->rules[rs_num].inactive.ptr_array; rs->rules[rs_num].active.tree = rs->rules[rs_num].inactive.tree; rs->rules[rs_num].active.rcount = @@ -1418,7 +1417,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) } rs->rules[rs_num].inactive.ptr = old_rules; - rs->rules[rs_num].inactive.ptr_array = old_array; rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */ rs->rules[rs_num].inactive.rcount = old_rcount; @@ -1431,9 +1429,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) while ((rule = TAILQ_FIRST(old_rules)) != NULL) pf_unlink_rule_locked(old_rules, rule); PF_UNLNKDRULES_UNLOCK(); - if (rs->rules[rs_num].inactive.ptr_array) - free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); - rs->rules[rs_num].inactive.ptr_array = NULL; rs->rules[rs_num].inactive.rcount = 0; rs->rules[rs_num].inactive.open = 0; pf_remove_if_empty_kruleset(rs); @@ -1456,24 +1451,11 @@ pf_setup_pfsync_matching(struct pf_kruleset *rs) if (rs_cnt == PF_RULESET_SCRUB) continue; - if (rs->rules[rs_cnt].inactive.ptr_array) - free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); - rs->rules[rs_cnt].inactive.ptr_array = NULL; - if (rs->rules[rs_cnt].inactive.rcount) { - rs->rules[rs_cnt].inactive.ptr_array = - mallocarray(rs->rules[rs_cnt].inactive.rcount, - sizeof(struct pf_rule **), - M_TEMP, M_NOWAIT); - - if (!rs->rules[rs_cnt].inactive.ptr_array) - return (ENOMEM); - } - - TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, - entries) { - pf_hash_rule_rolling(&ctx, rule); - (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; + TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, + entries) { + pf_hash_rule_rolling(&ctx, rule); + } } } @@ -2059,6 +2041,47 @@ pf_ioctl_getrules(struct pfioc_rule *pr) return (0); } +static int +pf_rule_checkaf(struct pf_krule *r) +{ + switch (r->af) { + case 0: + if (r->rule_flag & PFRULE_AFTO) + return (EPFNOSUPPORT); + break; + case AF_INET: + if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6) + return (EPFNOSUPPORT); + break; +#ifdef INET6 + case AF_INET6: + if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET) + return (EPFNOSUPPORT); + break; +#endif /* INET6 */ + default: + return (EPFNOSUPPORT); + } + + if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0) + return (EPFNOSUPPORT); + + return (0); +} + +static int +pf_validate_range(uint8_t op, uint16_t port[2]) +{ + uint16_t a = ntohs(port[0]); + uint16_t b = ntohs(port[1]); + + if ((op == PF_OP_RRG && a > b) || /* 34:12, i.e. none */ + (op == PF_OP_IRG && a >= b) || /* 34><12, i.e. none */ + (op == PF_OP_XRG && a > b)) /* 34<>22, i.e. 
all */ + return 1; + return 0; +} + int pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, uint32_t pool_ticket, const char *anchor, const char *anchor_call, @@ -2078,6 +2101,13 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, #define ERROUT(x) ERROUT_FUNCTION(errout, x) + if ((error = pf_rule_checkaf(rule))) + ERROUT(error); + if (pf_validate_range(rule->src.port_op, rule->src.port)) + ERROUT(EINVAL); + if (pf_validate_range(rule->dst.port_op, rule->dst.port)) + ERROUT(EINVAL); + if (rule->ifname[0]) kif = pf_kkif_create(M_WAITOK); if (rule->rcv_ifname[0]) @@ -2155,51 +2185,51 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, rule->rcv_kif = NULL; if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs) - error = EBUSY; + ERROUT(EBUSY); #ifdef ALTQ /* set queue IDs */ if (rule->qname[0] != 0) { if ((rule->qid = pf_qname2qid(rule->qname)) == 0) - error = EBUSY; + ERROUT(EBUSY); else if (rule->pqname[0] != 0) { if ((rule->pqid = pf_qname2qid(rule->pqname)) == 0) - error = EBUSY; + ERROUT(EBUSY); } else rule->pqid = rule->qid; } #endif if (rule->tagname[0]) if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) - error = EBUSY; + ERROUT(EBUSY); if (rule->match_tagname[0]) if ((rule->match_tag = pf_tagname2tag(rule->match_tagname)) == 0) - error = EBUSY; + ERROUT(EBUSY); if (rule->rt && !rule->direction) - error = EINVAL; + ERROUT(EINVAL); if (!rule->log) rule->logif = 0; if (! pf_init_threshold(&rule->pktrate, rule->pktrate.limit, rule->pktrate.seconds)) - error = ENOMEM; + ERROUT(ENOMEM); if (pf_addr_setup(ruleset, &rule->src.addr, rule->af)) - error = ENOMEM; + ERROUT(ENOMEM); if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af)) - error = ENOMEM; + ERROUT(ENOMEM); if (pf_kanchor_setup(rule, ruleset, anchor_call)) - error = EINVAL; + ERROUT(EINVAL); if (rule->scrub_flags & PFSTATE_SETPRIO && (rule->set_prio[0] > PF_PRIO_MAX || rule->set_prio[1] > PF_PRIO_MAX)) - error = EINVAL; + ERROUT(EINVAL); for (int i = 0; i < 3; i++) { TAILQ_FOREACH(pa, &V_pf_pabuf[i], entries) if (pa->addr.type == PF_ADDR_TABLE) { pa->addr.p.tbl = pfr_attach_table(ruleset, pa->addr.v.tblname); if (pa->addr.p.tbl == NULL) - error = ENOMEM; + ERROUT(ENOMEM); } } @@ -2207,7 +2237,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, if (rule->overload_tblname[0]) { if ((rule->overload_tbl = pfr_attach_table(ruleset, rule->overload_tblname)) == NULL) - error = EINVAL; + ERROUT(EINVAL); else rule->overload_tbl->pfrkt_flags |= PFR_TFLAG_ACTIVE; @@ -2230,23 +2260,19 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket, if (((rule->action == PF_NAT) || (rule->action == PF_RDR) || (rule->action == PF_BINAT)) && rule->anchor == NULL && TAILQ_FIRST(&rule->rdr.list) == NULL) { - error = EINVAL; + ERROUT(EINVAL); } if (rule->rt > PF_NOPFROUTE && (TAILQ_FIRST(&rule->route.list) == NULL)) { - error = EINVAL; + ERROUT(EINVAL); } if (rule->action == PF_PASS && (rule->rdr.opts & PF_POOL_STICKYADDR || rule->nat.opts & PF_POOL_STICKYADDR) && !rule->keep_state) { - error = EINVAL; + ERROUT(EINVAL); } - if (error) { - pf_free_rule(rule); - rule = NULL; - ERROUT(error); - } + MPASS(error == 0); rule->nat.cur = TAILQ_FIRST(&rule->nat.list); rule->rdr.cur = TAILQ_FIRST(&rule->rdr.list); @@ -2350,15 +2376,17 @@ relock_DIOCKILLSTATES: if (psk->psk_proto && psk->psk_proto != sk->proto) continue; - if (! PF_MATCHA(psk->psk_src.neg, &psk->psk_src.addr.v.a.addr, + if (! pf_match_addr(psk->psk_src.neg, + &psk->psk_src.addr.v.a.addr, &psk->psk_src.addr.v.a.mask, srcaddr, sk->af)) continue; - if (! 
PF_MATCHA(psk->psk_dst.neg, &psk->psk_dst.addr.v.a.addr, + if (! pf_match_addr(psk->psk_dst.neg, + &psk->psk_dst.addr.v.a.addr, &psk->psk_dst.addr.v.a.mask, dstaddr, sk->af)) continue; - if (! PF_MATCHA(psk->psk_rt_addr.neg, + if (! pf_match_addr(psk->psk_rt_addr.neg, &psk->psk_rt_addr.addr.v.a.addr, &psk->psk_rt_addr.addr.v.a.mask, &s->act.rt_addr, sk->af)) @@ -2398,10 +2426,10 @@ relock_DIOCKILLSTATES: match_key.af = s->key[idx]->af; match_key.proto = s->key[idx]->proto; - PF_ACPY(&match_key.addr[0], + pf_addrcpy(&match_key.addr[0], &s->key[idx]->addr[1], match_key.af); match_key.port[0] = s->key[idx]->port[1]; - PF_ACPY(&match_key.addr[1], + pf_addrcpy(&match_key.addr[1], &s->key[idx]->addr[0], match_key.af); match_key.port[1] = s->key[idx]->port[0]; } @@ -2697,7 +2725,7 @@ pf_ioctl_get_addr(struct pf_nl_pooladdr *pp) PF_RULES_RLOCK_TRACKER; - pp->anchor[sizeof(pp->anchor) - 1] = 0; + pp->anchor[sizeof(pp->anchor) - 1] = '\0'; PF_RULES_RLOCK(); pool = pf_get_kpool(pp->anchor, pp->ticket, pp->r_action, @@ -2730,7 +2758,7 @@ pf_ioctl_get_rulesets(struct pfioc_ruleset *pr) PF_RULES_RLOCK_TRACKER; - pr->path[sizeof(pr->path) - 1] = 0; + pr->path[sizeof(pr->path) - 1] = '\0'; PF_RULES_RLOCK(); if ((ruleset = pf_find_kruleset(pr->path)) == NULL) { @@ -2738,7 +2766,7 @@ pf_ioctl_get_rulesets(struct pfioc_ruleset *pr) return (ENOENT); } pr->nr = 0; - if (ruleset->anchor == NULL) { + if (ruleset == &pf_main_ruleset) { /* XXX kludge for pf_main_ruleset */ RB_FOREACH(anchor, pf_kanchor_global, &V_pf_anchors) if (anchor->parent == NULL) @@ -2769,8 +2797,8 @@ pf_ioctl_get_ruleset(struct pfioc_ruleset *pr) return (ENOENT); } - pr->name[0] = 0; - if (ruleset->anchor == NULL) { + pr->name[0] = '\0'; + if (ruleset == &pf_main_ruleset) { /* XXX kludge for pf_main_ruleset */ RB_FOREACH(anchor, pf_kanchor_global, &V_pf_anchors) if (anchor->parent == NULL && nr++ == pr->nr) { @@ -2794,6 +2822,78 @@ pf_ioctl_get_ruleset(struct pfioc_ruleset *pr) return (error); } +int +pf_ioctl_natlook(struct pfioc_natlook *pnl) +{ + struct pf_state_key *sk; + struct pf_kstate *state; + struct pf_state_key_cmp key; + int m = 0, direction = pnl->direction; + int sidx, didx; + + /* NATLOOK src and dst are reversed, so reverse sidx/didx */ + sidx = (direction == PF_IN) ? 1 : 0; + didx = (direction == PF_IN) ? 
0 : 1; + + if (!pnl->proto || + PF_AZERO(&pnl->saddr, pnl->af) || + PF_AZERO(&pnl->daddr, pnl->af) || + ((pnl->proto == IPPROTO_TCP || + pnl->proto == IPPROTO_UDP) && + (!pnl->dport || !pnl->sport))) + return (EINVAL); + + switch (pnl->direction) { + case PF_IN: + case PF_OUT: + case PF_INOUT: + break; + default: + return (EINVAL); + } + + switch (pnl->af) { +#ifdef INET + case AF_INET: + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + break; +#endif /* INET6 */ + default: + return (EAFNOSUPPORT); + } + + bzero(&key, sizeof(key)); + key.af = pnl->af; + key.proto = pnl->proto; + pf_addrcpy(&key.addr[sidx], &pnl->saddr, pnl->af); + key.port[sidx] = pnl->sport; + pf_addrcpy(&key.addr[didx], &pnl->daddr, pnl->af); + key.port[didx] = pnl->dport; + + state = pf_find_state_all(&key, direction, &m); + if (state == NULL) + return (ENOENT); + + if (m > 1) { + PF_STATE_UNLOCK(state); + return (E2BIG); /* more than one state */ + } + + sk = state->key[sidx]; + pf_addrcpy(&pnl->rsaddr, + &sk->addr[sidx], sk->af); + pnl->rsport = sk->port[sidx]; + pf_addrcpy(&pnl->rdaddr, + &sk->addr[didx], sk->af); + pnl->rdport = sk->port[didx]; + PF_STATE_UNLOCK(state); + + return (0); +} + static int pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { @@ -3497,10 +3597,10 @@ DIOCADDRULENV_error: error = pf_rule_to_krule(&pr->rule, rule); if (error != 0) { pf_krule_free(rule); - break; + goto fail; } - pr->anchor[sizeof(pr->anchor) - 1] = 0; + pr->anchor[sizeof(pr->anchor) - 1] = '\0'; /* Frees rule on error */ error = pf_ioctl_addrule(rule, pr->ticket, pr->pool_ticket, @@ -3512,7 +3612,7 @@ DIOCADDRULENV_error: case DIOCGETRULES: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; - pr->anchor[sizeof(pr->anchor) - 1] = 0; + pr->anchor[sizeof(pr->anchor) - 1] = '\0'; error = pf_ioctl_getrules(pr); @@ -3651,16 +3751,16 @@ DIOCGETRULENV_error: u_int32_t nr = 0; int rs_num; - pcr->anchor[sizeof(pcr->anchor) - 1] = 0; + pcr->anchor[sizeof(pcr->anchor) - 1] = '\0'; if (pcr->action < PF_CHANGE_ADD_HEAD || pcr->action > PF_CHANGE_GET_TICKET) { error = EINVAL; - break; + goto fail; } if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { error = EINVAL; - break; + goto fail; } if (pcr->action != PF_CHANGE_REMOVE) { @@ -3668,9 +3768,13 @@ DIOCGETRULENV_error: error = pf_rule_to_krule(&pcr->rule, newrule); if (error != 0) { pf_krule_free(newrule); - break; + goto fail; } + if ((error = pf_rule_checkaf(newrule))) { + pf_krule_free(newrule); + goto fail; + } if (newrule->ifname[0]) kif = pf_kkif_create(M_WAITOK); pf_counter_u64_init(&newrule->evaluations, M_WAITOK); @@ -3818,7 +3922,7 @@ DIOCGETRULENV_error: pf_free_rule(newrule); PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); - break; + goto fail; } newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list); @@ -3845,7 +3949,7 @@ DIOCGETRULENV_error: PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); error = EINVAL; - break; + goto fail; } } @@ -3863,7 +3967,7 @@ DIOCGETRULENV_error: PF_RULES_WUNLOCK(); PF_CONFIG_UNLOCK(); error = EEXIST; - break; + goto fail; } if (oldrule == NULL) @@ -3919,7 +4023,7 @@ DIOCCHANGERULE_error: if (sp->timeout >= PFTM_MAX) { error = EINVAL; - break; + goto fail; } if (V_pfsync_state_import_ptr != NULL) { PF_RULES_RLOCK(); @@ -3939,7 +4043,7 @@ DIOCCHANGERULE_error: s = pf_find_state_byid(ps->state.id, ps->state.creatorid); if (s == NULL) { error = ENOENT; - break; + goto fail; } pfsync_state_export((union pfsync_state_union*)&ps->state, @@ -4018,7 +4122,7 @@ DIOCGETSTATES_retry: error = copyout(pstore, out, sizeof(struct 
pfsync_state_1301) * count); if (error) - break; + goto fail; out = ps->ps_states + nr; } DIOCGETSTATES_full: @@ -4038,7 +4142,7 @@ DIOCGETSTATES_full: if (ps->ps_req_version > PF_STATE_VERSION) { error = ENOTSUP; - break; + goto fail; } if (ps->ps_len <= 0) { @@ -4096,7 +4200,7 @@ DIOCGETSTATESV2_retry: error = copyout(pstore, out, sizeof(struct pf_state_export) * count); if (error) - break; + goto fail; out = ps->ps_states + nr; } DIOCGETSTATESV2_full: @@ -4131,49 +4235,8 @@ DIOCGETSTATESV2_full: case DIOCNATLOOK: { struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr; - struct pf_state_key *sk; - struct pf_kstate *state; - struct pf_state_key_cmp key; - int m = 0, direction = pnl->direction; - int sidx, didx; - - /* NATLOOK src and dst are reversed, so reverse sidx/didx */ - sidx = (direction == PF_IN) ? 1 : 0; - didx = (direction == PF_IN) ? 0 : 1; - - if (!pnl->proto || - PF_AZERO(&pnl->saddr, pnl->af) || - PF_AZERO(&pnl->daddr, pnl->af) || - ((pnl->proto == IPPROTO_TCP || - pnl->proto == IPPROTO_UDP) && - (!pnl->dport || !pnl->sport))) - error = EINVAL; - else { - bzero(&key, sizeof(key)); - key.af = pnl->af; - key.proto = pnl->proto; - PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af); - key.port[sidx] = pnl->sport; - PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af); - key.port[didx] = pnl->dport; - - state = pf_find_state_all(&key, direction, &m); - if (state == NULL) { - error = ENOENT; - } else { - if (m > 1) { - PF_STATE_UNLOCK(state); - error = E2BIG; /* more than one state */ - } else { - sk = state->key[sidx]; - PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af); - pnl->rsport = sk->port[sidx]; - PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af); - pnl->rdport = sk->port[didx]; - PF_STATE_UNLOCK(state); - } - } - } + + error = pf_ioctl_natlook(pnl); break; } @@ -4243,12 +4306,12 @@ DIOCGETSTATESV2_full: if (psp->ifname[0] == '\0') { error = EINVAL; - break; + goto fail; } error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ); if (error != 0) - break; + goto fail; ifp = ifunit(ps.ifname); if (ifp != NULL) { psp->baudrate32 = @@ -4309,7 +4372,7 @@ DIOCGETSTATESV2_full: altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO); error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd)); if (error) - break; + goto fail; altq->local_flags = 0; PF_RULES_WLOCK(); @@ -4317,7 +4380,7 @@ DIOCGETSTATESV2_full: PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); error = EBUSY; - break; + goto fail; } /* @@ -4329,7 +4392,7 @@ DIOCGETSTATESV2_full: PF_RULES_WUNLOCK(); error = EBUSY; free(altq, M_PFALTQ); - break; + goto fail; } altq->altq_disc = NULL; TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) { @@ -4349,7 +4412,7 @@ DIOCGETSTATESV2_full: if (error) { PF_RULES_WUNLOCK(); free(altq, M_PFALTQ); - break; + goto fail; } if (altq->qname[0] != 0) @@ -4387,13 +4450,13 @@ DIOCGETSTATESV2_full: if (pa->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } altq = pf_altq_get_nth_active(pa->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd)); PF_RULES_RUNLOCK(); @@ -4417,20 +4480,20 @@ DIOCGETSTATESV2_full: if (pq->ticket != V_ticket_altqs_active) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } nbytes = pq->nbytes; altq = pf_altq_get_nth_active(pq->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; - break; + goto fail; } if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { PF_RULES_RUNLOCK(); error = ENXIO; - break; + goto fail; } PF_RULES_RUNLOCK(); if (cmd == 
DIOCGETQSTATSV0) @@ -4494,35 +4557,35 @@ DIOCGETSTATESV2_full: struct pf_kruleset *ruleset; struct pfi_kkif *kif = NULL; - pca->anchor[sizeof(pca->anchor) - 1] = 0; + pca->anchor[sizeof(pca->anchor) - 1] = '\0'; if (pca->action < PF_CHANGE_ADD_HEAD || pca->action > PF_CHANGE_REMOVE) { error = EINVAL; - break; + goto fail; } if (pca->addr.addr.type != PF_ADDR_ADDRMASK && pca->addr.addr.type != PF_ADDR_DYNIFTL && pca->addr.addr.type != PF_ADDR_TABLE) { error = EINVAL; - break; + goto fail; } if (pca->addr.addr.p.dyn != NULL) { error = EINVAL; - break; + goto fail; } if (pca->action != PF_CHANGE_REMOVE) { #ifndef INET if (pca->af == AF_INET) { error = EAFNOSUPPORT; - break; + goto fail; } #endif /* INET */ #ifndef INET6 if (pca->af == AF_INET6) { error = EAFNOSUPPORT; - break; + goto fail; } #endif /* INET6 */ newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK); @@ -4606,7 +4669,7 @@ DIOCGETSTATESV2_full: } pool->cur = TAILQ_FIRST(&pool->list); - PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, pca->af); + pf_addrcpy(&pool->counter, &pool->cur->addr.v.a.addr, pca->af); PF_RULES_WUNLOCK(); break; @@ -4625,7 +4688,7 @@ DIOCCHANGEADDR_error: case DIOCGETRULESETS: { struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; - pr->path[sizeof(pr->path) - 1] = 0; + pr->path[sizeof(pr->path) - 1] = '\0'; error = pf_ioctl_get_rulesets(pr); break; @@ -4634,7 +4697,7 @@ DIOCCHANGEADDR_error: case DIOCGETRULESET: { struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; - pr->path[sizeof(pr->path) - 1] = 0; + pr->path[sizeof(pr->path) - 1] = '\0'; error = pf_ioctl_get_ruleset(pr); break; @@ -4645,7 +4708,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != 0) { error = ENODEV; - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, @@ -4661,13 +4724,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) { error = ENOMEM; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4676,7 +4739,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_add_tables(pfrts, io->pfrio_size, @@ -4693,13 +4756,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) { error = ENOMEM; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4708,7 +4771,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_del_tables(pfrts, io->pfrio_size, @@ -4726,14 +4789,14 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } PF_RULES_RLOCK(); n = pfr_table_count(&io->pfrio_table, io->pfrio_flags); if (n < 0) { PF_RULES_RUNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4744,7 +4807,7 @@ DIOCCHANGEADDR_error: if (pfrts == NULL) { error = ENOMEM; PF_RULES_RUNLOCK(); - break; + goto fail; } error = pfr_get_tables(&io->pfrio_table, pfrts, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -4763,7 +4826,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != 
sizeof(struct pfr_tstats)) { error = ENODEV; - break; + goto fail; } PF_TABLE_STATS_LOCK(); PF_RULES_RLOCK(); @@ -4772,7 +4835,7 @@ DIOCCHANGEADDR_error: PF_RULES_RUNLOCK(); PF_TABLE_STATS_UNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4783,7 +4846,7 @@ DIOCCHANGEADDR_error: error = ENOMEM; PF_RULES_RUNLOCK(); PF_TABLE_STATS_UNLOCK(); - break; + goto fail; } error = pfr_get_tstats(&io->pfrio_table, pfrtstats, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -4802,7 +4865,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || @@ -4811,7 +4874,7 @@ DIOCCHANGEADDR_error: * size, so we didn't fail on overly large requests. * Keep doing so. */ io->pfrio_size = pf_ioctl_maxcount; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_table); @@ -4820,7 +4883,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_TABLE_STATS_LOCK(); @@ -4841,7 +4904,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_table)) { error = ENODEV; - break; + goto fail; } PF_RULES_RLOCK(); @@ -4849,7 +4912,7 @@ DIOCCHANGEADDR_error: if (n < 0) { PF_RULES_RUNLOCK(); error = EINVAL; - break; + goto fail; } io->pfrio_size = min(io->pfrio_size, n); @@ -4861,7 +4924,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_set_tflags(pfrts, io->pfrio_size, @@ -4877,7 +4940,7 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != 0) { error = ENODEV; - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, @@ -4893,13 +4956,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -4907,7 +4970,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_add_addrs(&io->pfrio_table, pfras, @@ -4927,13 +4990,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -4941,7 +5004,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_del_addrs(&io->pfrio_table, pfras, @@ -4961,17 +5024,17 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size2 < 0) { error = EINVAL; - break; + goto fail; } count = max(io->pfrio_size, io->pfrio_size2); if (count > pf_ioctl_maxcount || WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = count * sizeof(struct pfr_addr); pfras = mallocarray(count, 
sizeof(struct pfr_addr), M_TEMP, @@ -4979,7 +5042,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_set_addrs(&io->pfrio_table, pfras, @@ -5000,13 +5063,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5028,13 +5091,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_astats)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_astats); pfrastats = mallocarray(io->pfrio_size, @@ -5056,13 +5119,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5070,7 +5133,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_clr_astats(&io->pfrio_table, pfras, @@ -5090,13 +5153,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5104,7 +5167,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_RLOCK(); error = pfr_tst_addrs(&io->pfrio_table, pfras, @@ -5124,13 +5187,13 @@ DIOCCHANGEADDR_error: if (io->pfrio_esize != sizeof(struct pfr_addr)) { error = ENODEV; - break; + goto fail; } if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) { error = EINVAL; - break; + goto fail; } totlen = io->pfrio_size * sizeof(struct pfr_addr); pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr), @@ -5138,7 +5201,7 @@ DIOCCHANGEADDR_error: error = copyin(io->pfrio_buffer, pfras, totlen); if (error) { free(pfras, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); error = pfr_ina_define(&io->pfrio_table, pfras, @@ -5173,13 +5236,13 @@ DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e), @@ -5187,7 +5250,7 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { @@ -5254,13 +5317,13 @@ 
DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e), @@ -5268,7 +5331,7 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { @@ -5337,14 +5400,14 @@ DIOCCHANGEADDR_error: if (io->esize != sizeof(*ioe)) { error = ENODEV; - break; + goto fail; } if (io->size < 0 || io->size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) { error = EINVAL; - break; + goto fail; } totlen = sizeof(struct pfioc_trans_e) * io->size; @@ -5353,12 +5416,12 @@ DIOCCHANGEADDR_error: error = copyin(io->array, ioes, totlen); if (error) { free(ioes, M_TEMP); - break; + goto fail; } PF_RULES_WLOCK(); /* First makes sure everything will succeed. */ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { - ioe->anchor[sizeof(ioe->anchor) - 1] = 0; + ioe->anchor[sizeof(ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ETH: ers = pf_find_keth_ruleset(ioe->anchor); @@ -5494,7 +5557,7 @@ DIOCCHANGEADDR_error: if (psn->psn_len == 0) { psn->psn_len = sizeof(struct pf_src_node) * nr; - break; + goto fail; } nr = 0; @@ -5519,7 +5582,7 @@ DIOCCHANGEADDR_error: sizeof(struct pf_src_node) * nr); if (error) { free(pstore, M_TEMP); - break; + goto fail; } psn->psn_len = sizeof(struct pf_src_node) * nr; free(pstore, M_TEMP); @@ -5575,14 +5638,14 @@ DIOCCHANGEADDR_error: if (io->pfiio_esize != sizeof(struct pfi_kif)) { error = ENODEV; - break; + goto fail; } if (io->pfiio_size < 0 || io->pfiio_size > pf_ioctl_maxcount || WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) { error = EINVAL; - break; + goto fail; } io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0'; @@ -6024,11 +6087,11 @@ pf_kill_srcnodes(struct pfioc_src_node_kill *psnk) PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp) if (psnk == NULL || - (PF_MATCHA(psnk->psnk_src.neg, + (pf_match_addr(psnk->psnk_src.neg, &psnk->psnk_src.addr.v.a.addr, &psnk->psnk_src.addr.v.a.mask, &sn->addr, sn->af) && - PF_MATCHA(psnk->psnk_dst.neg, + pf_match_addr(psnk->psnk_dst.neg, &psnk->psnk_dst.addr.v.a.addr, &psnk->psnk_dst.addr.v.a.mask, &sn->raddr, sn->af))) { @@ -6132,10 +6195,10 @@ relock_DIOCCLRSTATES: match_key.af = s->key[idx]->af; match_key.proto = s->key[idx]->proto; - PF_ACPY(&match_key.addr[0], + pf_addrcpy(&match_key.addr[0], &s->key[idx]->addr[1], match_key.af); match_key.port[0] = s->key[idx]->port[1]; - PF_ACPY(&match_key.addr[1], + pf_addrcpy(&match_key.addr[1], &s->key[idx]->addr[0], match_key.af); match_key.port[1] = s->key[idx]->port[0]; } diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index 5e7865e4fac5..9c7863bb301e 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -80,7 +80,6 @@ static enum pf_test_status pf_step_into_translation_anchor(int, struct pf_test_c struct pf_krule *); static int pf_get_sport(struct pf_pdesc *, struct pf_krule *, struct pf_addr *, uint16_t *, uint16_t, uint16_t, - struct pf_ksrc_node **, struct pf_srchash **, struct pf_kpool *, struct pf_udp_mapping **, pf_sn_types_t); static bool pf_islinklocal(const sa_family_t, const struct pf_addr *); @@ -291,10 +290,8 @@ pf_match_translation(int rs_num, struct pf_test_ctx *ctx) } 
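[Editorial sketch] The pf_ioctl.c hunks above convert per-command "break" statements into "goto fail" so that every DIOC* error path leaves the ioctl handler through a single exit point. The following minimal, self-contained C sketch illustrates that pattern only; the names are hypothetical and userland calloc()/memcpy() stand in for the kernel's mallocarray()/copyin(). It is not FreeBSD source.

	#include <errno.h>
	#include <stddef.h>
	#include <stdlib.h>
	#include <string.h>

	struct io_req {
		size_t		 esize;	/* element size the caller was built with */
		size_t		 size;	/* element count */
		const void	*buf;	/* source buffer, stands in for copyin() */
	};

	static int
	handle_req(struct io_req *io, size_t elem_size, size_t maxcount)
	{
		void *tmp = NULL;
		int error = 0;

		if (io->esize != elem_size) {
			error = ENODEV;		/* ABI mismatch; was "break" before */
			goto fail;
		}
		if (io->size == 0 || io->size > maxcount) {
			error = EINVAL;
			goto fail;
		}
		tmp = calloc(io->size, elem_size);	/* cf. mallocarray(M_TEMP) */
		if (tmp == NULL) {
			error = ENOMEM;
			goto fail;
		}
		memcpy(tmp, io->buf, io->size * elem_size);
		/* ... operate on tmp under the appropriate locks ... */
	fail:
		free(tmp);	/* single cleanup point; free(NULL) is a no-op */
		return (error);
	}

With one fail label, a newly added early-exit check cannot leak the temporary buffer or skip an unlock, which is the point of the conversion.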
static int -pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, - struct pf_addr *naddr, uint16_t *nport, uint16_t low, - uint16_t high, struct pf_ksrc_node **sn, - struct pf_srchash **sh, struct pf_kpool *rpool, +pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr, + uint16_t *nport, uint16_t low, uint16_t high, struct pf_kpool *rpool, struct pf_udp_mapping **udp_mapping, pf_sn_types_t sn_type) { struct pf_state_key_cmp key; @@ -319,20 +316,27 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, bzero(&udp_source, sizeof(udp_source)); udp_source.af = pd->af; - PF_ACPY(&udp_source.addr, &pd->nsaddr, pd->af); + pf_addrcpy(&udp_source.addr, &pd->nsaddr, pd->af); udp_source.port = pd->nsport; if (udp_mapping) { + struct pf_ksrc_node *sn = NULL; + struct pf_srchash *sh = NULL; *udp_mapping = pf_udp_mapping_find(&udp_source); if (*udp_mapping) { - PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, pd->af); + pf_addrcpy(naddr, + &(*udp_mapping)->endpoints[1].addr, + pd->af); *nport = (*udp_mapping)->endpoints[1].port; - /* Try to find a src_node as per pf_map_addr(). */ - if (*sn == NULL && rpool->opts & PF_POOL_STICKYADDR && + /* + * Try to find a src_node as per pf_map_addr(). + * XXX: Why? This code seems to do nothing. + */ + if (rpool->opts & PF_POOL_STICKYADDR && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(&pd->nsaddr, r, - pd->af, sh, sn_type, false); - if (*sn != NULL) - PF_SRC_NODE_UNLOCK(*sn); + sn = pf_find_src_node(&pd->nsaddr, r, + pd->af, &sh, sn_type, false); + if (sn != NULL) + PF_SRC_NODE_UNLOCK(sn); return (0); } else { *udp_mapping = pf_udp_mapping_create(pd->af, &pd->nsaddr, @@ -344,7 +348,7 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, } if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, &init_addr, - sn, sh, rpool, sn_type)) + rpool, sn_type)) goto failed; if (pd->proto == IPPROTO_ICMP) { @@ -369,12 +373,13 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, key.proto = pd->proto; do { - PF_ACPY(&key.addr[didx], &pd->ndaddr, key.af); - PF_ACPY(&key.addr[sidx], naddr, key.af); + pf_addrcpy(&key.addr[didx], &pd->ndaddr, key.af); + pf_addrcpy(&key.addr[sidx], naddr, key.af); key.port[didx] = pd->ndport; if (udp_mapping && *udp_mapping) - PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, pd->af); + pf_addrcpy(&(*udp_mapping)->endpoints[1].addr, naddr, + pd->af); /* * port search; start random, step; @@ -467,9 +472,8 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, * pick a different source address since we're out * of free port choices for the current one. 
*/ - (*sn) = NULL; if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, - &init_addr, sn, sh, rpool, sn_type)) + &init_addr, rpool, sn_type)) return (1); break; case PF_POOL_NONE: @@ -500,7 +504,6 @@ pf_islinklocal(const sa_family_t af, const struct pf_addr *addr) static int pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr, uint16_t *nport, - struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_udp_mapping **udp_mapping, struct pf_kpool *rpool) { uint16_t psmask, low, highmask; @@ -520,16 +523,14 @@ pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r, for (i = cut; i <= ahigh; i++) { low = (i << ashift) | psmask; - if (!pf_get_sport(pd, r, - naddr, nport, low, low | highmask, sn, sh, rpool, - udp_mapping, PF_SN_NAT)) + if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask, + rpool, udp_mapping, PF_SN_NAT)) return (0); } for (i = cut - 1; i > 0; i--) { low = (i << ashift) | psmask; - if (!pf_get_sport(pd, r, - naddr, nport, low, low | highmask, sn, sh, rpool, - udp_mapping, PF_SN_NAT)) + if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask, + rpool, udp_mapping, PF_SN_NAT)) return (0); } return (1); @@ -542,6 +543,7 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, { u_short reason = PFRES_MATCH; struct pf_addr *raddr = NULL, *rmask = NULL; + struct pfr_ktable *kt; uint64_t hashidx; int cnt; @@ -591,39 +593,35 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, switch (rpool->opts & PF_POOL_TYPEMASK) { case PF_POOL_NONE: - PF_ACPY(naddr, raddr, af); + pf_addrcpy(naddr, raddr, af); break; case PF_POOL_BITMASK: - PF_POOLMASK(naddr, raddr, rmask, saddr, af); + pf_poolmask(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: - if (rpool->cur->addr.type == PF_ADDR_TABLE) { - cnt = rpool->cur->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->cur->addr.type == PF_ADDR_TABLE || + rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->cur->addr.type == PF_ADDR_TABLE) + kt = rpool->cur->addr.p.tbl; else - rpool->tblidx = (int)arc4random_uniform(cnt); - memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + kt = rpool->cur->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - PF_ACPY(naddr, &rpool->counter, af); - } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt; + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else rpool->tblidx = (int)arc4random_uniform(cnt); memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, - pf_islinklocal)) { + if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter, + af, pf_islinklocal, false)) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - PF_ACPY(naddr, &rpool->counter, af); + pf_addrcpy(naddr, &rpool->counter, af); } else if (init_addr != NULL && PF_AZERO(init_addr, af)) { switch (af) { #ifdef INET @@ -654,12 +652,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, break; #endif /* INET6 */ } - PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); - PF_ACPY(init_addr, naddr, af); + pf_poolmask(naddr, raddr, rmask, &rpool->counter, af); + pf_addrcpy(init_addr, naddr, af); } else { - PF_AINC(&rpool->counter, af); - PF_POOLMASK(naddr, raddr, rmask, 
&rpool->counter, af); + pf_addr_inc(&rpool->counter, af); + pf_poolmask(naddr, raddr, rmask, &rpool->counter, af); } break; case PF_POOL_SRCHASH: @@ -668,35 +666,31 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, hashidx = pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); - if (rpool->cur->addr.type == PF_ADDR_TABLE) { - cnt = rpool->cur->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->cur->addr.type == PF_ADDR_TABLE || + rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->cur->addr.type == PF_ADDR_TABLE) + kt = rpool->cur->addr.p.tbl; else - rpool->tblidx = (int)(hashidx % cnt); - memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + kt = rpool->cur->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - PF_ACPY(naddr, &rpool->counter, af); - } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt; + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else rpool->tblidx = (int)(hashidx % cnt); memset(&rpool->counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, - pf_islinklocal)) { + if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter, + af, pf_islinklocal, false)) { reason = PFRES_MAPFAILED; goto done_pool_mtx; /* unsupported */ } - PF_ACPY(naddr, &rpool->counter, af); + pf_addrcpy(naddr, &rpool->counter, af); } else { - PF_POOLMASK(naddr, raddr, rmask, + pf_poolmask(naddr, raddr, rmask, (struct pf_addr *)&hash, af); } break; @@ -707,11 +701,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (!pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) + &rpool->tblidx, &rpool->counter, af, NULL, true)) goto get_addr; } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) + &rpool->tblidx, &rpool->counter, af, pf_islinklocal, + true)) goto get_addr; } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) goto get_addr; @@ -721,9 +716,10 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, rpool->cur = TAILQ_FIRST(&rpool->list); else rpool->cur = TAILQ_NEXT(rpool->cur, entries); + rpool->tblidx = -1; if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (pfr_pool_get(rpool->cur->addr.p.tbl, - &rpool->tblidx, &rpool->counter, af, NULL)) { + &rpool->tblidx, &rpool->counter, af, NULL, true)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -731,9 +727,9 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, goto done_pool_mtx; } } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { - rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, - &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) { + &rpool->tblidx, &rpool->counter, af, pf_islinklocal, + true)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; @@ -743,14 +739,14 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; - PF_ACPY(&rpool->counter, raddr, af); + pf_addrcpy(&rpool->counter, raddr, af); } get_addr: - PF_ACPY(naddr, 
&rpool->counter, af); + pf_addrcpy(naddr, &rpool->counter, af); if (init_addr != NULL && PF_AZERO(init_addr, af)) - PF_ACPY(init_addr, naddr, af); - PF_AINC(&rpool->counter, af); + pf_addrcpy(init_addr, naddr, af); + pf_addr_inc(&rpool->counter, af); break; } } @@ -761,48 +757,41 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, done_pool_mtx: mtx_unlock(&rpool->mtx); - if (reason) { - counter_u64_add(V_pf_status.counters[reason], 1); - } - return (reason); } u_short pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr, - struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_kpool *rpool, - pf_sn_types_t sn_type) + struct pf_kpool *rpool, pf_sn_types_t sn_type) { + struct pf_ksrc_node *sn = NULL; + struct pf_srchash *sh = NULL; u_short reason = 0; - KASSERT(*sn == NULL, ("*sn not NULL")); - /* * If this is a sticky-address rule, try to find an existing src_node. - * Request the sh to be unlocked if sn was not found, as we never - * insert a new sn when parsing the ruleset. */ if (rpool->opts & PF_POOL_STICKYADDR && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) - *sn = pf_find_src_node(saddr, r, af, sh, sn_type, false); + sn = pf_find_src_node(saddr, r, af, &sh, sn_type, false); - if (*sn != NULL) { - PF_SRC_NODE_LOCK_ASSERT(*sn); + if (sn != NULL) { + PF_SRC_NODE_LOCK_ASSERT(sn); /* If the supplied address is the same as the current one we've * been asked before, so tell the caller that there's no other * address to be had. */ - if (PF_AEQ(naddr, &(*sn)->raddr, af)) { + if (PF_AEQ(naddr, &(sn->raddr), af)) { reason = PFRES_MAPFAILED; goto done; } - PF_ACPY(naddr, &(*sn)->raddr, af); + pf_addrcpy(naddr, &(sn->raddr), af); if (nkif) - *nkif = (*sn)->rkif; + *nkif = sn->rkif; if (V_pf_status.debug >= PF_DEBUG_NOISY) { - printf("pf_map_addr: src tracking maps "); + printf("%s: src tracking maps ", __func__); pf_print_host(saddr, 0, af); printf(" to "); pf_print_host(naddr, 0, af); @@ -817,14 +806,16 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, * Source node has not been found. Find a new address and store it * in variables given by the caller. 
*/ - if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr, rpool) != 0) { - /* pf_map_addr() sets reason counters on its own */ + if ((reason = pf_map_addr(af, r, saddr, naddr, nkif, init_addr, + rpool)) != 0) { + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: pf_map_addr has failed\n", __func__); goto done; } if (V_pf_status.debug >= PF_DEBUG_NOISY && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { - printf("pf_map_addr: selected address "); + printf("%s: selected address ", __func__); pf_print_host(naddr, 0, af); if (nkif) printf("@%s", (*nkif)->pfik_name); @@ -832,12 +823,8 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, } done: - if ((*sn) != NULL) - PF_SRC_NODE_UNLOCK(*sn); - - if (reason) { - counter_u64_add(V_pf_status.counters[reason], 1); - } + if (sn != NULL) + PF_SRC_NODE_UNLOCK(sn); return (reason); } @@ -887,8 +874,6 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, { struct pf_pdesc *pd = ctx->pd; struct pf_addr *naddr; - struct pf_ksrc_node *sn = NULL; - struct pf_srchash *sh = NULL; uint16_t *nportp; uint16_t low, high; u_short reason; @@ -916,8 +901,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, high = rpool->proxy_port[1]; } if (rpool->mape.offset > 0) { - if (pf_get_mape_sport(pd, r, naddr, nportp, &sn, - &sh, &ctx->udp_mapping, rpool)) { + if (pf_get_mape_sport(pd, r, naddr, nportp, + &ctx->udp_mapping, rpool)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: MAP-E port allocation (%u/%u/%u)" " failed\n", @@ -927,8 +912,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - } else if (pf_get_sport(pd, r, naddr, nportp, low, high, &sn, - &sh, rpool, &ctx->udp_mapping, PF_SN_NAT)) { + } else if (pf_get_sport(pd, r, naddr, nportp, low, high, + rpool, &ctx->udp_mapping, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", rpool->proxy_port[0], rpool->proxy_port[1])); @@ -948,7 +933,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - PF_POOLMASK(naddr, + pf_poolmask(naddr, &rpool->cur->addr.p.dyn->pfid_addr4, &rpool->cur->addr.p.dyn->pfid_mask4, &pd->nsaddr, AF_INET); @@ -961,7 +946,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - PF_POOLMASK(naddr, + pf_poolmask(naddr, &rpool->cur->addr.p.dyn->pfid_addr6, &rpool->cur->addr.p.dyn->pfid_mask6, &pd->nsaddr, AF_INET6); @@ -969,7 +954,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, #endif /* INET6 */ } } else - PF_POOLMASK(naddr, + pf_poolmask(naddr, &rpool->cur->addr.v.a.addr, &rpool->cur->addr.v.a.mask, &pd->nsaddr, pd->af); @@ -983,7 +968,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - PF_POOLMASK(naddr, + pf_poolmask(naddr, &r->src.addr.p.dyn->pfid_addr4, &r->src.addr.p.dyn->pfid_mask4, &pd->ndaddr, AF_INET); @@ -995,7 +980,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, reason = PFRES_MAPFAILED; goto notrans; } - PF_POOLMASK(naddr, + pf_poolmask(naddr, &r->src.addr.p.dyn->pfid_addr6, &r->src.addr.p.dyn->pfid_mask6, &pd->ndaddr, AF_INET6); @@ -1003,7 +988,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, #endif /* INET6 */ } } else - PF_POOLMASK(naddr, &r->src.addr.v.a.addr, + pf_poolmask(naddr, &r->src.addr.v.a.addr, &r->src.addr.v.a.mask, &pd->ndaddr, pd->af); break; } @@ -1014,11 +999,11 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct 
pf_krule *r, uint16_t cut, low, high, nport; reason = pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL, - NULL, &sn, &sh, rpool, PF_SN_NAT); + NULL, rpool, PF_SN_NAT); if (reason != 0) goto notrans; if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) - PF_POOLMASK(naddr, naddr, &rpool->cur->addr.v.a.mask, + pf_poolmask(naddr, naddr, &rpool->cur->addr.v.a.mask, &pd->ndaddr, pd->af); /* Do not change SCTP ports. */ @@ -1027,10 +1012,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, if (rpool->proxy_port[1]) { uint32_t tmp_nport; + uint16_t div; + + div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1; + div = (div == 0) ? 1 : div; - tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % - (rpool->proxy_port[1] - rpool->proxy_port[0] + - 1)) + rpool->proxy_port[0]; + tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) + + rpool->proxy_port[0]; /* Wrap around if necessary. */ if (tmp_nport > 65535) @@ -1056,9 +1044,9 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r, key.af = pd->af; key.proto = pd->proto; key.port[0] = pd->nsport; - PF_ACPY(&key.addr[0], &pd->nsaddr, key.af); + pf_addrcpy(&key.addr[0], &pd->nsaddr, key.af); key.port[1] = nport; - PF_ACPY(&key.addr[1], naddr, key.af); + pf_addrcpy(&key.addr[1], naddr, key.af); if (!pf_find_state_all_exists(&key, PF_OUT)) break; @@ -1131,8 +1119,6 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) struct pf_addr ndaddr, nsaddr, naddr; u_int16_t nport = 0; int prefixlen = 96; - struct pf_srchash *sh = NULL; - struct pf_ksrc_node *sns = NULL; bzero(&nsaddr, sizeof(nsaddr)); bzero(&ndaddr, sizeof(ndaddr)); @@ -1151,9 +1137,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) panic("pf_get_transaddr_af: no nat pool for source address"); /* get source address and port */ - if (pf_get_sport(pd, r, &nsaddr, &nport, - r->nat.proxy_port[0], r->nat.proxy_port[1], &sns, &sh, &r->nat, - NULL, PF_SN_NAT)) { + if (pf_get_sport(pd, r, &nsaddr, &nport, r->nat.proxy_port[0], + r->nat.proxy_port[1], &r->nat, NULL, PF_SN_NAT)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: af-to NAT proxy port allocation (%u-%u) failed", r->nat.proxy_port[0], r->nat.proxy_port[1])); @@ -1179,7 +1164,7 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) /* get the destination address and port */ if (! 
TAILQ_EMPTY(&r->rdr.list)) { if (pf_map_addr_sn(pd->naf, r, &nsaddr, &naddr, NULL, NULL, - &sns, NULL, &r->rdr, PF_SN_NAT)) + &r->rdr, PF_SN_NAT)) return (-1); if (r->rdr.proxy_port[0]) pd->ndport = htons(r->rdr.proxy_port[0]); @@ -1220,8 +1205,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd) } } - PF_ACPY(&pd->nsaddr, &nsaddr, pd->naf); - PF_ACPY(&pd->ndaddr, &ndaddr, pd->naf); + pf_addrcpy(&pd->nsaddr, &nsaddr, pd->naf); + pf_addrcpy(&pd->ndaddr, &ndaddr, pd->naf); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: af-to %s done, prefixlen %d, ", diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c index 381e966eacf1..73933c022ca2 100644 --- a/sys/netpfil/pf/pf_nl.c +++ b/sys/netpfil/pf/pf_nl.c @@ -1256,23 +1256,13 @@ pf_handle_clear_status(struct nlmsghdr *hdr, struct nl_pstate *npt) return (0); } -struct pf_nl_natlook { - sa_family_t af; - uint8_t direction; - uint8_t proto; - struct pf_addr src; - struct pf_addr dst; - uint16_t sport; - uint16_t dport; -}; - -#define _OUT(_field) offsetof(struct pf_nl_natlook, _field) +#define _OUT(_field) offsetof(struct pfioc_natlook, _field) static const struct nlattr_parser nla_p_natlook[] = { { .type = PF_NL_AF, .off = _OUT(af), .cb = nlattr_get_uint8 }, { .type = PF_NL_DIRECTION, .off = _OUT(direction), .cb = nlattr_get_uint8 }, { .type = PF_NL_PROTO, .off = _OUT(proto), .cb = nlattr_get_uint8 }, - { .type = PF_NL_SRC_ADDR, .off = _OUT(src), .cb = nlattr_get_in6_addr }, - { .type = PF_NL_DST_ADDR, .off = _OUT(dst), .cb = nlattr_get_in6_addr }, + { .type = PF_NL_SRC_ADDR, .off = _OUT(saddr), .cb = nlattr_get_in6_addr }, + { .type = PF_NL_DST_ADDR, .off = _OUT(daddr), .cb = nlattr_get_in6_addr }, { .type = PF_NL_SRC_PORT, .off = _OUT(sport), .cb = nlattr_get_uint16 }, { .type = PF_NL_DST_PORT, .off = _OUT(dport), .cb = nlattr_get_uint16 }, }; @@ -1282,63 +1272,31 @@ NL_DECLARE_PARSER(natlook_parser, struct genlmsghdr, nlf_p_empty, nla_p_natlook) static int pf_handle_natlook(struct nlmsghdr *hdr, struct nl_pstate *npt) { - struct pf_nl_natlook attrs = {}; - struct pf_state_key_cmp key = {}; + struct pfioc_natlook attrs = {}; struct nl_writer *nw = npt->nw; - struct pf_state_key *sk; - struct pf_kstate *state; struct genlmsghdr *ghdr_new; - int error, m = 0; - int sidx, didx; + int error; error = nl_parse_nlmsg(hdr, &natlook_parser, npt, &attrs); if (error != 0) return (error); - if (attrs.proto == 0 || - PF_AZERO(&attrs.src, attrs.af) || - PF_AZERO(&attrs.dst, attrs.af) || - ((attrs.proto == IPPROTO_TCP || attrs.proto == IPPROTO_UDP) && - (attrs.sport == 0 || attrs.dport == 0))) - return (EINVAL); - - /* NATLOOK src and dst are reversed, so reverse sidx/didx */ - sidx = (attrs.direction == PF_IN) ? 1 : 0; - didx = (attrs.direction == PF_IN) ? 
0 : 1; - - key.af = attrs.af; - key.proto = attrs.proto; - PF_ACPY(&key.addr[sidx], &attrs.src, attrs.af); - key.port[sidx] = attrs.sport; - PF_ACPY(&key.addr[didx], &attrs.dst, attrs.af); - key.port[didx] = attrs.dport; - - state = pf_find_state_all(&key, attrs.direction, &m); - if (state == NULL) - return (ENOENT); - if (m > 1) { - PF_STATE_UNLOCK(state); - return (E2BIG); - } + error = pf_ioctl_natlook(&attrs); + if (error != 0) + return (error); - if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { - PF_STATE_UNLOCK(state); + if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); - } ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_NATLOOK; ghdr_new->version = 0; ghdr_new->reserved = 0; - sk = state->key[sidx]; - - nlattr_add_in6_addr(nw, PF_NL_SRC_ADDR, &sk->addr[sidx].v6); - nlattr_add_in6_addr(nw, PF_NL_DST_ADDR, &sk->addr[didx].v6); - nlattr_add_u16(nw, PF_NL_SRC_PORT, sk->port[sidx]); - nlattr_add_u16(nw, PF_NL_DST_PORT, sk->port[didx]); - - PF_STATE_UNLOCK(state); + nlattr_add_in6_addr(nw, PF_NL_SRC_ADDR, &attrs.rsaddr.v6); + nlattr_add_in6_addr(nw, PF_NL_DST_ADDR, &attrs.rdaddr.v6); + nlattr_add_u16(nw, PF_NL_SRC_PORT, attrs.rsport); + nlattr_add_u16(nw, PF_NL_DST_PORT, attrs.rdport); if (!nlmsg_end(nw)) { nlmsg_abort(nw); diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c index 865c5ecd72d9..2e5165a9900c 100644 --- a/sys/netpfil/pf/pf_ruleset.c +++ b/sys/netpfil/pf/pf_ruleset.c @@ -232,7 +232,7 @@ pf_get_leaf_kruleset(char *path, char **path_remainder) return (ruleset); } -struct pf_kanchor * +static struct pf_kanchor * pf_create_kanchor(struct pf_kanchor *parent, const char *aname) { struct pf_kanchor *anchor, *dup; @@ -259,8 +259,8 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname) if ((dup = RB_INSERT(pf_kanchor_global, &V_pf_anchors, anchor)) != NULL) { - printf("pf_find_or_create_ruleset: RB_INSERT1 " - "'%s' '%s' collides with '%s' '%s'\n", + printf("%s: RB_INSERT1 " + "'%s' '%s' collides with '%s' '%s'\n", __func__, anchor->path, anchor->name, dup->path, dup->name); rs_free(anchor); return (NULL); @@ -270,10 +270,10 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname) anchor->parent = parent; if ((dup = RB_INSERT(pf_kanchor_node, &parent->children, anchor)) != NULL) { - printf("pf_find_or_create_ruleset: " + printf("%s: " "RB_INSERT2 '%s' '%s' collides with " - "'%s' '%s'\n", anchor->path, anchor->name, - dup->path, dup->name); + "'%s' '%s'\n", __func__, anchor->path, + anchor->name, dup->path, dup->name); RB_REMOVE(pf_kanchor_global, &V_pf_anchors, anchor); rs_free(anchor); @@ -339,7 +339,7 @@ pf_remove_if_empty_kruleset(struct pf_kruleset *ruleset) int i; while (ruleset != NULL) { - if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL || + if (ruleset == &pf_main_ruleset || !RB_EMPTY(&ruleset->anchor->children) || ruleset->anchor->refcnt > 0 || ruleset->tables > 0 || ruleset->topen) @@ -407,7 +407,7 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s, } ruleset = pf_find_or_create_kruleset(path); rs_free(path); - if (ruleset == NULL || ruleset->anchor == NULL) { + if (ruleset == NULL || ruleset == &pf_main_ruleset) { DPFPRINTF("%s: ruleset\n", __func__); return (1); } @@ -432,7 +432,7 @@ pf_kanchor_copyout(const struct pf_kruleset *rs, const struct pf_krule *r, char a[MAXPATHLEN]; char *p; int i; - if (rs->anchor == NULL) + if (rs == &pf_main_ruleset) a[0] = 0; else strlcpy(a, rs->anchor->path, MAXPATHLEN); @@ -444,7 +444,7 @@ 
pf_kanchor_copyout(const struct pf_kruleset *rs, const struct pf_krule *r, anchor_call_len); } if (strncmp(a, r->anchor->path, strlen(a))) { - printf("pf_anchor_copyout: '%s' '%s'\n", a, + printf("%s: '%s' '%s'\n", __func__, a, r->anchor->path); return (1); } @@ -525,16 +525,13 @@ done: } void -pf_kanchor_remove(struct pf_krule *r) +pf_remove_kanchor(struct pf_krule *r) { if (r->anchor == NULL) return; - if (r->anchor->refcnt <= 0) { - printf("pf_anchor_remove: broken refcount\n"); - r->anchor = NULL; - return; - } - if (!--r->anchor->refcnt) + if (r->anchor->refcnt <= 0) + printf("%s: broken refcount\n", __func__); + else if (!--r->anchor->refcnt) pf_remove_if_empty_kruleset(&r->anchor->ruleset); r->anchor = NULL; } diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c index d5874df3df66..9c0151b7da2b 100644 --- a/sys/netpfil/pf/pf_table.c +++ b/sys/netpfil/pf/pf_table.c @@ -704,7 +704,7 @@ pfr_validate_addr(struct pfr_addr *ad) return (-1); if (ad->pfra_not && ad->pfra_not != 1) return (-1); - if (ad->pfra_fback) + if (ad->pfra_fback != PFR_FB_NONE) return (-1); return (0); } @@ -819,10 +819,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters) static void pfr_destroy_kentries(struct pfr_kentryworkq *workq) { - struct pfr_kentry *p, *q; + struct pfr_kentry *p; - for (p = SLIST_FIRST(workq); p != NULL; p = q) { - q = SLIST_NEXT(p, pfrke_workq); + while ((p = SLIST_FIRST(workq)) != NULL) { + SLIST_REMOVE_HEAD(workq, pfrke_workq); pfr_destroy_kentry(p); } } @@ -1680,8 +1680,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd, } if (!(flags & PFR_FLAG_DUMMY)) { - for (p = SLIST_FIRST(&workq); p != NULL; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) { pfr_commit_ktable(p, tzero); } rs->topen = 0; @@ -1710,7 +1709,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero) } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) { /* kt might contain addresses */ struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq; - struct pfr_kentry *p, *q, *next; + struct pfr_kentry *p, *q; struct pfr_addr ad; pfr_enqueue_addrs(shadow, &addrq, NULL, 0); @@ -1720,7 +1719,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero) SLIST_INIT(&delq); SLIST_INIT(&garbageq); pfr_clean_node_mask(shadow, &addrq); - SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) { + while ((p = SLIST_FIRST(&addrq)) != NULL) { + SLIST_REMOVE_HEAD(&addrq, pfrke_workq); pfr_copyout_addr(&ad, p); q = pfr_lookup_addr(kt, &ad, 1); if (q != NULL) { @@ -1864,8 +1864,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq) { struct pfr_ktable *p, *q; - for (p = SLIST_FIRST(workq); p; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) { pfr_setflags_ktable(p, p->pfrkt_nflags); } } @@ -2015,10 +2014,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset) static void pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr) { - struct pfr_ktable *p, *q; + struct pfr_ktable *p; - for (p = SLIST_FIRST(workq); p; p = q) { - q = SLIST_NEXT(p, pfrkt_workq); + while ((p = SLIST_FIRST(workq)) != NULL) { + SLIST_REMOVE_HEAD(workq, pfrkt_workq); pfr_destroy_ktable(p, flushaddr); } } @@ -2074,17 +2073,16 @@ pfr_lookup_table(struct pfr_table *tbl) (struct pfr_ktable *)tbl)); } -int -pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +static struct pfr_kentry * +pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, + int exact) { struct pfr_kentry *ke = NULL; - 
int match; PF_RULES_RASSERT(); - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (0); switch (af) { @@ -2121,11 +2119,26 @@ pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) default: unhandled_af(af); } + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + + return (ke); +} + +int +pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +{ + struct pfr_kentry *ke = NULL; + int match; + + ke = pfr_kentry_byaddr(kt, a, af, 0); + match = (ke && !ke->pfrke_not); if (match) pfr_kstate_counter_add(&kt->pfrkt_match, 1); else pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1); + return (match); } @@ -2135,9 +2148,8 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, { struct pfr_kentry *ke = NULL; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return; switch (af) { @@ -2281,7 +2293,7 @@ pfr_detach_table(struct pfr_ktable *kt) int pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, - sa_family_t af, pf_addr_filter_func_t filter) + sa_family_t af, pf_addr_filter_func_t filter, bool loop_once) { struct pf_addr *addr, cur, mask, umask_addr; union sockaddr_union uaddr, umask; @@ -2306,9 +2318,8 @@ pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, unhandled_af(af); } - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (-1); idx = *pidx; @@ -2327,7 +2338,7 @@ _next_block: ke = pfr_kentry_byidx(kt, idx, af); if (ke == NULL) { /* we don't have this idx, try looping */ - if (loop || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) { + if ((loop || loop_once) || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) { pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1); return (1); } @@ -2340,16 +2351,16 @@ _next_block: if (use_counter && !PF_AZERO(counter, af)) { /* is supplied address within block? 
*/ - if (!PF_MATCHA(0, &cur, &mask, counter, af)) { + if (!pf_match_addr(0, &cur, &mask, counter, af)) { /* no, go to next block in table */ idx++; use_counter = 0; goto _next_block; } - PF_ACPY(addr, counter, af); + pf_addrcpy(addr, counter, af); } else { /* use first address of block */ - PF_ACPY(addr, &cur, af); + pf_addrcpy(addr, &cur, af); } if (!KENTRY_NETWORK(ke)) { @@ -2358,7 +2369,7 @@ _next_block: idx++; goto _next_block; } - PF_ACPY(counter, addr, af); + pf_addrcpy(counter, addr, af); *pidx = idx; pfr_kstate_counter_add(&kt->pfrkt_match, 1); return (0); @@ -2382,7 +2393,7 @@ _next_block: /* lookup return the same block - perfect */ if (filter && filter(af, addr)) goto _next_entry; - PF_ACPY(counter, addr, af); + pf_addrcpy(counter, addr, af); *pidx = idx; pfr_kstate_counter_add(&kt->pfrkt_match, 1); return (0); @@ -2392,9 +2403,9 @@ _next_entry: /* we need to increase the counter past the nested block */ pfr_prepare_network(&umask, AF_INET, ke2->pfrke_net); pfr_sockaddr_to_pf_addr(&umask, &umask_addr); - PF_POOLMASK(addr, addr, &umask_addr, &pfr_ffaddr, af); - PF_AINC(addr, af); - if (!PF_MATCHA(0, &cur, &mask, addr, af)) { + pf_poolmask(addr, addr, &umask_addr, &pfr_ffaddr, af); + pf_addr_inc(addr, af); + if (!pf_match_addr(0, &cur, &mask, addr, af)) { /* ok, we reached the end of our main block */ /* go to next block in table */ idx++; @@ -2455,3 +2466,14 @@ pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn) unhandled_af(dyn->pfid_af); } } + +struct pfr_ktable * +pfr_ktable_select_active(struct pfr_ktable *kt) +{ + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (NULL); + + return (kt); +} diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index 7746b668265d..ae17b3289593 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -1469,6 +1469,9 @@ moea_page_set_memattr(vm_page_t m, vm_memattr_t ma) pmap_t pmap; u_int lo; + if (m->md.mdpg_cache_attrs == ma) + return; + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 79cea408bb5f..796b1719b8ba 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -2134,6 +2134,9 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma) CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + if (m->md.mdpg_cache_attrs == ma) + return; + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c index 45f7bef8bcc9..a12142fc2d7b 100644 --- a/sys/powerpc/aim/mmu_radix.c +++ b/sys/powerpc/aim/mmu_radix.c @@ -5937,6 +5937,10 @@ mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); + + if (m->md.mdpg_cache_attrs == ma) + return; + m->md.mdpg_cache_attrs = ma; /* diff --git a/sys/powerpc/include/pcb.h b/sys/powerpc/include/pcb.h index 050ada6b0f64..0230cf78aba7 100644 --- a/sys/powerpc/include/pcb.h +++ b/sys/powerpc/include/pcb.h @@ -66,16 +66,8 @@ struct pcb { #define PCB_VECREGS 0x200 /* Process had Altivec registers initialized */ struct fpu { union { -#if _BYTE_ORDER == _BIG_ENDIAN - double fpr; - uint32_t vsr[4]; -#else uint32_t vsr[4]; - struct { - double padding; - double fpr; - }; -#endif + double fpr; } fpr[32]; double fpscr; /* FPSCR stored as double for easier access */ } pcb_fpu; /* Floating point 
processor */ diff --git a/sys/powerpc/include/ucontext.h b/sys/powerpc/include/ucontext.h index d35c6c773fe0..dc87edd578bc 100644 --- a/sys/powerpc/include/ucontext.h +++ b/sys/powerpc/include/ucontext.h @@ -41,6 +41,7 @@ typedef struct __mcontext { int mc_flags; #define _MC_FP_VALID 0x01 #define _MC_AV_VALID 0x02 +#define _MC_VS_VALID 0x04 int mc_onstack; /* saved onstack flag */ int mc_len; /* sizeof(__mcontext) */ __uint64_t mc_avec[32*2]; /* vector register file */ @@ -56,6 +57,7 @@ typedef struct __mcontext32 { int mc_flags; #define _MC_FP_VALID 0x01 #define _MC_AV_VALID 0x02 +#define _MC_VS_VALID 0x04 int mc_onstack; /* saved onstack flag */ int mc_len; /* sizeof(__mcontext) */ uint64_t mc_avec[32*2]; /* vector register file */ diff --git a/sys/powerpc/mpc85xx/mpc85xx_gpio.c b/sys/powerpc/mpc85xx/mpc85xx_gpio.c index 0f333feb747f..cb96d768adef 100644 --- a/sys/powerpc/mpc85xx/mpc85xx_gpio.c +++ b/sys/powerpc/mpc85xx/mpc85xx_gpio.c @@ -226,14 +226,14 @@ mpc85xx_gpio_attach(device_t dev) return (ENOMEM); } + OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev); + sc->busdev = gpiobus_attach_bus(dev); if (sc->busdev == NULL) { mpc85xx_gpio_detach(dev); return (ENOMEM); } - OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev); - return (0); } diff --git a/sys/powerpc/powerpc/exec_machdep.c b/sys/powerpc/powerpc/exec_machdep.c index 1893d79f29a8..8a33d0f589a7 100644 --- a/sys/powerpc/powerpc/exec_machdep.c +++ b/sys/powerpc/powerpc/exec_machdep.c @@ -214,10 +214,10 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sfpsize = sizeof(sf); #ifdef __powerpc64__ /* - * 64-bit PPC defines a 288 byte scratch region - * below the stack. + * 64-bit PPC defines a 512 byte red zone below + * the existing stack (ELF ABI v2 §2.2.2.4) */ - rndfsize = 288 + roundup(sizeof(sf), 48); + rndfsize = 512 + roundup(sizeof(sf), 48); #else rndfsize = roundup(sizeof(sf), 16); #endif @@ -349,13 +349,6 @@ sys_sigreturn(struct thread *td, struct sigreturn_args *uap) if (error != 0) return (error); - /* - * Save FPU state if needed. User may have changed it on - * signal handler - */ - if (uc.uc_mcontext.mc_srr1 & PSL_FP) - save_fpu(td); - kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", @@ -432,6 +425,7 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) } if (pcb->pcb_flags & PCB_VSX) { + mcp->mc_flags |= _MC_VS_VALID; for (i = 0; i < 32; i++) memcpy(&mcp->mc_vsxfpreg[i], &pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double)); @@ -481,6 +475,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp) struct pcb *pcb; struct trapframe *tf; register_t tls; + register_t msr; int i; pcb = td->td_pcb; @@ -531,6 +526,22 @@ set_mcontext(struct thread *td, mcontext_t *mcp) tf->srr1 &= ~(PSL_FP | PSL_VSX | PSL_VEC); pcb->pcb_flags &= ~(PCB_FPU | PCB_VSX | PCB_VEC); + /* + * Ensure the FPU is also disabled in hardware. + * + * Without this, it's possible for the register reload to fail if we + * don't switch to an FPU disabled context before resuming the original + * thread. Specifically, if the FPU/VSX unavailable exception is never + * hit, then whatever data is still in the FP/VSX registers when + * sigresume is called will be used by the resumed thread, instead of the + * previously saved data from the mcontext. 
+ */ critical_enter(); msr = mfmsr() & ~(PSL_FP | PSL_VSX | PSL_VEC); isync(); mtmsr(msr); critical_exit(); + if (mcp->mc_flags & _MC_FP_VALID) { /* enable_fpu() will happen lazily on a fault */ pcb->pcb_flags |= PCB_FPREGS; @@ -539,8 +550,12 @@ for (i = 0; i < 32; i++) { memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i], sizeof(double)); - memcpy(&pcb->pcb_fpu.fpr[i].vsr[2], - &mcp->mc_vsxfpreg[i], sizeof(double)); + } + if (mcp->mc_flags & _MC_VS_VALID) { + for (i = 0; i < 32; i++) { + memcpy(&pcb->pcb_fpu.fpr[i].vsr[2], + &mcp->mc_vsxfpreg[i], sizeof(double)); + } } } diff --git a/sys/powerpc/powerpc/fpu.c b/sys/powerpc/powerpc/fpu.c index 0eaff2ea4932..cc8f22f7dda3 100644 --- a/sys/powerpc/powerpc/fpu.c +++ b/sys/powerpc/powerpc/fpu.c @@ -64,8 +64,19 @@ save_fpu_int(struct thread *td) * Save the floating-point registers and FPSCR to the PCB */ if (pcb->pcb_flags & PCB_VSX) { - #define SFP(n) __asm ("stxvw4x " #n ", 0,%0" \ +#if _BYTE_ORDER == _BIG_ENDIAN + #define SFP(n) __asm("stxvw4x " #n ", 0,%0" \ :: "b"(&pcb->pcb_fpu.fpr[n])); +#else + /* + * stxvw4x will swap words within the FP double word on LE systems, + * leading to corruption if VSX is used to store state and FP is + * subsequently used to restore state. + * Use stxvd2x instead. + */ + #define SFP(n) __asm("stxvd2x " #n ", 0,%0" \ + :: "b"(&pcb->pcb_fpu.fpr[n])); +#endif SFP(0); SFP(1); SFP(2); SFP(3); SFP(4); SFP(5); SFP(6); SFP(7); SFP(8); SFP(9); SFP(10); SFP(11); @@ -76,7 +87,7 @@ save_fpu_int(struct thread *td) SFP(28); SFP(29); SFP(30); SFP(31); #undef SFP } else { - #define SFP(n) __asm ("stfd " #n ", 0(%0)" \ + #define SFP(n) __asm("stfd " #n ", 0(%0)" \ :: "b"(&pcb->pcb_fpu.fpr[n].fpr)); SFP(0); SFP(1); SFP(2); SFP(3); SFP(4); SFP(5); SFP(6); SFP(7); @@ -149,8 +160,19 @@ enable_fpu(struct thread *td) :: "b"(&pcb->pcb_fpu.fpscr)); if (pcb->pcb_flags & PCB_VSX) { - #define LFP(n) __asm ("lxvw4x " #n ", 0,%0" \ +#if _BYTE_ORDER == _BIG_ENDIAN + #define LFP(n) __asm("lxvw4x " #n ", 0,%0" \ + :: "b"(&pcb->pcb_fpu.fpr[n])); +#else + /* + * lxvw4x will swap words within the FP double word on LE systems, + * leading to corruption if FP is used to store state and VSX is + * subsequently used to restore state. + * Use lxvd2x instead. 
+ */ + #define LFP(n) __asm("lxvd2x " #n ", 0,%0" \ :: "b"(&pcb->pcb_fpu.fpr[n])); +#endif LFP(0); LFP(1); LFP(2); LFP(3); LFP(4); LFP(5); LFP(6); LFP(7); LFP(8); LFP(9); LFP(10); LFP(11); @@ -161,7 +183,7 @@ enable_fpu(struct thread *td) LFP(28); LFP(29); LFP(30); LFP(31); #undef LFP } else { - #define LFP(n) __asm ("lfd " #n ", 0(%0)" \ + #define LFP(n) __asm("lfd " #n ", 0(%0)" \ :: "b"(&pcb->pcb_fpu.fpr[n].fpr)); LFP(0); LFP(1); LFP(2); LFP(3); LFP(4); LFP(5); LFP(6); LFP(7); diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner index 423a89c10c78..7a4ff6b9c62e 100644 --- a/sys/riscv/allwinner/files.allwinner +++ b/sys/riscv/allwinner/files.allwinner @@ -1,5 +1,7 @@ arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt +arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt +arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_syscon.c optional syscon arm/allwinner/aw_sid.c optional aw_sid nvmem arm/allwinner/aw_timer.c optional aw_timer fdt diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner index 1bf6b027a4cb..34fe195b01ba 100644 --- a/sys/riscv/conf/std.allwinner +++ b/sys/riscv/conf/std.allwinner @@ -7,6 +7,8 @@ options SOC_ALLWINNER_D1 device aw_ccu # Allwinner clock controller device aw_gpio # Allwinner GPIO controller +device aw_mmc # Allwinner SD/MMC controller +device aw_rtc # Allwinner Real-time Clock device aw_sid # Allwinner Secure ID EFUSE device aw_timer # Allwinner Timer device aw_usbphy # Allwinner USB PHY diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 5d15bd671285..26efaecc64d1 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -4838,6 +4838,8 @@ pmap_unmapbios(void *p, vm_size_t size) void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { + if (m->md.pv_memattr == ma) + return; m->md.pv_memattr = ma; diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c index 9e87af578885..44b63e38a8e6 100644 --- a/sys/rpc/clnt_rc.c +++ b/sys/rpc/clnt_rc.c @@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl) newclient = clnt_vc_create(so, (struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers, rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr); + /* + * CLSET_FD_CLOSE must be done now, in case rpctls_connect() + * fails just below. + */ + if (newclient != NULL) + CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0); if (rc->rc_tls && newclient != NULL) { CURVNET_SET(so->so_vnet); stat = rpctls_connect(newclient, rc->rc_tlscertname, so, @@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl) goto out; } - CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0); CLNT_CONTROL(newclient, CLSET_CONNECT, &one); CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout); CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry); diff --git a/sys/rpc/rpcsec_gss/rpcsec_gss.c b/sys/rpc/rpcsec_gss/rpcsec_gss.c index 62c71937a185..983dd251f81f 100644 --- a/sys/rpc/rpcsec_gss/rpcsec_gss.c +++ b/sys/rpc/rpcsec_gss/rpcsec_gss.c @@ -67,6 +67,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/hash.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/kobj.h> #include <sys/lock.h> @@ -772,6 +773,17 @@ rpc_gss_init(AUTH *auth, rpc_gss_options_ret_t *options_ret) gd->gd_cred.gc_seq = 0; /* + * XXX Threads from inside jails can get here via calls + * to clnt_vc_call()->AUTH_REFRESH()->rpc_gss_refresh() + * but the NFS mount is always done outside of the + * jails in vnet0. 
Since the thread credentials won't + * necessarily have cr_prison == vnet0 and this function + * has no access to the socket, using vnet0 seems the + * only option. This is broken if NFS mounts are enabled + * within vnet prisons. + */ + KGSS_CURVNET_SET_QUIET(vnet0); + /* * For KerberosV, if there is a client principal name, that implies * that this is a host based initiator credential in the default * keytab file. For this case, it is necessary to do a @@ -994,12 +1006,14 @@ out: gss_delete_sec_context(&min_stat, &gd->gd_ctx, GSS_C_NO_BUFFER); } + KGSS_CURVNET_RESTORE(); mtx_lock(&gd->gd_lock); gd->gd_state = RPCSEC_GSS_START; wakeup(gd); mtx_unlock(&gd->gd_lock); return (FALSE); } + KGSS_CURVNET_RESTORE(); mtx_lock(&gd->gd_lock); gd->gd_state = RPCSEC_GSS_ESTABLISHED; diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c index 93fe283e65fd..51fe270b13d9 100644 --- a/sys/rpc/rpcsec_tls/rpctls_impl.c +++ b/sys/rpc/rpcsec_tls/rpctls_impl.c @@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so) * failed to do the handshake. */ mtx_unlock(&rpctls_lock); + /* + * Do a shutdown on the socket, since the daemon is + * probably stuck in SSL_accept() or SSL_connect() trying to + * read the socket. Do not soclose() the socket, since the + * daemon will close() the socket after SSL_accept() + * returns an error. + */ + soshutdown(so, SHUT_RD); } } diff --git a/sys/sys/caprights.h b/sys/sys/caprights.h index 48c75afc62a0..6a5a17eda5ee 100644 --- a/sys/sys/caprights.h +++ b/sys/sys/caprights.h @@ -79,6 +79,8 @@ extern const cap_rights_t cap_futimes_rights; extern const cap_rights_t cap_getpeername_rights; extern const cap_rights_t cap_getsockopt_rights; extern const cap_rights_t cap_getsockname_rights; +extern const cap_rights_t cap_inotify_add_rights; +extern const cap_rights_t cap_inotify_rm_rights; extern const cap_rights_t cap_ioctl_rights; extern const cap_rights_t cap_linkat_source_rights; extern const cap_rights_t cap_linkat_target_rights; diff --git a/sys/sys/capsicum.h b/sys/sys/capsicum.h index d493535454e9..3847c4c73e75 100644 --- a/sys/sys/capsicum.h +++ b/sys/sys/capsicum.h @@ -279,11 +279,15 @@ #define CAP_KQUEUE (CAP_KQUEUE_EVENT | CAP_KQUEUE_CHANGE) +/* Allows operations on inotify descriptors. */ +#define CAP_INOTIFY_ADD CAPRIGHT(1, 0x0000000000200000ULL) +#define CAP_INOTIFY_RM CAPRIGHT(1, 0x0000000000400000ULL) + /* All used bits for index 1. */ -#define CAP_ALL1 CAPRIGHT(1, 0x00000000001FFFFFULL) +#define CAP_ALL1 CAPRIGHT(1, 0x00000000007FFFFFULL) /* Available bits for index 1. */ -#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000200000ULL) +#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000800000ULL) /* ... 
*/ #define CAP_UNUSED1_57 CAPRIGHT(1, 0x0100000000000000ULL) diff --git a/sys/sys/efi.h b/sys/sys/efi.h index 95a433a950db..89c8b15519de 100644 --- a/sys/sys/efi.h +++ b/sys/sys/efi.h @@ -42,6 +42,8 @@ {0xb122a263,0x3661,0x4f68,{0x99,0x29,0x78,0xf8,0xb0,0xd6,0x21,0x80}} #define EFI_PROPERTIES_TABLE \ {0x880aaca3,0x4adc,0x4a04,{0x90,0x79,0xb7,0x47,0x34,0x08,0x25,0xe5}} +#define EFI_MEMORY_ATTRIBUTES_TABLE \ + {0xdcfa911d,0x26eb,0x469f,{0xa2,0x20,0x38,0xb7,0xdc,0x46,0x12,0x20}} #define LINUX_EFI_MEMRESERVE_TABLE \ {0x888eb0c6,0x8ede,0x4ff5,{0xa8,0xf0,0x9a,0xee,0x5c,0xb9,0x77,0xc2}} @@ -166,6 +168,22 @@ struct efi_prop_table { uint64_t memory_protection_attribute; }; +struct efi_memory_descriptor { + uint32_t type; + caddr_t phy_addr; + caddr_t virt_addr; + uint64_t pages; + uint64_t attrs; +}; + +struct efi_memory_attribute_table { + uint32_t version; + uint32_t num_ents; + uint32_t descriptor_size; + uint32_t flags; + struct efi_memory_descriptor tables[]; +}; + #ifdef _KERNEL #ifdef EFIABI_ATTR diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h index 87460aae2dd4..efda38279848 100644 --- a/sys/sys/elf_common.h +++ b/sys/sys/elf_common.h @@ -306,7 +306,7 @@ typedef struct { and MPRC of Peking University */ #define EM_AARCH64 183 /* AArch64 (64-bit ARM) */ #define EM_RISCV 243 /* RISC-V */ -#define EM_LOONGARCH 258 /* Loongson LoongArch */ +#define EM_LOONGARCH 258 /* Loongson LoongArch */ /* Non-standard or deprecated. */ #define EM_486 6 /* Intel i486. */ @@ -392,15 +392,15 @@ typedef struct { */ /* LoongArch Base ABI Modifiers */ -#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001 -#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002 -#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003 -#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007 +#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001 +#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002 +#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003 +#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007 /* LoongArch Object file ABI versions */ -#define EF_LOONGARCH_OBJABI_V0 0x00000000 -#define EF_LOONGARCH_OBJABI_V1 0x00000040 -#define EF_LOONGARCH_OBJABI_MASK 0x000000C0 +#define EF_LOONGARCH_OBJABI_V0 0x00000000 +#define EF_LOONGARCH_OBJABI_V1 0x00000040 +#define EF_LOONGARCH_OBJABI_MASK 0x000000C0 #define EF_SPARC_EXT_MASK 0x00ffff00 #define EF_SPARC_32PLUS 0x00000100 @@ -470,12 +470,12 @@ typedef struct { #define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */ #define SHT_LOPROC 0x70000000 /* reserved range for processor */ #define SHT_X86_64_UNWIND 0x70000001 /* unwind information */ -#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND +#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND #define SHT_ARM_EXIDX 0x70000001 /* Exception index table. */ -#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking +#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking pre-emption map. */ -#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility +#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility attributes. */ #define SHT_ARM_DEBUGOVERLAY 0x70000004 /* See DBGOVL for details. */ #define SHT_ARM_OVERLAYSECTION 0x70000005 /* See DBGOVL for details. 
*/ @@ -791,7 +791,7 @@ typedef struct { #define DF_1_NODELETE 0x00000008 /* Set the RTLD_NODELETE for object */ #define DF_1_LOADFLTR 0x00000010 /* Immediate loading of filtees */ #define DF_1_INITFIRST 0x00000020 /* Initialize DSO first at runtime */ -#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */ +#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */ #define DF_1_ORIGIN 0x00000080 /* Process $ORIGIN */ #define DF_1_INTERPOSE 0x00000400 /* Interpose all objects but main */ #define DF_1_NODEFLIB 0x00000800 /* Do not search default paths */ @@ -908,7 +908,7 @@ typedef struct { #define STV_ELIMINATE 0x6 /* Architecture specific data - st_other */ -#define STO_AARCH64_VARIANT_PCS 0x80 +#define STO_AARCH64_VARIANT_PCS 0x80 /* Special symbol table indexes. */ #define STN_UNDEF 0 /* Undefined symbol index. */ @@ -1084,11 +1084,11 @@ typedef struct { #define R_AARCH64_COPY 1024 /* Copy data from shared object */ #define R_AARCH64_GLOB_DAT 1025 /* Set GOT entry to data address */ #define R_AARCH64_JUMP_SLOT 1026 /* Set GOT entry to code address */ -#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */ +#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */ #define R_AARCH64_TLS_DTPREL64 1028 #define R_AARCH64_TLS_DTPMOD64 1029 -#define R_AARCH64_TLS_TPREL64 1030 -#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */ +#define R_AARCH64_TLS_TPREL64 1030 +#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */ #define R_AARCH64_IRELATIVE 1032 #define R_ARM_NONE 0 /* No relocation. */ @@ -1231,8 +1231,8 @@ typedef struct { #define R_MIPS_GOT_HI16 22 /* GOT HI 16 bit */ #define R_MIPS_GOT_LO16 23 /* GOT LO 16 bit */ #define R_MIPS_SUB 24 -#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */ -#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */ +#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */ +#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */ #define R_MIPS_JALR 37 #define R_MIPS_TLS_GD 42 #define R_MIPS_COPY 126 @@ -1352,7 +1352,6 @@ typedef struct { * RISC-V relocation types. */ -/* Relocation types used by the dynamic linker. */ #define R_RISCV_NONE 0 #define R_RISCV_32 1 #define R_RISCV_64 2 @@ -1365,8 +1364,7 @@ typedef struct { #define R_RISCV_TLS_DTPREL64 9 #define R_RISCV_TLS_TPREL32 10 #define R_RISCV_TLS_TPREL64 11 - -/* Relocation types not used by the dynamic linker. */ +#define R_RISCV_TLSDESC 12 #define R_RISCV_BRANCH 16 #define R_RISCV_JAL 17 #define R_RISCV_CALL 18 @@ -1392,10 +1390,10 @@ typedef struct { #define R_RISCV_SUB16 38 #define R_RISCV_SUB32 39 #define R_RISCV_SUB64 40 +#define R_RISCV_GOT32_PCREL 41 #define R_RISCV_ALIGN 43 #define R_RISCV_RVC_BRANCH 44 #define R_RISCV_RVC_JUMP 45 -#define R_RISCV_RVC_LUI 46 #define R_RISCV_RELAX 51 #define R_RISCV_SUB6 52 #define R_RISCV_SET6 53 @@ -1404,6 +1402,14 @@ typedef struct { #define R_RISCV_SET32 56 #define R_RISCV_32_PCREL 57 #define R_RISCV_IRELATIVE 58 +#define R_RISCV_PLT32 59 +#define R_RISCV_SET_ULEB128 60 +#define R_RISCV_SUB_ULEB128 61 +#define R_RISCV_TLSDESC_HI20 62 +#define R_RISCV_TLSDESC_LOAD_LO12 63 +#define R_RISCV_TLSDESC_ADD_LO12 64 +#define R_RISCV_TLSDESC_CALL 65 +#define R_RISCV_VENDOR 191 /* * Loongson LoongArch relocation types. 
@@ -1413,101 +1419,101 @@ typedef struct { */ /* Relocation types used by the dynamic linker */ -#define R_LARCH_NONE 0 -#define R_LARCH_32 1 -#define R_LARCH_64 2 -#define R_LARCH_RELATIVE 3 -#define R_LARCH_COPY 4 -#define R_LARCH_JUMP_SLOT 5 -#define R_LARCH_TLS_DTPMOD32 6 -#define R_LARCH_TLS_DTPMOD64 7 -#define R_LARCH_TLS_DTPREL32 8 -#define R_LARCH_TLS_DTPREL64 9 -#define R_LARCH_TLS_TPREL32 10 -#define R_LARCH_TLS_TPREL64 11 -#define R_LARCH_IRELATIVE 12 -#define R_LARCH_MARK_LA 20 -#define R_LARCH_MARK_PCREL 21 -#define R_LARCH_SOP_PUSH_PCREL 22 -#define R_LARCH_SOP_PUSH_ABSOLUTE 23 -#define R_LARCH_SOP_PUSH_DUP 24 -#define R_LARCH_SOP_PUSH_GPREL 25 -#define R_LARCH_SOP_PUSH_TLS_TPREL 26 -#define R_LARCH_SOP_PUSH_TLS_GOT 27 -#define R_LARCH_SOP_PUSH_TLS_GD 28 -#define R_LARCH_SOP_PUSH_PLT_PCREL 29 -#define R_LARCH_SOP_ASSERT 30 -#define R_LARCH_SOP_NOT 31 -#define R_LARCH_SOP_SUB 32 -#define R_LARCH_SOP_SL 33 -#define R_LARCH_SOP_SR 34 -#define R_LARCH_SOP_ADD 35 -#define R_LARCH_SOP_AND 36 -#define R_LARCH_SOP_IF_ELSE 37 -#define R_LARCH_SOP_POP_32_S_10_5 38 -#define R_LARCH_SOP_POP_32_U_10_12 39 -#define R_LARCH_SOP_POP_32_S_10_12 40 -#define R_LARCH_SOP_POP_32_S_10_16 41 -#define R_LARCH_SOP_POP_32_S_10_16_S2 42 -#define R_LARCH_SOP_POP_32_S_5_20 43 -#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44 -#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45 -#define R_LARCH_SOP_POP_32_U 46 -#define R_LARCH_ADD8 47 -#define R_LARCH_ADD16 48 -#define R_LARCH_ADD24 49 -#define R_LARCH_ADD32 50 -#define R_LARCH_ADD64 51 -#define R_LARCH_SUB8 52 -#define R_LARCH_SUB16 53 -#define R_LARCH_SUB24 54 -#define R_LARCH_SUB32 55 -#define R_LARCH_SUB64 56 -#define R_LARCH_GNU_VTINHERIT 57 -#define R_LARCH_GNU_VTENTRY 58 +#define R_LARCH_NONE 0 +#define R_LARCH_32 1 +#define R_LARCH_64 2 +#define R_LARCH_RELATIVE 3 +#define R_LARCH_COPY 4 +#define R_LARCH_JUMP_SLOT 5 +#define R_LARCH_TLS_DTPMOD32 6 +#define R_LARCH_TLS_DTPMOD64 7 +#define R_LARCH_TLS_DTPREL32 8 +#define R_LARCH_TLS_DTPREL64 9 +#define R_LARCH_TLS_TPREL32 10 +#define R_LARCH_TLS_TPREL64 11 +#define R_LARCH_IRELATIVE 12 +#define R_LARCH_MARK_LA 20 +#define R_LARCH_MARK_PCREL 21 +#define R_LARCH_SOP_PUSH_PCREL 22 +#define R_LARCH_SOP_PUSH_ABSOLUTE 23 +#define R_LARCH_SOP_PUSH_DUP 24 +#define R_LARCH_SOP_PUSH_GPREL 25 +#define R_LARCH_SOP_PUSH_TLS_TPREL 26 +#define R_LARCH_SOP_PUSH_TLS_GOT 27 +#define R_LARCH_SOP_PUSH_TLS_GD 28 +#define R_LARCH_SOP_PUSH_PLT_PCREL 29 +#define R_LARCH_SOP_ASSERT 30 +#define R_LARCH_SOP_NOT 31 +#define R_LARCH_SOP_SUB 32 +#define R_LARCH_SOP_SL 33 +#define R_LARCH_SOP_SR 34 +#define R_LARCH_SOP_ADD 35 +#define R_LARCH_SOP_AND 36 +#define R_LARCH_SOP_IF_ELSE 37 +#define R_LARCH_SOP_POP_32_S_10_5 38 +#define R_LARCH_SOP_POP_32_U_10_12 39 +#define R_LARCH_SOP_POP_32_S_10_12 40 +#define R_LARCH_SOP_POP_32_S_10_16 41 +#define R_LARCH_SOP_POP_32_S_10_16_S2 42 +#define R_LARCH_SOP_POP_32_S_5_20 43 +#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44 +#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45 +#define R_LARCH_SOP_POP_32_U 46 +#define R_LARCH_ADD8 47 +#define R_LARCH_ADD16 48 +#define R_LARCH_ADD24 49 +#define R_LARCH_ADD32 50 +#define R_LARCH_ADD64 51 +#define R_LARCH_SUB8 52 +#define R_LARCH_SUB16 53 +#define R_LARCH_SUB24 54 +#define R_LARCH_SUB32 55 +#define R_LARCH_SUB64 56 +#define R_LARCH_GNU_VTINHERIT 57 +#define R_LARCH_GNU_VTENTRY 58 /* * Relocs whose processing do not require a stack machine. 
* * Spec addition: https://github.com/loongson/LoongArch-Documentation/pull/57 */ -#define R_LARCH_B16 64 -#define R_LARCH_B21 65 -#define R_LARCH_B26 66 -#define R_LARCH_ABS_HI20 67 -#define R_LARCH_ABS_LO12 68 -#define R_LARCH_ABS64_LO20 69 -#define R_LARCH_ABS64_HI12 70 -#define R_LARCH_PCALA_HI20 71 -#define R_LARCH_PCALA_LO12 72 -#define R_LARCH_PCALA64_LO20 73 -#define R_LARCH_PCALA64_HI12 74 -#define R_LARCH_GOT_PC_HI20 75 -#define R_LARCH_GOT_PC_LO12 76 -#define R_LARCH_GOT64_PC_LO20 77 -#define R_LARCH_GOT64_PC_HI12 78 -#define R_LARCH_GOT_HI20 79 -#define R_LARCH_GOT_LO12 80 -#define R_LARCH_GOT64_LO20 81 -#define R_LARCH_GOT64_HI12 82 -#define R_LARCH_TLS_LE_HI20 83 -#define R_LARCH_TLS_LE_LO12 84 -#define R_LARCH_TLS_LE64_LO20 85 -#define R_LARCH_TLS_LE64_HI12 86 -#define R_LARCH_TLS_IE_PC_HI20 87 -#define R_LARCH_TLS_IE_PC_LO12 88 -#define R_LARCH_TLS_IE64_PC_LO20 89 -#define R_LARCH_TLS_IE64_PC_HI12 90 -#define R_LARCH_TLS_IE_HI20 91 -#define R_LARCH_TLS_IE_LO12 92 -#define R_LARCH_TLS_IE64_LO20 93 -#define R_LARCH_TLS_IE64_HI12 94 -#define R_LARCH_TLS_LD_PC_HI20 95 -#define R_LARCH_TLS_LD_HI20 96 -#define R_LARCH_TLS_GD_PC_HI20 97 -#define R_LARCH_TLS_GD_HI20 98 -#define R_LARCH_32_PCREL 99 -#define R_LARCH_RELAX 100 +#define R_LARCH_B16 64 +#define R_LARCH_B21 65 +#define R_LARCH_B26 66 +#define R_LARCH_ABS_HI20 67 +#define R_LARCH_ABS_LO12 68 +#define R_LARCH_ABS64_LO20 69 +#define R_LARCH_ABS64_HI12 70 +#define R_LARCH_PCALA_HI20 71 +#define R_LARCH_PCALA_LO12 72 +#define R_LARCH_PCALA64_LO20 73 +#define R_LARCH_PCALA64_HI12 74 +#define R_LARCH_GOT_PC_HI20 75 +#define R_LARCH_GOT_PC_LO12 76 +#define R_LARCH_GOT64_PC_LO20 77 +#define R_LARCH_GOT64_PC_HI12 78 +#define R_LARCH_GOT_HI20 79 +#define R_LARCH_GOT_LO12 80 +#define R_LARCH_GOT64_LO20 81 +#define R_LARCH_GOT64_HI12 82 +#define R_LARCH_TLS_LE_HI20 83 +#define R_LARCH_TLS_LE_LO12 84 +#define R_LARCH_TLS_LE64_LO20 85 +#define R_LARCH_TLS_LE64_HI12 86 +#define R_LARCH_TLS_IE_PC_HI20 87 +#define R_LARCH_TLS_IE_PC_LO12 88 +#define R_LARCH_TLS_IE64_PC_LO20 89 +#define R_LARCH_TLS_IE64_PC_HI12 90 +#define R_LARCH_TLS_IE_HI20 91 +#define R_LARCH_TLS_IE_LO12 92 +#define R_LARCH_TLS_IE64_LO20 93 +#define R_LARCH_TLS_IE64_HI12 94 +#define R_LARCH_TLS_LD_PC_HI20 95 +#define R_LARCH_TLS_LD_HI20 96 +#define R_LARCH_TLS_GD_PC_HI20 97 +#define R_LARCH_TLS_GD_HI20 98 +#define R_LARCH_32_PCREL 99 +#define R_LARCH_RELAX 100 /* * Relocs added in ELF for the LoongArch™ Architecture v20230519, part of the @@ -1520,13 +1526,13 @@ typedef struct { * in psABI v2.20 because they were proved not necessary to be exposed outside * of the linker.
*/ -#define R_LARCH_ALIGN 102 -#define R_LARCH_PCREL20_S2 103 -#define R_LARCH_ADD6 105 -#define R_LARCH_SUB6 106 -#define R_LARCH_ADD_ULEB128 107 -#define R_LARCH_SUB_ULEB128 108 -#define R_LARCH_64_PCREL 109 +#define R_LARCH_ALIGN 102 +#define R_LARCH_PCREL20_S2 103 +#define R_LARCH_ADD6 105 +#define R_LARCH_SUB6 106 +#define R_LARCH_ADD_ULEB128 107 +#define R_LARCH_SUB_ULEB128 108 +#define R_LARCH_64_PCREL 109 /* * Relocs added in ELF for the LoongArch™ Architecture v20231102, part of the @@ -1534,7 +1540,7 @@ typedef struct { * * Spec addition: https://github.com/loongson/la-abi-specs/pull/4 */ -#define R_LARCH_CALL36 110 +#define R_LARCH_CALL36 110 /* * Relocs added in ELF for the LoongArch™ Architecture v20231219, part of the @@ -1542,24 +1548,24 @@ typedef struct { * * Spec addition: https://github.com/loongson/la-abi-specs/pull/5 */ -#define R_LARCH_TLS_DESC32 13 -#define R_LARCH_TLS_DESC64 14 -#define R_LARCH_TLS_DESC_PC_HI20 111 -#define R_LARCH_TLS_DESC_PC_LO12 112 -#define R_LARCH_TLS_DESC64_PC_LO20 113 -#define R_LARCH_TLS_DESC64_PC_HI12 114 -#define R_LARCH_TLS_DESC_HI20 115 -#define R_LARCH_TLS_DESC_LO12 116 -#define R_LARCH_TLS_DESC64_LO20 117 -#define R_LARCH_TLS_DESC64_HI12 118 -#define R_LARCH_TLS_DESC_LD 119 -#define R_LARCH_TLS_DESC_CALL 120 -#define R_LARCH_TLS_LE_HI20_R 121 -#define R_LARCH_TLS_LE_ADD_R 122 -#define R_LARCH_TLS_LE_LO12_R 123 -#define R_LARCH_TLS_LD_PCREL20_S2 124 -#define R_LARCH_TLS_GD_PCREL20_S2 125 -#define R_LARCH_TLS_DESC_PCREL20_S2 126 +#define R_LARCH_TLS_DESC32 13 +#define R_LARCH_TLS_DESC64 14 +#define R_LARCH_TLS_DESC_PC_HI20 111 +#define R_LARCH_TLS_DESC_PC_LO12 112 +#define R_LARCH_TLS_DESC64_PC_LO20 113 +#define R_LARCH_TLS_DESC64_PC_HI12 114 +#define R_LARCH_TLS_DESC_HI20 115 +#define R_LARCH_TLS_DESC_LO12 116 +#define R_LARCH_TLS_DESC64_LO20 117 +#define R_LARCH_TLS_DESC64_HI12 118 +#define R_LARCH_TLS_DESC_LD 119 +#define R_LARCH_TLS_DESC_CALL 120 +#define R_LARCH_TLS_LE_HI20_R 121 +#define R_LARCH_TLS_LE_ADD_R 122 +#define R_LARCH_TLS_LE_LO12_R 123 +#define R_LARCH_TLS_LD_PCREL20_S2 124 +#define R_LARCH_TLS_GD_PCREL20_S2 125 +#define R_LARCH_TLS_DESC_PCREL20_S2 126 #define R_SPARC_NONE 0 #define R_SPARC_8 1 diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h index d770c274d7b7..cab94ac511a5 100644 --- a/sys/sys/exterr_cat.h +++ b/sys/sys/exterr_cat.h @@ -16,6 +16,8 @@ #define EXTERR_KTRACE 3 /* To allow inclusion of this file into kern_ktrace.c */ #define EXTERR_CAT_FUSE 4 +#define EXTERR_CAT_INOTIFY 5 +#define EXTERR_CAT_GENIO 6 #endif diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h index 15557c614f88..7bf1d264ff5e 100644 --- a/sys/sys/exterrvar.h +++ b/sys/sys/exterrvar.h @@ -21,6 +21,7 @@ #define EXTERRCTL_ENABLE 1 #define EXTERRCTL_DISABLE 2 +#define EXTERRCTL_UD 3 #define EXTERRCTLF_FORCE 0x00000001 diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index dd9fccf5cf38..18d3928e91c7 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -144,6 +144,10 @@ typedef __pid_t pid_t; #define O_XATTR O_NAMEDATTR /* Solaris compatibility */ #endif +#if __POSIX_VISIBLE >= 202405 +#define O_CLOFORK 0x08000000 +#endif + /* * !!! DANGER !!! * @@ -280,6 +284,16 @@ typedef __pid_t pid_t; #define F_GET_SEALS 20 #define F_ISUNIONSTACK 21 /* Kludge for libc, don't use it.
*/ #define F_KINFO 22 /* Return kinfo_file for this fd */ +#endif /* __BSD_VISIBLE */ + +#if __POSIX_VISIBLE >= 202405 +#define F_DUPFD_CLOFORK 23 /* Like F_DUPFD, but FD_CLOFORK is set */ +#endif + +#if __BSD_VISIBLE +#define F_DUP3FD 24 /* Used with dup3() */ + +#define F_DUP3FD_SHIFT 16 /* Shift used for F_DUP3FD */ /* Seals (F_ADD_SEALS, F_GET_SEALS). */ #define F_SEAL_SEAL 0x0001 /* Prevent adding sealings */ @@ -292,6 +306,9 @@ typedef __pid_t pid_t; #define FD_CLOEXEC 1 /* close-on-exec flag */ #define FD_RESOLVE_BENEATH 2 /* all lookups relative to fd have O_RESOLVE_BENEATH semantics */ +#if __POSIX_VISIBLE >= 202405 +#define FD_CLOFORK 4 /* close-on-fork flag */ +#endif /* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */ #define F_RDLCK 1 /* shared or read lock */ diff --git a/sys/sys/file.h b/sys/sys/file.h index 284d523147b6..63313926c4f0 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -71,6 +71,7 @@ struct nameidata; #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* timerfd */ +#define DTYPE_INOTIFY 15 /* inotify descriptor */ #ifdef _KERNEL diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 55969b2ff4b3..0a388c90de26 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -149,6 +149,7 @@ struct filedesc_to_leader { */ #define UF_EXCLOSE 0x01 /* auto-close on exec */ #define UF_RESOLVE_BENEATH 0x02 /* lookups must be beneath this dir */ +#define UF_FOCLOSE 0x04 /* auto-close on fork */ #ifdef _KERNEL @@ -221,6 +222,7 @@ enum { /* Flags for kern_dup(). */ #define FDDUP_FLAG_CLOEXEC 0x1 /* Atomically set UF_EXCLOSE. */ +#define FDDUP_FLAG_CLOFORK 0x2 /* Atomically set UF_FOCLOSE. */ /* For backward compatibility. */ #define falloc(td, resultfp, resultfd, flags) \ diff --git a/sys/sys/hwt.h b/sys/sys/hwt.h new file mode 100644 index 000000000000..78b774a70f9f --- /dev/null +++ b/sys/sys/hwt.h @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* User-visible header. 
*/ + +#include <sys/param.h> +#include <sys/cpuset.h> +#include <sys/types.h> +#include <sys/hwt_record.h> + +#ifndef _SYS_HWT_H_ +#define _SYS_HWT_H_ + +#define HWT_MAGIC 0x42 +#define HWT_IOC_ALLOC _IOW(HWT_MAGIC, 0x00, struct hwt_alloc) +#define HWT_IOC_START _IOW(HWT_MAGIC, 0x01, struct hwt_start) +#define HWT_IOC_STOP _IOW(HWT_MAGIC, 0x02, struct hwt_stop) +#define HWT_IOC_RECORD_GET _IOW(HWT_MAGIC, 0x03, struct hwt_record_get) +#define HWT_IOC_BUFPTR_GET _IOW(HWT_MAGIC, 0x04, struct hwt_bufptr_get) +#define HWT_IOC_SET_CONFIG _IOW(HWT_MAGIC, 0x05, struct hwt_set_config) +#define HWT_IOC_WAKEUP _IOW(HWT_MAGIC, 0x06, struct hwt_wakeup) +#define HWT_IOC_SVC_BUF _IOW(HWT_MAGIC, 0x07, struct hwt_svc_buf) + +#define HWT_BACKEND_MAXNAMELEN 256 + +#define HWT_MODE_THREAD 1 +#define HWT_MODE_CPU 2 + +struct hwt_alloc { + size_t bufsize; + int mode; + pid_t pid; /* thread mode */ + cpuset_t *cpu_map; /* cpu mode only */ + size_t cpusetsize; + const char *backend_name; + int *ident; + int kqueue_fd; +} __aligned(16); + +struct hwt_start { + int reserved; +} __aligned(16); + +struct hwt_stop { + int reserved; +} __aligned(16); + +struct hwt_wakeup { + int reserved; +} __aligned(16); + +struct hwt_record_user_entry { + enum hwt_record_type record_type; + union { + /* + * Used for MMAP, EXECUTABLE, INTERP, + * and KERNEL records. + */ + struct { + char fullpath[MAXPATHLEN]; + uintptr_t addr; + uintptr_t baseaddr; + }; + /* Used for BUFFER records. */ + struct { + int buf_id; + int curpage; + vm_offset_t offset; + }; + /* Used for THREAD_* records. */ + int thread_id; + }; +} __aligned(16); + +struct hwt_record_get { + struct hwt_record_user_entry *records; + int *nentries; + int wait; +} __aligned(16); + +struct hwt_bufptr_get { + int *ident; + vm_offset_t *offset; + uint64_t *data; +} __aligned(16); + +struct hwt_set_config { + /* Configuration of ctx. */ + int pause_on_mmap; + + /* The following passed to backend as is. */ + void *config; + size_t config_size; + int config_version; +} __aligned(16); + +struct hwt_svc_buf { + /* The following passed to backend as is. */ + void *data; + size_t data_size; + int data_version; +} __aligned(16); + +#endif /* !_SYS_HWT_H_ */ diff --git a/sys/sys/hwt_record.h b/sys/sys/hwt_record.h new file mode 100644 index 000000000000..8336723f9396 --- /dev/null +++ b/sys/sys/hwt_record.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com> + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* User-visible header. */ + +#ifndef _SYS_HWT_RECORD_H_ +#define _SYS_HWT_RECORD_H_ + +enum hwt_record_type { + HWT_RECORD_MMAP, + HWT_RECORD_MUNMAP, + HWT_RECORD_EXECUTABLE, + HWT_RECORD_KERNEL, + HWT_RECORD_THREAD_CREATE, + HWT_RECORD_THREAD_SET_NAME, + HWT_RECORD_BUFFER +}; + +#ifdef _KERNEL +struct hwt_record_entry { + TAILQ_ENTRY(hwt_record_entry) next; + enum hwt_record_type record_type; + union { + /* + * Used for MMAP, EXECUTABLE, INTERP, + * and KERNEL records. + */ + struct { + char *fullpath; + uintptr_t addr; + uintptr_t baseaddr; + }; + /* Used for BUFFER records. */ + struct { + int buf_id; + int curpage; + vm_offset_t offset; + }; + /* Used for THREAD_* records. */ + int thread_id; + }; +}; +#endif + +#endif /* !_SYS_HWT_RECORD_H_ */ diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h new file mode 100644 index 000000000000..d1f23d5898bb --- /dev/null +++ b/sys/sys/inotify.h @@ -0,0 +1,158 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +#ifndef _INOTIFY_H_ +#define _INOTIFY_H_ + +#include <sys/_types.h> + +/* Flags for inotify_init1(). */ +#define IN_NONBLOCK 0x00000004 /* O_NONBLOCK */ +#define IN_CLOEXEC 0x00100000 /* O_CLOEXEC */ + +struct inotify_event { + int wd; + __uint32_t mask; + __uint32_t cookie; + __uint32_t len; + char name[0]; +}; + +/* Events, set in the mask field. */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 +#define IN_ALL_EVENTS 0x00000fff + +/* Events report only for entries in a watched dir, not the dir itself. */ +#define _IN_DIR_EVENTS (IN_CLOSE_WRITE | IN_DELETE | IN_MODIFY | \ + IN_MOVED_FROM | IN_MOVED_TO) + +#ifdef _KERNEL +/* + * An unlink that's done as part of a rename only records IN_DELETE if the + * unlinked vnode itself is watched, and not when the containing directory is + * watched. + */ +#define _IN_MOVE_DELETE 0x40000000 +/* + * Inode link count changes only trigger IN_ATTRIB events if the inode itself is + * watched, and not when the containing directory is watched. + */ +#define _IN_ATTRIB_LINKCOUNT 0x80000000 +#endif + +/* Flags, set in the mask field. */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_CREATE 0x10000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ONESHOT 0x80000000 +#define _IN_ALL_FLAGS (IN_ONLYDIR | IN_DONT_FOLLOW | \ + IN_EXCL_UNLINK | IN_MASK_CREATE | \ + IN_MASK_ADD | IN_ONESHOT) + +/* Flags returned by the kernel. 
*/ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 +#define IN_ISDIR 0x40000000 +#define _IN_ALL_RETFLAGS (IN_Q_OVERFLOW | IN_UNMOUNT | IN_IGNORED | \ + IN_ISDIR) + +#define _IN_ALIGN _Alignof(struct inotify_event) +#define _IN_NAMESIZE(namelen) \ + ((namelen) == 0 ? 0 : __align_up((namelen) + 1, _IN_ALIGN)) + +#ifdef _KERNEL +struct componentname; +struct file; +struct inotify_softc; +struct thread; +struct vnode; + +int inotify_create_file(struct thread *, struct file *, int, int *); +void inotify_log(struct vnode *, const char *, size_t, int, __uint32_t); + +int kern_inotify_rm_watch(int, uint32_t, struct thread *); +int kern_inotify_add_watch(int, int, const char *, uint32_t, + struct thread *); + +void vn_inotify(struct vnode *, struct vnode *, struct componentname *, int, + uint32_t); +int vn_inotify_add_watch(struct vnode *, struct inotify_softc *, + __uint32_t, __uint32_t *, struct thread *); +void vn_inotify_revoke(struct vnode *); + +/* Log an inotify event. */ +#define INOTIFY(vp, ev) do { \ + if (__predict_false((vn_irflag_read(vp) & (VIRF_INOTIFY | \ + VIRF_INOTIFY_PARENT)) != 0)) \ + VOP_INOTIFY((vp), NULL, NULL, (ev), 0); \ +} while (0) + +/* Log an inotify event using a specific name for the vnode. */ +#define INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do { \ + if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) { \ + if (lock) \ + vn_lock((vp), LK_SHARED | LK_RETRY); \ + VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \ + if (lock) \ + VOP_UNLOCK(vp); \ + } \ +} while (0) +#define INOTIFY_NAME(vp, dvp, cnp, ev) \ + INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false) + +extern __uint32_t inotify_rename_cookie; + +#define INOTIFY_MOVE(vp, fdvp, fcnp, tvp, tdvp, tcnp) do { \ + if (__predict_false((vn_irflag_read(fdvp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(tdvp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) { \ + __uint32_t cookie; \ + \ + cookie = atomic_fetchadd_32(&inotify_rename_cookie, 1); \ + VOP_INOTIFY((vp), (fdvp), (fcnp), IN_MOVED_FROM, cookie); \ + VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \ + } \ + if ((tvp) != NULL) \ + INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp), \ + _IN_MOVE_DELETE, true); \ +} while (0) + +#define INOTIFY_REVOKE(vp) do { \ + if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) \ + vn_inotify_revoke((vp)); \ +} while (0) + +#else +#include <sys/cdefs.h> + +__BEGIN_DECLS +int inotify_init(void); +int inotify_init1(int flags); +int inotify_add_watch(int fd, const char *pathname, __uint32_t mask); +int inotify_add_watch_at(int fd, int dfd, const char *pathname, + __uint32_t mask); +int inotify_rm_watch(int fd, int wd); +__END_DECLS +#endif /* !_KERNEL */ + +#endif /* !_INOTIFY_H_ */ diff --git a/sys/sys/mount.h b/sys/sys/mount.h index a6f858e02395..f6480b173a5c 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -267,6 +267,7 @@ struct mount { int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */ int mnt_upper_pending; /* (i) # of pending ops on mnt_uppers */ struct lock mnt_explock; /* vfs_export walkers lock */ + struct lock mnt_renamelock; /* renames and O_RESOLVE_BENEATH */ TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */ TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */ STAILQ_ENTRY(mount) mnt_taskqueue_link; /* (d) our place in deferred unmount list */ diff --git a/sys/sys/namei.h b/sys/sys/namei.h index 5c245235ace5..6008d83f729d 
100644 --- a/sys/sys/namei.h +++ b/sys/sys/namei.h @@ -108,7 +108,12 @@ struct nameidata { * through the VOP interface. */ struct componentname ni_cnd; + + /* Serving RBENEATH. */ struct nameicap_tracker_head ni_cap_tracker; + struct vnode *ni_rbeneath_dpp; + struct mount *ni_nctrack_mnt; + /* * Private helper data for UFS, must be at the end. See * NDINIT_PREFILL(). @@ -235,6 +240,10 @@ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, panic("namei data not inited"); \ if (((arg)->ni_debugflags & NAMEI_DBG_HADSTARTDIR) != 0) \ panic("NDREINIT on namei data with NAMEI_DBG_HADSTARTDIR"); \ + if ((arg)->ni_nctrack_mnt != NULL) \ + panic("NDREINIT on namei data with leaked ni_nctrack_mnt"); \ + if (!TAILQ_EMPTY(&(arg)->ni_cap_tracker)) \ + panic("NDREINIT on namei data with leaked ni_cap_tracker"); \ (arg)->ni_debugflags = NAMEI_DBG_INITED; \ } #else @@ -259,6 +268,9 @@ do { \ _ndp->ni_resflags = 0; \ filecaps_init(&_ndp->ni_filecaps); \ _ndp->ni_rightsneeded = _rightsp; \ + _ndp->ni_rbeneath_dpp = NULL; \ + _ndp->ni_nctrack_mnt = NULL; \ + TAILQ_INIT(&_ndp->ni_cap_tracker); \ } while (0) #define NDREINIT(ndp) do { \ diff --git a/sys/sys/param.h b/sys/sys/param.h index f1bf874cb5fd..f941f021a423 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -74,7 +74,7 @@ * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1500049 +#define __FreeBSD_version 1500054 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/proc.h b/sys/sys/proc.h index c7e1a1f51cb4..af9cafa99dd0 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -893,6 +893,8 @@ struct proc { #define P2_LOGSIGEXIT_ENABLE 0x00800000 /* Disable logging on sigexit */ #define P2_LOGSIGEXIT_CTL 0x01000000 /* Override kern.logsigexit */ +#define P2_HWT 0x02000000 /* Process is using HWT. */ + /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan diff --git a/sys/sys/random.h b/sys/sys/random.h index 254ba9451d0a..5abf762cd200 100644 --- a/sys/sys/random.h +++ b/sys/sys/random.h @@ -85,7 +85,8 @@ enum random_entropy_source { RANDOM_FS_ATIME, RANDOM_UMA, /* Special!! UMA/SLAB Allocator */ RANDOM_CALLOUT, - RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT, + RANDOM_RANDOMDEV, + RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV, /* Fast hardware random-number sources from here on. 
*/ RANDOM_PURE_START, RANDOM_PURE_OCTEON = RANDOM_PURE_START, diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h index b15dace8cfa0..61411890c85b 100644 --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -122,6 +122,8 @@ struct uidinfo { long ui_kqcnt; /* (b) number of kqueues */ long ui_umtxcnt; /* (b) number of shared umtxs */ long ui_pipecnt; /* (b) consumption of pipe buffers */ + long ui_inotifycnt; /* (b) number of inotify descriptors */ + long ui_inotifywatchcnt; /* (b) number of inotify watches */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT @@ -144,6 +146,8 @@ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgpipecnt(struct uidinfo *uip, int diff, rlim_t max); +int chginotifycnt(struct uidinfo *uip, int diff, rlim_t maxval); +int chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t maxval); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 5e7c554c34cf..cdd4fa3b4b89 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -111,10 +111,11 @@ typedef __uintptr_t uintptr_t; */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 +#define SOCK_CLOFORK 0x40000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition - * to SOCK_CLOEXEC and SOCK_NONBLOCK. + * to SOCK_CLOEXEC, SOCK_CLOFORK and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 @@ -478,6 +479,9 @@ struct msghdr { #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #define MSG_TLSAPPDATA 0x00200000 /* do not soreceive() alert rec. (TLS) */ #endif +#if __BSD_VISIBLE +#define MSG_CMSG_CLOFORK 0x00400000 /* make received fds close-on-fork */ +#endif /* * Header for ancillary data objects in msg_control buffer. 
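[Editor's note] The O_CLOFORK, FD_CLOFORK, F_DUPFD_CLOFORK, SOCK_CLOFORK and MSG_CMSG_CLOFORK definitions added above implement the POSIX.1-2024 close-on-fork facility: the flag mirrors FD_CLOEXEC, except that the descriptor is dropped at fork() rather than at exec(), and MSG_CMSG_CLOFORK marks descriptors received over SCM_RIGHTS the same way. A minimal userspace sketch of the intended usage follows; it is illustrative only and assumes libc exposes these constants exactly as declared in the headers above.

#include <err.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
	int fd, flags, s;

	/* Request close-on-fork at open() time. */
	fd = open("/tmp/scratch", O_RDWR | O_CREAT | O_CLOFORK, 0600);
	if (fd == -1)
		err(1, "open");

	/* Or set it after the fact, exactly like FD_CLOEXEC. */
	flags = fcntl(fd, F_GETFD);
	if (flags == -1 || fcntl(fd, F_SETFD, flags | FD_CLOFORK) == -1)
		err(1, "fcntl");

	/* Sockets get the matching creation-time flag. */
	s = socket(AF_LOCAL, SOCK_STREAM | SOCK_CLOFORK, 0);
	if (s == -1)
		err(1, "socket");

	if (fork() == 0) {
		/* fd and s were closed on fork; this write fails with EBADF. */
		_exit(write(fd, "x", 1) == -1 ? 0 : 1);
	}
	return (0);
}

Unlike close-on-exec, the flag survives exec() in the holding process, so a daemon can keep such descriptors across re-exec while still guaranteeing they never leak into forked children.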
diff --git a/sys/sys/specialfd.h b/sys/sys/specialfd.h index dc4d88ce689f..0b79c841d149 100644 --- a/sys/sys/specialfd.h +++ b/sys/sys/specialfd.h @@ -30,6 +30,7 @@ enum specialfd_type { SPECIALFD_EVENTFD = 1, + SPECIALFD_INOTIFY = 2, }; struct specialfd_eventfd { @@ -37,4 +38,8 @@ struct specialfd_eventfd { int flags; }; +struct specialfd_inotify { + int flags; +}; + #endif /* !_SYS_SPECIALFD_H_ */ diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index 68406a2dfc29..eec923d0b82e 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -529,4 +529,6 @@ #define SYS_fchroot 590 #define SYS_setcred 591 #define SYS_exterrctl 592 -#define SYS_MAXSYSCALL 593 +#define SYS_inotify_add_watch_at 593 +#define SYS_inotify_rm_watch 594 +#define SYS_MAXSYSCALL 595 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index 9a90a63f35a3..547242a73277 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -434,4 +434,6 @@ MIASM = \ getrlimitusage.o \ fchroot.o \ setcred.o \ - exterrctl.o + exterrctl.o \ + inotify_add_watch_at.o \ + inotify_rm_watch.o diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index fe6dd9e14fb4..fd183ffbc7a4 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -257,6 +257,7 @@ int kern_munlock(struct thread *td, uintptr_t addr, size_t size); int kern_munmap(struct thread *td, uintptr_t addr, size_t size); int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt); +int kern_nosys(struct thread *td, int dummy); int kern_ntp_adjtime(struct thread *td, struct timex *ntv, int *retvalp); int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff); diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h index 6314b03142e7..4ddfc8516053 100644 --- a/sys/sys/sysent.h +++ b/sys/sys/sysent.h @@ -79,11 +79,10 @@ struct sysent { /* system call table */ */ #define SYF_CAPENABLED 0x00000001 -#define SY_THR_FLAGMASK 0x7 -#define SY_THR_STATIC 0x1 -#define SY_THR_DRAINING 0x2 -#define SY_THR_ABSENT 0x4 -#define SY_THR_INCR 0x8 +#define SY_THR_STATIC 0x01 +#define SY_THR_DRAINING 0x02 +#define SY_THR_ABSENT 0x04 +#define SY_THR_INCR 0x08 #ifdef KLD_MODULE #define SY_THR_STATIC_KLD 0 diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 94da81c84d25..94b5a0a7a95e 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1891,6 +1891,16 @@ struct exterrctl_args { char flags_l_[PADL_(u_int)]; u_int flags; char flags_r_[PADR_(u_int)]; char ptr_l_[PADL_(void *)]; void * ptr; char ptr_r_[PADR_(void *)]; }; +struct inotify_add_watch_at_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char dfd_l_[PADL_(int)]; int dfd; char dfd_r_[PADR_(int)]; + char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; + char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)]; +}; +struct inotify_rm_watch_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char wd_l_[PADL_(int)]; int wd; char wd_r_[PADR_(int)]; +}; int sys_exit(struct thread *, struct exit_args *); int sys_fork(struct thread *, struct fork_args *); int sys_read(struct thread *, struct read_args *); @@ -2293,6 +2303,8 @@ int sys_getrlimitusage(struct thread *, struct getrlimitusage_args *); int sys_fchroot(struct thread *, struct fchroot_args *); int sys_setcred(struct thread *, struct setcred_args *); int sys_exterrctl(struct thread *, struct exterrctl_args *); +int sys_inotify_add_watch_at(struct thread *, struct inotify_add_watch_at_args *); +int sys_inotify_rm_watch(struct thread *, struct 
inotify_rm_watch_args *); #ifdef COMPAT_43 @@ -3275,6 +3287,8 @@ int freebsd13_swapoff(struct thread *, struct freebsd13_swapoff_args *); #define SYS_AUE_fchroot AUE_NULL #define SYS_AUE_setcred AUE_SETCRED #define SYS_AUE_exterrctl AUE_NULL +#define SYS_AUE_inotify_add_watch_at AUE_INOTIFY +#define SYS_AUE_inotify_rm_watch AUE_INOTIFY #undef PAD_ #undef PADL_ diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h index f5caea2e3919..c291c1dc2b95 100644 --- a/sys/sys/unistd.h +++ b/sys/sys/unistd.h @@ -156,6 +156,7 @@ #define _PC_DEALLOC_PRESENT 65 #define _PC_NAMEDATTR_ENABLED 66 #define _PC_HAS_NAMEDATTR 67 +#define _PC_HAS_HIDDENSYSTEM 68 #endif /* From OpenSolaris, used by SEEK_DATA/SEEK_HOLE. */ @@ -210,6 +211,7 @@ * close_range() options. */ #define CLOSE_RANGE_CLOEXEC (1<<2) +#define CLOSE_RANGE_CLOFORK (1<<3) #endif /* __BSD_VISIBLE */ diff --git a/sys/sys/user.h b/sys/sys/user.h index f94a91ca1238..103236b6ed1b 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -265,6 +265,7 @@ struct user { #define KF_TYPE_DEV 12 #define KF_TYPE_EVENTFD 13 #define KF_TYPE_TIMERFD 14 +#define KF_TYPE_INOTIFY 15 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -456,6 +457,10 @@ struct kinfo_file { int32_t kf_kqueue_count; int32_t kf_kqueue_state; } kf_kqueue; + struct { + uint64_t kf_inotify_npending; + uint64_t kf_inotify_nbpending; + } kf_inotify; } kf_un; }; uint16_t kf_status; /* Status flags. */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index bed20f607339..2c6947103c94 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -86,11 +86,13 @@ enum vgetstate { * it from v_data. If non-null, this area is freed in getnewvnode(). */ -struct namecache; struct cache_fpl; +struct inotify_watch; +struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ + TAILQ_HEAD(, inotify_watch) vpi_inotify; /* list of inotify watchers */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ @@ -248,6 +250,9 @@ _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes"); #define VIRF_CROSSMP 0x0010 /* Cross-mp vnode, no locking */ #define VIRF_NAMEDDIR 0x0020 /* Named attribute directory */ #define VIRF_NAMEDATTR 0x0040 /* Named attribute */ +#define VIRF_INOTIFY 0x0080 /* This vnode is being watched */ +#define VIRF_INOTIFY_PARENT 0x0100 /* A parent of this vnode may be being + watched */ #define VI_UNUSED0 0x0001 /* unused */ #define VI_MOUNT 0x0002 /* Mount in progress */ @@ -667,6 +672,7 @@ char *cache_symlink_alloc(size_t size, int flags); void cache_symlink_free(char *string, size_t size); int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len); +void cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie); void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp); void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp); @@ -869,8 +875,10 @@ int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); -int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); +int vop_stdinotify(struct vop_inotify_args *); +int vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *); +int vop_stdioctl(struct vop_ioctl_args *); int vop_stdkqfilter(struct vop_kqfilter_args 
*); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); @@ -910,9 +918,12 @@ int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ +void vop_allocate_post(void *a, int rc); +void vop_copy_file_range_post(void *ap, int rc); void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); +void vop_deallocate_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); @@ -1020,9 +1031,12 @@ void vop_rename_fail(struct vop_rename_args *ap); #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ - if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ - VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ - | (noffset > osize ? NOTE_EXTEND : 0)); \ + if (noffset > ooffset) { \ + if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ + VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE | \ + (noffset > osize ? NOTE_EXTEND : 0)); \ + } \ + INOTIFY((ap)->a_vp, IN_MODIFY); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) diff --git a/sys/tools/vnode_if.awk b/sys/tools/vnode_if.awk index d23c2af9bd9a..e829105197cc 100644 --- a/sys/tools/vnode_if.awk +++ b/sys/tools/vnode_if.awk @@ -193,6 +193,7 @@ if (cfile) { printc(common_head \ "#include <sys/param.h>\n" \ "#include <sys/event.h>\n" \ + "#include <sys/inotify.h>\n" \ "#include <sys/kernel.h>\n" \ "#include <sys/mount.h>\n" \ "#include <sys/sdt.h>\n" \ diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 891e490a7031..75f5fe716c31 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1012,7 +1012,6 @@ ffs_mountfs(struct vnode *odevvp, struct mount *mp, struct thread *td) else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); - sx_init(&ump->um_checkpath_lock, "uchpth"); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; @@ -1182,7 +1181,6 @@ out: } if (ump != NULL) { mtx_destroy(UFS_MTX(ump)); - sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; @@ -1306,7 +1304,6 @@ ffs_unmount(struct mount *mp, int mntflags) vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); - sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index eaf37c58756b..3f9c95e934fc 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -1412,7 +1412,6 @@ ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, vp = tvp = ITOV(target); mp = vp->v_mount; *wait_ino = 0; - sx_assert(&VFSTOUFS(mp)->um_checkpath_lock, SA_XLOCKED); if (target->i_number == source_ino) return (EEXIST); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 9aea01e70951..53fac4b0665e 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1273,9 +1273,9 @@ ufs_rename( struct mount *mp; ino_t ino; seqc_t fdvp_s, fvp_s, tdvp_s, tvp_s; - bool checkpath_locked, want_seqc_end; + bool want_seqc_end; - checkpath_locked = want_seqc_end = false; + want_seqc_end = false; endoff = 0; mp = tdvp->v_mount; @@ -1427,10 +1427,6 @@ relock: } vfs_ref(mp); MPASS(!want_seqc_end); - if (checkpath_locked) { - sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); - checkpath_locked = false; - } VOP_UNLOCK(fdvp); 
VOP_UNLOCK(fvp); vref(tdvp); @@ -1484,8 +1480,6 @@ relock: if (error) goto unlockout; - sx_xlock(&VFSTOUFS(mp)->um_checkpath_lock); - checkpath_locked = true; error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, &ino); /* @@ -1493,8 +1487,6 @@ relock: * everything else and VGET before restarting. */ if (ino) { - sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); - checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); VOP_UNLOCK(tdvp); @@ -1574,9 +1566,6 @@ relock: vn_seqc_write_end(fdvp); want_seqc_end = false; vfs_ref(mp); - MPASS(checkpath_locked); - sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); - checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); vref(tdvp); @@ -1763,9 +1752,6 @@ unlockout: vn_seqc_write_end(fdvp); } - if (checkpath_locked) - sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); - vput(fdvp); vput(fvp); @@ -2734,6 +2720,9 @@ ufs_pathconf( case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; + case _PC_HAS_HIDDENSYSTEM: + *ap->a_retval = 1; + break; default: error = vop_stdpathconf(ap); diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 5c7fa11dae6a..d33b01e4425e 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -97,8 +97,6 @@ struct ufsmount { uint64_t um_maxsymlinklen; /* (c) max size of short symlink */ struct mtx um_lock; /* (c) Protects ufsmount & fs */ - struct sx um_checkpath_lock; /* (c) Protects ufs_checkpath() - result */ struct mount_softdeps *um_softdep; /* (c) softdep mgmt structure */ struct vnode *um_quotas[MAXQUOTAS]; /* (q) pointer to quota files */ struct ucred *um_cred[MAXQUOTAS]; /* (q) quota file access cred */ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 86b75a2d7989..d6bd06226d04 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) #endif } -static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ -static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ +static bool swap_pager_full = true; /* swap space exhaustion (task killing) */ +static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ @@ -642,14 +642,14 @@ swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { - if (swap_pager_almost_full == 0) { + if (!swap_pager_almost_full) { printf("swap_pager: out of swap space\n"); - swap_pager_almost_full = 1; + swap_pager_almost_full = true; } } else { - swap_pager_full = 0; + swap_pager_full = false; if (swap_pager_avail > nswap_hiwat) - swap_pager_almost_full = 0; + swap_pager_almost_full = false; } } @@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages) swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { - if (swap_pager_full != 2) { + if (!swap_pager_full) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); - swap_pager_full = 2; - swap_pager_almost_full = 1; + swap_pager_full = swap_pager_almost_full = true; } swdevhd = NULL; } @@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags) sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; - if (nswapdev == 0) { - swap_pager_full = 2; - swap_pager_almost_full = 1; - } + if (nswapdev == 0) + swap_pager_full = swap_pager_almost_full = true; if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); diff --git a/sys/vm/vm_domainset.c 
b/sys/vm/vm_domainset.c index 7b8bf4c77663..b44bdb96b0d4 100644 --- a/sys/vm/vm_domainset.c +++ b/sys/vm/vm_domainset.c @@ -131,8 +131,7 @@ static void vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) { - KASSERT(di->di_n > 0, - ("vm_domainset_iter_first: Invalid n %d", di->di_n)); + KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); switch (di->di_policy) { case DOMAINSET_POLICY_FIRSTTOUCH: /* @@ -149,11 +148,10 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) vm_domainset_iter_prefer(di, domain); break; default: - panic("vm_domainset_iter_first: Unknown policy %d", - di->di_policy); + panic("%s: Unknown policy %d", __func__, di->di_policy); } KASSERT(*domain < vm_ndomains, - ("vm_domainset_iter_next: Invalid domain %d", *domain)); + ("%s: Invalid domain %d", __func__, *domain)); } static void @@ -189,13 +187,11 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain) di->di_n = di->di_domain->ds_cnt; break; default: - panic("vm_domainset_iter_first: Unknown policy %d", - di->di_policy); + panic("%s: Unknown policy %d", __func__, di->di_policy); } - KASSERT(di->di_n > 0, - ("vm_domainset_iter_first: Invalid n %d", di->di_n)); + KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n)); KASSERT(*domain < vm_ndomains, - ("vm_domainset_iter_first: Invalid domain %d", *domain)); + ("%s: Invalid domain %d", __func__, *domain)); } void diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 21584abacfa3..3e57e8d4f1d0 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1441,8 +1441,7 @@ vm_fault_busy_sleep(struct faultstate *fs) } vm_object_pip_wakeup(fs->object); vm_fault_unlock_map(fs); - if (fs->m != vm_page_lookup(fs->object, fs->pindex) || - !vm_page_busy_sleep(fs->m, "vmpfw", 0)) + if (!vm_page_busy_sleep(fs->m, "vmpfw", 0)) VM_OBJECT_UNLOCK(fs->object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs->first_object); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 875c22d27628..e7d7b6726d2c 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -110,11 +110,18 @@ u_int exec_map_entry_size; u_int exec_map_entries; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, - SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); +#if defined(__amd64__) + &kva_layout.km_low, 0, +#else + SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, +#endif + "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) &vm_max_kernel_address, 0, +#elif defined(__amd64__) + &kva_layout.km_high, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 46fd212df299..501ace32bd11 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -41,6 +41,7 @@ */ #include "opt_hwpmc_hooks.h" +#include "opt_hwt_hooks.h" #include "opt_vm.h" #define EXTERR_CATEGORY EXTERR_CAT_MMAP @@ -95,6 +96,10 @@ #include <sys/pmckern.h> #endif +#ifdef HWT_HOOKS +#include <dev/hwt/hwt_hook.h> +#endif + int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); @@ -613,6 +618,17 @@ kern_munmap(struct thread *td, uintptr_t addr0, size_t size) #endif rv = vm_map_delete(map, addr, end); +#ifdef HWT_HOOKS + if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) { + struct hwt_record_entry ent; + + ent.addr = (uintptr_t) addr; + ent.fullpath = NULL; + ent.record_type = HWT_RECORD_MUNMAP; + HWT_CALL_HOOK(td, HWT_RECORD, &ent); + } +#endif + #ifdef HWPMC_HOOKS if (rv == KERN_SUCCESS && 
__predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index cbbd27389662..9bd3b389fb60 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -260,9 +260,9 @@ struct vm_domain { u_int vmd_inactive_shortage; /* Per-thread shortage. */ blockcount_t vmd_inactive_running; /* Number of inactive threads. */ blockcount_t vmd_inactive_starting; /* Number of threads started. */ - volatile u_int vmd_addl_shortage; /* Shortage accumulator. */ - volatile u_int vmd_inactive_freed; /* Successful inactive frees. */ - volatile u_int vmd_inactive_us; /* Microseconds for above. */ + u_int vmd_addl_shortage; /* (a) Shortage accumulator. */ + u_int vmd_inactive_freed; /* (a) Successful inactive frees. */ + u_int vmd_inactive_us; /* (a) Microseconds for above. */ u_int vmd_inactive_pps; /* Exponential decay frees/second. */ int vmd_oom_seq; int vmd_last_active_scan; diff --git a/sys/x86/linux/linux_dummy_x86.c b/sys/x86/linux/linux_dummy_x86.c index ae1d23e811e7..221f5dbf5ba3 100644 --- a/sys/x86/linux/linux_dummy_x86.c +++ b/sys/x86/linux/linux_dummy_x86.c @@ -46,7 +46,5 @@ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); DUMMY(sysfs); DUMMY(quotactl); -/* Linux 2.6.13: */ -DUMMY(inotify_init); /* Linux 2.6.22: */ DUMMY(signalfd);
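[Editor's note] The inotify additions above (the new sys/sys/inotify.h, the DTYPE_INOTIFY and KF_TYPE_INOTIFY plumbing, the VIRF_INOTIFY vnode flags, and the inotify_add_watch_at/inotify_rm_watch system calls) add up to a Linux-compatible file-monitoring interface. The sketch below shows a plausible consumer; it is a sketch only, assuming the Linux-style event stream in which read() returns a sequence of struct inotify_event records, each followed by ev->len bytes of NUL-padded name.

#include <sys/inotify.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[8192], *p;
	ssize_t n;
	int ifd, wd;

	ifd = inotify_init1(IN_CLOEXEC);
	if (ifd == -1)
		err(1, "inotify_init1");

	/* Watch /tmp for creations, deletions and renames. */
	wd = inotify_add_watch(ifd, "/tmp", IN_CREATE | IN_DELETE | IN_MOVE);
	if (wd == -1)
		err(1, "inotify_add_watch");

	while ((n = read(ifd, buf, sizeof(buf))) > 0) {
		for (p = buf; p < buf + n;) {
			/* Assumed record layout: header plus len name bytes. */
			struct inotify_event *ev = (struct inotify_event *)p;

			printf("wd %d mask %#x name %s\n", ev->wd, ev->mask,
			    ev->len != 0 ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
	}
	(void)inotify_rm_watch(ifd, wd);
	close(ifd);
	return (0);
}

inotify_add_watch_at() is the FreeBSD-flavored sibling declared alongside it: it resolves the path relative to a directory descriptor, matching the rest of the *at() family, and per the specialfd.h hunk an inotify descriptor can also be created through the SPECIALFD_INOTIFY special-fd path.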